From 511e169531ee16499cf467c9c455bb645f1bdfaa Mon Sep 17 00:00:00 2001 From: Sergei Smirnov <89378719+sergey-grovety@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:41:18 +0300 Subject: [PATCH] [Topi] Cortex-M DSP support (#9233) Co-authored-by: Sergey Smirnov Co-authored-by: Ekaterina Bern Co-authored-by: Mikhail Trubnikov Co-authored-by: German Tretiakov Co-authored-by: Ilya Gozman Co-authored-by: Alexey.Yazev Co-authored-by: Ilya Gozman <92577591+ilyag-grovety@users.noreply.github.com> --- python/tvm/relay/op/strategy/arm_cpu.py | 93 ++++- python/tvm/target/arm_isa.py | 24 +- python/tvm/testing/plugin.py | 1 + python/tvm/testing/utils.py | 12 + python/tvm/topi/arm_cpu/__init__.py | 4 +- python/tvm/topi/arm_cpu/conv1d.py | 37 ++ python/tvm/topi/arm_cpu/conv2d.py | 23 +- .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 186 --------- python/tvm/topi/arm_cpu/dense.py | 25 ++ .../{cortex_m7 => mprofile}/__init__.py | 5 +- .../micro_kernel => mprofile/dsp}/__init__.py | 0 .../tvm/topi/arm_cpu/mprofile/dsp/conv1d.py | 177 +++++++++ .../direct_simd.py => mprofile/dsp/conv2d.py} | 30 +- python/tvm/topi/arm_cpu/mprofile/dsp/dense.py | 52 +++ .../dsp/micro_kernel}/__init__.py | 3 - .../mprofile/dsp/micro_kernel/avg_pool.py | 146 +++++++ .../mprofile/dsp/micro_kernel/common.py | 32 ++ .../dsp}/micro_kernel/gemm.py | 91 +++-- .../mprofile/dsp/micro_kernel/max_pool.py | 165 ++++++++ python/tvm/topi/arm_cpu/mprofile/dsp/pool.py | 125 ++++++ python/tvm/topi/arm_cpu/pooling.py | 25 ++ tests/python/conftest.py | 21 ++ .../integration/test_arm_mprofile_dsp.py | 355 ++++++++++++++++++ tests/python/relay/aot/aot_test_utils.py | 24 +- tests/scripts/task_python_integration.sh | 5 + .../task_python_integration_gpuonly.sh | 1 + 26 files changed, 1387 insertions(+), 275 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/conv1d.py delete mode 100644 python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py create mode 100644 python/tvm/topi/arm_cpu/dense.py rename python/tvm/topi/arm_cpu/{cortex_m7 => mprofile}/__init__.py (92%) rename python/tvm/topi/arm_cpu/{cortex_m7/micro_kernel => mprofile/dsp}/__init__.py (100%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py rename python/tvm/topi/arm_cpu/{cortex_m7/conv2d/direct_simd.py => mprofile/dsp/conv2d.py} (88%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/dense.py rename python/tvm/topi/arm_cpu/{cortex_m7/conv2d => mprofile/dsp/micro_kernel}/__init__.py (91%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py rename python/tvm/topi/arm_cpu/{cortex_m7 => mprofile/dsp}/micro_kernel/gemm.py (89%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/pool.py create mode 100644 python/tvm/topi/arm_cpu/pooling.py create mode 100644 tests/python/integration/test_arm_mprofile_dsp.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 06dfc87038fe..35db043e8c93 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -19,7 +19,7 @@ import re import logging -from tvm import topi +from tvm import relay, topi from ....target import arm_isa from ....topi.generic import conv2d as conv2d_generic from .generic import * @@ -49,6 +49,25 @@ def schedule_concatenate_arm_cpu(_, outs, target): return topi.arm_cpu.schedule_concatenate(outs) +@schedule_pool.register(["arm_cpu"]) 
+def schedule_pool_arm_cpu(attrs, outs, target): + """schedule pooling ops arm cpu""" + layout = attrs.layout + isa = arm_isa.IsaAnalyzer(target) + avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs) + with target: + if ( + avg_pool + and isa.has_dsp_support + and layout in ("NCW", "NCHW") + or not avg_pool + and isa.has_dsp_support + and layout in ("NWC", "NHWC") + ): + return topi.arm_cpu.schedule_pool(outs, layout) + return topi.generic.schedule_pool(outs, layout) + + @conv2d_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d arm cpu strategy""" @@ -128,11 +147,11 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_hwcn.generic", ) elif layout == "NHWC": - if "SMLAD" in isa and kernel_layout == "HWOI": + if isa.has_dsp_support and kernel_layout == "HWOI": strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_direct_simd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_direct_simd), - name="conv2d_nhwc_direct_simd.micro_dev", + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp), + name="conv2d_nhwc_dsp.micro_dev", ) elif kernel_layout == "HWIO": is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() @@ -415,3 +434,67 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): name="bitserial_dense.arm_cpu", ) return strategy + + +@dense_strategy.register(["arm_cpu"]) +def schedule_dense_arm_cpu(attrs, inputs, out_type, target): + """dense arm cpu strategy""" + strategy = _op.OpStrategy() + isa = arm_isa.IsaAnalyzer(target) + if isa.has_dsp_support: + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp), + name="dense_dsp", + ) + else: + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.generic.schedule_dense), + name="dense.generic", + ) + return strategy + + +@conv1d_strategy.register("arm_cpu") +def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv1d strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + + isa = arm_isa.IsaAnalyzer(target) + + if kernel_layout == "WOI": + if layout == "NWC" and isa.has_dsp_support: + strategy.add_implementation( + wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp), + wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp), + name="conv1d_dsp", + ) + else: + raise RuntimeError( + "Unsupported kernel layout {} for conv1d {} for arm cpu.".format( + kernel_layout, layout + ) + ) + elif layout == "NCW": + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), + name="conv1d_ncw.generic", + ) + elif layout == "NWC": + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), + name="conv1d_nwc.generic", + ) + else: + raise RuntimeError( + "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(kernel_layout, layout) + ) + return strategy diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py index 60fc6593cb5f..a5ac9b1563a5 100644 --- a/python/tvm/target/arm_isa.py +++ b/python/tvm/target/arm_isa.py @@ -16,18 +16,24 @@ # under the License. 
"""Defines functions to analyze available opcodes in the ARM ISA.""" +import tvm.target -ARM_ISA_MAP = { - "armv7e-m": ["SMLAD"], -} + +ARM_MPROFILE_DSP_SUPPORT_LIST = [ + "cortex-m7", + "cortex-m4", + "cortex-m33", + "cortex-m35p", + "cortex-m55", +] class IsaAnalyzer(object): + """Checks ISA support for given target""" + def __init__(self, target): - self.target = target - # TODO: actually parse -mcpu - arch = "armv7e-m" - self._isa_map = ARM_ISA_MAP[arch] + self.target = tvm.target.Target(target) - def __contains__(self, instruction): - return instruction in self._isa_map + @property + def has_dsp_support(self): + return self.target.mcpu is not None and self.target.mcpu in ARM_MPROFILE_DSP_SUPPORT_LIST diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index c0decb7747bd..e90bd5e6dbf5 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -49,6 +49,7 @@ "llvm": "mark a test as requiring llvm", "ethosn": "mark a test as requiring ethosn", "hexagon": "mark a test as requiring hexagon", + "corstone300": "mark a test as requiring Corstone300 FVP", } diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 4188fea7500c..768705a40a25 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -674,6 +674,18 @@ def requires_opencl(*args): return _compose(args, _requires_opencl) +def requires_corstone300(*args): + """Mark a test as requiring the corstone300 FVP + + Parameters + ---------- + f : function + Function to mark + """ + _requires_corstone300 = [pytest.mark.corstone300] + return _compose(args, _requires_corstone300) + + def requires_rocm(*args): """Mark a test as requiring the rocm runtime. diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index 9e2057a7126f..20f92a8895dd 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -17,6 +17,7 @@ # pylint: disable=wildcard-import """Schedule for ARM CPU""" +from .conv1d import * from .conv2d import * from .depthwise_conv2d import * from .conv2d_transpose import * @@ -25,5 +26,6 @@ from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * -from . import cortex_m7 from .group_conv2d import * +from .pooling import * +from .dense import * diff --git a/python/tvm/topi/arm_cpu/conv1d.py b/python/tvm/topi/arm_cpu/conv1d.py new file mode 100644 index 000000000000..54a6968777e7 --- /dev/null +++ b/python/tvm/topi/arm_cpu/conv1d.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel +"""Conv1D schedule for ARM CPU""" +from __future__ import absolute_import as _abs + +from tvm import autotvm + +from .mprofile.dsp.conv1d import ( + conv1d_nwc_dsp_compute, + conv1d_nwc_dsp_schedule, +) + + +@autotvm.register_topi_compute("conv1d_nwc_dsp.arm_cpu") +def conv1d_nwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv1d with v7e-m DSP instructions.""" + return conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv1d_nwc_dsp.arm_cpu") +def schedule_conv1d_nwc_dsp(cfg, outs): + return conv1d_nwc_dsp_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index 0500eb55996c..ab489161a8fa 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -33,7 +33,10 @@ schedule_conv2d_spatial_pack_nchw, schedule_conv2d_spatial_pack_nhwc, ) -from .cortex_m7.conv2d import direct_simd +from .mprofile.dsp.conv2d import ( + conv2d_nhwc_dsp_compute, + conv2d_nhwc_dsp_schedule, +) @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -505,15 +508,13 @@ def _callback(op): return s -@autotvm.register_topi_compute("conv2d_nhwc_direct_simd.arm_cpu") -def conv2d_nhwc_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype): - """Compute conv2d_nhwc with SIMD (v7e-m).""" - return direct_simd.conv2d_nhwc_direct_simd_compute( - cfg, data, kernel, strides, padding, dilation, out_dtype - ) +@autotvm.register_topi_compute("conv2d_nhwc_dsp.arm_cpu") +def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nhwc with v7e-m DSP instructions.""" + return conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule("conv2d_nhwc_direct_simd.arm_cpu") -def schedule_conv2d_nhwc_direct_simd(cfg, outs): - """Create schedule for conv2d_nhwc_direct_simd""" - return direct_simd.conv2d_nhwc_direct_simd_schedule(cfg, outs) +@autotvm.register_topi_schedule("conv2d_nhwc_dsp.arm_cpu") +def schedule_conv2d_nhwc_dsp(cfg, outs): + """Create schedule for conv2d_nhwc_dsp""" + return conv2d_nhwc_dsp_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py deleted file mode 100644 index 4f721da5420c..000000000000 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Direct implementation of conv2d.""" - -import tvm -from tvm import autotvm -from tvm.autotvm.task import deserialize_args -from tvm.topi.nn.conv2d import conv2d_nchw, conv2d_nhwc -from tvm.topi.utils import get_const_tuple, get_const_int, traverse_inline - - -def conv2d_direct(*args, **kwargs): - """Schedule function for directly-scheduled conv2d.""" - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - data, kernel = args[:2] - layout = args[-2] - cfg = autotvm.get_config() - args = [cfg] + args - conv = conv2d_direct_compute(*args) - if layout == "NHWC": - sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv]) - elif layout == "NCHW": - sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv]) - else: - raise RuntimeError(f'unsupported data layout "{layout}"') - return sched, [data, kernel, conv] - - -conv2d_direct.template_key = "direct" -conv2d_direct.default_data_layout = "NHWC" -conv2d_direct.default_kernel_layout = "HWIO" - - -@autotvm.register_topi_compute("conv2d_direct.micro_dev") -def conv2d_direct_compute(*args): - layout = args[-2] - if layout == "NHWC": - return _conv2d_direct_nhwc_compute(*args) - if layout == "NCHW": - return _conv2d_direct_nchw_compute(*args) - - raise RuntimeError(f'unsupported data layout "{layout}"') - - -def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - assert layout == "NHWC" - conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - - # Config Space Definition - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, _, CO = get_const_tuple(kernel.shape) - n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO) - kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI) - - # TODO should we add a max_factor attr to these splits? 
- co, vc = cfg.define_split("tile_co", co, num_outputs=2) - oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2) - ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2) - - cfg.define_reorder( - "reorder_0", - [n, co, oh, ow, ci, kh, kw, vh, vw, vc], - policy="candidate", - candidate=[ - [n, co, oh, ow, ci, kh, kw, vh, vw, vc], - [n, co, oh, ow, ci, kh, kw, vc, vh, vw], - [n, co, oh, ow, ci, vh, vw, vc, kh, kw], - [n, co, oh, ow, ci, vc, vh, vw, kh, kw], - ], - ) - - cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll") - cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll") - - cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) - cfg.define_knob("unroll_explicit", [0, 1]) - - return conv - - -def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - assert layout == "NCHW" - conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - - ########################### - # Config Space Definition # - ########################### - cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) - cfg.define_knob("unroll_explicit", [0, 1]) - - return conv - - -@autotvm.register_topi_schedule("conv2d_direct_nhwc.micro_dev") -def conv2d_direct_nhwc_schedule(cfg, outs): - """Schedule function for directly-scheduled conv2d on NHWC layout.""" - sched = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "conv2d_nhwc" not in op.tag: - return - - ### extract tensors ### - output = op.output(0) - conv = op - data_vec = conv.input_tensors[0] - kernel = conv.input_tensors[1] # pylint: disable=unused-variable - last = outs[0] # pylint: disable=unused-variable - - # tile reduction axes - n, oh, ow, co = sched[conv].op.axis - kh, kw, ci = sched[conv].op.reduce_axis - # NOTE we can't inline data padding in the SIMD path, because it - # introduces conditionals in the inner loop. 
- data_pad = data_vec.op - sched[data_pad].compute_inline() - - co, vc = cfg["tile_co"].apply(sched, conv, co) - oh, vh = cfg["tile_oh"].apply(sched, conv, oh) - ow, vw = cfg["tile_ow"].apply(sched, conv, ow) - cfg["reorder_0"].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc]) - cfg["ann_reduce"].apply( - sched, - conv, - [kh, kw], - axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)], - max_unroll=8, - cfg=cfg, - ) - cfg["ann_spatial"].apply( - sched, - conv, - [vh, vw, vc], - axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]], - max_unroll=8, - cfg=cfg, - ) - - kernel_scope = n # this is the scope to attach global config inside this kernel - - # tune unroll - sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - traverse_inline(sched, outs[-1].op, _callback) - return sched - - -@autotvm.register_topi_schedule("conv2d_direct_nchw.micro_dev") -def conv2d_direct_nchw_schedule(cfg, outs): - """Schedule function for Cortex-M7 direct implementation of conv2d.""" - # use default schedule - sched = tvm.create_schedule([x.op for x in outs]) - - conv = outs[-1].op - output = conv.output(0) - data_vec = conv.input_tensors[0] - data_pad = data_vec.op - sched[data_pad].compute_inline() - - # TODO add more schedule opts (similar to the NHWC template) - - n, _, _, _ = sched[conv].op.axis - kernel_scope = n # this is the scope to attach global config inside this kernel - - # tune unroll - sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - return sched diff --git a/python/tvm/topi/arm_cpu/dense.py b/python/tvm/topi/arm_cpu/dense.py new file mode 100644 index 000000000000..f2e2eb6288fb --- /dev/null +++ b/python/tvm/topi/arm_cpu/dense.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel +"""Dense schedule for ARM CPU""" + +from .mprofile.dsp.dense import dense_dsp_schedule + + +def schedule_dense_dsp(outs): + """Create schedule for dense_dsp""" + return dense_dsp_schedule(outs) diff --git a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py b/python/tvm/topi/arm_cpu/mprofile/__init__.py similarity index 92% rename from python/tvm/topi/arm_cpu/cortex_m7/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/__init__.py index 631c5f7ff447..32ce4d3a5447 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py +++ b/python/tvm/topi/arm_cpu/mprofile/__init__.py @@ -14,7 +14,4 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -"""Schedules specialized for cortex-m7.""" - - -from . import conv2d +"""Schedules specialized for cortex-m DSP instructions.""" diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py similarity index 100% rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py new file mode 100644 index 000000000000..521a58d0c1fc --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of conv1d.""" +from tvm import autotvm +from tvm.autotvm.task import deserialize_args +from tvm import te +from tvm.topi.utils import simplify, traverse_inline +from tvm.topi.nn.pad import pad +from tvm.topi.nn.utils import get_pad_tuple1d +from tvm.tir.expr import Mul + +from .micro_kernel.gemm import ( + intrin_gemm_MxKxN, + gemm_MxKxN_impl, +) + + +def conv1d_nwc_dsp(*args, **kwargs): + """Defines the v7e-m DSP instructions of conv1d on NWC layout.""" + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + data, kernel = args[:2] + layout = args[-2] + cfg = autotvm.get_config() + args = [cfg] + args + assert layout == "NWC" + conv = conv1d_nwc_dsp_compute(*args) + sched = conv1d_nwc_dsp_schedule(cfg, [data, kernel, conv]) + return sched, [data, kernel, conv] + + +conv1d_nwc_dsp.template_key = "dsp" +conv1d_nwc_dsp.default_data_layout = "NWC" +conv1d_nwc_dsp.default_kernel_layout = "WOI" + + +def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for v7e-m DSP instructions of conv1d on NWC layout.""" + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + + batch_size, data_width, in_channels = data.shape + kernel_size, out_channels, _ = kernel.shape + + # Compute the output shape + dilated_kernel_size = (kernel_size - 1) * dilation + 1 + pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,)) + out_channels = simplify(out_channels) + out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1) + + # Apply padding + pad_before = [0, pad_left, 0] + pad_after = [0, pad_right, 0] + padded_data = pad(data, pad_before, pad_after, name="padded_data") + + # Compute graph + rc = te.reduce_axis((0, in_channels), name="rc") + rw = te.reduce_axis((0, kernel_size), 
name="rw") + + conv = te.compute( + (batch_size, out_width, out_channels), + lambda b, w, c: te.sum( + padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype) + * kernel[rw, c, rc].astype(out_dtype), + axis=[rw, rc], + ), + name="conv1d", + tag="conv1d_nwc", + ) + + ########################### + # Config Space Definition # + ########################### + n, ow, co = ( + cfg.axis(batch_size.value), + cfg.axis(out_width.value), + cfg.axis(out_channels.value), + ) + kw, ci = ( + cfg.reduce_axis(kernel_size.value), + cfg.reduce_axis(in_channels.value), + ) + + owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) + cio, cii = cfg.define_split( + "tile_ci", + ci, + policy="factors", + num_outputs=2, + # TODO: check case with in_channels.value % 4 != 0 with AutoTVM + filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0, + ) + coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) + + cfg.define_reorder( + "reorder_0_simd", + [n, owo, owi, coo, coi, kw, cio, cii], + policy="candidate", + candidate=[ + [n, kw, owo, coo, cio, owi, coi, cii], + [n, kw, coo, owo, cio, owi, coi, cii], + [n, kw, owo, coo, cio, owi, coi, cii], + [n, kw, coo, owo, cio, owi, coi, cii], + ], + ) + + cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) + cfg.define_knob("unroll_explicit", [0, 1]) + + if cfg.is_fallback: + cfg.fallback_split("tile_ow", [-1, out_width.value]) + cfg.fallback_split("tile_ci", [-1, in_channels.value]) + cfg.fallback_split("tile_co", [-1, out_channels.value]) + + return conv + + +def conv1d_nwc_dsp_schedule(cfg, outs): + """Schedule function for v7e-m DSP instructions of conv1d on NWC layout.""" + sched = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "conv1d_nwc" not in op.tag: + return + + # extract tensors + output = op.output(0) + conv = op + data_vec = conv.input_tensors[0] + + source_index_w = output.op.body[0].source[0].a.value.indices[1].a + stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1 + + # tile reduction axes + n, ow, co = sched[conv].op.axis + kw, ci = sched[conv].op.reduce_axis + + M = cfg["tile_ow"].size[-1] + K = cfg["tile_ci"].size[-1] + N = cfg["tile_co"].size[-1] + + owo, owi = cfg["tile_ow"].apply(sched, conv, ow) + cio, cii = cfg["tile_ci"].apply(sched, conv, ci) + coo, coi = cfg["tile_co"].apply(sched, conv, co) + + cfg["reorder_0_simd"].apply(sched, conv, [n, owo, owi, coo, coi, kw, cio, cii]) + + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w) + sched[output].tensorize(owi, gemm) + sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) + + # this is the scope to attach global config inside this kernel + kernel_scope = n + + # tune unroll + sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + + traverse_inline(sched, outs[-1].op, _callback) + return sched diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py similarity index 88% rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py index 5ef9fd813eb2..470d46b92a7a 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py @@ -23,15 +23,16 @@ from tvm.topi.utils import simplify, traverse_inline from tvm.topi.nn.pad import pad from 
tvm.topi.nn.utils import get_pad_tuple +from tvm.tir.expr import Mul -from ..micro_kernel.gemm import ( +from .micro_kernel.gemm import ( intrin_gemm_MxKxN, gemm_MxKxN_impl, ) -def conv2d_nhwc_direct_simd(*args, **kwargs): - """Defines the Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp(*args, **kwargs): + """Defines the v7e-m DSP instructions of conv2d.""" assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) data, kernel = args[:2] @@ -39,18 +40,18 @@ def conv2d_nhwc_direct_simd(*args, **kwargs): cfg = autotvm.get_config() args = [cfg] + args assert layout == "NHWC" - conv = conv2d_nhwc_direct_simd_compute(*args) - sched = conv2d_nhwc_direct_simd_schedule(cfg, [data, kernel, conv]) + conv = conv2d_nhwc_dsp_compute(*args) + sched = conv2d_nhwc_dsp_schedule(cfg, [data, kernel, conv]) return sched, [data, kernel, conv] -conv2d_nhwc_direct_simd.template_key = "direct_simd" -conv2d_nhwc_direct_simd.default_data_layout = "NHWC" -conv2d_nhwc_direct_simd.default_kernel_layout = "HWOI" +conv2d_nhwc_dsp.template_key = "dsp" +conv2d_nhwc_dsp.default_data_layout = "NHWC" +conv2d_nhwc_dsp.default_kernel_layout = "HWOI" -def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): - """Compute function for Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for v7e-m DSP instructions of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 @@ -146,8 +147,8 @@ def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding, dilatio return conv -def conv2d_nhwc_direct_simd_schedule(cfg, outs): - """Schedule function for Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp_schedule(cfg, outs): + """Schedule function for v7e-m DSP instructions of conv2d.""" sched = te.create_schedule([x.op for x in outs]) def _callback(op): @@ -161,6 +162,9 @@ def _callback(op): kernel = conv.input_tensors[1] # pylint: disable=unused-variable last = outs[0] # pylint: disable=unused-variable + source_index_w = output.op.body[0].source[0].a.value.indices[2].a + stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1 + # tile reduction axes n, oh, ow, co = sched[conv].op.axis kh, kw, ci = sched[conv].op.reduce_axis @@ -175,7 +179,7 @@ def _callback(op): cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii]) - gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype) + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w) sched[output].tensorize(owi, gemm) sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py new file mode 100644 index 000000000000..20dfb09f2dd3 --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of dense.""" + +from tvm import te +from tvm.topi.utils import traverse_inline + +from .micro_kernel.gemm import ( + intrin_gemm_MxKxN, + gemm_MxKxN_impl, +) + + +def dense_dsp_schedule(outs): + """Schedule function for v7e-m DSP instructions of dense.""" + sched = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense" not in op.tag: + return + + # extract tensors + output = op.output(0) + dense = op + data_vec = dense.input_tensors[0] + M, K = data_vec.shape + N, _ = dense.input_tensors[1].shape + + n, _ = sched[dense].op.axis + no, ni = sched[dense].split(n, nparts=1) + + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype) + sched[output].tensorize(ni, gemm) + sched[output].pragma(no, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) + + traverse_inline(sched, outs[-1].op, _callback) + return sched diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py similarity index 91% rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py index cc4faf97b126..13a83393a912 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py @@ -14,6 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Conv2d implementations for cortex-m7.""" - -from . import direct_simd diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py new file mode 100644 index 000000000000..786ac2607b7f --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Defines sum intrinsics for sum operation with v7e-m DSP instructions.""" + +import random +import string + +import tvm +from tvm import te +from . 
import common + + +def intrin_sum(shape, in_dtype, out_dtype, reset=False): + """Defines a v7e-m DSP-accelerated sum operation.""" + UNIQ_ID_LEN = 8 + uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN)) + func_prefix = "sum16" + + assert in_dtype == "int16" + assert out_dtype == "int16" + + width = shape[-1] + x = te.placeholder(shape, name="x", dtype=in_dtype) + k = te.reduce_axis((0, width), name="rc") + + def get_slice(indices, k): + s = list(indices) + s[-1] = s[-1] + k + return tuple(s) + + z = te.compute( + (1,) * len(shape), lambda *i: te.sum(x[get_slice(i, k)], axis=[k]).astype(out_dtype) + ) + + def _intrin_func(ins, outs): + aa = ins[0] + cc = outs[0] + + def _body(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, + f"{func_prefix}_{width}_{uniq_id}", + aa.access_ptr("r"), + cc.access_ptr("w"), + aa.elem_offset, + 1 if reset else 0, + ) + ) + return ib.get() + + def _reduce_reset(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern(cc.dtype, f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w")) + ) + return ib.get() + + def _reduce_update(): + return _body() + + return _body(), _reduce_reset(), _reduce_update() + + binds = { + t: tvm.tir.decl_buffer( + t.shape, + t.dtype, + t.op.name, + strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))], + offset_factor=1, + ) + for t in [x, z] + } + + intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds) + return intrin_decl, uniq_id + + +def sum_impl(N, uniq_id): + """Emit C code for sum impl.""" + cc_code = ( + common.common_includes + + f""" + +#ifdef __cplusplus +extern "C" +#endif // __cplusplus +__STATIC_FORCEINLINE int32_t sum16_reset_{uniq_id}( + int16_t *res) {{ + *res = (int16_t)0; + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t sum16_{N}_{uniq_id}( + int16_t *arr, + int16_t *res16, + long arr_offset, + int reset) {{ + int n; + int32_t *p32; + int32_t res = reset ? 0 : *res16; + + if ( arr_offset % 4 != 0 ) {{ + res += *arr; + p32 = (int32_t *)(&arr[1]); + n = {N} - 1; + }} else {{ + p32 = (int32_t *)arr; + n = {N}; + }} + + for ( int i = 0; i < n / 2; ++ i ) {{ + res = __SMLAD(*p32, 0x00010001, res); + ++ p32; + }} + + if ( n % 2 != 0 ) + res += *(int16_t *)p32; + + *res16 = res; + + return 0; +}} + +""" + ) + return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py new file mode 100644 index 000000000000..a37b297ffd45 --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, no-value-for-parameter +"""Defines common C code for all microkernel operations.""" + + +common_includes = """ + +#include +#include +#include + +#include +#include + +#include + +""" diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py similarity index 89% rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py index 9a00fe272087..ffc48eaabd59 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py @@ -15,21 +15,23 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, no-value-for-parameter -"""Defines gemm intrinsics for SIMD matrix multiplication.""" +"""Defines gemm intrinsics for matrix multiplication with v7e-m DSP instructions.""" import random import string import tvm from tvm import te +from . import common + ########################## # MxKxN MatMul Intrinsic # ########################## # NOTE this is transposed matmul (A * B^T) -def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): - """Defines a SIMD-accelerated transposed matmul.""" +def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype, stride_w=1): + """Defines a v7e-m DSP-accelerated transposed matmul.""" # we generate a unique ID for every intrinsic definition, to prevent name # collisions in the generated source (e.g., if there are multiple operators # in the same module that use the same intrinsic) @@ -49,12 +51,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): # TODO(weberlo, areusch): support more dtypes? assert in_dtype in ("int8", "int16") assert out_dtype == "int32" - A = te.placeholder((M, K), name="a", dtype=in_dtype) + A = te.placeholder((M * stride_w - (stride_w - 1), K), name="a", dtype=in_dtype) B = te.placeholder((N, K), name="b", dtype=in_dtype) k = te.reduce_axis((0, K), name="k") C = te.compute( (M, N), - lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k), + lambda i, j: te.sum( + A[i * stride_w, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k + ), name="c", ) A_buf = tvm.tir.decl_buffer( @@ -81,7 +85,7 @@ def _reduce_update(): aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), - aa.strides[0], + aa.strides[0] * stride_w, bb.strides[0], cc.strides[0], ) @@ -106,7 +110,7 @@ def _body(): aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), - aa.strides[0], + aa.strides[0] * stride_w, bb.strides[0], cc.strides[0], ) @@ -125,12 +129,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): # aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) - cc_code = f""" -#ifdef __cplusplus -extern "C" -#endif -#include -#include + cc_code = ( + common.common_includes + + f""" + #ifdef __cplusplus extern "C" @@ -203,9 +205,12 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; + int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) - return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + if ( {M} < 16 || {N} < 16 ) {{ + retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {N}; i++) for (int j = 0; j < {K} / 4; j++) @@ -234,10 +239,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 4 != 0 ) 
gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - #ifdef __cplusplus extern "C" #endif @@ -306,9 +311,12 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; + int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) - return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + if ( {M} < 16 || {N} < 16 ) {{ + retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {N}; i++) for (int j = 0; j < {K} / 4; j++) @@ -334,11 +342,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 4 != 0 ) gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - - #ifdef __cplusplus extern "C" #endif @@ -383,15 +390,24 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #endif __STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, - int A_stride, int B_stride, int C_stride) {{ - if ( {M} < 2 || {N} < 2 ) - return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + int A_stride, int B_stride, int C_stride) {{ + int32_t retcode = 0; + + if ( {M} < 2 || {N} < 2 ) {{ + retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} + + if(((uint32_t)aa & 0x3) != 0 || ((uint32_t)bb & 0x3) != 0){{ + retcode = kTvmErrorFunctionCallInvalidArg; + goto out; + }} for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; - + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ sum = __SMLAD(*aa_ptr, *bb_ptr, sum); @@ -407,10 +423,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 2 != 0 ) gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - #ifdef __cplusplus extern "C" #endif @@ -452,9 +468,13 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #endif __STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, - int A_stride, int B_stride, int C_stride) {{ - if ( {M} < 2 || {N} < 2 ) - return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + int A_stride, int B_stride, int C_stride) {{ + int32_t retcode = 0; + + if ( {M} < 2 || {N} < 2 ) {{ + retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ @@ -473,11 +493,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 2 != 0 ) gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - - #ifdef __cplusplus extern "C" #endif @@ -489,5 +508,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): }} return 0; }} - """ + +""" + ) return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py new file mode 100644 index 000000000000..4d410427c0cc --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Defines max intrinsics for elemwise max operation with v7e-m DSP instructions.""" + +import random +import string + +import tvm +from tvm import te +from . import common + + +def intrin_max(shape, in_dtype, out_dtype): + """Defines a v7e-m DSP-accelerated max pool.""" + UNIQ_ID_LEN = 8 + uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN)) + func_prefix = "max8" + + assert in_dtype == "int8" + assert out_dtype == "int8" + + x = te.placeholder(shape, name="x", dtype=in_dtype) + k = te.reduce_axis((0, 1), name="rc") + z = te.compute(shape, lambda *i: tvm.tir.max(x[i], axis=[k]).astype(out_dtype)) + + def _intrin_func(ins, outs): + aa = ins[0] + cc = outs[0] + + def _body(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, + f"{func_prefix}_{uniq_id}", + aa.access_ptr("r"), + cc.access_ptr("w"), + cc.strides[0], + ) + ) + return ib.get() + + def _reduce_reset(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0] + ) + ) + return ib.get() + + def _reduce_update(): + return _body() + + return _body(), _reduce_reset(), _reduce_update() + + binds = { + t: tvm.tir.decl_buffer( + t.shape, + t.dtype, + t.op.name, + strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))], + offset_factor=1, + ) + for t in [x, z] + } + + intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds) + return intrin_decl, uniq_id + + +def max_impl(uniq_id): + """Emit C code for pool impl.""" + cc_code = ( + common.common_includes + + f""" + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_reset_{uniq_id}( + int8_t *res, + int N) {{ + memset(res, (int8_t)-128, N * sizeof(*res)); + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_loop_{uniq_id}( + int8_t *arg, + int8_t *res, + int N) {{ + for ( int i = 0; i < N; ++ i ) + if ( arg[i] > res[i] ) + res[i] = arg[i]; + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_{uniq_id}( + int8_t *arg, + int8_t *res, + int N) {{ + int32_t *parg32, *pres32; + int una_arg = (int32_t)arg & 0x3, una_res = (int32_t)res & 0x3; + int32_t retcode = 0; + + if ( N < 4 || ((una_arg || una_res) && una_arg != una_res) ) {{ + retcode = max8_loop_{uniq_id}(arg, res, N); + goto out; + }} + if ( una_arg ) {{ + int n = (4 - una_arg); + if ( n > N || (N - n) < 4 ) + n = N; + retcode = max8_loop_{uniq_id}(arg, res, n); + N -= n; + if ( N == 0 ) + goto out; + arg += n; res += n; + }} + + parg32 = (int32_t *)arg; + pres32 = (int32_t *)res; + + for ( int i = 0; i < N / 4; ++ i ) {{ + int32_t arg32 = *parg32 ++; + int32_t res32 = *pres32; + __SSUB8(arg32, res32); + res32 = __SEL(arg32, res32); + 
*pres32 ++ = res32; + }} + + if ( N & 0x3 ) {{ + retcode = max8_loop_{uniq_id}((int8_t *)parg32, (int8_t *)pres32, N & 0x3); + goto out; + }} + +out: + return retcode; +}} + +""" + ) + return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py new file mode 100644 index 000000000000..99470a28530a --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of pool.""" +import logging + +import tvm + +from tvm import te +from tvm.topi.utils import traverse_inline + +from .micro_kernel.max_pool import ( + intrin_max, + max_impl, +) + +from .micro_kernel.avg_pool import ( + intrin_sum, + sum_impl, +) + +logger = logging.getLogger("topi") + + +def schedule_maxpool_1d_nwc(s, op): + """Schedule function for v7e-m DSP instructions of maxpool 1d NWC layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + channels = data_vec.shape[-1] + if isinstance(channels, tvm.tir.IntImm): + channels = channels.value + + n, w, c = s[op].op.axis + (k,) = s[op].op.reduce_axis + + s[op].reorder(n, w, k, c) + max_val, uniq_id = intrin_max((1, 1, channels), data_vec.dtype, output.dtype) + s[op].tensorize(c, max_val) + s[output].pragma(n, "import_c", max_impl(uniq_id)) + + +def schedule_maxpool_2d_nhwc(s, op): + """Schedule function for v7e-m DSP instructions of maxpool 2d NHWC layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + channels = data_vec.shape[-1] + if isinstance(channels, tvm.tir.IntImm): + channels = channels.value + + n, h, w, c = s[op].op.axis + ko, ki = s[op].op.reduce_axis + + s[op].reorder(n, h, w, ko, ki, c) + max_val, uniq_id = intrin_max((1, 1, 1, channels), data_vec.dtype, output.dtype) + s[op].tensorize(c, max_val) + s[output].pragma(n, "import_c", max_impl(uniq_id)) + + +def schedule_avgpool_1d_ncw(s, op): + """Schedule function for v7e-m DSP instructions of avgpool 1d NCW layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + n, _, _ = s[op].op.axis + (k,) = s[op].op.reduce_axis + pool_w = k.dom.extent.value + + summary, uniq_id = intrin_sum((1, 1, pool_w), data_vec.dtype, output.dtype, reset=True) + s[op].tensorize(k, summary) + s[output].pragma(n, "import_c", sum_impl(pool_w, uniq_id)) + + +def schedule_avgpool_2d_nchw(s, op): + """Schedule function for v7e-m DSP instructions of avgpool 2d NCHW layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + n, _, _, _ = s[op].op.axis + _, ki = s[op].op.reduce_axis + pool_w = ki.dom.extent.value + + summary, uniq_id = intrin_sum((1, 1, 1, pool_w), data_vec.dtype, output.dtype) + s[op].tensorize(ki, summary) + s[output].pragma(n, "import_c", 
sum_impl(pool_w, uniq_id)) + + +def pool_dsp_schedule(outs, layout): + """Schedule function for v7e-m DSP instructions of pooling.""" + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + in_dtype = op.input_tensors[0].dtype + if "pool_max" in op.tag: + if in_dtype != "int8": + logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype) + elif layout == "NWC": + schedule_maxpool_1d_nwc(s, op) + elif layout == "NHWC": + schedule_maxpool_2d_nhwc(s, op) + elif "pool_sum" in op.tag: + if in_dtype != "int16": + logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype) + elif layout == "NCW": + schedule_avgpool_1d_ncw(s, op) + elif layout == "NCHW": + schedule_avgpool_2d_nchw(s, op) + + traverse_inline(s, outs[-1].op, _callback) + return s diff --git a/python/tvm/topi/arm_cpu/pooling.py b/python/tvm/topi/arm_cpu/pooling.py new file mode 100644 index 000000000000..f09f0089342d --- /dev/null +++ b/python/tvm/topi/arm_cpu/pooling.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable +"""Schedule for pooling operators""" + +from .mprofile.dsp.pool import pool_dsp_schedule + + +def schedule_pool(outs, layout): + """Create schedule for avgpool/maxpool with dsp""" + return pool_dsp_schedule(outs, layout) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index ab3ea4e4ec06..0dbb3dcc79e8 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -17,6 +17,7 @@ import sys import tvm +import pytest collect_ignore = [] if sys.platform.startswith("win"): @@ -37,3 +38,23 @@ # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored collect_ignore.append("unittest/test_tir_intrin.py") + + +def pytest_addoption(parser): + parser.addoption( + "--enable-corstone300-tests", + action="store_true", + default=False, + help="Run Corstone-300 FVP tests", + ) + + +def pytest_collection_modifyitems(config, items): + if not config.getoption("--enable-corstone300-tests"): + for item in items: + if "corstone300" in item.keywords: + item.add_marker( + pytest.mark.skip( + reason="Need --enable-corstone300-tests option to run this test" + ) + ) diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py new file mode 100644 index 000000000000..cdafa91f42f9 --- /dev/null +++ b/tests/python/integration/test_arm_mprofile_dsp.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +from tvm import relay +from tests.python.relay.aot.aot_test_utils import ( + AOTTestModel, + AOT_CORSTONE300_RUNNER, + generate_ref_data, + compile_and_run, +) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation", + [ + ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1), + ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1), + ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + # bug https://github.com/apache/tvm/issues/9226 + ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1), + # from Visual Wake Word model + ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1), + # from Image Classification model (one of the MLPerfTiny models) + ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1), + ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1), + ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1), + ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1), + ], +) +@pytest.mark.parametrize("dtype", ["int8", "int16"]) +def test_conv2d(data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation, dtype): + """Test a subgraph with a single conv2d operator.""" + ishape = data_shape_nhwc + wshape = (*kernel_size, data_shape_nhwc[-1], num_filter) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + out_layout="NHWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(np.moveaxis(weight_data, 2, -1)) + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout="HWOI", + out_dtype="int32", + out_layout="NHWC", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + 
target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nwc, kernel_size, num_filter, strides, padding", + [ + ((1, 32, 12), 3, 16, 1, 0), + ((3, 12, 10), 4, 24, 1, 0), + ((1, 7, 7), 3, 5, 1, 0), + ((1, 10, 2), 4, 4, 2, (1, 1)), + ((1, 20, 2), 4, 4, 2, (0, 1)), + ((1, 16, 4), 1, 12, 1, (1, 0)), + ((1, 24, 16), 1, 32, 3, (2, 2)), + ], +) +@pytest.mark.parametrize("dtype", ["int8", "int16"]) +def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype): + """Test a subgraph with a single conv1d operator.""" + ishape = data_shape_nwc + wshape = (kernel_size, data_shape_nwc[-1], num_filter) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv1d( + input0, + weight0, + strides=strides, + padding=padding, + data_layout="NWC", + kernel_layout="WIO", + out_dtype="int32", + out_layout="NWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(np.moveaxis(weight_data, 1, -1)) + out1 = relay.op.nn.conv1d( + input1, + weight1, + strides=strides, + padding=padding, + data_layout="NWC", + kernel_layout="WOI", + out_dtype="int32", + out_layout="NWC", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "M, K, N", + [ + (1, 32, 64), + (3, 12, 10), + ], +) +def test_dense(M, K, N): + """Test a subgraph with a single dense operator.""" + ishape = (M, K) + wshape = (N, K) + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + dense_f = relay.op.nn.batch_flatten(input0) + weight0 = relay.const(np.random.randint(low=-10, high=10, size=wshape, dtype="int8")) + out = relay.op.nn.dense(dense_f, weight0, out_dtype="int32") + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nhwc, pool_size, strides, padding", + [ + ((1, 32, 32, 1), (3, 3), 1, 0), + ((1, 32, 20, 4), (3, 3), (2, 2), 0), + ], +) +def test_maxpool_2d(data_shape_nhwc, pool_size, strides, padding): + """Test a subgraph with a single maxpool_2d operator.""" + + ishape = data_shape_nhwc + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + out = relay.op.nn.max_pool2d(input0, pool_size, layout="NHWC", strides=strides, padding=padding) + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + 
AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nwc, pool_size, strides, padding", + [ + ((1, 32, 1), 3, 1, 0), + ((1, 20, 4), 3, 2, 0), + ], +) +def test_maxpool_1d(data_shape_nwc, pool_size, strides, padding): + """Test a subgraph with a single maxpool_1d operator.""" + ishape = data_shape_nwc + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + out = relay.op.nn.max_pool1d(input0, pool_size, layout="NWC", strides=strides, padding=padding) + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nchw, pool_size, strides, padding", + [ + ((1, 1, 32, 32), (3, 3), 1, 0), + ((1, 4, 32, 20), (3, 3), (2, 2), 0), + ], +) +def test_avgpool_2d(data_shape_nchw, pool_size, strides, padding): + """Test a subgraph with a single avgpool_2d operator.""" + + ishape = data_shape_nchw + + input0 = relay.var("input", relay.TensorType(ishape, "int32")) + out0 = relay.nn.avg_pool2d(input0, pool_size, layout="NCHW", strides=strides, padding=padding) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, "int16")) + out1 = relay.op.nn.avg_pool2d( + input1, pool_size, layout="NCHW", strides=strides, padding=padding + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + input_data = np.random.randint(low=-128, high=127, size=ishape, dtype="int32") + inputs = {"input": input_data} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel( + module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list + ), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_ncw, pool_size, strides, padding", + [ + ((1, 1, 32), 3, 1, 0), + ((1, 4, 20), 3, 2, 2), + ], +) +def test_avgpool_1d(data_shape_ncw, pool_size, strides, padding): + """Test a subgraph with a single avgpool_1d operator.""" + + ishape = data_shape_ncw + + input0 = relay.var("input", relay.TensorType(ishape, "int32")) + out0 = relay.op.nn.avg_pool1d(input0, pool_size, layout="NCW", strides=strides, padding=padding) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, "int16")) + out1 = relay.op.nn.avg_pool1d(input1, pool_size, layout="NCW", strides=strides, padding=padding) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + input_data = np.random.randint(low=-10, high=10, size=ishape, dtype="int32") + inputs = {"input": input_data} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel( + module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list + ), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + 
use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 7d8a4f043548..c73af1948b57 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -545,6 +545,15 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): It is used to capture the tensor data (for both inputs and expected outputs) to be bundled into the standalone application. """ file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve() + np_type_to_c = { + "int8": "int8_t", + "uint8": "uint8_t", + "int16": "int16_t", + "uint16": "uint16_t", + "int32": "int32_t", + "uint32": "uint32_t", + "float32": "float", + } # create header file raw_path = file_path.with_suffix(".h").resolve() with open(raw_path, "w") as header_file: @@ -555,14 +564,7 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): emit_data_linkage(header_file, data_linkage) - if npy_data.dtype == "int8": - header_file.write(f"int8_t {tensor_name}[] =") - elif npy_data.dtype == "int32": - header_file.write(f"int32_t {tensor_name}[] = ") - elif npy_data.dtype == "uint8": - header_file.write(f"uint8_t {tensor_name}[] = ") - elif npy_data.dtype == "float32": - header_file.write(f"float {tensor_name}[] = ") + header_file.write(f"{np_type_to_c[str(npy_data.dtype)]} {tensor_name}[] =") header_file.write("{") for i in np.ndindex(npy_data.shape): @@ -577,6 +579,7 @@ def compile_models( workspace_byte_alignment: int = 8, enable_op_fusion: bool = True, pass_config: Dict[str, Any] = None, + target_opts: Dict = None, ) -> List[AOTCompiledTestModel]: """ This method generates runtime.Modules for the tests @@ -586,6 +589,9 @@ def compile_models( base_target = "c -runtime=c --link-params --executor=aot" extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}" + if target_opts: + for key, val in target_opts.items(): + extra_target += f" {key}={val}" target = f"{base_target} {extra_target}" config = {"tir.disable_vectorize": True} @@ -727,6 +733,7 @@ def compile_and_run( workspace_byte_alignment: int = 8, enable_op_fusion: bool = True, data_linkage: AOTDataLinkage = None, + target_opts: Dict = None, ): """This is a wrapper API to compile and run models as test for AoT""" compiled_test_mods = compile_models( @@ -736,6 +743,7 @@ def compile_and_run( workspace_byte_alignment=workspace_byte_alignment, enable_op_fusion=enable_op_fusion, pass_config=runner.pass_config, + target_opts=target_opts, ) run_and_check( models=compiled_test_mods, diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 8618619d65ad..615caa50aa91 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -74,3 +74,8 @@ run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver # Do not enable OpenGL # run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl + + +if [ -z "${TVM_INTEGRATION_GPU_ONLY:-}" ] && [ -z "${TVM_INTEGRATION_I386_ONLY:-}" ] ; then + run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-m7-simd tests/python/integration/test_arm_mprofile_dsp.py --enable-corstone300-tests +fi diff --git a/tests/scripts/task_python_integration_gpuonly.sh 
b/tests/scripts/task_python_integration_gpuonly.sh index 36c3883d4379..cb6bec40c22f 100755 --- a/tests/scripts/task_python_integration_gpuonly.sh +++ b/tests/scripts/task_python_integration_gpuonly.sh @@ -20,5 +20,6 @@ export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_s export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" export TVM_RELAY_TEST_TARGETS="cuda" export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu +export TVM_INTEGRATION_GPU_ONLY=1 ./tests/scripts/task_python_integration.sh
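---

The target_opts dictionary passed by the new integration tests is folded verbatim into the AOT target
string by compile_models (see the aot_test_utils.py hunk above); the Corstone-300 cases themselves run
only when pytest is invoked with the --enable-corstone300-tests flag registered in tests/python/conftest.py,
which task_python_integration.sh now passes for the m7-simd suite. Below is a minimal, illustrative sketch
of that target-string construction, assuming the default workspace alignment of 8 and the option values
used in these tests; it mirrors the loop added to compile_models and is not a new API.

    # Sketch only: reproduces how compile_models() appends target_opts to the
    # base AOT target string. Values match the tests in this patch.
    target_opts = {"-keys": "arm_cpu", "-mcpu": "cortex-m7"}

    base_target = "c -runtime=c --link-params --executor=aot"
    extra_target = "--workspace-byte-alignment=8 --interface-api=c --unpacked-api=1"
    for key, val in target_opts.items():
        extra_target += f" {key}={val}"

    target = f"{base_target} {extra_target}"
    print(target)
    # c -runtime=c --link-params --executor=aot --workspace-byte-alignment=8
    # --interface-api=c --unpacked-api=1 -keys=arm_cpu -mcpu=cortex-m7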