From 511e169531ee16499cf467c9c455bb645f1bdfaa Mon Sep 17 00:00:00 2001 From: Sergei Smirnov <89378719+sergey-grovety@users.noreply.github.com> Date: Mon, 15 Nov 2021 20:41:18 +0300 Subject: [PATCH] [Topi] Cortex-M DSP support (#9233) Co-authored-by: Sergey Smirnov Co-authored-by: Ekaterina Bern Co-authored-by: Mikhail Trubnikov Co-authored-by: German Tretiakov Co-authored-by: Ilya Gozman Co-authored-by: Alexey.Yazev Co-authored-by: Ilya Gozman <92577591+ilyag-grovety@users.noreply.github.com> --- python/tvm/relay/op/strategy/arm_cpu.py | 93 ++++- python/tvm/target/arm_isa.py | 24 +- python/tvm/testing/plugin.py | 1 + python/tvm/testing/utils.py | 12 + python/tvm/topi/arm_cpu/__init__.py | 4 +- python/tvm/topi/arm_cpu/conv1d.py | 37 ++ python/tvm/topi/arm_cpu/conv2d.py | 23 +- .../topi/arm_cpu/cortex_m7/conv2d/direct.py | 186 --------- python/tvm/topi/arm_cpu/dense.py | 25 ++ .../{cortex_m7 => mprofile}/__init__.py | 5 +- .../micro_kernel => mprofile/dsp}/__init__.py | 0 .../tvm/topi/arm_cpu/mprofile/dsp/conv1d.py | 177 +++++++++ .../direct_simd.py => mprofile/dsp/conv2d.py} | 30 +- python/tvm/topi/arm_cpu/mprofile/dsp/dense.py | 52 +++ .../dsp/micro_kernel}/__init__.py | 3 - .../mprofile/dsp/micro_kernel/avg_pool.py | 146 +++++++ .../mprofile/dsp/micro_kernel/common.py | 32 ++ .../dsp}/micro_kernel/gemm.py | 91 +++-- .../mprofile/dsp/micro_kernel/max_pool.py | 165 ++++++++ python/tvm/topi/arm_cpu/mprofile/dsp/pool.py | 125 ++++++ python/tvm/topi/arm_cpu/pooling.py | 25 ++ tests/python/conftest.py | 21 ++ .../integration/test_arm_mprofile_dsp.py | 355 ++++++++++++++++++ tests/python/relay/aot/aot_test_utils.py | 24 +- tests/scripts/task_python_integration.sh | 5 + .../task_python_integration_gpuonly.sh | 1 + 26 files changed, 1387 insertions(+), 275 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/conv1d.py delete mode 100644 python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py create mode 100644 python/tvm/topi/arm_cpu/dense.py rename python/tvm/topi/arm_cpu/{cortex_m7 => mprofile}/__init__.py (92%) rename python/tvm/topi/arm_cpu/{cortex_m7/micro_kernel => mprofile/dsp}/__init__.py (100%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py rename python/tvm/topi/arm_cpu/{cortex_m7/conv2d/direct_simd.py => mprofile/dsp/conv2d.py} (88%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/dense.py rename python/tvm/topi/arm_cpu/{cortex_m7/conv2d => mprofile/dsp/micro_kernel}/__init__.py (91%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py rename python/tvm/topi/arm_cpu/{cortex_m7 => mprofile/dsp}/micro_kernel/gemm.py (89%) create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py create mode 100644 python/tvm/topi/arm_cpu/mprofile/dsp/pool.py create mode 100644 python/tvm/topi/arm_cpu/pooling.py create mode 100644 tests/python/integration/test_arm_mprofile_dsp.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 06dfc87038fe..35db043e8c93 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -19,7 +19,7 @@ import re import logging -from tvm import topi +from tvm import relay, topi from ....target import arm_isa from ....topi.generic import conv2d as conv2d_generic from .generic import * @@ -49,6 +49,25 @@ def schedule_concatenate_arm_cpu(_, outs, target): return topi.arm_cpu.schedule_concatenate(outs) +@schedule_pool.register(["arm_cpu"]) 
+def schedule_pool_arm_cpu(attrs, outs, target): + """schedule pooling ops arm cpu""" + layout = attrs.layout + isa = arm_isa.IsaAnalyzer(target) + avg_pool = isinstance(attrs, relay.op.op_attrs.AvgPool2DAttrs) + with target: + if ( + avg_pool + and isa.has_dsp_support + and layout in ("NCW", "NCHW") + or not avg_pool + and isa.has_dsp_support + and layout in ("NWC", "NHWC") + ): + return topi.arm_cpu.schedule_pool(outs, layout) + return topi.generic.schedule_pool(outs, layout) + + @conv2d_strategy.register(["arm_cpu", "micro_dev"]) def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): """conv2d arm cpu strategy""" @@ -128,11 +147,11 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_hwcn.generic", ) elif layout == "NHWC": - if "SMLAD" in isa and kernel_layout == "HWOI": + if isa.has_dsp_support and kernel_layout == "HWOI": strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_direct_simd), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_direct_simd), - name="conv2d_nhwc_direct_simd.micro_dev", + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_dsp), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_dsp), + name="conv2d_nhwc_dsp.micro_dev", ) elif kernel_layout == "HWIO": is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() @@ -415,3 +434,67 @@ def schedule_bitserial_dense_arm_cpu(attrs, inputs, out_type, target): name="bitserial_dense.arm_cpu", ) return strategy + + +@dense_strategy.register(["arm_cpu"]) +def schedule_dense_arm_cpu(attrs, inputs, out_type, target): + """dense arm cpu strategy""" + strategy = _op.OpStrategy() + isa = arm_isa.IsaAnalyzer(target) + if isa.has_dsp_support: + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.arm_cpu.schedule_dense_dsp), + name="dense_dsp", + ) + else: + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense), + wrap_topi_schedule(topi.generic.schedule_dense), + name="dense.generic", + ) + return strategy + + +@conv1d_strategy.register("arm_cpu") +def conv1d_strategy_arm_cpu(attrs, inputs, out_type, target): + """conv1d strategy""" + strategy = _op.OpStrategy() + layout = attrs.data_layout + kernel_layout = attrs.kernel_layout + dilation = get_const_tuple(attrs.dilation) + if dilation[0] < 1: + raise ValueError("dilation should be a positive value") + + isa = arm_isa.IsaAnalyzer(target) + + if kernel_layout == "WOI": + if layout == "NWC" and isa.has_dsp_support: + strategy.add_implementation( + wrap_compute_conv1d(topi.arm_cpu.conv1d_nwc_dsp), + wrap_topi_schedule(topi.arm_cpu.schedule_conv1d_nwc_dsp), + name="conv1d_dsp", + ) + else: + raise RuntimeError( + "Unsupported kernel layout {} for conv1d {} for arm cpu.".format( + kernel_layout, layout + ) + ) + elif layout == "NCW": + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_ncw), + wrap_topi_schedule(topi.generic.schedule_conv1d_ncw), + name="conv1d_ncw.generic", + ) + elif layout == "NWC": + strategy.add_implementation( + wrap_compute_conv1d(topi.nn.conv1d_nwc), + wrap_topi_schedule(topi.generic.schedule_conv1d_nwc), + name="conv1d_nwc.generic", + ) + else: + raise RuntimeError( + "Unsupported kernel layout {} for conv1d {} for arm cpu.".format(kernel_layout, layout) + ) + return strategy diff --git a/python/tvm/target/arm_isa.py b/python/tvm/target/arm_isa.py index 60fc6593cb5f..a5ac9b1563a5 100644 --- a/python/tvm/target/arm_isa.py +++ b/python/tvm/target/arm_isa.py @@ -16,18 +16,24 @@ # under the License. 
"""Defines functions to analyze available opcodes in the ARM ISA.""" +import tvm.target -ARM_ISA_MAP = { - "armv7e-m": ["SMLAD"], -} + +ARM_MPROFILE_DSP_SUPPORT_LIST = [ + "cortex-m7", + "cortex-m4", + "cortex-m33", + "cortex-m35p", + "cortex-m55", +] class IsaAnalyzer(object): + """Checks ISA support for given target""" + def __init__(self, target): - self.target = target - # TODO: actually parse -mcpu - arch = "armv7e-m" - self._isa_map = ARM_ISA_MAP[arch] + self.target = tvm.target.Target(target) - def __contains__(self, instruction): - return instruction in self._isa_map + @property + def has_dsp_support(self): + return self.target.mcpu is not None and self.target.mcpu in ARM_MPROFILE_DSP_SUPPORT_LIST diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index c0decb7747bd..e90bd5e6dbf5 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -49,6 +49,7 @@ "llvm": "mark a test as requiring llvm", "ethosn": "mark a test as requiring ethosn", "hexagon": "mark a test as requiring hexagon", + "corstone300": "mark a test as requiring Corstone300 FVP", } diff --git a/python/tvm/testing/utils.py b/python/tvm/testing/utils.py index 4188fea7500c..768705a40a25 100644 --- a/python/tvm/testing/utils.py +++ b/python/tvm/testing/utils.py @@ -674,6 +674,18 @@ def requires_opencl(*args): return _compose(args, _requires_opencl) +def requires_corstone300(*args): + """Mark a test as requiring the corstone300 FVP + + Parameters + ---------- + f : function + Function to mark + """ + _requires_corstone300 = [pytest.mark.corstone300] + return _compose(args, _requires_corstone300) + + def requires_rocm(*args): """Mark a test as requiring the rocm runtime. diff --git a/python/tvm/topi/arm_cpu/__init__.py b/python/tvm/topi/arm_cpu/__init__.py index 9e2057a7126f..20f92a8895dd 100644 --- a/python/tvm/topi/arm_cpu/__init__.py +++ b/python/tvm/topi/arm_cpu/__init__.py @@ -17,6 +17,7 @@ # pylint: disable=wildcard-import """Schedule for ARM CPU""" +from .conv1d import * from .conv2d import * from .depthwise_conv2d import * from .conv2d_transpose import * @@ -25,5 +26,6 @@ from .bitserial_conv2d import * from .bitserial_dense import * from .injective import * -from . import cortex_m7 from .group_conv2d import * +from .pooling import * +from .dense import * diff --git a/python/tvm/topi/arm_cpu/conv1d.py b/python/tvm/topi/arm_cpu/conv1d.py new file mode 100644 index 000000000000..54a6968777e7 --- /dev/null +++ b/python/tvm/topi/arm_cpu/conv1d.py @@ -0,0 +1,37 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel +"""Conv1D schedule for ARM CPU""" +from __future__ import absolute_import as _abs + +from tvm import autotvm + +from .mprofile.dsp.conv1d import ( + conv1d_nwc_dsp_compute, + conv1d_nwc_dsp_schedule, +) + + +@autotvm.register_topi_compute("conv1d_nwc_dsp.arm_cpu") +def conv1d_nwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv1d with v7e-m DSP instructions.""" + return conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype) + + +@autotvm.register_topi_schedule("conv1d_nwc_dsp.arm_cpu") +def schedule_conv1d_nwc_dsp(cfg, outs): + return conv1d_nwc_dsp_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/conv2d.py b/python/tvm/topi/arm_cpu/conv2d.py index 0500eb55996c..ab489161a8fa 100644 --- a/python/tvm/topi/arm_cpu/conv2d.py +++ b/python/tvm/topi/arm_cpu/conv2d.py @@ -33,7 +33,10 @@ schedule_conv2d_spatial_pack_nchw, schedule_conv2d_spatial_pack_nhwc, ) -from .cortex_m7.conv2d import direct_simd +from .mprofile.dsp.conv2d import ( + conv2d_nhwc_dsp_compute, + conv2d_nhwc_dsp_schedule, +) @autotvm.register_topi_compute("conv2d_nchw_spatial_pack.arm_cpu") @@ -505,15 +508,13 @@ def _callback(op): return s -@autotvm.register_topi_compute("conv2d_nhwc_direct_simd.arm_cpu") -def conv2d_nhwc_direct_simd(cfg, data, kernel, strides, padding, dilation, out_dtype): - """Compute conv2d_nhwc with SIMD (v7e-m).""" - return direct_simd.conv2d_nhwc_direct_simd_compute( - cfg, data, kernel, strides, padding, dilation, out_dtype - ) +@autotvm.register_topi_compute("conv2d_nhwc_dsp.arm_cpu") +def conv2d_nhwc_dsp(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d_nhwc with v7e-m DSP instructions.""" + return conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype) -@autotvm.register_topi_schedule("conv2d_nhwc_direct_simd.arm_cpu") -def schedule_conv2d_nhwc_direct_simd(cfg, outs): - """Create schedule for conv2d_nhwc_direct_simd""" - return direct_simd.conv2d_nhwc_direct_simd_schedule(cfg, outs) +@autotvm.register_topi_schedule("conv2d_nhwc_dsp.arm_cpu") +def schedule_conv2d_nhwc_dsp(cfg, outs): + """Create schedule for conv2d_nhwc_dsp""" + return conv2d_nhwc_dsp_schedule(cfg, outs) diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py b/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py deleted file mode 100644 index 4f721da5420c..000000000000 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct.py +++ /dev/null @@ -1,186 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Direct implementation of conv2d.""" - -import tvm -from tvm import autotvm -from tvm.autotvm.task import deserialize_args -from tvm.topi.nn.conv2d import conv2d_nchw, conv2d_nhwc -from tvm.topi.utils import get_const_tuple, get_const_int, traverse_inline - - -def conv2d_direct(*args, **kwargs): - """Schedule function for directly-scheduled conv2d.""" - assert not kwargs, "Do not support kwargs in template function call" - args = deserialize_args(args) - data, kernel = args[:2] - layout = args[-2] - cfg = autotvm.get_config() - args = [cfg] + args - conv = conv2d_direct_compute(*args) - if layout == "NHWC": - sched = conv2d_direct_nhwc_schedule(cfg, [data, kernel, conv]) - elif layout == "NCHW": - sched = conv2d_direct_nchw_schedule(cfg, [data, kernel, conv]) - else: - raise RuntimeError(f'unsupported data layout "{layout}"') - return sched, [data, kernel, conv] - - -conv2d_direct.template_key = "direct" -conv2d_direct.default_data_layout = "NHWC" -conv2d_direct.default_kernel_layout = "HWIO" - - -@autotvm.register_topi_compute("conv2d_direct.micro_dev") -def conv2d_direct_compute(*args): - layout = args[-2] - if layout == "NHWC": - return _conv2d_direct_nhwc_compute(*args) - if layout == "NCHW": - return _conv2d_direct_nchw_compute(*args) - - raise RuntimeError(f'unsupported data layout "{layout}"') - - -def _conv2d_direct_nhwc_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - assert layout == "NHWC" - conv = conv2d_nhwc(data, kernel, strides, padding, dilation, out_dtype) - - # Config Space Definition - N, H, W, CI = get_const_tuple(data.shape) - KH, KW, _, CO = get_const_tuple(kernel.shape) - n, oh, ow, co = cfg.axis(N), cfg.axis(H), cfg.axis(W), cfg.axis(CO) - kh, kw, ci = cfg.reduce_axis(KH), cfg.reduce_axis(KW), cfg.reduce_axis(CI) - - # TODO should we add a max_factor attr to these splits? 
- co, vc = cfg.define_split("tile_co", co, num_outputs=2) - oh, vh = cfg.define_split("tile_oh", oh, num_outputs=2) - ow, vw = cfg.define_split("tile_ow", ow, num_outputs=2) - - cfg.define_reorder( - "reorder_0", - [n, co, oh, ow, ci, kh, kw, vh, vw, vc], - policy="candidate", - candidate=[ - [n, co, oh, ow, ci, kh, kw, vh, vw, vc], - [n, co, oh, ow, ci, kh, kw, vc, vh, vw], - [n, co, oh, ow, ci, vh, vw, vc, kh, kw], - [n, co, oh, ow, ci, vc, vh, vw, kh, kw], - ], - ) - - cfg.define_annotate("ann_reduce", [kh, kw], policy="try_unroll") - cfg.define_annotate("ann_spatial", [vh, vw, vc], policy="try_unroll") - - cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) - cfg.define_knob("unroll_explicit", [0, 1]) - - return conv - - -def _conv2d_direct_nchw_compute(cfg, data, kernel, strides, padding, dilation, layout, out_dtype): - assert layout == "NCHW" - conv = conv2d_nchw(data, kernel, strides, padding, dilation, out_dtype) - - ########################### - # Config Space Definition # - ########################### - cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) - cfg.define_knob("unroll_explicit", [0, 1]) - - return conv - - -@autotvm.register_topi_schedule("conv2d_direct_nhwc.micro_dev") -def conv2d_direct_nhwc_schedule(cfg, outs): - """Schedule function for directly-scheduled conv2d on NHWC layout.""" - sched = tvm.create_schedule([x.op for x in outs]) - - def _callback(op): - if "conv2d_nhwc" not in op.tag: - return - - ### extract tensors ### - output = op.output(0) - conv = op - data_vec = conv.input_tensors[0] - kernel = conv.input_tensors[1] # pylint: disable=unused-variable - last = outs[0] # pylint: disable=unused-variable - - # tile reduction axes - n, oh, ow, co = sched[conv].op.axis - kh, kw, ci = sched[conv].op.reduce_axis - # NOTE we can't inline data padding in the SIMD path, because it - # introduces conditionals in the inner loop. 
- data_pad = data_vec.op - sched[data_pad].compute_inline() - - co, vc = cfg["tile_co"].apply(sched, conv, co) - oh, vh = cfg["tile_oh"].apply(sched, conv, oh) - ow, vw = cfg["tile_ow"].apply(sched, conv, ow) - cfg["reorder_0"].apply(sched, conv, [n, co, oh, ow, ci, kh, kw, vh, vw, vc]) - cfg["ann_reduce"].apply( - sched, - conv, - [kh, kw], - axis_lens=[get_const_int(kh.dom.extent), get_const_int(kw.dom.extent)], - max_unroll=8, - cfg=cfg, - ) - cfg["ann_spatial"].apply( - sched, - conv, - [vh, vw, vc], - axis_lens=[cfg["tile_oh"].size[-1], cfg["tile_ow"].size[-1], cfg["tile_co"].size[-1]], - max_unroll=8, - cfg=cfg, - ) - - kernel_scope = n # this is the scope to attach global config inside this kernel - - # tune unroll - sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - traverse_inline(sched, outs[-1].op, _callback) - return sched - - -@autotvm.register_topi_schedule("conv2d_direct_nchw.micro_dev") -def conv2d_direct_nchw_schedule(cfg, outs): - """Schedule function for Cortex-M7 direct implementation of conv2d.""" - # use default schedule - sched = tvm.create_schedule([x.op for x in outs]) - - conv = outs[-1].op - output = conv.output(0) - data_vec = conv.input_tensors[0] - data_pad = data_vec.op - sched[data_pad].compute_inline() - - # TODO add more schedule opts (similar to the NHWC template) - - n, _, _, _ = sched[conv].op.axis - kernel_scope = n # this is the scope to attach global config inside this kernel - - # tune unroll - sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - return sched diff --git a/python/tvm/topi/arm_cpu/dense.py b/python/tvm/topi/arm_cpu/dense.py new file mode 100644 index 000000000000..f2e2eb6288fb --- /dev/null +++ b/python/tvm/topi/arm_cpu/dense.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable, no-else-return, unused-argument, import-outside-toplevel +"""Dense schedule for ARM CPU""" + +from .mprofile.dsp.dense import dense_dsp_schedule + + +def schedule_dense_dsp(outs): + """Create schedule for dense_dsp""" + return dense_dsp_schedule(outs) diff --git a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py b/python/tvm/topi/arm_cpu/mprofile/__init__.py similarity index 92% rename from python/tvm/topi/arm_cpu/cortex_m7/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/__init__.py index 631c5f7ff447..32ce4d3a5447 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/__init__.py +++ b/python/tvm/topi/arm_cpu/mprofile/__init__.py @@ -14,7 +14,4 @@ # KIND, either express or implied. 
See the License for the # specific language governing permissions and limitations # under the License. -"""Schedules specialized for cortex-m7.""" - - -from . import conv2d +"""Schedules specialized for cortex-m DSP instructions.""" diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py similarity index 100% rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/__init__.py diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py new file mode 100644 index 000000000000..521a58d0c1fc --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv1d.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of conv1d.""" +from tvm import autotvm +from tvm.autotvm.task import deserialize_args +from tvm import te +from tvm.topi.utils import simplify, traverse_inline +from tvm.topi.nn.pad import pad +from tvm.topi.nn.utils import get_pad_tuple1d +from tvm.tir.expr import Mul + +from .micro_kernel.gemm import ( + intrin_gemm_MxKxN, + gemm_MxKxN_impl, +) + + +def conv1d_nwc_dsp(*args, **kwargs): + """Defines the v7e-m DSP instructions of conv1d on NWC layout.""" + assert not kwargs, "Do not support kwargs in template function call" + args = deserialize_args(args) + data, kernel = args[:2] + layout = args[-2] + cfg = autotvm.get_config() + args = [cfg] + args + assert layout == "NWC" + conv = conv1d_nwc_dsp_compute(*args) + sched = conv1d_nwc_dsp_schedule(cfg, [data, kernel, conv]) + return sched, [data, kernel, conv] + + +conv1d_nwc_dsp.template_key = "dsp" +conv1d_nwc_dsp.default_data_layout = "NWC" +conv1d_nwc_dsp.default_kernel_layout = "WOI" + + +def conv1d_nwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for v7e-m DSP instructions of conv1d on NWC layout.""" + if isinstance(strides, (tuple, list)): + strides = strides[0] + if isinstance(dilation, (tuple, list)): + dilation = dilation[0] + + batch_size, data_width, in_channels = data.shape + kernel_size, out_channels, _ = kernel.shape + + # Compute the output shape + dilated_kernel_size = (kernel_size - 1) * dilation + 1 + pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,)) + out_channels = simplify(out_channels) + out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1) + + # Apply padding + pad_before = [0, pad_left, 0] + pad_after = [0, pad_right, 0] + padded_data = pad(data, pad_before, pad_after, name="padded_data") + + # Compute graph + rc = te.reduce_axis((0, in_channels), name="rc") + rw = te.reduce_axis((0, kernel_size), 
name="rw") + + conv = te.compute( + (batch_size, out_width, out_channels), + lambda b, w, c: te.sum( + padded_data[b, w * strides + rw * dilation, rc].astype(out_dtype) + * kernel[rw, c, rc].astype(out_dtype), + axis=[rw, rc], + ), + name="conv1d", + tag="conv1d_nwc", + ) + + ########################### + # Config Space Definition # + ########################### + n, ow, co = ( + cfg.axis(batch_size.value), + cfg.axis(out_width.value), + cfg.axis(out_channels.value), + ) + kw, ci = ( + cfg.reduce_axis(kernel_size.value), + cfg.reduce_axis(in_channels.value), + ) + + owo, owi = cfg.define_split("tile_ow", ow, policy="factors", num_outputs=2) + cio, cii = cfg.define_split( + "tile_ci", + ci, + policy="factors", + num_outputs=2, + # TODO: check case with in_channels.value % 4 != 0 with AutoTVM + filter=None if cfg.is_fallback else lambda x: x.size[-1] % 4 == 0, + ) + coo, coi = cfg.define_split("tile_co", co, policy="factors", num_outputs=2) + + cfg.define_reorder( + "reorder_0_simd", + [n, owo, owi, coo, coi, kw, cio, cii], + policy="candidate", + candidate=[ + [n, kw, owo, coo, cio, owi, coi, cii], + [n, kw, coo, owo, cio, owi, coi, cii], + [n, kw, owo, coo, cio, owi, coi, cii], + [n, kw, coo, owo, cio, owi, coi, cii], + ], + ) + + cfg.define_knob("auto_unroll_max_step", [0, 2, 4, 8, 16, 32]) + cfg.define_knob("unroll_explicit", [0, 1]) + + if cfg.is_fallback: + cfg.fallback_split("tile_ow", [-1, out_width.value]) + cfg.fallback_split("tile_ci", [-1, in_channels.value]) + cfg.fallback_split("tile_co", [-1, out_channels.value]) + + return conv + + +def conv1d_nwc_dsp_schedule(cfg, outs): + """Schedule function for v7e-m DSP instructions of conv1d on NWC layout.""" + sched = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "conv1d_nwc" not in op.tag: + return + + # extract tensors + output = op.output(0) + conv = op + data_vec = conv.input_tensors[0] + + source_index_w = output.op.body[0].source[0].a.value.indices[1].a + stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1 + + # tile reduction axes + n, ow, co = sched[conv].op.axis + kw, ci = sched[conv].op.reduce_axis + + M = cfg["tile_ow"].size[-1] + K = cfg["tile_ci"].size[-1] + N = cfg["tile_co"].size[-1] + + owo, owi = cfg["tile_ow"].apply(sched, conv, ow) + cio, cii = cfg["tile_ci"].apply(sched, conv, ci) + coo, coi = cfg["tile_co"].apply(sched, conv, co) + + cfg["reorder_0_simd"].apply(sched, conv, [n, owo, owi, coo, coi, kw, cio, cii]) + + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w) + sched[output].tensorize(owi, gemm) + sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) + + # this is the scope to attach global config inside this kernel + kernel_scope = n + + # tune unroll + sched[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + sched[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + + traverse_inline(sched, outs[-1].op, _callback) + return sched diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py similarity index 88% rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py index 5ef9fd813eb2..470d46b92a7a 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/direct_simd.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/conv2d.py @@ -23,15 +23,16 @@ from tvm.topi.utils import simplify, traverse_inline from tvm.topi.nn.pad import pad from 
tvm.topi.nn.utils import get_pad_tuple +from tvm.tir.expr import Mul -from ..micro_kernel.gemm import ( +from .micro_kernel.gemm import ( intrin_gemm_MxKxN, gemm_MxKxN_impl, ) -def conv2d_nhwc_direct_simd(*args, **kwargs): - """Defines the Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp(*args, **kwargs): + """Defines the v7e-m DSP instructions of conv2d.""" assert not kwargs, "Do not support kwargs in template function call" args = deserialize_args(args) data, kernel = args[:2] @@ -39,18 +40,18 @@ def conv2d_nhwc_direct_simd(*args, **kwargs): cfg = autotvm.get_config() args = [cfg] + args assert layout == "NHWC" - conv = conv2d_nhwc_direct_simd_compute(*args) - sched = conv2d_nhwc_direct_simd_schedule(cfg, [data, kernel, conv]) + conv = conv2d_nhwc_dsp_compute(*args) + sched = conv2d_nhwc_dsp_schedule(cfg, [data, kernel, conv]) return sched, [data, kernel, conv] -conv2d_nhwc_direct_simd.template_key = "direct_simd" -conv2d_nhwc_direct_simd.default_data_layout = "NHWC" -conv2d_nhwc_direct_simd.default_kernel_layout = "HWOI" +conv2d_nhwc_dsp.template_key = "dsp" +conv2d_nhwc_dsp.default_data_layout = "NHWC" +conv2d_nhwc_dsp.default_kernel_layout = "HWOI" -def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): - """Compute function for Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp_compute(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute function for v7e-m DSP instructions of conv2d.""" assert isinstance(strides, int) or len(strides) == 2 assert isinstance(dilation, int) or len(dilation) == 2 @@ -146,8 +147,8 @@ def conv2d_nhwc_direct_simd_compute(cfg, data, kernel, strides, padding, dilatio return conv -def conv2d_nhwc_direct_simd_schedule(cfg, outs): - """Schedule function for Cortex-M7 SIMD implementation of conv2d.""" +def conv2d_nhwc_dsp_schedule(cfg, outs): + """Schedule function for v7e-m DSP instructions of conv2d.""" sched = te.create_schedule([x.op for x in outs]) def _callback(op): @@ -161,6 +162,9 @@ def _callback(op): kernel = conv.input_tensors[1] # pylint: disable=unused-variable last = outs[0] # pylint: disable=unused-variable + source_index_w = output.op.body[0].source[0].a.value.indices[2].a + stride_w = source_index_w.b.value if isinstance(source_index_w, Mul) else 1 + # tile reduction axes n, oh, ow, co = sched[conv].op.axis kh, kw, ci = sched[conv].op.reduce_axis @@ -175,7 +179,7 @@ def _callback(op): cfg["reorder_0_simd"].apply(sched, conv, [n, oh, owo, owi, coo, coi, kh, kw, cio, cii]) - gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype) + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype, stride_w) sched[output].tensorize(owi, gemm) sched[output].pragma(n, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py new file mode 100644 index 000000000000..20dfb09f2dd3 --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/dense.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of dense.""" + +from tvm import te +from tvm.topi.utils import traverse_inline + +from .micro_kernel.gemm import ( + intrin_gemm_MxKxN, + gemm_MxKxN_impl, +) + + +def dense_dsp_schedule(outs): + """Schedule function for v7e-m DSP instructions of dense.""" + sched = te.create_schedule([x.op for x in outs]) + + def _callback(op): + if "dense" not in op.tag: + return + + # extract tensors + output = op.output(0) + dense = op + data_vec = dense.input_tensors[0] + M, K = data_vec.shape + N, _ = dense.input_tensors[1].shape + + n, _ = sched[dense].op.axis + no, ni = sched[dense].split(n, nparts=1) + + gemm, uniq_id = intrin_gemm_MxKxN(M, K, N, data_vec.dtype, output.dtype) + sched[output].tensorize(ni, gemm) + sched[output].pragma(no, "import_c", gemm_MxKxN_impl(M, K, N, uniq_id)) + + traverse_inline(sched, outs[-1].op, _callback) + return sched diff --git a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py similarity index 91% rename from python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py index cc4faf97b126..13a83393a912 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/conv2d/__init__.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/__init__.py @@ -14,6 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Conv2d implementations for cortex-m7.""" - -from . import direct_simd diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py new file mode 100644 index 000000000000..786ac2607b7f --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/avg_pool.py @@ -0,0 +1,146 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Defines sum intrinsics for sum operation with v7e-m DSP instructions.""" + +import random +import string + +import tvm +from tvm import te +from . 
import common + + +def intrin_sum(shape, in_dtype, out_dtype, reset=False): + """Defines a v7e-m DSP-accelerated sum operation.""" + UNIQ_ID_LEN = 8 + uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN)) + func_prefix = "sum16" + + assert in_dtype == "int16" + assert out_dtype == "int16" + + width = shape[-1] + x = te.placeholder(shape, name="x", dtype=in_dtype) + k = te.reduce_axis((0, width), name="rc") + + def get_slice(indices, k): + s = list(indices) + s[-1] = s[-1] + k + return tuple(s) + + z = te.compute( + (1,) * len(shape), lambda *i: te.sum(x[get_slice(i, k)], axis=[k]).astype(out_dtype) + ) + + def _intrin_func(ins, outs): + aa = ins[0] + cc = outs[0] + + def _body(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, + f"{func_prefix}_{width}_{uniq_id}", + aa.access_ptr("r"), + cc.access_ptr("w"), + aa.elem_offset, + 1 if reset else 0, + ) + ) + return ib.get() + + def _reduce_reset(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern(cc.dtype, f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w")) + ) + return ib.get() + + def _reduce_update(): + return _body() + + return _body(), _reduce_reset(), _reduce_update() + + binds = { + t: tvm.tir.decl_buffer( + t.shape, + t.dtype, + t.op.name, + strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))], + offset_factor=1, + ) + for t in [x, z] + } + + intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds) + return intrin_decl, uniq_id + + +def sum_impl(N, uniq_id): + """Emit C code for sum impl.""" + cc_code = ( + common.common_includes + + f""" + +#ifdef __cplusplus +extern "C" +#endif // __cplusplus +__STATIC_FORCEINLINE int32_t sum16_reset_{uniq_id}( + int16_t *res) {{ + *res = (int16_t)0; + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t sum16_{N}_{uniq_id}( + int16_t *arr, + int16_t *res16, + long arr_offset, + int reset) {{ + int n; + int32_t *p32; + int32_t res = reset ? 0 : *res16; + + if ( arr_offset % 4 != 0 ) {{ + res += *arr; + p32 = (int32_t *)(&arr[1]); + n = {N} - 1; + }} else {{ + p32 = (int32_t *)arr; + n = {N}; + }} + + for ( int i = 0; i < n / 2; ++ i ) {{ + res = __SMLAD(*p32, 0x00010001, res); + ++ p32; + }} + + if ( n % 2 != 0 ) + res += *(int16_t *)p32; + + *res16 = res; + + return 0; +}} + +""" + ) + return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py new file mode 100644 index 000000000000..a37b297ffd45 --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/common.py @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name, no-value-for-parameter +"""Defines common C code for all microkernel operations.""" + + +common_includes = """ + +#include +#include +#include + +#include +#include + +#include + +""" diff --git a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py similarity index 89% rename from python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py rename to python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py index 9a00fe272087..ffc48eaabd59 100644 --- a/python/tvm/topi/arm_cpu/cortex_m7/micro_kernel/gemm.py +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/gemm.py @@ -15,21 +15,23 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name, no-value-for-parameter -"""Defines gemm intrinsics for SIMD matrix multiplication.""" +"""Defines gemm intrinsics for matrix multiplication with v7e-m DSP instructions.""" import random import string import tvm from tvm import te +from . import common + ########################## # MxKxN MatMul Intrinsic # ########################## # NOTE this is transposed matmul (A * B^T) -def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): - """Defines a SIMD-accelerated transposed matmul.""" +def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype, stride_w=1): + """Defines a v7e-m DSP-accelerated transposed matmul.""" # we generate a unique ID for every intrinsic definition, to prevent name # collisions in the generated source (e.g., if there are multiple operators # in the same module that use the same intrinsic) @@ -49,12 +51,14 @@ def intrin_gemm_MxKxN(M, K, N, in_dtype, out_dtype): # TODO(weberlo, areusch): support more dtypes? assert in_dtype in ("int8", "int16") assert out_dtype == "int32" - A = te.placeholder((M, K), name="a", dtype=in_dtype) + A = te.placeholder((M * stride_w - (stride_w - 1), K), name="a", dtype=in_dtype) B = te.placeholder((N, K), name="b", dtype=in_dtype) k = te.reduce_axis((0, K), name="k") C = te.compute( (M, N), - lambda i, j: te.sum(A[i, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k), + lambda i, j: te.sum( + A[i * stride_w, k].astype(out_dtype) * B[j, k].astype(out_dtype), axis=k + ), name="c", ) A_buf = tvm.tir.decl_buffer( @@ -81,7 +85,7 @@ def _reduce_update(): aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), - aa.strides[0], + aa.strides[0] * stride_w, bb.strides[0], cc.strides[0], ) @@ -106,7 +110,7 @@ def _body(): aa.access_ptr("r"), bb.access_ptr("r"), cc.access_ptr("w"), - aa.strides[0], + aa.strides[0] * stride_w, bb.strides[0], cc.strides[0], ) @@ -125,12 +129,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): # aa_pad_size = M * K bb_pad_size = N * K # code reference: CMSIS-NN paper (https://arxiv.org/abs/1801.06601) - cc_code = f""" -#ifdef __cplusplus -extern "C" -#endif -#include -#include + cc_code = ( + common.common_includes + + f""" + #ifdef __cplusplus extern "C" @@ -203,9 +205,12 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; + int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) - return gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + if ( {M} < 16 || {N} < 16 ) {{ + retcode = gemm_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {N}; i++) for (int j = 0; j < {K} / 4; j++) @@ -234,10 +239,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 4 != 0 ) 
gemm_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - #ifdef __cplusplus extern "C" #endif @@ -306,9 +311,12 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): int8_t *aa, int8_t *bb, int32_t *cc, int A_stride, int B_stride, int C_stride) {{ int16_t bb_pad[{bb_pad_size}]; + int32_t retcode = 0; - if ( {M} < 16 || {N} < 16 ) - return gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + if ( {M} < 16 || {N} < 16 ) {{ + retcode = gemm_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {N}; i++) for (int j = 0; j < {K} / 4; j++) @@ -334,11 +342,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 4 != 0 ) gemm_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - - #ifdef __cplusplus extern "C" #endif @@ -383,15 +390,24 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #endif __STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_body_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, - int A_stride, int B_stride, int C_stride) {{ - if ( {M} < 2 || {N} < 2 ) - return gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + int A_stride, int B_stride, int C_stride) {{ + int32_t retcode = 0; + + if ( {M} < 2 || {N} < 2 ) {{ + retcode = gemm16_{M}x{K}x{N}_body_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} + + if(((uint32_t)aa & 0x3) != 0 || ((uint32_t)bb & 0x3) != 0){{ + retcode = kTvmErrorFunctionCallInvalidArg; + goto out; + }} for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ int32_t *aa_ptr = (int32_t *) &aa[i*A_stride]; int32_t *bb_ptr = (int32_t *) &bb[j*B_stride]; - + int32_t sum = 0; for (int l = 0; l < {K} / 2; l++) {{ sum = __SMLAD(*aa_ptr, *bb_ptr, sum); @@ -407,10 +423,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 2 != 0 ) gemm16_{M}x{N}_body_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - #ifdef __cplusplus extern "C" #endif @@ -452,9 +468,13 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): #endif __STATIC_FORCEINLINE int32_t gemm16_{M}x{K}x{N}_update_{uniq_id}( int16_t *aa, int16_t *bb, int32_t *cc, - int A_stride, int B_stride, int C_stride) {{ - if ( {M} < 2 || {N} < 2 ) - return gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + int A_stride, int B_stride, int C_stride) {{ + int32_t retcode = 0; + + if ( {M} < 2 || {N} < 2 ) {{ + retcode = gemm16_{M}x{K}x{N}_update_loop_{uniq_id}(aa, bb, cc, A_stride, B_stride, C_stride); + goto out; + }} for (int i = 0; i < {M}; i++) {{ for (int j = 0; j < {N}; j++) {{ @@ -473,11 +493,10 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): if ( {K} % 2 != 0 ) gemm16_{M}x{N}_update_rest_{uniq_id}({K}, aa, bb, cc, A_stride, B_stride, C_stride); - return 0; +out: + return retcode; }} - - #ifdef __cplusplus extern "C" #endif @@ -489,5 +508,7 @@ def gemm_MxKxN_impl(M, K, N, uniq_id): }} return 0; }} - """ + +""" + ) return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py new file mode 100644 index 000000000000..4d410427c0cc --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/micro_kernel/max_pool.py @@ -0,0 +1,165 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Defines max intrinsics for elemwise max operation with v7e-m DSP instructions.""" + +import random +import string + +import tvm +from tvm import te +from . import common + + +def intrin_max(shape, in_dtype, out_dtype): + """Defines a v7e-m DSP-accelerated max pool.""" + UNIQ_ID_LEN = 8 + uniq_id = "".join(random.choices(string.ascii_uppercase, k=UNIQ_ID_LEN)) + func_prefix = "max8" + + assert in_dtype == "int8" + assert out_dtype == "int8" + + x = te.placeholder(shape, name="x", dtype=in_dtype) + k = te.reduce_axis((0, 1), name="rc") + z = te.compute(shape, lambda *i: tvm.tir.max(x[i], axis=[k]).astype(out_dtype)) + + def _intrin_func(ins, outs): + aa = ins[0] + cc = outs[0] + + def _body(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, + f"{func_prefix}_{uniq_id}", + aa.access_ptr("r"), + cc.access_ptr("w"), + cc.strides[0], + ) + ) + return ib.get() + + def _reduce_reset(): + ib = tvm.tir.ir_builder.create() + ib.emit( + tvm.tir.call_extern( + cc.dtype, f"{func_prefix}_reset_{uniq_id}", cc.access_ptr("w"), cc.strides[0] + ) + ) + return ib.get() + + def _reduce_update(): + return _body() + + return _body(), _reduce_reset(), _reduce_update() + + binds = { + t: tvm.tir.decl_buffer( + t.shape, + t.dtype, + t.op.name, + strides=[te.var(f"{t.op.name}_s_{i}") for i in range(0, len(t.shape))], + offset_factor=1, + ) + for t in [x, z] + } + + intrin_decl = te.decl_tensor_intrin(z.op, _intrin_func, binds=binds) + return intrin_decl, uniq_id + + +def max_impl(uniq_id): + """Emit C code for pool impl.""" + cc_code = ( + common.common_includes + + f""" + + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_reset_{uniq_id}( + int8_t *res, + int N) {{ + memset(res, (int8_t)-128, N * sizeof(*res)); + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_loop_{uniq_id}( + int8_t *arg, + int8_t *res, + int N) {{ + for ( int i = 0; i < N; ++ i ) + if ( arg[i] > res[i] ) + res[i] = arg[i]; + return 0; +}} + +#ifdef __cplusplus +extern "C" +#endif +__STATIC_FORCEINLINE int32_t max8_{uniq_id}( + int8_t *arg, + int8_t *res, + int N) {{ + int32_t *parg32, *pres32; + int una_arg = (int32_t)arg & 0x3, una_res = (int32_t)res & 0x3; + int32_t retcode = 0; + + if ( N < 4 || ((una_arg || una_res) && una_arg != una_res) ) {{ + retcode = max8_loop_{uniq_id}(arg, res, N); + goto out; + }} + if ( una_arg ) {{ + int n = (4 - una_arg); + if ( n > N || (N - n) < 4 ) + n = N; + retcode = max8_loop_{uniq_id}(arg, res, n); + N -= n; + if ( N == 0 ) + goto out; + arg += n; res += n; + }} + + parg32 = (int32_t *)arg; + pres32 = (int32_t *)res; + + for ( int i = 0; i < N / 4; ++ i ) {{ + int32_t arg32 = *parg32 ++; + int32_t res32 = *pres32; + __SSUB8(arg32, res32); + res32 = __SEL(arg32, res32); + 
*pres32 ++ = res32; + }} + + if ( N & 0x3 ) {{ + retcode = max8_loop_{uniq_id}((int8_t *)parg32, (int8_t *)pres32, N & 0x3); + goto out; + }} + +out: + return retcode; +}} + +""" + ) + return cc_code diff --git a/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py new file mode 100644 index 000000000000..99470a28530a --- /dev/null +++ b/python/tvm/topi/arm_cpu/mprofile/dsp/pool.py @@ -0,0 +1,125 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, no-value-for-parameter +"""Direct implementation of pool.""" +import logging + +import tvm + +from tvm import te +from tvm.topi.utils import traverse_inline + +from .micro_kernel.max_pool import ( + intrin_max, + max_impl, +) + +from .micro_kernel.avg_pool import ( + intrin_sum, + sum_impl, +) + +logger = logging.getLogger("topi") + + +def schedule_maxpool_1d_nwc(s, op): + """Schedule function for v7e-m DSP instructions of maxpool 1d NWC layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + channels = data_vec.shape[-1] + if isinstance(channels, tvm.tir.IntImm): + channels = channels.value + + n, w, c = s[op].op.axis + (k,) = s[op].op.reduce_axis + + s[op].reorder(n, w, k, c) + max_val, uniq_id = intrin_max((1, 1, channels), data_vec.dtype, output.dtype) + s[op].tensorize(c, max_val) + s[output].pragma(n, "import_c", max_impl(uniq_id)) + + +def schedule_maxpool_2d_nhwc(s, op): + """Schedule function for v7e-m DSP instructions of maxpool 2d NHWC layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + channels = data_vec.shape[-1] + if isinstance(channels, tvm.tir.IntImm): + channels = channels.value + + n, h, w, c = s[op].op.axis + ko, ki = s[op].op.reduce_axis + + s[op].reorder(n, h, w, ko, ki, c) + max_val, uniq_id = intrin_max((1, 1, 1, channels), data_vec.dtype, output.dtype) + s[op].tensorize(c, max_val) + s[output].pragma(n, "import_c", max_impl(uniq_id)) + + +def schedule_avgpool_1d_ncw(s, op): + """Schedule function for v7e-m DSP instructions of avgpool 1d NCW layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + n, _, _ = s[op].op.axis + (k,) = s[op].op.reduce_axis + pool_w = k.dom.extent.value + + summary, uniq_id = intrin_sum((1, 1, pool_w), data_vec.dtype, output.dtype, reset=True) + s[op].tensorize(k, summary) + s[output].pragma(n, "import_c", sum_impl(pool_w, uniq_id)) + + +def schedule_avgpool_2d_nchw(s, op): + """Schedule function for v7e-m DSP instructions of avgpool 2d NCHW layout.""" + output = op.output(0) + data_vec = op.input_tensors[0] + + n, _, _, _ = s[op].op.axis + _, ki = s[op].op.reduce_axis + pool_w = ki.dom.extent.value + + summary, uniq_id = intrin_sum((1, 1, 1, pool_w), data_vec.dtype, output.dtype) + s[op].tensorize(ki, summary) + s[output].pragma(n, "import_c", 
sum_impl(pool_w, uniq_id)) + + +def pool_dsp_schedule(outs, layout): + """Schedule function for v7e-m DSP instructions of pooling.""" + s = te.create_schedule([x.op for x in outs]) + + def _callback(op): + in_dtype = op.input_tensors[0].dtype + if "pool_max" in op.tag: + if in_dtype != "int8": + logger.warning("Does not have micro-kernel for %s maxpool.", in_dtype) + elif layout == "NWC": + schedule_maxpool_1d_nwc(s, op) + elif layout == "NHWC": + schedule_maxpool_2d_nhwc(s, op) + elif "pool_sum" in op.tag: + if in_dtype != "int16": + logger.warning("Does not have micro-kernel for %s avgpool.", in_dtype) + elif layout == "NCW": + schedule_avgpool_1d_ncw(s, op) + elif layout == "NCHW": + schedule_avgpool_2d_nchw(s, op) + + traverse_inline(s, outs[-1].op, _callback) + return s diff --git a/python/tvm/topi/arm_cpu/pooling.py b/python/tvm/topi/arm_cpu/pooling.py new file mode 100644 index 000000000000..f09f0089342d --- /dev/null +++ b/python/tvm/topi/arm_cpu/pooling.py @@ -0,0 +1,25 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-variable +"""Schedule for pooling operators""" + +from .mprofile.dsp.pool import pool_dsp_schedule + + +def schedule_pool(outs, layout): + """Create schedule for avgpool/maxpool with dsp""" + return pool_dsp_schedule(outs, layout) diff --git a/tests/python/conftest.py b/tests/python/conftest.py index ab3ea4e4ec06..0dbb3dcc79e8 100644 --- a/tests/python/conftest.py +++ b/tests/python/conftest.py @@ -17,6 +17,7 @@ import sys import tvm +import pytest collect_ignore = [] if sys.platform.startswith("win"): @@ -37,3 +38,23 @@ # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored collect_ignore.append("unittest/test_tir_intrin.py") + + +def pytest_addoption(parser): + parser.addoption( + "--enable-corstone300-tests", + action="store_true", + default=False, + help="Run Corstone-300 FVP tests", + ) + + +def pytest_collection_modifyitems(config, items): + if not config.getoption("--enable-corstone300-tests"): + for item in items: + if "corstone300" in item.keywords: + item.add_marker( + pytest.mark.skip( + reason="Need --enable-corstone300-tests option to run this test" + ) + ) diff --git a/tests/python/integration/test_arm_mprofile_dsp.py b/tests/python/integration/test_arm_mprofile_dsp.py new file mode 100644 index 000000000000..cdafa91f42f9 --- /dev/null +++ b/tests/python/integration/test_arm_mprofile_dsp.py @@ -0,0 +1,355 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import sys +import numpy as np +import pytest +import tvm +from tvm import relay +from tests.python.relay.aot.aot_test_utils import ( + AOTTestModel, + AOT_CORSTONE300_RUNNER, + generate_ref_data, + compile_and_run, +) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation", + [ + ((1, 32, 32, 1), (3, 3), 12, 1, 0, 1), + ((1, 32, 10, 3), (3, 3), 16, 1, 0, 1), + ((1, 49, 10, 1), (10, 4), 64, (2, 1), (4, 1, 5, 1), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, 0, 1), + ((1, 32, 32, 16), (3, 3), 16, 1, (0, 2, 2, 0), 2), + ((1, 32, 32, 16), (3, 3), 16, 1, (1, 1, 2, 2), 2), + # bug https://github.com/apache/tvm/issues/9226 + ((1, 49, 10, 1), (10, 4), 64, (2, 2), (4, 1, 5, 1), 1), + # from Visual Wake Word model + ((1, 96, 96, 3), (3, 3), 8, (2, 2), (0, 0, 1, 1), 1), + # from Image Classification model (one of the MLPerfTiny models) + ((1, 16, 16, 32), (1, 1), 64, (2, 2), 0, 1), + ((4, 16, 16, 8), (5, 5), 8, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 16, 2, (0, 4, 4, 0), 1), + ((4, 16, 16, 8), (5, 5), 8, 2, 0, 1), + ((4, 16, 16, 8), (5, 5), 16, 2, 0, 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 0, 1, 1), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (1, 1, 2, 2), 1), + ((1, 16, 16, 8), (5, 5), 16, 2, (3, 3, 2, 2), 1), + ((1, 16, 16, 8), (3, 3), 16, 2, (0, 1, 2, 3), 1), + ], +) +@pytest.mark.parametrize("dtype", ["int8", "int16"]) +def test_conv2d(data_shape_nhwc, kernel_size, num_filter, strides, padding, dilation, dtype): + """Test a subgraph with a single conv2d operator.""" + ishape = data_shape_nhwc + wshape = (*kernel_size, data_shape_nhwc[-1], num_filter) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv2d( + input0, + weight0, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout="HWIO", + out_dtype="int32", + out_layout="NHWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(np.moveaxis(weight_data, 2, -1)) + out1 = relay.op.nn.conv2d( + input1, + weight1, + kernel_size=kernel_size, + strides=strides, + padding=padding, + dilation=(dilation, dilation), + data_layout="NHWC", + kernel_layout="HWOI", + out_dtype="int32", + out_layout="NHWC", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + 
target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nwc, kernel_size, num_filter, strides, padding", + [ + ((1, 32, 12), 3, 16, 1, 0), + ((3, 12, 10), 4, 24, 1, 0), + ((1, 7, 7), 3, 5, 1, 0), + ((1, 10, 2), 4, 4, 2, (1, 1)), + ((1, 20, 2), 4, 4, 2, (0, 1)), + ((1, 16, 4), 1, 12, 1, (1, 0)), + ((1, 24, 16), 1, 32, 3, (2, 2)), + ], +) +@pytest.mark.parametrize("dtype", ["int8", "int16"]) +def test_conv1d(data_shape_nwc, kernel_size, num_filter, strides, padding, dtype): + """Test a subgraph with a single conv1d operator.""" + ishape = data_shape_nwc + wshape = (kernel_size, data_shape_nwc[-1], num_filter) + + weight_data = np.random.randint(low=-10, high=10, size=wshape, dtype=dtype) + + input0 = relay.var("input", relay.TensorType(ishape, dtype)) + weight0 = relay.const(weight_data) + out0 = relay.op.nn.conv1d( + input0, + weight0, + strides=strides, + padding=padding, + data_layout="NWC", + kernel_layout="WIO", + out_dtype="int32", + out_layout="NWC", + ) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, dtype)) + weight1 = relay.const(np.moveaxis(weight_data, 1, -1)) + out1 = relay.op.nn.conv1d( + input1, + weight1, + strides=strides, + padding=padding, + data_layout="NWC", + kernel_layout="WOI", + out_dtype="int32", + out_layout="NWC", + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype=dtype)} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "M, K, N", + [ + (1, 32, 64), + (3, 12, 10), + ], +) +def test_dense(M, K, N): + """Test a subgraph with a single dense operator.""" + ishape = (M, K) + wshape = (N, K) + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + dense_f = relay.op.nn.batch_flatten(input0) + weight0 = relay.const(np.random.randint(low=-10, high=10, size=wshape, dtype="int8")) + out = relay.op.nn.dense(dense_f, weight0, out_dtype="int32") + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nhwc, pool_size, strides, padding", + [ + ((1, 32, 32, 1), (3, 3), 1, 0), + ((1, 32, 20, 4), (3, 3), (2, 2), 0), + ], +) +def test_maxpool_2d(data_shape_nhwc, pool_size, strides, padding): + """Test a subgraph with a single maxpool_2d operator.""" + + ishape = data_shape_nhwc + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + out = relay.op.nn.max_pool2d(input0, pool_size, layout="NHWC", strides=strides, padding=padding) + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + 
AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nwc, pool_size, strides, padding", + [ + ((1, 32, 1), 3, 1, 0), + ((1, 20, 4), 3, 2, 0), + ], +) +def test_maxpool_1d(data_shape_nwc, pool_size, strides, padding): + """Test a subgraph with a single maxpool_1d operator.""" + ishape = data_shape_nwc + + input0 = relay.var("input", relay.TensorType(ishape, "int8")) + out = relay.op.nn.max_pool1d(input0, pool_size, layout="NWC", strides=strides, padding=padding) + + mod = tvm.IRModule.from_expr(relay.Function([input0], out)) + inputs = {"input": np.random.randint(low=-128, high=127, size=ishape, dtype="int8")} + output_list = generate_ref_data(mod, inputs) + + compile_and_run( + AOTTestModel(module=mod, inputs=inputs, outputs=output_list), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_nchw, pool_size, strides, padding", + [ + ((1, 1, 32, 32), (3, 3), 1, 0), + ((1, 4, 32, 20), (3, 3), (2, 2), 0), + ], +) +def test_avgpool_2d(data_shape_nchw, pool_size, strides, padding): + """Test a subgraph with a single avgpool_2d operator.""" + + ishape = data_shape_nchw + + input0 = relay.var("input", relay.TensorType(ishape, "int32")) + out0 = relay.nn.avg_pool2d(input0, pool_size, layout="NCHW", strides=strides, padding=padding) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, "int16")) + out1 = relay.op.nn.avg_pool2d( + input1, pool_size, layout="NCHW", strides=strides, padding=padding + ) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + input_data = np.random.randint(low=-128, high=127, size=ishape, dtype="int32") + inputs = {"input": input_data} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel( + module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list + ), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +@tvm.testing.requires_corstone300 +@pytest.mark.parametrize( + "data_shape_ncw, pool_size, strides, padding", + [ + ((1, 1, 32), 3, 1, 0), + ((1, 4, 20), 3, 2, 2), + ], +) +def test_avgpool_1d(data_shape_ncw, pool_size, strides, padding): + """Test a subgraph with a single avgpool_1d operator.""" + + ishape = data_shape_ncw + + input0 = relay.var("input", relay.TensorType(ishape, "int32")) + out0 = relay.op.nn.avg_pool1d(input0, pool_size, layout="NCW", strides=strides, padding=padding) + ref_mod = tvm.IRModule.from_expr(relay.Function([input0], out0)) + + input1 = relay.var("input", relay.TensorType(ishape, "int16")) + out1 = relay.op.nn.avg_pool1d(input1, pool_size, layout="NCW", strides=strides, padding=padding) + mod = tvm.IRModule.from_expr(relay.Function([input1], out1)) + + input_data = np.random.randint(low=-10, high=10, size=ishape, dtype="int32") + inputs = {"input": input_data} + output_list = generate_ref_data(ref_mod, inputs) + + compile_and_run( + AOTTestModel( + module=mod, inputs={"input": input_data.astype(dtype="int16")}, outputs=output_list + ), + runner=AOT_CORSTONE300_RUNNER, + interface_api="c", + 
use_unpacked_api=True, + target_opts={ + "-keys": "arm_cpu", + "-mcpu": "cortex-m7", + }, + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 7d8a4f043548..c73af1948b57 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -545,6 +545,15 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): It is used to capture the tensor data (for both inputs and expected outputs) to be bundled into the standalone application. """ file_path = pathlib.Path(f"{output_path}/" + tensor_name).resolve() + np_type_to_c = { + "int8": "int8_t", + "uint8": "uint8_t", + "int16": "int16_t", + "uint16": "uint16_t", + "int32": "int32_t", + "uint32": "uint32_t", + "float32": "float", + } # create header file raw_path = file_path.with_suffix(".h").resolve() with open(raw_path, "w") as header_file: @@ -555,14 +564,7 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): emit_data_linkage(header_file, data_linkage) - if npy_data.dtype == "int8": - header_file.write(f"int8_t {tensor_name}[] =") - elif npy_data.dtype == "int32": - header_file.write(f"int32_t {tensor_name}[] = ") - elif npy_data.dtype == "uint8": - header_file.write(f"uint8_t {tensor_name}[] = ") - elif npy_data.dtype == "float32": - header_file.write(f"float {tensor_name}[] = ") + header_file.write(f"{np_type_to_c[str(npy_data.dtype)]} {tensor_name}[] =") header_file.write("{") for i in np.ndindex(npy_data.shape): @@ -577,6 +579,7 @@ def compile_models( workspace_byte_alignment: int = 8, enable_op_fusion: bool = True, pass_config: Dict[str, Any] = None, + target_opts: Dict = None, ) -> List[AOTCompiledTestModel]: """ This method generates runtime.Modules for the tests @@ -586,6 +589,9 @@ def compile_models( base_target = "c -runtime=c --link-params --executor=aot" extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}" + if target_opts: + for key, val in target_opts.items(): + extra_target += f" {key}={val}" target = f"{base_target} {extra_target}" config = {"tir.disable_vectorize": True} @@ -727,6 +733,7 @@ def compile_and_run( workspace_byte_alignment: int = 8, enable_op_fusion: bool = True, data_linkage: AOTDataLinkage = None, + target_opts: Dict = None, ): """This is a wrapper API to compile and run models as test for AoT""" compiled_test_mods = compile_models( @@ -736,6 +743,7 @@ def compile_and_run( workspace_byte_alignment=workspace_byte_alignment, enable_op_fusion=enable_op_fusion, pass_config=runner.pass_config, + target_opts=target_opts, ) run_and_check( models=compiled_test_mods, diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 8618619d65ad..615caa50aa91 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -74,3 +74,8 @@ run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver # Do not enable OpenGL # run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl + + +if [ -z "${TVM_INTEGRATION_GPU_ONLY:-}" ] && [ -z "${TVM_INTEGRATION_I386_ONLY:-}" ] ; then + run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-m7-simd tests/python/integration/test_arm_mprofile_dsp.py --enable-corstone300-tests +fi diff --git a/tests/scripts/task_python_integration_gpuonly.sh 
b/tests/scripts/task_python_integration_gpuonly.sh index 36c3883d4379..cb6bec40c22f 100755 --- a/tests/scripts/task_python_integration_gpuonly.sh +++ b/tests/scripts/task_python_integration_gpuonly.sh @@ -20,5 +20,6 @@ export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;nvptx;opencl -device=mali,aocl_s export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" export TVM_RELAY_TEST_TARGETS="cuda" export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu +export TVM_INTEGRATION_GPU_ONLY=1 ./tests/scripts/task_python_integration.sh
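---

The target_opts dictionary passed by the new integration tests is folded verbatim into the AOT target
string by compile_models (see the aot_test_utils.py hunk above); the Corstone-300 cases themselves run
only when pytest is invoked with the --enable-corstone300-tests flag registered in tests/python/conftest.py,
which task_python_integration.sh now passes for the m7-simd suite. Below is a minimal, illustrative sketch
of that target-string construction, assuming the default workspace alignment of 8 and the option values
used in these tests; it mirrors the loop added to compile_models and is not a new API.

    # Sketch only: reproduces how compile_models() appends target_opts to the
    # base AOT target string. Values match the tests in this patch.
    target_opts = {"-keys": "arm_cpu", "-mcpu": "cortex-m7"}

    base_target = "c -runtime=c --link-params --executor=aot"
    extra_target = "--workspace-byte-alignment=8 --interface-api=c --unpacked-api=1"
    for key, val in target_opts.items():
        extra_target += f" {key}={val}"

    target = f"{base_target} {extra_target}"
    print(target)
    # c -runtime=c --link-params --executor=aot --workspace-byte-alignment=8
    # --interface-api=c --unpacked-api=1 -keys=arm_cpu -mcpu=cortex-m7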