From 7e3bfbda34ceeea179ae63545950050d89d06b45 Mon Sep 17 00:00:00 2001 From: Giuseppe Rossini Date: Tue, 6 Oct 2020 14:53:46 +0100 Subject: [PATCH] Add dot product support for quantized convolution. (#6445) * Add dot product support for quantized convolution. We added two new intrinsics in: topi/arm_cpu/tensor_intrin.py, namely - mmla4x4: compute a matrix multiplication between tile A(4,4) and tile B(4,4) - mmla16x4: compute a matrix multiplication between tile A(rows,4) and tile B(4,16) Then we used those intrinsics in two separate strategies. We added the strategies in topi/arm_cpu/conv2d_int8.py and implemented the schedules in topi/arm_cpu/conv2d_gemm.py. In particular: - schedule_conv2d_gemm, when accelerated, packs matrix A, compute GEMM, and unpack the resulting matrix. This uses the mmla4x4 intrinsic - schedule_conv2d_gemm_hybrid doesn't do any packing on A and C which are in native form. This uses the mmla16x4 intrinsic Please note that for the limitations of `tensorize` we need to pad matrix A in both cases (when dimensions are not multiple of the tiling shape) Change-Id: Id0d818d84ffc458c6dad7983fd350a0f3d5db395 * Add back nhwc_spatial_pack strategy as default Change-Id: I8b1826a7ae1d742956296e8d157da19955a4942c * Fix linting through Black Change-Id: Ic74ef5461a90bca9f4d4980a214137e384d5f923 * Fix python linting Change-Id: I5fb8a2ae4467a87bd3470f6b3753c074f9b7cc78 * Addressing review comments Change-Id: I284b1f2c121051e672f548d6c6ee2a3267854e31 * Fix black linting issues Change-Id: I1813b0226b536aedee0dce9eeeba27aa2d95518b * Fixing failing test and adding tests for dot-product compilation Change-Id: Ic040722abd5538fccb85af4de922394c939e7000 * Fixing linting and review comments Change-Id: If09e3baa514c85dc78d3c27c2ac2fa2e01773d89 * Fixing black linting and address comments Change-Id: I857b28b6f9b23307d8c1eebc509de6ad2783c756 * Address review comments Change-Id: I63d1a639d4a72abeb33148fd2868cd356ef84122 --- python/tvm/relay/op/strategy/arm_cpu.py | 44 ++- python/tvm/topi/arm_cpu/arm_utils.py | 100 ++++++ python/tvm/topi/arm_cpu/conv2d_alter_op.py | 105 ++++-- python/tvm/topi/arm_cpu/conv2d_gemm.py | 283 ++++++++++++----- python/tvm/topi/arm_cpu/conv2d_int8.py | 120 ++++++- python/tvm/topi/arm_cpu/tensor_intrin.py | 298 +++++++++++++++++- .../python/relay/test_pass_alter_op_layout.py | 2 +- .../topi/python/test_topi_conv2d_int8.py | 105 +++--- 8 files changed, 874 insertions(+), 183 deletions(-) create mode 100644 python/tvm/topi/arm_cpu/arm_utils.py diff --git a/python/tvm/relay/op/strategy/arm_cpu.py b/python/tvm/relay/op/strategy/arm_cpu.py index 54624ce35b55..6759a54d0b80 100644 --- a/python/tvm/relay/op/strategy/arm_cpu.py +++ b/python/tvm/relay/op/strategy/arm_cpu.py @@ -135,20 +135,29 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target): name="conv2d_direct_simd.micro_dev", ) elif kernel_layout == "HWIO": - is_aarch64 = "aarch64" in str(isa.target) - + is_aarch64 = topi.arm_cpu.arm_utils.is_aarch64_arm() + has_dot_prod = topi.arm_cpu.arm_utils.is_dotprod_available() + if has_dot_prod and data.dtype in ["int8", "uint8"]: + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_native), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native.arm_cpu", + ) if is_aarch64 and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - 
name="conv2d_NHWC_quantized.arm_cpu", + wrap_compute_conv2d(topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved.arm_cpu", + ) + if (not is_aarch64) or (data.dtype not in ["int8", "uint8"]): + # TODO(@giuseros) + # This strategy errors out for quantized data types when tuning. + # Let's use this only for non-aarch64 or non-quantized cases + strategy.add_implementation( + wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), + name="conv2d_nhwc_spatial_pack.arm_cpu", ) - - strategy.add_implementation( - wrap_compute_conv2d(topi.arm_cpu.conv2d_nhwc_spatial_pack), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nhwc_spatial_pack), - name="conv2d_nhwc_spatial_pack.arm_cpu", - ) else: raise RuntimeError( "Unsupported kernel layout {} for conv2d NHWC".format(kernel_layout) @@ -328,11 +337,18 @@ def conv2d_gemm_without_weight_transform_strategy_arm_cpu(attrs, inputs, out_typ data = inputs[0] strategy = _op.OpStrategy() + interleaved_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved_without_transform + native_compute = topi.arm_cpu.compute_conv2d_NHWC_quantized_native_without_transform if layout == "NHWC" and data.dtype in ["int8", "uint8"]: strategy.add_implementation( - wrap_compute_conv2d_gemm(topi.arm_cpu.compute_conv2d_NHWC_quantized_without_transform), - wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized), - name="conv2d_NHWC_quantized_without_transform.arm_cpu", + wrap_compute_conv2d_gemm(native_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_native), + name="conv2d_NHWC_quantized_native_without_transform.arm_cpu", + ) + strategy.add_implementation( + wrap_compute_conv2d_gemm(interleaved_compute), + wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved), + name="conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu", ) else: raise RuntimeError( diff --git a/python/tvm/topi/arm_cpu/arm_utils.py b/python/tvm/topi/arm_cpu/arm_utils.py new file mode 100644 index 000000000000..7e0f566b96f4 --- /dev/null +++ b/python/tvm/topi/arm_cpu/arm_utils.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Arm target utility functions""" + +import re +import tvm + + +def get_arch_version(target_mattr): + """Parse the LLVM target -mattr, and return + the architecture version in a decimal representation + (e.g., if -mattr=v8.4a, return 8.4) + """ + + arch_version = 8.0 + m = re.compile(r"\+v(.*)\.(.*)a") + for attr in target_mattr: + match_obj = m.match(attr) + if match_obj: + major = int(match_obj.group(1)) + minor = int(match_obj.group(2)) + decimal = 10 + if minor >= 10: + decimal = 100 + arch_version = major + float(minor) / decimal + + return arch_version + + +def is_dotprod_available(): + """ Checks whether the hardware has support for fast Int8 arithmetic operations. """ + target = tvm.target.Target.current(allow_none=False) + arch_version = get_arch_version(target.mattr) + return arch_version >= 8.4 or ((arch_version in (8.2, 8.3)) and "+dotprod" in target.mattr) + + +def is_aarch64_arm(): + """ Checks whether we are compiling for an AArch64 target. """ + target = tvm.target.Target.current(allow_none=False) + return "aarch64" in target.attrs.get("mtriple", "") + + +def get_tiling_B_interleaved_t(interleave_A): + """Compute the tiling information for matrix B', where B' + is the transposed and interleaved version of matrix B in C=A*B. + + The tiling information is chosen to maximize register usage during the + tile computation. + + Please refer to: + - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + In order to have more information + + Parameters + ---------- + interleave_A: bool + determines if A is expected to be interleaved + + Returns + ---------- + tile_rows_B: the output tile rows of B' + tile_cols_B: the output tile columns of B' + """ + if is_dotprod_available(): + # The number of tile rows of B' vary depending on the + # strategy: + # * If we are interleaving A, then we select 12 columns from B'(i.e., + # 12 rows from B). + # * If we are not interleaving A, then we select 16 columns from B'(i.e., + # 16 rows from B). + tile_rows_B = 12 if interleave_A else 16 + + # Dot product instruction groups 2 (u)int16x8 vectors in + # groups of 4 and compute the dot product among those groups + # This means that the number of columns in a tile of B' (i.e., the + # rows of the original matrix B) need to be 4. + tile_cols_B = 4 + else: + # If dot product is not available, A must be interleaved. In this case + # we load 4 rows of B' (i.e., 4 columns of B). 
Each of them will contain 16 elements + tile_rows_B = 4 + tile_cols_B = 16 + + return tile_rows_B, tile_cols_B diff --git a/python/tvm/topi/arm_cpu/conv2d_alter_op.py b/python/tvm/topi/arm_cpu/conv2d_alter_op.py index 7bf7e42237d2..a64bc413e0c6 100644 --- a/python/tvm/topi/arm_cpu/conv2d_alter_op.py +++ b/python/tvm/topi/arm_cpu/conv2d_alter_op.py @@ -27,10 +27,64 @@ from ..nn import conv2d_alter_layout from ..util import get_const_tuple from ..x86.conv2d import _get_default_config as _get_x86_default_config +from .arm_utils import get_tiling_B_interleaved_t logger = logging.getLogger("topi") +def interleave_transpose_weights(inputs, data, kernel, interleave_A): + """Transform the weight matrix by reshaping, interleaving and transposing it + + Parameters + ---------- + inputs : tvm.relay.Expr + Grouped input symbols + data : + Input shape and dtype + kernel : + Input shape and dtype + interleave_A: indicates if we expect matrix A to be interleaved + + Returns + ---------- + new_kernel : tvm.te.placeholder + A placeholder with the new shape + new_kernel_expr : tvm.relay.Expr + The relay expression of the weights + """ + assert ( + data.dtype == "int8" + and kernel.dtype == "int8" + or data.dtype == "uint8" + and kernel.dtype == "uint8" + ) + + KH, KW, IC, OC = get_const_tuple(kernel.shape) + K = KH * KW * IC + N = OC + + # Get tiling information for the interleaved transposed version of B + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) + + pad_K = 0 + pad_N = 0 + + if N % tile_rows_B != 0: + pad_N = tile_rows_B - (N % tile_rows_B) + if K % tile_cols_B != 0: + pad_K = tile_cols_B - (K % tile_cols_B) + + N_padded = N + pad_N + K_padded = K + pad_K + new_kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform( + inputs[1], tile_rows_B, tile_cols_B + ) + new_kernel = te.placeholder( + (N_padded // tile_rows_B, K_padded // tile_cols_B, tile_rows_B, tile_cols_B), kernel.dtype + ) + return new_kernel, new_kernel_expr + + @conv2d_alter_layout.register(["arm_cpu"]) def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): target = tvm.target.Target.current(allow_none=False) @@ -279,36 +333,13 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): ) dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs) - if topi_tmpl == "conv2d_NHWC_quantized.arm_cpu": - assert ( - data.dtype == "int8" - and kernel.dtype == "int8" - or data.dtype == "uint8" - and kernel.dtype == "uint8" - ) + if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu": assert data_layout == "NHWC" and kernel_layout == "HWIO" - KH, KW, IC, OC = get_const_tuple(kernel.shape) - K = KH * KW * IC - N = OC - - tile_rows = 4 - tile_cols = 16 - pad_K = 0 - pad_N = 0 - - if N % tile_rows != 0: - pad_N = tile_rows - (N % tile_rows) - if K % tile_cols != 0: - pad_K = tile_cols - (K % tile_cols) - - N_padded = N + pad_N - K_padded = K + pad_K - kernel_expr = relay.nn.contrib_conv2d_gemm_weight_transform(inputs[1], tile_rows, tile_cols) - new_kernel = te.placeholder( - (N_padded // tile_rows, K_padded // tile_cols, tile_rows, tile_cols), kernel.dtype + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( + inputs, data, kernel, interleave_A=True ) - - new_workload_name = "conv2d_NHWC_quantized_without_transform.arm_cpu" new_workload = autotvm.task.args_to_workload( [data, new_kernel, strides, padding, dilation, 
out_dtype, (KH, KW), OC], new_workload_name, @@ -316,7 +347,21 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): dispatch_ctx.update(target, new_workload, cfg) return relay.nn.contrib_conv2d_gemm_without_weight_transform( - inputs[0], kernel_expr, **new_attrs + inputs[0], new_kernel_expr, **new_attrs + ) + if topi_tmpl == "conv2d_NHWC_quantized_native.arm_cpu": + assert data_layout == "NHWC" and kernel_layout == "HWIO" + KH, KW, _, OC = get_const_tuple(kernel.shape) + new_workload_name = "conv2d_NHWC_quantized_native_without_transform.arm_cpu" + new_kernel, new_kernel_expr = interleave_transpose_weights( + inputs, data, kernel, interleave_A=False + ) + new_workload = autotvm.task.args_to_workload( + [data, new_kernel, strides, padding, dilation, out_dtype, (KH, KW), OC], + new_workload_name, + ) + dispatch_ctx.update(target, new_workload, cfg) + return relay.nn.contrib_conv2d_gemm_without_weight_transform( + inputs[0], new_kernel_expr, **new_attrs ) - return None diff --git a/python/tvm/topi/arm_cpu/conv2d_gemm.py b/python/tvm/topi/arm_cpu/conv2d_gemm.py index 7f73cc828fa7..b40fb89b5d33 100644 --- a/python/tvm/topi/arm_cpu/conv2d_gemm.py +++ b/python/tvm/topi/arm_cpu/conv2d_gemm.py @@ -23,18 +23,52 @@ from tvm.autotvm.task.space import AnnotateEntity, ReorderEntity, OtherOptionEntity from ..util import get_const_tuple, get_const_int from ..nn.util import get_pad_tuple -from .tensor_intrin import gemm_quantized, gemm_quantized_impl +from .tensor_intrin import ( + gemm_quantized, + gemm_quantized_impl, + gemm_acc_4x4_int8_int8_int32, + gemm_acc_nx16_int8_int8_int32, +) +from .arm_utils import is_aarch64_arm, is_dotprod_available -def is_aarch64_arm(): - """ Checks whether we are compiling for an AArch64 target. """ - target = tvm.target.Target.current(allow_none=False) - return "aarch64" in target.attrs.get("mtriple", "") +def configure_knobs(cfg, M, K): + """ Configure auto-tuning knobs for the interleaved strategy """ + + x, y = cfg.axis(M // 4), cfg.axis(K // 16) + cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) + + outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) + cfg.define_annotate( + "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + ) + + # Fallback configuration + if cfg.is_fallback: + cfg["reorder_gemm"] = ReorderEntity([0, 1]) + cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) + + if not is_dotprod_available(): + cfg.define_knob("gemm_quantized_unroll", [True, False]) + cfg.define_knob("gemm_quantized_interleave", [True, False]) + + if cfg.is_fallback: + cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) + cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) # Compute function def compute_conv2d_gemm_without_weight_transform( - cfg, data, B_interleaved_t, strides, padding, dilation, out_dtype, kernel_size, output_channels + cfg, + data, + B_interleaved_t, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, ): """Compute conv2d by transforming the input, executing GEMM and transforming the output back""" @@ -42,8 +76,7 @@ def compute_conv2d_gemm_without_weight_transform( KH, KW = get_const_tuple(kernel_size) OC = get_const_int(output_channels) - - K_AREA = KH * KW + kernel_area = KH * KW if isinstance(dilation, int): dilation_h = dilation_w = dilation @@ -67,18 +100,14 @@ def compute_conv2d_gemm_without_weight_transform( else: data_pad = data - # --- Im2col + # Im2col M = OH * OW - K = IC * K_AREA + K = IC * kernel_area N 
= OC A_shape = (batches, M, K) - if K_AREA == 1: - A = te.compute( - A_shape, - lambda n, x, y: data_pad[n, HSTR * (x // OW), WSTR * (x % OW), y], - name="data_flatten", - ) + if kernel_area == 1: + A = tvm.topi.reshape(data_pad, A_shape) else: A = te.compute( A_shape, @@ -90,82 +119,117 @@ def compute_conv2d_gemm_without_weight_transform( ], name="data_im2col", ) - N_transformed = B_interleaved_t.shape[0] - # --- Pad if necessary - idxm = tvm.tir.indexmod + # Pad if necessary + N_transformed = B_interleaved_t.shape[0] + tile_rows_B = B_interleaved_t.shape[2] + tile_cols_B = B_interleaved_t.shape[3] + + # Select the tiling strategy for A. + # The tiling information is chosen to maximize register usage during + # the tile computation. + # + # Please refer to: + # - https://discuss.tvm.apache.org/t/rfc-accelerate-quantized-convolution-through-dot-product + # - Conv2DGemmWeightTransformRel in src/relay/op/nn/convolution.h + # In order to have more information + # + if is_dotprod_available() and interleave_A: + # If dot product has been enabled, and we are interleaving A + # tile size should be 8x4 + tile_rows_A = 8 + tile_cols_A = 4 + else: + # If either there is no dot product or if we are using a native strategy + # tile size should be 4x16 + tile_rows_A = 4 + tile_cols_A = 16 - pad_m = 0 - pad_k = 0 + pad_M = 0 + pad_K = 0 - if M % 4 != 0: - pad_m = 4 - (M % 4) + if M % tile_rows_A != 0: + pad_M = tile_rows_A - (M % tile_rows_A) - if K % 16 != 0: - pad_k = 16 - (K % 16) + if K % tile_cols_A != 0: + pad_K = tile_cols_A - (K % tile_cols_A) - M_padded = M + pad_m - K_padded = K + pad_k + M_padded = M + pad_M + K_padded = K + pad_K + N_padded = N_transformed * tile_rows_B pad_before = (0, 0, 0) - pad_after = (0, pad_m, pad_k) + pad_after = (0, pad_M, pad_K) - if pad_m != 0 or pad_k != 0: + if pad_M != 0 or pad_K != 0: A = nn.pad(A, pad_before=pad_before, pad_after=pad_after, name="A_padded") - # --- GEMM: A*B' + idxm = tvm.tir.indexmod k = te.reduce_axis((0, K_padded), "k") - A_interleaved = te.compute( - (batches, M_padded // 4, K_padded // 16, 4, 16), - lambda b, x, y, z, w: A[b, z + 4 * x, w + 16 * y], - name="A_interleaved", - ) + if interleave_A: + # Configuration space + configure_knobs(cfg, M_padded, K_padded) - C_interleaved = te.compute( - (batches, M_padded // 4, N_transformed, 4, 4), - lambda b, x, y, w, z: te.sum( - A_interleaved[b, x, k // 16, w, idxm(k, 16)].astype(out_dtype) - * B_interleaved_t[y, k // 16, z, idxm(k, 16)].astype(out_dtype), - axis=k, - ), - name="C_interleaved", - ) + # Pack the input data + A_interleaved = te.compute( + (batches, M_padded // tile_rows_A, K_padded // tile_cols_A, tile_rows_A, tile_cols_A), + lambda b, x, y, z, w: A[b, z + tile_rows_A * x, w + tile_cols_A * y], + name="A_interleaved", + ) + # Execute GEMM + C_interleaved = te.compute( + (batches, M_padded // tile_rows_A, N_transformed, tile_rows_A, tile_rows_B), + lambda b, x, y, w, z: te.sum( + A_interleaved[b, x, k // tile_cols_A, w, idxm(k, tile_cols_A)].astype("int32") + * B_interleaved_t[y, k // tile_cols_B, z, idxm(k, tile_cols_B)].astype("int32"), + axis=k, + ), + name="C_interleaved", + ) + # Unpack the result + C = te.compute( + (batches, M, N), + lambda b, x, y: C_interleaved[ + b, x // tile_rows_A, y // tile_rows_B, idxm(x, tile_rows_A), idxm(y, tile_rows_B) + ].astype(out_dtype), + name="C", + ) + zero = tvm.tir.const(0) + else: + # No need to pack/unpack, execute GEMM directly + C = te.compute( + (batches, M_padded, N_padded), + lambda b, x, y: te.sum( + A[b, x, k].astype("int32") + 
* B_interleaved_t[ + y // tile_rows_B, k // tile_cols_B, idxm(y, tile_rows_B), idxm(k, tile_cols_B) + ].astype("int32"), + axis=k, + ), + name="C", + ) - # --- Unpack C - C = te.compute( - (batches, M, N), - lambda b, x, y: C_interleaved[b, x // 4, y // 4, idxm(x, 4), idxm(y, 4)], - name="C", - ) + # We need to ensure that infer bound pass does not remove the padding + # which is necessary for the tensorizations to work. So we need to + # add a dummy reference to the padding area of the result + zero = ( + tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + - tvm.tir.const(1, C.dtype) * C[0, M_padded - 1, N_padded - 1] + ) - # --- Produce the conv output + # Reshape the result into a convolution output out_shape = (batches, OH, OW, OC) - out = te.compute(out_shape, lambda b, x, y, z: C(b, y + OW * x, z), name="conv2d_gemm_output") - - # Configuration space - x, y = cfg.axis(M_padded // 4), cfg.axis(K_padded // 16) - cfg.define_reorder("reorder_gemm", [x, y], policy="candidate", candidate=[[x, y], [y, x]]) - - outer_loop, inner_loop = cfg.axis(4), cfg.axis(16) - cfg.define_annotate( - "A_interleaved_unroll_vec", [outer_loop, inner_loop], policy="try_unroll_vec" + out = te.compute( + out_shape, + lambda b, x, y, z: (C(b, y + OW * x, z) + zero).astype(out_dtype), + name="conv2d_gemm_output", ) - cfg.define_knob("gemm_quantized_unroll", [True, False]) - cfg.define_knob("gemm_quantized_interleave", [True, False]) - - # Fallback configuration - if cfg.is_fallback: - cfg["reorder_gemm"] = ReorderEntity([0, 1]) - cfg["A_interleaved_unroll_vec"] = AnnotateEntity(["unroll", "vec"]) - cfg["gemm_quantized_unroll"] = OtherOptionEntity(False) - cfg["gemm_quantized_interleave"] = OtherOptionEntity(True) return out -# Schedules -def schedule_conv2d_gemm(cfg, s, out, final_out): - """Create schedule for tensors""" +def schedule_conv2d_gemm_interleaved(cfg, s, out, final_out): + """ Schedule the conv2d_gemm interleaved strategy """ C = out.op.input_tensors[0] C_interleaved = C.op.input_tensors[0] A_interleaved = C_interleaved.op.input_tensors[0] @@ -193,7 +257,7 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): # Computation(through tensorize) b, xo, yo, xi, yi = C_interleaved.op.axis outer_gemm, inner_gemm = cfg["reorder_gemm"].apply(s, C_interleaved, [xo, yo]) - s[C_interleaved].reorder(yi, xi) + b_outer_gemm_fused = s[C_interleaved].fuse(b, outer_gemm) s[C_interleaved].parallel(b_outer_gemm_fused) s[A_interleaved].compute_at(s[C_interleaved], b_outer_gemm_fused) @@ -204,9 +268,33 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): in_type = A_interleaved.dtype out_type = C.dtype - if is_aarch64_arm() and out_type == "int32": + + k = C_interleaved.op.reduce_axis[0] + _, M, N = C.shape + if is_dotprod_available(): + gemm_acc = gemm_acc_4x4_int8_int8_int32(in_type) + xi_outer, yi_outer, xi_inner, yi_inner = s[C_interleaved].tile( + xi, yi, x_factor=8, y_factor=4 + ) + k_outer, k_inner = s[C_interleaved].split(k, 4) + xi_inner_outer, xi_inner_inner = s[C_interleaved].split(xi_inner, 4) + s[C_interleaved].reorder( + b_outer_gemm_fused, + inner_gemm, + xi_outer, + yi_outer, + k_outer, + xi_inner_outer, + xi_inner_inner, + yi_inner, + k_inner, + ) + s[C_interleaved].tensorize(xi_inner_inner, gemm_acc) + s[C_interleaved].unroll(xi_inner_outer) + + elif is_aarch64_arm(): + s[C_interleaved].reorder(yi, xi) K = A_interleaved_input.shape[2] - _, M, N = C.shape assert in_type in ["int8", "uint8"], "Only int8 and uint8 gemm are supported" unroll = cfg["gemm_quantized_unroll"].val interleave = 
cfg["gemm_quantized_interleave"].val @@ -225,3 +313,48 @@ def schedule_conv2d_gemm(cfg, s, out, final_out): s[C].compute_at(s[out], inner) s[out].vectorize(inner) return s + + +def schedule_conv2d_gemm_native(cfg, s, out, final_out): + """ Schedule the conv2d_gemm hybrid strategy """ + C = out.op.input_tensors[0] + A = C.op.input_tensors[0] + in_type = A.dtype + + # Computation + b, x, y = C.op.axis + (k,) = C.op.reduce_axis + k_outer, k_inner = s[C].split(k, 16) + x_outer, y_outer, x_inner, y_inner = s[C].tile(x, y, x_factor=4, y_factor=16) + s[C].reorder(b, x_outer, y_outer, k_outer, x_inner, y_inner, k_inner) + gemm_acc = gemm_acc_nx16_int8_int8_int32(in_type, rows=1) + s[C].unroll(x_inner) + s[C].tensorize(y_inner, gemm_acc) + s[C].parallel(x_outer) + + # Input transform + if A.op.name == "A_padded": + padding_A = True + data_im2col = A.op.input_tensors[0] + else: + padding_A = False + data_im2col = A + + b, m, n = data_im2col.op.axis + if data_im2col.op.name == "data_im2col": + n_outer, n_inner = s[data_im2col].split(n, 16) + s[data_im2col].unroll(n_outer) + s[data_im2col].vectorize(n_inner) + s[data_im2col].parallel(m) + elif padding_A: + s[data_im2col].compute_inline() + s[A].compute_at(s[C], x_inner) + else: + s[data_im2col].compute_at(s[C], x_inner) + + # Output transform + if out != final_out: + n, h, w, c = out.op.axis + _, inner = s[out].split(c, 4) + s[out].vectorize(inner) + return s diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 307f9e102acf..43fe80178bd3 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -24,7 +24,12 @@ from .. import nn from ..nn.conv2d import _get_workload as _get_conv2d_workload from .tensor_intrin import dot_int8_int8_int32 -from .conv2d_gemm import compute_conv2d_gemm_without_weight_transform, schedule_conv2d_gemm +from .conv2d_gemm import ( + compute_conv2d_gemm_without_weight_transform, + schedule_conv2d_gemm_interleaved, + schedule_conv2d_gemm_native, +) +from .arm_utils import get_tiling_B_interleaved_t def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): @@ -115,30 +120,46 @@ def traverse(op): return s -@autotvm.register_topi_compute("conv2d_NHWC_quantized.arm_cpu") -def compute_conv2d_NHWC_quantized(cfg, data, kernel, strides, padding, dilation, out_dtype): +def _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, interleave_A +): N, IH, IW, IC = get_const_tuple(data.shape) KH, KW, _, OC = get_const_tuple(kernel.shape) - tile_rows = 4 - tile_cols = 16 - kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows, tile_cols) + tile_rows_B, tile_cols_B = get_tiling_B_interleaved_t(interleave_A) + + kernel = nn.conv2d_gemm_weight_transform(kernel, tile_rows_B, tile_cols_B) return compute_conv2d_gemm_without_weight_transform( - cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC + cfg, data, kernel, strides, padding, dilation, out_dtype, (KH, KW), OC, interleave_A ) -@autotvm.register_topi_compute("conv2d_NHWC_quantized_without_transform.arm_cpu") -def compute_conv2d_NHWC_quantized_without_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size=None, output_channels=None +def _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size=None, + output_channels=None, + interleave_A=False, ): - """Compute for conv2d_NHWC_quantized without weight transform.""" return 
compute_conv2d_gemm_without_weight_transform( - cfg, data, B, strides, padding, dilation, out_dtype, kernel_size, output_channels + cfg, + data, + B, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + interleave_A, ) -@autotvm.register_topi_schedule("conv2d_NHWC_quantized.arm_cpu") -def schedule_conv2d_NHWC_quantized(cfg, outs): +def _schedule_conv2d_NHWC_quantized(cfg, outs, interleave_A): """Create schedule for tensors""" s = te.create_schedule([x.op for x in outs]) # Vectorize the output and then inline all the rest @@ -153,12 +174,79 @@ def _callback(op): """Traverse operators from computation graph""" if op.name == "conv2d_gemm_output": conv_out = op.output(0) - schedule_conv2d_gemm(cfg, s, conv_out, out) + if interleave_A: + schedule_conv2d_gemm_interleaved(cfg, s, conv_out, out) + else: + schedule_conv2d_gemm_native(cfg, s, conv_out, out) if out != conv_out: s[conv_out].compute_at(s[out], inner) else: C = conv_out.op.input_tensors[0] - s[C].compute_at(s[out], inner) + if interleave_A: + s[C].compute_at(s[out], inner) traverse_inline(s, outs[0].op, _callback) return s + + +# Interleaved schedules: those schedule will interleave the input data. The +# weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved( + cfg, data, kernel, strides, padding, dilation, out_dtype +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved""" + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, True + ) + + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_interleaved_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): + """ Interface for interleaved compute_conv2d_NHWC_quantized_interleaved_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels, True + ) + + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_interleaved.arm_cpu") +def schedule_conv2d_NHWC_quantized_interleaved(cfg, outs): + """ Interface for interleaved schedule_conv2d_NHWC_quantized_interleaved""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, True) + + +# Native schedules: those schedule won't interleave A (which is left in its native form). 
+# The weights are interleaved and transposed +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native.arm_cpu") +def compute_conv2d_NHWC_quantized_native(cfg, data, kernel, strides, padding, dilation, out_dtype): + """ Interface for native compute_conv2d_NHWC_quantized""" + return _compute_conv2d_NHWC_quantized( + cfg, data, kernel, strides, padding, dilation, out_dtype, False + ) + + +@autotvm.register_topi_compute("conv2d_NHWC_quantized_native_without_transform.arm_cpu") +def compute_conv2d_NHWC_quantized_native_without_transform( + cfg, data, kernel, strides, padding, dilation, out_dtype, kernel_size, output_channels +): + """ Interface for compute_conv2d_NHWC_quantized_native_without_transform""" + return _compute_conv2d_NHWC_quantized_without_transform( + cfg, + data, + kernel, + strides, + padding, + dilation, + out_dtype, + kernel_size, + output_channels, + False, + ) + + +@autotvm.register_topi_schedule("conv2d_NHWC_quantized_native.arm_cpu") +def schedule_conv2d_NHWC_quantized_native(cfg, outs): + """ Interface for native schedule_conv2d_NHWC_quantized""" + return _schedule_conv2d_NHWC_quantized(cfg, outs, False) diff --git a/python/tvm/topi/arm_cpu/tensor_intrin.py b/python/tvm/topi/arm_cpu/tensor_intrin.py index e87bdc47d0b0..73cfacb62079 100644 --- a/python/tvm/topi/arm_cpu/tensor_intrin.py +++ b/python/tvm/topi/arm_cpu/tensor_intrin.py @@ -417,12 +417,10 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): idxm = tvm.tir.indexmod k = te.reduce_axis((0, K), "k") - C = te.compute( (te.var("m"), te.var("n")), lambda x, y: te.sum( - A[k // 16, x, idxm(k, 16)].astype(out_type) - * B[k // 16, y, idxm(k, 16)].astype(out_type), + A[k // 16, x, idxm(k, 16)].astype("int32") * B[k // 16, y, idxm(k, 16)].astype("int32"), axis=k, ), name="C", @@ -445,7 +443,7 @@ def gemm_quantized(M, N, K, unroll, interleave, in_type, out_type): ) c_buffer = tvm.tir.decl_buffer( - C.shape, dtype=out_type, name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] + C.shape, dtype="int32", name="c_buffer", offset_factor=1, strides=[te.var("sc"), 1] ) def _intrin_func(ins, outs): @@ -589,6 +587,298 @@ def _instr(index): ) +def select_word(vec, lane, dtype_vec): + """ + Utility function used to select a int8x4 word within a int8x16 vector + and replicate 4 times. + The pseudo-code for this operation is: + + v = [x0, ..., x15] + vsub(lane) = v[4*lane:4*lane+3] + replicated_v(lane) = [vsub(lane), vsub(lane), vsub(lane), vsub(lane)] + + Note that 0<=lane<4 + + Parameters + ---------- + vec: tvm.tir.Expr + int8x16 vector expression + lane: int + vector lane we want to replicate + dtype_vec: str + vector data type (e.g., int8x16) + + Returns + ---------- + output: tvm.tir.Expr + replicated vector + """ + # Reinterpret vec_a as 4 int32 words + vec_int32 = tvm.tir.call_intrin("int32x4", "tir.reinterpret", vec) + # Broadcast the lane-th word + vec_int32_shuffled = tvm.tir.Shuffle([vec_int32], [lane, lane, lane, lane]) + # Convert back to uint8x16 + vec_int8_broadcast = tvm.tir.call_intrin(dtype_vec, "tir.reinterpret", vec_int32_shuffled) + return vec_int8_broadcast + + +def gemm_acc_4x4_int8_int8_int32(dtype): + """ + Int8 4x4 matrix multiplication and accumulation using sdot/udot + instructions. This function takes two arrays of int8 datatype + -- A[4][4] and B[4][4] and produces a 4x4 matrix + which is equal to A*B. + + The pseudo code is as follows. + + .. 
code-block:: c + + void gemm_acc_4x4_int8_int8_int32(int8 A[4][4], int8 B[4][4], int32 C[4][4]){ + for (int i = 0; i < 4; i++){ + for (int j = 0; i < 4; i++){ + for (int k = 0; k < 4; k++){ + C[i][j] += A[i][k] * B[j][k] + } + } + } + + Notes: + * The rows of matrix B are transposed + * The tiling strategy is picked to maximize register usage. + + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + # This needs to be a variable number of "rows" since TVM + # "thinks" I only need to compute one row because of + # padding + A = te.placeholder((te.var("rows"), 4), dtype, name="A") + B = te.placeholder((4, 4), dtype, name="B") + dtype_vec = dtype + "x16" + + k = te.reduce_axis((0, 4), name="k") + C = te.compute( + (te.var("rows"), 4), + lambda i, j: te.sum(A[i, k].astype("int32") * B[j, k].astype("int32"), axis=k), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + B.shape, dtype, name="bb_buffer", offset_factor=1, strides=[te.var("sb"), 1] + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) + + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, 4): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x4"))) + return ib.get() + # Load all the elements of tile A. + # vec_a = [a, b, c, d, + # e, f, g, h, + # l, m, n, o, + # p, q, r, s]; + vec_a = ins[0].vload([0, 0], dtype_vec) + + # Replicate 4 times the i-th row of A. For instance, + # vec_a[0] = [a, b, c, d, + # a, b, c, d, + # a, b, c, d, + # a, b, c, d,]; + vec_aa = [select_word(vec_a, i, dtype_vec) for i in range(0, 4)] + + # Load all the elements of B. Remember that B + # is transposed: + # vec_b = [0, 4, 8, 12, + # 1, 5, 9, 13, + # 2, 6, 10, 14, + # 3, 7, 11, 15,]; + vec_b = ins[1].vload([0, 0], dtype_vec) + + # Execute the dot product + for i in range(0, 4): + vec_c = outs[0].vload([i, 0], "int32x4") + # Compute the product between the i-th row of A + # and all the rows of B. Remember that sdot/udot + # subdive the input vectors in 16 elements + # and then take the dot product among each group. + # The result is stored in a int32x4 register + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*4+c*8+d*12, + # a*1+b*5+c*9+d*13, + # a*2+b*6+c*10+d*14, + # a*3+b*7+c*11+d*15] + vdot = tvm.tir.call_llvm_intrin( + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa[i], + ) + + # Store the result + ib.emit(outs[0].vstore([i, 0], vdot)) + + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + + +def gemm_acc_nx16_int8_int8_int32(dtype, rows): + """ + Int8 nx16 matrix multiplication and accumulation using sdot/udot instructions + This function takes two arrays of int8 datatype -- A[n][4] and + B[4][16] and produces a rowsx16 matrix which is equal to A*B + The pseudo code is as follows. + + .. 
code-block:: c + + void mmla_nx16_int8_int8_int32(int8 A[n][16], int8 B[4][16][4], int32 output[n][16]){ + for (int i = 0; i < n; i++){ + for (int j = 0; i < 16; i++){ + for (int k = 0; k < 16; k++){ + out[i][j] += A[i][k] * B[k//4][j][k%4] + } + } + } + } + + Notes: + * The rows of matrix B are transposed + * The tile size of B is 16x4. Since the reduction variable k moves between 0 and 16 + we need 4 tiles of B to compute a single row of the output. The first 4 values of + k will be fetched from B[0][j][k], the second batch of 4 from B[1][j][k] and so on + * The tiling strategy is picked to maximize register usage. + + Parameters + ---------- + dtype: str, {"uint8", "int8"} + Whether it works on unsigned int or signed int + rows: int + Number of of the output rows "n" + + Returns + ------- + intrin : TensorIntrin + The Arm TensorIntrin that can be used in tensorizing schedule + """ + A = te.placeholder((rows, 16), dtype, name="A") + B = te.placeholder((4, 16, 4), dtype, name="B") + dtype_vec = dtype + "x16" + idxm = tvm.tir.indexmod + k = te.reduce_axis((0, 16), name="k") + C = te.compute( + (rows, 16), + lambda i, j: te.sum( + A[i, k].astype("int32") * B[k // 4, j, idxm(k, 4)].astype("int32"), axis=k + ), + name="C", + ) + + aa_buffer = tvm.tir.decl_buffer( + A.shape, dtype, name="aa_buffer", offset_factor=1, strides=[te.var("sa"), 1] + ) + bb_buffer = tvm.tir.decl_buffer( + B.shape, + dtype, + name="bb_buffer", + offset_factor=1, + strides=[te.var("sb0"), te.var("sb1"), 1], + ) + cc_buffer = tvm.tir.decl_buffer( + C.shape, dtype="int32", name="cc_buffer", offset_factor=1, strides=[te.var("sc"), 1] + ) + + llvm_intrin = "llvm.aarch64.neon.sdot" if dtype == "int8" else "llvm.aarch64.neon.udot" + + def _intrin_func(ins, outs): + def _instr(index): + ib = tvm.tir.ir_builder.create() + if index == 1: + for i in range(0, rows): + ib.emit(outs[0].vstore([i, 0], tvm.tir.const(0, "int32x16"))) + return ib.get() + # Iterate on the number of rows of the output + for k in range(0, rows): + # Load 16 elements of A + # vec_a = [a, b, c, d, e, f, g, h, l, m, n, o, p, q, r, s]; + vec_a = ins[0].vload([k, 0], dtype_vec) + + # Iterate over each of the 4 rowsx4 tiles of the output + for j in range(0, 4): + # Accumulate over each of the 4 (16x4) tiles contained in B + for i in range(0, 4): + # Replicate a single 4-element group of A (A[k, i:i+4]) + vec_aa = select_word(vec_a, i, dtype_vec) + + # Load 4 rows (each rows with 4 elements) from B (B[i:i+4, j:j+4]) + # vec_b = [0, 16, 32, 48, + # 1, 17, 33, 49, + # 2, 18, 34, 50, + # 3, 19, 35, 51,]; + vec_b = ins[1].vload([i, 4 * j, 0], dtype_vec) + + # Accumulate in the correct part of the output + vec_c = outs[0].vload([k, 4 * j], "int32x4") + + # Compute the dot product between the rowsx4 tile + # from A and the 4x4 tile from B + # + # For instance, for i=0, we have: + # sdot(vec_aa[0], vec_b) = [a*0+b*16+c*32+d*48, + # a*1+b*17+c*33+d*49, + # a*2+b*18+c*34+d*50, + # a*3+b*19+c*35+d*51] + vdot = tvm.tir.call_llvm_intrin( + "int32x4", + llvm_intrin, + tvm.tir.const(3, "uint32"), + vec_c, + vec_b, + vec_aa, + ) + ib.emit(outs[0].vstore([k, 4 * j], vdot)) + return ib.get() + + # body, reset, update + return _instr(0), _instr(1), _instr(2) + + buffer_params = {"offset_factor": 1} + return te.decl_tensor_intrin( + C.op, + _intrin_func, + binds={A: aa_buffer, B: bb_buffer, C: cc_buffer}, + default_buffer_params=buffer_params, + ) + + def _q_multiply_shift_arm(op): """ Implementation of q_multiply_shift_arm through arm intrinsics diff --git 
a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 4d508403570f..9df838bfd717 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -1105,7 +1105,7 @@ def _query_inside(self, target, workload): def update(self, target, workload, cfg): key = (str(target), workload) assert workload[2][1] == expected_workload_shape - assert workload[0] == "conv2d_NHWC_quantized_without_transform.arm_cpu" + assert workload[0] == "conv2d_NHWC_quantized_interleaved_without_transform.arm_cpu" self.memory[key] = cfg def alter_conv2d(attrs, inputs, tinfos, out_type): diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py b/tests/python/topi/python/test_topi_conv2d_int8.py index 238517e5ed75..b2f9835c3d66 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -56,48 +56,67 @@ def compile_conv2d_NHWC_gemm_int8_arm( W = te.placeholder((kernel, kernel, in_channel, num_filter), name="W", dtype="int8") bias = te.placeholder((num_filter,), name="bias", dtype="int8") dtype = "int32" - device = "llvm --device arm_cpu --mtriple aarch64-linux-gnu" - - ctx = tvm.context(device, 0) - if not tvm.testing.device_enabled(device): - print("Skip because %s is not enabled" % device) - return - print("Compiling on arm AArch64 target: %s" % device) - with tvm.target.Target(device): - assert is_aarch64_arm(), "AArch64 target not recognized" - - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( - A, W, (stride, stride), padding, (dilation, dilation), dtype - ) + devices = [ + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved, + ), + ( + "llvm --device arm_cpu --mtriple aarch64-linux-gnu -mattr=+v8.2a,+dotprod", + topi.arm_cpu.compute_conv2d_NHWC_quantized_native, + topi.arm_cpu.schedule_conv2d_NHWC_quantized_native, + ), + ] + + for device_tuple in devices: + device = device_tuple[0] + compute = device_tuple[1] + schedule = device_tuple[2] + + ctx = tvm.context(device, 0) + if not tvm.testing.device_enabled(device): + print("Skip because %s is not enabled" % device) + return + print("Compiling on arm AArch64 target: %s" % device) + with tvm.target.Target(device): + assert is_aarch64_arm(), "AArch64 target not recognized" + + C = compute(A, W, (stride, stride), padding, (dilation, dilation), dtype) + if add_bias: + C = topi.add(C, bias) + if add_relu: + C = topi.nn.relu(C) + s = schedule([C]) + if add_bias: - C = topi.add(C, bias) - if add_relu: - C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) - - if add_bias: - tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - func = tvm.build( - s, - [A, W, bias, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) - else: - func = tvm.build( - s, - [A, W, C], - device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" - % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), - ) + tvm.build( + s, + [A, W, bias, C], + device, + 
name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + func = tvm.build( + s, + [A, W, bias, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) + else: + func = tvm.build( + s, + [A, W, C], + device, + name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), + ) def verify_conv2d_NHWC_gemm_int8( @@ -155,14 +174,14 @@ def check_device(device): return print("Running on target: %s" % device) with tvm.target.Target(device): - C = topi.arm_cpu.compute_conv2d_NHWC_quantized( + C = topi.arm_cpu.compute_conv2d_NHWC_quantized_interleaved( A, W, (stride, stride), padding, (dilation, dilation), dtype ) if add_bias: C = topi.add(C, bias) if add_relu: C = topi.nn.relu(C) - s = topi.arm_cpu.schedule_conv2d_NHWC_quantized([C]) + s = topi.arm_cpu.schedule_conv2d_NHWC_quantized_interleaved([C]) a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx)