[ARM_CPU] Conv2d int8 intrinsic for cortex-A72 (#10310)
* [ARM_CPU] Conv2d int8 intrinsic for cortex-A72

Add an intrinsic that performs a dot product of 8 4-element vectors at
once. Also conditionally inline fused operators into the main
convolution loop depending on convolution size: small convolutions get
no inlining. Performance improves by ~20% on mobilenet on Raspberry Pi 4
and by ~30% on the individual convolutions.

* ignore incorrect lints

* fixup fstring

* revert changes to conv2d_NCHWc (not int8)

* remove error check, apparently tests rely on it

* refactor alter op layout
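A plain NumPy sketch of the arithmetic the new intrinsic covers, as described in the commit message above. The shapes are illustrative (one 4-element int8 data vector against eight 4-element int8 kernel vectors, accumulated into eight int32 lanes); the intrinsic itself lowers this pattern to NEON instructions rather than NumPy:

import numpy as np

data = np.random.randint(-128, 128, size=(4,), dtype=np.int8)      # 4 int8 activations
kernel = np.random.randint(-128, 128, size=(8, 4), dtype=np.int8)  # 8 vectors of 4 int8 weights
acc = np.zeros(8, dtype=np.int32)

# Widen to int32 before multiplying so the products cannot overflow.
acc += (kernel.astype(np.int32) * data.astype(np.int32)).sum(axis=1)
print(acc)  # eight int32 dot products produced in one step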
Tristan Konolige authored Feb 23, 2022
1 parent 97648d8 commit 6c6e873
Showing 10 changed files with 679 additions and 226 deletions.
2 changes: 2 additions & 0 deletions python/tvm/relay/backend/te_compiler.py
@@ -171,6 +171,8 @@ def select_implementation(op, attrs, inputs, out_type, target, use_autotvm=True)
        The best op implementation and the corresponding output tensors.
    """
    all_impls = get_valid_implementations(op, attrs, inputs, out_type, target)
    if len(all_impls) == 0:
        raise RuntimeError(f"No valid {op} implementations for {target}")
    best_plevel_impl = max(all_impls, key=lambda x: x.plevel)

    # Disable autotvm if auto_scheduler is enabled.
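The added guard runs before max() so that an empty implementation list produces an informative RuntimeError instead of the opaque ValueError that max() raises on an empty sequence. A minimal standalone illustration of the failure mode the check replaces:

all_impls = []
try:
    best = max(all_impls, key=lambda x: x.plevel)
except ValueError as err:
    print(err)  # "max() arg is an empty sequence" -- far less helpful than the new message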
36 changes: 25 additions & 11 deletions python/tvm/relay/op/strategy/arm_cpu.py
@@ -94,12 +94,18 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
                    name="conv2d_nchw_spatial_pack.arm_cpu",
                )

                # Intel x86 conv2d schedule.
                strategy.add_implementation(
                    wrap_compute_conv2d(topi.x86.conv2d_nchw),
                    wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                    name="conv2d_nchw.x86",
                )
                if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.arm_cpu.conv2d_nchw_int8),
                        wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_nchw_int8),
                        name="conv2d_nchw_int8.arm_cpu",
                    )
                else:
                    strategy.add_implementation(
                        wrap_compute_conv2d(topi.x86.conv2d_nchw),
                        wrap_topi_schedule(topi.x86.schedule_conv2d_nchw),
                        name="conv2d_nchw.x86",
                    )

                # check if winograd algorithm is applicable
                _, _, kh, kw = get_const_tuple(kernel.shape)
@@ -256,11 +262,19 @@ def conv2d_strategy_arm_cpu(attrs, inputs, out_type, target):
def conv2d_NCHWc_strategy_arm_cpu(attrs, inputs, out_type, target):
    """conv2d_NCHWc adopted from x86"""
    strategy = _op.OpStrategy()
    strategy.add_implementation(
        wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
        wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
        name="conv2d_NCHWc.x86",
    )
    data, kernel = inputs
    if topi.arm_cpu.is_int8_hw_support(data.dtype, kernel.dtype):
        strategy.add_implementation(
            wrap_compute_conv2d(topi.arm_cpu.conv2d_NCHWc_int8, True, True),
            wrap_topi_schedule(topi.arm_cpu.schedule_conv2d_NCHWc_int8),
            name="conv2d_NCHWc_int8.arm_cpu",
        )
    else:
        strategy.add_implementation(
            wrap_compute_conv2d(topi.x86.conv2d_NCHWc, True, True),
            wrap_topi_schedule(topi.x86.schedule_conv2d_NCHWc),
            name="conv2d_NCHWc.x86",
        )
    return strategy


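Both strategies above key off topi.arm_cpu.is_int8_hw_support, whose body is not part of this diff. A hedged sketch of the kind of predicate involved, assuming it combines a dtype check (the legalize pass further down requires data and kernel to share a dtype) with a hardware check such as the new is_neon_available helper; the function name and body here are illustrative, not the actual TVM code:

from tvm.topi.arm_cpu.arm_utils import is_neon_available

def is_int8_hw_support_sketch(data_dtype, kernel_dtype):
    # Illustrative only: take the int8 path when both tensors are 8-bit
    # integers of the same dtype and the target exposes NEON.
    dtypes_ok = data_dtype == kernel_dtype and data_dtype in ("int8", "uint8")
    return dtypes_ok and is_neon_available()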
6 changes: 6 additions & 0 deletions python/tvm/topi/arm_cpu/arm_utils.py
@@ -64,6 +64,12 @@ def is_aarch64_arm():
    return "aarch64" in target.attrs.get("mtriple", "")


def is_neon_available():
    """Check if neon instructions are available"""
    target = tvm.target.Target.current(allow_none=False)
    return "+neon" in target.mattr


def get_tiling_B_interleaved_t(interleave_A):
    """Compute the tiling information for matrix B', where B'
    is the transposed and interleaved version of matrix B in C=A*B.
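is_neon_available() inspects the current target scope, so it only returns True when the active target declares +neon in its -mattr list. An illustrative usage (the target string is an example, not taken from this commit):

import tvm

target = tvm.target.Target("llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon")
with target:
    current = tvm.target.Target.current(allow_none=False)
    print("+neon" in current.mattr)  # True, so the new check would pass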
105 changes: 103 additions & 2 deletions python/tvm/topi/arm_cpu/conv2d_alter_op.py
@@ -24,10 +24,12 @@
from tvm import relay
from tvm import autotvm

from ..nn import conv2d_alter_layout
from ..nn import conv2d_alter_layout, conv2d_legalize
from ..utils import get_const_tuple
from ..x86.conv2d import _get_default_config as _get_x86_default_config
from .conv2d_int8 import is_int8_hw_support
from .arm_utils import get_tiling_B_interleaved_t
from ..generic.conv2d import conv2d_alter_int8_common

logger = logging.getLogger("topi")

@@ -257,7 +259,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        if cfg.is_fallback:
            _get_x86_default_config(
                cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout
                cfg,
                data_tensor,
                kernel_tensor,
                strides,
                padding,
                dilation,
                out_dtype,
                False,
                data_layout,
            )
        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
        out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape)
@@ -333,6 +343,57 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_depthwise_conv2d_nchwc(*inputs, **new_attrs)

    if topi_tmpl == "conv2d_NCHWc_int8.arm_cpu":
        assert data_layout == "NCHW" and kernel_layout == "OIHW"
        if cfg.is_fallback:
            _get_default_config_int8(
                cfg,
                data_tensor,
                kernel_tensor,
                strides,
                padding,
                dilation,
                out_dtype,
                False,
                data_layout,
            )

        batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape)
        out_channel, channel_multiplier, kh, kw = get_const_tuple(kernel_tensor.shape)
        ic_bn, oc_bn = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1]
        n_elems = 8

        # update new attrs
        new_attrs["channels"] = out_channel
        new_attrs["data_layout"] = "NCHW%dc" % ic_bn
        new_attrs["kernel_layout"] = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)
        new_attrs["out_layout"] = "NCHW%dc" % oc_bn

        # Store altered operator's config.
        new_data = te.placeholder(
            (batch_size, in_channel // ic_bn, height, width, ic_bn), dtype=data_dtype
        )
        new_kernel = te.placeholder(
            (out_channel // oc_bn, in_channel // ic_bn, kh, kw, ic_bn // n_elems, oc_bn, n_elems),
            dtype=kernel_dtype,
        )
        new_workload = autotvm.task.args_to_workload(
            [
                new_data,
                new_kernel,
                strides,
                padding,
                dilation,
                new_attrs["data_layout"],
                new_attrs["out_layout"],
                out_dtype,
            ],
            topi_tmpl,
        )
        dispatch_ctx.update(target, new_workload, cfg)
        return relay.nn.contrib_conv2d_nchwc(*inputs, **new_attrs)
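For concreteness, a worked example of the layout strings the new block constructs; the block sizes below are illustrative (in practice ic_bn and oc_bn come from the tuning config), while n_elems=8 matches the value above:

ic_bn, oc_bn, n_elems = 8, 16, 8

data_layout = "NCHW%dc" % ic_bn                                                  # "NCHW8c"
kernel_layout = "OIHW{:n}i{:n}o{:n}i".format(ic_bn // n_elems, oc_bn, n_elems)   # "OIHW1i16o8i"
out_layout = "NCHW%dc" % oc_bn                                                   # "NCHW16c"
print(data_layout, kernel_layout, out_layout)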

if topi_tmpl == "conv2d_NHWC_quantized_interleaved.arm_cpu":
assert data_layout == "NHWC" and kernel_layout == "HWIO"
KH, KW, _, OC = get_const_tuple(kernel.shape)
@@ -365,3 +426,43 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type):
            inputs[0], new_kernel_expr, **new_attrs
        )
    return None


@conv2d_legalize.register("arm_cpu")
def _conv2d_legalize(attrs, inputs, arg_types):
    """Legalizes Conv2D op.
    Parameters
    ----------
    attrs : tvm.ir.Attrs
        Attributes of current convolution
    inputs : list of tvm.relay.Expr
        The args of the Relay expr to be legalized
    types : list of types
        List of input and output types
    Returns
    -------
    result : tvm.relay.Expr
        The legalized expr
    """
    # Collect the input tensors.
    data_tensor, kernel_tensor = arg_types[0], arg_types[1]
    data_dtype = data_tensor.dtype
    kernel_dtype = kernel_tensor.dtype

    # Collect the output tensor.
    output_tensor = arg_types[2]

    # Collect the input exprs.
    data, kernel = inputs

    # ARM vector instructions operate on the same dtype for data and kernel, we
    # provide those here and conv2d_alter_int8_common will convert to the
    # correct datatype.
    if is_int8_hw_support(kernel_dtype, kernel_dtype):
        # ARM intrinsics need the datatypes of data and kernel to be the same
        return conv2d_alter_int8_common(
            data, data_tensor, kernel, kernel_tensor, output_tensor, attrs, kernel_dtype, 8, 8
        )
    return None
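conv2d_alter_int8_common lives in topi/generic/conv2d.py and is not shown in this diff; its job here is to coerce data and kernel to the dtype the ARM intrinsics expect. A common way to reason about such conversions is the shift identity between uint8 and int8 operands; the NumPy sketch below demonstrates that identity and is illustrative only, not the helper's actual code:

import numpy as np

data_u8 = np.random.randint(0, 256, size=(16,), dtype=np.uint8)
kernel_s8 = np.random.randint(-128, 128, size=(16,), dtype=np.int8)

# Shifting uint8 data into int8 range can be compensated after the dot
# product: d * k == (d - 128) * k + 128 * k, summed over the reduction axis.
lhs = int((data_u8.astype(np.int32) * kernel_s8.astype(np.int32)).sum())
rhs = int(((data_u8.astype(np.int32) - 128) * kernel_s8.astype(np.int32)).sum()
          + 128 * kernel_s8.astype(np.int32).sum())
assert lhs == rhs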
