diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index 80d64db693cec..24422a8baa022 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -17,6 +17,8 @@ # pylint: disable=invalid-name, unused-argument """Arm Compute Library supported operators.""" import tvm +import numpy as np + from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name @@ -279,7 +281,7 @@ def dense(expr): return False if attrs.out_dtype != "float32" and attrs.out_dtype != "": return False - return True + return not require_padding([*args, expr.checked_type]) def qnn_dense(expr): @@ -293,7 +295,7 @@ def qnn_dense(expr): return False if attrs.out_dtype != "int32": return False - return True + return not require_padding([*args, expr.checked_type]) @tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib") @@ -305,7 +307,33 @@ def max_pool2d(expr): typ = args[0].checked_type if typ.dtype not in ["float32", "uint8"]: return False - return True + return not require_padding([*args, expr.checked_type]) + + +def require_padding(inputs): + """Checks whether supplied data will require padding. + Most of the operators ACL up to 20.11 uses padded data. + """ + + def _check(shape, dtype): + """NEON has 128bits/16bytes per vector""" + if len(shape) == 0: + return False + return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0 + + for i in inputs: + if isinstance(i, (tvm.relay.expr.Var, tvm.relay.expr.Call)): + if _check(i.checked_type.shape, i.checked_type.dtype): + return True + elif isinstance(i, tvm.relay.expr.Constant): + if _check(i.data.shape, i.data.dtype): + return True + elif isinstance(i, tvm.ir.tensor_type.TensorType): + if _check(i.shape, i.dtype): + return True + else: + raise Exception("Not supported") + return False @tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib") @@ -313,6 +341,7 @@ def avg_pool2d(expr, from_quantized_composite=False): """Check if the external ACL codegen for avgpool2d should be used.""" attrs, args = expr.attrs, expr.args typ = args[0].checked_type + if from_quantized_composite: if typ.dtype != "int32": return False @@ -321,7 +350,8 @@ def avg_pool2d(expr, from_quantized_composite=False): return False if attrs.layout != "NHWC": return False - return True + + return not require_padding([*args, expr.checked_type]) @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib") @@ -333,7 +363,7 @@ def global_max_pool2d(expr): return False if attrs.layout != "NHWC": return False - return True + return not require_padding([*args, expr.checked_type]) @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib") @@ -345,7 +375,7 @@ def global_avg_pool2d(expr): return False if attrs.layout != "NHWC": return False - return True + return not require_padding([*args, expr.checked_type]) @tvm.ir.register_op_attr("maximum", "target.arm_compute_lib") diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index 0b6d27623a1af..604c619bf49cd 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -45,6 +45,7 @@ arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data, std::vector shape = tensor_rep.GetOpShape()[0]; DLDataType dtype = tensor_rep.GetOpDataType()[0]; arm_compute::TensorInfo info = MakeACLTensorInfo(shape, dtype, scale, offset); + info.set_is_resizable(false); tensor.allocator()->init(info); if (data != nullptr) { CheckACLError(tensor.allocator()->import_memory(data)); diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index 0e444809b0141..c5d711d7afa3b 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -276,10 +276,11 @@ def verify_codegen( module, known_good_codegen, num_acl_modules, + tvm_ops=0, target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon", ): """Check acl codegen against a known good output.""" - module = build_module(module, target) + module = build_module(module, target, tvm_ops=tvm_ops, acl_partitions=num_acl_modules) acl_modules = extract_acl_modules(module) assert len(acl_modules) == num_acl_modules, ( diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 8a3632a799197..0279aa72eaf70 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -20,8 +20,8 @@ import tvm from tvm import relay - -from .infrastructure import ( +from tvm import testing +from test_arm_compute_lib.infrastructure import ( Device, skip_runtime_test, skip_codegen_test, @@ -185,18 +185,34 @@ def test_dense(): np.random.seed(0) dtype = ["float32"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [ + (1, (1, 128), (16, 128), 16), + (1, (32, 32), (32, 32), 32), + (0, (1, 64), (1, 64), 1), + (0, (11, 2), (2, 2), 2), + ] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} func, params = _get_model( shape, weight_shape, units, dtype, var_names=iter(inputs), has_bias=composite ) for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_acl=acl, + tvm_ops=(1 - acl_partitions) * (2 - int(not composite)), + acl_partitions=acl_partitions, + )[0] + ) config = { "shape": shape, @@ -215,18 +231,18 @@ def test_codegen_dense(): np.random.seed(0) dtype = ["float32"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, 1) + verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions) def test_qnn_dense(): @@ -239,11 +255,18 @@ def test_qnn_dense(): np.random.seed(0) dtype = ["uint8"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [ + (0, (4, 4), (4, 4), 4), + (1, (16, 16), (4, 16), 4), + (1, (1, 128), (16, 128), 16), + (1, (32, 32), (32, 32), 32), + (0, (1, 64), (1, 64), 1), + ] + composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} input_zp = 100 @@ -270,7 +293,18 @@ def test_qnn_dense(): ) for acl in [False, True]: - outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0]) + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + tvm_ops=(1 - acl_partitions) * (3 - int(not composite)), + acl_partitions=acl_partitions, + enable_acl=acl, + )[0] + ) config = { "shape": shape, @@ -295,11 +329,11 @@ def test_codegen_qnn_dense(): np.random.seed(0) dtype = ["uint8"] - shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)] + shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] composite = [False, True] trials = generate_trials([dtype, shape, composite], 3) - for dtype, (shape, weight_shape, units), composite in trials: + for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) @@ -323,7 +357,7 @@ def test_codegen_qnn_dense(): has_bias=composite, ) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, 1) + verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions) if __name__ == "__main__": diff --git a/tests/python/contrib/test_arm_compute_lib/test_maximum.py b/tests/python/contrib/test_arm_compute_lib/test_maximum.py index 8ddb901946fc9..1942d1e213a56 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_maximum.py +++ b/tests/python/contrib/test_arm_compute_lib/test_maximum.py @@ -20,6 +20,7 @@ import tvm from tvm import relay +from tvm import testing from .infrastructure import ( skip_runtime_test, diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 2526a584c56cf..4efae487f2205 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -17,11 +17,12 @@ """Arm Compute Library network tests.""" import numpy as np - +import pytest +from tvm import testing from tvm import relay -from .infrastructure import skip_runtime_test, build_and_run, verify -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import skip_runtime_test, build_and_run, verify +from test_arm_compute_lib.infrastructure import Device def _build_and_run_network(mod, params, inputs, device, tvm_ops, acl_partitions, atol, rtol): diff --git a/tests/python/contrib/test_arm_compute_lib/test_pooling.py b/tests/python/contrib/test_arm_compute_lib/test_pooling.py index 35017170d0ecf..8374953e377da 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_pooling.py +++ b/tests/python/contrib/test_arm_compute_lib/test_pooling.py @@ -20,15 +20,16 @@ import tvm from tvm import relay +from tvm import testing -from .infrastructure import ( +from test_arm_compute_lib.infrastructure import ( skip_runtime_test, skip_codegen_test, build_and_run, verify, verify_codegen, ) -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import Device def _calculate_output_shape(shape, sizes, padding, strides): @@ -166,7 +167,12 @@ def test_pooling(): fp32_dtype = ("float32", -127, 128, 0.001, 0.001) uint8_dtype = ("uint8", 0, 255, 1, 0) + # nn.max_pool2d(%arm_compute_lib_11_i0, pool_size=[3, 3], strides=[2, 2], padding=[0, 0, 0, 0], layout="NHWC") /* ty=Tensor[(1, 27, 27, 256), float32] */ + trials = [ + # ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + # ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, False, (2,2,1)], + ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)], ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], ["nn.max_pool2d", fp32_dtype, (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], ["nn.max_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], @@ -175,7 +181,8 @@ def test_pooling(): ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], ["nn.avg_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], ["nn.avg_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], - ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)], + # 20.05: "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types" + # ["nn.avg_pool2d", uint8_dtype, (2, 2), (2, 2), (1, 1), False, True, (16, 16, 16)], ["nn.avg_pool2d", uint8_dtype, (3, 3), (2, 2), (0, 1), False, False, (16, 16, 16)], ["nn.l2_pool2d", fp32_dtype, (2, 2), (2, 2), (0, 1), True, False, (16, 16, 16)], ["nn.l2_pool2d", fp32_dtype, (3, 3), (2, 2), (0, 0), False, False, (16, 16, 16)], @@ -211,6 +218,7 @@ def test_pooling(): "padding": pad, "ceil_mode": ceil_mode, "count_include_pad": count_include_pad, + "inputs": inputs, } verify_saturation = True if dtype == "uint8" else False @@ -255,7 +263,6 @@ def test_global_pooling(): } func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) - config = { "shape": shape, "pooling type": typef, diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index 9547aefd8803c..9364c6b1a4785 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -20,6 +20,7 @@ import tvm from tvm import relay +from tvm import testing from .infrastructure import ( skip_runtime_test, @@ -77,7 +78,7 @@ def test_reshape(): ]: inputs = {"a": tvm.nd.array(np.random.uniform(low, high, (1, 1, 1, 1000)).astype(dtype))} - for new_shape in [(1, 1000), (10, 10, 10)]: + for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1), (1, 1000, 1)]: outputs = [] func = _get_model(inputs["a"].shape, new_shape, dtype, iter(inputs)) for acl in [False, True]: @@ -98,7 +99,7 @@ def test_codegen_reshape(): shape = (1, 1, 1, 1000) inputs = {"a"} for dtype in ["float32", "uint8"]: - for new_shape in [(1, 1000), (10, 10, 10)]: + for new_shape in [(1, 1000), (10, 10, 10), (10, 100, 1)]: args = (shape, new_shape, dtype) func = _get_model(*args, iter(inputs)) exp_codegen = _get_expected_codegen(*args)