[BYOC] [ACL] 20.05 memory corruption temporary fix
This fix is intended to prevent execution of operations via the ACL runtime
when their arguments require memory padding. The fix is temporary, targets
ACL 20.05, and should be removed after the migration to ACL 20.11.
d-smirnov committed Oct 22, 2020
1 parent f65e320 commit 701b42b
Showing 10 changed files with 178 additions and 44 deletions.
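
The guard added in this commit refuses to offload an operator to ACL when any of its arguments, or its result, has an innermost dimension that does not fill whole 128-bit (16-byte) NEON vectors, since such tensors require ACL-side padding in 20.05. A minimal sketch of that criterion, for illustration only and not part of the diff below (the helper name needs_padding is made up; the helper the commit actually adds is padding_required):

    # Illustrative sketch only -- mirrors the byte-alignment check introduced in
    # python/tvm/relay/op/contrib/arm_compute_lib.py below; not part of the commit.
    import numpy as np

    def needs_padding(shape, dtype):
        """True if the innermost dimension does not fill whole 16-byte NEON vectors."""
        if len(shape) == 0:
            return False
        return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0

    print(needs_padding((1, 14, 14, 512), "float32"))  # False: 512 * 4 = 2048 bytes
    print(needs_padding((11, 2), "float32"))           # True:  2 * 4 = 8 bytes
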
16 changes: 15 additions & 1 deletion include/tvm/relay/op_attr_types.h
@@ -176,7 +176,7 @@ using FTVMLegalize = runtime::TypedPackedFunc<Expr(const Attrs& attrs, const Arr
* \brief Annotates an expression to indicate if an op should be compiled using
* the given compiler/target.
*
* \param attrs The attribute of the original expr.
* \param args The arguments of the original expr.
*
* \return true if this op should be registered to invoke a specific compiler
@@ -185,6 +185,20 @@ using FTVMLegalize = runtime::TypedPackedFunc<Expr(const Attrs& attrs, const Arr
using FTVMAnnotateTarget = runtime::TypedPackedFunc<bool(const Attrs& attrs, // NOLINT(*)
const Array<Expr>& args)>;

/*!
* \brief Annotates an expression to indicate if an op should be compiled using
* the given compiler/target.
* \param attrs The attribute of the original expr.
* \param args The arguments of the original expr.
* \param out_type The return type of the original expr.
*
* \return true if this op should be registered to invoke a specific compiler
* for codegen, otherwise, false.
*/
using FTVMAnnotateTarget3 =
runtime::TypedPackedFunc<bool(const Attrs& attrs, // NOLINT(*)
const Array<Expr>& args, const Type& out_type)>;

/*!
* \brief Forward rewriting rule for a specific op.
*
72 changes: 55 additions & 17 deletions python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -17,6 +17,8 @@
# pylint: disable=invalid-name, unused-argument
"""Arm Compute Library supported operators."""
import tvm
import numpy as np

from tvm.relay.expr import const
from tvm.relay import transform
from tvm.relay.build_module import bind_params_by_name
@@ -183,7 +185,7 @@ def check_dense(extract):
call = extract
while call.op.name != "nn.dense":
call = call.args[0]
return dense(call.attrs, call.args)
return dense(call.attrs, call.args, call.checked_type)

def check_qnn_dense(extract):
"""Check qnn conv pattern is supported by ACL."""
@@ -192,7 +194,7 @@ def check_qnn_dense(extract):
call = extract
while call.op.name != "qnn.dense":
call = call.args[0]
return qnn_dense(call.attrs, call.args)
return qnn_dense(call.attrs, call.args, call.checked_type)

def check_avg_pool2d(extract):
"""Check average pool2d pattern is supported by ACL."""
@@ -201,12 +203,12 @@ def check_avg_pool2d(extract):
pool = extract.args[0]
if pool.args[0].attrs.dtype != "int32":
return False
return avg_pool2d(pool.attrs, pool.args, from_quantized_composite=True)
return avg_pool2d(pool.attrs, pool.args, pool.checked_type, from_quantized_composite=True)

def check_l2_pool2d(extract):
"""Check l2 pool2d pattern is supported by ACL."""
pool = extract.args[0]
return avg_pool2d(pool.attrs, pool.args)
return avg_pool2d(pool.attrs, pool.args, pool.checked_type)

return [
("arm_compute_lib.conv2d", conv_pattern(), check_conv),
@@ -227,9 +229,10 @@ def _func_wrapper(attrs, args):
return _func_wrapper


# Reshape does not need a padding check in 20.05
_register_external_op_helper("reshape")


# conv2d does not need a padding check in 20.05
@tvm.ir.register_op_attr("nn.conv2d", "target.arm_compute_lib")
def conv2d(attrs, args):
"""Check if the external ACL codegen for conv2d should be used."""
@@ -248,6 +251,7 @@ def conv2d(attrs, args):
return True


# conv2d does not need a padding check in 20.05
def qnn_conv2d(attrs, args):
"""Check if the external ACL codegen for qnn.conv2d should be used."""
if attrs.groups != 1:
@@ -266,7 +270,7 @@ def qnn_conv2d(attrs, args):


@tvm.ir.register_op_attr("nn.dense", "target.arm_compute_lib")
def dense(attrs, args):
def dense(attrs, args, out_type):
"""Check if the external ACL codegen for dense should be used."""
data_typ = args[0].checked_type
if data_typ.dtype != "float32":
@@ -276,10 +280,10 @@ def dense(attrs, args):
return False
if attrs.out_dtype != "float32" and attrs.out_dtype != "":
return False
return True
return not padding_required([*args, out_type])


def qnn_dense(attrs, args):
def qnn_dense(attrs, args, out_type):
"""Check if the external ACL codegen for qnn.dense should be used."""
data_typ = args[0].checked_type
if data_typ.dtype != "uint8":
@@ -289,24 +293,57 @@ def qnn_dense(attrs, args):
return False
if attrs.out_dtype != "int32":
return False
return True

return not padding_required([*args, out_type])


@tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib")
def max_pool2d(attrs, args):
def max_pool2d(attrs, args, out_type):
"""Check if the external ACL codegen for maxpool2d should be used."""
if attrs.layout != "NHWC":
return False
typ = args[0].checked_type
if typ.dtype not in ["float32", "uint8"]:
return False
return True
return not padding_required([*args, out_type])


def padding_required(inputs):
    """Checks whether the supplied data will require padding.
    Most ACL operators up to version 20.11 use padded data.
    """

    def _check(shape, dtype):
        """NEON has 128 bits (16 bytes) per vector."""
        if len(shape) == 0:
            return False
        return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0

    def _padding_required():
        for i in inputs:
            if isinstance(i, (tvm.relay.expr.Var, tvm.relay.expr.Call)):
                if _check(i.checked_type.shape, i.checked_type.dtype):
                    return True
            elif isinstance(i, tvm.relay.expr.Constant):
                if _check(i.data.shape, i.data.dtype):
                    return True
            elif isinstance(i, tvm.ir.tensor_type.TensorType):
                if _check(i.shape, i.dtype):
                    return True
            else:
                raise Exception("Not supported")

        return False

    result = _padding_required()
    return result


@tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib")
def avg_pool2d(attrs, args, from_quantized_composite=False):
def avg_pool2d(attrs, args, out_type, from_quantized_composite=False):
"""Check if the external ACL codegen for avgpool2d should be used."""
typ = args[0].checked_type

if from_quantized_composite:
if typ.dtype != "int32":
return False
@@ -315,29 +352,30 @@ def avg_pool2d(attrs, args, from_quantized_composite=False):
return False
if attrs.layout != "NHWC":
return False
return True

return not padding_required([*args, out_type])


@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib")
def global_max_pool2d(attrs, args):
def global_max_pool2d(attrs, args, out_type):
"""Check if the external ACL codegen for gloval_maxpool2d should be used."""
typ = args[0].checked_type
if typ.dtype not in ["float32", "uint8"]:
return False
if attrs.layout != "NHWC":
return False
return True
return not padding_required([*args, out_type])


@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib")
def global_avg_pool2d(attrs, args):
def global_avg_pool2d(attrs, args, out_type):
"""Check if the external ACL codegen for global_avgpool2d should be used."""
typ = args[0].checked_type
if typ.dtype not in ["float32"]:
return False
if attrs.layout != "NHWC":
return False
return True
return not padding_required([*args, out_type])


@tvm.ir.register_op_attr("maximum", "target.arm_compute_lib")
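
For illustration (not part of the commit), a rough sketch of how the new padding_required helper classifies a dense call whose data shape is (11, 2), one of the shapes added to the tests further below; it assumes padding_required is importable from tvm.relay.op.contrib.arm_compute_lib as defined above:

    # Illustrative usage of padding_required; not part of the commit's diff.
    import numpy as np
    import tvm
    from tvm import relay
    from tvm.relay.op.contrib.arm_compute_lib import padding_required

    data = relay.var("a", shape=(11, 2), dtype="float32")
    weight = relay.const(np.random.uniform(-1, 1, (2, 2)).astype("float32"))
    mod = tvm.IRModule.from_expr(relay.nn.dense(data, weight, units=2))
    mod = relay.transform.InferType()(mod)
    call = mod["main"].body  # the nn.dense call, with checked_type populated

    # The data tensor's innermost dimension is 2 * 4 = 8 bytes, not a multiple of
    # 16, so dense() now rejects the op and it stays on the default TVM backend.
    print(padding_required([*call.args, call.checked_type]))  # True
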
13 changes: 11 additions & 2 deletions src/relay/transforms/annotate_target.cc
@@ -169,8 +169,17 @@ class AnnotateTargetRewriter : public ExprRewriter {
if (!Op::HasAttrMap("target." + std::string(target))) {
continue;
}
auto fannotate = Op::GetAttrMap<FTVMAnnotateTarget>("target." + std::string(target));
if (fannotate.count(op) && fannotate[op](pre->attrs, pre->args)) {
bool result = false;
try {
auto fannotate = Op::GetAttrMap<FTVMAnnotateTarget>("target." + std::string(target));
result = (fannotate.count(op) && fannotate[op](pre->attrs, pre->args));
} catch (...) {
auto fannotate = Op::GetAttrMap<FTVMAnnotateTarget3>("target." + std::string(target));
result =
(fannotate.count(op) && fannotate[op](pre->attrs, pre->args, pre->checked_type()));
}

if (result) {
supported_targets.push_back(target);
}
}
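
For context, the try/catch in the rewriter above exists because an annotator registered under "target.<name>" may now take either two or three arguments: the two-argument call is attempted first, and the three-argument form is used as the fallback. A hypothetical Python-side sketch of the two flavors (the target name example_codegen and the chosen ops are placeholders, not part of the commit):

    # Hypothetical annotators illustrating the two signatures the dispatch handles.
    import tvm

    @tvm.ir.register_op_attr("nn.relu", "target.example_codegen")
    def relu_checker(attrs, args):  # classic two-argument FTVMAnnotateTarget form
        return True

    @tvm.ir.register_op_attr("nn.softmax", "target.example_codegen")
    def softmax_checker(attrs, args, out_type):  # new FTVMAnnotateTarget3 form
        # out_type carries the checked return type, so output shapes can be
        # inspected as well (e.g. for the ACL padding checks above).
        return len(out_type.shape) > 0
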
1 change: 1 addition & 0 deletions src/runtime/contrib/arm_compute_lib/acl_utils.cc
@@ -44,6 +44,7 @@ arm_compute::Tensor MakeACLTensor(const JSONGraphNode& tensor_rep, void* data,
std::vector<int64_t> shape = tensor_rep.GetOpShape()[0];
DLDataType dtype = tensor_rep.GetOpDataType()[0];
arm_compute::TensorInfo info = MakeACLTensorInfo(shape, dtype, scale, offset);
info.set_is_resizable(false);
tensor.allocator()->init(info);
if (data != nullptr) {
CheckACLError(tensor.allocator()->import_memory(data));
3 changes: 2 additions & 1 deletion tests/python/contrib/test_arm_compute_lib/infrastructure.py
@@ -276,10 +276,11 @@ def verify_codegen(
module,
known_good_codegen,
num_acl_modules,
tvm_ops=0,
target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon",
):
"""Check acl codegen against a known good output."""
module = build_module(module, target)
module = build_module(module, target, tvm_ops=tvm_ops, acl_partitions=num_acl_modules)
acl_modules = extract_acl_modules(module)

assert len(acl_modules) == num_acl_modules, (
62 changes: 48 additions & 14 deletions tests/python/contrib/test_arm_compute_lib/test_dense.py
@@ -20,8 +20,8 @@

import tvm
from tvm import relay

from .infrastructure import (
from tvm import testing
from test_arm_compute_lib.infrastructure import (
Device,
skip_runtime_test,
skip_codegen_test,
@@ -185,18 +185,34 @@ def test_dense():
np.random.seed(0)

dtype = ["float32"]
shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)]
shape = [
(1, (1, 128), (16, 128), 16),
(1, (32, 32), (32, 32), 32),
(0, (1, 64), (1, 64), 1),
(0, (11, 2), (2, 2), 2),
]
composite = [False, True]
trials = generate_trials([dtype, shape, composite], 3)

for dtype, (shape, weight_shape, units), composite in trials:
for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
outputs = []
inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))}
func, params = _get_model(
shape, weight_shape, units, dtype, var_names=iter(inputs), has_bias=composite
)
for acl in [False, True]:
outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0])
outputs.append(
build_and_run(
func,
inputs,
1,
params,
device,
enable_acl=acl,
tvm_ops=(1 - acl_partitions) * (2 - int(not composite)),
acl_partitions=acl_partitions,
)[0]
)

config = {
"shape": shape,
@@ -215,18 +231,18 @@ def test_codegen_dense():
np.random.seed(0)

dtype = ["float32"]
shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)]
shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
composite = [False, True]
trials = generate_trials([dtype, shape, composite], 3)

for dtype, (shape, weight_shape, units), composite in trials:
for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
inputs = {"a"}

args = (shape, weight_shape, units, dtype)

func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite)
exp_codegen = _get_expected_codegen(*args, has_bias=composite)
verify_codegen(func, exp_codegen, 1)
verify_codegen(func, exp_codegen, acl_partitions, 1 - acl_partitions)


def test_qnn_dense():
@@ -239,11 +255,18 @@ def test_qnn_dense():
np.random.seed(0)

dtype = ["uint8"]
shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)]
shape = [
(0, (4, 4), (4, 4), 4),
(1, (16, 16), (4, 16), 4),
(1, (1, 128), (16, 128), 16),
(1, (32, 32), (32, 32), 32),
(0, (1, 64), (1, 64), 1),
]

composite = [False, True]
trials = generate_trials([dtype, shape, composite], 3)

for dtype, (shape, weight_shape, units), composite in trials:
for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
outputs = []
inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))}
input_zp = 100
@@ -270,7 +293,18 @@ )
)

for acl in [False, True]:
outputs.append(build_and_run(func, inputs, 1, params, device, enable_acl=acl)[0])
outputs.append(
build_and_run(
func,
inputs,
1,
params,
device,
tvm_ops=(1 - acl_partitions) * (3 - int(not composite)),
acl_partitions=acl_partitions,
enable_acl=acl,
)[0]
)

config = {
"shape": shape,
@@ -295,11 +329,11 @@ def test_codegen_qnn_dense():
np.random.seed(0)

dtype = ["uint8"]
shape = [((1, 128), (16, 128), 16), ((32, 32), (32, 32), 32), ((1, 64), (1, 64), 1)]
shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)]
composite = [False, True]
trials = generate_trials([dtype, shape, composite], 3)

for dtype, (shape, weight_shape, units), composite in trials:
for dtype, (acl_partitions, shape, weight_shape, units), composite in trials:
inputs = {"a"}
args = (shape, weight_shape, units, dtype)

@@ -323,7 +357,7 @@ def test_codegen_qnn_dense():
has_bias=composite,
)
exp_codegen = _get_expected_codegen(*args, has_bias=composite)
verify_codegen(func, exp_codegen, 1)
verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions)


if __name__ == "__main__":
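
A worked sketch (not from the commit) of the tvm_ops arithmetic used in the dense tests above, assuming the fp32 pattern consists of dense plus an optional bias_add and the qnn pattern additionally carries a requantize:

    # Illustrative only: when a trial cannot be partitioned for ACL
    # (acl_partitions == 0), every operator in the pattern must instead be
    # compiled by the default TVM backend, which is what tvm_ops counts.
    def expected_tvm_ops(acl_partitions, composite, quantized=False):
        ops_in_pattern = (2 if quantized else 1) + int(composite)
        return (1 - acl_partitions) * ops_in_pattern

    assert expected_tvm_ops(1, composite=True) == 0                  # fully offloaded
    assert expected_tvm_ops(0, composite=False) == 1                 # dense only
    assert expected_tvm_ops(0, composite=True) == 2                  # dense + bias_add
    assert expected_tvm_ops(0, composite=True, quantized=True) == 3  # + requantize
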
1 change: 1 addition & 0 deletions tests/python/contrib/test_arm_compute_lib/test_maximum.py
@@ -20,6 +20,7 @@

import tvm
from tvm import relay
from tvm import testing

from .infrastructure import (
skip_runtime_test,
