From a854d61ac7bd0bb9870cfa92ca79a119f3b79ae0 Mon Sep 17 00:00:00 2001
From: Manupa Karunaratne
Date: Fri, 28 Jan 2022 16:39:47 +0000
Subject: [PATCH] [microNPU] enable USMP

* Fixed unit tests
* Added a guard to USMP buffer info extraction to ignore non-global allocates
* Fixed export_model_library_format to key entries by the target kind's device type

Change-Id: I9c6c90d8787c39697fca24af299f8309f40d3743
---
 python/tvm/micro/model_library_format.py      | 99 ++++++++++---------
 src/tir/usmp/analysis/extract_buffer_info.cc  |  6 +-
 tests/python/contrib/test_ethosu/infra.py     | 10 +-
 .../contrib/test_ethosu/test_networks.py      |  5 +-
 .../test_ethosu/test_tir_to_cs_translator.py  | 58 ++++++++---
 5 files changed, 107 insertions(+), 71 deletions(-)

diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index a14c20957765f..5ec3e856ffbf7 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -177,37 +177,26 @@ def _build_function_memory_map(function_metadata):
     """
     device_max_workspace = dict()
     main_func_metadata = function_metadata[MAIN_FUNC_NAME_STR]
-    main_targets = dict(main_func_metadata.workspace_sizes).keys()
-    from tvm.driver import tvmc  # pylint: disable=import-outside-toplevel
-
-    external_codegens = tvmc.composite_target.get_codegen_names()
-
     func_entries = []
     target_local_entries = dict()
-    for main_target in main_targets:
-        device_max_workspace[main_target] = 0
-        for func_name, finfo in function_metadata.items():
-            if func_name == MAIN_FUNC_NAME_STR:
-                continue
-            target_local_entries[func_name] = list()
-        for func_name, finfo in function_metadata.items():
-            # Skip a few unsupported cases:
-            # 1. The main function metadata is exported elsewhere.
-            # 2. BYOC operator implementations do not currently export useful FunctionInfo.
-            if func_name == MAIN_FUNC_NAME_STR or not finfo.tir_primfuncs:
-                continue
-            if main_target in finfo.workspace_sizes.keys():
-                workspace_size = finfo.workspace_sizes[main_target]
-                target_entry = {
-                    "device": int(main_target.kind.device_type),
-                    "workspace_size_bytes": int(workspace_size),
-                }
-                target_local_entries[func_name].append(target_entry)
-                if workspace_size > device_max_workspace.get(main_target, 0):
-                    device_max_workspace[main_target] = workspace_size
-                # TODO(Mousius) - Remove this massive hack when Targets are unified
-                if main_target.kind.name in external_codegens:
-                    device_max_workspace[main_target] += int(workspace_size)
+
+    for func_name, finfo in function_metadata.items():
+        # Skip a few unsupported cases:
+        # 1. The main function metadata is exported elsewhere.
+        # 2. BYOC operator implementations do not currently export useful FunctionInfo.
+        if func_name == MAIN_FUNC_NAME_STR or not finfo.tir_primfuncs:
+            continue
+        if func_name not in target_local_entries.keys():
+            target_local_entries[func_name] = list()
+        for target in dict(finfo.workspace_sizes).keys():
+            workspace_size = finfo.workspace_sizes[target]
+            target_entry = {
+                "device": int(target.kind.device_type),
+                "workspace_size_bytes": int(workspace_size),
+            }
+            target_local_entries[func_name].append(target_entry)
+            if workspace_size >= device_max_workspace.get(int(target.kind.device_type), 0):
+                device_max_workspace[int(target.kind.device_type)] = workspace_size
 
     for func_name, target_entries_ in target_local_entries.items():
         func_entry = {
@@ -216,32 +205,46 @@ def _build_function_memory_map(function_metadata):
         }
         func_entries.append(func_entry)
 
-    target_main_entries = list()
-    for main_target in main_targets:
-        main_func_local_workspace = main_func_metadata.workspace_sizes[main_target]
-        main_func_constants = (
-            main_func_metadata.constant_sizes[main_target]
-            if main_target in main_func_metadata.constant_sizes.keys()
-            else 0
+    target_main_entries = dict()
+
+    def _create_empty_entry(target_device_type):
+        return {
+            "device": int(target_device_type),
+            "workspace_size_bytes": 0,
+            "constants_size_bytes": 0,
+            "io_size_bytes": 0,
+        }
+
+    for target in dict(main_func_metadata.workspace_sizes).keys():
+        main_func_local_workspace = main_func_metadata.workspace_sizes[target]
+        target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
+            int(target.kind.device_type)
         )
-        main_func_io = (
-            main_func_metadata.io_sizes[main_target]
-            if main_target in main_func_metadata.io_sizes.keys()
-            else 0
+        target_main_entries[int(target.kind.device_type)]["workspace_size_bytes"] = int(
+            device_max_workspace[int(target.kind.device_type)]
+        ) + int(main_func_local_workspace)
+
+    for target in dict(main_func_metadata.constant_sizes).keys():
+        if int(target.kind.device_type) not in target_main_entries.keys():
+            target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
+                int(target.kind.device_type)
+            )
+        target_main_entries[int(target.kind.device_type)]["constants_size_bytes"] = int(
+            main_func_metadata.constant_sizes[target]
        )
-        target_main_entries.append(
-            {
-                "device": int(main_target.kind.device_type),
-                "workspace_size_bytes": int(device_max_workspace[main_target])
-                + int(main_func_local_workspace),
-                "constants_size_bytes": int(main_func_constants),
-                "io_size_bytes": int(main_func_io),
-            }
+
+    for target in dict(main_func_metadata.io_sizes).keys():
+        if int(target.kind.device_type) not in target_main_entries.keys():
+            target_main_entries[int(target.kind.device_type)] = _create_empty_entry(
+                int(target.kind.device_type)
+            )
+        target_main_entries[int(target.kind.device_type)]["io_size_bytes"] = int(
+            main_func_metadata.io_sizes[target]
        )
 
     ret = {
         "operator_functions": func_entries,
-        "main": target_main_entries,
+        "main": list(target_main_entries.values()),
     }
     return ret
 
diff --git a/src/tir/usmp/analysis/extract_buffer_info.cc b/src/tir/usmp/analysis/extract_buffer_info.cc
index 5bd5fa700ae22..bb959647c7f04 100644
--- a/src/tir/usmp/analysis/extract_buffer_info.cc
+++ b/src/tir/usmp/analysis/extract_buffer_info.cc
@@ -36,6 +36,8 @@
 
 #include 
 
+#include "../../../runtime/thread_storage_scope.h"
+
 namespace tvm {
 namespace tir {
 namespace usmp {
@@ -257,14 +259,14 @@ void BufferInfoExtractor::RecordAllocateNodeInfo(const AllocateNode* op) {
 
 void BufferInfoExtractor::VisitStmt_(const AllocateNode* op) {
   ScopeInfo& current_scope_info = scope_stack_.top();
   const auto& type = Downcast<PointerType>(op->buffer_var->type_annotation);
-  const auto& storage_scope = type->storage_scope;
+  const auto& storage_scope = runtime::StorageScope::Create(type->storage_scope);
 
   // If the allocate is in a for loop, USMP currently only looks at serial for loops.
   // If its not a serial for loop, then memory planner will omit them in the current memory planning
   // process leaving them to as tir.allocate nodes for codegen. Additionally, the USMP can only work
   // with buffers that have global storage_scope
-  if (storage_scope == "global") {
+  if (storage_scope.rank == runtime::StorageRank::kGlobal) {
     if (!current_scope_info.for_loop.defined()) {
       RecordAllocateNodeInfo(op);
     } else if (current_scope_info.for_loop.defined() &&
diff --git a/tests/python/contrib/test_ethosu/infra.py b/tests/python/contrib/test_ethosu/infra.py
index 50f0174f2b0bd..81c1e00708610 100644
--- a/tests/python/contrib/test_ethosu/infra.py
+++ b/tests/python/contrib/test_ethosu/infra.py
@@ -189,7 +189,7 @@ def deserialize_command_stream(blob):
     return cmms
 
 
-def create_test_runner(accel="ethos-u55-256"):
+def create_test_runner(accel="ethos-u55-256", enable_usmp=True):
     file_dir = os.path.dirname(os.path.abspath(__file__))
     test_root = os.path.join(file_dir, "reference_system")
     _, ethosu_variant, ethosu_macs = accel.split("-")
@@ -215,13 +215,15 @@ def create_test_runner(accel="ethos-u55-256"):
             "relay.ext.ethos-u.options": {
                 "accelerator_config": accel,
             },
-            "tir.usmp.enable": True,
+            "tir.usmp.enable": enable_usmp,
         },
     )
 
 
-def build_source(module, inputs, outputs, accel="ethos-u55-256", output_tolerance=0):
-    test_runner = create_test_runner(accel)
+def build_source(
+    module, inputs, outputs, accel="ethos-u55-256", output_tolerance=0, enable_usmp=True
+):
+    test_runner = create_test_runner(accel, enable_usmp)
     return compile_models(
         models=AOTTestModel(
             module=module,
diff --git a/tests/python/contrib/test_ethosu/test_networks.py b/tests/python/contrib/test_ethosu/test_networks.py
index a0acf12439503..9442f7419711c 100644
--- a/tests/python/contrib/test_ethosu/test_networks.py
+++ b/tests/python/contrib/test_ethosu/test_networks.py
@@ -37,7 +37,8 @@
 ACCEL_TYPES = ["ethos-u55-256", "ethos-u55-128", "ethos-u55-64", "ethos-u55-32"]
 
 
-def test_forward_mobilenet_v1(accel_type="ethos-u55-256"):
+@pytest.mark.parametrize("enable_usmp", [True, False])
+def test_forward_mobilenet_v1(enable_usmp, accel_type="ethos-u55-256"):
     """Test the Mobilenet V1 TF Lite model."""
     np.random.seed(23)
     tflite_model_file = tf_testing.get_workload_official(
@@ -59,7 +60,7 @@ def test_forward_mobilenet_v1(accel_type="ethos-u55-256"):
     mod = partition_for_ethosu(relay_mod, params)
 
     compiled_models = infra.build_source(
-        mod, input_data, output_data, accel_type, output_tolerance=10
+        mod, input_data, output_data, accel_type, output_tolerance=10, enable_usmp=enable_usmp
     )
 
     infra.verify_source(compiled_models, accel_type)
diff --git a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
index add8021083c64..de214888be6b2 100644
--- a/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
+++ b/tests/python/contrib/test_ethosu/test_tir_to_cs_translator.py
@@ -645,7 +645,7 @@ def populate_ethosu_copy_calls(stmt):
             {
                 "src": "placeholder_5",
                 "dest": "placeholder_d_global",
-                "length": 8,
+                "length": 32,
             },
         ],
     },
@@ -851,24 +851,45 @@ def _check_buffer(address, region, length, buffer_var):
                 length, dtype=buffer_dtype
             )
         elif buffer_type == tir_to_cs_translator.BufferType.scratch:
-            shape = list(buffer_info[buffer_var].shape)
-            assert length == np.prod(shape)
-            assert address < scratch_size
+            assert address < tvmbaw_workspace_size
 
-            size_in_bytes = int(np.prod(shape)) * dtype_bytes
+            size_in_bytes = allocate_node_sizes[buffer_var]
             # Every buffer is adjusted to align to 16 bytes
             size_in_bytes = util.round_up(size_in_bytes, 16)
-            assert address + size_in_bytes <= scratch_size
+            assert address + size_in_bytes <= tvmbaw_workspace_size
             # The scratch area should not be used by any other buffer
-            assert not scratch_mask[address : address + size_in_bytes].any()
+            assert not tvmbaw_workspace_mask[address : address + size_in_bytes].any()
             # The scratch area is marked as used
-            scratch_mask[address : address + size_in_bytes] = np.ones(size_in_bytes, dtype="uint8")
+            tvmbaw_workspace_mask[address : address + size_in_bytes] = np.ones(
+                size_in_bytes, dtype="uint8"
+            )
         elif buffer_type == tir_to_cs_translator.BufferType.input:
             assert address == 0
         else:
             assert buffer_type == tir_to_cs_translator.BufferType.output
             assert address == 0
 
+    def _get_allocate_node_sizes(mod):
+        # There should only be a single function
+        assert len(mod.functions.items()) == 1
+        primfunc = mod.functions.items()[0][1]
+        _allocate_node_sizes = dict()
+
+        def analyze_remaining_allocates(stmt):
+            if isinstance(stmt, tvm.tir.stmt.Allocate):
+                allocate = stmt
+                pointer_type = allocate.buffer_var.type_annotation
+                storage_scope = pointer_type.storage_scope
+                if storage_scope == "global":
+                    dtype_bytes = np.iinfo(np.dtype(allocate.dtype)).bits // 8
+                    size_in_bytes = int(dtype_bytes * np.prod(list(allocate.extents)))
+                    # Every memory address the NPU accesses has to be 16 byte aligned
+                    size_in_bytes = util.round_up(size_in_bytes, 16)
+                    _allocate_node_sizes[allocate.buffer_var] = size_in_bytes
+
+        tvm.tir.stmt_functor.post_order_visit(primfunc.body, analyze_remaining_allocates)
+        return _allocate_node_sizes
+
     def verify(npu_ops):
         """This wrapper verifies the allocated addresses matches with original tir buffers"""
         checked_buffers = set()
@@ -933,22 +954,29 @@ def check_buffer(address, region, length, buffer_var):
         tir_mod = test_case["tir_module"]
         tir_mod["main"] = tir_mod["main"].with_attr("target", tvm.target.Target("ethos-u"))
         tir_mod = tvm.tir.transform.MakeUnpackedAPI()(tir_mod)
+        candidate_regions_for_scratch = [5, 2, 1]
+        (
+            scratch_region_map,
+            tvmbaw_workspace_size,
+            _,
+        ) = tir_to_cs_translator.analyze_scratch_memory_acesses(
+            tir_mod, candidate_regions_for_scratch
+        )
+        allocate_node_sizes = _get_allocate_node_sizes(tir_mod)
         buffer_info = tir_to_cs_translator.extract_buffer_info(tir_mod, test_case["param_dict"])
         extern_calls = extract_call_extern_list(tir_mod)
         _npu_ops = list()
         for extern_call in extern_calls:
             _npu_ops.append(tir_to_cs_translator.translate_ethosu_tir_call_extern(extern_call))
         npu_op_tir_buffers = collect_tir_buffer_info(_npu_ops)
-        (
-            _npu_ops,
-            constant_hex_string,
-            scratch_size,
-        ) = tir_to_cs_translator.assign_addresses(buffer_info, _npu_ops)
-        scratch_mask = np.zeros(scratch_size, dtype="uint8")
+        (_npu_ops, constant_hex_string) = tir_to_cs_translator.assign_addresses(
+            buffer_info, _npu_ops, scratch_region_map
+        )
+        tvmbaw_workspace_mask = np.zeros(tvmbaw_workspace_size, dtype="uint8")
         constant_tensor_read_mask = np.zeros(len(constant_hex_string) // 2, dtype="uint8")
         verify(_npu_ops)
         # This will be only 1 if all allocated scratch is used.
-        assert np.prod(scratch_mask) == 1
+        assert np.prod(tvmbaw_workspace_mask) == 1
         # This will be only 1 if all constant tensors is read at least once.
         assert np.prod(constant_tensor_read_mask) == 1
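
For reviewers who want to exercise the flag outside the test infrastructure, below is a minimal sketch of how the "tir.usmp.enable" option threaded through create_test_runner could be toggled in a standalone build. It assumes a Relay module relay_mod and params already imported from a TFLite network (as in test_networks.py); the accelerator config, target, executor and runtime options shown are illustrative defaults, not part of this patch.

    # Sketch only; relay_mod and params are assumed inputs.
    import tvm
    from tvm import relay
    from tvm.relay.backend import Executor, Runtime
    from tvm.relay.op.contrib.ethosu import partition_for_ethosu

    def build_for_ethosu(relay_mod, params, enable_usmp=True):
        # Partition the graph for the microNPU, then build with USMP on or off.
        mod = partition_for_ethosu(relay_mod, params)
        pass_config = {
            "tir.disable_vectorize": True,
            "relay.ext.ethos-u.options": {"accelerator_config": "ethos-u55-256"},
            "tir.usmp.enable": enable_usmp,  # the flag this patch makes configurable
        }
        with tvm.transform.PassContext(opt_level=3, config=pass_config):
            return relay.build(
                mod,
                target="c",
                executor=Executor("aot", {"interface-api": "c", "unpacked-api": True}),
                runtime=Runtime("crt"),
                params=params,
            )

This mirrors the pass_config that create_test_runner builds in infra.py; passing enable_usmp=False falls back to the previous non-USMP memory planning path.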
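
The main behavioural change in _build_function_memory_map is that workspace maxima are now keyed by the integer device_type of each target rather than by the Target object itself, and the tvmc external-codegen workaround is dropped. A small illustration of that aggregation rule, using plain dicts in place of FunctionInfo and a made-up device_type value:

    # Illustration only, not a TVM API: per-device workspace maxima.
    def max_workspace_by_device(workspace_sizes_per_func):
        # workspace_sizes_per_func: {func_name: [(device_type, workspace_bytes), ...]}
        device_max_workspace = {}
        for _func_name, entries in workspace_sizes_per_func.items():
            for device_type, workspace_bytes in entries:
                if workspace_bytes >= device_max_workspace.get(device_type, 0):
                    device_max_workspace[device_type] = workspace_bytes
        return device_max_workspace

    # Two operator functions on the same (hypothetical) device 13 keep only the larger workspace.
    assert max_workspace_by_device({"op_a": [(13, 1024)], "op_b": [(13, 4096)]}) == {13: 4096}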
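
The new _get_allocate_node_sizes helper in test_tir_to_cs_translator.py sizes each remaining global tir.allocate as element size in bytes times the extents, rounded up to the NPU's 16-byte alignment. The same arithmetic written out without the ethosu util helper (the dtype and extents are example values):

    import numpy as np

    def aligned_allocate_size(dtype, extents, alignment=16):
        # Bytes per element (integer dtypes, as in the test helper) times the extents,
        # rounded up to the next multiple of the alignment.
        dtype_bytes = np.iinfo(np.dtype(dtype)).bits // 8
        size_in_bytes = int(dtype_bytes * np.prod(extents))
        return ((size_in_bytes + alignment - 1) // alignment) * alignment

    assert aligned_allocate_size("int8", [1, 8, 8, 3]) == 192  # 192 bytes is already 16-aligned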