diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 9308e396b2a5..5619d036e283 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -58,7 +58,7 @@ def __init__(
         remote_kw: dict,
         session_name: str = "hexagon-rpc",
         remote_stack_size_bytes: int = 256 * 1024,  # Min size for main thread in QuRT/sim
-        rpc_receive_buffer_size_bytes: int = 5 * 1024 * 1024,  # Size for passing hexagon tests
+        rpc_receive_buffer_size_bytes: int = 256 * 1024 * 1024,  # Size for passing hexagon tests
     ):
         self._launcher = launcher
         self._session_name: str = session_name
diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index 0fc71d8ac29c..7e12d8cb4692 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -62,7 +62,7 @@ struct VTCMAllocation : public Allocation {
     // allocate nbytes of vtcm on a single page
     HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info,
                                                           /*vtcm_size = */ nbytes,
-                                                          /*b_single_page = */ 1));
+                                                          /*b_single_page = */ 0));
 
     // TODO(HWE): Investigate why a non-zero timeout results in
     // hanging, both in the simulator and on hardware.
@@ -71,13 +71,14 @@ struct VTCMAllocation : public Allocation {
     if (context_id_) {
       data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info);
       if (!data_) {
-        LOG(ERROR) << "ERROR: Allocated VTCM ptr is null.";
+        LOG(ERROR) << "ERROR: HAP_compute_res_acquire returned nullptr when allocating VTCM.";
         HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
         return;
       }
     } else {
-      LOG(ERROR) << "ERROR: Unable to acquire requeisted resource.";
-      return;
+      LOG(FATAL) << "FATAL: HAP_compute_res_acquire failed to acquire requested VTCM resource.";
+      throw std::runtime_error(
+          "HAP_compute_res_acquire failed to acquire requested VTCM resource.");
     }
   }
   ~VTCMAllocation() {
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index f22afca10bfa..343933acff71 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -165,6 +165,17 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void
   memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
 }
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      DLTensor* dst = args[0];
+      DLTensor* src = args[1];
+      int size = args[2];
+
+      hexagon_user_dma_1d_sync(dst->data, src->data, size);
+
+      *rv = static_cast<int>(0);
+    });
+
 TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
   void* dst = args[0];
   void* src = args[1];
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
new file mode 100644
index 000000000000..c9ff07c490c8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -0,0 +1,537 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test different strategies for loading data into vtcm before running HVX workloads. """ + +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n -Vectorized: {} Gops\n -Vectorized and Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n -Preallocated, Vectorized, and Parallelized: {} Gops\n -Single DMA: {} Gops\n -Preloaded: {} Gops\n" + + +def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + sch.parallel(vbo) + return sch + + +def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + return sch + + +def apply_vrmpy_parallelization(sch): + block = sch.get_block("C") + b = sch.get_loops(block) + bo, _ = sch.split(b[0], factors=[4, None]) + sch.parallel(bo) + return sch + + +def apply_vtcm_cache_read_write(sch): + block = sch.get_block("C") + sch.cache_read(block, 0, "global.vtcm") + sch.cache_read(block, 1, "global.vtcm") + sch.cache_write(block, 0, "global.vtcm") + return sch + + +def vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preloaded_vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer( + a, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + B = T.match_buffer( + b, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + C = T.match_buffer( + c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + 
T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preallocated_vrmpy(operations): + size = operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n, i in T.grid(operations, 128): + with T.block("A_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + A_global_vtcm[vn * 128 + vi] = A[vn, vi] + for n, i in T.grid(operations, 128): + with T.block("B_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + B_global_vtcm[vn * 128 + vi] = B[vn, vi] + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + for n, i in T.grid(operations, 32): + with T.block("C_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + C[vn, vi] = C_global_vtcm[vn * 32 + vi] + + return operator + + +def preallocated_single_dma_vrmpy(operations): + size = operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, + b: T.handle, + c: T.handle, + a_v: T.handle, + b_v: T.handle, + c_v: T.handle, + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + A_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + A.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + B_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + B_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + B.data, + T.tvm_stack_make_shape(size, 
dtype="handle"), + 0, + 1, + B.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + C.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + C_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + + return operator + + +def evaluate_result(operations, tag, time, result, expected_output): + transfer_mb = round(3 * operations * 128 / 1e6, 2) + gops = round(operations * 128 * 3 / time.mean / 1e9, 3) + mean_ms = round(time.mean * 1000, 6) + + print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops)) + tvm.testing.assert_allclose(result, expected_output) + + +def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope) + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + +def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_vtcm = np.zeros((a.size), dtype="uint8") + b_vtcm = np.zeros((b.size), dtype="uint8") + c_vtcm = np.zeros((c.size), dtype="int32") + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope="global") + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + b_vtcm_hexagon = tvm.runtime.ndarray.array( + b_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + c_vtcm_hexagon = tvm.runtime.ndarray.array( + c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + 
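For reference, the expected_output fixture below models the vrmpy semantics (each int32 output lane accumulates four uint8*uint8 products) with explicit Python loops. A minimal vectorized NumPy sketch of the same reference, assuming (N, 128) uint8 inputs (vrmpy_reference is an illustrative helper, not part of this patch):

import numpy as np


def vrmpy_reference(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Illustrative NumPy model of llvm.hexagon.V6.vrmpyubv.128B on (N, 128) uint8 inputs."""
    # Group the 128 bytes of each row into 32 output lanes of 4 bytes each.
    prod = a.reshape(-1, 32, 4).astype(np.uint32) * b.reshape(-1, 32, 4).astype(np.uint32)
    # Each int32 output lane accumulates its four u8*u8 products.
    return prod.sum(axis=2).astype(np.int32)  # shape (N, 32), comparable to expected_output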
+@tvm.testing.fixture +def input_a(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_b(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_c(operations): + return np.zeros((operations, 32), dtype="int32") + + +@tvm.testing.fixture +def expected_output(operations, input_a, input_b, input_c): + expected_output = np.zeros(input_c.shape, dtype="int32") + for n in range(operations): + for i in range(32): + for r in range(4): + expected_output[n, i] = expected_output[n, i] + np.uint32( + input_a[n, i * 4 + r] + ) * np.uint32(input_b[n, i * 4 + r]) + return expected_output + + +class TestMatMulVec: + + operations = tvm.testing.parameter( + 1024, + 2048, + 4096, + 5 * 2048, # 3.93MB of total transfer + # 16384, #Only works on 8Gen1 HDK's + # 5 * 4096, # 7.86MB of total transfer. Only works on 8Gen1 HDK's + ) + + # Experimentally best configurations for the memcopy + outer_split = tvm.testing.parameter(4) + unroll_split = tvm.testing.parameter(8) + vector_split = tvm.testing.parameter(64) + c_vector_split = tvm.testing.parameter(16) + c_vector_split_unallocated = tvm.testing.parameter(8) + + @tvm.testing.requires_hexagon + def test_loading_vtcm_for_vrmpy( + self, + hexagon_session, + operations, + input_a, + input_b, + input_c, + expected_output, + outer_split, + unroll_split, + vector_split, + c_vector_split, + c_vector_split_unallocated, + ): + + # Run parallel vrmpy without loading to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + base_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with basic memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + basic_load_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated + ) + vectorized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized and parallelized memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("C_global.vtcm")], + outer_split, + unroll_split, + c_vector_split_unallocated, + ) + vectorized_parallelized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated and vectorized memory loads to VTCM. 
+ sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split + ) + preallocated_vectorized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated, vectorized, and parallelized memory loads to VTCM. + sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split + ) + preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated single dma memory load to VTCM. + sch = tvm.tir.Schedule(preallocated_single_dma_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + single_dma_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with data preloaded in VTCM. + sch = tvm.tir.Schedule(preloaded_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + input_a = input_a.reshape(operations * 128) + input_b = input_b.reshape(operations * 128) + input_c = input_c.reshape(operations * 32) + preloaded_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations, "global.vtcm" + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + transfer_mb = round(3 * operations * 128 / 1e6, 2) + print( + TEST_OUTPUT_TEMPLATE.format( + transfer_mb, + base_runtime, + basic_load_runtime, + vectorized_runtime, + vectorized_parallelized_runtime, + preallocated_vectorized_runtime, + preallocated_vectorized_parallelized_runtime, + single_dma_runtime, + preloaded_runtime, + ) + ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py new file mode 100644 index 000000000000..6db8b9101997 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test theoretical bandwidth for data transfers to VTCM for different strategies."""
+
+import numpy as np
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+MB = 1024**2
+KB = 1024
+TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n -Base: {} GBps \n -Vectorized: {} GBps\n -Vectorized and Parallelized: {} GBps\n -Single DMA Copy: {} GBps\n"
+
+
+def memcopy_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        for ax0 in T.serial(size):
+            with T.block("A_global.vtcm"):
+                v0 = T.axis.spatial(size, ax0)
+                T.reads(A[v0])
+                T.writes(A_global_vtcm[v0])
+                A_global_vtcm[v0] = A[v0]
+
+    return operator
+
+
+def single_dma_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    A_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    A.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+
+    return operator
+
+
+def evaluate(hexagon_session, sch, size):
+    a_shape = size
+
+    target_hexagon = tvm.target.hexagon("v69")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    rng = default_rng()
+    a = rng.integers(-128, 127, a_shape, dtype="int8")
+    a_vtcm = np.zeros(a_shape, dtype="int8")
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global")
+    a_vtcm_hexagon = tvm.runtime.ndarray.array(
+        a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+
+    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    runtime = timer(a_hexagon, a_vtcm_hexagon)
+
+    gbps = round((size / 2**30) / runtime.mean, 4)
+    tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a)
+
+    return gbps
+
+
+class TestVTCMBandwidth:
+
+    size = tvm.testing.parameter(
+        10 * KB,
+        20 * KB,
+        40 * KB,
+        80 * KB,
+        160 * KB,
+        320 * KB,
+        640 * KB,
+        MB,
+        2 * MB,
+        3 * MB,
+        4 * MB,
+        # 8 * MB,  # Only works on 8gen1 HDKs
+    )
+
+    outer_split = tvm.testing.parameter(4)
+    unroll_split = tvm.testing.parameter(2)
+    vector_split = tvm.testing.parameter(128)
+
+    @tvm.testing.requires_hexagon
+    def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split):
+
+        # Run the base memcopy operator.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        base_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split])
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        vectorize_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling and parallelization.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbo_a, vbi_a, vio_a, vii_a = sch.split(
+            vb[0], factors=[outer_split, None, unroll_split, vector_split]
+        )
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        sch.parallel(vbo_a)
+        parallel_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run using a single DMA copy to transfer the data.
+        sch = tvm.tir.Schedule(single_dma_operator(size))
+        single_dma_gbps = evaluate(hexagon_session, sch, size)
+
+        mbs = round(size / MB, 2)
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                mbs, base_gbps, vectorize_gbps, parallel_gbps, single_dma_gbps
+            )
+        )
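For orientation, a worked example of the GBps figure that evaluate() reports, using a hypothetical mean runtime rather than a measured one:

MB = 1024**2                       # same constant as defined in this test file
size = 4 * MB                      # bytes copied into VTCM
mean_runtime_s = 1e-3              # hypothetical time_evaluator mean, in seconds
gbps = round((size / 2**30) / mean_runtime_s, 4)
print(gbps)                        # about 3.9 GBps for these example values

The formula counts each byte of the buffer once, matching how evaluate() computes gbps from the buffer size and the mean runtime.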