diff --git a/python/tvm/contrib/hexagon/session.py b/python/tvm/contrib/hexagon/session.py
index 9308e396b2a5..5619d036e283 100644
--- a/python/tvm/contrib/hexagon/session.py
+++ b/python/tvm/contrib/hexagon/session.py
@@ -58,7 +58,7 @@ def __init__(
         remote_kw: dict,
         session_name: str = "hexagon-rpc",
         remote_stack_size_bytes: int = 256 * 1024,  # Min size for main thread in QuRT/sim
-        rpc_receive_buffer_size_bytes: int = 5 * 1024 * 1024,  # Size for passing hexagon tests
+        rpc_receive_buffer_size_bytes: int = 256 * 1024 * 1024,  # Size for passing hexagon tests
     ):
         self._launcher = launcher
         self._session_name: str = session_name
diff --git a/src/runtime/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon_buffer.cc
index 0fc71d8ac29c..7e12d8cb4692 100644
--- a/src/runtime/hexagon/hexagon_buffer.cc
+++ b/src/runtime/hexagon/hexagon_buffer.cc
@@ -62,7 +62,7 @@ struct VTCMAllocation : public Allocation {
     // allocate nbytes of vtcm on a single page
     HEXAGON_SAFE_CALL(HAP_compute_res_attr_set_vtcm_param(&res_info,
                                                           /*vtcm_size = */ nbytes,
-                                                          /*b_single_page = */ 1));
+                                                          /*b_single_page = */ 0));
 
     // TODO(HWE): Investigate why a non-zero timeout results in
     // hanging, both in the simulator and on hardware.
@@ -71,13 +71,14 @@ struct VTCMAllocation : public Allocation {
     if (context_id_) {
       data_ = HAP_compute_res_attr_get_vtcm_ptr(&res_info);
       if (!data_) {
-        LOG(ERROR) << "ERROR: Allocated VTCM ptr is null.";
+        LOG(ERROR) << "ERROR: HAP_compute_res_acquire returned nullptr when allocating VTCM.";
         HEXAGON_SAFE_CALL(HAP_compute_res_release(context_id_));
         return;
       }
     } else {
-      LOG(ERROR) << "ERROR: Unable to acquire requeisted resource.";
-      return;
+      LOG(FATAL) << "FATAL: HAP_compute_res_acquire failed to acquire requested VTCM resource.";
+      throw std::runtime_error(
+          "HAP_compute_res_acquire failed to acquire requested VTCM resource.");
     }
   }
   ~VTCMAllocation() {
diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc
index f22afca10bfa..343933acff71 100644
--- a/src/runtime/hexagon/hexagon_device_api.cc
+++ b/src/runtime/hexagon/hexagon_device_api.cc
@@ -165,6 +165,17 @@ void HexagonDeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void
   memcpy(static_cast<char*>(to) + to_offset, static_cast<const char*>(from) + from_offset, size);
 }
 
+TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy_DLTensor")
+    .set_body([](TVMArgs args, TVMRetValue* rv) {
+      DLTensor* dst = args[0];
+      DLTensor* src = args[1];
+      int size = args[2];
+
+      hexagon_user_dma_1d_sync(dst->data, src->data, size);
+
+      *rv = static_cast<int>(0);
+    });
+
 TVM_REGISTER_GLOBAL("device_api.hexagon.mem_copy").set_body([](TVMArgs args, TVMRetValue* rv) {
   void* dst = args[0];
   void* src = args[1];
diff --git a/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
new file mode 100644
index 000000000000..c9ff07c490c8
--- /dev/null
+++ b/tests/python/contrib/test_hexagon/test_parallel_hvx_load_vtcm.py
@@ -0,0 +1,537 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +""" Test different strategies for loading data into vtcm before running HVX workloads. """ + +import numpy as np +import tvm + +from tvm.script import tir as T +from numpy.random import default_rng + +TEST_OUTPUT_TEMPLATE = "Test with {} MB of data to load... \n -No VTCM: {} Gops \n -Basic VTCM: {} Gops \n -Vectorized: {} Gops\n -Vectorized and Parallelized: {} Gops\n -Preallocated and Vectorized: {} Gops\n -Preallocated, Vectorized, and Parallelized: {} Gops\n -Single DMA: {} Gops\n -Preloaded: {} Gops\n" + + +def apply_parallel_unroll_vectorize(sch, blocks, outer_split, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + vbo, vbi, vio, vii = sch.split(v, factors=[outer_split, None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + sch.parallel(vbo) + return sch + + +def apply_unroll_vectorize(sch, blocks, unroll_split, vector_split): + for block in blocks: + vb, vi = sch.get_loops(block) + v = sch.fuse(vb, vi) + _, vio, vii = sch.split(v, factors=[None, unroll_split, vector_split]) + sch.vectorize(vii) + sch.unroll(vio) + return sch + + +def apply_vrmpy_parallelization(sch): + block = sch.get_block("C") + b = sch.get_loops(block) + bo, _ = sch.split(b[0], factors=[4, None]) + sch.parallel(bo) + return sch + + +def apply_vtcm_cache_read_write(sch): + block = sch.get_block("C") + sch.cache_read(block, 0, "global.vtcm") + sch.cache_read(block, 1, "global.vtcm") + sch.cache_write(block, 0, "global.vtcm") + return sch + + +def vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128) + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128) + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[vn, T.ramp(0, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + T.reinterpret(B[vn, T.ramp(0, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preloaded_vrmpy(operations): + @T.prim_func + def operator(a: T.handle, b: T.handle, c: T.handle) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer( + a, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + B = T.match_buffer( + b, + [T.cast(operations, "int32") * 128], + dtype="uint8", + align=128, + mem_scope="global.vtcm", + ) + C = T.match_buffer( + c, [T.cast(operations, "int32") * 32], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret(A[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + 
T.reinterpret(B[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32"), + dtype="int32x32", + ) + + return operator + + +def preallocated_vrmpy(operations): + size = operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, b: T.handle, c: T.handle, a_v: T.handle, b_v: T.handle, c_v: T.handle + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + for n, i in T.grid(operations, 128): + with T.block("A_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + A_global_vtcm[vn * 128 + vi] = A[vn, vi] + for n, i in T.grid(operations, 128): + with T.block("B_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + B_global_vtcm[vn * 128 + vi] = B[vn, vi] + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + for n, i in T.grid(operations, 32): + with T.block("C_global.vtcm"): + vn, vi = T.axis.remap("SS", [n, i]) + C[vn, vi] = C_global_vtcm[vn * 32 + vi] + + return operator + + +def preallocated_single_dma_vrmpy(operations): + size = operations * 128 + out_size = operations * 32 + + @T.prim_func + def operator( + a: T.handle, + b: T.handle, + c: T.handle, + a_v: T.handle, + b_v: T.handle, + c_v: T.handle, + ) -> None: + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + A = T.match_buffer(a, [operations, 128], dtype="uint8", align=128, mem_scope="global") + B = T.match_buffer(b, [operations, 128], dtype="uint8", align=128, mem_scope="global") + C = T.match_buffer(c, [operations, 32], dtype="int32", align=128, mem_scope="global") + A_global_vtcm = T.match_buffer( + a_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + B_global_vtcm = T.match_buffer( + b_v, [size], dtype="uint8", align=128, mem_scope="global.vtcm" + ) + C_global_vtcm = T.match_buffer( + c_v, [out_size], dtype="int32", align=128, mem_scope="global.vtcm" + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + A_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + A.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + A.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + B_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + B_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + B.data, + T.tvm_stack_make_shape(size, 
dtype="handle"), + 0, + 1, + B.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + for n in T.grid(operations): + with T.block("C"): + vn = T.axis.remap("S", [n]) + C_global_vtcm[T.ramp(T.cast(vn, "int32") * 32, 1, 32)] = T.call_llvm_intrin( + T.llvm_lookup_intrinsic_id("llvm.hexagon.V6.vrmpyubv.128B"), + T.uint32(2), + T.reinterpret( + A_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + T.reinterpret( + B_global_vtcm[T.ramp(T.cast(vn, "int32") * 128, 1, 128)], dtype="int32x32" + ), + dtype="int32x32", + ) + T.evaluate( + T.tvm_call_packed( + "device_api.hexagon.mem_copy_DLTensor", + T.tvm_stack_make_array( + C.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C.dtype, + 0, + dtype="handle", + ), + T.tvm_stack_make_array( + C_global_vtcm.data, + T.tvm_stack_make_shape(size, dtype="handle"), + 0, + 1, + C_global_vtcm.dtype, + 0, + dtype="handle", + ), + T.cast(size, dtype="int"), + dtype="int32", + ) + ) + + return operator + + +def evaluate_result(operations, tag, time, result, expected_output): + transfer_mb = round(3 * operations * 128 / 1e6, 2) + gops = round(operations * 128 * 3 / time.mean / 1e9, 3) + mean_ms = round(time.mean * 1000, 6) + + print("\ntest_{}MB_{} took {} ms @ GOPS: {}".format(transfer_mb, tag, mean_ms, gops)) + tvm.testing.assert_allclose(result, expected_output) + + +def setup_and_run(hexagon_session, sch, a, b, c, operations, mem_scope="global"): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope=mem_scope) + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope=mem_scope) + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope=mem_scope) + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + +def setup_and_run_preallocated(hexagon_session, sch, a, b, c, operations): + target_hexagon = tvm.target.hexagon("v69") + func_tir = tvm.build( + sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon) + ) + module = hexagon_session.load_module(func_tir) + + a_vtcm = np.zeros((a.size), dtype="uint8") + b_vtcm = np.zeros((b.size), dtype="uint8") + c_vtcm = np.zeros((c.size), dtype="int32") + + a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global") + b_hexagon = tvm.runtime.ndarray.array(b, device=hexagon_session.device, mem_scope="global") + c_hexagon = tvm.runtime.ndarray.array(c, device=hexagon_session.device, mem_scope="global") + a_vtcm_hexagon = tvm.runtime.ndarray.array( + a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + b_vtcm_hexagon = tvm.runtime.ndarray.array( + b_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + c_vtcm_hexagon = tvm.runtime.ndarray.array( + c_vtcm, device=hexagon_session.device, mem_scope="global.vtcm" + ) + + timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10) + time = timer(a_hexagon, b_hexagon, c_hexagon, a_vtcm_hexagon, b_vtcm_hexagon, c_vtcm_hexagon) + gops = round(operations * 128 * 3 / time.mean / 1e9, 4) + return gops, c_hexagon.asnumpy() + + 
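For reference, the expected_output fixture below models the vrmpy semantics (each int32 output lane accumulates four uint8*uint8 products) with explicit Python loops. A minimal vectorized NumPy sketch of the same reference, assuming (N, 128) uint8 inputs (vrmpy_reference is an illustrative helper, not part of this patch):

import numpy as np


def vrmpy_reference(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Illustrative NumPy model of llvm.hexagon.V6.vrmpyubv.128B on (N, 128) uint8 inputs."""
    # Group the 128 bytes of each row into 32 output lanes of 4 bytes each.
    prod = a.reshape(-1, 32, 4).astype(np.uint32) * b.reshape(-1, 32, 4).astype(np.uint32)
    # Each int32 output lane accumulates its four u8*u8 products.
    return prod.sum(axis=2).astype(np.int32)  # shape (N, 32), comparable to expected_output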
+@tvm.testing.fixture +def input_a(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_b(operations): + return default_rng().integers(0, 16, (operations, 128), dtype="uint8") + + +@tvm.testing.fixture +def input_c(operations): + return np.zeros((operations, 32), dtype="int32") + + +@tvm.testing.fixture +def expected_output(operations, input_a, input_b, input_c): + expected_output = np.zeros(input_c.shape, dtype="int32") + for n in range(operations): + for i in range(32): + for r in range(4): + expected_output[n, i] = expected_output[n, i] + np.uint32( + input_a[n, i * 4 + r] + ) * np.uint32(input_b[n, i * 4 + r]) + return expected_output + + +class TestMatMulVec: + + operations = tvm.testing.parameter( + 1024, + 2048, + 4096, + 5 * 2048, # 3.93MB of total transfer + # 16384, #Only works on 8Gen1 HDK's + # 5 * 4096, # 7.86MB of total transfer. Only works on 8Gen1 HDK's + ) + + # Experimentally best configurations for the memcopy + outer_split = tvm.testing.parameter(4) + unroll_split = tvm.testing.parameter(8) + vector_split = tvm.testing.parameter(64) + c_vector_split = tvm.testing.parameter(16) + c_vector_split_unallocated = tvm.testing.parameter(8) + + @tvm.testing.requires_hexagon + def test_loading_vtcm_for_vrmpy( + self, + hexagon_session, + operations, + input_a, + input_b, + input_c, + expected_output, + outer_split, + unroll_split, + vector_split, + c_vector_split, + c_vector_split_unallocated, + ): + + # Run parallel vrmpy without loading to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + base_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with basic memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + basic_load_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split_unallocated + ) + vectorized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with vectorized and parallelized memory loads to VTCM. + sch = tvm.tir.Schedule(vrmpy(operations)) + sch = apply_vtcm_cache_read_write(sch) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("C_global.vtcm")], + outer_split, + unroll_split, + c_vector_split_unallocated, + ) + vectorized_parallelized_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated and vectorized memory loads to VTCM. 
+ sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + unroll_split, + vector_split, + ) + sch = apply_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], unroll_split, c_vector_split + ) + preallocated_vectorized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated, vectorized, and parallelized memory loads to VTCM. + sch = tvm.tir.Schedule(preallocated_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + sch = apply_parallel_unroll_vectorize( + sch, + [sch.get_block("A_global.vtcm"), sch.get_block("B_global.vtcm")], + outer_split, + unroll_split, + vector_split, + ) + sch = apply_parallel_unroll_vectorize( + sch, [sch.get_block("C_global.vtcm")], outer_split, unroll_split, c_vector_split + ) + preallocated_vectorized_parallelized_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with preallocated single dma memory load to VTCM. + sch = tvm.tir.Schedule(preallocated_single_dma_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + single_dma_runtime, result = setup_and_run_preallocated( + hexagon_session, sch, input_a, input_b, input_c, operations + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + # Run parallel vrmpy with data preloaded in VTCM. + sch = tvm.tir.Schedule(preloaded_vrmpy(operations)) + sch = apply_vrmpy_parallelization(sch) + input_a = input_a.reshape(operations * 128) + input_b = input_b.reshape(operations * 128) + input_c = input_c.reshape(operations * 32) + preloaded_runtime, result = setup_and_run( + hexagon_session, sch, input_a, input_b, input_c, operations, "global.vtcm" + ) + result = result.reshape((operations, 32)) + tvm.testing.assert_allclose(result, expected_output) + + transfer_mb = round(3 * operations * 128 / 1e6, 2) + print( + TEST_OUTPUT_TEMPLATE.format( + transfer_mb, + base_runtime, + basic_load_runtime, + vectorized_runtime, + vectorized_parallelized_runtime, + preallocated_vectorized_runtime, + preallocated_vectorized_parallelized_runtime, + single_dma_runtime, + preloaded_runtime, + ) + ) diff --git a/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py new file mode 100644 index 000000000000..6db8b9101997 --- /dev/null +++ b/tests/python/contrib/test_hexagon/test_vtcm_bandwidth.py @@ -0,0 +1,169 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. 
See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Test theoretical bandwidth for data transfers to VTCM for different strategies."""
+
+import numpy as np
+import tvm
+
+from tvm.script import tir as T
+from numpy.random import default_rng
+
+MB = 1024**2
+KB = 1024
+TEST_OUTPUT_TEMPLATE = "Test bandwidth with buffer size {}MB... \n -Base: {} GBps \n -Vectorized: {} GBps\n -Vectorized and Parallelized: {} GBps\n -Single DMA Copy: {} GBps\n"
+
+
+def memcopy_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        for ax0 in T.serial(size):
+            with T.block("A_global.vtcm"):
+                v0 = T.axis.spatial(size, ax0)
+                T.reads(A[v0])
+                T.writes(A_global_vtcm[v0])
+                A_global_vtcm[v0] = A[v0]
+
+    return operator
+
+
+def single_dma_operator(size):
+    @T.prim_func
+    def operator(a: T.handle, a_v: T.handle) -> None:
+        A = T.match_buffer(a, size, dtype="int8", align=128, scope="global")
+        A_global_vtcm = T.match_buffer(a_v, size, dtype="int8", align=128, scope="global.vtcm")
+        T.evaluate(
+            T.tvm_call_packed(
+                "device_api.hexagon.mem_copy_DLTensor",
+                T.tvm_stack_make_array(
+                    A_global_vtcm.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A_global_vtcm.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.tvm_stack_make_array(
+                    A.data,
+                    T.tvm_stack_make_shape(size, dtype="handle"),
+                    0,
+                    1,
+                    A.dtype,
+                    0,
+                    dtype="handle",
+                ),
+                T.cast(size, dtype="int"),
+                dtype="int32",
+            )
+        )
+
+    return operator
+
+
+def evaluate(hexagon_session, sch, size):
+    a_shape = size
+
+    target_hexagon = tvm.target.hexagon("v69")
+    func_tir = tvm.build(
+        sch.mod["main"], target=tvm.target.Target(target_hexagon, host=target_hexagon)
+    )
+    module = hexagon_session.load_module(func_tir)
+
+    rng = default_rng()
+    a = rng.integers(-128, 127, a_shape, dtype="int8")
+    a_vtcm = np.zeros(a_shape, dtype="int8")
+
+    a_hexagon = tvm.runtime.ndarray.array(a, device=hexagon_session.device, mem_scope="global")
+    a_vtcm_hexagon = tvm.runtime.ndarray.array(
+        a_vtcm, device=hexagon_session.device, mem_scope="global.vtcm"
+    )
+
+    timer = module.time_evaluator("__tvm_main__", hexagon_session.device, number=100, repeat=10)
+    runtime = timer(a_hexagon, a_vtcm_hexagon)
+
+    gbps = round((size / 2**30) / runtime.mean, 4)
+    tvm.testing.assert_allclose(a_vtcm_hexagon.asnumpy(), a)
+
+    return gbps
+
+
+class TestVTCMBandwidth:
+
+    size = tvm.testing.parameter(
+        10 * KB,
+        20 * KB,
+        40 * KB,
+        80 * KB,
+        160 * KB,
+        320 * KB,
+        640 * KB,
+        MB,
+        2 * MB,
+        3 * MB,
+        4 * MB,
+        # 8 * MB,  # Only works on 8gen1 HDKs
+    )
+
+    outer_split = tvm.testing.parameter(4)
+    unroll_split = tvm.testing.parameter(2)
+    vector_split = tvm.testing.parameter(128)
+
+    @tvm.testing.requires_hexagon
+    def test_bandwidth(self, hexagon_session, size, outer_split, unroll_split, vector_split):
+
+        # Run the base memcopy operator.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        base_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbi_a, vio_a, vii_a = sch.split(vb[0], factors=[None, unroll_split, vector_split])
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        vectorize_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run with some basic unroll and vectorize scheduling and parallelization.
+        sch = tvm.tir.Schedule(memcopy_operator(size))
+        vtcm_block_a = sch.get_block("A_global.vtcm")
+        vb = sch.get_loops(vtcm_block_a)
+        vbo_a, vbi_a, vio_a, vii_a = sch.split(
+            vb[0], factors=[outer_split, None, unroll_split, vector_split]
+        )
+        sch.unroll(vio_a)
+        sch.vectorize(vii_a)
+        sch.parallel(vbo_a)
+        parallel_gbps = evaluate(hexagon_session, sch, size)
+
+        # Run using a single DMA copy to transfer the data.
+        sch = tvm.tir.Schedule(single_dma_operator(size))
+        single_dma_gbps = evaluate(hexagon_session, sch, size)
+
+        mbs = round(size / MB, 2)
+        print(
+            TEST_OUTPUT_TEMPLATE.format(
+                mbs, base_gbps, vectorize_gbps, parallel_gbps, single_dma_gbps
+            )
+        )
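For orientation, a worked example of the GBps figure that evaluate() reports, using a hypothetical mean runtime rather than a measured one:

MB = 1024**2                       # same constant as defined in this test file
size = 4 * MB                      # bytes copied into VTCM
mean_runtime_s = 1e-3              # hypothetical time_evaluator mean, in seconds
gbps = round((size / 2**30) / mean_runtime_s, 4)
print(gbps)                        # about 3.9 GBps for these example values

The formula counts each byte of the buffer once, matching how evaluate() computes gbps from the buffer size and the mean runtime.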