diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 9607b07f6..71c61bd5a 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1721,6 +1721,22 @@ def __init__(self):
                 use_chess=False,
             )
         )
+        self.register(
+            Matmul(
+                512,
+                512,
+                256,
+                "i32",
+                "i32",
+                name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling",
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu4"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=8",
+                ],
+            )
+        )
 
         for target in ["npu1_4col", "npu4"]:
             self.register(
@@ -1786,6 +1802,24 @@ def __init__(self):
                 additional_labels=["I8UKernel"],
             )
         )
+        self.register(
+            Matmul(
+                64,
+                64,
+                64,
+                "bf16",
+                "f32",
+                name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel",
+                use_ukernel=True,
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu4"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=8",
+                ],
+                use_chess=True,
+            )
+        )
 
         # Matmul test on 2(rows)x2(cols) cores
         self.register(
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
index 48dc97d98..5031d688e 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp
@@ -18,8 +18,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeConsumers(
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
     if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
-        dyn_cast_if_present<LogicalObjFifoOpInterface>(
-            copyOp.getSource().getDefiningOp()) == op) {
+        copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
+                      copyOp.getSource().getDefiningOp()) == op) {
       copyLikOps.push_back(copyOp);
     }
   }
@@ -31,8 +31,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeProducers(
   SmallVector<mlir::CopyOpInterface> copyLikOps;
   for (Operation *userOp : op->getUsers()) {
     if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
-        dyn_cast_if_present<LogicalObjFifoOpInterface>(
-            copyOp.getTarget().getDefiningOp()) == op) {
+        copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
+                      copyOp.getTarget().getDefiningOp()) == op) {
       copyLikOps.push_back(copyOp);
     }
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
index 38c364c6d..fe6568d8d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc
@@ -289,6 +289,7 @@ extern "C" {
   }
 
 matmul_combos(matmul_vectorized_c_func, 16, 8, 32)
+matmul_combos(matmul_vectorized_c_func, 16, 8, 64)
 matmul_combos(matmul_vectorized_c_func, 16, 16, 32)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 32)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 64)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
index 89bc0fd05..1cf40aba4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp
@@ -121,6 +121,46 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
   return splitStride;
 }
 
+/// Given a list of Copy Ops, fetch the total no. of unique consumer/producer
+/// LogicalObjectFifos. This would helps us figure out the split factor for
+/// LogicalObjectFifos.
+/// And example case which necessitated this feature :-
+///      %lhs = LOF_on_L2
+///      %a = LOF_on_L1_0
+///      %b = LOF_on_L1_1
+///      %c = LOF_on_L1_2
+///      DMA(%a, %lhs)
+///      DMA(%b, %lhs)
+///      DMA(%c, %lhs)
+///      DMA(%b, %lhs)
+///      DMA(%c, %lhs)
+///
+///    In the above snippet, assume we want to split %lhs, it has 5 DMA ops.
+///    But only 3 of them are unique : (%lhs -> %a), (%lhs -> %b) (%lhs -> %c).
+///    Therefore this function is going to return 3. Which the caller is going
+///    to use as split factor.
+template <CopyOpOperateOn OperateOn>
+static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
+    SmallVector<CopyOpInterface> copyLikeOps) {
+  DenseSet<Operation *> uniqueLof;
+  for (CopyOpInterface copyOp : copyLikeOps) {
+    AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
+    if constexpr (OperateOn == CopyOpOperateOn::Target) {
+      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+          copyOp.getTarget().getDefiningOp());
+    } else {
+      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
+          copyOp.getSource().getDefiningOp());
+    }
+    if (!lof) {
+      return copyOp.emitOpError()
+             << "could not retrieve source/target objectFifo";
+    }
+    uniqueLof.insert(lof);
+  }
+  return uniqueLof.size();
+}
+
 /// Find the logical objectFifo and DMA source/target splitting dimensions for
 /// each DMA and objectFifo pair.
 ///
@@ -138,6 +178,9 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
 /// that has product size larger than the other side's product size after
 /// splitting because that's the number of elements that should be
 /// produced/consumed on the respective sides before splitting.
+/// Towards the end fetch the number of unique producers (or consumers) for the
+/// objectFifo which will be split. This would form the split factor which would
+/// be capped by the total no. of columns OR std::gcd of source/target size.
 LogicalResult collectSplittingDims(
     const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
     DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
@@ -218,6 +261,19 @@ LogicalResult collectSplittingDims(
       // Calculate the new source stride to be used for splitting the DMA.
       int64_t newSourceStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
+      FailureOr<int64_t> maybeNumUniqueConsumers =
+          fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Target>(
+              objFifo.getCopyLikeConsumers());
+      if (failed(maybeNumUniqueConsumers)) {
+        objFifo.emitOpError() << "could not retrieve the total number of "
+                                 "unique consumer objFifos";
+      }
+      int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols);
+      int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
+      int64_t targetSize = (*targetSizes)[targetSplitDim];
+      if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
+        splitFactor = std::gcd(sourceSize, targetSize);
+      }
       LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs()
@@ -225,10 +281,11 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs()
                  << "objFifoSplitDim: " << objFifoSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
-                                1, numCols};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
+                                1, splitFactor};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     } else if (dmaOp.getSourceObjectFifo() == objFifo) {
       // Find outermost dimension in the access pattern that has stride ==
       // sizeAfterSplit and size != 1.
@@ -274,6 +331,19 @@ LogicalResult collectSplittingDims(
       // Calculate the new target stride to be used for splitting the DMA.
       int64_t newTargetStride =
           splitStride != 1 ? splitDimSize / splitStride : 1;
+      FailureOr<int64_t> maybeNumUniqueProducers =
+          fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Source>(
+              objFifo.getCopyLikeProducers());
+      if (failed(maybeNumUniqueProducers)) {
+        objFifo.emitOpError() << "could not retrieve the total number of "
+                                 "unique producer objFifos";
+      }
+      int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols);
+      int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
+      int64_t targetSize = (*targetSizes)[targetSplitDim];
+      if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
+        splitFactor = std::gcd(sourceSize, targetSize);
+      }
       LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs()
@@ -281,10 +351,11 @@ LogicalResult collectSplittingDims(
       LLVM_DEBUG(llvm::dbgs()
                  << "objFifoSplitDim: " << objFifoSplitDim << "\n");
       LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
-      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
+      LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
       dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
-                                newTargetStride, numCols};
-      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
+                                newTargetStride, splitFactor};
+      objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
+                                      splitStride};
     }
   }
   return success();
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
index eb92185d5..afde8fe8d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp
@@ -755,7 +755,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
 LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
                                    AMDAIE::DoublyStridedOpInterface op,
                                    size_t sourceSplitDim, size_t targetSplitDim,
-                                   std::optional<size_t> maybeSplitFactor,
+                                   int64_t splitFactor,
                                    int64_t sourceSplitStride,
                                    int64_t targetSplitStride) {
   if (!op->use_empty())
@@ -800,15 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
   }
   int64_t sourceSize = maybeSourceSize.value();
   int64_t targetSize = maybeTargetSize.value();
-  int64_t splitFactor = maybeSplitFactor.has_value()
-                            ? maybeSplitFactor.value()
-                            : std::gcd(sourceSize, targetSize);
-  if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
-    int64_t newSplitFactor = std::gcd(sourceSize, targetSize);
-    LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from "
-                            << splitFactor << " to " << newSplitFactor);
-    splitFactor = newSplitFactor;
-  }
 
   int64_t newSourceSize = sourceSize / splitFactor;
   int64_t newTargetSize = targetSize / splitFactor;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
index fee4e510c..7a428df66 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h
@@ -38,13 +38,14 @@ LogicalResult splitLogicalObjectFifo(
     int64_t splitStride = 1);
 
 /// Split doubly strided operations on a source and target split dimension with
-/// the provided split factor. If no split factor is provided, the doubly
-/// strided operation will be split on the size of the dimension being split.
-LogicalResult splitDoublyStridedOp(
-    IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
-    size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
-    std::optional<size_t> splitFactor = std::nullopt,
-    int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
+/// the provided split factor.
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
+                                   AMDAIE::DoublyStridedOpInterface op,
+                                   size_t sourceSplitDim = 0,
+                                   size_t targetSplitDim = 0,
+                                   int64_t splitFactor = 1,
+                                   int64_t sourceSplitStride = 1,
+                                   int64_t targetSplitStride = 1);
 
 }  // namespace mlir::iree_compiler::AMDAIE
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
index f930ad5d0..4d377ebba 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir
@@ -42,6 +42,7 @@ module {
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
@@ -49,8 +50,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg1)
       %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -93,6 +95,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>
     %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
@@ -100,8 +103,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg2)
       %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x4x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x2x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32>
@@ -151,6 +155,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
   func.func @split_L2_output(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>
     %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
@@ -158,11 +165,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
       %2 = affine.apply #map(%arg2)
       %3 = affine.apply #map(%arg1)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
-      %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+      %8 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      %12 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<128x128xi32>>, !amdaie.logicalobjectfifo<memref<2x2x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
@@ -175,11 +185,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // Test of splitting matmul lhs input objectFifo and dma operations on 4x2 AIE array.
 // L2 buffer size `[4, 1, 32, 32]` is expected to be split into two `[2, 1, 32, 32]` buffers.
 
-// CHECK-label: func.func @split_L2_input_lhs_on_4x2_array
-//       CHECK:   %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} :
-//  CHECK-SAME:         memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
-//       CHECK:   %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} :
-//  CHECK-SAME:         memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-LABEL: func.func @split_L2_input_lhs_on_4x2_array
+//   CHECK-DAG:   %[[ALLOC_0:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+//   CHECK-DAG:   %[[ALLOC_1:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32>
+//       CHECK:   %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]]
+//       CHECK:   %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]]
 //       CHECK:   scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (4, 2)
 //       CHECK:       %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                   %[[OBJ_L2_A0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]
@@ -199,22 +209,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 //       CHECK:       %[[DMA_L2_TO_L1_A3:.*]] = amdaie.dma_cpy_nd(
 //  CHECK-SAME:                                   {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]
 //  CHECK-SAME:                                   %[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]
-//       CHECK:   memref.dealloc %[[ALLOC_A0]] : memref<2x1x32x32xi32, 1 : i32>
-//       CHECK:   memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32>
 #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}  {
   func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo<memref<128x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
     scf.forall (%arg1, %arg2) in (4, 2) {
       %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [128, 32] [128, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<128x128xi32>>)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %9 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %10 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %11 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -234,14 +248,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1])
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 + 4)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -249,8 +265,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     scf.forall (%arg1, %arg2) in (2, 4) {
       %3 = affine.apply #map(%arg2)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -265,9 +282,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
 // CHECK:       }
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
@@ -276,14 +294,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
     scf.forall (%arg1, %arg2) in (2, 4) {
       %3 = affine.apply #map(%arg2)
       %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %6 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
@@ -327,15 +347,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1])
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 2 + 1)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -344,8 +366,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -363,9 +386,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 // CHECK-DAG:   %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK-DAG:   %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) {
-// CHECK:         %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK:         %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1])
 // CHECK:       }
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
 // CHECK-DAG:   amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1])
@@ -375,6 +399,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>
@@ -382,8 +407,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<8x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32>
@@ -396,18 +422,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // CHECK-LABEL: func.func @change_split_factor_with_gcd_for_producer
 // CHECK-DAG:   %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
-// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<256x128xi32>>)
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) {
-// CHECK:         %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+// CHECK:         %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>)
 // CHECK:       }
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
@@ -415,6 +438,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @change_split_factor_with_gcd_for_producer(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
@@ -423,8 +447,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
     memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32>
@@ -436,25 +461,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
 
 // CHECK-LABEL: @change_split_factor_with_gcd_for_consumer
 // CHECK-DAG:   %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
-// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
-// CHECK-DAG:   %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
+// CHECK-DAG:   %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>
 // CHECK:       scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) {
-// CHECK:         %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+// CHECK:         %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK:         %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+// CHECK-DAG:     amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<2x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
 // CHECK:       }
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
-// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<1x1x32x32xi32, 1 : i32>>)
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1])
+// CHECK-DAG:   amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1])
 #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}>
 #map = affine_map<(d0) -> (d0 * 2)>
 #map1 = affine_map<(d0) -> (d0 * 2 + 1)>
 module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
   func.func @change_split_factor_with_gcd_for_consumer(%arg0: memref<256x128xi32>) {
     %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32>
     %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32>
     %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo<memref<256x128xi32>>
     %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>
@@ -462,8 +485,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
       %3 = affine.apply #map(%arg2)
       %4 = affine.apply #map1(%arg2)
       %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
-      %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
-      %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>
+      %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
+      %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x4x8x4x8xi32, 2 : i32>>)
     } {mapping = [#gpu.block<y>, #gpu.block<x>]}
     %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo<memref<256x128xi32>>, !amdaie.logicalobjectfifo<memref<4x1x32x32xi32, 1 : i32>>)
     memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32>
@@ -471,3 +495,73 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
     return
   }
 }
+
+// -----
+
+// This test demonstrates the case when the factor is not simply decided by the number of
+// columns but the number of unique producers/consumers. In the example, although we are
+// using 8 AIE columns, L2 LHS and output buffers are not split because there's only one
+// producer/consumer, while L2 RHS buffer is split into 2 because there are 2 producers/consumers.
+//
+// CHECK-LABEL: @pack_peel_4_level_4x8_Strix
+// CHECK-DAG:     %[[C2:.*]] = arith.constant 2 : index
+// CHECK-DAG:     %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:     %[[C0:.*]] = arith.constant 0 : index
+// CHECK:         scf.forall (%{{.*}}, %{{.*}}) in (2, 8) {
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %{{.*}}[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 0] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 128] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x64x32xi32, 1 : i32>>,
+// CHECK:             scf.forall (%{{.*}}, %{{.*}}) in (2, 2) {
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) :
+// CHECK:                 amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) :
+// CHECK:             }
+// CHECK:             amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) :
+// CHECK:          }
+#executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} {
+  func.func @pack_peel_4_level_4x8_Strix(%lhs: memref<512x512xi32>, %rhs: memref<512x4096xi32>, %out: memref<512x4096xi32>) {
+    %c2 = arith.constant 2 : index
+    %c1 = arith.constant 1 : index
+    %c0 = arith.constant 0 : index
+    %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32>
+    %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32>
+    %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32>
+    %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32>
+    %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32>
+    %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32>
+    %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>
+    %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>
+    %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>
+    %lof_0_0 = amdaie.logicalobjectfifo.from_memref %lhs, {} : memref<512x512xi32> -> !amdaie.logicalobjectfifo<memref<512x512xi32>>
+    %lof_1_0 = amdaie.logicalobjectfifo.from_memref %rhs, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    %lof_2_0 = amdaie.logicalobjectfifo.from_memref %out, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo<memref<512x4096xi32>>
+    scf.forall (%arg0, %arg1) in (2, 8) {
+      %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x512xi32>>)
+      %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<512x4096xi32>>)
+      scf.forall (%arg2, %arg3) in (2, 2) {
+        %of0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg2)
+        %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2)
+        %tile_1_2 = amdaie.tile(%c1, %c2)
+        %tile_0_2 = amdaie.tile(%c0, %c2)
+        %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>
+        %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>
+        %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x8x4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<16x8x64x32xi32, 1 : i32>>)
+        %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo<memref<1x1x8x8x4x8xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<8x8x32x64xi32, 1 : i32>>)
+        %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>
+        %6 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<1x1x8x8x4x4xi32, 2 : i32>>)
+      }
+      %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo<memref<512x4096xi32>>, !amdaie.logicalobjectfifo<memref<16x8x32x32xi32, 1 : i32>>)
+    }
+    memref.dealloc %alloc_4 : memref<8x8x32x64xi32, 1 : i32>
+    memref.dealloc %alloc_3 : memref<16x8x64x32xi32, 1 : i32>
+    memref.dealloc %alloc_2 : memref<16x8x32x32xi32, 1 : i32>
+    memref.dealloc %alloc_1 : memref<1x1x8x8x4x8xi32, 2 : i32>
+    memref.dealloc %alloc_0 : memref<1x1x8x8x8x4xi32, 2 : i32>
+    memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32>
+    return
+  }
+}