diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 9607b07f6..71c61bd5a 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -1721,6 +1721,22 @@ def __init__(self): use_chess=False, ) ) + self.register( + Matmul( + 512, + 512, + 256, + "i32", + "i32", + name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling", + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + ], + ) + ) for target in ["npu1_4col", "npu4"]: self.register( @@ -1786,6 +1802,24 @@ def __init__(self): additional_labels=["I8UKernel"], ) ) + self.register( + Matmul( + 64, + 64, + 64, + "bf16", + "f32", + name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel", + use_ukernel=True, + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + ], + use_chess=True, + ) + ) # Matmul test on 2(rows)x2(cols) cores self.register( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp index 48dc97d98..5031d688e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.cpp @@ -18,8 +18,8 @@ SmallVector getCopyLikeConsumers( SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { if (auto copyOp = dyn_cast(userOp); - dyn_cast_if_present( - copyOp.getSource().getDefiningOp()) == op) { + copyOp && dyn_cast_if_present( + copyOp.getSource().getDefiningOp()) == op) { copyLikOps.push_back(copyOp); } } @@ -31,8 +31,8 @@ SmallVector getCopyLikeProducers( SmallVector copyLikOps; for (Operation *userOp : op->getUsers()) { if (auto copyOp = dyn_cast(userOp); - 
dyn_cast_if_present( - copyOp.getTarget().getDefiningOp()) == op) { + copyOp && dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()) == op) { copyLikOps.push_back(copyOp); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc index 38c364c6d..fe6568d8d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu4.cc @@ -289,6 +289,7 @@ extern "C" { } matmul_combos(matmul_vectorized_c_func, 16, 8, 32) +matmul_combos(matmul_vectorized_c_func, 16, 8, 64) matmul_combos(matmul_vectorized_c_func, 16, 16, 32) matmul_combos(matmul_vectorized_c_func, 32, 32, 32) matmul_combos(matmul_vectorized_c_func, 32, 32, 64) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp index 89bc0fd05..1cf40aba4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIESplitLogicalObjFifos.cpp @@ -121,6 +121,46 @@ FailureOr getSplitStride(ArrayRef dmaOps, return splitStride; } +/// Given a list of Copy Ops, fetch the total no. of unique consumer/producer +/// LogicalObjectFifos. This helps us figure out the split factor for +/// LogicalObjectFifos. +/// An example case which necessitated this feature: +/// %lhs = LOF_on_L2 +/// %a = LOF_on_L1_0 +/// %b = LOF_on_L1_1 +/// %c = LOF_on_L1_2 +/// DMA(%a, %lhs) +/// DMA(%b, %lhs) +/// DMA(%c, %lhs) +/// DMA(%b, %lhs) +/// DMA(%c, %lhs) +/// +/// In the above snippet, assume we want to split %lhs, it has 5 DMA ops. +/// But only 3 of them are unique : (%lhs -> %a), (%lhs -> %b), (%lhs -> %c). +/// Therefore this function is going to return 3, which the caller is going +/// to use as split factor. 
+template +static FailureOr fetchTotalUniqueLogicalObjFifoUsers( + SmallVector copyLikeOps) { + DenseSet uniqueLof; + for (CopyOpInterface copyOp : copyLikeOps) { + AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr; + if constexpr (OperateOn == CopyOpOperateOn::Target) { + lof = dyn_cast_if_present( + copyOp.getTarget().getDefiningOp()); + } else { + lof = dyn_cast_if_present( + copyOp.getSource().getDefiningOp()); + } + if (!lof) { + return copyOp.emitOpError() + << "could not retrieve source/target objectFifo"; + } + uniqueLof.insert(lof); + } + return uniqueLof.size(); +} + /// Find the logical objectFifo and DMA source/target splitting dimensions for /// each DMA and objectFifo pair. /// @@ -138,6 +178,9 @@ FailureOr getSplitStride(ArrayRef dmaOps, /// that has product size larger than the other side's product size after /// splitting because that's the number of elements that should be /// produced/consumed on the respective sides before splitting. +/// Towards the end fetch the number of unique producers (or consumers) for the +/// objectFifo which will be split. This would form the split factor which would +/// be capped by the total no. of columns OR std::gcd of source/target size. LogicalResult collectSplittingDims( const SmallVector &dmaObjFifoPairs, DenseMap &dmaSplitInfoMap, @@ -218,6 +261,19 @@ LogicalResult collectSplittingDims( // Calculate the new source stride to be used for splitting the DMA. int64_t newSourceStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; + FailureOr maybeNumUniqueConsumers = + fetchTotalUniqueLogicalObjFifoUsers( + objFifo.getCopyLikeConsumers()); + if (failed(maybeNumUniqueConsumers)) { + return objFifo.emitOpError() << "could not retrieve the total " + "number of unique consumer objFifos"; + } + int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols); + int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; + int64_t targetSize = (*targetSizes)[targetSplitDim]; + if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { + splitFactor = std::gcd(sourceSize, targetSize); + } LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() @@ -225,10 +281,11 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); - LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim, - 1, numCols}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; + 1, splitFactor}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + splitStride}; } else if (dmaOp.getSourceObjectFifo() == objFifo) { // Find outermost dimension in the access pattern that has stride == // sizeAfterSplit and size != 1. @@ -274,6 +331,19 @@ LogicalResult collectSplittingDims( // Calculate the new target stride to be used for splitting the DMA. int64_t newTargetStride = splitStride != 1 ? 
splitDimSize / splitStride : 1; + FailureOr maybeNumUniqueProducers = + fetchTotalUniqueLogicalObjFifoUsers( + objFifo.getCopyLikeProducers()); + if (failed(maybeNumUniqueProducers)) { + return objFifo.emitOpError() << "could not retrieve the total " + "number of unique producer objFifos"; + } + int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols); + int64_t sourceSize = (*sourceSizes)[sourceSplitDim]; + int64_t targetSize = (*targetSizes)[targetSplitDim]; + if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { + splitFactor = std::gcd(sourceSize, targetSize); + } LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() @@ -281,10 +351,11 @@ LogicalResult collectSplittingDims( LLVM_DEBUG(llvm::dbgs() << "objFifoSplitDim: " << objFifoSplitDim << "\n"); LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n"); - LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n"); + LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n"); dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim, - newTargetStride, numCols}; - objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride}; + newTargetStride, splitFactor}; + objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor, + splitStride}; } } return success(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp index eb92185d5..afde8fe8d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.cpp @@ -755,7 +755,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter, LogicalResult splitDoublyStridedOp(IRRewriter 
&rewriter, AMDAIE::DoublyStridedOpInterface op, size_t sourceSplitDim, size_t targetSplitDim, - std::optional maybeSplitFactor, + int64_t splitFactor, int64_t sourceSplitStride, int64_t targetSplitStride) { if (!op->use_empty()) @@ -800,15 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, } int64_t sourceSize = maybeSourceSize.value(); int64_t targetSize = maybeTargetSize.value(); - int64_t splitFactor = maybeSplitFactor.has_value() - ? maybeSplitFactor.value() - : std::gcd(sourceSize, targetSize); - if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) { - int64_t newSplitFactor = std::gcd(sourceSize, targetSize); - LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from " - << splitFactor << " to " << newSplitFactor); - splitFactor = newSplitFactor; - } int64_t newSourceSize = sourceSize / splitFactor; int64_t newTargetSize = targetSize / splitFactor; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h index fee4e510c..7a428df66 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIELogicalObjFifoSplittingUtils.h @@ -38,13 +38,14 @@ LogicalResult splitLogicalObjectFifo( int64_t splitStride = 1); /// Split doubly strided operations on a source and target split dimension with -/// the provided split factor. If no split factor is provided, the doubly -/// strided operation will be split on the size of the dimension being split. -LogicalResult splitDoublyStridedOp( - IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op, - size_t sourceSplitDim = 0, size_t targetSplitDim = 0, - std::optional splitFactor = std::nullopt, - int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1); +/// the provided split factor. 
+LogicalResult splitDoublyStridedOp(IRRewriter &rewriter, + AMDAIE::DoublyStridedOpInterface op, + size_t sourceSplitDim = 0, + size_t targetSplitDim = 0, + int64_t splitFactor = 1, + int64_t sourceSplitStride = 1, + int64_t targetSplitStride = 1); } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir index f930ad5d0..4d377ebba 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/split_logicalobjfifos.mlir @@ -42,6 +42,7 @@ module { module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -49,8 +50,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg1) %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %0[%2, 0] [64, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] 
[1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<2x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -93,6 +95,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_rhs(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x4x8x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<1x2x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> @@ -100,8 +103,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg2) %3 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 32, 2, 32] [2048, 32, 1024, 1], %1[0, %2] [32, 64] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x4x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 4, 8, 8, 4] [1024, 1024, 32, 4, 128, 1], %0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<1x2x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x8x4x8x4xi32, 2 : i32> @@ -151,6 +155,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_output(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<2x2x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<2x2x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> @@ -158,11 +165,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %2 = affine.apply #map(%arg2) %3 = affine.apply #map(%arg1) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = 
amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %9 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %8 = amdaie.dma_cpy_nd(%0[0, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%0[0, 1, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%0[1, 0, 0, 0] [1, 1, 32, 32] [2048, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%0[1, 1, 0, 0] [1, 
1, 32, 32] [2048, 1024, 32, 1], %7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %12 = amdaie.dma_cpy_nd(%1[%3, %2] [64, 64] [128, 1], %0[0, 0, 0, 0] [2, 32, 2, 32] [2048, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<2x2x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> @@ -175,11 +185,11 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // Test of splitting matmul lhs input objectFifo and dma operations on 4x2 AIE array. // L2 buffer size `[4, 1, 32, 32]` is expected to be split into two `[2, 1, 32, 32]` buffers. -// CHECK-label: func.func @split_L2_input_lhs_on_4x2_array -// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A0]], {} : -// CHECK-SAME: memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_A1]], {} : -// CHECK-SAME: memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-LABEL: func.func @split_L2_input_lhs_on_4x2_array +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2x1x32x32xi32, 1 : i32> +// CHECK: %[[OBJ_L2_A0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]] +// CHECK: %[[OBJ_L2_A1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (4, 2) // CHECK: %[[DMA_L3_TO_L2_A0:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: %[[OBJ_L2_A0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1] @@ -199,22 +209,26 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: %[[DMA_L2_TO_L1_A3:.*]] = amdaie.dma_cpy_nd( // CHECK-SAME: {{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1] // CHECK-SAME: 
%[[OBJ_L2_A1]][1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1] -// CHECK: memref.dealloc %[[ALLOC_A0]] : memref<2x1x32x32xi32, 1 : i32> -// CHECK: memref.dealloc %[[ALLOC_A1]] : memref<2x1x32x32xi32, 1 : i32> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { func.func @split_L2_input_lhs_on_4x2_array(%arg0: memref<128x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_3 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<128x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg1, %arg2) in (4, 2) { %3 = amdaie.dma_cpy_nd(%1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %0[0, 0] [128, 32] [128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) - %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %8 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[0, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[1, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %10 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %11 = amdaie.dma_cpy_nd(%7[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -234,14 +248,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1]) // CHECK: scf.forall (%[[IV0:.*]], 
%[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 + 4)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_producer_with_loop_dependency(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> 
!amdaie.logicalobjectfifo> @@ -249,8 +265,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { scf.forall (%arg1, %arg2) in (2, 4) { %3 = affine.apply #map(%arg2) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -265,9 +282,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> 
!amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) // CHECK: } // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][128, 0] [128, 32] [128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) @@ -276,14 +294,16 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_consumer_with_loop_dependency(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : 
memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> scf.forall (%arg1, %arg2) in (2, 4) { %3 = affine.apply #map(%arg2) %4 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %5 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %6 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[%arg2, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %4[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> @@ -327,15 +347,17 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1], %[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1]) // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, 
{} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 2 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 2 + 1)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_producer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -344,8 +366,9 @@ module attributes 
{hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -363,9 +386,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-DAG: %[[OBJ_FIFO_L2_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK-DAG: %[[OBJ_FIFO_L2_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (2, 4) { -// CHECK: %[[OBJ_FIFO_L0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: 
amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK: %[[OBJ_FIFO_L1_0:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[OBJ_FIFO_L1_1:.+]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[OBJ_FIFO_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) // CHECK: } // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 0, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_0]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) // CHECK-DAG: amdaie.dma_cpy_nd(%[[OBJ_FIFO_L3]][0, 32, 0] [4, 32, 32] [8192, 128, 1], %[[OBJ_FIFO_L2_1]][0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) @@ -375,6 +399,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @split_consumer_with_loop_dependency_and_stride(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : memref<8x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x1x32x32xi32, 1 : 
i32> -> !amdaie.logicalobjectfifo> @@ -382,8 +407,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [8, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<8x1x32x32xi32, 1 : i32> @@ -396,18 +422,15 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-LABEL: func.func @change_split_factor_with_gcd_for_producer // CHECK-DAG: %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_1:.*]] = 
amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1], %[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) { -// CHECK: 
%[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: } #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> @@ -415,6 +438,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @change_split_factor_with_gcd_for_producer(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = 
memref.alloc() : memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -423,8 +447,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1], %1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> memref.dealloc %alloc : memref<1x1x4x8x4x8xi32, 2 : i32> @@ -436,25 +461,23 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { // CHECK-LABEL: @change_split_factor_with_gcd_for_consumer // CHECK-DAG: %[[LOF_L3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_0:.*]] = 
amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_2:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: %[[LOF_L2_3:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[LOF_L2_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<2x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> // CHECK: scf.forall (%[[IV0:.*]], %[[IV1:.*]]) in (8, 4) { -// CHECK: %[[LOF_L1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: %[[LOF_L1_0:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK: %[[LOF_L1_1:.*]] = amdaie.logicalobjectfifo.from_memref %{{.+}}, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_1]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_0]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : 
(!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L2_0]][%[[IV1]], 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %[[LOF_L1_1]][0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) // CHECK: } -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 32, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_2]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) -// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 96, 0] [2, 32, 32] [16384, 128, 1], %[[LOF_L2_3]][0, 0, 0, 0] [1, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 0, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_0]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]) +// CHECK-DAG: amdaie.dma_cpy_nd(%[[LOF_L3]][0, 64, 0] [2, 64, 32] [16384, 128, 1], %[[LOF_L2_1]][0, 0, 0, 0] [2, 32, 1, 32] [1024, 32, 1024, 1]) #executable_target_amdaie_pdi_fb = #hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu1_4col", ukernels = "none"}> #map = affine_map<(d0) -> (d0 * 2)> #map1 = affine_map<(d0) -> (d0 * 2 + 1)> module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { func.func @change_split_factor_with_gcd_for_consumer(%arg0: memref<256x128xi32>) { %alloc = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x4x8x4x8xi32, 2 : i32> %alloc_0 = memref.alloc() : 
memref<4x1x32x32xi32, 1 : i32> %0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<256x128xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<4x1x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> @@ -462,8 +485,9 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { %3 = affine.apply #map(%arg2) %4 = affine.apply #map1(%arg2) %5 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> - %6 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %7 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x4x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %7 = amdaie.dma_cpy_nd(%1[%4, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %5[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%1[%3, 0, 0, 0] [1, 1, 32, 32] [1024, 1024, 32, 1], %6[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 4, 8] [1024, 1024, 32, 8, 256, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) } {mapping = [#gpu.block, #gpu.block]} %2 = amdaie.dma_cpy_nd(%0[0, 0] [256, 32] [128, 1], %1[0, 0, 0, 0] [4, 32, 1, 32] [1024, 32, 1024, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) memref.dealloc %alloc_0 : memref<4x1x32x32xi32, 1 : i32> @@ -471,3 +495,73 @@ module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { return } } + +// ----- + +// This test demonstrates the case when the split factor is decided not simply by the number of +// columns but by the number of unique
producers/consumers. In the example, although we are +// using 8 AIE columns, L2 LHS and output buffers are not split because there's only one +// producer/consumer, while L2 RHS buffer is split into 2 because there are 2 producers/consumers. +// +// CHECK-LABEL: @pack_peel_4_level_4x8_Strix +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 8) { +// CHECK: amdaie.dma_cpy_nd(%[[LOF_LHS_L2:.*]][0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %{{.*}}[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_0:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 0] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: amdaie.dma_cpy_nd(%[[LOF_RHS_L2_1:.*]][0, 0, 0, 0] [8, 64, 8, 32] [2048, 32, 16384, 1], %{{.*}}[0, 0, 128] [512, 2, 128] [4096, 256, 1]) : (!amdaie.logicalobjectfifo>, +// CHECK: scf.forall (%{{.*}}, %{{.*}}) in (2, 2) { +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_0]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %[[LOF_RHS_L2_1]][%{{.*}}, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %[[LOF_LHS_L2]][0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : +// CHECK: amdaie.dma_cpy_nd(%[[LOF_OUT_L2:.*]][0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %{{.*}}[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : +// CHECK: } +// CHECK: amdaie.dma_cpy_nd(%{{.*}}[0, 0] [256, 512] [4096, 1], %[[LOF_OUT_L2]][0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : +// CHECK: } +#executable_target_amdaie_pdi_fb = 
#hal.executable.target<"amd-aie", "amdaie-pdi-fb", {num_cols = 8 : i32, num_rows = 4 : i32, target_device = "npu4", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_pdi_fb} { + func.func @pack_peel_4_level_4x8_Strix(%lhs: memref<512x512xi32>, %rhs: memref<512x4096xi32>, %out: memref<512x4096xi32>) { + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %alloc = memref.alloc() : memref<1x1x8x8x4x4xi32, 2 : i32> + %alloc_0 = memref.alloc() : memref<1x1x8x8x8x4xi32, 2 : i32> + %alloc_1 = memref.alloc() : memref<1x1x8x8x4x8xi32, 2 : i32> + %alloc_2 = memref.alloc() : memref<16x8x32x32xi32, 1 : i32> + %alloc_3 = memref.alloc() : memref<16x8x64x32xi32, 1 : i32> + %alloc_4 = memref.alloc() : memref<8x8x32x64xi32, 1 : i32> + %lof_0_1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<16x8x32x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_1_1 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<16x8x64x32xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_2_1 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<8x8x32x64xi32, 1 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_0 = amdaie.logicalobjectfifo.from_memref %lhs, {} : memref<512x512xi32> -> !amdaie.logicalobjectfifo> + %lof_1_0 = amdaie.logicalobjectfifo.from_memref %rhs, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + %lof_2_0 = amdaie.logicalobjectfifo.from_memref %out, {} : memref<512x4096xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg0, %arg1) in (2, 8) { + %0 = amdaie.dma_cpy_nd(%lof_2_1[0, 0, 0, 0] [8, 32, 8, 64] [16384, 64, 2048, 1], %lof_0_0[0, 0] [256, 512] [512, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %1 = amdaie.dma_cpy_nd(%lof_1_1[0, 0, 0, 0] [8, 64, 16, 32] [2048, 32, 16384, 1], %lof_1_0[0, 0] [512, 512] [4096, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg2, %arg3) in (2, 2) { + %of0 = affine.apply 
affine_map<(d0) -> (d0 * 8)>(%arg2) + %of1 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg2) + %tile_1_2 = amdaie.tile(%c1, %c2) + %tile_0_2 = amdaie.tile(%c0, %c2) + %lof_1_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_0_2 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<1x1x8x8x8x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_c_2 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<1x1x8x8x4x8xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %3 = amdaie.dma_cpy_nd(%lof_0_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of0, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%lof_1_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 8, 8, 4] [2048, 2048, 32, 4, 256, 1], %lof_1_1[%of1, 0, 0, 0] [1, 1, 64, 32] [16384, 2048, 32, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.dma_cpy_nd(%lof_c_2[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 8] [2048, 2048, 32, 8, 256, 1], %lof_2_1[0, 0, 0, 0] [1, 1, 32, 64] [16384, 2048, 64, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %lof_0_2_8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<1x1x8x8x4x4xi32, 2 : i32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%lof_0_1[0, 0, 0, 0] [1, 1, 32, 32] [8192, 1024, 32, 1], %lof_0_2_8[0, 0, 0, 0, 0, 0] [1, 1, 8, 4, 8, 4] [1024, 1024, 16, 4, 128, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } + %2 = amdaie.dma_cpy_nd(%lof_2_0[0, 0] [256, 512] [4096, 1], %lof_0_1[0, 0, 0, 0] [8, 32, 16, 32] [1024, 32, 8192, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } + memref.dealloc %alloc_4 : memref<8x8x32x64xi32, 1 : i32> + memref.dealloc %alloc_3 : memref<16x8x64x32xi32, 1 : i32> + memref.dealloc %alloc_2 : memref<16x8x32x32xi32, 1 : i32> + memref.dealloc %alloc_1 : memref<1x1x8x8x4x8xi32, 2 : i32> + 
memref.dealloc %alloc_0 : memref<1x1x8x8x8x4xi32, 2 : i32> + memref.dealloc %alloc : memref<1x1x8x8x4x4xi32, 2 : i32> + return + } +}