From f13c033ab7b43c7a8bdffc0de1238c7bb0312c82 Mon Sep 17 00:00:00 2001 From: Jorn Tuyls Date: Wed, 5 Feb 2025 15:43:56 +0100 Subject: [PATCH] [AssignTiles] Refactor pass to make it more extensible (#1079) This PR refactors the `AMDAIEAssignTiles` pass to prepare for adding more complex tile assignment strategies to make effective use of the available memory. This PR on purpose doesn't make any (major) change in how the tile assignment happens currently. --- .../Transforms/AMDAIEAssignTiles.cpp | 524 +++++++++--------- .../AMDAIEDistributeCoresAndObjectFifos.cpp | 23 +- .../iree-amd-aie/Transforms/Transforms.h | 9 +- .../Transforms/Utils/AMDAIEOpUtils.h | 40 ++ .../Transforms/test/assign_tiles.mlir | 108 ++-- .../distribute_cores_and_objectfifos.mlir | 8 +- 6 files changed, 370 insertions(+), 342 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp index 8cb37678a..58be729d2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignTiles.cpp @@ -6,6 +6,7 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Utils/AMDAIEOpUtils.h" #include "iree-amd-aie/Transforms/Utils/AMDAIEUtils.h" #include "iree-amd-aie/aie_runtime/iree_aie_runtime.h" #include "mlir/IR/Iterators.h" @@ -122,289 +123,300 @@ LogicalResult duplicateGlobalObjFifos(RewriterBase &rewriter, Operation *op) { return success(); } -/// Assign tiles to the logical objectfifos with local memory space (L1). -/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. 
-LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op) { - WalkResult res = - op->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { - Attribute memSpace = logicalObjectFifo.getMemorySpace(); - if (!memSpace || dyn_cast(memSpace).getInt() != 2) - return WalkResult::advance(); - - llvm::SmallSetVector, 16> tileLocations; - if (failed(findUsersInCoreAndAddTiles( - logicalObjectFifo, - cast( - logicalObjectFifo.getOperation()), - tileLocations))) { - return WalkResult::interrupt(); - } - // Handle subviews. +/// Base class for tile allocators to support different tile assignment +/// strategies. +class TileAllocatorBase { + public: + TileAllocatorBase(RewriterBase &rewriter, + const AMDAIE::AMDAIEDeviceModel &deviceModel) + : rewriter(rewriter), deviceModel(deviceModel) {} + virtual ~TileAllocatorBase() = default; + /// Assign tiles to the provided vector of logical objectFifos on the same + /// memory space. This method expects a set of valid tile candidates to be + /// provided inside the operation. + virtual LogicalResult assignTiles( + SmallVector &objFifos, + uint8_t memSpace, function_ref emitError) = 0; + + protected: + RewriterBase &rewriter; + const AMDAIE::AMDAIEDeviceModel &deviceModel; +}; + +/// A custom tile allocator that takes into consideration the usage and column +/// of users to determine tile locations. 
+class UsageAndColumnBasedTileAllocator final : public TileAllocatorBase { + public: + DenseMap> uniqueL3L2Pair; + + UsageAndColumnBasedTileAllocator( + RewriterBase &rewriter, const AMDAIE::AMDAIEDeviceModel &deviceModel, + DenseMap> uniqueL3L2Pair) + : TileAllocatorBase(rewriter, deviceModel), + uniqueL3L2Pair(uniqueL3L2Pair) {} + + LogicalResult assignTiles( + SmallVector &objFifos, + uint8_t memSpace, function_ref emitError) override { + assert(llvm::all_of(objFifos, + [&](AMDAIE::LogicalObjFifoOpInterface objFifo) { + return objFifo.getMemorySpaceAsUInt() == memSpace; + }) && + "All logical objectFifos should have ths same memory space"); + if (memSpace == 2) return assignLocalTiles(objFifos, memSpace, emitError); + if (memSpace == 0 || memSpace == 1) + return assignNonLocalTiles(objFifos, memSpace, emitError); + return emitError() << "Unsupported memory space : " + << std::to_string(memSpace); + } + + private: + /// Assign tiles to the logical objectfifos with local memory space (L1). + /// The tiles are derived from the usage of the logical objectfifos within + /// core operations, which are already assigned a tile location. + LogicalResult assignLocalTiles( + SmallVector &objFifos, + uint8_t memSpace, function_ref emitError) { + assert(memSpace == 2 && "Local memory space should be `2`."); + for (AMDAIE::LogicalObjFifoOpInterface objFifo : objFifos) { + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles(objFifo, objFifo, tileLocations))) { + return failure(); + } + // Handle subviews. 
+ if (auto fromMemrefOp = dyn_cast( + objFifo.getOperation())) { for (Operation *userOp : - logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + fromMemrefOp.getMemref().getDefiningOp()->getUsers()) { if (auto subviewOp = dyn_cast(userOp)) { - if (failed(findUsersInCoreAndAddTiles( - subviewOp, - cast( - logicalObjectFifo.getOperation()), - tileLocations))) { - return WalkResult::interrupt(); + if (failed(findUsersInCoreAndAddTiles(subviewOp, objFifo, + tileLocations))) { + return failure(); } } } + } - SmallVector tiles; - tiles.reserve(tileLocations.size()); - rewriter.setInsertionPoint(logicalObjectFifo); - for (auto [column, row] : tileLocations) { - auto colIndex = rewriter.create( - rewriter.getUnknownLoc(), column); - auto rowIndex = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create( - rewriter.getUnknownLoc(), colIndex, rowIndex); - tiles.push_back(tileOp.getResult()); - } - // Sort for deterministic output IR. - llvm::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileValueColumnAndRowComparator); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast( - logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles); - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - return success(); -} - -/// Assign a set of candidate physical AIE tiles to logical objectFifos. This -/// rewrite takes an iterative approach by matching logical objectfifos and only -/// assigning tiles when linked through dma ops with other logical objectfifos -/// which already have tiles assigned. If the linked logical objectfifos don't -/// have tiles assigned yet, we will return a failure and give the linked -/// logical objectfifos a chance to assign tiles before returning to this one. 
-class FillTiles - : public OpInterfaceRewritePattern { - using OpInterfaceRewritePattern< - AMDAIE::LogicalObjFifoOpInterface>::OpInterfaceRewritePattern; - - public: - FillTiles(MLIRContext *context, const AMDAIE::AMDAIEDeviceModel &deviceModel) - : OpInterfaceRewritePattern(context), deviceModel(deviceModel) {} - - LogicalResult matchAndRewrite( - AMDAIE::LogicalObjFifoOpInterface logicalObjectFifo, - PatternRewriter &rewriter) const override { - LLVM_DEBUG(llvm::dbgs() << "FillTiles: " << logicalObjectFifo << "\n"); - if (!logicalObjectFifo.getTiles().empty()) { - return rewriter.notifyMatchFailure(logicalObjectFifo, - "Tiles are already assigned."); - } - uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); - if (memSpace != 0 && memSpace != 1) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Skip logical objFifos that don't operate on L3 or L2"); + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(objFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create(rewriter.getUnknownLoc(), + colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + if (failed(objFifo.replaceWithNewTiles(rewriter, tiles))) { + return objFifo.emitOpError() << "could not replace its tiles."; + } } + return success(); + } - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - if (failed(dstRes) && failed(srcRes)) { - return rewriter.notifyMatchFailure(logicalObjectFifo, - "No source or target tiles found"); - } + /// Assign tile locations to objectFifos. 
Start by searching for a set of + /// candidate tile locations and then assign tiles based on a simple + /// usage-based model that prioritizes tiles that have the least usage. + LogicalResult assignNonLocalTiles( + SmallVector &objFifos, + uint8_t memSpace, function_ref emitError) { + assert((memSpace == 0 || memSpace == 1) && + "The memory space of non-local objectFifos should be `0` or `1`"); + // Keep track of the buffer usage on tiles to try distributing buffers + // evenly over available tile resources. + DenseMap tileLocToUsage; + auto tileLocAndUsageCmp = [&](const TileLoc &a, const TileLoc &b) -> bool { + size_t usageA = tileLocToUsage[a]; + size_t usageB = tileLocToUsage[b]; + if (usageA < usageB) return true; + if (usageA > usageB) return false; + if (a.col < b.col) return true; + if (a.col > b.col) return false; + if (a.row < b.row) return true; + if (a.row > b.row) return false; + assert(false && "same tiles should never be compared"); + return false; + }; SmallVector memSpaceRows = deviceModel.getMemSpaceRows(memSpace); if (memSpaceRows.size() == 0) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "No rows found for the memory space of this logical objFifo"); - } - if (memSpaceRows.size() > 1) { - logicalObjectFifo.emitWarning() - << "has a memory space with multiple available rows, the first one " - "of which is chosen for tile assignment, but this might not lead " - "to good usage of the available resources."; + return emitError() + << "No rows found for the memory space of this logical objFifo"; } uint32_t row = memSpaceRows[0]; - llvm::SmallSetVector, 16> tileLocations; - auto createTileLocations = - [&](SmallVector &tiles) -> LogicalResult { - // For deterministic and canonical output, sort on column index and erase - // duplicates. 
- std::sort(tiles.begin(), tiles.end(), - AMDAIE::TileOp::tileColumnComparator); - tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); - for (AMDAIE::TileOp tile : tiles) { - std::optional column = getConstantIntValue(tile.getCol()); - if (!column) - return rewriter.notifyMatchFailure(tile, "found non-constant column"); - tileLocations.insert(std::make_pair(column.value(), row)); + + for (AMDAIE::LogicalObjFifoOpInterface objFifo : objFifos) { + LLVM_DEBUG(llvm::dbgs() + << "Assign tile for objFifo: " << objFifo << "\n"); + + mlir::FunctionOpInterface funcOp = + objFifo->getParentOfType(); + if (!funcOp) { + return objFifo.emitOpError() + << "Could not find a function-like parent op."; } - return success(); - }; + FailureOr coreRegionInfo = getCoreRegionInfo(funcOp); + if (failed(coreRegionInfo)) return failure(); + int startCol = coreRegionInfo.value().startCol; + int numCols = coreRegionInfo.value().numCols; + llvm::SmallSetVector tileLocations; + for (int i = startCol; i < startCol + numCols; i++) + tileLocations.insert(TileLoc(i, row)); + if (tileLocations.empty()) { + return objFifo.emitOpError() << "No tile locations found for this " + "logical objFifo. 
Maybe in a next " + "iteration, with more information, a " + "tile location can be found."; + } + SmallVector tiles = tileLocations.takeVector(); + + // Sort tiles on priority column + left to right; + FailureOr maybePriorityCol = getPriorityColumn(objFifo); + if (failed(maybePriorityCol)) return failure(); + int64_t priorityCol = maybePriorityCol.value(); + llvm::sort(tiles, [&](const TileLoc &a, const TileLoc &b) { + if (b.col == priorityCol) return false; + return a.col == priorityCol || a.col < b.col; + }); - if (!targetTiles.empty() && !sourceTiles.empty()) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Found logical objectfifo with both source and target tiles, which " - "is not supported yet"); - } else if (!targetTiles.empty()) { - // Create tile locations for this logical objectfifo based on the - // consumers' tiles. - if (failed(createTileLocations(targetTiles))) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Could not find tile locations based on the consumers' tiles."); + // Here we are limiting the number of tile options to buffer count to + // avoid repeated accesses of the same buffer being assigned to + // different tiles. Because if the repeated access of the same buffer + // are assigned to different tiles, that unnecessarily ends up consuming + // more DMA channels on those new tiles than needed, and as a result we + // will end up exhausting the DMA channels. Currently the following fix + // works for L3 buffers. + auto fromMemrefOp = dyn_cast( + objFifo.getOperation()); + if (fromMemrefOp) { + Operation *defOp = fromMemrefOp.getMemref().getDefiningOp(); + if (defOp && uniqueL3L2Pair.contains(defOp)) + tiles.truncate( + std::min((size_t)uniqueL3L2Pair[defOp].size(), tiles.size())); } - } else if (!sourceTiles.empty()) { - // Create tile locations for this logical objectfifo based on producers' - // tiles. 
- if (failed(createTileLocations(sourceTiles))) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Could not find tile locations based on the producers' tiles."); + + // Assign a tile location. + TileLoc assignedTileLoc; + const auto *tileIt = llvm::find_if(tiles, [&](const TileLoc &tileLoc) { + return tileLoc.col == priorityCol; + }); + if (tileIt != tiles.end()) { + assignedTileLoc = *tileIt; + LLVM_DEBUG(llvm::dbgs() + << "Assign to priority column: " << priorityCol << "\n"); + } else { + LLVM_DEBUG(llvm::dbgs() << "Assign based on usage comparator\n"); + assignedTileLoc = + *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); } - } else { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Don't assign this logicalObjectFifo to a physical tile (yet!). Wait " - "for other logical objectfifos to be assigned first."); - } - if (tileLocations.empty()) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "No tile locations found for this logical objFifo. Maybe in a next " - "iteration, with more information, a tile location can be found."); - } - rewriter.setInsertionPoint(logicalObjectFifo); - SmallVector tiles; - tiles.reserve(tileLocations.size()); - for (auto [column, row] : tileLocations) { + // Increase usage of the chosen tile as a new logical objectFifo will be + // assigned to it. This allows distributing the logical objectFifos + // evenly across the available tile resources. 
+ LLVM_DEBUG(llvm::dbgs() + << "Assign to tile (col, row): (" << assignedTileLoc.col + << ", " << assignedTileLoc.row << ")\n"); + tileLocToUsage[assignedTileLoc] += 1; + + rewriter.setInsertionPoint(objFifo); auto getCol = rewriter.create( - rewriter.getUnknownLoc(), column); + rewriter.getUnknownLoc(), assignedTileLoc.col); auto getRow = rewriter.create( - rewriter.getUnknownLoc(), row); - auto tileOp = rewriter.create(rewriter.getUnknownLoc(), - getCol, getRow); - tiles.push_back(tileOp.getResult()); - } - if (failed(logicalObjectFifo.replaceWithNewTiles(rewriter, tiles))) { - return rewriter.notifyMatchFailure( - logicalObjectFifo, - "Could not replace the tiles in the provided logical objectFifo."); + rewriter.getUnknownLoc(), assignedTileLoc.row); + auto assignedTileOp = rewriter.create( + rewriter.getUnknownLoc(), getCol, getRow); + SmallVector tileResults = { + cast(assignedTileOp.getResult())}; + if (failed(objFifo.replaceWithNewTiles(rewriter, tileResults))) { + return objFifo.emitOpError() << "Could not replace its tiles."; + } } return success(); } - private: - // The device model used to retrieve device specific information. - const AMDAIEDeviceModel &deviceModel; + /// Utility to return a priority column for the provided objectFifo if tiles + /// of users are all within the same column. Returns `-1` if no priority + /// column was found. + FailureOr getPriorityColumn( + AMDAIE::LogicalObjFifoOpInterface objFifo) const { + int64_t priorityCol{-1}; + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(objFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(objFifo, sourceTiles); + if (failed(dstRes) && failed(srcRes)) { + return objFifo.emitOpError() << "No source or target tiles found"; + } + SmallVector targetColsVec = + llvm::map_to_vector(targetTiles, [](AMDAIE::TileOp tileOp) { + std::optional column = getConstantIntValue(tileOp.getCol()); + return column.has_value() ? 
column.value() : -1; + }); + DenseSet targetCols(targetColsVec.begin(), targetColsVec.end()); + SmallVector sourceColsVec = + llvm::map_to_vector(sourceTiles, [](AMDAIE::TileOp tileOp) { + std::optional column = getConstantIntValue(tileOp.getCol()); + return column.has_value() ? column.value() : -1; + }); + DenseSet sourceCols(sourceColsVec.begin(), sourceColsVec.end()); + if (targetCols.size() == 1 && sourceCols.size() == 1) { + int64_t targetCol = *targetCols.begin(); + int64_t sourceCol = *sourceCols.begin(); + if (targetCol != -1 && sourceCol != -1 && targetCol != sourceCol) { + return objFifo.emitOpError() + << "Found two different priority columns, column " << targetCol + << " on the target side and column " << sourceCol + << " on the source side."; + } else { + priorityCol = targetCol; + } + } else if (targetCols.size() == 1) { + priorityCol = *targetCols.begin(); + } else if (sourceCols.size() == 1) { + priorityCol = *sourceCols.begin(); + } + return priorityCol; + } }; -/// Assign tile locations to objectFifos. Start by searching for a set of -/// candidate tile locations and then assign tiles based on a simple usage-based -/// model that prioritizes tiles that have the least usage. -LogicalResult assignNonLocalTiles( +/// Assign tile locations to objectFifos based on available resources. Visit +/// objectFifos based on locality to the cores, i.e. first visit the objectFifos +/// on L1, then L2, etc. +LogicalResult assignTiles( RewriterBase &rewriter, Operation *op, const AMDAIEDeviceModel &deviceModel, DenseMap> uniqueL3L2Pair) { - MLIRContext *context = rewriter.getContext(); if (failed(clearNonLocalTiles(rewriter, op))) return op->emitOpError() << "failed to clear non-local tile assignments"; - // Find and fill the tile candidates. 
- RewritePatternSet fillTilePatterns(context); - fillTilePatterns.insert(context, deviceModel); - if (failed(applyPatternsGreedily(op, std::move(fillTilePatterns)))) { - return op->emitOpError() - << "collection of tile candidates for logical objectFifos failed"; - } - if (failed(verify(op, true))) { - return failure(); - } - LLVM_DEBUG(llvm::dbgs() << "After fillTiles: \n" << *op << "\n"); - - // Keep track of the buffer usage on tiles to try distributing buffers evenly - // over available tile resources. - DenseMap tileLocToUsage; - auto tileLocAndUsageCmp = [&](AMDAIE::TileOp a, AMDAIE::TileOp b) -> bool { - int64_t colA = getConstantIndexOrAssert(a.getCol()); - int64_t rowA = getConstantIndexOrAssert(a.getRow()); - int64_t colB = getConstantIndexOrAssert(b.getCol()); - int64_t rowB = getConstantIndexOrAssert(b.getRow()); - size_t usageA = tileLocToUsage[TileLoc(colA, rowA)]; - size_t usageB = tileLocToUsage[TileLoc(colB, rowB)]; - if (usageA < usageB) return true; - if (usageA > usageB) return false; - if (colA < colB) return true; - if (colA > colB) return false; - if (rowA < rowB) return true; - if (rowA > rowB) return false; - assert(false && "same tiles should never be compared"); - return false; - }; - - // After filling tile candidates, find and assign a specific one. 
- DenseMap logicalObjFifoToTileId; - WalkResult res = - op->walk([&](AMDAIE::LogicalObjFifoOpInterface logicalObjectFifo) { - uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); - if (memSpace != 0 && memSpace != 1) return WalkResult::advance(); - if (logicalObjectFifo.getTiles().size() == 0) { - logicalObjectFifo.emitOpError() - << "should have at least one tile candidate"; - return WalkResult::interrupt(); - } - - SmallVector tiles = - llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { - return dyn_cast_if_present(tile.getDefiningOp()); - }); - - // Here we are limiting the number of tile options to buffer count to - // avoid repeated accesses of the same buffer being assigned to - // different tiles. Because if the repeated access of the same buffer - // are assigned to different tiles, that unnecessarily ends up consuming - // more DMA channels on those new tiles than needed, and as a result we - // will end up exhausting the DMA channels. Currently the following fix - // works for L3 buffers. - auto fromMemrefOp = dyn_cast( - logicalObjectFifo.getOperation()); - if (fromMemrefOp) { - Operation *defOp = fromMemrefOp.getMemref().getDefiningOp(); - if (defOp && uniqueL3L2Pair.contains(defOp)) - tiles.truncate( - std::min((size_t)uniqueL3L2Pair[defOp].size(), tiles.size())); - } - AMDAIE::TileOp assignedTileOp = - *std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp); + UsageAndColumnBasedTileAllocator tileAllocator(rewriter, deviceModel, + uniqueL3L2Pair); - // Increase usage of the chosen tile as a new logical objectFifo will be - // assigned to it. This allows distributing the logical objectFifos - // evenly across the available tile resources. 
- int64_t col = getConstantIndexOrAssert(assignedTileOp.getCol()); - int64_t row = getConstantIndexOrAssert(assignedTileOp.getRow()); - tileLocToUsage[TileLoc(col, row)] += 1; - - rewriter.setInsertionPoint(logicalObjectFifo); - SmallVector tileResults = { - cast(assignedTileOp.getResult())}; - if (failed( - logicalObjectFifo.replaceWithNewTiles(rewriter, tileResults))) { - logicalObjectFifo.emitOpError() << "could not replace its tiles."; - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); + DenseMap> + memSpaceToObjFifos; + op->walk([&](AMDAIE::LogicalObjFifoOpInterface logicalObjectFifo) { + uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt(); + memSpaceToObjFifos[memSpace].push_back(logicalObjectFifo); + }); + SmallVector memSpaces = llvm::map_to_vector( + memSpaceToObjFifos, + [](const std::pair> + &memSpaceAndObjFifos) { return memSpaceAndObjFifos.first; }); + llvm::sort(memSpaces, std::greater()); + for (uint8_t memSpace : memSpaces) { + if (failed( + tileAllocator.assignTiles(memSpaceToObjFifos[memSpace], memSpace, + [&]() { return op->emitOpError(); }))) { + return failure(); + } + } return success(); } @@ -435,13 +447,6 @@ void AMDAIEAssignTilesPass::runOnOperation() { } AMDAIEDeviceModel deviceModel = getDeviceModel(maybeDevice.value()); - // Assign tile locations to logical objectFifos on local (L1) memory. - if (failed(assignLocalTiles(rewriter, parentOp))) { - parentOp->emitOpError() << "local tile assignment failed"; - return signalPassFailure(); - } - LLVM_DEBUG(llvm::dbgs() << "After assignLocalTiles: \n" << *parentOp << "\n"); - // Duplicate global objectFifos for each strided copy-like operation user to // allow global logical objectFifos to be assigned to different tile // locations. @@ -491,8 +496,7 @@ void AMDAIEAssignTilesPass::runOnOperation() { return WalkResult::advance(); }); // Assign tile locations to logical objectFifos on non-local (not L1) memory. 
- if (failed(assignNonLocalTiles(rewriter, parentOp, deviceModel, - uniqueL3L2Pair))) { + if (failed(assignTiles(rewriter, parentOp, deviceModel, uniqueL3L2Pair))) { parentOp->emitOpError() << "non-local tile assignment failed"; return signalPassFailure(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 5626cc496..540dd9141 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -507,23 +507,12 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { LLVM_DEBUG(llvm::dbgs() << "Module after insertLogicalObjectFifoAccess: \n" << moduleOp << "\n"); - // Assign tile locations to logical objectfifos on local (L1) memory. - if (failed(assignLocalTiles(rewriter, moduleOp))) { - moduleOp.emitOpError() << "local tile assignment failed"; - return signalPassFailure(); - } - - if (failed(verify(moduleOp, true))) { - return signalPassFailure(); - } - - LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalTiles: \n" - << moduleOp << "\n"); - DenseMap> uniqueL3L2Pair; - // Assign tile locations to logical objectfifos on non-local (not L1) memory. - if (failed(assignNonLocalTiles(rewriter, moduleOp, deviceModel, - uniqueL3L2Pair))) { + // Assign tile locations to all logical objectfifos. + // TODO(jornt): This is needed inside this pass to make the output stable with + // respect to cse. When that gets resolved, we can avoid convoluting this + // pass. 
+ if (failed(assignTiles(rewriter, moduleOp, deviceModel, uniqueL3L2Pair))) { moduleOp.emitOpError() << "local tile assignment failed"; return signalPassFailure(); } @@ -532,7 +521,7 @@ void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { return signalPassFailure(); } - LLVM_DEBUG(llvm::dbgs() << "Module after assignNonLocalTiles: \n" + LLVM_DEBUG(llvm::dbgs() << "Module after assignTiles: \n" << moduleOp << "\n"); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h index 9b2bae6d1..21bd240b3 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Transforms.h @@ -14,13 +14,8 @@ namespace mlir::iree_compiler::AMDAIE { -/// Assign tile locations to the logical objectfifos with local memory space -/// (L1). -LogicalResult assignLocalTiles(RewriterBase &rewriter, Operation *op); - -/// Assign tile locations to the logical objectfifos with non-local memory space -/// (L2, L3 etc, not L1). -LogicalResult assignNonLocalTiles( +/// Assign tile locations to the logical objectfifos. 
+LogicalResult assignTiles( RewriterBase &rewriter, Operation *op, const AMDAIEDeviceModel &deviceModel, DenseMap> uniqueL3L2Pair); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEOpUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEOpUtils.h index d71975b52..b07e971bb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEOpUtils.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Utils/AMDAIEOpUtils.h @@ -7,6 +7,8 @@ #ifndef IREE_AMD_AIE_TRANSFORMS_AMDAIEOPUTILS_H_ #define IREE_AMD_AIE_TRANSFORMS_AMDAIEOPUTILS_H_ +#include + #include "iree-amd-aie/IR/AMDAIEOps.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/Block.h" @@ -14,6 +16,44 @@ namespace mlir::iree_compiler::AMDAIE { +/// Utility struct to describe a region of core tiles on the array. +struct CoreRegionInfo { + int64_t startCol{0}; + int64_t numCols{0}; + int64_t startRow{0}; + int64_t numRows{0}; +}; + +/// Utility to return the core region info based on the provided op. This will +/// look for a parent Function op and for the tile region in which all core +/// tiles are found. 
+template +FailureOr getCoreRegionInfo(Op op) { + int64_t firstCol{std::numeric_limits::max()}; + int64_t lastCol{0}; + int64_t firstRow{std::numeric_limits::max()}; + int64_t lastRow{0}; + WalkResult res = op->walk([&](AMDAIE::CoreOp coreOp) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional maybeCol = getConstantIntValue(tileOp.getCol()); + std::optional maybeRow = getConstantIntValue(tileOp.getRow()); + if (!maybeCol || !maybeRow) { + coreOp.emitOpError() << "has non-constant tile location"; + return WalkResult::interrupt(); + } + int64_t col = maybeCol.value(); + int64_t row = maybeRow.value(); + if (col < firstCol) firstCol = col; + if (col > lastCol) lastCol = col; + if (row < firstRow) firstRow = row; + if (row > lastRow) lastRow = row; + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return CoreRegionInfo( + {firstCol, lastCol - firstCol + 1, firstRow, lastRow - firstRow + 1}); +} + /// Return a vector of the parent operations that are of type 'OpTy', including /// this op if it has type 'OpTy' template diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir index c01478eb4..00baeb982 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_tiles.mlir @@ -103,7 +103,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 2> -// CHECK: amdaie.workgroup +// CHECK-DAG: amdaie.workgroup // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) // CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) @@ 
-203,7 +203,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<2048xi32> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<2048xi32, 1> // CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<2048xi32, 2> -// CHECK: amdaie.workgroup +// CHECK-DAG: amdaie.workgroup // CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) @@ -518,38 +518,38 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // This is needed for enabling 4x8 AIE array on Strix. // CHECK-LABEL: @same_tile_assg_for_l3_on_diff_block -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK-DAG: %[[C30:.*]] = arith.constant 30 : index -// CHECK: %[[LHS:.*]] = memref.alloc() : memref<2x3x4xi32> -// CHECK: %[[RHS:.*]] = memref.alloc() : memref<3x4x5xi32> -// CHECK: %[[OUT:.*]] = memref.alloc() : memref<4x5x6xi32> -// CHECK: scf.forall -// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0:.*]], %[[C0:.*]]) -// CHECK: %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_0_0]]} : -// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C30:.*]] = arith.constant 30 : index +// CHECK-DAG: %[[LHS:.*]] = memref.alloc() : memref<2x3x4xi32> +// CHECK-DAG: %[[RHS:.*]] = memref.alloc() : memref<3x4x5xi32> +// CHECK-DAG: %[[OUT:.*]] = memref.alloc() : memref<4x5x6xi32> +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: scf.forall +// CHECK: %[[LOF_LHS_L3:.*]] = 
amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_0_0]]} : +// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) : -// CHECK: %[[LOF_RHS_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_0_0]]} : -// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_RHS_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_0_0]]} : +// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1:.*]], %[[C0:.*]]) -// CHECK: %[[LOF_RHS_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_1_0]]} : -// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_RHS_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_1_0]]} : +// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: scf.for -// CHECK-NOT: amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_1_0]]} : -// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd +// CHECK: scf.for +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_1_0]]} : +// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) : -// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: %[[LOF_OUT_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_0_0]]} : -// CHECK: %[[OUT_L2_to_L3_0:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_OUT_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_0_0]]} : +// CHECK: %[[OUT_L2_to_L3_0:.*]] = 
amdaie.dma_cpy_nd // CHECK-SAME: (%[[LOF_OUT_L3_0]][0, 0, 0] [1, 5, 6] [30, 6, 1] -// CHECK: %[[LOF_OUT_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_1_0]]} : -// CHECK: %[[OUT_L2_to_L3_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_OUT_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_1_0]]} : +// CHECK: %[[OUT_L2_to_L3_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: (%[[LOF_OUT_L3_1]][0, 0, 1] [1, 5, 6] [30, 6, 1] #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu4", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -653,40 +653,40 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // This case is observed for 64x64x64 4x8 AIE array on Strix. // CHECK-LABEL: @same_tile_assg_for_l3_on_same_block -// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index -// CHECK: %[[LHS:.*]] = memref.alloc() : memref<2x3x4xi32> -// CHECK: %[[RHS:.*]] = memref.alloc() : memref<3x4x5xi32> -// CHECK: %[[OUT:.*]] = memref.alloc() : memref<4x5x6xi32> -// CHECK: scf.forall -// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0:.*]], %[[C0:.*]]) -// CHECK: %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_0_0]]} : -// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[LHS:.*]] = memref.alloc() : memref<2x3x4xi32> +// CHECK-DAG: %[[RHS:.*]] = memref.alloc() : memref<3x4x5xi32> +// CHECK-DAG: %[[OUT:.*]] = memref.alloc() : memref<4x5x6xi32> +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK: scf.forall +// CHECK: %[[LOF_LHS_L3:.*]] = 
amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_0_0]]} : +// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) : -// CHECK: %[[LOF_RHS_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_0_0]]} : -// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_RHS_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_0_0]]} : +// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1:.*]], %[[C0:.*]]) -// CHECK: %[[LOF_RHS_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_1_0]]} : -// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_RHS_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_1_0]]} : +// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: amdaie.core -// CHECK: amdaie.core -// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd +// CHECK: amdaie.core +// CHECK: amdaie.core +// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) : -// CHECK-NOT: amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_1_0]]} : -// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd +// CHECK-NOT: amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_1_0]]} : +// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) : -// CHECK: amdaie.core -// CHECK: amdaie.core -// CHECK: %[[LOF_OUT_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_0_0]]} : -// CHECK: %[[OUT_L2_to_L3_0:.*]] = amdaie.dma_cpy_nd +// CHECK: amdaie.core +// CHECK: 
amdaie.core +// CHECK: %[[LOF_OUT_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_0_0]]} : +// CHECK: %[[OUT_L2_to_L3_0:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: (%[[LOF_OUT_L3_0]][0, 0, 0] [1, 5, 6] [30, 6, 1] -// CHECK: %[[LOF_OUT_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_1_0]]} : -// CHECK: %[[OUT_L2_to_L3_1:.*]] = amdaie.dma_cpy_nd +// CHECK: %[[LOF_OUT_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUT]], {%[[TILE_1_0]]} : +// CHECK: %[[OUT_L2_to_L3_1:.*]] = amdaie.dma_cpy_nd // CHECK-SAME: (%[[LOF_OUT_L3_1]][0, 0, 1] [1, 5, 6] [30, 6, 1] #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu4", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir index 57a15c673..37bb5cb45 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute_cores_and_objectfifos.mlir @@ -196,12 +196,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // // CHECK-LABEL: @hoist_dma_and_affine_single_loop_2x1 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) // CHECK-DAG: %[[TILE_0_3:.*]] = 
amdaie.tile(%[[C0]], %[[C3]]) // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) @@ -251,12 +251,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // // CHECK-LABEL: @unroll_dma_and_affine_single_loop // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) // CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) // CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) @@ -696,8 +696,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} // CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} // CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_1_0]]} -// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_1_0]]} // CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] // CHECK-SAME: %[[FROM_MEMREF_11]] // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_7]]