diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index b56bbc31d..161e10ab2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -349,8 +349,9 @@ LogicalObjectFifoAccessOp::getLogicalObjectFifo() { //===----------------------------------------------------------------------===// void LogicalObjectFifoAcquire::build(OpBuilder &b, mlir::OperationState &result, - Value dma, LogicalObjectFifoPort port) { - build(b, result, dma, port, b.getI32IntegerAttr(1)); + mlir::TypeRange resultTypes, Value dma, + LogicalObjectFifoPort port) { + build(b, result, resultTypes, dma, port, b.getI32IntegerAttr(1)); } //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 083b94db6..c910497bf 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -402,12 +402,14 @@ def AMDAIE_LogicalObjectFifoAcquire: OptionalAttr:$size ); + let results = (outs AnyAMDAIELogicalObjectFifoType:$output); + let assemblyFormat = [{ - `(` $dma `,` $port `)` attr-dict + `(` $dma `,` $port `)` attr-dict `->` type($output) }]; let builders = [ - OpBuilder<(ins "mlir::Value":$dma, "LogicalObjectFifoPort":$port)>, + OpBuilder<(ins "mlir::TypeRange":$resultTypes, "mlir::Value":$dma, "LogicalObjectFifoPort":$port)>, ]; } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 670e303a2..5b96319b1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -131,7 +131,7 @@ func.func 
@logicalobjectfifo_access(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { %0 = amdaie.dma_cpy_nd(%arg0[0, 0, 0, 0] [1, 1, 8, 16] [128, 128, 16, 1], %arg1[0, 0, 0, 0] [1, 1, 8, 16] [128, 16, 16, 1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} + %1 = amdaie.logicalobjectfifo.acquire(%0, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> return } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp new file mode 100644 index 000000000..3bed00cbf --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAccessToAcquireRelease.cpp @@ -0,0 +1,248 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "mlir/IR/IRMapping.h" +#include "mlir/IR/Iterators.h" + +#define DEBUG_TYPE "iree-amdaie-access-to-acquire-release" + +namespace mlir::iree_compiler::AMDAIE { + +namespace { + +/// Walk all read access operations within the core operations and insert +/// semaphore acquire and release stubs. Acquire operations will be inserted +/// at the location of the access operation and release operations will be +/// inserted before the next access or at the end of the block. +LogicalResult readAccessToAcquireRelease(Operation *parentOp) { + IRRewriter rewriter(parentOp->getContext()); + + SmallVector coreOps; + parentOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); + + // Map from DMA source/target logical objectFifos to those respective DMA + // operations. 
+ DenseMap logicalObjectFifoToDma; + parentOp->walk([&](AMDAIE::CircularDmaCpyNdOp dmaOp) { + logicalObjectFifoToDma[dmaOp.getSource()] = dmaOp; + logicalObjectFifoToDma[dmaOp.getTarget()] = dmaOp; + }); + + for (AMDAIE::CoreOp coreOp : coreOps) { + DenseMap + logicalObjectFifoToLastAccess; + WalkResult res = + coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp.getAccessType() != AMDAIE::MemoryAccess::Read) + return WalkResult::advance(); + + if (logicalObjectFifoToLastAccess.contains(accessOp.getInput())) { + rewriter.setInsertionPoint(accessOp); + rewriter.create( + rewriter.getUnknownLoc(), + logicalObjectFifoToDma[accessOp.getInput()].getResult(), + LogicalObjectFifoPort::Consume); + } + + if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { + accessOp.emitOpError() + << "read access not found as source of DMA operation"; + return WalkResult::interrupt(); + } + rewriter.setInsertionPoint(accessOp); + auto acquireOp = rewriter.create( + rewriter.getUnknownLoc(), + llvm::cast(accessOp.getInput().getType()), + logicalObjectFifoToDma[accessOp.getInput()].getResult(), + LogicalObjectFifoPort::Consume); + auto newAccessOp = rewriter.create( + rewriter.getUnknownLoc(), acquireOp.getResult(), + AMDAIE::MemoryAccess::Read); + rewriter.replaceAllUsesWith(accessOp.getResult(), + newAccessOp.getResult()); + logicalObjectFifoToLastAccess[accessOp.getInput()] = accessOp; + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + + // Insert release for remaining read access operations at end of block. 
+ for (auto &&[value, accessOp] : logicalObjectFifoToLastAccess) { + Block *parentBlock = accessOp->getBlock(); + if (!parentBlock->back().hasTrait()) { + rewriter.setInsertionPointToEnd(parentBlock); + } else { + rewriter.setInsertionPoint(parentBlock->getTerminator()); + } + if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { + accessOp.emitOpError() + << "read access not found as source of DMA operation"; + return failure(); + } + rewriter.create( + rewriter.getUnknownLoc(), logicalObjectFifoToDma[accessOp.getInput()], + LogicalObjectFifoPort::Consume); + } + } + return success(); +} + +/// Walk all write access operations within the core operations and insert +/// semaphore operations. Release operations will be inserted +/// at the location of the access operation and acquire operations will be +/// inserted after the preceding access or at the beginning of the block. +LogicalResult writeAccessToAcquireRelease(Operation *parentOp) { + IRRewriter rewriter(parentOp->getContext()); + + SmallVector coreOps; + parentOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); + + // Map from DMA source/target logical objectFifos to those respective DMA + // operations. + DenseMap logicalObjectFifoToDma; + parentOp->walk([&](AMDAIE::CircularDmaCpyNdOp dmaOp) { + logicalObjectFifoToDma[dmaOp.getSource()] = dmaOp; + logicalObjectFifoToDma[dmaOp.getTarget()] = dmaOp; + }); + + for (AMDAIE::CoreOp coreOp : coreOps) { + DenseMap> + logicalObjectFifoToAccesses; + DenseMap + logicalObjectFifoLastWriteAccesses; + WalkResult res = coreOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp + accessOp) { + // If there is another write op on the same logical objectFifo, + // release it before the acquire. 
+ if (logicalObjectFifoLastWriteAccesses.contains(accessOp.getInput())) { + AMDAIE::LogicalObjectFifoAccessOp prevAccess = + logicalObjectFifoLastWriteAccesses[accessOp.getInput()]; + if (!logicalObjectFifoToDma.contains(prevAccess.getInput())) { + prevAccess.emitOpError() + << "write access not found as source of DMA operation"; + return WalkResult::interrupt(); + } + rewriter.setInsertionPoint(accessOp); + rewriter.create( + rewriter.getUnknownLoc(), + logicalObjectFifoToDma[prevAccess.getInput()], + LogicalObjectFifoPort::Produce); + // Remove from last access as settled. + logicalObjectFifoLastWriteAccesses.erase(prevAccess.getInput()); + } + // Insert acquire for write access at first `Any` access or at start + // of block. + if (accessOp.getAccessType() == AMDAIE::MemoryAccess::Write) { + if (!logicalObjectFifoToAccesses.contains(accessOp.getInput())) { + rewriter.setInsertionPointToStart(accessOp->getBlock()); + } else { + AMDAIE::LogicalObjectFifoAccessOp firstAccess = + logicalObjectFifoToAccesses[accessOp.getInput()][0]; + rewriter.setInsertionPoint(firstAccess); + } + if (!logicalObjectFifoToDma.contains(accessOp.getInput())) { + accessOp.emitOpError() + << "write access not found as source of DMA operation"; + return WalkResult::interrupt(); + } + auto acquireOp = rewriter.create( + rewriter.getUnknownLoc(), + llvm::cast(accessOp.getInput().getType()), + logicalObjectFifoToDma[accessOp.getInput()].getResult(), + LogicalObjectFifoPort::Produce); + auto newAccessOp = rewriter.create( + rewriter.getUnknownLoc(), acquireOp.getResult(), + AMDAIE::MemoryAccess::Write); + + // Update uses of this access operation and the preceding ones. 
+ rewriter.replaceAllUsesWith(accessOp.getResult(), + newAccessOp.getResult()); + if (logicalObjectFifoToAccesses.contains(accessOp.getInput())) { + for (AMDAIE::LogicalObjectFifoAccessOp precedingAccessOp : + logicalObjectFifoToAccesses[accessOp.getInput()]) { + rewriter.replaceAllUsesWith(precedingAccessOp.getResult(), + newAccessOp.getResult()); + } + } + + // Insert into last access map + logicalObjectFifoLastWriteAccesses[accessOp.getInput()] = accessOp; + } + // Insert any access operation into first access map. + if (!logicalObjectFifoToAccesses.contains(accessOp.getInput())) { + logicalObjectFifoToAccesses[accessOp.getInput()] = {accessOp}; + } else { + logicalObjectFifoToAccesses[accessOp.getInput()].push_back(accessOp); + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + + // Insert release for remaining access operations at end of block. + for (auto &&[value, writeAccessOp] : logicalObjectFifoLastWriteAccesses) { + Block *parentBlock = writeAccessOp->getBlock(); + if (!parentBlock->back().hasTrait()) { + rewriter.setInsertionPointToEnd(parentBlock); + } else { + rewriter.setInsertionPoint(parentBlock->getTerminator()); + } + if (!logicalObjectFifoToDma.contains(writeAccessOp.getInput())) { + writeAccessOp.emitOpError() + << "write access not found as source of DMA operation"; + return failure(); + } + rewriter.create( + rewriter.getUnknownLoc(), + logicalObjectFifoToDma[writeAccessOp.getInput()], + LogicalObjectFifoPort::Produce); + } + } + return success(); +} + +class AMDAIEAccessToAcquireReleasePass + : public impl::AMDAIEAccessToAcquireReleaseBase< + AMDAIEAccessToAcquireReleasePass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + AMDAIEAccessToAcquireReleasePass() = default; + AMDAIEAccessToAcquireReleasePass( + const AMDAIEAccessToAcquireReleasePass &pass){}; + void runOnOperation() override; +}; + +void 
AMDAIEAccessToAcquireReleasePass::runOnOperation() { + Operation *parentOp = getOperation(); + if (failed(readAccessToAcquireRelease(parentOp))) { + parentOp->emitOpError() << "failed to convert read access operations to " + "acquire-release semaphore stubs"; + return signalPassFailure(); + } + if (failed(writeAccessToAcquireRelease(parentOp))) { + parentOp->emitOpError() << "failed to convert write access operations to " + "acquire-release semaphore stubs"; + return signalPassFailure(); + } + // Erase old access operations. + IRRewriter rewriter(parentOp->getContext()); + parentOp->walk([&](AMDAIE::LogicalObjectFifoAccessOp accessOp) { + if (accessOp->getUses().empty()) { + rewriter.eraseOp(accessOp); + } + }); +} + +} // namespace + +std::unique_ptr createAMDAIEAccessToAcquireReleasePass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConsumeProduceToAcquireRelease.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConsumeProduceToAcquireRelease.cpp deleted file mode 100644 index cd3c014f4..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConsumeProduceToAcquireRelease.cpp +++ /dev/null @@ -1,139 +0,0 @@ -// Copyright 2024 The IREE Authors -// -// Licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -#include "iree-amd-aie/IR/AMDAIEOps.h" -#include "iree-amd-aie/Transforms/Passes.h" -#include "mlir/IR/IRMapping.h" -#include "mlir/IR/Iterators.h" - -#define DEBUG_TYPE "iree-amdaie-produce-consume-to-acquire-release" - -namespace mlir::iree_compiler::AMDAIE { - -namespace { - -/// Utility to find the parent operation of the provided operation within the -/// same block as the provided one if it exists. 
-Operation *getParentOpInBlock(Block *block, Operation *op) { - if (!op || op->getBlock() == block) return op; - auto parentOp = op->getParentOp(); - return getParentOpInBlock(block, parentOp); -} - -/// Walk all consume/produce operations within the core operations and insert -/// semaphore operations. -template -LogicalResult consumeProduceToAcquireRelease(Operation *parentOp) { - using IteratorType = std::conditional_t< - std::is_same::value, - ForwardIterator, ReverseIterator>; - using SemaphoreTypeAtOp = std::conditional_t< - std::is_same::value, - AMDAIE::LogicalObjectFifoAcquire, AMDAIE::LogicalObjectFifoRelease>; - using SemaphoreTypeAtOtherEndOfBlock = std::conditional_t< - std::is_same::value, - AMDAIE::LogicalObjectFifoRelease, AMDAIE::LogicalObjectFifoAcquire>; - - IRRewriter rewriter(parentOp->getContext()); - auto walkResult = parentOp->walk([&](AMDAIE::CoreOp coreOp) { - IRMapping mapper; - coreOp->walk([&](OpTy op) { - rewriter.setInsertionPoint(op); - rewriter.create(rewriter.getUnknownLoc(), op.getDma(), - op.getPort()); - - // Retrieve the DMA operation for this consume/produce and check whether - // it was encountered before. Add it to the map and advance if not. - Operation *dmaOp = op.getDma().getDefiningOp(); - if (!mapper.contains(dmaOp)) { - mapper.map(dmaOp, op.getOperation()); - return WalkResult::advance(); - } - - // Find the new consume/produce operation's parent operation within the - // same block as the previous operation of the same type and operating on - // the same DMA. Use this parent operation in the same block to set the - // insertion point either before or after depending on whether the - // iteration is happening in forward or backward fashion. 
- auto parentOpInBlock = - getParentOpInBlock(mapper.lookup(dmaOp)->getBlock(), op); - if (parentOpInBlock) { - if (std::is_same::value) { - rewriter.setInsertionPoint(parentOpInBlock); - } else { - rewriter.setInsertionPointAfter(parentOpInBlock); - } - } else { - if (std::is_same::value) { - rewriter.setInsertionPoint( - mapper.lookup(dmaOp)->getBlock()->getTerminator()); - } else { - rewriter.setInsertionPointToStart(mapper.lookup(dmaOp)->getBlock()); - } - } - - // Insert the other semaphore operation and erase the produce/consume - // operation. - rewriter.create( - rewriter.getUnknownLoc(), op.getDma(), op.getPort()); - rewriter.eraseOp(mapper.lookup(dmaOp)); - mapper.map(dmaOp, op.getOperation()); - return WalkResult::advance(); - }); - - // Add `SemaphoreTypeAtOtherEndOfBlock` operations for remaining - // consume/produce operations at the other end of the blocks. - for (auto &&[keyOp, valueOp] : mapper.getOperationMap()) { - auto produceConsumeOp = dyn_cast(valueOp); - if (std::is_same::value) { - rewriter.setInsertionPoint( - produceConsumeOp->getBlock()->getTerminator()); - } else { - rewriter.setInsertionPointToStart(produceConsumeOp->getBlock()); - } - rewriter.create( - rewriter.getUnknownLoc(), produceConsumeOp.getDma(), - produceConsumeOp.getPort()); - rewriter.eraseOp(produceConsumeOp); - } - return WalkResult::advance(); - }); - if (walkResult.wasInterrupted()) return failure(); - return success(); -} - -class AMDAIEConsumeProduceToAcquireReleasePass - : public impl::AMDAIEConsumeProduceToAcquireReleaseBase< - AMDAIEConsumeProduceToAcquireReleasePass> { - public: - void getDependentDialects(DialectRegistry ®istry) const override { - registry.insert(); - } - - AMDAIEConsumeProduceToAcquireReleasePass() = default; - AMDAIEConsumeProduceToAcquireReleasePass( - const AMDAIEConsumeProduceToAcquireReleasePass &pass){}; - void runOnOperation() override; -}; - -void AMDAIEConsumeProduceToAcquireReleasePass::runOnOperation() { - if 
(failed(consumeProduceToAcquireRelease( - getOperation()))) { - return signalPassFailure(); - } - if (failed(consumeProduceToAcquireRelease( - getOperation()))) { - return signalPassFailure(); - } -} - -} // namespace - -std::unique_ptr createAMDAIEConsumeProduceToAcquireReleasePass() { - return std::make_unique(); -} - -} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 84c0da3e8..845f93edd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -480,12 +480,10 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { std::tuple value = memrefToLogicalObjectFifo[operand.get()]; - rewriter.create( + auto accessOp = rewriter.create( rewriter.getUnknownLoc(), std::get<0>(value), std::get<1>(value)); - // TODO(jornt): Temporary, enable after access operations are used - // for inserting synchronization stubs instead of consume/produce. - // linalgOp->setOperand(idx, accessOp); + linalgOp->setOperand(idx, accessOp); } else if (auto type = llvm::dyn_cast(operand.get().getType())) { Value memref = operand.get(); @@ -495,12 +493,10 @@ LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), memref); rewriter.setInsertionPointToStart(coreOp.getBody()); - rewriter.create( + auto accessOp = rewriter.create( rewriter.getUnknownLoc(), logicalObjectFifo, AMDAIE::MemoryAccess::None); - // TODO(jornt): Temporary, enable after access operations are used - // for inserting synchronization stubs instead of consume/produce. 
- // linalgOp->setOperand(idx, accessOp); + linalgOp->setOperand(idx, accessOp); } } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 64f0f39d8..a9aecb8e7 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -11,6 +11,8 @@ // //===----------------------------------------------------------------------===// +#include + #include "aie/Dialect/AIE/IR/AIEDialect.h" #include "aie/Dialect/AIEX/IR/AIEXDialect.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" @@ -89,26 +91,25 @@ AIE::ObjectFifoCreateOp createObjectFifo(IRRewriter &rewriter, // TODO(jornt): I think objectfifos should support source type != dest type. MemRefType srcType = cast(dmaOp.getSourceType()).getElementType(); - Attribute srcMemSpace = srcType.getMemorySpace(); - int64_t srcMemSpaceInt = - srcMemSpace ? dyn_cast(srcMemSpace).getInt() : 0; MemRefType dstType = cast(dmaOp.getTargetType()).getElementType(); - Attribute dstMemSpace = dstType.getMemorySpace(); - int64_t dstMemSpaceInt = - dstMemSpace ? 
dyn_cast(dstMemSpace).getInt() : 0; - AIE::AIEObjectFifoType dtype; - if (srcMemSpaceInt == 1) { - // Source on L2 - dtype = AIE::AIEObjectFifoType::get(srcType); - } else if (dstMemSpaceInt == 1) { - // Destination on L2 - dtype = AIE::AIEObjectFifoType::get(dstType); - } else if (srcMemSpaceInt < dstMemSpaceInt) { - dtype = AIE::AIEObjectFifoType::get(dstType); - } else { - dtype = AIE::AIEObjectFifoType::get(srcType); - } + ArrayRef sourceShape = srcType.getShape(); + ArrayRef targetShape = dstType.getShape(); + int64_t sourceSize = std::accumulate(sourceShape.begin(), sourceShape.end(), + 1, std::multiplies<>()); + int64_t targetSize = std::accumulate(targetShape.begin(), targetShape.end(), + 1, std::multiplies<>()); + // TODO(jornt) for now, memory space 1 is used for objectfifos. Maybe refactor + // `aie.objectfifo` in the future to support different memory spaces. + MemRefType memrefType = + sourceSize < targetSize + ? MemRefType::get({sourceSize}, srcType.getElementType(), + MemRefLayoutAttrInterface{}, + rewriter.getI64IntegerAttr(1)) + : MemRefType::get({targetSize}, dstType.getElementType(), + MemRefLayoutAttrInterface{}, + rewriter.getI64IntegerAttr(1)); + AIE::AIEObjectFifoType dtype = AIE::AIEObjectFifoType::get(memrefType); auto depthInBytes = srcType.getElementTypeBitWidth() / 8; auto fifo = rewriter.create( rewriter.getUnknownLoc(), symName, srcTile, dstTiles, @@ -117,6 +118,43 @@ AIE::ObjectFifoCreateOp createObjectFifo(IRRewriter &rewriter, return fifo; } +/// Convert `amdaie.logicalobjectfifo.access` to `aie.objectfifo.subview.access` +/// + `memref.reinterpret_cast` to bridge the gap between the objectFifo type +/// and the type assumed by computational operations. 
+LogicalResult accessOpToAIE(IRRewriter &rewriter, + AMDAIE::LogicalObjectFifoAccessOp accessOp, + IRMapping &mapper, + SmallVector &toBeErased) { + LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAccessOp]\n"); + OpBuilder::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(accessOp); + if (!mapper.contains(accessOp.getInput())) { + return accessOp.emitError() + << "this access operation's input has not been mapped"; + } + auto subviewOp = dyn_cast( + mapper.lookup(accessOp.getInput()).getDefiningOp()); + if (!subviewOp) { + return accessOp.emitError() + << "access doesn't operate on an input that has been mapped to an " + "`aie.objectfifo.acquire` + subview operation"; + } + auto type = cast(accessOp.getOutput().getType()); + // TODO(jornt): for now, memory space 1 is used for objectFifos. Refactor + // `aie.objectfifo` to support different memory spaces to avoid hardcoding. + MemRefType newType = + MemRefType::Builder(type).setMemorySpace(rewriter.getI64IntegerAttr(1)); + llvm::ArrayRef sizes = newType.getShape(); + auto [strides, baseOffset] = getStridesAndOffset(newType); + auto reinterpretOp = rewriter.create( + rewriter.getUnknownLoc(), newType, subviewOp.getOutput(), baseOffset, + sizes, strides); + mapper.map(accessOp.getOperation(), reinterpretOp.getOperation()); + mapper.map(accessOp.getResult(), reinterpretOp.getResult()); + toBeErased.push_back(accessOp); + return success(); +} + /// Convert `amdaie.logicalobjectfifo.acquire` to `aie.objectfifo.acquire`. /// There are some additional operations being added as well to bridge the gap /// to AIE: @@ -127,7 +165,8 @@ AIE::ObjectFifoCreateOp createObjectFifo(IRRewriter &rewriter, /// into objectfifos can have a different source and target type. 
LogicalResult acquireOpToAIE(IRRewriter &rewriter, AMDAIE::LogicalObjectFifoAcquire acquireOp, - IRMapping &mapper, IRMapping &localMemrefMapper) { + IRMapping &mapper, + SmallVector &toBeErased) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoAcquire]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(acquireOp); @@ -137,14 +176,6 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter, return dmaOp.emitError() << "acquire doesn't operate on a `amdaie.circular_dma_cpy_nd`"; } - MemRefType srcType = - cast(dmaOp.getSourceType()).getElementType(); - MemRefType newSrcType = MemRefType::Builder(srcType).setMemorySpace( - rewriter.getI64IntegerAttr(1)); - MemRefType dstType = - cast(dmaOp.getTargetType()).getElementType(); - MemRefType newDstType = MemRefType::Builder(dstType).setMemorySpace( - rewriter.getI64IntegerAttr(1)); auto objFifo = dyn_cast(mapper.lookup(dmaOp.getOperation())); @@ -163,42 +194,22 @@ LogicalResult acquireOpToAIE(IRRewriter &rewriter, : AIE::ObjectFifoPort::Consume; auto objFifoAquireOp = rewriter.create( rewriter.getUnknownLoc(), subviewType, port, objFifo.getName(), 1); - auto subview = rewriter.create( + auto subviewOp = rewriter.create( rewriter.getUnknownLoc(), elementType, objFifoAquireOp.getSubview(), rewriter.getIntegerAttr(rewriter.getI32Type(), 0)); - - Attribute dstMemSpace = dstType.getMemorySpace(); - if (!dstMemSpace) { - dmaOp.emitError("no memspace for dma op used in CoreOp is not supported"); - return failure(); - } - int64_t memSpaceInt = dyn_cast(dstMemSpace).getInt(); - // Use target logical objectfifo for L2 to L1 and source for L1 to L2. - Value memref = memSpaceInt == 2 ? dmaOp.getTargetObjectFifo().getMemref() - : dmaOp.getSourceObjectFifo().getMemref(); - MemRefType type = memSpaceInt == 2 ? 
newDstType : newSrcType; - llvm::ArrayRef sizes = type.getShape(); - auto [strides, baseOffset] = getStridesAndOffset(type); - auto reinterpretOp = rewriter.create( - rewriter.getUnknownLoc(), type, subview.getOutput(), baseOffset, sizes, - strides); - localMemrefMapper.map(memref, reinterpretOp.getResult()); - rewriter.eraseOp(acquireOp); + // Map acquire op to new acquire + subview op. + mapper.map(acquireOp.getOperation(), subviewOp.getOperation()); + mapper.map(acquireOp.getResult(), subviewOp.getOutput()); + toBeErased.push_back(acquireOp); return success(); } LogicalResult coreLinalgOpToAIE(IRRewriter &rewriter, linalg::LinalgOp linalgOp, IRMapping &mapper, - IRMapping &localMemrefMapper) { + SmallVector &toBeErased) { LLVM_DEBUG(llvm::dbgs() << "Convert [linalg.LinalgOp]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(linalgOp); - for (int i = 0; i < linalgOp->getNumOperands(); ++i) { - Value operand = linalgOp->getOperand(i); - if (localMemrefMapper.contains(operand)) { - linalgOp->setOperand(i, localMemrefMapper.lookup(operand)); - } - } rewriter.clone(*(linalgOp.getOperation()), mapper); rewriter.eraseOp(linalgOp); return success(); @@ -207,7 +218,7 @@ LogicalResult coreLinalgOpToAIE(IRRewriter &rewriter, linalg::LinalgOp linalgOp, LogicalResult coreReleaseOpToAIE(IRRewriter &rewriter, AMDAIE::LogicalObjectFifoRelease releaseOp, IRMapping &mapper, - IRMapping &localMemrefMapper) { + SmallVector &toBeErased) { LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::LogicalObjectFifoRelease]\n"); OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(releaseOp); @@ -257,30 +268,32 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, rewriter.setInsertionPointToEnd(aieCoreBlock); rewriter.create(rewriter.getUnknownLoc()); - // Mapper used to remap local memrefs if needed. This happens as we're - // inserting `ReinterpretCastOp` operations in the acquire operation to AIE - // conversion. 
- IRMapping localMemrefMapper; + SmallVector toBeErased; auto walkResult = aieCoreOp.walk([&](Operation *op) { rewriter.setInsertionPoint(op); if (TypeSwitch(op) .Case([&](auto accessOp) { - // TODO(jornt): Temporary until access operations are used for - // inserting synchronization stubs instead of consume/produce. - rewriter.eraseOp(accessOp); - return success(); + return accessOpToAIE(rewriter, accessOp, mapper, toBeErased); }) .Case([&](auto acquireOp) { - return acquireOpToAIE(rewriter, acquireOp, mapper, - localMemrefMapper); + return acquireOpToAIE(rewriter, acquireOp, mapper, toBeErased); + }) + .Case([&](auto consumeOp) { + // TODO(jornt): get rid of LogicalObjectFifoConsume before this + rewriter.eraseOp(consumeOp); + return success(); + }) + .Case([&](auto produceOp) { + // TODO(jornt): get rid of LogicalObjectFifoProduce before this + rewriter.eraseOp(produceOp); + return success(); }) .Case([&](auto releaseOp) { return coreReleaseOpToAIE(rewriter, releaseOp, mapper, - localMemrefMapper); + toBeErased); }) .Case([&](auto linalgOp) { - return coreLinalgOpToAIE(rewriter, linalgOp, mapper, - localMemrefMapper); + return coreLinalgOpToAIE(rewriter, linalgOp, mapper, toBeErased); }) .Default([&](Operation *op) { remapOperands(op, mapper); @@ -295,6 +308,10 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, coreOp.emitError("could not convert to AIEDialect ops"); return failure(); } + for (auto *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } mapper.map(coreOp.getResult(), aieCoreOp.getResult()); mapper.map(coreOp.getOperation(), aieCoreOp.getOperation()); @@ -663,7 +680,7 @@ LogicalResult lowerToAIE(ModuleOp moduleOp) { auto deviceOp = rewriter.create( rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), - xilinx::AIE::AIEDevice::npu1)); + xilinx::AIE::AIEDevice::npu1_4col)); deviceOp.getRegion().emplaceBlock(); Block *deviceBlock = &deviceOp.getRegion().front(); @@ -683,7 +700,8 @@ 
LogicalResult lowerToAIE(ModuleOp moduleOp) { FunctionType funcType = rewriter.getFunctionType(inputTypes, TypeRange{}); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); auto ipuFuncOp = rewriter.create( - rewriter.getUnknownLoc(), rewriter.getStringAttr("sequence"), funcType); + rewriter.getUnknownLoc(), rewriter.getStringAttr(funcOp.getSymName()), + funcType); ipuFuncOp.setPublic(); rewriter.setInsertionPointToStart(ipuFuncOp.addEntryBlock()); rewriter.create(rewriter.getUnknownLoc()); @@ -813,12 +831,15 @@ void AMDAIELowerToAIEPass::runOnOperation() { return WalkResult::advance(); }); if (res.wasInterrupted()) return signalPassFailure(); + LLVM_DEBUG(llvm::dbgs() << "Module after createLogicalObjectFifoLink: " + << getOperation()); // Main function call to convert all operations into AIE dialect operations // inside an AIE device. if (failed(lowerToAIE(getOperation()))) { return signalPassFailure(); } + LLVM_DEBUG(llvm::dbgs() << "Module after lowerToAIE: " << getOperation()); // Clean up the HAL bindings and it's uses as they are not needed anymore. 
if (failed(eraseHALBindings(getOperation()))) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt index f6985fef4..d231215f9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt @@ -43,12 +43,12 @@ iree_cc_library( "AMDAIEUtils.h" "Transforms.h" SRCS + "AMDAIEAccessToAcquireRelease.cpp" "AMDAIEAddLoweringStrategy.cpp" "AMDAIEAIRDmaToAMDAIEDma.cpp" "AMDAIEBufferizeToAllocation.cpp" "AMDAIECanonicalizeDma.cpp" "AMDAIECanonicalizeDoublyStridedOp.cpp" - "AMDAIEConsumeProduceToAcquireRelease.cpp" "AMDAIEControlCodeLoopUnroll.cpp" "AMDAIECreateAIEWorkgroup.cpp" "AMDAIECreateLogicalObjectFifoLink.cpp" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp index 0b9224e9a..3c32bd939 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/KernelDispatch.cpp @@ -206,8 +206,12 @@ FailureOr ParameterSetting::create(linalg::LinalgOp linalgOp, uint32_t K1 = 0; uint32_t K0 = 1; - uint32_t m0Pack = M0; - uint32_t n0Pack = N0; + // Instead of directly packing to (1, 1, M0, N0), the new strategy is making + // the pack size as (2, 2, M0/2, N0/2) to avoid the large allocation in L1. + // Also we should make sure the first level inner pack size is divisible by + // the second level of inner pack size (vector instruction size). + uint32_t m0Pack = (M0 / 2) % m1Pack == 0 ? (M0 / 2) : M0; + uint32_t n0Pack = (N0 / 2) % n1Pack == 0 ? 
(N0 / 2) : N0; uint32_t k0Pack = findLargestFactor(K, maxL1Size); return ParameterSetting{M0, N0, K0, M1, N1, K1, @@ -355,8 +359,7 @@ static LogicalResult setRootConfigForPackPeelPipeline( SmallVector TileSizeLevel0 = {packPeelTiling.getM0(), packPeelTiling.getN0()}; SmallVector TileSizeLevel1 = {0, 0, packPeelTiling.getK0()}; - SmallVector TileSizeLevel2 = { - 0, 0, 0, packPeelTiling.getM1(), packPeelTiling.getN1(), 0}; + SmallVector TileSizeLevel2 = {1, 1, 0, 0, 0, 0}; TileSizesListType tileSizes = {TileSizeLevel0, TileSizeLevel1, TileSizeLevel2}; if (failed(setOpConfigAndEntryPointFnTranslation( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h index 4b9c0f28e..b63f3e455 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h @@ -20,13 +20,13 @@ namespace mlir::iree_compiler::AMDAIE { #define GEN_PASS_DECL +#define GEN_PASS_DEF_AMDAIEACCESSTOACQUIRERELEASE #define GEN_PASS_DEF_AMDAIEAIRDMATOAMDAIEDMA #define GEN_PASS_DEF_AMDAIEBRIDGETOAIR #define GEN_PASS_DEF_AMDAIEBUFFERIZETOALLOCATION #define GEN_PASS_DEF_AMDAIECANONICALIZEDMA #define GEN_PASS_DEF_AMDAIECANONICALIZEDOUBLYSTRIDEDOP #define GEN_PASS_DEF_AMDAIECLEANUP -#define GEN_PASS_DEF_AMDAIECONSUMEPRODUCETOACQUIRERELEASE #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP #define GEN_PASS_DEF_AMDAIECREATELOGICALOBJECTFIFOLINK diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index a1ece0b03..0df1e7a9b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -561,7 +561,13 @@ void addMLIRAIRAIELoweringPasses(OpPassManager &passManager, bool packPeel) { 
passManager.addPass(xilinx::air::createAIRLoweringPass()); { xilinx::air::AffineLoopOptPassOptions options; - const std::vector tile_sizes = {4, 4}; + // The pack-peel pipeline tiles by 2x2; all other pipelines keep 4x4. + std::vector tile_sizes; + if (packPeel) { + tile_sizes = {2, 2}; + } else { + tile_sizes = {4, 4}; + } options.clTileSizes = ArrayRef(tile_sizes); passManager.addNestedPass( xilinx::air::createAffineLoopOptPass(options)); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 5e7b59e48..4df281930 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -33,6 +33,10 @@ void addPadPackBasedPassPipeline(OpPassManager &passManager, /// Populates passes needed to link HAL executables across AIE targets. void buildAMDAIELinkingPassPipeline(OpPassManager &passManager); +/// Pass to convert logical objectFifo access operations to acquire/release +/// semaphore operations. +std::unique_ptr createAMDAIEAccessToAcquireReleasePass(); + /// Create a pass to convert AIR DMA ops into AMDAIE DMA ops operating on /// logical objectFifos. std::unique_ptr createAMDAIEAIRDmaAMDAIEDmaPass(); @@ -52,10 +56,6 @@ std::unique_ptr createAMDAIECanonicalizeDmaPass(); /// Create pass to canonicalize doubly strided operations. std::unique_ptr createAMDAIECanonicalizeDoublyStridedOpPass(); -/// Pass to convert logical objectFifo consume/produce operation to -/// acquire/release semaphore operations. -std::unique_ptr createAMDAIEConsumeProduceToAcquireReleasePass(); - /// Pass to unroll the loops within the control code regions.
std::unique_ptr createAMDAIEControlCodeLoopUnrollPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index c900f20d9..dc48ceba4 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -10,6 +10,13 @@ include "iree-amd-aie/IR/AMDAIEDialect.td" include "mlir/Pass/PassBase.td" +def AMDAIEAccessToAcquireRelease : + Pass<"iree-amdaie-access-to-acquire-release", ""> { + let summary = "Convert logical objectFifo access operations to acquire/release " + "semaphore operations."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEAccessToAcquireReleasePass()"; +} + def AMDAIEAIRDmaToAMDAIEDma : Pass<"iree-amdaie-air-dma-to-amdaie-dma", ""> { let summary = "Convert AIR DMA ops into AMDAIE DMA ops operating on logical objectFifos"; @@ -67,13 +74,6 @@ def AMDAIECleanup : "mlir::iree_compiler::AMDAIE::createAMDAIECleanupPass()"; } -def AMDAIEConsumeProduceToAcquireRelease : - Pass<"iree-amdaie-consume-produce-to-acquire-release", ""> { - let summary = "Convert logical objectFifo consume/produce operation to " - "acquire/release semaphore operations."; - let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEConsumeProduceToAcquireReleasePass()"; -} - def AMDAIEControlCodeLoopUnroll : Pass<"iree-amdaie-controlcode-loop-unroll", ""> { let summary = "Unroll the loops in the control code regions."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 1944b479a..f9bad1e9e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -8,13 +8,13 @@ iree_lit_test_suite( NAME lit SRCS + "access_to_acquire_release.mlir" "aie_link_executables.mlir" 
"air_dma_to_amdaie_dma.mlir" "bridge_to_air.mlir" "bufferize_to_allocation.mlir" "canonicalize_dma.mlir" "canonicalize_doubly_strided_op.mlir" - "consume_produce_to_acquire_release.mlir" "controlcode_loop_unrolling.mlir" "create_aie_workgroup.mlir" "create_logical_objectfifo_link.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir new file mode 100644 index 000000000..fdcd9fcd2 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/access_to_acquire_release.mlir @@ -0,0 +1,167 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-access-to-acquire-release))" --split-input-file %s | FileCheck %s + +// CHECK-LABEL: @read_access +// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.core +// CHECK: %[[ACQUIRE:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA]], Consume) +// CHECK: %[[ACCESS:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE]], Read) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS]] +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA]], Consume) +func.func @read_access(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile) { + %3 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @write_access +// CHECK: %[[DMA:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.core +// CHECK: %[[ACQUIRE:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA]], Produce) +// CHECK: 
%[[ACCESS:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE]], Write) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS]] +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA]], Produce) +func.func @write_access(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %tile = amdaie.tile(%c0, %c0) + %2 = amdaie.circular_dma_cpy_nd(%arg1[] [] [], %arg0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile) { + %3 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) + amdaie.logicalobjectfifo.produce(%2) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @none_access +// CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> +// CHECK: amdaie.core +// CHECK: %[[ACCESS:.+]] = amdaie.logicalobjectfifo.access(%[[ARG0]], None) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS]] +func.func @none_access(%arg0: !amdaie.logicalobjectfifo>) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %core = amdaie.core(%tile) { + %3 = amdaie.logicalobjectfifo.access(%arg0, None) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @any_access +// CHECK-SAME: %[[ARG0:.+]]: !amdaie.logicalobjectfifo> +// CHECK: amdaie.core +// CHECK: %[[ACCESS:.+]] = amdaie.logicalobjectfifo.access(%[[ARG0]], Any) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS]] +func.func @any_access(%arg0: !amdaie.logicalobjectfifo>) { + %c0_i32 = arith.constant 0 : i32 + %c0 = arith.constant 0 : index + %tile = amdaie.tile(%c0, %c0) + %core = amdaie.core(%tile) { + %3 = amdaie.logicalobjectfifo.access(%arg0, Any) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 
2> + linalg.fill ins(%c0_i32 : i32) outs(%3 : memref<1x1x8x16xi32, 2>) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @read_and_write +// CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: %[[DMA1:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.core +// CHECK: %[[ACQUIRE_0:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA1]], Produce) +// CHECK: %[[ACCESS_0:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_0]], Write) +// CHECK: %[[ACQUIRE_1:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA0]], Consume) +// CHECK: %[[ACCESS_1:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_1]], Read) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_1]] +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_0]] +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA0]], Consume) +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA1]], Produce) +func.func @read_and_write(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %tile = amdaie.tile(%c0, %c0) + %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile) { + %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + %5 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) + amdaie.logicalobjectfifo.produce(%3) + amdaie.end + } + return +} + +// ----- + +// CHECK-LABEL: @read_write_multiple_blocks +// CHECK: %[[DMA0:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: 
%[[DMA1:.+]] = amdaie.circular_dma_cpy_nd +// CHECK: amdaie.core +// CHECK: %[[ACQUIRE_0:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA1]], Produce) +// CHECK: %[[ACCESS_0:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_0]], Write) +// CHECK: %[[ACQUIRE_1:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA0]], Consume) +// CHECK: %[[ACCESS_1:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_1]], Read) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_1]] +// CHECK: scf.for +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA0]], Consume) +// CHECK: %[[ACQUIRE_1:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA0]], Consume) +// CHECK: %[[ACCESS_1:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_1]], Read) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_1]] +// CHECK: } +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA0]], Consume) +// CHECK: %[[ACQUIRE_1:.+]] = amdaie.logicalobjectfifo.acquire(%[[DMA0]], Consume) +// CHECK: %[[ACCESS_1:.+]] = amdaie.logicalobjectfifo.access(%[[ACQUIRE_1]], Read) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_1]] +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[ACCESS_0]] +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA0]], Consume) +// CHECK: amdaie.logicalobjectfifo.release(%[[DMA1]], Produce) +func.func @read_write_multiple_blocks(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %tile = amdaie.tile(%c0, %c0) + %2 = amdaie.circular_dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %3 = amdaie.circular_dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %core = amdaie.core(%tile) { + %4 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 
2> + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%4 : memref<1x1x8x16xi32, 2>) + scf.for %arg = %c0 to %c8 step %c1 { + %5 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%5 : memref<1x1x8x16xi32, 2>) + } + %6 = amdaie.logicalobjectfifo.access(%arg0, Read) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + %7 = amdaie.logicalobjectfifo.access(%arg2, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2> + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%6 : memref<1x1x8x16xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%7 : memref<1x1x8x16xi32, 2>) + amdaie.logicalobjectfifo.produce(%3) + amdaie.end + } + return +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/consume_produce_to_acquire_release.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/consume_produce_to_acquire_release.mlir deleted file mode 100644 index 2a3877156..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/consume_produce_to_acquire_release.mlir +++ /dev/null @@ -1,125 +0,0 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-consume-produce-to-acquire-release))" --split-input-file %s | FileCheck %s - -// CHECK-LABEL: @consume -// CHECK: %[[DMA:.+]] = amdaie.dma_cpy_nd -// CHECK: amdaie.core -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA]] -// CHECK-SAME: Consume -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA]] -// CHECK-SAME: Consume -func.func @consume(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { - %c0 = arith.constant 0 : index - %tile = amdaie.tile(%c0, %c0) - %2 = amdaie.dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { - 
amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - return -} - -// ----- - -// CHECK-LABEL: @produce -// CHECK: %[[DMA:.+]] = amdaie.dma_cpy_nd -// CHECK: amdaie.core -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA]] -// CHECK-SAME: Produce -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA]] -// CHECK-SAME: Produce -func.func @produce(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { - %c0 = arith.constant 0 : index - %tile = amdaie.tile(%c0, %c0) - %2 = amdaie.dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.produce(%2) - amdaie.end - } - return -} - -// ----- - -// CHECK-LABEL: @consume_and_produce -// CHECK: %[[DMA0:.+]] = amdaie.dma_cpy_nd -// CHECK: %[[DMA1:.+]] = amdaie.dma_cpy_nd -// CHECK: amdaie.core -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA1]] -// CHECK-SAME: Produce -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA1]] -// CHECK-SAME: Produce -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -func.func @consume_and_produce(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { - %c0 = arith.constant 0 : index - %tile = amdaie.tile(%c0, %c0) - %2 = amdaie.dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %3 = amdaie.dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.logicalobjectfifo.produce(%3) - amdaie.end - } - return -} - -// ----- - -// CHECK-LABEL: @consume_and_produce_multiple_blocks -// CHECK: %[[DMA0:.+]] = amdaie.dma_cpy_nd -// CHECK: 
%[[DMA1:.+]] = amdaie.dma_cpy_nd -// CHECK: amdaie.core -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA1]] -// CHECK-SAME: Produce -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: scf.for -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: } -// CHECK: amdaie.logicalobjectfifo.acquire -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA1]] -// CHECK-SAME: Produce -// CHECK: amdaie.logicalobjectfifo.release -// CHECK-SAME: %[[DMA0]] -// CHECK-SAME: Consume -func.func @consume_and_produce_multiple_blocks(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>, %arg2: !amdaie.logicalobjectfifo>, %arg3: !amdaie.logicalobjectfifo>) { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c8 = arith.constant 8 : index - %tile = amdaie.tile(%c0, %c0) - %2 = amdaie.dma_cpy_nd(%arg0[] [] [], %arg1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %3 = amdaie.dma_cpy_nd(%arg3[] [] [], %arg2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - scf.for %arg = %c0 to %c8 step %c1 { - amdaie.logicalobjectfifo.consume(%2) - } - amdaie.logicalobjectfifo.consume(%2) - amdaie.logicalobjectfifo.produce(%3) - amdaie.end - } - return -} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir index 3f2c9d123..d655b4c87 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir 
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir @@ -90,13 +90,13 @@ module { // CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK: linalg.fill ins(%{{.+}} : i32) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] // CHECK-SAME: %[[FROM_MEMREF_0]] // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) // CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) // CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK: linalg.fill ins(%{{.+}} : i32) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) module { func.func @unroll_dma() { %c0_i32 = arith.constant 0 : i32 @@ -145,11 +145,11 @@ module { // CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) // CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) // CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK: linalg.fill ins(%{{.+}} : i32) outs +// CHECK: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) module { func.func @hoist_dma_single_loop() { %c0_i32 = arith.constant 0 : i32 @@ -454,19 +454,19 @@ module { // CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// 
CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x64xi32, 2>) module { func.func @hoist_dma_dependencies() { %c0_i32 = arith.constant 0 : i32 @@ -536,30 +536,30 @@ module { // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]] // CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], 
Write) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]][] [] [], %[[FROM_MEMREF_1]] // CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_6]] // CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]] // CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) // CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write) // CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) // CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs -// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_1]] : memref<32x64xi32, 2>) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs(%[[VAL_0]] : memref<32x32xi32, 2>) // CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], 
%[[FROM_MEMREF_8]] module { func.func @nested_dma_dependencies() { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index ed87455f0..25767d079 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -8,7 +8,7 @@ module { // CHECK: module // CHECK: aie.device -// CHECK: func.func @sequence +// CHECK: func.func @empty_func module { func.func @empty_func() { return @@ -19,7 +19,7 @@ module { // CHECK: module // CHECK: aie.device -// CHECK: func.func @sequence +// CHECK: func.func @workgroup module { func.func @workgroup() { amdaie.workgroup { @@ -33,7 +33,7 @@ module { // ----- -// CHECK: func.func @sequence +// CHECK: func.func @hal_bindings // CHECK-SAME: %{{.+}}: memref<32x1024xi32> // CHECK-SAME: %{{.+}}: memref<1024x64xi32> // CHECK-SAME: %{{.+}}: memref<32x64xi32> @@ -66,7 +66,7 @@ module { // CHECK-NEXT: aie.objectfifo.link // CHECK-SAME: @[[OBJ0]] // CHECK-SAME: @[[OBJ1]] -// CHECK: func.func @sequence +// CHECK: func.func @circular_dma_cpy_nd_and_link module { func.func @circular_dma_cpy_nd_and_link() { amdaie.workgroup { @@ -110,7 +110,7 @@ module { // CHECK-NEXT: aie.objectfifo.link // CHECK-SAME: @[[OBJ0]] // CHECK-SAME: @[[OBJ1]] -// CHECK: func.func @sequence +// CHECK: func.func @circular_dma_cpy_sizes_and_strides module { func.func @circular_dma_cpy_sizes_and_strides() { amdaie.workgroup { @@ -155,12 +155,14 @@ module { // CHECK: aie.core(%[[TILE_0_2]]) // CHECK: %[[ACQUIRE:.+]] = aie.objectfifo.acquire // CHECK-SAME: Produce -// CHECK: aie.objectfifo.subview.access -// CHECK-SAME: %[[ACQUIRE]] -// CHECK: func.func @sequence +// CHECK: %[[ACCESS:.+]] = aie.objectfifo.subview.access %[[ACQUIRE]] +// CHECK: %[[REINTERPRET:.+]] = memref.reinterpret_cast %[[ACCESS]] +// CHECK: linalg.fill ins(%{{.+}} : i32) 
outs(%[[REINTERPRET]] : memref<32x32xi32, 1>) +// CHECK: func.func @tile_and_core_and_acquire module { func.func @tile_and_core_and_acquire() { amdaie.workgroup { + %c0_i32 = arith.constant 0 : i32 %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c2 = arith.constant 2 : index @@ -176,7 +178,9 @@ module { %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj2[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj0[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_0 = amdaie.core(%tile_0_2) { - amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} + %0 = amdaie.logicalobjectfifo.acquire(%dma0, Produce) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.access(%0, Write) : !amdaie.logicalobjectfifo> -> memref<32x32xi32, 1> + linalg.fill ins(%c0_i32 : i32) outs(%1 : memref<32x32xi32, 1>) amdaie.end } memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> @@ -210,7 +214,7 @@ module { // CHECK-SAME: Consume // CHECK: aie.objectfifo.subview.access // CHECK-SAME: %[[ACQUIRE_1]] -// CHECK: func.func @sequence +// CHECK: func.func @tile_and_core_and_acquire_broadcast module { func.func @tile_and_core_and_acquire_broadcast() { amdaie.workgroup { @@ -230,11 +234,11 @@ module { %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_2 = amdaie.core(%tile_0_2) { - amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} + %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> amdaie.end } %core_1_2 = amdaie.core(%tile_1_2) { - amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} + %0 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> 
!amdaie.logicalobjectfifo> amdaie.end } memref.dealloc %alloc_2 : memref<4x8x4x8xi32, 2> @@ -259,7 +263,7 @@ module { // CHECK-DAG: %{{.+}} = aie.tile(0, 0) // CHECK: aie.core(%[[TILE_0_2]]) // CHECK: aie.objectfifo.release -// CHECK: func.func @sequence +// CHECK: func.func @tile_and_core_and_release module { func.func @tile_and_core_and_release() { amdaie.workgroup { @@ -303,7 +307,7 @@ module { // CHECK-NEXT: aie.objectfifo.link // CHECK-SAME: @[[OBJ0]] // CHECK-SAME: @[[OBJ1]] -// CHECK: func.func @sequence +// CHECK: func.func @controlcode // CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> // CHECK: aiex.npu.dma_memcpy_nd // CHECK-SAME: %[[ARG0]] @@ -362,13 +366,9 @@ module { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK: %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire -// CHECK-SAME: @[[OBJ1]] -// CHECK-SAME: Consume -// CHECK: %[[ACCESS_0:.+]] = aie.objectfifo.subview.access -// CHECK-SAME: %[[ACQUIRE_0]] -// CHECK: %[[REINTERPRET_0:.+]] = memref.reinterpret_cast -// CHECK-SAME: %[[ACCESS_0]] +// CHECK: %[[ACQUIRE_0:.+]] = aie.objectfifo.acquire @[[OBJ1]](Consume, 1) +// CHECK: %[[ACCESS_0:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_0]] +// CHECK: %[[REINTERPRET_0:.+]] = memref.reinterpret_cast %[[ACCESS_0]] // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]] // CHECK: linalg.fill // CHECK-SAME: %[[REINTERPRET_0]] @@ -379,20 +379,16 @@ module { // CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index // CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index -// CHECK: %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire -// CHECK-SAME: @[[OBJ1]] -// CHECK-SAME: Consume -// CHECK: %[[ACCESS_1:.+]] = aie.objectfifo.subview.access -// CHECK-SAME: %[[ACQUIRE_1]] -// CHECK: %[[REINTERPRET_1:.+]] = memref.reinterpret_cast -// CHECK-SAME: %[[ACCESS_1]] +// CHECK: %[[ACQUIRE_1:.+]] = aie.objectfifo.acquire 
@[[OBJ1]](Consume, 1) +// CHECK: %[[ACCESS_1:.+]] = aie.objectfifo.subview.access %[[ACQUIRE_1]] +// CHECK: %[[REINTERPRET_1:.+]] = memref.reinterpret_cast %[[ACCESS_1]] // CHECK: scf.for %{{.+}} = %[[C0]] to %[[C8]] step %[[C1]] // CHECK: linalg.fill // CHECK-SAME: %[[REINTERPRET_1]] // CHECK: } // CHECK: aie.objectfifo.release // CHECK-SAME: @[[OBJ1]] -// CHECK: func.func @sequence +// CHECK: func.func @large_example // CHECK-SAME: %[[ARG0:.+]]: memref<32x64xi32> // CHECK: aiex.npu.dma_memcpy_nd // CHECK-SAME: %[[ARG0]] @@ -417,27 +413,29 @@ module { %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) %tile_1_2 = amdaie.tile(%c1, %c2) - %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> + %0 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) alignment(64) offset(%c0) : memref<32x64xi32> + memref.assume_alignment %0, 64 : memref<32x64xi32> %alloc_1 = memref.alloc() : memref<32x32xi32, 1> %alloc_2 = memref.alloc() : memref<4x8x4x8xi32, 2> - %obj0 = amdaie.logicalobjectfifo.from_memref %2, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %obj0 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> %obj1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> %obj2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_2, %tile_1_2} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> %dma0 = amdaie.circular_dma_cpy_nd(%obj1[] [] [], %obj0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %dma1 = amdaie.circular_dma_cpy_nd(%obj2[] [] [], %obj1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) %core_0_2 = amdaie.core(%tile_0_2) { - amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} + %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) 
{size = 1 : i32} -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<4x8x4x8xi32, 2> scf.for %arg2 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<4x8x4x8xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%2 : memref<4x8x4x8xi32, 2>) } amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32} amdaie.end } %core_1_2 = amdaie.core(%tile_1_2) { - amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} + %1 = amdaie.logicalobjectfifo.acquire(%dma1, Consume) {size = 1 : i32} -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.access(%1, Read) : !amdaie.logicalobjectfifo> -> memref<4x8x4x8xi32, 2> scf.for %arg2 = %c0 to %c8 step %c1 { - linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<4x8x4x8xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%2: memref<4x8x4x8xi32, 2>) } amdaie.logicalobjectfifo.release(%dma1, Consume) {size = 1 : i32} amdaie.end diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy.mlir index 5d17853f8..49b7f5657 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lowering_strategy.mlir @@ -154,8 +154,8 @@ builtin.module { // ----- -// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config +// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config builtin.module { func.func @matmul_large_dispatch_0_matmul_2048x2048x2048_i8_i32() { %c0_i32 = arith.constant 0 : i32 @@ -176,8 +176,8 @@ builtin.module { // ----- -// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = 
#amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config +// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config module { func.func @matmul_large_dispatch_0_matmul_308x2432x9728_bf16() { %cst = arith.constant 0.000000e+00 : bf16 @@ -199,8 +199,8 @@ module { // CHECK-PAD-PACK{LITERAL}: #config = #iree_codegen.lowering_config // CHECK-PAD-PACK{LITERAL}: #packingConfig = #amdaie.packing_config -// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config -// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config +// CHECK-PACK-PEEL{LITERAL}: #config = #iree_codegen.lowering_config +// CHECK-PACK-PEEL{LITERAL}: #packingConfig = #amdaie.packing_config builtin.module { func.func @matmul_transpose_b_dispatch_0_matmul_transpose_b_256x1024x512_i32() { %c0_i32 = arith.constant 0 : i32 diff --git a/tests/samples/matmul_peeled_objectfifo.mlir b/tests/samples/matmul_peeled_objectfifo.mlir index 6828fb52e..3e94301c9 100644 --- a/tests/samples/matmul_peeled_objectfifo.mlir +++ b/tests/samples/matmul_peeled_objectfifo.mlir @@ -1,6 +1,6 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-consume-produce-to-acquire-release,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s +// RUN: iree-opt 
--pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-pack-to-dma,air-copy-to-dma,iree-amdaie-air-dma-to-amdaie-dma,iree-amdaie-insert-cores,cse,iree-amdaie-localize-logicalobjectfifo,iree-amdaie-distribute-cores-and-objectfifos,cse,iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-canonicalize-doubly-strided-op,iree-amdaie-access-to-acquire-release,cse,canonicalize,iree-amdaie-controlcode-loop-unroll,cse,canonicalize,canonicalize,iree-amdaie-lower-to-aie,canonicalize)" --split-input-file %s | FileCheck %s -// CHECK: aie.device(npu1) +// CHECK: aie.device(npu1_4col) // CHECK-DAG: %[[TILE_0_2:.+]] = aie.tile(0, 2) // CHECK-DAG: %[[TILE_1_2:.+]] = aie.tile(1, 2) // CHECK-DAG: aie.objectfifo @[[OBJ0:.+]](%[[TILE_0_2]] @@ -9,7 +9,7 @@ // CHECK: aie.objectfifo.acquire @[[OBJ0]](Produce, 1) // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK: aie.objectfifo.acquire @[[OBJ1]](Produce, 1) -// CHECK: func.func @sequence(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>) +// CHECK: func.func @matmul_i32(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>) // CHECK-DAG: aiex.npu.dma_memcpy_nd // CHECK-SAME: %[[ARG0]] // CHECK-DAG: aiex.npu.dma_memcpy_nd diff --git a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir index 2df63e617..f638f989a 100644 --- a/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir +++ b/tests/samples/pack_peel_pipeline_matmul_elementwise.mlir @@ -63,20 +63,19 @@ func.func @matmul_elementwise_bf16_f32(%arg0: tensor<1024x512xbf16>, %arg1: tens // CHECK: aiex.npu.sync // ----- - -func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: tensor<512xf32>) -> tensor<16384x512xbf16> { +func.func @matmul_elementwise_bf16(%arg0: tensor<512x512xbf16>, %arg1: tensor<512x16384xbf16>, %arg2: 
tensor<512xf32>) -> tensor<512x16384xbf16> { %cst = arith.constant 0.000000e+00 : f32 - %7 = tensor.empty() : tensor<16384x512xbf16> + %7 = tensor.empty() : tensor<512x16384xbf16> %8 = tensor.empty() : tensor<512x16384xf32> %9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<512x16384xf32>) -> tensor<512x16384xf32> %10 = linalg.matmul ins(%arg0, %arg1 : tensor<512x512xbf16>, tensor<512x16384xbf16>) outs(%9 : tensor<512x16384xf32>) -> tensor<512x16384xf32> - %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d1, d0)>], iterator_types = ["parallel", "parallel"]} ins(%10, %arg2 : tensor<512x16384xf32>, tensor<512xf32>) outs(%7 : tensor<16384x512xbf16>) { + %11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10, %arg2 : tensor<512x16384xf32>, tensor<512xf32>) outs(%7 : tensor<512x16384xbf16>) { ^bb0(%in: f32, %in_0: f32, %out: bf16): %12 = arith.addf %in, %in_0 : f32 %13 = arith.truncf %12 : f32 to bf16 linalg.yield %13 : bf16 - } -> tensor<16384x512xbf16> - return %11 : tensor<16384x512xbf16> + } -> tensor<512x16384xbf16> + return %11 : tensor<512x16384xbf16> } // CHECK-LABEL: hal.executable.export public @matmul_elementwise_bf16_dispatch_0_matmul_512x16384x512_bf16xbf16xf32 diff --git a/third_party/mlir-air b/third_party/mlir-air index b2df4d74a..f384223c9 160000 --- a/third_party/mlir-air +++ b/third_party/mlir-air @@ -1 +1 @@ -Subproject commit b2df4d74a77e6d7be327e75802098eb96b5c9a35 +Subproject commit f384223c9d28ee03f7419cd14c0e8be7398d271e