diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp new file mode 100644 index 000000000..ef412db84 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp @@ -0,0 +1,136 @@ +//===- AIETargetBCF.cpp -----------------------------------------*- C++ -*-===// +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "AIETargets.h" +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/IR/Module.h" + +using namespace mlir; +using namespace xilinx; +using namespace xilinx::AIE; + +std::string utohexstr(uint32_t u) { return "0x" + llvm::utohexstr(u); } + +namespace xilinx { +namespace AIE { + +LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output, + int tileCol, int tileRow) { + DenseMap tiles; + DenseMap> buffers; + + if (module.getOps().empty()) + module.emitOpError("expected aie.device operation at toplevel"); + DeviceOp targetOp = *(module.getOps().begin()); + + collectTiles(targetOp, tiles); + collectBuffers(targetOp, buffers); + + // _entry_point _main_init + // _symbol _main _after _main_init + // _symbol _main_init 0 + // _reserved DMb 0x00000 0x20000 + // _symbol a 0x38000 0x2000 + // _extern a + // _stack DM_stack 0x20000 0x400 //stack for core + // _reserved DMb 0x40000 0xc0000 // And everything else the core can't + // see + // // Include all symbols from rom.c + // _include _file rom.o + for (auto tile : targetOp.getOps()) + if (tile.colIndex() == tileCol && tile.rowIndex() == tileRow) { + const auto &targetModel = getTargetModel(tile); + TileID srcCoord = {tile.colIndex(), tile.rowIndex()}; + + std::string corefunc = std::string("core_") + + std::to_string(tile.getCol()) + "_" + + std::to_string(tile.getRow()); + output << "_entry_point 
_main_init\n"; + output << "_symbol " << corefunc << " _after _main_init\n"; + output << "_symbol _main_init 0\n"; + std::string initReserved = (targetModel.getTargetArch() == AIEArch::AIE2) + ? "0x40000" + : "0x20000"; + output << "_reserved DMb 0x00000 " << initReserved + << " // Don't put data in code memory\n"; + + int stacksize = 0; + if (auto core = tile.getCoreOp()) stacksize = core.getStackSize(); + output << "_stack DM_stack " + << utohexstr(targetModel.getMemInternalBaseAddress(srcCoord)) + << " " << utohexstr(stacksize) << " // stack for core\n"; + + auto doBuffer = [&](std::optional tile, int offset, + const std::string &dir) { + if (tile) { + output << "// " + dir + + " -------------------------------------------------\n"; + uint32_t localMemSize = targetModel.getLocalMemorySize(); + if (tile != srcCoord) + output << "_reserved DMb " << utohexstr(offset) << " " + << utohexstr(localMemSize) << " " + << " // Don't allocate variables in " << dir + << " neighbor\n\n"; + // TODO How to set as reserved if no buffer exists (or reserve + // remaining buffer) + if (tiles.count(*tile)) { + for (auto buf : buffers[tiles[*tile]]) { + std::string bufName(buf.name().getValue()); + int bufferBaseAddr = getBufferBaseAddress(buf); + int numBytes = buf.getAllocationSize(); + if (buf.getInitialValue() && tile == srcCoord) { + output << "_overlay " << bufName << " " + << utohexstr(offset + bufferBaseAddr) << " // " + << numBytes << " bytes\n"; + } else { + output << "_symbol " << bufName << " " + << utohexstr(offset + bufferBaseAddr) << " " << numBytes + << '\n'; + output << "_extern " << bufName << "\n"; + output << "_reserved DMb " << utohexstr(offset + bufferBaseAddr) + << " " << numBytes << '\n'; + } + output << "\n"; + } + } + } else { + uint32_t localMemSize = targetModel.getLocalMemorySize(); + output << "_reserved DMb " << utohexstr(offset) << " " + << utohexstr(localMemSize) << " " + << " // No tile with memory exists to the " << dir << ".\n"; + } + }; + + output 
<< "\n// mapping neighbors tile memory\n"; + doBuffer(targetModel.getMemSouth(srcCoord), + targetModel.getMemSouthBaseAddress(), std::string("south")); + doBuffer(targetModel.getMemWest(srcCoord), + targetModel.getMemWestBaseAddress(), std::string("west")); + doBuffer(targetModel.getMemNorth(srcCoord), + targetModel.getMemNorthBaseAddress(), std::string("north")); + doBuffer(targetModel.getMemEast(srcCoord), + targetModel.getMemEastBaseAddress(), std::string("east")); + output << "// end mapping neighbors tile memory\n\n"; + + if (targetModel.getTargetArch() == AIEArch::AIE2) { + output << "_reserved DMb 0x80000 0x80000 // And everything else " + "the core can't see\n"; + } else { + output << "_reserved DMb 0x40000 0xc0000 // And everything else " + "the core can't see\n"; + } + if (tile.getCoreOp() && tile.getCoreOp().getLinkWith()) + output << "_include _file " + << tile.getCoreOp().getLinkWith().value().str() << "\n"; + output << "_resolve _main core_" << tile.getCol() << "_" << tile.getRow() + << "\n"; + } + + return success(); +} +} // namespace AIE +} // namespace xilinx diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp new file mode 100644 index 000000000..da4951ecf --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp @@ -0,0 +1,788 @@ +//===- AIETargetCDODirect.cpp -----------------------------------*- C++ -*-===// +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "AIETargets.h"
+#include "aie/Dialect/AIE/IR/AIETargetModel.h"
+extern "C" {
+#include "cdo-driver/cdo_driver.h"
+}
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>  // size_t
+#include <cstdint>  // uint
+#include <cstdlib>  // calloc
+#include <filesystem>
+#include <functional>
+#include <map>
+#include <optional>
+#include <string>
+
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "aie/Dialect/AIE/IR/AIEEnums.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Region.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+
+#ifndef NDEBUG
+#define XAIE_DEBUG
+#endif
+
+extern "C" {
+#include "xaiengine/xaie_core.h"
+#include "xaiengine/xaie_dma.h"
+#include "xaiengine/xaie_elfloader.h"
+#include "xaiengine/xaie_interrupt.h"
+#include "xaiengine/xaie_locks.h"
+#include "xaiengine/xaie_plif.h"
+#include "xaiengine/xaie_ss.h"
+#include "xaiengine/xaiegbl.h"
+#include "xaiengine/xaiegbl_defs.h"
+}
+
+#define DEBUG_TYPE "aie-generate-cdo"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+
+// Maps each AieRC return code to its enumerator spelling for diagnostics.
+#define AIERC_STR(x) x, #x
+static const std::map<AieRC, std::string> AIERCTOSTR = {
+    {AIERC_STR(XAIE_OK)},
+    {AIERC_STR(XAIE_ERR)},
+    {AIERC_STR(XAIE_INVALID_DEVICE)},
+    {AIERC_STR(XAIE_INVALID_RANGE)},
+    {AIERC_STR(XAIE_INVALID_ARGS)},
+    {AIERC_STR(XAIE_INVALID_TILE)},
+    {AIERC_STR(XAIE_ERR_STREAM_PORT)},
+    {AIERC_STR(XAIE_INVALID_DMA_TILE)},
+    {AIERC_STR(XAIE_INVALID_BD_NUM)},
+    {AIERC_STR(XAIE_ERR_OUTOFBOUND)},
+    {AIERC_STR(XAIE_INVALID_DATA_MEM_ADDR)},
+    {AIERC_STR(XAIE_INVALID_ELF)},
+    {AIERC_STR(XAIE_CORE_STATUS_TIMEOUT)},
+    {AIERC_STR(XAIE_INVALID_CHANNEL_NUM)},
+    {AIERC_STR(XAIE_INVALID_LOCK)},
+    {AIERC_STR(XAIE_INVALID_DMA_DIRECTION)},
+    {AIERC_STR(XAIE_INVALID_PLIF_WIDTH)},
+    {AIERC_STR(XAIE_INVALID_LOCK_ID)},
+    {AIERC_STR(XAIE_INVALID_LOCK_VALUE)},
+    {AIERC_STR(XAIE_LOCK_RESULT_FAILED)},
+    {AIERC_STR(XAIE_INVALID_DMA_DESC)},
+    {AIERC_STR(XAIE_INVALID_ADDRESS)},
+    {AIERC_STR(XAIE_FEATURE_NOT_SUPPORTED)},
+    {AIERC_STR(XAIE_INVALID_BURST_LENGTH)},
+    {AIERC_STR(XAIE_INVALID_BACKEND)},
+    {AIERC_STR(XAIE_INSUFFICIENT_BUFFER_SIZE)},
+    {AIERC_STR(XAIE_ERR_MAX)}};
+#undef AIERC_STR
+
+// Translation from the dialect's WireBundle to the stream-switch port type
+// expected by the XAie_Strm* calls. Bundles without an entry make `.at()`
+// fail.
+static const std::map<WireBundle, StrmSwPortType>
+    WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE = {
+        {WireBundle::Core, StrmSwPortType::CORE},
+        {WireBundle::DMA, StrmSwPortType::DMA},
+        {WireBundle::Ctrl, StrmSwPortType::CTRL},
+        {WireBundle::FIFO, StrmSwPortType::FIFO},
+        {WireBundle::South, StrmSwPortType::SOUTH},
+        {WireBundle::West, StrmSwPortType::WEST},
+        {WireBundle::North, StrmSwPortType::NORTH},
+        {WireBundle::East, StrmSwPortType::EAST},
+        // missing PLIO from WireBundle
+        // missing NOC from WireBundle
+        {WireBundle::Trace, StrmSwPortType::TRACE},
+};
+
+// https://stackoverflow.com/a/32230306
+// Base case: print the remaining label text followed by the last value.
+template <typename H1>
+raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value) {
+  return out << label << "=" << std::forward<H1>(value);
+}
+
+// Recursive case: `label` is the stringified argument list; the text up to
+// the next ',' is printed as the name of `value`. NOTE(review): an argument
+// that itself contains a comma (e.g. a nested call) shifts the labels; only
+// the debug print is affected.
+template <typename H1, typename... T>
+raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value,
+                      T &&...rest) {
+  const char *pcomma = strchr(label, ',');
+  return showArgs(out.write(label, pcomma - label)
+                      << "=" << std::forward<H1>(value) << ',',
+                  pcomma + 1, std::forward<T>(rest)...);
+}
+
+#define SHOW_ARGS(os, ...) showArgs(os, #__VA_ARGS__, __VA_ARGS__)
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_LocType &loc) {
+  os << "XAie_LocType(col: " << std::to_string(loc.Col)
+     << ", row: " << std::to_string(loc.Row) << ")";
+  return os;
+}
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_Lock &lock) {
+  os << "XAie_Lock(id: " << std::to_string(lock.LockId)
+     << ", val: " << std::to_string(lock.LockVal) << ")";
+  return os;
+}
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_Packet &packet) {
+  os << "XAie_Packet(id: " << std::to_string(packet.PktId)
+     << ", type: " << std::to_string(packet.PktType) << ")";
+  return os;
+}
+
+// So that we can use the pattern if(auto r = TRY_XAIE_API...) { // r is nonzero
+// }
+static_assert(XAIE_OK == 0);
+
+// Calls API(args...); on a non-zero return code aborts via report_fatal_error.
+#define TRY_XAIE_API_FATAL_ERROR(API, ...) \
+  do { \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__)); \
+    LLVM_DEBUG(llvm::dbgs() << "\n"); \
+    if (auto r = API(__VA_ARGS__)) \
+      llvm::report_fatal_error(llvm::Twine(#API " failed with ") + \
+                               AIERCTOSTR.at(r)); \
+  } while (0)
+
+// Calls API(args...); on a non-zero return code returns OP.emitOpError() from
+// the enclosing function.
+#define TRY_XAIE_API_EMIT_ERROR(OP, API, ...) \
+  do { \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__)); \
+    LLVM_DEBUG(llvm::dbgs() << "\n"); \
+    if (auto r = API(__VA_ARGS__)) \
+      return OP.emitOpError() << #API " failed with " << AIERCTOSTR.at(r); \
+  } while (0)
+
+// Calls API(args...); on a non-zero return code prints to llvm::errs() and
+// returns failure() from the enclosing function.
+#define TRY_XAIE_API_LOGICAL_RESULT(API, ...) \
+  do { \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__)); \
+    LLVM_DEBUG(llvm::dbgs() << "\n"); \
+    if (auto r = API(__VA_ARGS__)) { \
+      llvm::errs() << #API " failed with " << AIERCTOSTR.at(r) << "\n"; \
+      return failure(); \
+    } \
+  } while (0)
+
+// File-local and immutable: path separator used when building output paths.
+static const auto ps = std::filesystem::path::preferred_separator;
+
+#define XAIE_BASE_ADDR 0x40000000
+#define XAIE_COL_SHIFT 25
+#define XAIE_ROW_SHIFT 20
+#define XAIE_SHIM_ROW 0
+#define XAIE_MEM_TILE_ROW_START 1
+#define XAIE_PARTITION_BASE_ADDR 0x0
+
+#define NPI_ADDR 0x0
+#define NUM_LOCKS 16
+#define EVEN_BD_NUM_START 0
+#define ODD_BD_NUM_START 24
+#define MEM_TILE_LOCK_ID_INCR 64
+#define BASE_ADDR_A_INCR 0x80000
+
+namespace xilinx::AIE {
+
+/// Program the acquire/release lock pair of `dmaTileBd` from the UseLockOps
+/// found in `block`.
+///
+/// The last acquire-type and the last release-type UseLockOp in the block
+/// win. For AcquireGreaterEqual the acquire value is negated, and on mem
+/// tiles both lock ids are offset by MEM_TILE_LOCK_ID_INCR. Emits an error
+/// on the block's parent op if either an acquire or a release is missing.
+LogicalResult configureLocksInBdBlock(XAie_DmaDesc &dmaTileBd, Block &block,
+                                      const AIETargetModel &targetModel,
+                                      XAie_LocType &tileLoc) {
+  LLVM_DEBUG(llvm::dbgs() << "\nstart configuring bds\n");
+  std::optional<int> acqValue, relValue, acqLockId, relLockId;
+  // Initialized so that a block without an acquire never reads garbage.
+  bool acqEn = false;
+  // switch (lock->getAc)
+  for (auto op : block.getOps<UseLockOp>()) {
+    // Only dyn_cast if you are going to check if it was of the type
+    // expected; if you aren't checking use cast instead as it will at
+    // least assert in debug mode with an easier to understand error than
+    // dereferencing.
+    LockOp lock = cast<LockOp>(op.getLock().getDefiningOp());
+    switch (op.getAction()) {
+      case LockAction::Acquire:
+      case LockAction::AcquireGreaterEqual:
+        acqEn = op.getAcqEn();
+        acqLockId = lock.getLockIDValue();
+        acqValue = op.getLockValue();
+        if (op.acquireGE()) acqValue.value() = -acqValue.value();
+        break;
+      case LockAction::Release:
+        relLockId = lock.getLockIDValue();
+        relValue = op.getLockValue();
+        break;
+    }
+  }
+
+  // Checked in all build modes: the .value() calls below require all four.
+  if (!acqValue || !relValue || !acqLockId || !relLockId)
+    return block.getParentOp()->emitOpError(
+        "expected both use_lock(acquire) and use_lock(release) with bd");
+
+  if (targetModel.isMemTile(tileLoc.Col, tileLoc.Row)) {
+    acqLockId.value() += MEM_TILE_LOCK_ID_INCR;
+    relLockId.value() += MEM_TILE_LOCK_ID_INCR;
+  }
+
+  // no RelEn in the arch spec even though the API requires you to set it?
+  bool relEn = false;
+  XAie_Lock acqLock = XAie_LockInit(acqLockId.value(), acqValue.value());
+  XAie_Lock relLock = XAie_LockInit(relLockId.value(), relValue.value());
+  TRY_XAIE_API_EMIT_ERROR((*block.getOps<UseLockOp>().begin()),
+                          dmaTileBd.DmaMod->SetLock, &dmaTileBd, acqLock,
+                          relLock, acqEn, relEn);
+  return success();
+}
+
+/// Fill in and write buffer descriptor `bdId` at `tileLoc` from the first
+/// DMABDOp in `block`.
+///
+/// Sets AXI parameters on shim NOC tiles, the (possibly multi-dimensional)
+/// address and length, the optional next-BD link and the optional packet
+/// header, then enables and writes the descriptor. The caller must ensure
+/// the block contains a DMABDOp.
+LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd,
+                                 Block &block,
+                                 const AIETargetModel &targetModel,
+                                 XAie_LocType &tileLoc, int bdId,
+                                 std::optional<int> nextBdId) {
+  std::optional<int> packetType;
+  std::optional<int> packetID;
+  auto maybePacketOps = block.getOps<DMABDPACKETOp>();
+  if (!maybePacketOps.empty()) {
+    assert(llvm::range_size(maybePacketOps) == 1 &&
+           "expected only one dma_bd_packet");
+    auto packetOp = *maybePacketOps.begin();
+    packetType = packetOp.getPacketType();
+    packetID = packetOp.getPacketID();
+  }
+
+  auto bdOp = *block.getOps<DMABDOp>().begin();
+
+  if (targetModel.isShimNOCTile(tileLoc.Col, tileLoc.Row)) {
+    // write them out like this so they show up with names in debug prints
+    size_t smid = 0;
+    size_t burstLen = 16;  // (10):BLEN=16 (256Byte) (corresponds to
+                           // 0x800000000 from target)
+    size_t qOs = 0;
+    size_t cache = 0;
+    size_t secure = 0;
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAxi, &dmaTileBd, smid, burstLen,
+                            qOs, cache, secure);
+  }
+
+  // StringRef FifoMode = disable; // FIXME: when to enable FIFO mode?
+  int baseAddr = 0;
+  if (!targetModel.isShimNOCTile(tileLoc.Col, tileLoc.Row)) {
+    auto bufferOp = cast<BufferOp>(bdOp.getBuffer().getDefiningOp());
+    if (!bufferOp.getAddress())
+      return bufferOp.emitError("buffer must have address assigned");
+    baseAddr = bufferOp.getAddress().value();
+    if (targetModel.isMemTile(tileLoc.Col, tileLoc.Row))
+      baseAddr += BASE_ADDR_A_INCR;
+  }
+
+  // Backing storage for the multi-dim descriptor. Declared at function scope
+  // so it outlives every use of dmaTileBd below and is released on every
+  // return path, including the early returns hidden in the TRY_ macros.
+  llvm::SmallVector<XAie_DmaDimDesc> dimDescs;
+
+  std::optional<llvm::ArrayRef<BDDimLayoutAttr>> dims = bdOp.getDimensions();
+  int lenInBytes = bdOp.getLenInBytes();
+  int basePlusOffsetInBytes = baseAddr + bdOp.getOffsetInBytes();
+  if (!dims) {
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAddrLen, &dmaTileBd,
+                            basePlusOffsetInBytes, lenInBytes);
+  } else {
+    XAie_DmaTensor dmaTileBdTensor = {};
+    dmaTileBdTensor.NumDim = dims->size();
+    dimDescs.resize(dims->size());  // value-initialized, i.e. zeroed
+    dmaTileBdTensor.Dim = dimDescs.data();
+    // libxaie requires stride in multiples of 32b
+    double elementWidthIn32bWords =
+        static_cast<double>(bdOp.getBufferElementTypeWidthInBytes()) / 4.0;
+    for (size_t i = 0; i < dims->size(); i++) {
+      // Pass down dimensions in reverse order; in the MLIR, this allows
+      // us to specify step sizes/wraps in the same order as we would
+      // access a multi-dim C array, with the highest dimension first.
+      int j = dims->size() - i - 1;
+      uint16_t size;
+      uint32_t stride;
+      if (j > 0) {
+        stride = static_cast<uint32_t>(dims.value()[i].getStride() *
+                                       elementWidthIn32bWords);
+        size = dims.value()[i].getSize();
+      } else {
+        stride = dims.value()[i].getStride();
+        size = static_cast<uint16_t>(dims.value()[i].getSize() *
+                                     elementWidthIn32bWords);
+      }
+      stride = stride > 0 ? stride : 1;
+      // Assume AIE-ML architecture (ie use AieMlDimDesc instead of AieDimDesc);
+      // asserted in AIETranslateToCDODirect).
+      dmaTileBdTensor.Dim[j].AieMlDimDesc = {stride, size};
+    }
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetMultiDimAddr, &dmaTileBd,
+                            &dmaTileBdTensor, basePlusOffsetInBytes,
+                            lenInBytes);
+  }
+
+  if (nextBdId) {
+    auto enableNextBd = 1;
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetNextBd, &dmaTileBd,
+                            nextBdId.value(), enableNextBd);
+  }
+
+  if (packetID) {
+    // Must return here: packetType.value() is read just below.
+    if (!packetType)
+      return bdOp.emitError("must have packetType with packetID");
+    if (bdOp.getLen() == 0)
+      return bdOp.emitOpError(
+          "For MM2S channels, if Buffer_Length=0 then Enable_Packet must be "
+          "set to 0, otherwise behavior is undefined (3.7.8 arch spec)");
+    TRY_XAIE_API_EMIT_ERROR(
+        bdOp, XAie_DmaSetPkt, &dmaTileBd,
+        XAie_PacketInit(packetID.value(), packetType.value()));
+  }
+  TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaEnableBd, &dmaTileBd);
+  TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaWriteBd, &devInst, &dmaTileBd, tileLoc,
+                          bdId);
+  LLVM_DEBUG(llvm::dbgs() << "\nend configuring bds\n");
+  return success();
+}
+
+/// Queue buffer descriptor `bdId` on channel `chNum` of the tile at `tileLoc`
+/// and enable that channel. Token issue is requested only for S2MM channels
+/// in row 0.
+LogicalResult pushToBdQueueAndEnable(XAie_DevInst &devInst, Operation &op,
+                                     XAie_LocType &tileLoc, int chNum,
+                                     const DMAChannelDir &channelDir, int bdId,
+                                     int repeatCount) {
+  XAie_DmaDirection direction =
+      channelDir == DMAChannelDir::S2MM ? DMA_S2MM : DMA_MM2S;
+  auto enTokenIssue = tileLoc.Row == 0 && direction == DMA_S2MM;
+  // in english repeat_count==0 means "do it once" and don't repeat but
+  // libxaie treats repeat_count=1 as do it once.
+ repeatCount += 1; + TRY_XAIE_API_EMIT_ERROR(op, XAie_DmaChannelSetStartQueue, &devInst, tileLoc, + chNum, direction, bdId, repeatCount, enTokenIssue); + TRY_XAIE_API_EMIT_ERROR(op, XAie_DmaChannelEnable, &devInst, tileLoc, chNum, + direction); + return success(); +}; + +LogicalResult configureLocksAndBd(XAie_DevInst &devInst, Block &block, + XAie_LocType tileLoc, + const AIETargetModel &targetModel) { + DMABDOp bd = *block.getOps().begin(); + assert(bd.getBdId().has_value() && + "DMABDOp must have assigned bd_id; did you forget to run " + "aie-assign-bd-ids?"); + XAie_DmaDesc dmaTileBd; + TRY_XAIE_API_EMIT_ERROR(bd, XAie_DmaDescInit, &devInst, &dmaTileBd, tileLoc); + if (!block.getOps().empty() && + failed(configureLocksInBdBlock(dmaTileBd, block, targetModel, tileLoc))) + return failure(); + if (!block.getOps().empty() && + failed(configureBdInBlock(devInst, dmaTileBd, block, targetModel, tileLoc, + bd.getBdId().value(), bd.getNextBdId()))) + return failure(); + return success(); +}; + +struct AIEControl { + XAie_Config configPtr; + XAie_DevInst devInst; + + AIEControl(bool aieSim, bool xaieDebug, const BaseNPUTargetModel &tm) { + // The first column in the NPU lacks a shim tile. AIE-RT exposes some of + // the internals about how this is modeled in a somewhat awkward way. + size_t partitionStartCol = tm.isVirtualized() ? 
1 : 0; + size_t partitionNumCols = tm.columns(); + size_t deviceRows = tm.rows(); + size_t deviceCols = tm.columns() + partitionStartCol; + + configPtr = XAie_Config{ + /*AieGen*/ XAIE_DEV_GEN_AIEML, + /*BaseAddr*/ XAIE_BASE_ADDR, + /*ColShift*/ XAIE_COL_SHIFT, + /*RowShift*/ XAIE_ROW_SHIFT, + /*NumRows*/ static_cast(deviceRows), + /*NumCols*/ static_cast(deviceCols), + /*ShimRowNum*/ XAIE_SHIM_ROW, + /*MemTileRowStart*/ XAIE_MEM_TILE_ROW_START, + /*MemTileNumRows*/ static_cast(tm.getNumMemTileRows()), + /*AieTileRowStart*/ + static_cast(XAIE_MEM_TILE_ROW_START + tm.getNumMemTileRows()), + /*AieTileNumRows*/ + static_cast(tm.rows() - tm.getNumMemTileRows() - 1), + /*PartProp*/ {}, + /*Backend*/ XAIE_IO_BACKEND_CDO}; + + // Quoting: The instance of a device must be always declared using this + // macro. In future, the same macro will be expanded to allocate + // more memory from the user application for resource management. + XAie_InstDeclare(_devInst, &configPtr); + devInst = _devInst; + TRY_XAIE_API_FATAL_ERROR(XAie_SetupPartitionConfig, &devInst, + XAIE_PARTITION_BASE_ADDR, partitionStartCol, + partitionNumCols); + TRY_XAIE_API_FATAL_ERROR(XAie_CfgInitialize, &devInst, &configPtr); + if (aieSim) { + TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst, + XAIE_IO_BACKEND_SIM); + } else if (xaieDebug) + TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst, + XAIE_IO_BACKEND_DEBUG); + else + TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst, + XAIE_IO_BACKEND_CDO); + + TRY_XAIE_API_FATAL_ERROR(XAie_UpdateNpiAddr, &devInst, NPI_ADDR); + } + + LogicalResult addAieElfToCDO(uint8_t col, uint8_t row, + const StringRef elfPath, bool aieSim) { + // loadSym: Load symbols from .map file. This argument is not used when + // __AIESIM__ is not defined. 
+ TRY_XAIE_API_LOGICAL_RESULT(XAie_LoadElf, &devInst, XAie_TileLoc(col, row), + elfPath.str().c_str(), /*loadSym*/ aieSim); + return success(); + } + + LogicalResult addAieElfsToCDO(DeviceOp &targetOp, const StringRef workDirPath, + bool aieSim) { + for (auto tileOp : targetOp.getOps()) + if (tileOp.isShimNOCorPLTile()) { + // Resets no needed with V2 kernel driver + } else { + int col = tileOp.colIndex(); + int row = tileOp.rowIndex(); + if (auto coreOp = tileOp.getCoreOp()) { + std::string fileName; + if (auto fileAttr = coreOp.getElfFile()) + fileName = fileAttr->str(); + else + fileName = (llvm::Twine("core_") + std::to_string(col) + "_" + + std::to_string(row) + ".elf") + .str(); + if (failed(addAieElfToCDO( + col, row, + (llvm::Twine(workDirPath) + std::string(1, ps) + fileName) + .str(), + aieSim))) + return failure(); + } + } + return success(); + } + + LogicalResult addInitConfigToCDO(DeviceOp &targetOp) { + for (auto tileOp : targetOp.getOps()) { + auto tileLoc = XAie_TileLoc(tileOp.colIndex(), tileOp.rowIndex()); + if (!tileOp.isShimTile() && tileOp.getCoreOp()) { + TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_CoreReset, &devInst, tileLoc); + TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_CoreUnreset, &devInst, tileLoc); + // Set locks to zero + for (uint8_t l = 0; l < NUM_LOCKS; l++) { + auto locInit = XAie_LockInit(l, 0); + TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_LockSetValue, &devInst, tileLoc, + locInit); + } + } + } + + // Set locks with explicit initializers + targetOp.walk([&](LockOp lockOp) { + if (lockOp.getLockID() && lockOp.getInit()) { + auto tileLoc = XAie_TileLoc(lockOp.getTileOp().colIndex(), + lockOp.getTileOp().rowIndex()); + auto locInit = XAie_LockInit(*lockOp.getLockID(), *lockOp.getInit()); + TRY_XAIE_API_FATAL_ERROR(XAie_LockSetValue, &devInst, tileLoc, locInit); + } else + LLVM_DEBUG(llvm::dbgs() + << "lock op missing either id or init" << lockOp << "\n"); + }); + + const AIETargetModel &targetModel = targetOp.getTargetModel(); + + auto memOps = 
llvm::to_vector_of(targetOp.getOps()); + llvm::append_range(memOps, targetOp.getOps()); + llvm::append_range(memOps, targetOp.getOps()); + for (TileElement memOp : memOps) { + int col = memOp.getTileID().col; + int row = memOp.getTileID().row; + XAie_LocType tileLoc = XAie_TileLoc(col, row); + + // handle DMA ops separately + auto dmaOps = llvm::to_vector_of( + memOp.getOperation()->getRegion(0).getOps()); + if (!dmaOps.empty()) { + for (auto dmaOp : dmaOps) + for (auto &bdRegion : dmaOp.getBds()) { + Block &block = bdRegion.getBlocks().front(); + if (failed( + configureLocksAndBd(devInst, block, tileLoc, targetModel))) + return failure(); + } + } else { + for (Block &block : memOp.getOperation()->getRegion(0)) { + if (block.getOps().empty()) continue; + if (failed(configureLocksAndBd(devInst, block, tileLoc, targetModel))) + return failure(); + } + } + + if (!dmaOps.empty()) + for (auto dmaOp : dmaOps) { + auto &block = dmaOp.getBds().front().getBlocks().front(); + DMABDOp bd = *block.getOps().begin(); + if (failed(pushToBdQueueAndEnable( + devInst, *dmaOp.getOperation(), tileLoc, + dmaOp.getChannelIndex(), dmaOp.getChannelDir(), + bd.getBdId().value(), dmaOp.getRepeatCount()))) + return failure(); + } + else + for (Block &block : memOp.getOperation()->getRegion(0)) { + for (auto op : block.getOps()) { + DMABDOp bd = *op.getDest()->getOps().begin(); + int chNum = op.getChannelIndex(); + auto channelDir = op.getChannelDir(); + if (failed(pushToBdQueueAndEnable( + devInst, *bd.getOperation(), tileLoc, chNum, channelDir, + bd.getBdId().value(), op.getRepeatCount()))) + return failure(); + } + } + } + + // StreamSwitch (switchbox) configuration + for (auto switchboxOp : targetOp.getOps()) { + int32_t col = switchboxOp.colIndex(); + int32_t row = switchboxOp.rowIndex(); + XAie_LocType tileLoc = XAie_TileLoc(col, row); + assert(targetModel.isNPU() && "Only NPU currently supported"); + if (row == 0) { + // FIXME hack for TCT routing + // TODO Support both channels + auto 
slvPortNum = 0; + auto mstrPortNum = 0; + TRY_XAIE_API_EMIT_ERROR(switchboxOp, XAie_StrmConnCctEnable, &devInst, + tileLoc, CTRL, slvPortNum, SOUTH, mstrPortNum); + } + + Block &b = switchboxOp.getConnections().front(); + for (auto connectOp : b.getOps()) + TRY_XAIE_API_EMIT_ERROR( + switchboxOp, XAie_StrmConnCctEnable, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()), + connectOp.sourceIndex(), + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()), + connectOp.destIndex()); + + for (auto connectOp : b.getOps()) { + int mask = 0; + int arbiter = -1; + + for (auto val : connectOp.getAmsels()) { + AMSelOp amsel = cast(val.getDefiningOp()); + arbiter = amsel.arbiterIndex(); + int msel = amsel.getMselValue(); + mask |= (1 << msel); + } + + bool isdma = connectOp.getDestBundle() == WireBundle::DMA; + // assume a connection going south from row zero gets wired to shimdma + // by a shimmux. TODO: fix the assumption + if (!isdma && (switchboxOp.rowIndex() == 0)) + isdma = connectOp.getDestBundle() == WireBundle::South; + // Flag for overriding DROP_HEADER. TODO: Formalize this in tablegen + isdma &= !connectOp->hasAttr("keep_pkt_header"); + auto dropHeader = + isdma ? 
XAIE_SS_PKT_DROP_HEADER : XAIE_SS_PKT_DONOT_DROP_HEADER; + TRY_XAIE_API_EMIT_ERROR( + connectOp, XAie_StrmPktSwMstrPortEnable, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()), + connectOp.destIndex(), dropHeader, arbiter, mask); + } + + for (auto connectOp : b.getOps()) { + int slot = 0; + Block &block = connectOp.getRules().front(); + for (auto slotOp : block.getOps()) { + AMSelOp amselOp = cast(slotOp.getAmsel().getDefiningOp()); + int arbiter = amselOp.arbiterIndex(); + int msel = amselOp.getMselValue(); + TRY_XAIE_API_EMIT_ERROR( + connectOp, XAie_StrmPktSwSlavePortEnable, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()), + connectOp.sourceIndex()); + auto packetInit = XAie_PacketInit(slotOp.valueInt(), /*PktType*/ 0); + // TODO Need to better define packet id,type used here + TRY_XAIE_API_EMIT_ERROR( + connectOp, XAie_StrmPktSwSlaveSlotEnable, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()), + connectOp.sourceIndex(), slot, packetInit, slotOp.maskInt(), msel, + arbiter); + slot++; + } + } + } + + for (auto muxOp : targetOp.getOps()) { + // NOTE ShimMux always connects from the south as directions are + // defined relative to the tile stream switch. + auto tileLoc = + XAie_TileLoc(muxOp.getTileOp().getCol(), muxOp.getTileOp().getRow()); + Block &b = muxOp.getConnections().front(); + for (auto connectOp : b.getOps()) { + // demux! 
+ if (connectOp.getSourceBundle() == WireBundle::North) + TRY_XAIE_API_EMIT_ERROR(muxOp, XAie_EnableAieToShimDmaStrmPort, + &devInst, tileLoc, connectOp.sourceIndex()); + // mux + if (connectOp.getDestBundle() == WireBundle::North) + TRY_XAIE_API_EMIT_ERROR(muxOp, XAie_EnableShimDmaToAieStrmPort, + &devInst, tileLoc, connectOp.destIndex()); + } + } + + for (auto switchboxOp : targetOp.getOps()) { + Block &b = switchboxOp.getConnections().front(); + auto tileLoc = XAie_TileLoc(switchboxOp.getCol(), 0); + for (auto connectOp : b.getOps()) + TRY_XAIE_API_EMIT_ERROR( + switchboxOp, XAie_StrmConnCctEnable, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()), + connectOp.sourceIndex(), + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()), + connectOp.destIndex()); + } + + // Cascade configuration + if (targetModel.getTargetArch() == AIEArch::AIE2) { + for (auto configOp : targetOp.getOps()) { + TileOp tile = cast(configOp.getTile().getDefiningOp()); + auto tileLoc = XAie_TileLoc(tile.getCol(), tile.getRow()); + TRY_XAIE_API_EMIT_ERROR( + targetOp, XAie_CoreConfigAccumulatorControl, &devInst, tileLoc, + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at( + static_cast(configOp.getInputDir())), + WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at( + static_cast(configOp.getOutputDir()))); + } + } + + return success(); + } + + LogicalResult addCoreEnableToCDO(DeviceOp &targetOp) { + // Start execution of all the cores. 
+ for (auto tileOp : targetOp.getOps()) { + auto tileLoc = XAie_TileLoc(tileOp.colIndex(), tileOp.rowIndex()); + if (!tileOp.isShimTile() && tileOp.getCoreOp()) + TRY_XAIE_API_EMIT_ERROR(targetOp, XAie_CoreEnable, &devInst, tileLoc); + } + return success(); + } + + void dmaUpdateBdAddr(DeviceOp &targetOp, int col, int row, size_t addr, + size_t bdId) { + auto tileLoc = XAie_TileLoc(col, row); + TRY_XAIE_API_FATAL_ERROR(XAie_DmaUpdateBdAddr, &devInst, tileLoc, addr, + bdId); + } +}; + +} // namespace xilinx::AIE + +void initializeCDOGenerator(byte_ordering endianness, bool cdoDebug) { + // Enables AXI-MM prints for configs being added in CDO + if (cdoDebug) EnAXIdebug(); + setEndianness(endianness); +}; + +LogicalResult generateCDOBinary(const StringRef outputPath, + const std::function &cb) { + startCDOFileStream(outputPath.str().c_str()); + FileHeader(); + // Never generate a completely empty CDO file. If the file only contains a + // header, then bootgen flags it as invalid. + insertNoOpCommand(4); + if (failed(cb())) return failure(); + configureHeader(); + endCurrentCDOFileStream(); + return success(); +} + +LogicalResult generateCDOBinariesSeparately(AIEControl &ctl, + const StringRef workDirPath, + DeviceOp &targetOp, bool aieSim, + bool enableCores) { + if (failed(generateCDOBinary( + (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_elfs.bin") + .str(), + [&ctl, &targetOp, &workDirPath, &aieSim] { + return ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim); + }))) + return failure(); + + if (failed(generateCDOBinary( + (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_init.bin") + .str(), + [&ctl, &targetOp] { return ctl.addInitConfigToCDO(targetOp); }))) + return failure(); + + if (enableCores && + failed(generateCDOBinary( + (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo_enable.bin") + .str(), + [&ctl, &targetOp] { return ctl.addCoreEnableToCDO(targetOp); }))) + return failure(); + + return success(); +} + +LogicalResult 
generateCDOUnified(AIEControl &ctl, const StringRef workDirPath, + DeviceOp &targetOp, bool aieSim, + bool enableCores) { + return generateCDOBinary( + (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo.bin").str(), + [&ctl, &targetOp, &workDirPath, &aieSim, &enableCores] { + if (!targetOp.getOps().empty() && + failed(ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim))) + return failure(); + if (failed(ctl.addInitConfigToCDO(targetOp))) return failure(); + if (enableCores && !targetOp.getOps().empty() && + failed(ctl.addCoreEnableToCDO(targetOp))) + return failure(); + return success(); + }); +} + +LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, + byte_ordering endianness, + bool emitUnified, bool cdoDebug, + bool aieSim, bool xaieDebug, + bool enableCores) { + auto devOps = m.getOps(); + assert(llvm::range_size(devOps) == 1 && + "only exactly 1 device op supported."); + DeviceOp targetOp = *devOps.begin(); + const BaseNPUTargetModel &targetModel = + (const BaseNPUTargetModel &)targetOp.getTargetModel(); + + // things like XAIE_MEM_TILE_ROW_START and the missing + // shim dma on tile (0,0) are hard-coded assumptions about NPU... + assert(targetModel.isNPU() && "Only NPU currently supported"); + + AIEControl ctl(aieSim, xaieDebug, targetModel); + initializeCDOGenerator(endianness, cdoDebug); + if (emitUnified) + return generateCDOUnified(ctl, workDirPath, targetOp, aieSim, enableCores); + return generateCDOBinariesSeparately(ctl, workDirPath, targetOp, aieSim, + enableCores); +} +// Not sure why but defining this with xilinx::AIE will create a duplicate +// symbol in libAIETargets.a that then doesn't actually match the header? +namespace xilinx::AIE { +LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath, + bool bigEndian, bool emitUnified, + bool cdoDebug, bool aieSim, + bool xaieDebug, bool enableCores) { + byte_ordering endianness = + bigEndian ? 
byte_ordering::Big_Endian : byte_ordering::Little_Endian; + return AIETranslateToCDODirect(m, workDirPath, endianness, emitUnified, + cdoDebug, aieSim, xaieDebug, enableCores); +} +} // namespace xilinx::AIE diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp new file mode 100644 index 000000000..48313109f --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp @@ -0,0 +1,169 @@ +//===- AIETargetLdScript.cpp -----------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +#include "AIETargets.h" +#include "aie/Dialect/AIE/IR/AIEDialect.h" + +using namespace mlir; +using namespace xilinx; +using namespace xilinx::AIE; + +// Output the memorymap in gnu linker format for the given buffer operations, +// with the given offset. The offset is different depending on where the buffers +// are accessed from. +static void writeLDScriptMap(raw_ostream &output, BufferOp buf, int offset) { + std::string bufName(buf.name().getValue()); + int bufferBaseAddr = getBufferBaseAddress(buf); + int numBytes = buf.getAllocationSize(); + output << ". = 0x" << llvm::utohexstr(offset + bufferBaseAddr) << ";\n"; + output << bufName << " = .;\n"; + output << ". += 0x" << llvm::utohexstr(numBytes) << ";\n"; +} + +///// ld.script format: +// +// MEMORY +// { +// program (RX) : ORIGIN = 0, LENGTH = 0x0020000 +// data (!RX) : ORIGIN = 0x20000, LENGTH = 0x0020000 +// } +// ENTRY(_main_init) +// INPUT(something.o) +// SECTIONS +// { +// . = 0x0; +// .text : { +// // the _main_init symbol from me_basic.o has to come at address zero. +// *me_basic.o(.text) +// . 
= 0x200; +// __ctors_start__ = .; +// __init_array_start = .; +// KEEP(SORT(*)(.init_array)) +// __ctors_end__ = .; +// __init_array_end = .; +// __dtors_start__ = .; +// __dtors_end__ = .; +// *(.text) +// } > program +// .data : { *(.data) } > data +// . = 0x20000; +// _sp_start_value_DM_stack = .; +// . = 0x24000; +// a = .; +// . += 1024; +// .bss : { *(.bss) } > data +// } +LogicalResult xilinx::AIE::AIETranslateToLdScript(ModuleOp module, + raw_ostream &output, + int tileCol, int tileRow) { + DenseMap tiles; + DenseMap> buffers; + + if (module.getOps().empty()) { + module.emitOpError("expected AIE.device operation at toplevel"); + } + DeviceOp targetOp = *(module.getOps().begin()); + + collectTiles(targetOp, tiles); + collectBuffers(targetOp, buffers); + + for (auto tile : targetOp.getOps()) + if (tile.colIndex() == tileCol && tile.rowIndex() == tileRow) { + TileID srcCoord = {tile.colIndex(), tile.rowIndex()}; + const auto &targetModel = getTargetModel(tile); + + // Figure out how much memory we have left for random allocations + auto core = tile.getCoreOp(); + int max = core.getStackSize(); + for (auto buf : buffers[tiles[srcCoord]]) { + int bufferBaseAddr = getBufferBaseAddress(buf); + int numBytes = buf.getAllocationSize(); + max = std::max(max, bufferBaseAddr + numBytes); + } + int origin = targetModel.getMemInternalBaseAddress(srcCoord) + max; + int length = targetModel.getLocalMemorySize() - max; + output << R"THESCRIPT( +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 +)THESCRIPT"; + output << " data (!RX) : ORIGIN = 0x" << llvm::utohexstr(origin) + << ", LENGTH = 0x" << llvm::utohexstr(length); + output << R"THESCRIPT( +} +ENTRY(_main_init) +SECTIONS +{ + . = 0x0; + .text : { + /* the _main_init symbol from me_basic.o has to come at address zero. */ + *me_basic.o(.text) + . 
= 0x200; + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text) + } > program + .data : { + *(.data*); + *(.rodata*) + } > data +)THESCRIPT"; + auto doBuffer = [&](std::optional tile, int offset, + std::string dir) { + if (tile) { + if (tiles.count(*tile)) + for (auto buf : buffers[tiles[*tile]]) + writeLDScriptMap(output, buf, offset); + } else { + output << "/* No tile with memory exists to the " << dir << ". */\n"; + output << ". = 0x" << llvm::utohexstr(offset) << ";\n"; + uint32_t localMemSize = targetModel.getLocalMemorySize(); + output << ". += 0x" << llvm::utohexstr(localMemSize) << ";\n"; + } + }; + + // Stack + output << ". = 0x" + << llvm::utohexstr(targetModel.getMemInternalBaseAddress(srcCoord)) + << ";\n"; + output << "_sp_start_value_DM_stack = .;\n"; + + if (auto core = tile.getCoreOp()) + output << ". += 0x" << llvm::utohexstr(core.getStackSize()) + << "; /* stack */\n"; + else + output << "/* no stack allocated */\n"; + + doBuffer(targetModel.getMemSouth(srcCoord), + targetModel.getMemSouthBaseAddress(), std::string("south")); + doBuffer(targetModel.getMemWest(srcCoord), + targetModel.getMemWestBaseAddress(), std::string("west")); + doBuffer(targetModel.getMemNorth(srcCoord), + targetModel.getMemNorthBaseAddress(), std::string("north")); + doBuffer(targetModel.getMemEast(srcCoord), + targetModel.getMemEastBaseAddress(), std::string("east")); + + output << " .bss : { *(.bss) } > data\n"; + output << " .bss.DMb.4 : { *(.bss.DMb.4) } > data\n"; + output << "}\n"; + if (auto coreOp = tile.getCoreOp()) { + if (auto fileAttr = coreOp.getLinkWith()) + output << "INPUT(" << fileAttr.value().str() << ")\n"; + + output << "PROVIDE(_main = core_" << tile.getCol() << "_" + << tile.getRow() << ");\n"; + } + } + return success(); +} diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp new 
file mode 100644 index 000000000..16ca38041 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp @@ -0,0 +1,153 @@ +//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include + +#include "AIETargets.h" +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEX/IR/AIEXDialect.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/TypeSwitch.h" +#include "llvm/Support/Format.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Tools/mlir-translate/MlirTranslateMain.h" + +using namespace mlir; +using namespace xilinx; +using namespace xilinx::AIE; +using namespace xilinx::AIEX; + +namespace { + +std::vector getProlog() { + return {0x00000011, 0x01000405, 0x01000100, 0x0B590100, 0x000055FF, + 0x00000001, 0x00000010, 0x314E5A5F, 0x635F5F31, 0x676E696C, + 0x39354E5F, 0x6E693131, 0x5F727473, 0x64726F77, 0x00004573, + 0x07BD9630, 0x000055FF}; +} + +// Example: +// - instructions = {3,4,5} +// - tailSize = 2 +// instructions becomes {3,4,5,0,0} and +// a mutable reference to the tail {0,0} is returned. 
+llvm::MutableArrayRef reserveAndGetTail( + std::vector &instructions, uint64_t tailSize) { + auto oldSize = instructions.size(); + auto newSize = oldSize + tailSize; + instructions.resize(newSize, 0); + return llvm::MutableArrayRef(instructions.data() + oldSize, + tailSize); +} + +void appendSync(std::vector &instructions, NpuSyncOp op) { + auto words = reserveAndGetTail(instructions, 2); + + uint32_t opCode = 3; + words[0] |= (opCode & 0xff) << 24; + words[0] |= (op.getColumn() & 0xff) << 16; + words[0] |= (op.getRow() & 0xff) << 8; + words[0] |= op.getDirection() & 0x1; + + words[1] |= (op.getChannel() & 0xff) << 24; + words[1] |= (op.getColumnNum() & 0xff) << 16; + words[1] |= (op.getRowNum() & 0xff) << 8; +} + +void appendWrite32(std::vector &instructions, NpuWrite32Op op) { + auto words = reserveAndGetTail(instructions, 3); + + uint32_t opCode = 2; + words[0] |= (opCode & 0xff) << 24; + words[0] |= (op.getColumn() & 0xff) << 16; + words[0] |= (op.getRow() & 0xff) << 8; + + words[1] = op.getAddress(); + + words[2] = op.getValue(); +} + +void appendWriteBdShimTile(std::vector &instructions, + NpuWriteBdExShimTileOp op) { + auto words = reserveAndGetTail(instructions, 10); + + uint32_t opCode = 6; + words[0] |= (opCode & 0xff) << 24; + words[0] |= (op.getColumn() & 0xff) << 16; + words[0] |= (op.getColumnNum() & 0xff) << 8; + words[0] |= (op.getDdrId() & 0xf) << 4; + words[0] |= (op.getBdId() & 0xf); + + // TODO: Address Incr + // words[1] = ... 
+ + words[2] = op.getBufferLength(); + words[3] = op.getBufferOffset(); + + // En Packet , OoO BD ID , Packet ID , Packet Type + words[4] |= (op.getEnablePacket() & 0x1) << 30; + words[4] |= (op.getOutOfOrderId() & 0x3f) << 24; + words[4] |= (op.getPacketId() & 0x1f) << 19; + words[4] |= (op.getPacketType() & 0x7) << 16; + + // TODO: Secure Access + words[5] |= (op.getD0Size() & 0x3ff) << 20; + words[5] |= op.getD0Stride() & 0xfffff; + + words[6] = 0x80000000; // burst length; + words[6] |= (op.getD1Size() & 0x3ff) << 20; + words[6] |= op.getD1Stride() & 0xfffff; + + // TODO: SIMID, AxCache, AXQoS + words[7] = op.getD2Stride() & 0xfffff; + + words[8] |= (op.getIterationCurrent() & 0x3f) << 26; + words[8] |= (op.getIterationSize() & 0x3f) << 20; + words[8] |= op.getIterationStride() & 0xfffff; + + // TODO: TLAST Suppress + words[9] |= (op.getNextBd() & 0xf) << 27; + words[9] |= (op.getUseNextBd() & 0x1) << 26; + words[9] |= (op.getValidBd() & 0x1) << 25; + words[9] |= (op.getLockRelVal() & 0xef) << 18; + words[9] |= (op.getLockRelId() & 0xf) << 13; + words[9] |= (op.getLockAcqEnable() & 0x1) << 12; + words[9] |= (op.getLockAcqVal() & 0xef) << 5; + words[9] |= op.getLockAcqId() & 0xf; +} + +} // namespace + +std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { + std::vector instructions = getProlog(); + + DeviceOp deviceOp = *module.getOps().begin(); + auto funcOps = deviceOp.getOps(); + for (auto f : funcOps) { + if (f.isDeclaration()) continue; + Block &entry = f.getRegion().front(); + for (auto &o : entry) { + llvm::TypeSwitch(&o) + .Case([&](auto op) { appendSync(instructions, op); }) + .Case([&](auto op) { appendWrite32(instructions, op); }) + .Case( + [&](auto op) { appendWriteBdShimTile(instructions, op); }); + } + } + + return instructions; +} + +LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module, + raw_ostream &output) { + auto instructions = AIETranslateToNPU(module); + for (auto w : instructions) output << llvm::format("%08X\n", w); + 
return success(); +} diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargets.h b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h new file mode 100644 index 000000000..f1ef5bf7e --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h @@ -0,0 +1,36 @@ +//===- AIETargets.h ---------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_TARGETS_AIETARGETS_H +#define AIE_TARGETS_AIETARGETS_H + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/Support/LogicalResult.h" + +namespace xilinx { +namespace AIE { + +mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module, + llvm::raw_ostream &output); +std::vector AIETranslateToNPU(mlir::ModuleOp); +mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, + llvm::raw_ostream &output, + int tileCol, int tileRow); +mlir::LogicalResult AIETranslateToBCF(mlir::ModuleOp module, + llvm::raw_ostream &output, int tileCol, + int tileRow); +mlir::LogicalResult AIETranslateToCDODirect( + mlir::ModuleOp m, llvm::StringRef workDirPath, bool bigEndian = false, + bool emitUnified = false, bool cdoDebug = false, bool aieSim = false, + bool xaieDebug = false, bool enableCores = true); +} // namespace AIE + +} // namespace xilinx + +#endif diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index 80b37af49..9b63b4c9a 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -364,10 +364,10 @@ iree_cc_library( AIETargets SRCS "XCLBinGen.cpp" - "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetBCF.cpp" - 
"${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetLdScript.cpp" - "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetNPU.cpp" - "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetCDODirect.cpp" + "AIETargetBCF.cpp" + "AIETargetLdScript.cpp" + "AIETargetNPU.cpp" + "AIETargetCDODirect.cpp" DEPS ::AIEDialectIR ::AIEDialectIR diff --git a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp index 455b995b1..5ed17737d 100644 --- a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp @@ -15,13 +15,12 @@ #include #include +#include "AIETargets.h" +#include "Passes.h" #include "aie/AIEAssignBufferAddressesBasic.h" #include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h" -#include "aie/Dialect/AIE/Transforms/AIEPasses.h" #include "aie/Dialect/AIEVec/Pipelines/Passes.h" -#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" #include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" -#include "aie/Targets/AIETargets.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/JSON.h" #include "llvm/Support/MemoryBuffer.h" @@ -34,8 +33,8 @@ #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" #include "mlir/Conversion/MathToLLVM/MathToLLVM.h" #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" -#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" +#include "mlir/Dialect/Linalg/Passes.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/IR/MLIRContext.h" #include "mlir/Pass/PassManager.h" @@ -174,16 +173,6 @@ static std::string getUUIDString() { #endif return val; } -static void addAIELoweringPasses(OpPassManager &pm) { - pm.addPass(createLowerAffinePass()); - pm.addPass(AIE::createAIECanonicalizeDevicePass()); - OpPassManager &devicePM = pm.nest(); - devicePM.addPass(AIE::createAIEAssignLockIDsPass()); - devicePM.addPass(AIE::createAIEAssignBufferDescriptorIDsPass()); 
- devicePM.addPass(AIE::createAIEObjectFifoStatefulTransformPass()); - devicePM.addPass(AIE::createAIEAssignBufferAddressesBasicPass()); - pm.addPass(createConvertSCFToCFPass()); -} static void addLowerToLLVMPasses(OpPassManager &pm) { pm.addPass(createCanonicalizerPass()); @@ -334,7 +323,7 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, XCLBinGenConfig &TK) { auto deviceOps = moduleOp.getOps(); if (!llvm::hasSingleElement(deviceOps)) - return moduleOp.emitOpError("expected a single device op"); + return moduleOp.emitOpError(": expected a single device op"); AIE::DeviceOp deviceOp = *deviceOps.begin(); auto tileOps = deviceOp.getOps(); @@ -368,14 +357,14 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, if (!bcfOutput) return coreOp.emitOpError(errorMessage); if (failed(AIE::AIETranslateToBCF(moduleOp, bcfOutput->os(), col, row))) - return coreOp.emitOpError("Failed to generate BCF"); + return coreOp.emitOpError(": Failed to generate BCF"); bcfOutput->keep(); } std::vector extractedIncludes; { auto bcfFileIn = openInputFile(bcfPath, &errorMessage); - if (!bcfFileIn) moduleOp.emitOpError(errorMessage); + if (!bcfFileIn) return moduleOp.emitOpError(errorMessage); std::string bcfFile = std::string(bcfFileIn->getBuffer()); std::regex r("_include _file (.*)"); @@ -395,8 +384,10 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, for (const auto &inc : extractedIncludes) flags.push_back(inc); auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str()); chessArgs_.insert(chessArgs_.end(), flags.begin(), flags.end()); + if (!sys::fs::exists(chessExe)) + return moduleOp.emitOpError(": chess can't be found"); if (runTool(chessExe, chessArgs_, TK.Verbose) != 0) - coreOp.emitOpError("Failed to link with xbridge"); + return coreOp.emitOpError(": Failed to link with xbridge"); } return success(); } @@ -404,71 +395,38 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, static LogicalResult generateCDO(MLIRContext 
*context, ModuleOp moduleOp, XCLBinGenConfig &TK) { ModuleOp copy = moduleOp.clone(); - std::string errorMessage; - // This corresponds to `process_host_cgen`, which is listed as host - // compilation in aiecc.py... not sure we need this. - PassManager passManager(context, ModuleOp::getOperationName()); - applyConfigToPassManager(TK, passManager); - - passManager.addNestedPass(AIE::createAIEPathfinderPass()); - if (failed(passManager.run(copy))) - return moduleOp.emitOpError( - "failed to run passes to prepare of XCLBin generation"); - if (failed(AIE::AIETranslateToCDODirect(copy, TK.TempDir))) - return moduleOp.emitOpError("failed to emit CDO"); - + return moduleOp.emitOpError(": failed to emit CDO"); copy->erase(); return success(); } static json::Object makeKernelJSON(std::string name, std::string id, - std::string instance) { + std::string instance, int numArgs) { + json::Array args{json::Object{{"name", "instr"}, + {"memory-connection", "SRAM"}, + {"address-qualifier", "GLOBAL"}, + {"type", "char *"}, + {"offset", "0x00"}}, + json::Object{{"name", "ninstr"}, + {"address-qualifier", "SCALAR"}, + {"type", "uint64_t"}, + {"offset", "0x08"}}}; + for (int arg = 0; arg < numArgs; ++arg) { + args.push_back(json::Object{{"name", "bo" + std::to_string(arg)}, + {"memory-connection", "HOST"}, + {"address-qualifier", "GLOBAL"}, + {"type", "char *"}, + {"offset", std::to_string(0x10 + 0x8 * arg)}}); + } + return json::Object{ {"name", name}, {"type", "dpu"}, {"extended-data", json::Object{ {"subtype", "DPU"}, {"functional", "1"}, {"dpu_kernel_id", id}}}, - {"arguments", json::Array{json::Object{{"name", "instr"}, - {"memory-connection", "SRAM"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x00"}}, - json::Object{{"name", "ninstr"}, - {"address-qualifier", "SCALAR"}, - {"type", "uint64_t"}, - {"offset", "0x08"}}, - json::Object{{"name", "bo0"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", 
"0x10"}}, - json::Object{{"name", "bo1"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x18"}}, - json::Object{{"name", "bo2"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x20"}}, - json::Object{{"name", "bo3"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x28"}}, - json::Object{{"name", "bo4"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x30"}}, - json::Object{{"name", "bo5"}, - {"memory-connection", "HOST"}, - {"address-qualifier", "GLOBAL"}, - {"type", "char *"}, - {"offset", "0x38"}}}}, + {"arguments", std::move(args)}, {"instances", json::Array{json::Object{{"name", instance}}}}}; } @@ -541,7 +499,8 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, "type": "PRIMARY", "pdi_id": "0x01", "dpu_kernel_ids": [ - "0x901" + ")" + TK.XCLBinKernelID + + R"(" ], "pre_cdo_groups": [ "0xC1" @@ -564,13 +523,24 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, auto kernelsJsonOut = openOutputFile(kernelsJsonFile, &errorMessage); if (!kernelsJsonOut) return moduleOp.emitOpError(errorMessage); + // TODO(max): should be gotten from the dispatch not this func (which will + // eventually disappear) + std::optional numArgs; + moduleOp.walk([&numArgs](func::FuncOp sequenceFunc) { + if (sequenceFunc.getName() == "sequence") + numArgs = sequenceFunc.getArgumentTypes().size(); + }); + if (!numArgs) + return moduleOp.emitOpError( + "Couldn't find func.func @sequence to count args"); + json::Object kernels_data{ {"ps-kernels", json::Object{ {"kernels", json::Array{// TODO: Support for multiple kernels makeKernelJSON(TK.XCLBinKernelName, TK.XCLBinKernelID, - TK.XCLBinInstanceName)}}}}}; + TK.XCLBinInstanceName, *numArgs)}}}}}; kernelsJsonOut->os() << formatv("{0:2}", 
json::Value(std::move(kernels_data))); kernelsJsonOut->keep(); @@ -612,9 +582,9 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, if (auto bootgen = sys::findProgramByName("bootgen")) { if (runTool(*bootgen, flags, TK.Verbose) != 0) - return moduleOp.emitOpError("failed to execute bootgen"); + return moduleOp.emitOpError(": failed to execute bootgen"); } else { - return moduleOp.emitOpError("could not find bootgen"); + return moduleOp.emitOpError(": could not find bootgen"); } } @@ -636,9 +606,9 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, if (auto xclbinutil = sys::findProgramByName("xclbinutil")) { if (runTool(*xclbinutil, flags, TK.Verbose) != 0) - return moduleOp.emitOpError("failed to execute xclbinutil"); + return moduleOp.emitOpError(": failed to execute xclbinutil"); } else { - return moduleOp.emitOpError("could not find xclbinutil"); + return moduleOp.emitOpError(": could not find xclbinutil"); } } return success(); @@ -735,7 +705,7 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, }(); if (failed(vectorToAIEVecOptions.parseFromString(optionsString))) { - return moduleOp.emitOpError("Failed to parse options from '") + return moduleOp.emitOpError(": Failed to parse options from '") << optionsString << "': Failed to construct ConvertVectorToAIEVecOptions."; } @@ -753,7 +723,7 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, ModuleOp copy = moduleOp.clone(); if (failed(pm.run(copy))) - return moduleOp.emitOpError("Failed to lower to LLVM"); + return moduleOp.emitOpError(": Failed to lower to LLVM"); SmallString<64> LLVMIRFile(TK.TempDir); sys::path::append(LLVMIRFile, "input.ll"); @@ -761,72 +731,75 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, llvm::LLVMContext llvmContext; auto llvmModule = translateModuleToLLVMIR(copy, llvmContext); if (!llvmModule) - return moduleOp.emitOpError("Failed to translate 
module to LLVMIR"); + return moduleOp.emitOpError(": Failed to translate module to LLVMIR"); + std::string llvmirString; std::string errorMessage; { + raw_string_ostream llvmirStream(llvmirString); + llvmModule->print(llvmirStream, nullptr); + llvmirString = chesshack(llvmirString); auto output = openOutputFile(LLVMIRFile, &errorMessage); if (!output) return moduleOp.emitOpError(errorMessage); - llvmModule->print(output->os(), nullptr); + output->os() << llvmirString; output->keep(); } SmallString<64> chessExe(TK.AIEToolsDir); sys::path::append(chessExe, "bin", "unwrapped", "lnx64.o", "xchesscc"); - SmallString<64> chessworkDir(TK.TempDir); - sys::path::append(chessworkDir, "chesswork"); - SmallString<64> chessIntrinsicsLL(TK.InstallDir); + SmallString<64> chessIntrinsicsLL(TK.TempDir); sys::path::append(chessIntrinsicsLL, "chess_intrinsic_wrapper.ll"); { - auto chessIntrinsicWrapperLlFile = openOutputFile(chessIntrinsicsLL); - if (!chessIntrinsicWrapperLlFile) moduleOp.emitOpError(errorMessage); + auto chessIntrinsicWrapperLlFile = + openOutputFile(chessIntrinsicsLL, &errorMessage); + if (!chessIntrinsicWrapperLlFile) return moduleOp.emitOpError(errorMessage); chessIntrinsicWrapperLlFile->os() << _CHESS_INTRINSIC_WRAPPER_LL; chessIntrinsicWrapperLlFile->keep(); } - std::string llvmirString; - { - raw_string_ostream llvmirStream(llvmirString); - llvmModule->print(llvmirStream, nullptr); - } - SmallString<64> chesslinkedFile(TK.TempDir); sys::path::append(chesslinkedFile, "input.chesslinked.ll"); SmallString<64> chessLlvmLinkBin(TK.AIEToolsDir); sys::path::append(chessLlvmLinkBin, "tps", "lnx64", "target"); sys::path::append(chessLlvmLinkBin, "bin", "LNa64bin", "chess-llvm-link"); + if (!sys::fs::exists(chessLlvmLinkBin)) + return moduleOp.emitOpError(": chess-llvm-link can't be found"); if (runTool(chessLlvmLinkBin, {std::string(LLVMIRFile), std::string(chessIntrinsicsLL), "--opaque-pointers=1", "-S", "-o", std::string(chesslinkedFile)}, TK.Verbose) != 0) - 
moduleOp.emitOpError("Couldn't link in the intrinsics"); + return moduleOp.emitOpError(": Couldn't link in the intrinsics"); std::string mungedLLVMIR; { auto chesslinkedIn = openInputFile(chesslinkedFile, &errorMessage); - if (!chesslinkedIn) moduleOp.emitOpError(errorMessage); + if (!chesslinkedIn) return moduleOp.emitOpError(errorMessage); mungedLLVMIR = std::string(chesslinkedIn->getBuffer()); mungedLLVMIR = chesshack(mungedLLVMIR); } { - auto chesslinkedOut = openOutputFile(chesslinkedFile); - if (!chesslinkedOut) moduleOp.emitOpError(errorMessage); + auto chesslinkedOut = openOutputFile(chesslinkedFile, &errorMessage); + if (!chesslinkedOut) return moduleOp.emitOpError(errorMessage); chesslinkedOut->os() << mungedLLVMIR; chesslinkedOut->keep(); } + SmallString<64> chessworkDir(TK.TempDir); + sys::path::append(chessworkDir, "chesswork"); auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str()); chessArgs_.push_back("-c"); chessArgs_.push_back(std::string(chesslinkedFile)); chessArgs_.push_back("-o"); chessArgs_.push_back(std::string(outputFile)); + if (!sys::fs::exists(chessExe)) + return moduleOp.emitOpError(": chess can't be found"); if (runTool(chessExe, chessArgs_, TK.Verbose) != 0) - return moduleOp.emitOpError("Failed to assemble with chess"); + return moduleOp.emitOpError(": Failed to assemble with chess"); copy->erase(); return success(); } @@ -834,21 +807,8 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, XCLBinGenConfig &TK, StringRef OutputNPU, StringRef OutputXCLBin) { - if (failed(xilinx::findVitis(TK))) moduleOp.emitOpError("VITIS not found"); - - PassManager pm(ctx, moduleOp.getOperationName()); - applyConfigToPassManager(TK, pm); - - addAIELoweringPasses(pm); - - if (TK.Verbose) { - llvm::outs() << "Running: "; - pm.printAsTextualPipeline(llvm::outs()); - llvm::outs() << "\n"; - } - - if (failed(pm.run(moduleOp))) - return 
moduleOp.emitOpError("AIE lowering pipline failed"); + if (failed(xilinx::findVitis(TK))) + return moduleOp.emitOpError(": VITIS not found"); TK.TargetArch = StringRef(TK.TargetArch).trim(); @@ -865,17 +825,14 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); ModuleOp copy = moduleOp.clone(); if (failed(pm.run(copy))) - return moduleOp.emitOpError("NPU Instruction pipeline failed"); + return moduleOp.emitOpError(": NPU Instruction pipeline failed"); std::string errorMessage; auto output = openOutputFile(OutputNPU, &errorMessage); - if (!output) { - llvm::errs() << errorMessage << "\n"; - return moduleOp.emitOpError(""); - } + if (!output) return moduleOp.emitOpError(errorMessage); if (failed(AIE::AIETranslateToNPU(copy, output->os()))) - return moduleOp.emitOpError("NPU Instruction translation failed"); + return moduleOp.emitOpError(": NPU Instruction translation failed"); output->keep(); copy->erase(); @@ -884,16 +841,16 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, SmallString<64> object(TK.TempDir); sys::path::append(object, "input.o"); if (failed(generateObject(ctx, moduleOp, TK, std::string(object)))) - return moduleOp.emitOpError("Failed to generate object"); + return moduleOp.emitOpError(": Failed to generate object"); if (failed(generateCoreElfFiles(moduleOp, object, TK))) - return moduleOp.emitOpError("Failed to generate core ELF file(s)"); + return moduleOp.emitOpError(": Failed to generate core ELF file(s)"); if (failed(generateCDO(ctx, moduleOp, TK))) - return moduleOp.emitOpError("Failed to generate CDO"); + return moduleOp.emitOpError(": Failed to generate CDO"); if (failed(generateXCLBin(ctx, moduleOp, TK, OutputXCLBin))) - return moduleOp.emitOpError("Failed to generate XCLBin"); + return moduleOp.emitOpError(": Failed to generate XCLBin"); return success(); } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp index 5212a4ae3..c1097fa89 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp @@ -206,12 +206,6 @@ LogicalResult AIETargetDirectBackend::serializeExecutable( if (failed(maybeWorkDir)) return failure(); auto workDir = maybeWorkDir.value(); - xilinx::XCLBinGenConfig TK; - TK.TempDir = workDir.str(); - TK.TargetArch = "AIE2"; - TK.UseChess = true; - TK.Verbose = true; - SmallVector entryPointNames; for (auto exportOp : variantOp.getExportOps()) { entryPointNames.emplace_back(exportOp.getSymName().substr(0, 48)); @@ -221,9 +215,15 @@ LogicalResult AIETargetDirectBackend::serializeExecutable( return moduleOp.emitOpError("Expected a single entry point"); } + xilinx::XCLBinGenConfig TK; + TK.TempDir = workDir.str(); + TK.TargetArch = "AIE2"; + TK.UseChess = true; + TK.Verbose = true; TK.XCLBinKernelName = entryPointNames[0]; TK.XCLBinKernelID = "0x101"; TK.XCLBinInstanceName = "FOO"; + SmallString<128> xclbinPath(workDir); llvm::sys::path::append(xclbinPath, basename + ".xclbin"); SmallString<128> npuInstPath(workDir); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir deleted file mode 100644 index c4469b09e..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir +++ /dev/null @@ -1,58 +0,0 @@ -// RUN: not iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct %s 2>&1 | FileCheck %s - -// CHECK: %switchbox_0_0 = aie.switchbox -// CHECK: aie.dma_start -// CHECK: aie.dma_bd -// CHECK: unimplemented AIETargetDirectBackend::serializeExecutable -module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", 
[#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} { - hal.executable private @dummy1 { - hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) { - hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} { - ^bb0(%arg0: !hal.device): - %x, %y, %z = flow.dispatch.workgroup_count_from_slice - hal.return %x, %y, %z : index, index, index - } - builtin.module { - aie.device(npu1) { - %tile_0_0 = aie.tile(0, 0) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@in] -> [@out]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %0 = memref.alloc() : memref<10xf32> - %1 = memref.load %0[%c0] : memref<10xf32> - memref.store %1, %0[%c0] : memref<10xf32> - aie.end - } - func.func @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 64, 64, 1][1, 1, 64]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } - } - } - } - util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} { - // this is all gibberish just to hit serializeExecutable - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %element_type_i8 = hal.element_type : i32 - %dense_row_major = hal.encoding_type : i32 - hal.buffer_view.assert<%arg0 : !hal.buffer_view> 
message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major) - %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource{%c1} - %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource{%c1} => !stream.timepoint - - %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c1}) { - stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 { - ro %arg2[%c0 for %c1] : !stream.resource{%c1} - } - } => !stream.timepoint - %3 = stream.timepoint.await %2 => %result : !stream.resource{%c1} - %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource{%c1} -> !hal.buffer_view - util.return %4 : !hal.buffer_view - } -} \ No newline at end of file diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir deleted file mode 100644 index f9c8b4dba..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir +++ /dev/null @@ -1,59 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @zero_scalar_f32(memref<32xf32>) - func.func private @zero_vectorized_f32(memref<32xf32>) - func.func private @matvec_scalar_bf16_f32(memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>) - func.func private @matvec_vectorized_bf16_f32(memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>) - %tile_0_0 = aie.tile(0, 0) - %tile_1_0 = aie.tile(1, 0) - %tile_2_0 = aie.tile(2, 0) - %tile_3_0 = aie.tile(3, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_1_1 = aie.tile(1, 1) - %tile_2_1 = aie.tile(2, 1) - %tile_3_1 = aie.tile(3, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_1_2 = aie.tile(1, 2) - %tile_2_2 = aie.tile(2, 2) - %tile_3_2 = aie.tile(3, 2) - aie.objectfifo 
@memA0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @inA0(%tile_0_1 toStream [, ], {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memA0] -> [@inA0]() - aie.objectfifo @inB(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC0(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<32xf32> - func.call @zero_vectorized_f32(%1) : (memref<32xf32>) -> () - %c0_0 = arith.constant 0 : index - %c9 = arith.constant 9 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c9 step %c1_1 { - %2 = aie.objectfifo.acquire @inA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x32xbf16> - %4 = aie.objectfifo.acquire @inB(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<32xbf16> - func.call @matvec_vectorized_bf16_f32(%3, %5, %1) : (memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>) -> () - aie.objectfifo.release @inA0(Consume, 1) - aie.objectfifo.release @inB(Consume, 1) - } - aie.objectfifo.release @outC0(Produce, 1) - } - aie.end - } {link_with = "mv.o"} - func.func @sequence(%arg0: memref<41472xi32>, %arg1: memref<144xi32>, %arg2: memref<288xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][9, 1, 1, 144][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<144xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][9, 9, 32, 16][4608, 16, 144]) {id = 1 : i64, metadata = @memA0} : memref<41472xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 288][0, 0, 0]) {id = 0 : i64, metadata = @outC0} : 
memref<288xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py new file mode 100644 index 000000000..9162e1346 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py @@ -0,0 +1,189 @@ +from aie.dialects import arith, linalg +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + + +def my_matmul(M, K, n_cores): + m = 32 + k = 32 + word_size_in = 4 + word_size_out = 4 + + A_sz_in_i32s = M * K * word_size_in // 4 + B_sz_in_i32s = K * word_size_in // 4 + C_sz_in_bytes = M * word_size_out + C_sz_in_i32s = C_sz_in_bytes // 4 + C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores + + M_div_m = M // m + M_div_m_div_n_cores = M // (m * n_cores) + K_div_k = K // k + + K_in_i32s = K * word_size_in // 4 + k_in_i32s = k * word_size_in // 4 + m_in_i32s = m * word_size_in // 4 + m_x_k_in_i32s = m * k * word_size_in // 4 + m_x_K_in_i32s = m * K * word_size_in // 4 + + vectorized = True + + @device(AIEDevice.npu1_4col) + def device_body(): + memRef_inA_ty = T.memref(m * k, T.f32()) + memRef_inB_ty = T.memref(k, T.f32()) + memRef_outC_ty = T.memref(m, T.f32()) + memRef_A_ty = T.memref(m, k, T.f32()) + + # Tile declarations + ShimTile0 = tile(0, 0) + ShimTile1 = tile(1, 0) + ShimTile2 = tile(2, 0) + ShimTile3 = tile(3, 0) + ShimTiles = [ShimTile0, ShimTile1, ShimTile2, ShimTile3] + MemTile0 = tile(0, 1) + MemTile1 = tile(1, 1) + MemTile2 = tile(2, 1) + MemTile3 = tile(3, 1) + MemTiles = [MemTile0, MemTile1, MemTile2, MemTile3] + ComputeTile0 = tile(0, 2) + ComputeTile1 
= tile(1, 2) + ComputeTile2 = tile(2, 2) + ComputeTile3 = tile(3, 2) + cores = [ComputeTile0, ComputeTile1, ComputeTile2, ComputeTile3] + memA_fifo_names = ["memA0", "memA1", "memA2", "memA3"] + memA_fifos = {} + inA_fifo_names = ["inA0", "inA1", "inA2", "inA3"] + inA_fifos = {} + inB_fifo_names = ["inB"] + inB_fifos = {} + outC_fifo_names = ["outC0", "outC1", "outC2", "outC3"] + outC_fifos = {} + + # AIE-array data movement with object fifos + # Input A + for i in range(n_cores): + memA_fifos[memA_fifo_names[i]] = object_fifo( + memA_fifo_names[i], + ShimTiles[i], + MemTiles[i], + 2, + memRef_inA_ty, + ) + inA_fifos[inA_fifo_names[i]] = object_fifo( + inA_fifo_names[i], + MemTiles[i], + cores[i], + 2, + memRef_A_ty, + [ + (m, k), + (k, 1), + ], + ) + object_fifo_link( + memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]] + ) + + # Input B + inB_fifos[inB_fifo_names[0]] = object_fifo( + inB_fifo_names[0], + ShimTiles[1 % n_cores], + cores[0:n_cores], + 2, + memRef_inB_ty, + ) + + # Output C + for i in range(n_cores): + outC_fifos[outC_fifo_names[i]] = object_fifo( + outC_fifo_names[i], + cores[i], + ShimTiles[i], + 2, + memRef_outC_ty, + ) + + # Set up compute tiles + for i in range(n_cores): + # Compute tile i + @core(cores[i]) + def core_body(): + cf0 = arith.constant(T.f32(), 0.0) + for _ in for_(0xFFFFFFFF): + elem_out = outC_fifos[outC_fifo_names[i]].acquire( + ObjectFifoPort.Produce, + 1, + ) + linalg.fill(cf0, outs=[elem_out]) + + for _ in for_(K_div_k): + elem_in_a = inA_fifos[inA_fifo_names[i]].acquire( + ObjectFifoPort.Consume, + 1, + ) + elem_in_b = inB_fifos[inB_fifo_names[0]].acquire( + ObjectFifoPort.Consume, + 1, + ) + linalg.matvec(elem_in_a, elem_in_b, outs=[elem_out]) + inA_fifos[inA_fifo_names[i]].release( + ObjectFifoPort.Consume, + 1, + ) + inB_fifos[inB_fifo_names[0]].release( + ObjectFifoPort.Consume, + 1, + ) + yield_([]) + + outC_fifos[outC_fifo_names[i]].release( + ObjectFifoPort.Produce, + 1, + ) + yield_([]) + + # To/from 
AIE-array data movement + + @FuncOp.from_py_func( + T.memref(A_sz_in_i32s, T.i32()), + T.memref(B_sz_in_i32s, T.i32()), + T.memref(C_sz_in_i32s, T.i32()), + ) + def sequence(A, B, C): + npu_dma_memcpy_nd( + metadata=inB_fifo_names[0], + bd_id=2, + mem=B, + sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s], + strides=[0, 0, 0], + ) + for i in range(n_cores): + A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4 + C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4 + npu_dma_memcpy_nd( + metadata=memA_fifo_names[i], + bd_id=1, + mem=A, + offsets=[0, 0, 0, A_offset], + sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s], + strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s], + ) + npu_dma_memcpy_nd( + metadata=outC_fifo_names[i], + bd_id=0, + mem=C, + offsets=[0, 0, 0, C_offset], + sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s], + strides=[0, 0, 0], + ) + + for i in range(n_cores): + npu_sync(column=i, row=0, direction=0, channel=0) + + +def emit_module(M=64, K=64, n_cores=1): + with mlir_mod_ctx() as ctx: + my_matmul(M, K, n_cores) + return str(ctx.module) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir index e768e5bcc..30c4a72fd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir @@ -1,65 +1,86 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s +// RUN: not iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct --iree-hal-dump-executable-intermediates-to=%S/basic_matrix_multiplication_matrix_vector %s 2>&1 | FileCheck %s -module { - aie.device(npu1) { - func.func private @zero_scalar_bf16(memref<64x64xbf16>) - func.func private 
@zero_bf16(memref<64x64xbf16>) - func.func private @matmul_scalar_bf16_bf16(memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>) - func.func private @matmul_bf16_bf16(memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA(%tile_0_1 toStream [, , , ], {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA]() - aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memB(%tile_0_1 toStream [, , , ], {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inB] -> [@memB]() - aie.objectfifo @memC(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1 toStream [, , , ], {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memC] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @memC(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<64x64xbf16> - func.call @zero_bf16(%1) : (memref<64x64xbf16>) -> () - %c0_2 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c1_3 = arith.constant 1 : index - scf.for %arg2 = %c0_2 to %c4 step %c1_3 { - %2 = aie.objectfifo.acquire @memA(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<64x64xbf16> - %4 = aie.objectfifo.acquire @memB(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<64x64xbf16> 
- func.call @matmul_bf16_bf16(%3, %5, %1) : (memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>) -> () - aie.objectfifo.release @memA(Consume, 1) - aie.objectfifo.release @memB(Consume, 1) +module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} { + hal.executable private @dummy1 { + hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) { + hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + // this is load bearing... + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @memA(%tile_0_1 toStream [, , , ], {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@inA] -> [@memA]() + aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @memB(%tile_0_1 toStream [, , , ], {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@inB] -> [@memB]() + aie.objectfifo @memC(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @outC(%tile_0_1 toStream [, , , ], {%tile_0_0}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@memC] -> [@outC]() + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c4294967295 = arith.constant 4294967295 : index + %c1 = arith.constant 1 : index + scf.for %arg0 = %c0 to %c4294967295 step %c1 { + %c0_0 = arith.constant 0 : index + %c1_1 = arith.constant 1 : index + %c1_2 = arith.constant 1 : index + scf.for %arg1 = %c0_0 to %c1_1 step 
%c1_2 { + %0 = aie.objectfifo.acquire @memC(Produce, 1) : !aie.objectfifosubview> + %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<32x32xf32> + %cst = arith.constant 0.000000e+00 : f32 + linalg.fill ins(%cst : f32) outs(%1 : memref<32x32xf32>) + %c0_3 = arith.constant 0 : index + %c1_4 = arith.constant 1 : index + %c1_5 = arith.constant 1 : index + scf.for %arg2 = %c0_3 to %c1_4 step %c1_5 { + %2 = aie.objectfifo.acquire @memA(Consume, 1) : !aie.objectfifosubview> + %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x32xf32> + %4 = aie.objectfifo.acquire @memB(Consume, 1) : !aie.objectfifosubview> + %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<32x32xf32> + linalg.matmul {cast = #linalg.type_fn} ins(%3, %5 : memref<32x32xf32>, memref<32x32xf32>) outs(%1 : memref<32x32xf32>) + aie.objectfifo.release @memA(Consume, 1) + aie.objectfifo.release @memB(Consume, 1) + } + aie.objectfifo.release @memC(Produce, 1) + } + } + aie.end + } + func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 32, 32][1024, 32, 32]) {id = 0 : i64, metadata = @outC} : memref<1024xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 32][0, 32, 32]) {id = 1 : i64, metadata = @inA} : memref<1024xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 32, 32][32, 1024, 32]) {id = 2 : i64, metadata = @inB} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return } - aie.objectfifo.release @memC(Produce, 1) } } - aie.end - } {link_with = "mm.o"} - func.func @sequence(%arg0: memref<32768xi32>, %arg1: memref<32768xi32>, %arg2: memref<32768xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][4, 4, 64, 32][8192, 32, 128]) {id = 0 : i64, metadata = @outC} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 
0, %arg0[0, 0, 0, 0][4, 4, 64, 32][0, 32, 128]) {id = 1 : i64, metadata = @inA} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 2 : i64, metadata = @inB} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 8192][4, 4, 64, 32][0, 32, 128]) {id = 3 : i64, metadata = @inA} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 4 : i64, metadata = @inB} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][4, 4, 64, 32][0, 32, 128]) {id = 5 : i64, metadata = @inA} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 6 : i64, metadata = @inB} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 24576][4, 4, 64, 32][0, 32, 128]) {id = 7 : i64, metadata = @inA} : memref<32768xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 8 : i64, metadata = @inB} : memref<32768xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return } } -} + util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} { + // this is all gibberish just to hit serializeExecutable + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %element_type_i8 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major) + %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource{%c1} + %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource{%c1} => !stream.timepoint + %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c1}) { + stream.cmd.dispatch 
@dummy1::@amdaie_xclbin_fb::@dummy2 { + ro %arg2[%c0 for %c1] : !stream.resource{%c1} + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c1} + %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource{%c1} -> !hal.buffer_view + util.return %4 : !hal.buffer_view + } +} \ No newline at end of file diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py new file mode 100644 index 000000000..4607802c1 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py @@ -0,0 +1,183 @@ +from aie.dialects import arith, linalg +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.dialects.scf import * +from aie.extras.context import mlir_mod_ctx + + +def my_matmul(M, K, N): + m = 32 + k = 32 + n = 32 + r = 4 + s = 8 + t = 4 + word_size_in = 4 + word_size_out = 4 + + A_sz_in_i32s = M * K * word_size_in // 4 + B_sz_in_i32s = K * N * word_size_in // 4 + C_sz_in_bytes = M * N * word_size_out + C_sz_in_i32s = C_sz_in_bytes // 4 + + M_div_m = M // m + K_div_k = K // k + N_div_n = N // n + tiles = M_div_m * N_div_n + + # Matrix A: MxK, submatrices a: mxk + k_in_i32s = k * word_size_in // 4 + K_in_i32s = K * word_size_in // 4 + + # Matrix B: KxN, submatrices b: kxn + n_in_i32s = n * word_size_in // 4 + N_in_i32s = N * word_size_in // 4 + k_x_N_in_i32s = k * N * word_size_in // 4 + + # Output Matrix C: MxN + n_in_i32s_out = n * word_size_out // 4 + N_in_i32s_out = N * word_size_out // 4 + m_x_N_in_i32s_out = m * N * word_size_out // 4 + + @device(AIEDevice.npu1_1col) + def device_body(): + memref_a_ty = T.memref(m, k, T.f32()) + memref_b_ty = T.memref(k, n, T.f32()) + memref_c_ty = T.memref(m, n, T.f32()) + + # Tile declarations + shim_tile = tile(0, 
0) + mem_tile = tile(0, 1) + compute_tile2_col, compute_tile2_row = 0, 2 + compute_tile2 = tile(compute_tile2_col, compute_tile2_row) + + # AIE-array data movement with object fifos + # Input A + inA = object_fifo("inA", shim_tile, mem_tile, 2, memref_a_ty) + memA = object_fifo( + "memA", + mem_tile, + compute_tile2, + 2, + memref_a_ty, + [ + (m // r, r * k), + (k // s, s), + (r, k), + (s, 1), + ], + ) + object_fifo_link(inA, memA) + + # Input B + inB = object_fifo("inB", shim_tile, mem_tile, 2, memref_b_ty) + memB = object_fifo( + "memB", + mem_tile, + compute_tile2, + 2, + memref_b_ty, + [ + (k // s, s * n), + (n // t, t), + (s, n), + (t, 1), + ], + ) + object_fifo_link(inB, memB) + + # Output C + memC = object_fifo("memC", compute_tile2, mem_tile, 2, memref_c_ty) + outC = object_fifo( + "outC", + mem_tile, + shim_tile, + 2, + memref_c_ty, + [ + (m // r, r * n), + (r, t), + (n // t, r * t), + (t, 1), + ], + ) + object_fifo_link(memC, outC) + + # Compute tile 2 + @core(compute_tile2) + def core_body(): + for _ in for_(0xFFFFFFFF): + for _ in for_(tiles): + elem_out = memC.acquire(ObjectFifoPort.Produce, 1) + cf0 = arith.constant(T.f32(), 0.0) + linalg.fill(cf0, outs=[elem_out]) + for _ in for_(K_div_k): + elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1) + elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1) + linalg.matmul(elem_in_a, elem_in_b, outs=[elem_out]) + memA.release(ObjectFifoPort.Consume, 1) + memB.release(ObjectFifoPort.Consume, 1) + yield_([]) + + memC.release(ObjectFifoPort.Produce, 1) + yield_([]) + yield_([]) + + # To/from AIE-array data movement + + @FuncOp.from_py_func( + T.memref(A_sz_in_i32s, T.i32()), + T.memref(B_sz_in_i32s, T.i32()), + T.memref(C_sz_in_i32s, T.i32()), + ) + def sequence(A, B, C): + # only do 5 tile rows at a time before synchronizing, so we can reuse BDs + rows_per_block = 5 + for tile_row_block in range( + (M_div_m + rows_per_block - 1) // rows_per_block + ): + C_row_offset_in_i32s = ( + tile_row_block * rows_per_block * 
m * N * word_size_out // 4 + ) + num_tile_rows = min( + [rows_per_block, M_div_m - tile_row_block * rows_per_block] + ) + npu_dma_memcpy_nd( + metadata="outC", + bd_id=0, + mem=C, + offsets=[0, 0, 0, C_row_offset_in_i32s], + sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out], + strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out], + ) + for tile_row in range(num_tile_rows): + A_row_offset_in_i32s = ( + ((tile_row_block * rows_per_block) + tile_row) + * m + * K + * word_size_in + // 4 + ) + npu_dma_memcpy_nd( + metadata="inA", + bd_id=2 * tile_row + 1, + mem=A, + offsets=[0, 0, 0, A_row_offset_in_i32s], + sizes=[N_div_n, K_div_k, m, k_in_i32s], + strides=[0, k_in_i32s, K_in_i32s], + ) + npu_dma_memcpy_nd( + metadata="inB", + bd_id=2 * tile_row + 2, + mem=B, + sizes=[N_div_n, K_div_k, k, n_in_i32s], + strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s], + ) + + npu_sync(column=0, row=0, direction=0, channel=0) + + +def emit_module(M=64, K=64, N=64): + with mlir_mod_ctx() as ctx: + my_matmul(M, K, N) + return str(ctx.module) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir deleted file mode 100644 index bb2d63105..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in0(%tile_1_0, {%tile_1_2}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @out0(%tile_1_2, {%tile_1_0}, 4 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire 
@in0(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<128xi32> - %2 = aie.objectfifo.acquire @out0(Produce, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<128xi32> - %c0_0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c128 step %c1_1 { - %4 = memref.load %1[%arg1] : memref<128xi32> - %c1_i32 = arith.constant 1 : i32 - %5 = arith.addi %4, %c1_i32 : i32 - memref.store %5, %3[%arg1] : memref<128xi32> - } - aie.objectfifo.release @in0(Consume, 1) - aie.objectfifo.release @out0(Produce, 1) - } - aie.end - } - func.func @sequence(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<128xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 8, 16][1, 1, 128]) {id = 0 : i64, metadata = @out0} : memref<128xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 8, 16][1, 1, 128]) {id = 1 : i64, metadata = @in0} : memref<128xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir deleted file mode 100644 index 0445992fa..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - %tile_0_0 = aie.tile(0, 0) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@in] -> [@out]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = 
arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - } - aie.end - } - func.func @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<4096xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir deleted file mode 100644 index 6af63f1a7..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @passThroughLine(memref<128xui8>, memref<128xui8>, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<128xui8> - %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : 
!aie.objectfifosubview> -> memref<128xui8> - %c128_i32 = arith.constant 128 : i32 - func.call @passThroughLine(%3, %1, %c128_i32) : (memref<128xui8>, memref<128xui8>, i32) -> () - aie.objectfifo.release @in(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - aie.end - } {link_with = "passThrough.cc.o"} - func.func @sequence(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<128xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 128][0, 0, 0]) {id = 0 : i64, metadata = @in} : memref<128xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 128][0, 0, 0]) {id = 1 : i64, metadata = @out} : memref<128xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir deleted file mode 100644 index 63e0d47dd..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir +++ /dev/null @@ -1,112 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @exp_bf16_1024(memref<1024xbf16>, memref<1024xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA2(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA3(%tile_0_1, {%tile_0_5}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA0, @memA1, @memA2, @memA3]() - aie.objectfifo 
@memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC2(%tile_0_4, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC3(%tile_0_5, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memC0, @memC1, @memC2, @memC3] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA0(Consume, 1) - aie.objectfifo.release @memC0(Produce, 1) - } - } - aie.end - } {link_with = "kernels.a"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> 
memref<1024xbf16> - func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA1(Consume, 1) - aie.objectfifo.release @memC1(Produce, 1) - } - } - aie.end - } {link_with = "kernels.a"} - %core_0_4 = aie.core(%tile_0_4) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @memC2(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA2(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA2(Consume, 1) - aie.objectfifo.release @memC2(Produce, 1) - } - } - aie.end - } {link_with = "kernels.a"} - %core_0_5 = aie.core(%tile_0_5) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @memC3(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA3(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA3(Consume, 1) - aie.objectfifo.release @memC3(Produce, 1) - } - } - 
aie.end - } {link_with = "kernels.a"} - func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir deleted file mode 100644 index 915d6d290..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @reduce_add_vector(memref<1024xi32>, memref<1xi32>, i32) - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> - %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xi32> - %c1024_i32 = arith.constant 1024 : i32 - func.call @reduce_add_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> () - aie.objectfifo.release @in(Consume, 
1) - aie.objectfifo.release @out(Produce, 1) - } - aie.end - } {link_with = "reduce_add.cc.o"} - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir deleted file mode 100644 index bbda32a25..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @reduce_max_vector(memref<1024xi32>, memref<1xi32>, i32) - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> - %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xi32> - %c1024_i32 = arith.constant 1024 : i32 - func.call @reduce_max_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, 
i32) -> () - aie.objectfifo.release @in(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - aie.end - } {link_with = "reduce_max.cc.o"} - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir deleted file mode 100644 index 3327ade5c..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir +++ /dev/null @@ -1,34 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @reduce_min_vector(memref<1024xi32>, memref<1xi32>, i32) - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> - %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xi32> - %c1024_i32 = arith.constant 1024 : i32 - func.call @reduce_min_vector(%3, %1, 
%c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> () - aie.objectfifo.release @in(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - aie.end - } {link_with = "reduce_min.cc.o"} - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir deleted file mode 100644 index 739fc6539..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@in0] -> [@in1]() - aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@out1] -> [@out0]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : 
!aie.objectfifosubview> -> memref<32xi32> - %2 = aie.objectfifo.acquire @out1(Produce, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32xi32> - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %4 = memref.load %1[%arg1] : memref<32xi32> - %c1_i32 = arith.constant 1 : i32 - %5 = arith.addi %4, %c1_i32 : i32 - memref.store %5, %3[%arg1] : memref<32xi32> - } - aie.objectfifo.release @in1(Consume, 1) - aie.objectfifo.release @out1(Produce, 1) - } - aie.end - } - func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<1024xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir deleted file mode 100644 index 04d85bb39..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir +++ /dev/null @@ -1,45 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @vector_scalar_mul_int16_scalar(memref<128xi16>, memref<128xi16>, memref<1xi32>, i32) - func.func private @vector_scalar_mul_int16_vector(memref<128xi16>, memref<128xi16>, memref<1xi32>, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @infactor(%tile_0_0, {%tile_0_2}, 2 : 
i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @infactor(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> - %c0_0 = arith.constant 0 : index - %c4 = arith.constant 4 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c4 step %c1_1 { - %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<128xi16> - %4 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<128xi16> - %c128_i32 = arith.constant 128 : i32 - func.call @vector_scalar_mul_int16_vector(%5, %3, %1, %c128_i32) : (memref<128xi16>, memref<128xi16>, memref<1xi32>, i32) -> () - aie.objectfifo.release @in(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - aie.objectfifo.release @infactor(Consume, 1) - } - aie.end - } {link_with = "scale.o"} - func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<1xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 2 : i64, metadata = @infactor} : memref<1xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir deleted file mode 100644 index 28728ca2e..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir +++ /dev/null @@ -1,50 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in1(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @in2(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16xi32> - %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<16xi32> - %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<16xi32> - %c0_2 = arith.constant 0 : index - %c16_3 = arith.constant 16 : index - %c1_4 = arith.constant 1 : index - scf.for %arg2 = %c0_2 to %c16_3 step %c1_4 { - %6 = memref.load %1[%arg2] : memref<16xi32> - %7 = memref.load %3[%arg2] : memref<16xi32> - %8 = arith.addi %6, %7 : i32 - memref.store %8, %5[%arg2] : memref<16xi32> - } - aie.objectfifo.release @in1(Consume, 1) - 
aie.objectfifo.release @in2(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - } - aie.end - } - func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<256xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir deleted file mode 100644 index 571331d5c..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir +++ /dev/null @@ -1,50 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - %tile_1_0 = aie.tile(1, 0) - %tile_1_2 = aie.tile(1, 2) - aie.objectfifo @in1(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @in2(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %c0_0 = arith.constant 0 : index - %c16 = arith.constant 16 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c16 step %c1_1 { - %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : 
!aie.objectfifosubview> -> memref<16xi32> - %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<16xi32> - %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<16xi32> - %c0_2 = arith.constant 0 : index - %c16_3 = arith.constant 16 : index - %c1_4 = arith.constant 1 : index - scf.for %arg2 = %c0_2 to %c16_3 step %c1_4 { - %6 = memref.load %1[%arg2] : memref<16xi32> - %7 = memref.load %3[%arg2] : memref<16xi32> - %8 = arith.muli %6, %7 : i32 - memref.store %8, %5[%arg2] : memref<16xi32> - } - aie.objectfifo.release @in1(Consume, 1) - aie.objectfifo.release @in2(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - } - aie.end - } - func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in1} : memref<256xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<256xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py new file mode 100644 index 000000000..89c2502aa --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py @@ -0,0 +1,304 @@ +import os +from pathlib import Path + +import numpy as np +import basic_matrix_multiplication_matrix_vector +import basic_matrix_multiplication_single_core + +os.environ["VITIS"] = 
"/opt/tools/Xilinx/Vitis/2023.2" + +from iree.compiler import compile_file + +# don't forget LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:/usr/lib/x86_64-linux-gnu +RUN = True +if RUN: + from filelock import FileLock + from xaiepy.xrt import XCLBin + + +TEMPLATE = """ +module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} { + hal.executable private @dummy1 { + hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) { + hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.MODULE + } + } + util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} { + // this is all gibberish just to hit serializeExecutable + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %element_type_i8 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major) + %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource{%c1} + %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource{%c1} => !stream.timepoint + + %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c1}) { + stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 { + ro %arg2[%c0 for %c1] : !stream.resource{%c1} + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c1} + %4 = stream.tensor.export %3 : 
tensor<1024x1024xi32> in !stream.resource{%c1} -> !hal.buffer_view + util.return %4 : !hal.buffer_view + } +} +""" + + +def compile(workdir, test): + compile_file( + str(workdir / (test + ".mlir")), + target_backends=["amd-aie-direct"], + extra_args=[ + "--compile-mode=hal-executable", + f"--iree-hal-dump-executable-intermediates-to={workdir}", + ], + ) + + +def test_matrix_vector_32_1_core(): + M = K = 32 + TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_32_1_core" + WORKDIR = Path(__file__).parent.absolute() / TEST + if not WORKDIR.exists(): + WORKDIR.mkdir(parents=True) + with open(WORKDIR / f"{TEST}.mlir", "w") as f: + f.write( + TEMPLATE.replace( + "MODULE", + basic_matrix_multiplication_matrix_vector.emit_module(M, K), + ) + ) + + NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt" + XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin" + KERNEL_NAME = "dummy2" + + compile(WORKDIR, TEST) + + with open(NPU_INSTS_FP, "r") as f: + npu_insts = list(map(lambda n: int(n, 16), f.readlines())) + + if RUN: + with FileLock("/tmp/npu.lock"): + xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME) + views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32) + + xclbin.load_npu_instructions(npu_insts) + + A = np.random.randint(0, 10, (M, K)).astype(np.float32) + B = np.random.randint(0, 10, (K,)).astype(np.float32) + C = np.zeros((M,)).astype(np.float32) + + wraps = list(map(np.asarray, views)) + np.copyto(wraps[0], A, casting="no") + np.copyto(wraps[1], B, casting="no") + np.copyto(wraps[2], C, casting="no") + + xclbin.sync_buffers_to_device() + xclbin.run() + print("Running kernel") + xclbin.wait(30) + xclbin.sync_buffers_from_device() + + assert np.allclose(A @ B, wraps[2]) + print(wraps[2]) + + +def test_matrix_vector_64_1_core(): + M = K = 64 + TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_64_1_core" + WORKDIR = Path(__file__).parent.absolute() / TEST + if not 
WORKDIR.exists(): + WORKDIR.mkdir(parents=True) + with open(WORKDIR / f"{TEST}.mlir", "w") as f: + f.write( + TEMPLATE.replace( + "MODULE", basic_matrix_multiplication_matrix_vector.emit_module(M, K) + ) + ) + + NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt" + XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin" + KERNEL_NAME = "dummy2" + + compile(WORKDIR, TEST) + + with open(NPU_INSTS_FP, "r") as f: + npu_insts = list(map(lambda n: int(n, 16), f.readlines())) + + if RUN: + with FileLock("/tmp/npu.lock"): + xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME) + views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32) + + xclbin.load_npu_instructions(npu_insts) + + A = np.random.randint(0, 10, (M, K)).astype(np.float32) + B = np.random.randint(0, 10, (K,)).astype(np.float32) + C = np.zeros((M,)).astype(np.float32) + + wraps = list(map(np.asarray, views)) + np.copyto(wraps[0], A, casting="no") + np.copyto(wraps[1], B, casting="no") + np.copyto(wraps[2], C, casting="no") + + xclbin.sync_buffers_to_device() + xclbin.run() + print("Running kernel") + xclbin.wait(30) + xclbin.sync_buffers_from_device() + + assert np.allclose(A @ B, wraps[2]) + print(wraps[2]) + + +def test_matrix_vector_2_cores(): + M = K = 64 + TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_64_2_cores" + WORKDIR = Path(__file__).parent.absolute() / TEST + if not WORKDIR.exists(): + WORKDIR.mkdir(parents=True) + with open(WORKDIR / f"{TEST}.mlir", "w") as f: + f.write( + TEMPLATE.replace( + "MODULE", + basic_matrix_multiplication_matrix_vector.emit_module(M, K, n_cores=2), + ) + ) + + NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt" + XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin" + KERNEL_NAME = "dummy2" + + compile(WORKDIR, TEST) + + with open(NPU_INSTS_FP, "r") as f: + npu_insts = list(map(lambda n: 
int(n, 16), f.readlines())) + + if RUN: + with FileLock("/tmp/npu.lock"): + xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME) + views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32) + + xclbin.load_npu_instructions(npu_insts) + + A = np.random.randint(0, 10, (M, K)).astype(np.float32) + B = np.random.randint(0, 10, (K,)).astype(np.float32) + C = np.zeros((M,)).astype(np.float32) + + wraps = list(map(np.asarray, views)) + np.copyto(wraps[0], A, casting="no") + np.copyto(wraps[1], B, casting="no") + np.copyto(wraps[2], C, casting="no") + + xclbin.sync_buffers_to_device() + xclbin.run() + print("Running kernel") + xclbin.wait(30) + xclbin.sync_buffers_from_device() + + assert np.allclose(A @ B, wraps[2]) + print(wraps[2]) + + +def test_matmul_32(): + M = K = N = 32 + TEST = basic_matrix_multiplication_single_core.__name__ + "_32" + WORKDIR = Path(__file__).parent.absolute() / TEST + if not WORKDIR.exists(): + WORKDIR.mkdir(parents=True) + with open(WORKDIR / f"{TEST}.mlir", "w") as f: + f.write( + TEMPLATE.replace( + "MODULE", basic_matrix_multiplication_single_core.emit_module(M, K, N) + ) + ) + NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt" + XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin" + KERNEL_NAME = "dummy2" + + compile(WORKDIR, TEST) + + with open(NPU_INSTS_FP, "r") as f: + npu_insts = list(map(lambda n: int(n, 16), f.readlines())) + + if RUN: + with FileLock("/tmp/npu.lock"): + xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME) + views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.float32) + + xclbin.load_npu_instructions(npu_insts) + + # the stupid upstream example isn't correct for real numbers + A = np.ones((M, K)).astype(np.float32) + B = 2 * np.ones((K, N)).astype(np.float32) + C = np.zeros((M, N)).astype(np.float32) + + wraps = list(map(np.asarray, views)) + np.copyto(wraps[0], A, casting="no") + np.copyto(wraps[1], B, casting="no") + np.copyto(wraps[2], C, 
casting="no") + + xclbin.sync_buffers_to_device() + xclbin.run() + print("Running kernel") + xclbin.wait(30) + xclbin.sync_buffers_from_device() + + assert np.allclose(A @ B, wraps[2]) + print(wraps[2]) + + +def test_matmul_64(): + M = K = N = 64 + TEST = basic_matrix_multiplication_single_core.__name__ + "_64" + WORKDIR = Path(__file__).parent.absolute() / TEST + if not WORKDIR.exists(): + WORKDIR.mkdir(parents=True) + with open(WORKDIR / f"{TEST}.mlir", "w") as f: + f.write( + TEMPLATE.replace( + "MODULE", basic_matrix_multiplication_single_core.emit_module(M, K, N) + ) + ) + NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt" + XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin" + KERNEL_NAME = "dummy2" + + compile(WORKDIR, TEST) + + with open(NPU_INSTS_FP, "r") as f: + npu_insts = list(map(lambda n: int(n, 16), f.readlines())) + + if RUN: + with FileLock("/tmp/npu.lock"): + xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME) + views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.float32) + + xclbin.load_npu_instructions(npu_insts) + + # the stupid upstream example isn't correct for real numbers + A = np.ones((M, K)).astype(np.float32) + B = 2 * np.ones((K, N)).astype(np.float32) + C = np.zeros((M, N)).astype(np.float32) + + wraps = list(map(np.asarray, views)) + np.copyto(wraps[0], A, casting="no") + np.copyto(wraps[1], B, casting="no") + np.copyto(wraps[2], C, casting="no") + + xclbin.sync_buffers_to_device() + xclbin.run() + print("Running kernel") + xclbin.wait(30) + xclbin.sync_buffers_from_device() + + assert np.allclose(A @ B, wraps[2]) + print(wraps[2]) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir deleted file mode 100644 index 2dfaba8ba..000000000 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir +++ /dev/null @@ -1,236 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @conv2dk1_i8(memref<32x1x256xi8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) - func.func private @conv2dk3_ui8(memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) - func.func private @conv2dk1_skip_i8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xi8>, i32, i32, i32, i32, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - %rtpComputeTile2 = aie.buffer(%tile_0_2) {sym_name = "rtpComputeTile2"} : memref<16xi32> - %rtpComputeTile3 = aie.buffer(%tile_0_3) {sym_name = "rtpComputeTile3"} : memref<16xi32> - %rtpComputeTile4 = aie.buffer(%tile_0_4) {sym_name = "rtpComputeTile4"} : memref<16xi32> - %rtpComputeTile5 = aie.buffer(%tile_0_5) {sym_name = "rtpComputeTile5"} : memref<16xi32> - aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_2, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo> - aie.objectfifo @skip_buf(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inOF_act_L3L2] -> [@skip_buf]() - aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_1}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_00(%tile_0_1, {%tile_0_2}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_01(%tile_0_1, {%tile_0_3, %tile_0_5}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_02(%tile_0_1, {%tile_0_4}, 1 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inOF_wts_0_L3L2] -> [@wts_buf_00, @wts_buf_01, @wts_buf_02]() - aie.objectfifo @act_2_3_5(%tile_0_2, {%tile_0_3, %tile_0_5}, [2 : i32, 4 : i32, 4 : i32]) : !aie.objectfifo> - 
aie.objectfifo @act_3_4(%tile_0_3, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act_5_4(%tile_0_5, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outOFL2L3(%tile_0_4, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_00(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile2[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @inOF_act_L3L2(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<32x1x256xi8> - %5 = aie.objectfifo.acquire @act_2_3_5(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %c32_i32 = arith.constant 32 : i32 - %c256_i32 = arith.constant 256 : i32 - %c64_i32 = arith.constant 64 : i32 - func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xi8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @inOF_act_L3L2(Consume, 1) - aie.objectfifo.release @act_2_3_5(Produce, 1) - } - aie.objectfifo.release @wts_buf_00(Consume, 1) - } - aie.end - } {link_with = "conv2dk1.o"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : 
!aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c64_i32_0 = arith.constant 64 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c0_i32_2 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c64_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_3_4(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act_2_3_5(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c64_i32_14 = arith.constant 64 : i32 - %c3_i32_15 = arith.constant 3 
: i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c0_i32_18 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c64_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_2_3_5(Consume, 1) - aie.objectfifo.release @act_3_4(Produce, 1) - } - %7 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c64_i32_7 = arith.constant 64 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c0_i32_11 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c64_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_2_3_5(Consume, 2) - aie.objectfifo.release @act_3_4(Produce, 1) - aie.objectfifo.release @wts_buf_01(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_0_5 = aie.core(%tile_0_5) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { 
- %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c64_i32_0 = arith.constant 64 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c32_i32_2 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c64_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_5_4(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act_2_3_5(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - 
%c64_i32_14 = arith.constant 64 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c32_i32_18 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c64_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_2_3_5(Consume, 1) - aie.objectfifo.release @act_5_4(Produce, 1) - } - %7 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c64_i32_7 = arith.constant 64 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c32_i32_11 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c64_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_2_3_5(Consume, 2) - aie.objectfifo.release @act_5_4(Produce, 1) - aie.objectfifo.release @wts_buf_01(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_0_4 = aie.core(%tile_0_4) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = 
arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_02(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile4[%c0_0] : memref<16xi32> - %c1_1 = arith.constant 1 : index - %3 = memref.load %rtpComputeTile4[%c1_1] : memref<16xi32> - %c0_2 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_3 = arith.constant 1 : index - scf.for %arg1 = %c0_2 to %c32 step %c1_3 { - %4 = aie.objectfifo.acquire @act_3_4(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %6 = aie.objectfifo.acquire @act_5_4(Consume, 1) : !aie.objectfifosubview> - %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %8 = aie.objectfifo.acquire @skip_buf(Consume, 1) : !aie.objectfifosubview> - %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<32x1x256xi8> - %10 = aie.objectfifo.acquire @outOFL2L3(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c256_i32 = arith.constant 256 : i32 - func.call @conv2dk1_skip_i8(%5, %7, %1, %11, %9, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xi8>, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @outOFL2L3(Produce, 1) - aie.objectfifo.release @act_3_4(Consume, 1) - aie.objectfifo.release @act_5_4(Consume, 1) - aie.objectfifo.release @skip_buf(Consume, 1) - } - aie.objectfifo.release @wts_buf_02(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_skip.o"} - func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<17408xi32>, %arg2: 
memref<65536xi32>) { - aiex.npu.rtp_write(0, 2, 0, 1) {buffer_sym_name = "rtpComputeTile2"} - aiex.npu.rtp_write(0, 3, 0, 1) {buffer_sym_name = "rtpComputeTile3"} - aiex.npu.rtp_write(0, 5, 0, 1) {buffer_sym_name = "rtpComputeTile5"} - aiex.npu.rtp_write(0, 4, 0, 1) {buffer_sym_name = "rtpComputeTile4"} - aiex.npu.rtp_write(0, 4, 1, 0) {buffer_sym_name = "rtpComputeTile4"} - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<17408xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir deleted file mode 100644 index 8eccb2867..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir +++ /dev/null @@ -1,55 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @conv2dk1_i8(memref<2048xi8>, memref<4096xi8>, memref<2048xi8>, i32, i32, i32, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act_L2_02(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inOF_act_L3L2] -> [@act_L2_02]() - aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_2}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @out_02_L2(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo 
@outOFL2L3(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@out_02_L2] -> [@outOFL2L3]() - %rtp2 = aie.buffer(%tile_0_2) {sym_name = "rtp2"} : memref<16xi32> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @inOF_wts_0_L3L2(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<4096xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtp2[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @act_L2_02(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<2048xi8> - %5 = aie.objectfifo.acquire @out_02_L2(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<2048xi8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c64_i32_3 = arith.constant 64 : i32 - func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<2048xi8>, memref<4096xi8>, memref<2048xi8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_L2_02(Consume, 1) - aie.objectfifo.release @out_02_L2(Produce, 1) - } - aie.objectfifo.release @inOF_wts_0_L3L2(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_i8.o"} - func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<1024xi32>, %arg2: memref<16384xi32>) { - aiex.npu.rtp_write(0, 2, 0, 10) {buffer_sym_name = "rtp2"} - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 2 : i64, metadata = 
@outOFL2L3} : memref<16384xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @inOF_wts_0_L3L2} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir deleted file mode 100644 index 701a268f0..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir +++ /dev/null @@ -1,55 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @conv2dk1_i8(memref<2048xi8>, memref<4096xi8>, memref<2048xui8>, i32, i32, i32, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act_L2_02(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inOF_act_L3L2] -> [@act_L2_02]() - aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_2}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @out_02_L2(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outOFL2L3(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@out_02_L2] -> [@outOFL2L3]() - %rtp2 = aie.buffer(%tile_0_2) {sym_name = "rtp2"} : memref<16xi32> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %0 = aie.objectfifo.acquire @inOF_wts_0_L3L2(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<4096xi8> - %c0_0 = arith.constant 0 : 
index - %2 = memref.load %rtp2[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @act_L2_02(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<2048xi8> - %5 = aie.objectfifo.acquire @out_02_L2(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<2048xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c64_i32_3 = arith.constant 64 : i32 - func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<2048xi8>, memref<4096xi8>, memref<2048xui8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @act_L2_02(Consume, 1) - aie.objectfifo.release @out_02_L2(Produce, 1) - } - aie.objectfifo.release @inOF_wts_0_L3L2(Consume, 1) - } - aie.end - } {link_with = "conv2dk1.o"} - func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<1024xi32>, %arg2: memref<16384xi32>) { - aiex.npu.rtp_write(0, 2, 0, 1) {buffer_sym_name = "rtp2"} - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<16384xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @inOF_wts_0_L3L2} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir deleted file mode 100644 index 4e148353c..000000000 --- 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir +++ /dev/null @@ -1,78 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @eltwise_add_bf16_scalar(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) - func.func private @eltwise_add_bf16_vector(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA0, @memA1]() - aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memB0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memB1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inB] -> [@memB0, @memB1]() - aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memC0, @memC1] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access 
%2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %4 = aie.objectfifo.acquire @memB0(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @eltwise_add_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA0(Consume, 1) - aie.objectfifo.release @memB0(Consume, 1) - aie.objectfifo.release @memC0(Produce, 1) - } - } - aie.end - } {link_with = "add.o"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %4 = aie.objectfifo.acquire @memB1(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @eltwise_add_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA1(Consume, 1) - aie.objectfifo.release @memB1(Consume, 1) - aie.objectfifo.release @memC1(Produce, 1) - } - } - aie.end - } {link_with = "add.o"} - func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>, %arg2: memref<65536xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : 
memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<65536xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir deleted file mode 100644 index ba05654a9..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir +++ /dev/null @@ -1,78 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @eltwise_mul_bf16_scalar(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) - func.func private @eltwise_mul_bf16_vector(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA0, @memA1]() - aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memB0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memB1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inB] -> [@memB0, @memB1]() - aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memC0, @memC1] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - 
%c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %4 = aie.objectfifo.acquire @memB0(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @eltwise_mul_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA0(Consume, 1) - aie.objectfifo.release @memB0(Consume, 1) - aie.objectfifo.release @memC0(Produce, 1) - } - } - aie.end - } {link_with = "mul.o"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %4 = aie.objectfifo.acquire @memB1(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @eltwise_mul_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, 
memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA1(Consume, 1) - aie.objectfifo.release @memB1(Consume, 1) - aie.objectfifo.release @memC1(Produce, 1) - } - } - aie.end - } {link_with = "mul.o"} - func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>, %arg2: memref<65536xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<65536xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir deleted file mode 100644 index c4ffce536..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir +++ /dev/null @@ -1,66 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @bf16_relu(memref<1024xbf16>, memref<1024xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA0, @memA1]() - aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - 
aie.objectfifo.link [@memC0, @memC1] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @bf16_relu(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA0(Consume, 1) - aie.objectfifo.release @memC0(Produce, 1) - } - } - aie.end - } {link_with = "relu.o"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c32 step %c1_1 { - %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @bf16_relu(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA1(Consume, 1) - aie.objectfifo.release @memC1(Produce, 1) - } - } - aie.end - } {link_with = "relu.o"} - func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : 
i64, metadata = @outC} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir deleted file mode 100644 index b0619f1ee..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir +++ /dev/null @@ -1,664 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @conv2dk1_i8(memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) - func.func private @conv2dk3_ui8(memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) - func.func private @conv2dk1_skip_init_i8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<32768xi8>, memref<32x1x256xui8>, memref<32x1x64xi8>, i32, i32, i32, i32, i32, i32, i32) - func.func private @conv2dk1_ui8(memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) - func.func private @conv2dk1_skip_ui8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - %tile_1_0 = aie.tile(1, 0) - %tile_1_1 = aie.tile(1, 1) - %tile_1_2 = aie.tile(1, 2) - %tile_1_3 = aie.tile(1, 3) - %tile_1_4 = aie.tile(1, 4) - %tile_1_5 = aie.tile(1, 5) - %tile_2_0 = aie.tile(2, 0) - %tile_2_1 = aie.tile(2, 1) - %tile_2_2 = 
aie.tile(2, 2) - %tile_2_3 = aie.tile(2, 3) - %tile_2_4 = aie.tile(2, 4) - %tile_2_5 = aie.tile(2, 5) - %rtpComputeTile02 = aie.buffer(%tile_0_2) {sym_name = "rtpComputeTile02"} : memref<16xi32> - %rtpComputeTile03 = aie.buffer(%tile_0_3) {sym_name = "rtpComputeTile03"} : memref<16xi32> - %rtpComputeTile04 = aie.buffer(%tile_0_5) {sym_name = "rtpComputeTile04"} : memref<16xi32> - %rtpComputeTile05 = aie.buffer(%tile_0_4) {sym_name = "rtpComputeTile05"} : memref<16xi32> - %rtpComputeTile12 = aie.buffer(%tile_1_2) {sym_name = "rtpComputeTile12"} : memref<16xi32> - %rtpComputeTile13 = aie.buffer(%tile_1_3) {sym_name = "rtpComputeTile13"} : memref<16xi32> - %rtpComputeTile14 = aie.buffer(%tile_1_4) {sym_name = "rtpComputeTile14"} : memref<16xi32> - %rtpComputeTile15 = aie.buffer(%tile_1_5) {sym_name = "rtpComputeTile15"} : memref<16xi32> - %rtpComputeTile22 = aie.buffer(%tile_2_2) {sym_name = "rtpComputeTile22"} : memref<16xi32> - %rtpComputeTile23 = aie.buffer(%tile_2_3) {sym_name = "rtpComputeTile23"} : memref<16xi32> - %rtpComputeTile24 = aie.buffer(%tile_2_4) {sym_name = "rtpComputeTile24"} : memref<16xi32> - %rtpComputeTile25 = aie.buffer(%tile_2_5) {sym_name = "rtpComputeTile25"} : memref<16xi32> - aie.objectfifo @act1_00_02_01(%tile_0_0, {%tile_0_2, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo> - aie.objectfifo @skip_0(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@act1_00_02_01] -> [@skip_0]() - aie.objectfifo @act1_04_15_11(%tile_0_4, {%tile_1_5, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo> - aie.objectfifo @skip_1(%tile_0_1, {%tile_1_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@act1_04_15_11] -> [@skip_1]() - aie.objectfifo @act1_13_22_21(%tile_1_3, {%tile_2_2, %tile_2_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo> - aie.objectfifo @skip_2(%tile_2_1, {%tile_2_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@act1_13_22_21] -> [@skip_2]() - aie.objectfifo @act2_02_03_05(%tile_0_2, 
{%tile_0_3, %tile_0_5}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @act3_03_04(%tile_0_3, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act3_05_04(%tile_0_5, {%tile_0_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act2_15_12_14(%tile_1_5, {%tile_1_2, %tile_1_4}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @act3_14_13(%tile_1_4, {%tile_1_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act3_12_13(%tile_1_2, {%tile_1_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act2_22_23_25(%tile_2_2, {%tile_2_3, %tile_2_5}, 4 : i32) : !aie.objectfifo> - aie.objectfifo @act3_23_24(%tile_2_3, {%tile_2_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @act3_25_24(%tile_2_5, {%tile_2_4}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @wts_0_L3L2(%tile_0_0, {%tile_0_1}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_00(%tile_0_1, {%tile_0_2}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_01(%tile_0_1, {%tile_0_3, %tile_0_5}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_02(%tile_0_1, {%tile_0_4}, 1 : i32) : !aie.objectfifo> - aie.objectfifo.link [@wts_0_L3L2] -> [@wts_buf_00, @wts_buf_01, @wts_buf_02]() - aie.objectfifo @wts_1_L3L2(%tile_1_0, {%tile_1_1}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_10(%tile_1_1, {%tile_1_5}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_11(%tile_1_1, {%tile_1_2, %tile_1_4}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_12(%tile_1_1, {%tile_1_3}, 1 : i32) : !aie.objectfifo> - aie.objectfifo.link [@wts_1_L3L2] -> [@wts_buf_10, @wts_buf_11, @wts_buf_12]() - aie.objectfifo @wts_2_L3L2(%tile_2_0, {%tile_2_1}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_20(%tile_2_1, {%tile_2_2}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_21(%tile_2_1, {%tile_2_3, %tile_2_5}, 1 : i32) : !aie.objectfifo> - aie.objectfifo @wts_buf_22(%tile_2_1, {%tile_2_4}, 1 : i32) : !aie.objectfifo> - aie.objectfifo.link [@wts_2_L3L2] -> [@wts_buf_20, @wts_buf_21, @wts_buf_22]() - aie.objectfifo 
@outOFL2L3(%tile_2_4, {%tile_1_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_00(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<4096xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile02[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @act1_00_02_01(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<32x1x64xi8> - %5 = aie.objectfifo.acquire @act2_02_03_05(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c64_i32_3 = arith.constant 64 : i32 - func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @act1_00_02_01(Consume, 1) - aie.objectfifo.release @act2_02_03_05(Produce, 1) - } - aie.objectfifo.release @wts_buf_00(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_i8.o"} - %core_1_5 = aie.core(%tile_1_5) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_10(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load 
%rtpComputeTile15[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @act1_04_15_11(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %5 = aie.objectfifo.acquire @act2_15_12_14(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %c32_i32 = arith.constant 32 : i32 - %c256_i32 = arith.constant 256 : i32 - %c64_i32 = arith.constant 64 : i32 - func.call @conv2dk1_ui8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @act1_04_15_11(Consume, 1) - aie.objectfifo.release @act2_15_12_14(Produce, 1) - } - aie.objectfifo.release @wts_buf_10(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_ui8.o"} - %core_2_2 = aie.core(%tile_2_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_20(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile22[%c0_0] : memref<16xi32> - %c0_1 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_2 = arith.constant 1 : index - scf.for %arg1 = %c0_1 to %c32 step %c1_2 { - %3 = aie.objectfifo.acquire @act1_13_22_21(Consume, 1) : !aie.objectfifosubview> - %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %5 = aie.objectfifo.acquire @act2_22_23_25(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> 
memref<32x1x64xui8> - %c32_i32 = arith.constant 32 : i32 - %c256_i32 = arith.constant 256 : i32 - %c64_i32 = arith.constant 64 : i32 - func.call @conv2dk1_ui8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> () - aie.objectfifo.release @act1_13_22_21(Consume, 1) - aie.objectfifo.release @act2_22_23_25(Produce, 1) - } - aie.objectfifo.release @wts_buf_20(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_ui8.o"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c0_i32_2 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_03_04(Produce, 1) - %c0_3 = arith.constant 0 : index - 
%c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_02_03_05(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c0_i32_18 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_02_03_05(Consume, 1) - aie.objectfifo.release @act3_03_04(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 
- %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c0_i32_11 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_02_03_05(Consume, 2) - aie.objectfifo.release @act3_03_04(Produce, 1) - aie.objectfifo.release @wts_buf_01(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_1_4 = aie.core(%tile_1_4) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c0_i32_2 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, 
i32) -> () - aie.objectfifo.release @act3_14_13(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_15_12_14(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c0_i32_18 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_15_12_14(Consume, 1) - aie.objectfifo.release @act3_14_13(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = 
arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c0_i32_11 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_15_12_14(Consume, 2) - aie.objectfifo.release @act3_14_13(Produce, 1) - aie.objectfifo.release @wts_buf_11(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_2_3 = aie.core(%tile_2_3) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c0_i32_2 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, 
memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_23_24(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_22_23_25(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c0_i32_18 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_22_23_25(Consume, 1) - aie.objectfifo.release @act3_23_24(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> 
- %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c0_i32_11 = arith.constant 0 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_22_23_25(Consume, 2) - aie.objectfifo.release @act3_23_24(Produce, 1) - aie.objectfifo.release @wts_buf_21(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_0_5 = aie.core(%tile_0_5) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c32_i32_2 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, 
%c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_05_04(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_02_03_05(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c32_i32_18 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_02_03_05(Consume, 1) - aie.objectfifo.release @act3_05_04(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : 
!aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c32_i32_11 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_02_03_05(Consume, 2) - aie.objectfifo.release @act3_05_04(Produce, 1) - aie.objectfifo.release @wts_buf_01(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_1_2 = aie.core(%tile_1_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 - %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : 
i32 - %c32_i32_2 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_12_13(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_15_12_14(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c32_i32_18 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_15_12_14(Consume, 1) - aie.objectfifo.release @act3_12_13(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : 
!aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c32_i32_11 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_15_12_14(Consume, 2) - aie.objectfifo.release @act3_12_13(Produce, 1) - aie.objectfifo.release @wts_buf_11(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_2_5 = aie.core(%tile_2_5) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<36864xi8> - %2 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %5 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c32_i32_0 = arith.constant 32 : i32 - %c3_i32 = arith.constant 3 : i32 
- %c3_i32_1 = arith.constant 3 : i32 - %c0_i32 = arith.constant 0 : i32 - %c11_i32 = arith.constant 11 : i32 - %c32_i32_2 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_25_24(Produce, 1) - %c0_3 = arith.constant 0 : index - %c30 = arith.constant 30 : index - %c1_4 = arith.constant 1 : index - scf.for %arg1 = %c0_3 to %c30 step %c1_4 { - %12 = aie.objectfifo.acquire @act2_22_23_25(Consume, 3) : !aie.objectfifosubview> - %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %16 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview> - %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_12 = arith.constant 32 : i32 - %c64_i32_13 = arith.constant 64 : i32 - %c32_i32_14 = arith.constant 32 : i32 - %c3_i32_15 = arith.constant 3 : i32 - %c3_i32_16 = arith.constant 3 : i32 - %c1_i32 = arith.constant 1 : i32 - %c11_i32_17 = arith.constant 11 : i32 - %c32_i32_18 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_22_23_25(Consume, 1) - aie.objectfifo.release @act3_25_24(Produce, 1) - } - %7 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview> - %8 = 
aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview> -> memref<32x1x64xui8> - %10 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %c32_i32_5 = arith.constant 32 : i32 - %c64_i32_6 = arith.constant 64 : i32 - %c32_i32_7 = arith.constant 32 : i32 - %c3_i32_8 = arith.constant 3 : i32 - %c3_i32_9 = arith.constant 3 : i32 - %c2_i32 = arith.constant 2 : i32 - %c11_i32_10 = arith.constant 11 : i32 - %c32_i32_11 = arith.constant 32 : i32 - func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act2_22_23_25(Consume, 2) - aie.objectfifo.release @act3_25_24(Produce, 1) - aie.objectfifo.release @wts_buf_21(Consume, 1) - } - aie.end - } {link_with = "conv2dk3.o"} - %core_0_4 = aie.core(%tile_0_4) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_02(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<32768xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile05[%c0_0] : memref<16xi32> - %c1_1 = arith.constant 1 : index - %3 = memref.load %rtpComputeTile05[%c1_1] : memref<16xi32> - %c2 = arith.constant 2 : index - %4 = memref.load %rtpComputeTile05[%c2] : memref<16xi32> - %c0_2 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_3 = arith.constant 1 : index - scf.for %arg1 = %c0_2 to %c32 step %c1_3 { - %5 = 
aie.objectfifo.acquire @act3_03_04(Consume, 1) : !aie.objectfifosubview> - %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %7 = aie.objectfifo.acquire @act3_05_04(Consume, 1) : !aie.objectfifosubview> - %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %9 = aie.objectfifo.acquire @act1_04_15_11(Produce, 1) : !aie.objectfifosubview> - %10 = aie.objectfifo.subview.access %9[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %11 = aie.objectfifo.acquire @skip_0(Consume, 1) : !aie.objectfifosubview> - %12 = aie.objectfifo.subview.access %11[0] : !aie.objectfifosubview> -> memref<32x1x64xi8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c256_i32 = arith.constant 256 : i32 - %c64_i32_4 = arith.constant 64 : i32 - func.call @conv2dk1_skip_init_i8(%6, %8, %1, %10, %12, %c32_i32, %c64_i32, %c256_i32, %c64_i32_4, %2, %3, %4) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<32768xi8>, memref<32x1x256xui8>, memref<32x1x64xi8>, i32, i32, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_03_04(Consume, 1) - aie.objectfifo.release @act3_05_04(Consume, 1) - aie.objectfifo.release @act1_04_15_11(Produce, 1) - aie.objectfifo.release @skip_0(Consume, 1) - } - aie.objectfifo.release @wts_buf_02(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_skip_init.o"} - %core_1_3 = aie.core(%tile_1_3) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_12(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile13[%c0_0] : memref<16xi32> - %c1_1 = arith.constant 1 : index - %3 = memref.load %rtpComputeTile13[%c1_1] : memref<16xi32> - %c0_2 = 
arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_3 = arith.constant 1 : index - scf.for %arg1 = %c0_2 to %c32 step %c1_3 { - %4 = aie.objectfifo.acquire @act3_14_13(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %6 = aie.objectfifo.acquire @act3_12_13(Consume, 1) : !aie.objectfifosubview> - %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %8 = aie.objectfifo.acquire @act1_13_22_21(Produce, 1) : !aie.objectfifosubview> - %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %10 = aie.objectfifo.acquire @skip_1(Consume, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c256_i32 = arith.constant 256 : i32 - func.call @conv2dk1_skip_ui8(%5, %7, %1, %9, %11, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_14_13(Consume, 1) - aie.objectfifo.release @act3_12_13(Consume, 1) - aie.objectfifo.release @act1_13_22_21(Produce, 1) - aie.objectfifo.release @skip_1(Consume, 1) - } - aie.objectfifo.release @wts_buf_12(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_skip.o"} - %core_2_4 = aie.core(%tile_2_4) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @wts_buf_22(Consume, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<16384xi8> - %c0_0 = arith.constant 0 : index - %2 = memref.load %rtpComputeTile24[%c0_0] : memref<16xi32> - %c1_1 = arith.constant 1 : 
index - %3 = memref.load %rtpComputeTile24[%c1_1] : memref<16xi32> - %c0_2 = arith.constant 0 : index - %c32 = arith.constant 32 : index - %c1_3 = arith.constant 1 : index - scf.for %arg1 = %c0_2 to %c32 step %c1_3 { - %4 = aie.objectfifo.acquire @act3_23_24(Consume, 1) : !aie.objectfifosubview> - %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %6 = aie.objectfifo.acquire @act3_25_24(Consume, 1) : !aie.objectfifosubview> - %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview> -> memref<32x1x32xui8> - %8 = aie.objectfifo.acquire @outOFL2L3(Produce, 1) : !aie.objectfifosubview> - %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %10 = aie.objectfifo.acquire @skip_2(Consume, 1) : !aie.objectfifosubview> - %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview> -> memref<32x1x256xui8> - %c32_i32 = arith.constant 32 : i32 - %c64_i32 = arith.constant 64 : i32 - %c256_i32 = arith.constant 256 : i32 - func.call @conv2dk1_skip_ui8(%5, %7, %1, %9, %11, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32) -> () - aie.objectfifo.release @act3_23_24(Consume, 1) - aie.objectfifo.release @act3_25_24(Consume, 1) - aie.objectfifo.release @outOFL2L3(Produce, 1) - aie.objectfifo.release @skip_2(Consume, 1) - } - aie.objectfifo.release @wts_buf_22(Consume, 1) - } - aie.end - } {link_with = "conv2dk1_skip.o"} - func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<53248xi32>, %arg2: memref<65536xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @act1_00_02_01} : memref<16384xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 18432][0, 0, 0]) {id = 1 : 
i64, metadata = @wts_0_L3L2} : memref<53248xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 18432][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @wts_1_L3L2} : memref<53248xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 35840][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @wts_2_L3L2} : memref<53248xi32> - aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir deleted file mode 100644 index daa031f2c..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir +++ /dev/null @@ -1,66 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @softmax_bf16_vector(memref<1024xbf16>, memref<1024xbf16>) - %tile_0_0 = aie.tile(0, 0) - %tile_0_1 = aie.tile(0, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@inA] -> [@memA0, @memA1]() - aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - aie.objectfifo.link [@memC0, @memC1] -> [@outC]() - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1_1 = arith.constant 1 : index - 
scf.for %arg1 = %c0_0 to %c128 step %c1_1 { - %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @softmax_bf16_vector(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA0(Consume, 1) - aie.objectfifo.release @memC0(Produce, 1) - } - } - aie.end - } {link_with = "kernels.a"} - %core_0_3 = aie.core(%tile_0_3) { - %c0 = arith.constant 0 : index - %c4294967295 = arith.constant 4294967295 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c4294967295 step %c1 { - %c0_0 = arith.constant 0 : index - %c128 = arith.constant 128 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c128 step %c1_1 { - %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1024xbf16> - %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xbf16> - func.call @softmax_bf16_vector(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> () - aie.objectfifo.release @memA1(Consume, 1) - aie.objectfifo.release @memC1(Produce, 1) - } - } - aie.end - } {link_with = "kernels.a"} - func.func @sequence(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 131072][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<262144xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 131072][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<262144xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh new file mode 100755 index 000000000..4536ab1b8 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -eux + +NUMBER=$(lspci -D | grep "\[AMD\] Device 1502" | cut -d ' ' -f1) + +if [ x"$NUMBER" != x"" ]; then + sudo modprobe -r amdxdna + sudo modprobe drm_shmem_helper + sudo modprobe amdxdna dyndbg==pflm + +# if [ -f "/opt/xilinx/xrt/test/example_noop_test" ]; then +# /opt/xilinx/xrt/test/example_noop_test /lib/firmware/amdipu/1502/validate.xclbin +# fi +else + echo "couldn't find npu" +fi + diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir deleted file mode 100644 index 99a7ceec5..000000000 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir +++ /dev/null @@ -1,39 +0,0 @@ -// RUN: iree-opt --aie-objectFifo-stateful-transform %s - -module { - aie.device(npu1) { - func.func private @passThroughLine(memref<512xui8>, memref<512xui8>, i32) - %tile_0_0 = aie.tile(0, 0) - %tile_0_2 = aie.tile(0, 2) - aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo> - %core_0_2 = aie.core(%tile_0_2) { - %c0 = arith.constant 0 : index - %c9223372036854775807 = arith.constant 9223372036854775807 : index - %c1 = arith.constant 1 : index - scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %c0_0 = arith.constant 0 : index - %c9 = arith.constant 9 : index - %c1_1 = arith.constant 1 : index - scf.for %arg1 = %c0_0 to %c9 step %c1_1 { - %0 = aie.objectfifo.acquire @out(Produce, 1) : 
!aie.objectfifosubview> - %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<512xui8> - %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> - %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<512xui8> - %c512_i32 = arith.constant 512 : i32 - func.call @passThroughLine(%3, %1, %c512_i32) : (memref<512xui8>, memref<512xui8>, i32) -> () - aie.objectfifo.release @in(Consume, 1) - aie.objectfifo.release @out(Produce, 1) - } - } - aie.end - } {link_with = "passThrough.cc.o"} - func.func @sequence(%arg0: memref<1152xi32>, %arg1: memref<1152xi32>, %arg2: memref<1152xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1152][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1152xi32> - aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1152][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1152xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} - return - } - } -} - diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 54e0507e3..1890127c9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -16,6 +16,7 @@ #include "iree/compiler/Codegen/Common/Passes.h" #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h" #include "iree/compiler/Utils/PassUtils.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" #include "mlir/Dialect/Affine/Passes.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Linalg/Passes.h" @@ -436,14 +437,19 @@ void buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager) { void buildAMDAIELowerObjectFIFO(OpPassManager &variantPassManager) { OpPassManager &modulePassManager = variantPassManager.nest(); + 
modulePassManager.addPass(createCanonicalizerPass()); + modulePassManager.addPass(createConvertLinalgToLoopsPass()); + modulePassManager.addPass(memref::createFoldMemRefAliasOpsPass()); modulePassManager.addPass(xilinx::AIE::createAIECanonicalizeDevicePass()); auto &devicePassMan = modulePassManager.nest(); + devicePassMan.addPass(xilinx::AIE::createAIEAssignLockIDsPass()); devicePassMan.addPass( xilinx::AIE::createAIEObjectFifoStatefulTransformPass()); + devicePassMan.addPass(xilinx::AIE::createAIEAssignBufferDescriptorIDsPass()); devicePassMan.addPass(xilinx::AIE::createAIEAssignBufferAddressesBasicPass()); - devicePassMan.addPass(xilinx::AIE::createAIEAssignLockIDsPass()); - devicePassMan.addPass(xilinx::AIE::createAIEPathfinderPass()); - devicePassMan.addPass(xilinx::AIE::createAIELocalizeLocksPass()); + modulePassManager.addPass(createConvertSCFToCFPass()); + modulePassManager.addNestedPass( + xilinx::AIE::createAIEPathfinderPass()); LLVM_DEBUG({ llvm::dbgs() << "Using AMDAIE pass pipeline:\n"; diff --git a/tests/matmul/requirements.txt b/tests/matmul/requirements.txt index 62116914f..f57c361bd 100644 --- a/tests/matmul/requirements.txt +++ b/tests/matmul/requirements.txt @@ -1,3 +1,8 @@ PyYAML>=5.4.1 requests>=2.28.0 -enum_tools==0.6.4 \ No newline at end of file +enum_tools==0.6.4 +numpy +-f https://github.com/nod-ai/prototype-aie-toolchain/releases/expanded_assets/release +xaiepy +-f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels +aie-python-bindings-debug \ No newline at end of file