diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp
new file mode 100644
index 000000000..ef412db84
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetBCF.cpp
@@ -0,0 +1,136 @@
+//===- AIETargetBCF.cpp -----------------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "AIETargets.h"
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Module.h"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+
+std::string utohexstr(uint32_t u) { return std::string("0x") + llvm::utohexstr(u); }
+
+namespace xilinx {
+namespace AIE {
+
+/// Writes the BCF linker control file for the core at (tileCol, tileRow) of
+/// the first aie.device in `module`; fails when `module` has no aie.device.
+LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output,
+                                int tileCol, int tileRow) {
+  DenseMap<TileID, Operation *> tiles;
+  DenseMap<Operation *, SmallVector<BufferOp, 4>> buffers;
+
+  // Return the diagnostic: begin() of an empty range must not be dereferenced.
+  if (module.getOps<DeviceOp>().empty())
+    return module.emitOpError("expected aie.device operation at toplevel");
+  DeviceOp targetOp = *(module.getOps<DeviceOp>().begin());
+
+  collectTiles(targetOp, tiles);
+  collectBuffers(targetOp, buffers);
+
+  // _entry_point _main_init
+  // _symbol      _main _after _main_init
+  // _symbol      _main_init 0
+  // _reserved DMb      0x00000 0x20000
+  // _symbol   a        0x38000 0x2000
+  // _extern   a
+  // _stack    DM_stack 0x20000  0x400 //stack for core
+  // _reserved DMb 0x40000 0xc0000 // everything else the core can't see
+  // // Include all symbols from rom.c
+  // _include _file rom.o
+  for (auto tile : targetOp.getOps<TileOp>())
+    if (tile.colIndex() == tileCol && tile.rowIndex() == tileRow) {
+      const auto &targetModel = getTargetModel(tile);
+      TileID srcCoord = {tile.colIndex(), tile.rowIndex()};
+
+      std::string corefunc = "core_" + std::to_string(tile.getCol()) + "_" +
+                             std::to_string(tile.getRow());
+      output << "_entry_point _main_init\n";
+      output << "_symbol " << corefunc << " _after _main_init\n";
+      output << "_symbol _main_init 0\n";
+      const char *initReserved =
+          targetModel.getTargetArch() == AIEArch::AIE2 ? "0x40000" : "0x20000";
+      output << "_reserved DMb 0x00000 " << initReserved
+             << " // Don't put data in code memory\n";
+
+      int stacksize = 0;
+      if (auto core = tile.getCoreOp()) stacksize = core.getStackSize();
+      output << "_stack DM_stack "
+             << utohexstr(targetModel.getMemInternalBaseAddress(srcCoord))
+             << " " << utohexstr(stacksize) << " // stack for core\n";
+
+      auto doBuffer = [&](std::optional<TileID> tile, int offset,
+                          const std::string &dir) {
+        if (tile) {
+          output << "// " + dir +
+                        " -------------------------------------------------\n";
+          uint32_t localMemSize = targetModel.getLocalMemorySize();
+          if (tile != srcCoord)
+            output << "_reserved DMb " << utohexstr(offset) << " "
+                   << utohexstr(localMemSize) << " "
+                   << " // Don't allocate variables in " << dir
+                   << " neighbor\n\n";
+          // TODO How to set as reserved if no buffer exists (or reserve
+          // remaining buffer)
+          if (tiles.count(*tile)) {
+            for (auto buf : buffers[tiles[*tile]]) {
+              std::string bufName(buf.name().getValue());
+              int bufferBaseAddr = getBufferBaseAddress(buf);
+              int numBytes = buf.getAllocationSize();
+              if (buf.getInitialValue() && tile == srcCoord) {
+                output << "_overlay " << bufName << " "
+                       << utohexstr(offset + bufferBaseAddr) << " // "
+                       << numBytes << " bytes\n";
+              } else {
+                output << "_symbol " << bufName << " "
+                       << utohexstr(offset + bufferBaseAddr) << " " << numBytes
+                       << '\n';
+                output << "_extern " << bufName << "\n";
+                output << "_reserved DMb " << utohexstr(offset + bufferBaseAddr)
+                       << " " << numBytes << '\n';
+              }
+              output << "\n";
+            }
+          }
+        } else {
+          uint32_t localMemSize = targetModel.getLocalMemorySize();
+          output << "_reserved DMb " << utohexstr(offset) << " "
+                 << utohexstr(localMemSize) << " "
+                 << " // No tile with memory exists to the " << dir << ".\n";
+        }
+      };
+
+      output << "\n// mapping neighbors tile memory\n";
+      doBuffer(targetModel.getMemSouth(srcCoord),
+               targetModel.getMemSouthBaseAddress(), std::string("south"));
+      doBuffer(targetModel.getMemWest(srcCoord),
+               targetModel.getMemWestBaseAddress(), std::string("west"));
+      doBuffer(targetModel.getMemNorth(srcCoord),
+               targetModel.getMemNorthBaseAddress(), std::string("north"));
+      doBuffer(targetModel.getMemEast(srcCoord),
+               targetModel.getMemEastBaseAddress(), std::string("east"));
+      output << "// end mapping neighbors tile memory\n\n";
+
+      if (targetModel.getTargetArch() == AIEArch::AIE2) {
+        output << "_reserved DMb 0x80000 0x80000 // And everything else "
+                  "the core can't see\n";
+      } else {
+        output << "_reserved DMb 0x40000 0xc0000 // And everything else "
+                  "the core can't see\n";
+      }
+      if (tile.getCoreOp() && tile.getCoreOp().getLinkWith())
+        output << "_include _file "
+               << tile.getCoreOp().getLinkWith().value().str() << "\n";
+      output << "_resolve _main core_" << tile.getCol() << "_" << tile.getRow()
+             << "\n";
+    }
+
+  return success();
+}
+}  // namespace AIE
+}  // namespace xilinx
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp
new file mode 100644
index 000000000..da4951ecf
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetCDODirect.cpp
@@ -0,0 +1,788 @@
+//===- AIETargetCDODirect.cpp -----------------------------------*- C++ -*-===//
+//
+// Copyright (C) 2023, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "AIETargets.h"
+#include "aie/Dialect/AIE/IR/AIETargetModel.h"
+extern "C" {
+#include "cdo-driver/cdo_driver.h"
+}
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>  // size_t
+#include <cstdint>  // uint
+#include <cstdlib>  // calloc
+#include <filesystem>
+#include <functional>
+#include <map>
+#include <optional>
+#include <string>
+
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "aie/Dialect/AIE/IR/AIEEnums.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "mlir/IR/Block.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/IR/BuiltinTypeInterfaces.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Region.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+
+#ifndef NDEBUG
+#define XAIE_DEBUG
+#endif
+
+extern "C" {
+#include "xaiengine/xaie_core.h"
+#include "xaiengine/xaie_dma.h"
+#include "xaiengine/xaie_elfloader.h"
+#include "xaiengine/xaie_interrupt.h"
+#include "xaiengine/xaie_locks.h"
+#include "xaiengine/xaie_plif.h"
+#include "xaiengine/xaie_ss.h"
+#include "xaiengine/xaiegbl.h"
+#include "xaiengine/xaiegbl_defs.h"
+}
+
+#define DEBUG_TYPE "aie-generate-cdo"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+
+#define AIERC_STR(x) x, #x  // yields the pair: enumerator, its spelling
+static const std::map<AieRC, std::string> AIERCTOSTR = {  // used in messages
+    {AIERC_STR(XAIE_OK)},
+    {AIERC_STR(XAIE_ERR)},
+    {AIERC_STR(XAIE_INVALID_DEVICE)},
+    {AIERC_STR(XAIE_INVALID_RANGE)},
+    {AIERC_STR(XAIE_INVALID_ARGS)},
+    {AIERC_STR(XAIE_INVALID_TILE)},
+    {AIERC_STR(XAIE_ERR_STREAM_PORT)},
+    {AIERC_STR(XAIE_INVALID_DMA_TILE)},
+    {AIERC_STR(XAIE_INVALID_BD_NUM)},
+    {AIERC_STR(XAIE_ERR_OUTOFBOUND)},
+    {AIERC_STR(XAIE_INVALID_DATA_MEM_ADDR)},
+    {AIERC_STR(XAIE_INVALID_ELF)},
+    {AIERC_STR(XAIE_CORE_STATUS_TIMEOUT)},
+    {AIERC_STR(XAIE_INVALID_CHANNEL_NUM)},
+    {AIERC_STR(XAIE_INVALID_LOCK)},
+    {AIERC_STR(XAIE_INVALID_DMA_DIRECTION)},
+    {AIERC_STR(XAIE_INVALID_PLIF_WIDTH)},
+    {AIERC_STR(XAIE_INVALID_LOCK_ID)},
+    {AIERC_STR(XAIE_INVALID_LOCK_VALUE)},
+    {AIERC_STR(XAIE_LOCK_RESULT_FAILED)},
+    {AIERC_STR(XAIE_INVALID_DMA_DESC)},
+    {AIERC_STR(XAIE_INVALID_ADDRESS)},
+    {AIERC_STR(XAIE_FEATURE_NOT_SUPPORTED)},
+    {AIERC_STR(XAIE_INVALID_BURST_LENGTH)},
+    {AIERC_STR(XAIE_INVALID_BACKEND)},
+    {AIERC_STR(XAIE_INSUFFICIENT_BUFFER_SIZE)},
+    {AIERC_STR(XAIE_ERR_MAX)}};
+#undef AIERC_STR
+
+static const std::map<WireBundle, StrmSwPortType>
+    WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE = {  // WireBundle -> StrmSwPortType
+        {WireBundle::Core, StrmSwPortType::CORE},
+        {WireBundle::DMA, StrmSwPortType::DMA},
+        {WireBundle::Ctrl, StrmSwPortType::CTRL},
+        {WireBundle::FIFO, StrmSwPortType::FIFO},
+        {WireBundle::South, StrmSwPortType::SOUTH},
+        {WireBundle::West, StrmSwPortType::WEST},
+        {WireBundle::North, StrmSwPortType::NORTH},
+        {WireBundle::East, StrmSwPortType::EAST},
+        // missing PLIO from WireBundle (so the table has no PLIO entry)
+        // missing NOC from WireBundle (so the table has no NOC entry)
+        {WireBundle::Trace, StrmSwPortType::TRACE},
+};
+
+// Prints "label=value,..." pairs; see https://stackoverflow.com/a/32230306
+template <typename H1>
+raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value) {
+  return out << label << "=" << std::forward<H1>(value);  // last pair
+}
+
+template <typename H1, typename... T>
+raw_ostream &showArgs(raw_ostream &out, const char *label, H1 &&value,
+                      T &&...rest) {
+  const char *pcomma = strchr(label, ',');  // first ',' ends this label
+  return showArgs(out.write(label, pcomma - label)
+                      << "=" << std::forward<H1>(value) << ',',
+                  pcomma + 1, std::forward<T>(rest)...);  // rest of the text
+}
+
+#define SHOW_ARGS(os, ...) showArgs(os, #__VA_ARGS__, __VA_ARGS__)  // text+vals
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_LocType &loc) {
+  // Streams a tile location; std::to_string renders both fields as text.
+  return os << "XAie_LocType(col: " << std::to_string(loc.Col)
+            << ", row: " << std::to_string(loc.Row) << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_Lock &lock) {
+  // Streams a lock as its id and value, both rendered via std::to_string.
+  return os << "XAie_Lock(id: " << std::to_string(lock.LockId)
+            << ", val: " << std::to_string(lock.LockVal) << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &os, const XAie_Packet &packet) {
+  // Streams a packet as its id and type, both rendered via std::to_string.
+  return os << "XAie_Packet(id: " << std::to_string(packet.PktId)
+            << ", type: " << std::to_string(packet.PktType) << ")";
+}
+
+// The TRY_XAIE_API_* macros test `if (auto r = API(...))`; that branch must be
+// taken only for failures, i.e. for every AieRC other than XAIE_OK (== 0).
+static_assert(XAIE_OK == 0);
+
+#define TRY_XAIE_API_FATAL_ERROR(API, ...)                                     \
+  do {                                                                         \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__));                          \
+    LLVM_DEBUG(llvm::dbgs() << "\n");                                          \
+    if (auto r = API(__VA_ARGS__))                                             \
+      llvm::report_fatal_error(llvm::Twine(#API " failed with ") +             \
+                               AIERCTOSTR.at(r));                              \
+  } while (0)
+
+#define TRY_XAIE_API_EMIT_ERROR(OP, API, ...)                                  \
+  do {                                                                         \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__));                          \
+    LLVM_DEBUG(llvm::dbgs() << "\n");                                          \
+    if (auto r = API(__VA_ARGS__))                                             \
+      return OP.emitOpError() << #API " failed with " << AIERCTOSTR.at(r);     \
+  } while (0)
+
+// Calls API(...); on a nonzero result logs to errs() and returns failure().
+#define TRY_XAIE_API_LOGICAL_RESULT(API, ...)                                  \
+  do {                                                                         \
+    LLVM_DEBUG(llvm::dbgs() << "trying XAIE API: " << #API << " with args: "); \
+    LLVM_DEBUG(SHOW_ARGS(llvm::dbgs(), __VA_ARGS__) << "\n");                  \
+    if (auto r = API(__VA_ARGS__)) {                                           \
+      llvm::errs() << #API " failed with " << AIERCTOSTR.at(r) << "\n";        \
+      return failure();                                                        \
+    }                                                                          \
+  } while (0)
+
+auto ps = std::filesystem::path::preferred_separator;
+
+#define XAIE_BASE_ADDR 0x40000000
+#define XAIE_COL_SHIFT 25
+#define XAIE_ROW_SHIFT 20
+#define XAIE_SHIM_ROW 0
+#define XAIE_MEM_TILE_ROW_START 1
+#define XAIE_PARTITION_BASE_ADDR 0x0
+
+#define NPI_ADDR 0x0
+#define NUM_LOCKS 16
+#define EVEN_BD_NUM_START 0
+#define ODD_BD_NUM_START 24
+#define MEM_TILE_LOCK_ID_INCR 64
+#define BASE_ADDR_A_INCR 0x80000
+
+namespace xilinx::AIE {
+
+/// Reads the use_lock ops in `block` and programs the acquire/release lock
+/// pair into `dmaTileBd`; emits an error unless both kinds are present.
+LogicalResult configureLocksInBdBlock(XAie_DmaDesc &dmaTileBd, Block &block,
+                                      const AIETargetModel &targetModel,
+                                      XAie_LocType &tileLoc) {
+  LLVM_DEBUG(llvm::dbgs() << "\nstart configuring bds\n");
+  std::optional<int> acqValue, relValue, acqLockId, relLockId;
+  bool acqEn = false;  // overwritten by the acquire use_lock found below
+  for (auto op : block.getOps<UseLockOp>()) {
+    // cast<> (not dyn_cast<>): the result is used unchecked, so assert early.
+    LockOp lock = cast<LockOp>(op.getLock().getDefiningOp());
+    switch (op.getAction()) {
+      case LockAction::Acquire:
+      case LockAction::AcquireGreaterEqual:
+        acqEn = op.getAcqEn();
+        acqLockId = lock.getLockIDValue();
+        acqValue = op.getLockValue();
+        if (op.acquireGE()) acqValue.value() = -acqValue.value();
+        break;
+      case LockAction::Release:
+        relLockId = lock.getLockIDValue();
+        relValue = op.getLockValue();
+        break;
+    }
+  }
+
+  // Not an assert: under NDEBUG the .value() calls below would hit nullopt.
+  if (!acqValue || !relValue || !acqLockId || !relLockId)
+    return block.getParentOp()->emitOpError(
+        "expected both use_lock(acquire) and use_lock(release) with bd");
+
+  if (targetModel.isMemTile(tileLoc.Col, tileLoc.Row)) {
+    acqLockId.value() += MEM_TILE_LOCK_ID_INCR;
+    relLockId.value() += MEM_TILE_LOCK_ID_INCR;
+  }
+
+  // no RelEn in the arch spec even though the API requires you to set it?
+  bool relEn = false;
+  XAie_Lock acqLock = XAie_LockInit(acqLockId.value(), acqValue.value());
+  XAie_Lock relLock = XAie_LockInit(relLockId.value(), relValue.value());
+  TRY_XAIE_API_EMIT_ERROR((*block.getOps<UseLockOp>().begin()),
+                          dmaTileBd.DmaMod->SetLock, &dmaTileBd, acqLock,
+                          relLock, acqEn, relEn);
+  return success();
+}
+
+/// Programs the buffer descriptor of `block` (AXI settings on shim NOC tiles,
+/// address/length or multi-dim layout, next-bd chaining, packet header) into
+/// `dmaTileBd` and writes it to the device as bd number `bdId`.
+LogicalResult configureBdInBlock(XAie_DevInst &devInst, XAie_DmaDesc &dmaTileBd,
+                                 Block &block,
+                                 const AIETargetModel &targetModel,
+                                 XAie_LocType &tileLoc, int bdId,
+                                 std::optional<int> nextBdId) {
+  std::optional<int> packetType;
+  std::optional<int> packetID;
+  auto maybePacketOps = block.getOps<DMABDPACKETOp>();
+  if (!maybePacketOps.empty()) {
+    assert(llvm::range_size(maybePacketOps) == 1 &&
+           "expected only one dma_bd_packet");
+    auto packetOp = *maybePacketOps.begin();
+    packetType = packetOp.getPacketType();
+    packetID = packetOp.getPacketID();
+  }
+
+  auto bdOp = *block.getOps<DMABDOp>().begin();
+
+  if (targetModel.isShimNOCTile(tileLoc.Col, tileLoc.Row)) {
+    // write them out like this so they show up with names in debug prints
+    size_t smid = 0;
+    size_t burstLen = 16;  // (10):BLEN=16 (256Byte); 0x800000000 from target
+    size_t qOs = 0;
+    size_t cache = 0;
+    size_t secure = 0;
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAxi, &dmaTileBd, smid, burstLen,
+                            qOs, cache, secure);
+  }
+
+  // StringRef FifoMode = disable; // FIXME: when to enable FIFO mode?
+  int baseAddr = 0;
+  if (!targetModel.isShimNOCTile(tileLoc.Col, tileLoc.Row)) {
+    auto bufferOp = cast<AIE::BufferOp>(bdOp.getBuffer().getDefiningOp());
+    if (!bufferOp.getAddress())
+      return bufferOp.emitError("buffer must have address assigned");
+    baseAddr = bufferOp.getAddress().value();
+    if (targetModel.isMemTile(tileLoc.Col, tileLoc.Row))
+      baseAddr += BASE_ADDR_A_INCR;
+  }
+
+  std::optional<llvm::ArrayRef<BDDimLayoutAttr>> dims = bdOp.getDimensions();
+  int lenInBytes = bdOp.getLenInBytes();
+  int basePlusOffsetInBytes = baseAddr + bdOp.getOffsetInBytes();
+  // Function-scope owner: freed on every return (the calloc'd array leaked).
+  SmallVector<XAie_DmaDimDesc, 4> dimDescs;
+  if (!dims) {
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetAddrLen, &dmaTileBd,
+                            basePlusOffsetInBytes, lenInBytes);
+  } else {
+    dimDescs.resize(dims->size());  // value-initialized, like calloc
+    XAie_DmaTensor dmaTileBdTensor = {};
+    dmaTileBdTensor.NumDim = dimDescs.size();
+    dmaTileBdTensor.Dim = dimDescs.data();
+    // libxaie requires stride in multiples of 32b
+    double elementWidthIn32bWords =
+        static_cast<double>(bdOp.getBufferElementTypeWidthInBytes()) / 4.0;
+    for (size_t i = 0; i < dims->size(); i++) {
+      // Dimensions are passed down reversed, so the MLIR lists step sizes and
+      // wraps like a multi-dim C array access: highest dimension first.
+      int j = dims->size() - i - 1;
+      uint16_t size;
+      uint32_t stride;
+      if (j > 0) {
+        stride = static_cast<uint32_t>(dims.value()[i].getStride() *
+                                       elementWidthIn32bWords);
+        size = dims.value()[i].getSize();
+      } else {
+        stride = dims.value()[i].getStride();
+        size = static_cast<uint16_t>(dims.value()[i].getSize() *
+                                     elementWidthIn32bWords);
+      }
+      stride = stride > 0 ? stride : 1;
+      // AIE-ML assumed (AieMlDimDesc); asserted in AIETranslateToCDODirect.
+      dmaTileBdTensor.Dim[j].AieMlDimDesc = {stride, size};
+    }
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetMultiDimAddr, &dmaTileBd,
+                            &dmaTileBdTensor, basePlusOffsetInBytes,
+                            lenInBytes);
+  }
+
+  if (nextBdId) {
+    auto enableNextBd = 1;
+    TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaSetNextBd, &dmaTileBd,
+                            nextBdId.value(), enableNextBd);
+  }
+
+  if (packetID) {
+    if (!packetType) return bdOp.emitError("must have packetType with packetID");
+    if (bdOp.getLen() == 0)
+      return bdOp.emitOpError(
+          "For MM2S channels, if Buffer_Length=0 then Enable_Packet must be "
+          "set to 0, otherwise behavior is undefined (3.7.8 arch spec)");
+    TRY_XAIE_API_EMIT_ERROR(
+        bdOp, XAie_DmaSetPkt, &dmaTileBd,
+        XAie_PacketInit(packetID.value(), packetType.value()));
+  }
+  TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaEnableBd, &dmaTileBd);
+  TRY_XAIE_API_EMIT_ERROR(bdOp, XAie_DmaWriteBd, &devInst, &dmaTileBd, tileLoc,
+                          bdId);
+  LLVM_DEBUG(llvm::dbgs() << "\nend configuring bds\n");
+  return success();
+}
+
+/// Queues bd `bdId` on DMA channel `chNum` of the tile and enables the channel.
+LogicalResult pushToBdQueueAndEnable(XAie_DevInst &devInst, Operation &op,
+                                     XAie_LocType &tileLoc, int chNum,
+                                     const DMAChannelDir &channelDir, int bdId,
+                                     int repeatCount) {
+  XAie_DmaDirection direction = DMA_MM2S;
+  if (channelDir == DMAChannelDir::S2MM) direction = DMA_S2MM;
+  auto enTokenIssue = direction == DMA_S2MM && tileLoc.Row == 0;
+  // A repeat_count of 0 means "run once"; libxaie spells "run once" as 1.
+  repeatCount += 1;
+  TRY_XAIE_API_EMIT_ERROR(op, XAie_DmaChannelSetStartQueue, &devInst, tileLoc,
+                          chNum, direction, bdId, repeatCount, enTokenIssue);
+  TRY_XAIE_API_EMIT_ERROR(op, XAie_DmaChannelEnable, &devInst, tileLoc, chNum,
+                          direction);
+  return success();
+}
+
+/// Programs the locks and the buffer descriptor of bd block `block` for the
+/// tile at `tileLoc`; emits an error when `block` has no usable dma_bd.
+LogicalResult configureLocksAndBd(XAie_DevInst &devInst, Block &block,
+                                  XAie_LocType tileLoc,
+                                  const AIETargetModel &targetModel) {
+  if (block.getOps<DMABDOp>().empty())  // begin() below needs an element
+    return block.getParentOp()->emitOpError("expected a dma_bd in bd block");
+  DMABDOp bd = *block.getOps<DMABDOp>().begin();
+  if (!bd.getBdId().has_value())  // a diagnostic also fires in NDEBUG builds
+    return bd.emitOpError("needs a bd_id; did you forget aie-assign-bd-ids?");
+  XAie_DmaDesc dmaTileBd;
+  TRY_XAIE_API_EMIT_ERROR(bd, XAie_DmaDescInit, &devInst, &dmaTileBd, tileLoc);
+  if (!block.getOps<UseLockOp>().empty() &&
+      failed(configureLocksInBdBlock(dmaTileBd, block, targetModel, tileLoc)))
+    return failure();
+  return configureBdInBlock(devInst, dmaTileBd, block, targetModel, tileLoc,
+                            bd.getBdId().value(), bd.getNextBdId());
+}
+
+struct AIEControl {
+  XAie_Config configPtr;
+  XAie_DevInst devInst;
+
+  AIEControl(bool aieSim, bool xaieDebug, const BaseNPUTargetModel &tm) {
+    // The first column in the NPU lacks a shim tile, so a virtualized target
+    // starts its partition at column 1 (AIE-RT models this awkwardly).
+    size_t partitionStartCol = tm.isVirtualized() ? 1 : 0;
+    size_t partitionNumCols = tm.columns();
+    size_t deviceRows = tm.rows();
+    size_t deviceCols = tm.columns() + partitionStartCol;
+
+    configPtr = XAie_Config{
+        /*AieGen*/ XAIE_DEV_GEN_AIEML,
+        /*BaseAddr*/ XAIE_BASE_ADDR,
+        /*ColShift*/ XAIE_COL_SHIFT,
+        /*RowShift*/ XAIE_ROW_SHIFT,
+        /*NumRows*/ static_cast<uint8_t>(deviceRows),
+        /*NumCols*/ static_cast<uint8_t>(deviceCols),
+        /*ShimRowNum*/ XAIE_SHIM_ROW,
+        /*MemTileRowStart*/ XAIE_MEM_TILE_ROW_START,
+        /*MemTileNumRows*/ static_cast<uint8_t>(tm.getNumMemTileRows()),
+        /*AieTileRowStart*/
+        static_cast<uint8_t>(XAIE_MEM_TILE_ROW_START + tm.getNumMemTileRows()),
+        /*AieTileNumRows*/
+        static_cast<uint8_t>(tm.rows() - tm.getNumMemTileRows() - 1),
+        /*PartProp*/ {},
+        /*Backend*/ XAIE_IO_BACKEND_CDO};
+
+    // Quoting: The instance of a device must be always declared using this
+    // macro. In future, the same macro will be expanded to allocate more
+    // memory from the user application for resource management.
+    XAie_InstDeclare(_devInst, &configPtr);
+    devInst = _devInst;  // keep a copy of the declared instance as a member
+    TRY_XAIE_API_FATAL_ERROR(XAie_SetupPartitionConfig, &devInst,
+                             XAIE_PARTITION_BASE_ADDR, partitionStartCol,
+                             partitionNumCols);
+    TRY_XAIE_API_FATAL_ERROR(XAie_CfgInitialize, &devInst, &configPtr);
+    if (aieSim) {  // backend choice: sim first, then debug, otherwise CDO
+      TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst,
+                               XAIE_IO_BACKEND_SIM);
+    } else if (xaieDebug)
+      TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst,
+                               XAIE_IO_BACKEND_DEBUG);
+    else
+      TRY_XAIE_API_FATAL_ERROR(XAie_SetIOBackend, &devInst,
+                               XAIE_IO_BACKEND_CDO);
+
+    TRY_XAIE_API_FATAL_ERROR(XAie_UpdateNpiAddr, &devInst, NPI_ADDR);
+  }
+
+  LogicalResult addAieElfToCDO(uint8_t col, uint8_t row,
+                               const StringRef elfPath, bool aieSim) {
+    // Loads the ELF at elfPath onto tile (col, row). aieSim is passed as
+    // loadSym (load symbols from .map file); unused unless __AIESIM__ is set.
+    TRY_XAIE_API_LOGICAL_RESULT(XAie_LoadElf, &devInst, XAie_TileLoc(col, row),
+                                elfPath.str().c_str(), /*loadSym*/ aieSim);
+    return success();
+  }
+
+  LogicalResult addAieElfsToCDO(DeviceOp &targetOp, const StringRef workDirPath,
+                                bool aieSim) {
+    // Adds the ELF of every tile that has a core; shim NOC/PL tiles are
+    // skipped (original note: "Resets no needed with V2 kernel driver").
+    for (auto tileOp : targetOp.getOps<TileOp>()) {
+      if (tileOp.isShimNOCorPLTile()) continue;
+      int col = tileOp.colIndex();
+      int row = tileOp.rowIndex();
+      auto coreOp = tileOp.getCoreOp();
+      if (!coreOp) continue;
+
+      // A named ELF file wins over the core_<col>_<row>.elf default.
+      std::string fileName;
+      if (auto fileAttr = coreOp.getElfFile()) {
+        fileName = fileAttr->str();
+      } else {
+        fileName = "core_" + std::to_string(col) + "_" + std::to_string(row);
+        fileName += ".elf";
+      }
+      // Join work directory and file name with the preferred separator.
+      std::string elfPath =
+          (llvm::Twine(workDirPath) + std::string(1, ps) + fileName).str();
+      if (failed(addAieElfToCDO(col, row, elfPath, aieSim))) return failure();
+    }
+    return success();
+  }
+
+  LogicalResult addInitConfigToCDO(DeviceOp &targetOp) {
+    for (auto tileOp : targetOp.getOps<TileOp>()) {
+      auto tileLoc = XAie_TileLoc(tileOp.colIndex(), tileOp.rowIndex());
+      if (!tileOp.isShimTile() && tileOp.getCoreOp()) {
+        TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_CoreReset, &devInst, tileLoc);
+        TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_CoreUnreset, &devInst, tileLoc);
+        // Set locks to zero
+        for (uint8_t l = 0; l < NUM_LOCKS; l++) {
+          auto locInit = XAie_LockInit(l, 0);
+          TRY_XAIE_API_EMIT_ERROR(tileOp, XAie_LockSetValue, &devInst, tileLoc,
+                                  locInit);
+        }
+      }
+    }
+
+    // Set locks with explicit initializers
+    targetOp.walk<WalkOrder::PreOrder>([&](LockOp lockOp) {
+      if (lockOp.getLockID() && lockOp.getInit()) {
+        auto tileLoc = XAie_TileLoc(lockOp.getTileOp().colIndex(),
+                                    lockOp.getTileOp().rowIndex());
+        auto locInit = XAie_LockInit(*lockOp.getLockID(), *lockOp.getInit());
+        TRY_XAIE_API_FATAL_ERROR(XAie_LockSetValue, &devInst, tileLoc, locInit);
+      } else
+        LLVM_DEBUG(llvm::dbgs()
+                   << "lock op missing either id or init" << lockOp << "\n");
+    });
+
+    const AIETargetModel &targetModel = targetOp.getTargetModel();
+
+    auto memOps = llvm::to_vector_of<TileElement>(targetOp.getOps<MemOp>());
+    llvm::append_range(memOps, targetOp.getOps<MemTileDMAOp>());
+    llvm::append_range(memOps, targetOp.getOps<ShimDMAOp>());
+    for (TileElement memOp : memOps) {
+      int col = memOp.getTileID().col;
+      int row = memOp.getTileID().row;
+      XAie_LocType tileLoc = XAie_TileLoc(col, row);
+
+      // handle DMA ops separately
+      auto dmaOps = llvm::to_vector_of<DMAOp>(
+          memOp.getOperation()->getRegion(0).getOps<DMAOp>());
+      if (!dmaOps.empty()) {
+        for (auto dmaOp : dmaOps)
+          for (auto &bdRegion : dmaOp.getBds()) {
+            Block &block = bdRegion.getBlocks().front();
+            if (failed(
+                    configureLocksAndBd(devInst, block, tileLoc, targetModel)))
+              return failure();
+          }
+      } else {
+        for (Block &block : memOp.getOperation()->getRegion(0)) {
+          if (block.getOps<DMABDOp>().empty()) continue;
+          if (failed(configureLocksAndBd(devInst, block, tileLoc, targetModel)))
+            return failure();
+        }
+      }
+
+      if (!dmaOps.empty())
+        for (auto dmaOp : dmaOps) {
+          auto &block = dmaOp.getBds().front().getBlocks().front();
+          DMABDOp bd = *block.getOps<DMABDOp>().begin();
+          if (failed(pushToBdQueueAndEnable(
+                  devInst, *dmaOp.getOperation(), tileLoc,
+                  dmaOp.getChannelIndex(), dmaOp.getChannelDir(),
+                  bd.getBdId().value(), dmaOp.getRepeatCount())))
+            return failure();
+        }
+      else
+        for (Block &block : memOp.getOperation()->getRegion(0)) {
+          for (auto op : block.getOps<DMAStartOp>()) {
+            DMABDOp bd = *op.getDest()->getOps<DMABDOp>().begin();
+            int chNum = op.getChannelIndex();
+            auto channelDir = op.getChannelDir();
+            if (failed(pushToBdQueueAndEnable(
+                    devInst, *bd.getOperation(), tileLoc, chNum, channelDir,
+                    bd.getBdId().value(), op.getRepeatCount())))
+              return failure();
+          }
+        }
+    }
+
+    // StreamSwitch (switchbox) configuration
+    for (auto switchboxOp : targetOp.getOps<SwitchboxOp>()) {
+      int32_t col = switchboxOp.colIndex();
+      int32_t row = switchboxOp.rowIndex();
+      XAie_LocType tileLoc = XAie_TileLoc(col, row);
+      assert(targetModel.isNPU() && "Only NPU currently supported");
+      if (row == 0) {
+        // FIXME hack for TCT routing
+        // TODO Support both channels
+        auto slvPortNum = 0;
+        auto mstrPortNum = 0;
+        TRY_XAIE_API_EMIT_ERROR(switchboxOp, XAie_StrmConnCctEnable, &devInst,
+                                tileLoc, CTRL, slvPortNum, SOUTH, mstrPortNum);
+      }
+
+      Block &b = switchboxOp.getConnections().front();
+      for (auto connectOp : b.getOps<ConnectOp>())
+        TRY_XAIE_API_EMIT_ERROR(
+            switchboxOp, XAie_StrmConnCctEnable, &devInst, tileLoc,
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()),
+            connectOp.sourceIndex(),
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()),
+            connectOp.destIndex());
+
+      for (auto connectOp : b.getOps<MasterSetOp>()) {
+        int mask = 0;
+        int arbiter = -1;
+
+        for (auto val : connectOp.getAmsels()) {
+          AMSelOp amsel = cast<AMSelOp>(val.getDefiningOp());
+          arbiter = amsel.arbiterIndex();
+          int msel = amsel.getMselValue();
+          mask |= (1 << msel);
+        }
+
+        bool isdma = connectOp.getDestBundle() == WireBundle::DMA;
+        // assume a connection going south from row zero gets wired to shimdma
+        // by a shimmux. TODO: fix the assumption
+        if (!isdma && (switchboxOp.rowIndex() == 0))
+          isdma = connectOp.getDestBundle() == WireBundle::South;
+        // Flag for overriding DROP_HEADER. TODO: Formalize this in tablegen
+        isdma &= !connectOp->hasAttr("keep_pkt_header");
+        auto dropHeader =
+            isdma ? XAIE_SS_PKT_DROP_HEADER : XAIE_SS_PKT_DONOT_DROP_HEADER;
+        TRY_XAIE_API_EMIT_ERROR(
+            connectOp, XAie_StrmPktSwMstrPortEnable, &devInst, tileLoc,
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()),
+            connectOp.destIndex(), dropHeader, arbiter, mask);
+      }
+
+      for (auto connectOp : b.getOps<PacketRulesOp>()) {
+        int slot = 0;
+        Block &block = connectOp.getRules().front();
+        for (auto slotOp : block.getOps<PacketRuleOp>()) {
+          AMSelOp amselOp = cast<AMSelOp>(slotOp.getAmsel().getDefiningOp());
+          int arbiter = amselOp.arbiterIndex();
+          int msel = amselOp.getMselValue();
+          TRY_XAIE_API_EMIT_ERROR(
+              connectOp, XAie_StrmPktSwSlavePortEnable, &devInst, tileLoc,
+              WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()),
+              connectOp.sourceIndex());
+          auto packetInit = XAie_PacketInit(slotOp.valueInt(), /*PktType*/ 0);
+          // TODO Need to better define packet id,type used here
+          TRY_XAIE_API_EMIT_ERROR(
+              connectOp, XAie_StrmPktSwSlaveSlotEnable, &devInst, tileLoc,
+              WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()),
+              connectOp.sourceIndex(), slot, packetInit, slotOp.maskInt(), msel,
+              arbiter);
+          slot++;
+        }
+      }
+    }
+
+    for (auto muxOp : targetOp.getOps<ShimMuxOp>()) {
+      // NOTE ShimMux always connects from the south as directions are
+      // defined relative to the tile stream switch.
+      auto tileLoc =
+          XAie_TileLoc(muxOp.getTileOp().getCol(), muxOp.getTileOp().getRow());
+      Block &b = muxOp.getConnections().front();
+      for (auto connectOp : b.getOps<ConnectOp>()) {
+        // demux!
+        if (connectOp.getSourceBundle() == WireBundle::North)
+          TRY_XAIE_API_EMIT_ERROR(muxOp, XAie_EnableAieToShimDmaStrmPort,
+                                  &devInst, tileLoc, connectOp.sourceIndex());
+        // mux
+        if (connectOp.getDestBundle() == WireBundle::North)
+          TRY_XAIE_API_EMIT_ERROR(muxOp, XAie_EnableShimDmaToAieStrmPort,
+                                  &devInst, tileLoc, connectOp.destIndex());
+      }
+    }
+
+    for (auto switchboxOp : targetOp.getOps<ShimSwitchboxOp>()) {
+      Block &b = switchboxOp.getConnections().front();
+      auto tileLoc = XAie_TileLoc(switchboxOp.getCol(), 0);
+      for (auto connectOp : b.getOps<ConnectOp>())
+        TRY_XAIE_API_EMIT_ERROR(
+            switchboxOp, XAie_StrmConnCctEnable, &devInst, tileLoc,
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getSourceBundle()),
+            connectOp.sourceIndex(),
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(connectOp.getDestBundle()),
+            connectOp.destIndex());
+    }
+
+    // Cascade configuration
+    if (targetModel.getTargetArch() == AIEArch::AIE2) {
+      for (auto configOp : targetOp.getOps<ConfigureCascadeOp>()) {
+        TileOp tile = cast<TileOp>(configOp.getTile().getDefiningOp());
+        auto tileLoc = XAie_TileLoc(tile.getCol(), tile.getRow());
+        TRY_XAIE_API_EMIT_ERROR(
+            targetOp, XAie_CoreConfigAccumulatorControl, &devInst, tileLoc,
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(
+                static_cast<WireBundle>(configOp.getInputDir())),
+            WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE.at(
+                static_cast<WireBundle>(configOp.getOutputDir())));
+      }
+    }
+
+    return success();
+  }
+
+  // Start execution of all the cores: shim tiles and core-less tiles skipped.
+  LogicalResult addCoreEnableToCDO(DeviceOp &targetOp) {
+    for (TileOp tile : targetOp.getOps<TileOp>()) {
+      if (tile.isShimTile() || !tile.getCoreOp()) continue;
+      auto tileLoc = XAie_TileLoc(tile.colIndex(), tile.rowIndex());
+      TRY_XAIE_API_EMIT_ERROR(targetOp, XAie_CoreEnable, &devInst, tileLoc);
+    }
+    return success();
+  }
+
+  // Calls XAie_DmaUpdateBdAddr for `bdId` on tile (col, row); `targetOp` unused.
+  void dmaUpdateBdAddr(DeviceOp &targetOp, int col, int row, size_t addr,
+                       size_t bdId) {
+    TRY_XAIE_API_FATAL_ERROR(XAie_DmaUpdateBdAddr, &devInst,
+                             XAie_TileLoc(col, row), addr, bdId);
+  }
+};
+
+}  // namespace xilinx::AIE
+
+// Prepares the CDO generator: optional debug prints, then the byte order.
+void initializeCDOGenerator(byte_ordering endianness, bool cdoDebug) {
+  if (cdoDebug) EnAXIdebug();  // AXI-MM prints for configs being added in CDO
+  setEndianness(endianness);
+}
+
+// Writes one CDO file; the stream is ended even when `cb` reports failure.
+LogicalResult generateCDOBinary(const StringRef outputPath,
+                                const std::function<LogicalResult()> &cb) {
+  startCDOFileStream(outputPath.str().c_str());
+  FileHeader();
+  // bootgen flags a header-only CDO file as invalid, so always add a no-op.
+  insertNoOpCommand(4);
+  LogicalResult result = cb();
+  if (succeeded(result)) configureHeader();
+  endCurrentCDOFileStream();
+  return result;
+}
+
+// Emits aie_cdo_elfs.bin, aie_cdo_init.bin and, if `enableCores` is set,
+// aie_cdo_enable.bin into `workDirPath`; stops at the first failure.
+LogicalResult generateCDOBinariesSeparately(AIEControl &ctl,
+                                            const StringRef workDirPath,
+                                            DeviceOp &targetOp, bool aieSim,
+                                            bool enableCores) {
+  auto pathFor = [&workDirPath](const char *fileName) {
+    return (llvm::Twine(workDirPath) + std::string(1, ps) + fileName).str();
+  };
+
+  // ELF loading.
+  if (failed(generateCDOBinary(pathFor("aie_cdo_elfs.bin"), [&] {
+        return ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim);
+      })))
+    return failure();
+
+  // Initial configuration.
+  if (failed(generateCDOBinary(pathFor("aie_cdo_init.bin"), [&] {
+        return ctl.addInitConfigToCDO(targetOp);
+      })))
+    return failure();
+
+  if (!enableCores) return success();
+  return generateCDOBinary(pathFor("aie_cdo_enable.bin"), [&] {
+    return ctl.addCoreEnableToCDO(targetOp);
+  });
+}
+
+// Emits one aie_cdo.bin; ELF and core-enable steps run only if cores exist.
+LogicalResult generateCDOUnified(AIEControl &ctl, const StringRef workDirPath,
+                                 DeviceOp &targetOp, bool aieSim,
+                                 bool enableCores) {
+  auto hasCores = [&targetOp] { return !targetOp.getOps<CoreOp>().empty(); };
+  std::string cdoPath =
+      (llvm::Twine(workDirPath) + std::string(1, ps) + "aie_cdo.bin").str();
+  return generateCDOBinary(cdoPath, [&]() -> LogicalResult {
+    if (hasCores() &&
+        failed(ctl.addAieElfsToCDO(targetOp, workDirPath, aieSim)))
+      return failure();
+    if (failed(ctl.addInitConfigToCDO(targetOp))) return failure();
+    if (enableCores && hasCores()) return ctl.addCoreEnableToCDO(targetOp);
+    return success();
+  });
+}
+
+// Emits CDO file(s) for the single device op in `m`; bad input is diagnosed.
+LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath,
+                                      byte_ordering endianness,
+                                      bool emitUnified, bool cdoDebug,
+                                      bool aieSim, bool xaieDebug,
+                                      bool enableCores) {
+  auto devOps = m.getOps<DeviceOp>();
+  if (llvm::range_size(devOps) != 1)
+    return m.emitOpError("only exactly 1 device op supported.");
+  DeviceOp targetOp = *devOps.begin();
+  // things like XAIE_MEM_TILE_ROW_START and the missing
+  // shim dma on tile (0,0) are hard-coded assumptions about NPU...
+  const auto &targetModel =
+      static_cast<const BaseNPUTargetModel &>(targetOp.getTargetModel());
+  if (!targetModel.isNPU())
+    return m.emitOpError("Only NPU currently supported");
+  AIEControl ctl(aieSim, xaieDebug, targetModel);
+  initializeCDOGenerator(endianness, cdoDebug);
+  if (emitUnified)
+    return generateCDOUnified(ctl, workDirPath, targetOp, aieSim, enableCores);
+  return generateCDOBinariesSeparately(ctl, workDirPath, targetOp, aieSim,
+                                       enableCores);
+}
+// Not sure why but defining this with xilinx::AIE will create a duplicate
+// symbol in libAIETargets.a that then doesn't actually match the header?
+namespace xilinx::AIE {  // bool-endianness overload of AIETranslateToCDODirect
+LogicalResult AIETranslateToCDODirect(ModuleOp m, llvm::StringRef workDirPath,
+                                      bool bigEndian, bool emitUnified,
+                                      bool cdoDebug, bool aieSim,
+                                      bool xaieDebug, bool enableCores) {
+  byte_ordering endianness =  // map the bool flag onto the byte_ordering enum
+      bigEndian ? byte_ordering::Big_Endian : byte_ordering::Little_Endian;
+  return AIETranslateToCDODirect(m, workDirPath, endianness, emitUnified,
+                                 cdoDebug, aieSim, xaieDebug, enableCores);
+}
+}  // namespace xilinx::AIE
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp
new file mode 100644
index 000000000..48313109f
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetLdScript.cpp
@@ -0,0 +1,169 @@
+//===- AIETargetLdScript.cpp -----------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+#include "AIETargets.h"
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+
+// Emits three GNU ld statements for `buf`: move the location counter to
+// `offset` plus the buffer's base address, bind the buffer's name there, and
+// advance by its allocation size. `offset` depends on where it is accessed from.
+static void writeLDScriptMap(raw_ostream &output, BufferOp buf, int offset) {
+  const int baseAddr = getBufferBaseAddress(buf);
+  const int sizeInBytes = buf.getAllocationSize();
+  const std::string symbol(buf.name().getValue());
+  output << ". = 0x" << llvm::utohexstr(offset + baseAddr) << ";\n"
+         << symbol << " = .;\n"
+         << ". += 0x" << llvm::utohexstr(sizeInBytes) << ";\n";
+}
+
+///// ld.script format:
+//
+// MEMORY
+// {
+//    program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+//    data (!RX) : ORIGIN = 0x20000, LENGTH = 0x0020000
+// }
+// ENTRY(_main_init)
+// INPUT(something.o)
+// SECTIONS
+// {
+//   . = 0x0;
+//   .text : {
+//      // the _main_init symbol from me_basic.o has to come at address zero.
+//      *me_basic.o(.text)
+//      . = 0x200;
+//      __ctors_start__ = .;
+//      __init_array_start = .;
+//      KEEP(SORT(*)(.init_array))
+//      __ctors_end__ = .;
+//      __init_array_end = .;
+//      __dtors_start__ = .;
+//      __dtors_end__ = .;
+//      *(.text)
+//   } > program
+//   .data : { *(.data) } > data
+//   . = 0x20000;
+//   _sp_start_value_DM_stack = .;
+//   . = 0x24000;
+//   a = .;
+//   . += 1024;
+//   .bss : { *(.bss) } > data
+// }
+LogicalResult xilinx::AIE::AIETranslateToLdScript(ModuleOp module,
+                                                  raw_ostream &output,
+                                                  int tileCol, int tileRow) {
+  DenseMap<TileID, Operation *> tiles;
+  DenseMap<Operation *, SmallVector<BufferOp, 4>> buffers;
+
+  // Without a device op there is nothing to translate (and nothing to deref).
+  if (module.getOps<DeviceOp>().empty())
+    return module.emitOpError("expected AIE.device operation at toplevel");
+  DeviceOp targetOp = *(module.getOps<DeviceOp>().begin());
+
+  collectTiles(targetOp, tiles);
+  collectBuffers(targetOp, buffers);
+
+  for (auto tile : targetOp.getOps<TileOp>())
+    if (tile.colIndex() == tileCol && tile.rowIndex() == tileRow) {
+      TileID srcCoord = {tile.colIndex(), tile.rowIndex()};
+      const auto &targetModel = getTargetModel(tile);
+
+      // Figure out how much memory we have left for random allocations
+      auto core = tile.getCoreOp();
+      int max = core ? core.getStackSize() : 0;  // the tile may have no core
+      for (auto buf : buffers[tiles[srcCoord]]) {
+        int bufferBaseAddr = getBufferBaseAddress(buf);
+        int numBytes = buf.getAllocationSize();
+        max = std::max(max, bufferBaseAddr + numBytes);
+      }
+      int origin = targetModel.getMemInternalBaseAddress(srcCoord) + max;
+      int length = targetModel.getLocalMemorySize() - max;
+      output << R"THESCRIPT(
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+)THESCRIPT";
+      output << "   data (!RX) : ORIGIN = 0x" << llvm::utohexstr(origin)
+             << ", LENGTH = 0x" << llvm::utohexstr(length);
+      output << R"THESCRIPT(
+}
+ENTRY(_main_init)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the _main_init symbol from me_basic.o has to come at address zero. */
+     *me_basic.o(.text)
+     . = 0x200;
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text)
+  } > program
+  .data : {
+     *(.data*);
+     *(.rodata*)
+  } > data
+)THESCRIPT";
+      auto doBuffer = [&](std::optional<TileID> tile, int offset,
+                          std::string dir) {
+        if (tile) {
+          if (tiles.count(*tile))
+            for (auto buf : buffers[tiles[*tile]])
+              writeLDScriptMap(output, buf, offset);
+        } else {
+          output << "/* No tile with memory exists to the " << dir << ". */\n";
+          output << ". = 0x" << llvm::utohexstr(offset) << ";\n";
+          uint32_t localMemSize = targetModel.getLocalMemorySize();
+          output << ". += 0x" << llvm::utohexstr(localMemSize) << ";\n";
+        }
+      };
+
+      // Stack
+      output << ". = 0x"
+             << llvm::utohexstr(targetModel.getMemInternalBaseAddress(srcCoord))
+             << ";\n";
+      output << "_sp_start_value_DM_stack = .;\n";
+
+      if (auto core = tile.getCoreOp())
+        output << ". += 0x" << llvm::utohexstr(core.getStackSize())
+               << "; /* stack */\n";
+      else
+        output << "/* no stack allocated */\n";
+
+      doBuffer(targetModel.getMemSouth(srcCoord),
+               targetModel.getMemSouthBaseAddress(), std::string("south"));
+      doBuffer(targetModel.getMemWest(srcCoord),
+               targetModel.getMemWestBaseAddress(), std::string("west"));
+      doBuffer(targetModel.getMemNorth(srcCoord),
+               targetModel.getMemNorthBaseAddress(), std::string("north"));
+      doBuffer(targetModel.getMemEast(srcCoord),
+               targetModel.getMemEastBaseAddress(), std::string("east"));
+
+      output << "  .bss : { *(.bss) } > data\n";
+      output << "  .bss.DMb.4 : { *(.bss.DMb.4) } > data\n";
+      output << "}\n";
+      if (auto coreOp = tile.getCoreOp()) {
+        if (auto fileAttr = coreOp.getLinkWith())
+          output << "INPUT(" << fileAttr.value().str() << ")\n";
+
+        output << "PROVIDE(_main = core_" << tile.getCol() << "_"
+               << tile.getRow() << ");\n";
+      }
+    }
+  return success();
+}
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp
new file mode 100644
index 000000000..16ca38041
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aie/AIETargetNPU.cpp
@@ -0,0 +1,153 @@
+//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2023 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <vector>
+
+#include "AIETargets.h"
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "aie/Dialect/AIEX/IR/AIEXDialect.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/TypeSwitch.h"
+#include "llvm/Support/Format.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Tools/mlir-translate/MlirTranslateMain.h"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+using namespace xilinx::AIEX;
+
+namespace {
+
+std::vector<uint32_t> getProlog() {
+  return std::vector<uint32_t>{  // the fixed 17-word prolog
+      0x00000011, 0x01000405, 0x01000100, 0x0B590100, 0x000055FF, 0x00000001,
+      0x00000010, 0x314E5A5F, 0x635F5F31, 0x676E696C, 0x39354E5F, 0x6E693131,
+      0x5F727473, 0x64726F77, 0x00004573, 0x07BD9630, 0x000055FF};
+}
+
+// Grows `instructions` by `tailSize` zero-filled words and returns a mutable
+// view of exactly those new words. Example: with instructions = {3,4,5} and
+// tailSize = 2, instructions becomes {3,4,5,0,0} and the returned view covers
+// the trailing {0,0}. The view points into the vector's storage, so it must
+// not be used after the vector is resized again.
+llvm::MutableArrayRef<uint32_t> reserveAndGetTail(
+    std::vector<uint32_t> &instructions, uint64_t tailSize) {
+  const size_t firstNewWord = instructions.size();
+  instructions.resize(firstNewWord + tailSize, 0);
+
+  uint32_t *tailBegin = instructions.data() + firstNewWord;
+  return llvm::MutableArrayRef<uint32_t>(tailBegin, tailSize);
+}
+
+// Appends the two-word encoding of `op` (opcode 3) to `instructions`.
+void appendSync(std::vector<uint32_t> &instructions, NpuSyncOp op) {
+  constexpr uint32_t kSyncOpCode = 3;
+  auto tail = reserveAndGetTail(instructions, 2);
+
+  // Word 0: opcode | column | row | direction bit.
+  tail[0] = ((kSyncOpCode & 0xff) << 24) | ((op.getColumn() & 0xff) << 16) |
+            ((op.getRow() & 0xff) << 8) | (op.getDirection() & 0x1);
+  // Word 1: channel | columnNum | rowNum; the low byte stays zero.
+  tail[1] = ((op.getChannel() & 0xff) << 24) |
+            ((op.getColumnNum() & 0xff) << 16) |
+            ((op.getRowNum() & 0xff) << 8);
+}
+
+// Appends the three-word encoding of `op` (opcode 2): header, address, value.
+void appendWrite32(std::vector<uint32_t> &instructions, NpuWrite32Op op) {
+  constexpr uint32_t kWrite32OpCode = 2;
+  auto tail = reserveAndGetTail(instructions, 3);
+
+  // Header word: opcode | column | row.
+  tail[0] = ((kWrite32OpCode & 0xff) << 24) |
+            ((op.getColumn() & 0xff) << 16) | ((op.getRow() & 0xff) << 8);
+
+  tail[1] = op.getAddress();
+  tail[2] = op.getValue();
+}
+
+// Appends the ten-word encoding of `op` (opcode 6) to `instructions`.
+void appendWriteBdShimTile(std::vector<uint32_t> &instructions,
+                           NpuWriteBdExShimTileOp op) {
+  auto words = reserveAndGetTail(instructions, 10);
+  uint32_t opCode = 6;
+  words[0] |= (opCode & 0xff) << 24;
+  words[0] |= (op.getColumn() & 0xff) << 16;
+  words[0] |= (op.getColumnNum() & 0xff) << 8;
+  words[0] |= (op.getDdrId() & 0xf) << 4;
+  words[0] |= (op.getBdId() & 0xf);
+
+  // TODO: Address Incr
+  // words[1] = ...
+
+  words[2] = op.getBufferLength();
+  words[3] = op.getBufferOffset();
+
+  // En Packet , OoO BD ID , Packet ID , Packet Type
+  words[4] |= (op.getEnablePacket() & 0x1) << 30;
+  words[4] |= (op.getOutOfOrderId() & 0x3f) << 24;
+  words[4] |= (op.getPacketId() & 0x1f) << 19;
+  words[4] |= (op.getPacketType() & 0x7) << 16;
+
+  // TODO: Secure Access
+  words[5] |= (op.getD0Size() & 0x3ff) << 20;
+  words[5] |= op.getD0Stride() & 0xfffff;
+
+  words[6] = 0x80000000;  // burst length;
+  words[6] |= (op.getD1Size() & 0x3ff) << 20;
+  words[6] |= op.getD1Stride() & 0xfffff;
+
+  // TODO: SIMID, AxCache, AXQoS
+  words[7] = op.getD2Stride() & 0xfffff;
+
+  words[8] |= (op.getIterationCurrent() & 0x3f) << 26;
+  words[8] |= (op.getIterationSize() & 0x3f) << 20;
+  words[8] |= op.getIterationStride() & 0xfffff;
+
+  // TODO: TLAST Suppress
+  words[9] |= (op.getNextBd() & 0xf) << 27;
+  words[9] |= (op.getUseNextBd() & 0x1) << 26;
+  words[9] |= (op.getValidBd() & 0x1) << 25;
+  words[9] |= (op.getLockRelVal() & 0x7f) << 18;  // 7-bit field, bits 24:18
+  words[9] |= (op.getLockRelId() & 0xf) << 13;
+  words[9] |= (op.getLockAcqEnable() & 0x1) << 12;
+  words[9] |= (op.getLockAcqVal() & 0x7f) << 5;  // 7-bit field, bits 11:5
+  words[9] |= op.getLockAcqId() & 0xf;
+}
+
+}  // namespace
+
+// Builds the instruction word stream: the prolog, then the encoding of every
+// NpuSyncOp / NpuWrite32Op / NpuWriteBdExShimTileOp found, in order, in the
+// entry block of each defined func.func of the first device op in `module`.
+std::vector<uint32_t> xilinx::AIE::AIETranslateToNPU(ModuleOp module) {
+  std::vector<uint32_t> words = getProlog();
+  DeviceOp device = *module.getOps<DeviceOp>().begin();
+  for (func::FuncOp fn : device.getOps<func::FuncOp>()) {
+    if (fn.isDeclaration()) continue;
+    for (Operation &inst : fn.getRegion().front()) {
+      if (auto sync = llvm::dyn_cast<NpuSyncOp>(&inst))
+        appendSync(words, sync);
+      else if (auto write32 = llvm::dyn_cast<NpuWrite32Op>(&inst))
+        appendWrite32(words, write32);
+      else if (auto writeBd = llvm::dyn_cast<NpuWriteBdExShimTileOp>(&inst))
+        appendWriteBdShimTile(words, writeBd);
+    }
+  }
+  return words;
+}
+
+LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module,
+                                             raw_ostream &output) {
+  for (uint32_t word : AIETranslateToNPU(module))
+    output << llvm::format("%08X\n", word);  // one 8-digit hex word per line
+  return success();
+}
diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargets.h b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h
new file mode 100644
index 000000000..f1ef5bf7e
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h
@@ -0,0 +1,36 @@
+//===- AIETargets.h ---------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AIE_TARGETS_AIETARGETS_H
+#define AIE_TARGETS_AIETARGETS_H
+
+#include <cstdint>
+#include <vector>
+
+#include "llvm/Support/raw_ostream.h"
+#include "mlir/IR/BuiltinOps.h"
+#include "mlir/Support/LogicalResult.h"
+
+namespace xilinx::AIE {
+// Translation entry points; <cstdint>/<vector> are included for the NPU API.
+mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module,
+                                      llvm::raw_ostream &output);
+std::vector<uint32_t> AIETranslateToNPU(mlir::ModuleOp);
+mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module,
+                                           llvm::raw_ostream &output,
+                                           int tileCol, int tileRow);
+mlir::LogicalResult AIETranslateToBCF(mlir::ModuleOp module,
+                                      llvm::raw_ostream &output, int tileCol,
+                                      int tileRow);
+mlir::LogicalResult AIETranslateToCDODirect(
+    mlir::ModuleOp m, llvm::StringRef workDirPath, bool bigEndian = false,
+    bool emitUnified = false, bool cdoDebug = false, bool aieSim = false,
+    bool xaieDebug = false, bool enableCores = true);
+}  // namespace xilinx::AIE
+
+#endif
diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
index 80b37af49..9b63b4c9a 100644
--- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt
@@ -364,10 +364,10 @@ iree_cc_library(
     AIETargets
   SRCS
     "XCLBinGen.cpp"
-    "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetBCF.cpp"
-    "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetLdScript.cpp"
-    "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetNPU.cpp"
-    "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetCDODirect.cpp"
+    "AIETargetBCF.cpp"
+    "AIETargetLdScript.cpp"
+    "AIETargetNPU.cpp"
+    "AIETargetCDODirect.cpp"
   DEPS
     ::AIEDialectIR
     ::AIEDialectIR
diff --git a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp
index 455b995b1..5ed17737d 100644
--- a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp
+++ b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp
@@ -15,13 +15,12 @@
 #include <unordered_map>
 #include <utility>
 
+#include "AIETargets.h"
+#include "Passes.h"
 #include "aie/AIEAssignBufferAddressesBasic.h"
 #include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h"
-#include "aie/Dialect/AIE/Transforms/AIEPasses.h"
 #include "aie/Dialect/AIEVec/Pipelines/Passes.h"
-#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h"
 #include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h"
-#include "aie/Targets/AIETargets.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/JSON.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -34,8 +33,8 @@
 #include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h"
 #include "mlir/Conversion/MathToLLVM/MathToLLVM.h"
 #include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h"
-#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h"
+#include "mlir/Dialect/Linalg/Passes.h"
 #include "mlir/Dialect/MemRef/Transforms/Passes.h"
 #include "mlir/IR/MLIRContext.h"
 #include "mlir/Pass/PassManager.h"
@@ -174,16 +173,6 @@ static std::string getUUIDString() {
 #endif
   return val;
 }
-static void addAIELoweringPasses(OpPassManager &pm) {
-  pm.addPass(createLowerAffinePass());
-  pm.addPass(AIE::createAIECanonicalizeDevicePass());
-  OpPassManager &devicePM = pm.nest<AIE::DeviceOp>();
-  devicePM.addPass(AIE::createAIEAssignLockIDsPass());
-  devicePM.addPass(AIE::createAIEAssignBufferDescriptorIDsPass());
-  devicePM.addPass(AIE::createAIEObjectFifoStatefulTransformPass());
-  devicePM.addPass(AIE::createAIEAssignBufferAddressesBasicPass());
-  pm.addPass(createConvertSCFToCFPass());
-}
 
 static void addLowerToLLVMPasses(OpPassManager &pm) {
   pm.addPass(createCanonicalizerPass());
@@ -334,7 +323,7 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp,
                                           XCLBinGenConfig &TK) {
   auto deviceOps = moduleOp.getOps<AIE::DeviceOp>();
   if (!llvm::hasSingleElement(deviceOps))
-    return moduleOp.emitOpError("expected a single device op");
+    return moduleOp.emitOpError(": expected a single device op");
 
   AIE::DeviceOp deviceOp = *deviceOps.begin();
   auto tileOps = deviceOp.getOps<AIE::TileOp>();
@@ -368,14 +357,14 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp,
       if (!bcfOutput) return coreOp.emitOpError(errorMessage);
 
       if (failed(AIE::AIETranslateToBCF(moduleOp, bcfOutput->os(), col, row)))
-        return coreOp.emitOpError("Failed to generate BCF");
+        return coreOp.emitOpError(": Failed to generate BCF");
       bcfOutput->keep();
     }
 
     std::vector<std::string> extractedIncludes;
     {
       auto bcfFileIn = openInputFile(bcfPath, &errorMessage);
-      if (!bcfFileIn) moduleOp.emitOpError(errorMessage);
+      if (!bcfFileIn) return moduleOp.emitOpError(errorMessage);
 
       std::string bcfFile = std::string(bcfFileIn->getBuffer());
       std::regex r("_include _file (.*)");
@@ -395,8 +384,10 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp,
     for (const auto &inc : extractedIncludes) flags.push_back(inc);
     auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str());
     chessArgs_.insert(chessArgs_.end(), flags.begin(), flags.end());
+    if (!sys::fs::exists(chessExe))
+      return moduleOp.emitOpError(": chess can't be found");
     if (runTool(chessExe, chessArgs_, TK.Verbose) != 0)
-      coreOp.emitOpError("Failed to link with xbridge");
+      return coreOp.emitOpError(": Failed to link with xbridge");
   }
   return success();
 }
@@ -404,71 +395,38 @@ static LogicalResult generateCoreElfFiles(ModuleOp moduleOp,
 static LogicalResult generateCDO(MLIRContext *context, ModuleOp moduleOp,
                                  XCLBinGenConfig &TK) {
   ModuleOp copy = moduleOp.clone();
-  std::string errorMessage;
-  // This corresponds to `process_host_cgen`, which is listed as host
-  // compilation in aiecc.py... not sure we need this.
-  PassManager passManager(context, ModuleOp::getOperationName());
-  applyConfigToPassManager(TK, passManager);
-
-  passManager.addNestedPass<AIE::DeviceOp>(AIE::createAIEPathfinderPass());
-  if (failed(passManager.run(copy)))
-    return moduleOp.emitOpError(
-        "failed to run passes to prepare of XCLBin generation");
-
   if (failed(AIE::AIETranslateToCDODirect(copy, TK.TempDir)))
-    return moduleOp.emitOpError("failed to emit CDO");
-
+    return moduleOp.emitOpError(": failed to emit CDO");
   copy->erase();
   return success();
 }
 
 static json::Object makeKernelJSON(std::string name, std::string id,
-                                   std::string instance) {
+                                   std::string instance, int numArgs) {
+  json::Array args{json::Object{{"name", "instr"},
+                                {"memory-connection", "SRAM"},
+                                {"address-qualifier", "GLOBAL"},
+                                {"type", "char *"},
+                                {"offset", "0x00"}},
+                   json::Object{{"name", "ninstr"},
+                                {"address-qualifier", "SCALAR"},
+                                {"type", "uint64_t"},
+                                {"offset", "0x08"}}};
+  for (int arg = 0; arg < numArgs; ++arg) {
+    args.push_back(json::Object{{"name", "bo" + std::to_string(arg)},
+                                {"memory-connection", "HOST"},
+                                {"address-qualifier", "GLOBAL"},
+                                {"type", "char *"},
+                                {"offset", std::to_string(0x10 + 0x8 * arg)}});
+  }
+
   return json::Object{
       {"name", name},
       {"type", "dpu"},
       {"extended-data",
        json::Object{
            {"subtype", "DPU"}, {"functional", "1"}, {"dpu_kernel_id", id}}},
-      {"arguments", json::Array{json::Object{{"name", "instr"},
-                                             {"memory-connection", "SRAM"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x00"}},
-                                json::Object{{"name", "ninstr"},
-                                             {"address-qualifier", "SCALAR"},
-                                             {"type", "uint64_t"},
-                                             {"offset", "0x08"}},
-                                json::Object{{"name", "bo0"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x10"}},
-                                json::Object{{"name", "bo1"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x18"}},
-                                json::Object{{"name", "bo2"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x20"}},
-                                json::Object{{"name", "bo3"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x28"}},
-                                json::Object{{"name", "bo4"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x30"}},
-                                json::Object{{"name", "bo5"},
-                                             {"memory-connection", "HOST"},
-                                             {"address-qualifier", "GLOBAL"},
-                                             {"type", "char *"},
-                                             {"offset", "0x38"}}}},
+      {"arguments", std::move(args)},
       {"instances", json::Array{json::Object{{"name", instance}}}}};
 }
 
@@ -541,7 +499,8 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
                   "type": "PRIMARY",
                   "pdi_id": "0x01",
                   "dpu_kernel_ids": [
-                    "0x901"
+                    ")" + TK.XCLBinKernelID +
+                                          R"("
                   ],
                   "pre_cdo_groups": [
                     "0xC1"
@@ -564,13 +523,24 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
     auto kernelsJsonOut = openOutputFile(kernelsJsonFile, &errorMessage);
     if (!kernelsJsonOut) return moduleOp.emitOpError(errorMessage);
 
+    // TODO(max): derive the arg count from the dispatch rather than from this
+    // func (which will eventually disappear).
+    std::optional<int> numArgs;
+    moduleOp.walk([&numArgs](func::FuncOp sequenceFunc) {
+      if (sequenceFunc.getName() == "sequence")
+        numArgs = sequenceFunc.getArgumentTypes().size();
+    });
+    if (!numArgs)
+      return moduleOp.emitOpError(
+          "Couldn't find func.func @sequence to count args");
+
     json::Object kernels_data{
         {"ps-kernels",
          json::Object{
              {"kernels",
               json::Array{// TODO: Support for multiple kernels
                           makeKernelJSON(TK.XCLBinKernelName, TK.XCLBinKernelID,
-                                         TK.XCLBinInstanceName)}}}}};
+                                         TK.XCLBinInstanceName, *numArgs)}}}}};
     kernelsJsonOut->os() << formatv("{0:2}",
                                     json::Value(std::move(kernels_data)));
     kernelsJsonOut->keep();
@@ -612,9 +582,9 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
 
     if (auto bootgen = sys::findProgramByName("bootgen")) {
       if (runTool(*bootgen, flags, TK.Verbose) != 0)
-        return moduleOp.emitOpError("failed to execute bootgen");
+        return moduleOp.emitOpError(": failed to execute bootgen");
     } else {
-      return moduleOp.emitOpError("could not find bootgen");
+      return moduleOp.emitOpError(": could not find bootgen");
     }
   }
 
@@ -636,9 +606,9 @@ static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp,
 
     if (auto xclbinutil = sys::findProgramByName("xclbinutil")) {
       if (runTool(*xclbinutil, flags, TK.Verbose) != 0)
-        return moduleOp.emitOpError("failed to execute xclbinutil");
+        return moduleOp.emitOpError(": failed to execute xclbinutil");
     } else {
-      return moduleOp.emitOpError("could not find xclbinutil");
+      return moduleOp.emitOpError(": could not find xclbinutil");
     }
   }
   return success();
@@ -735,7 +705,7 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp,
     }();
 
     if (failed(vectorToAIEVecOptions.parseFromString(optionsString))) {
-      return moduleOp.emitOpError("Failed to parse options from '")
+      return moduleOp.emitOpError(": Failed to parse options from '")
              << optionsString
              << "': Failed to construct ConvertVectorToAIEVecOptions.";
     }
@@ -753,7 +723,7 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp,
 
   ModuleOp copy = moduleOp.clone();
   if (failed(pm.run(copy)))
-    return moduleOp.emitOpError("Failed to lower to LLVM");
+    return moduleOp.emitOpError(": Failed to lower to LLVM");
 
   SmallString<64> LLVMIRFile(TK.TempDir);
   sys::path::append(LLVMIRFile, "input.ll");
@@ -761,72 +731,75 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp,
   llvm::LLVMContext llvmContext;
   auto llvmModule = translateModuleToLLVMIR(copy, llvmContext);
   if (!llvmModule)
-    return moduleOp.emitOpError("Failed to translate module to LLVMIR");
+    return moduleOp.emitOpError(": Failed to translate module to LLVMIR");
 
+  std::string llvmirString;
   std::string errorMessage;
   {
+    raw_string_ostream llvmirStream(llvmirString);
+    llvmModule->print(llvmirStream, nullptr);
+    llvmirString = chesshack(llvmirString);
     auto output = openOutputFile(LLVMIRFile, &errorMessage);
     if (!output) return moduleOp.emitOpError(errorMessage);
-    llvmModule->print(output->os(), nullptr);
+    output->os() << llvmirString;
     output->keep();
   }
 
   SmallString<64> chessExe(TK.AIEToolsDir);
   sys::path::append(chessExe, "bin", "unwrapped", "lnx64.o", "xchesscc");
-  SmallString<64> chessworkDir(TK.TempDir);
-  sys::path::append(chessworkDir, "chesswork");
-  SmallString<64> chessIntrinsicsLL(TK.InstallDir);
+  SmallString<64> chessIntrinsicsLL(TK.TempDir);
   sys::path::append(chessIntrinsicsLL, "chess_intrinsic_wrapper.ll");
   {
-    auto chessIntrinsicWrapperLlFile = openOutputFile(chessIntrinsicsLL);
-    if (!chessIntrinsicWrapperLlFile) moduleOp.emitOpError(errorMessage);
+    auto chessIntrinsicWrapperLlFile =
+        openOutputFile(chessIntrinsicsLL, &errorMessage);
+    if (!chessIntrinsicWrapperLlFile) return moduleOp.emitOpError(errorMessage);
 
     chessIntrinsicWrapperLlFile->os() << _CHESS_INTRINSIC_WRAPPER_LL;
     chessIntrinsicWrapperLlFile->keep();
   }
 
-  std::string llvmirString;
-  {
-    raw_string_ostream llvmirStream(llvmirString);
-    llvmModule->print(llvmirStream, nullptr);
-  }
-
   SmallString<64> chesslinkedFile(TK.TempDir);
   sys::path::append(chesslinkedFile, "input.chesslinked.ll");
   SmallString<64> chessLlvmLinkBin(TK.AIEToolsDir);
   sys::path::append(chessLlvmLinkBin, "tps", "lnx64", "target");
   sys::path::append(chessLlvmLinkBin, "bin", "LNa64bin", "chess-llvm-link");
+  if (!sys::fs::exists(chessLlvmLinkBin))
+    return moduleOp.emitOpError(": chess-llvm-link can't be found");
 
   if (runTool(chessLlvmLinkBin,
               {std::string(LLVMIRFile), std::string(chessIntrinsicsLL),
                "--opaque-pointers=1", "-S", "-o", std::string(chesslinkedFile)},
               TK.Verbose) != 0)
-    moduleOp.emitOpError("Couldn't link in the intrinsics");
+    return moduleOp.emitOpError(": Couldn't link in the intrinsics");
 
   std::string mungedLLVMIR;
   {
     auto chesslinkedIn = openInputFile(chesslinkedFile, &errorMessage);
-    if (!chesslinkedIn) moduleOp.emitOpError(errorMessage);
+    if (!chesslinkedIn) return moduleOp.emitOpError(errorMessage);
 
     mungedLLVMIR = std::string(chesslinkedIn->getBuffer());
     mungedLLVMIR = chesshack(mungedLLVMIR);
   }
   {
-    auto chesslinkedOut = openOutputFile(chesslinkedFile);
-    if (!chesslinkedOut) moduleOp.emitOpError(errorMessage);
+    auto chesslinkedOut = openOutputFile(chesslinkedFile, &errorMessage);
+    if (!chesslinkedOut) return moduleOp.emitOpError(errorMessage);
 
     chesslinkedOut->os() << mungedLLVMIR;
     chesslinkedOut->keep();
   }
 
+  SmallString<64> chessworkDir(TK.TempDir);
+  sys::path::append(chessworkDir, "chesswork");
   auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str());
   chessArgs_.push_back("-c");
   chessArgs_.push_back(std::string(chesslinkedFile));
   chessArgs_.push_back("-o");
   chessArgs_.push_back(std::string(outputFile));
+  if (!sys::fs::exists(chessExe))
+    return moduleOp.emitOpError(": chess can't be found");
 
   if (runTool(chessExe, chessArgs_, TK.Verbose) != 0)
-    return moduleOp.emitOpError("Failed to assemble with chess");
+    return moduleOp.emitOpError(": Failed to assemble with chess");
   copy->erase();
   return success();
 }
@@ -834,21 +807,8 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp,
 LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp,
                                  XCLBinGenConfig &TK, StringRef OutputNPU,
                                  StringRef OutputXCLBin) {
-  if (failed(xilinx::findVitis(TK))) moduleOp.emitOpError("VITIS not found");
-
-  PassManager pm(ctx, moduleOp.getOperationName());
-  applyConfigToPassManager(TK, pm);
-
-  addAIELoweringPasses(pm);
-
-  if (TK.Verbose) {
-    llvm::outs() << "Running: ";
-    pm.printAsTextualPipeline(llvm::outs());
-    llvm::outs() << "\n";
-  }
-
-  if (failed(pm.run(moduleOp)))
-    return moduleOp.emitOpError("AIE lowering pipline failed");
+  if (failed(xilinx::findVitis(TK)))
+    return moduleOp.emitOpError(": VITIS not found");
 
   TK.TargetArch = StringRef(TK.TargetArch).trim();
 
@@ -865,17 +825,14 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp,
     pm.addNestedPass<AIE::DeviceOp>(AIEX::createAIEDmaToNpuPass());
     ModuleOp copy = moduleOp.clone();
     if (failed(pm.run(copy)))
-      return moduleOp.emitOpError("NPU Instruction pipeline failed");
+      return moduleOp.emitOpError(": NPU Instruction pipeline failed");
 
     std::string errorMessage;
     auto output = openOutputFile(OutputNPU, &errorMessage);
-    if (!output) {
-      llvm::errs() << errorMessage << "\n";
-      return moduleOp.emitOpError("");
-    }
+    if (!output) return moduleOp.emitOpError(errorMessage);
 
     if (failed(AIE::AIETranslateToNPU(copy, output->os())))
-      return moduleOp.emitOpError("NPU Instruction translation failed");
+      return moduleOp.emitOpError(": NPU Instruction translation failed");
 
     output->keep();
     copy->erase();
@@ -884,16 +841,16 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp,
   SmallString<64> object(TK.TempDir);
   sys::path::append(object, "input.o");
   if (failed(generateObject(ctx, moduleOp, TK, std::string(object))))
-    return moduleOp.emitOpError("Failed to generate object");
+    return moduleOp.emitOpError(": Failed to generate object");
 
   if (failed(generateCoreElfFiles(moduleOp, object, TK)))
-    return moduleOp.emitOpError("Failed to generate core ELF file(s)");
+    return moduleOp.emitOpError(": Failed to generate core ELF file(s)");
 
   if (failed(generateCDO(ctx, moduleOp, TK)))
-    return moduleOp.emitOpError("Failed to generate CDO");
+    return moduleOp.emitOpError(": Failed to generate CDO");
 
   if (failed(generateXCLBin(ctx, moduleOp, TK, OutputXCLBin)))
-    return moduleOp.emitOpError("Failed to generate XCLBin");
+    return moduleOp.emitOpError(": Failed to generate XCLBin");
 
   return success();
 }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp
index 5212a4ae3..c1097fa89 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp
@@ -206,12 +206,6 @@ LogicalResult AIETargetDirectBackend::serializeExecutable(
   if (failed(maybeWorkDir)) return failure();
   auto workDir = maybeWorkDir.value();
 
-  xilinx::XCLBinGenConfig TK;
-  TK.TempDir = workDir.str();
-  TK.TargetArch = "AIE2";
-  TK.UseChess = true;
-  TK.Verbose = true;
-
   SmallVector<std::string> entryPointNames;
   for (auto exportOp : variantOp.getExportOps()) {
     entryPointNames.emplace_back(exportOp.getSymName().substr(0, 48));
@@ -221,9 +215,15 @@ LogicalResult AIETargetDirectBackend::serializeExecutable(
     return moduleOp.emitOpError("Expected a single entry point");
   }
 
+  xilinx::XCLBinGenConfig TK;
+  TK.TempDir = workDir.str();
+  TK.TargetArch = "AIE2";
+  TK.UseChess = true;
+  TK.Verbose = true;
   TK.XCLBinKernelName = entryPointNames[0];
   TK.XCLBinKernelID = "0x101";
   TK.XCLBinInstanceName = "FOO";
+
   SmallString<128> xclbinPath(workDir);
   llvm::sys::path::append(xclbinPath, basename + ".xclbin");
   SmallString<128> npuInstPath(workDir);
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir
deleted file mode 100644
index c4469b09e..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir
+++ /dev/null
@@ -1,58 +0,0 @@
-// RUN: not iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct %s 2>&1 | FileCheck %s
-
-// CHECK: %switchbox_0_0 = aie.switchbox
-// CHECK: aie.dma_start
-// CHECK: aie.dma_bd
-// CHECK: unimplemented AIETargetDirectBackend::serializeExecutable
-module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} {
-  hal.executable private @dummy1 {
-    hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
-      hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} {
-      ^bb0(%arg0: !hal.device):
-        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
-        hal.return %x, %y, %z : index, index, index
-      }
-      builtin.module {
-        aie.device(npu1) {
-          %tile_0_0 = aie.tile(0, 0)
-          %tile_0_2 = aie.tile(0, 2)
-          aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
-          aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<64x64xi32>>
-          aie.objectfifo.link [@in] -> [@out]()
-          %core_0_2 = aie.core(%tile_0_2) {
-            %c0 = arith.constant 0 : index
-            %0 = memref.alloc() : memref<10xf32>
-            %1 = memref.load %0[%c0] : memref<10xf32>
-            memref.store %1, %0[%c0] : memref<10xf32>
-            aie.end
-          }
-          func.func @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
-            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
-            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 64, 64, 1][1, 1, 64]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
-            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-            return
-          }
-        }
-      }
-    }
-  }
-  util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} {
-    // this is all gibberish just to hit serializeExecutable
-    %c0 = arith.constant 0 : index
-    %c1 = arith.constant 1 : index
-    %element_type_i8 = hal.element_type<i8> : i32
-    %dense_row_major = hal.encoding_type<dense_row_major> : i32
-    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major)
-    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource<external>{%c1}
-    %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c1} => !stream.timepoint
-
-    %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c1}) {
-      stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 {
-        ro %arg2[%c0 for %c1] : !stream.resource<external>{%c1}
-      }
-    } => !stream.timepoint
-    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c1}
-    %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource<external>{%c1} -> !hal.buffer_view
-    util.return %4 : !hal.buffer_view
-  }
-}
\ No newline at end of file
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir
deleted file mode 100644
index f9c8b4dba..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.mlir
+++ /dev/null
@@ -1,59 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @zero_scalar_f32(memref<32xf32>)
-    func.func private @zero_vectorized_f32(memref<32xf32>)
-    func.func private @matvec_scalar_bf16_f32(memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>)
-    func.func private @matvec_vectorized_bf16_f32(memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_2_0 = aie.tile(2, 0)
-    %tile_3_0 = aie.tile(3, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_1_1 = aie.tile(1, 1)
-    %tile_2_1 = aie.tile(2, 1)
-    %tile_3_1 = aie.tile(3, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_1_2 = aie.tile(1, 2)
-    %tile_2_2 = aie.tile(2, 2)
-    %tile_3_2 = aie.tile(3, 2)
-    aie.objectfifo @memA0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @inA0(%tile_0_1 toStream [<size = 32, stride = 32>, <size = 32, stride = 1>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32x32xbf16>>
-    aie.objectfifo.link [@memA0] -> [@inA0]()
-    aie.objectfifo @inB(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32xbf16>>
-    aie.objectfifo @outC0(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32xf32>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @outC0(Produce, 1) : !aie.objectfifosubview<memref<32xf32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32xf32>> -> memref<32xf32>
-        func.call @zero_vectorized_f32(%1) : (memref<32xf32>) -> ()
-        %c0_0 = arith.constant 0 : index
-        %c9 = arith.constant 9 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c9 step %c1_1 {
-          %2 = aie.objectfifo.acquire @inA0(Consume, 1) : !aie.objectfifosubview<memref<32x32xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x32xbf16>> -> memref<32x32xbf16>
-          %4 = aie.objectfifo.acquire @inB(Consume, 1) : !aie.objectfifosubview<memref<32xbf16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32xbf16>> -> memref<32xbf16>
-          func.call @matvec_vectorized_bf16_f32(%3, %5, %1) : (memref<32x32xbf16>, memref<32xbf16>, memref<32xf32>) -> ()
-          aie.objectfifo.release @inA0(Consume, 1)
-          aie.objectfifo.release @inB(Consume, 1)
-        }
-        aie.objectfifo.release @outC0(Produce, 1)
-      }
-      aie.end
-    } {link_with = "mv.o"}
-    func.func @sequence(%arg0: memref<41472xi32>, %arg1: memref<144xi32>, %arg2: memref<288xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][9, 1, 1, 144][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<144xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][9, 9, 32, 16][4608, 16, 144]) {id = 1 : i64, metadata = @memA0} : memref<41472xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 288][0, 0, 0]) {id = 0 : i64, metadata = @outC0} : memref<288xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py
new file mode 100644
index 000000000..9162e1346
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_matrix_vector.py
@@ -0,0 +1,189 @@
+from aie.dialects import arith, linalg
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_matmul(M, K, n_cores):
+    m = 32
+    k = 32
+    word_size_in = 4
+    word_size_out = 4
+
+    A_sz_in_i32s = M * K * word_size_in // 4
+    B_sz_in_i32s = K * word_size_in // 4
+    C_sz_in_bytes = M * word_size_out
+    C_sz_in_i32s = C_sz_in_bytes // 4
+    C_sz_div_n_cores_in_i32s = C_sz_in_i32s // n_cores
+
+    M_div_m = M // m
+    M_div_m_div_n_cores = M // (m * n_cores)
+    K_div_k = K // k
+
+    K_in_i32s = K * word_size_in // 4
+    k_in_i32s = k * word_size_in // 4
+    m_in_i32s = m * word_size_in // 4
+    m_x_k_in_i32s = m * k * word_size_in // 4
+    m_x_K_in_i32s = m * K * word_size_in // 4
+
+    vectorized = True
+
+    @device(AIEDevice.npu1_4col)
+    def device_body():
+        memRef_inA_ty = T.memref(m * k, T.f32())
+        memRef_inB_ty = T.memref(k, T.f32())
+        memRef_outC_ty = T.memref(m, T.f32())
+        memRef_A_ty = T.memref(m, k, T.f32())
+
+        # Tile declarations
+        ShimTile0 = tile(0, 0)
+        ShimTile1 = tile(1, 0)
+        ShimTile2 = tile(2, 0)
+        ShimTile3 = tile(3, 0)
+        ShimTiles = [ShimTile0, ShimTile1, ShimTile2, ShimTile3]
+        MemTile0 = tile(0, 1)
+        MemTile1 = tile(1, 1)
+        MemTile2 = tile(2, 1)
+        MemTile3 = tile(3, 1)
+        MemTiles = [MemTile0, MemTile1, MemTile2, MemTile3]
+        ComputeTile0 = tile(0, 2)
+        ComputeTile1 = tile(1, 2)
+        ComputeTile2 = tile(2, 2)
+        ComputeTile3 = tile(3, 2)
+        cores = [ComputeTile0, ComputeTile1, ComputeTile2, ComputeTile3]
+        memA_fifo_names = ["memA0", "memA1", "memA2", "memA3"]
+        memA_fifos = {}
+        inA_fifo_names = ["inA0", "inA1", "inA2", "inA3"]
+        inA_fifos = {}
+        inB_fifo_names = ["inB"]
+        inB_fifos = {}
+        outC_fifo_names = ["outC0", "outC1", "outC2", "outC3"]
+        outC_fifos = {}
+
+        # AIE-array data movement with object fifos
+        # Input A
+        for i in range(n_cores):
+            memA_fifos[memA_fifo_names[i]] = object_fifo(
+                memA_fifo_names[i],
+                ShimTiles[i],
+                MemTiles[i],
+                2,
+                memRef_inA_ty,
+            )
+            inA_fifos[inA_fifo_names[i]] = object_fifo(
+                inA_fifo_names[i],
+                MemTiles[i],
+                cores[i],
+                2,
+                memRef_A_ty,
+                [
+                    (m, k),
+                    (k, 1),
+                ],
+            )
+            object_fifo_link(
+                memA_fifos[memA_fifo_names[i]], inA_fifos[inA_fifo_names[i]]
+            )
+
+        # Input B
+        inB_fifos[inB_fifo_names[0]] = object_fifo(
+            inB_fifo_names[0],
+            ShimTiles[1 % n_cores],
+            cores[0:n_cores],
+            2,
+            memRef_inB_ty,
+        )
+
+        # Output C
+        for i in range(n_cores):
+            outC_fifos[outC_fifo_names[i]] = object_fifo(
+                outC_fifo_names[i],
+                cores[i],
+                ShimTiles[i],
+                2,
+                memRef_outC_ty,
+            )
+
+        # Set up compute tiles
+        for i in range(n_cores):
+            # Compute tile i
+            @core(cores[i])
+            def core_body():
+                cf0 = arith.constant(T.f32(), 0.0)
+                for _ in for_(0xFFFFFFFF):
+                    elem_out = outC_fifos[outC_fifo_names[i]].acquire(
+                        ObjectFifoPort.Produce,
+                        1,
+                    )
+                    linalg.fill(cf0, outs=[elem_out])
+
+                    for _ in for_(K_div_k):
+                        elem_in_a = inA_fifos[inA_fifo_names[i]].acquire(
+                            ObjectFifoPort.Consume,
+                            1,
+                        )
+                        elem_in_b = inB_fifos[inB_fifo_names[0]].acquire(
+                            ObjectFifoPort.Consume,
+                            1,
+                        )
+                        linalg.matvec(elem_in_a, elem_in_b, outs=[elem_out])
+                        inA_fifos[inA_fifo_names[i]].release(
+                            ObjectFifoPort.Consume,
+                            1,
+                        )
+                        inB_fifos[inB_fifo_names[0]].release(
+                            ObjectFifoPort.Consume,
+                            1,
+                        )
+                        yield_([])
+
+                    outC_fifos[outC_fifo_names[i]].release(
+                        ObjectFifoPort.Produce,
+                        1,
+                    )
+                    yield_([])
+
+        # To/from AIE-array data movement
+
+        @FuncOp.from_py_func(
+            T.memref(A_sz_in_i32s, T.i32()),
+            T.memref(B_sz_in_i32s, T.i32()),
+            T.memref(C_sz_in_i32s, T.i32()),
+        )
+        def sequence(A, B, C):
+            npu_dma_memcpy_nd(
+                metadata=inB_fifo_names[0],
+                bd_id=2,
+                mem=B,
+                sizes=[M_div_m_div_n_cores, 1, 1, K_in_i32s],
+                strides=[0, 0, 0],
+            )
+            for i in range(n_cores):
+                A_offset = i * M_div_m_div_n_cores * m * K * word_size_in // 4
+                C_offset = i * M_div_m_div_n_cores * m * word_size_out // 4
+                npu_dma_memcpy_nd(
+                    metadata=memA_fifo_names[i],
+                    bd_id=1,
+                    mem=A,
+                    offsets=[0, 0, 0, A_offset],
+                    sizes=[M_div_m_div_n_cores, K_div_k, m, k_in_i32s],
+                    strides=[m_x_K_in_i32s, k_in_i32s, K_in_i32s],
+                )
+                npu_dma_memcpy_nd(
+                    metadata=outC_fifo_names[i],
+                    bd_id=0,
+                    mem=C,
+                    offsets=[0, 0, 0, C_offset],
+                    sizes=[1, 1, 1, C_sz_div_n_cores_in_i32s],
+                    strides=[0, 0, 0],
+                )
+
+            for i in range(n_cores):
+                npu_sync(column=i, row=0, direction=0, channel=0)
+
+
+def emit_module(M=64, K=64, n_cores=1):
+    with mlir_mod_ctx() as ctx:
+        my_matmul(M, K, n_cores)
+        return str(ctx.module)
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir
index e768e5bcc..30c4a72fd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.mlir
@@ -1,65 +1,86 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
+// RUN: not iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct --iree-hal-dump-executable-intermediates-to=%S/basic_matrix_multiplication_single_core %s 2>&1 | FileCheck %s
 
-module {
-  aie.device(npu1) {
-    func.func private @zero_scalar_bf16(memref<64x64xbf16>)
-    func.func private @zero_bf16(memref<64x64xbf16>)
-    func.func private @matmul_scalar_bf16_bf16(memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>)
-    func.func private @matmul_bf16_bf16(memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo @memA(%tile_0_1 toStream [<size = 16, stride = 256>, <size = 8, stride = 8>, <size = 4, stride = 64>, <size = 8, stride = 1>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA]()
-    aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo @memB(%tile_0_1 toStream [<size = 8, stride = 512>, <size = 16, stride = 4>, <size = 8, stride = 64>, <size = 4, stride = 1>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo.link [@inB] -> [@memB]()
-    aie.objectfifo @memC(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo @outC(%tile_0_1 toStream [<size = 16, stride = 256>, <size = 4, stride = 4>, <size = 16, stride = 16>, <size = 4, stride = 1>], {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<64x64xbf16>>
-    aie.objectfifo.link [@memC] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC(Produce, 1) : !aie.objectfifosubview<memref<64x64xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<64x64xbf16>> -> memref<64x64xbf16>
-          func.call @zero_bf16(%1) : (memref<64x64xbf16>) -> ()
-          %c0_2 = arith.constant 0 : index
-          %c4 = arith.constant 4 : index
-          %c1_3 = arith.constant 1 : index
-          scf.for %arg2 = %c0_2 to %c4 step %c1_3 {
-            %2 = aie.objectfifo.acquire @memA(Consume, 1) : !aie.objectfifosubview<memref<64x64xbf16>>
-            %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<64x64xbf16>> -> memref<64x64xbf16>
-            %4 = aie.objectfifo.acquire @memB(Consume, 1) : !aie.objectfifosubview<memref<64x64xbf16>>
-            %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<64x64xbf16>> -> memref<64x64xbf16>
-            func.call @matmul_bf16_bf16(%3, %5, %1) : (memref<64x64xbf16>, memref<64x64xbf16>, memref<64x64xbf16>) -> ()
-            aie.objectfifo.release @memA(Consume, 1)
-            aie.objectfifo.release @memB(Consume, 1)
+module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} {
+  hal.executable private @dummy1 {
+    hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
+      hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} {  // export entry, ordinal 0; layout declares one read-only storage buffer binding and no push constants
+      ^bb0(%arg0: !hal.device):  // %arg0 is not used by the body
+        %x, %y, %z = flow.dispatch.workgroup_count_from_slice  // three index results, returned as-is below
+        hal.return %x, %y, %z : index, index, index
+      }
+      builtin.module {
+        // this is load bearing...
+        aie.device(npu1_1col) {
+          %tile_0_0 = aie.tile(0, 0)  // producer of @inA/@inB, consumer of @outC (called shim_tile in the sibling .py script)
+          %tile_0_1 = aie.tile(0, 1)  // intermediate hop for all three data paths (called mem_tile in the sibling .py script)
+          %tile_0_2 = aie.tile(0, 2)  // tile that hosts %core_0_2 below
+          aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // A: (0,0) -> (0,1); the "2 : i32" is presumably the FIFO depth - TODO confirm
+          aie.objectfifo @memA(%tile_0_1 toStream [<size = 8, stride = 128>, <size = 4, stride = 8>, <size = 4, stride = 32>, <size = 8, stride = 1>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // A: (0,1) -> (0,2), with a 4-level size/stride list applied on the sending side
+          aie.objectfifo.link [@inA] -> [@memA]()  // chain @inA into @memA
+          aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // B: (0,0) -> (0,1)
+          aie.objectfifo @memB(%tile_0_1 toStream [<size = 4, stride = 256>, <size = 8, stride = 4>, <size = 8, stride = 32>, <size = 4, stride = 1>], {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // B: (0,1) -> (0,2), with its own size/stride list
+          aie.objectfifo.link [@inB] -> [@memB]()  // chain @inB into @memB
+          aie.objectfifo @memC(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // C: (0,2) -> (0,1)
+          aie.objectfifo @outC(%tile_0_1 toStream [<size = 8, stride = 128>, <size = 4, stride = 4>, <size = 8, stride = 16>, <size = 4, stride = 1>], {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x32xf32>>  // C: (0,1) -> (0,0), with a size/stride list applied on the sending side
+          aie.objectfifo.link [@memC] -> [@outC]()  // chain @memC into @outC
+          %core_0_2 = aie.core(%tile_0_2) {  // core program on tile (0, 2): zero a C tile, accumulate A x B into it, hand it back
+            %c0 = arith.constant 0 : index
+            %c4294967295 = arith.constant 4294967295 : index  // 0xFFFFFFFF, upper bound of the outer loop
+            %c1 = arith.constant 1 : index
+            scf.for %arg0 = %c0 to %c4294967295 step %c1 {  // outer loop; %arg0 is unused
+              %c0_0 = arith.constant 0 : index
+              %c1_1 = arith.constant 1 : index
+              %c1_2 = arith.constant 1 : index
+              scf.for %arg1 = %c0_0 to %c1_1 step %c1_2 {  // 0 to 1 step 1: exactly one iteration (one output tile)
+                %0 = aie.objectfifo.acquire @memC(Produce, 1) : !aie.objectfifosubview<memref<32x32xf32>>  // acquire one @memC element to write into
+                %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32x32xf32>> -> memref<32x32xf32>  // %1 is the 32x32 output buffer
+                %cst = arith.constant 0.000000e+00 : f32
+                linalg.fill ins(%cst : f32) outs(%1 : memref<32x32xf32>)  // clear the output buffer before accumulating
+                %c0_3 = arith.constant 0 : index
+                %c1_4 = arith.constant 1 : index
+                %c1_5 = arith.constant 1 : index
+                scf.for %arg2 = %c0_3 to %c1_4 step %c1_5 {  // 0 to 1 step 1: exactly one A/B tile pair per output tile
+                  %2 = aie.objectfifo.acquire @memA(Consume, 1) : !aie.objectfifosubview<memref<32x32xf32>>
+                  %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x32xf32>> -> memref<32x32xf32>  // %3 is the A tile
+                  %4 = aie.objectfifo.acquire @memB(Consume, 1) : !aie.objectfifosubview<memref<32x32xf32>>
+                  %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32x32xf32>> -> memref<32x32xf32>  // %5 is the B tile
+                  linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%3, %5 : memref<32x32xf32>, memref<32x32xf32>) outs(%1 : memref<32x32xf32>)  // matmul of %3 and %5 with %1 as the output operand
+                  aie.objectfifo.release @memA(Consume, 1)  // inputs are released right after the matmul
+                  aie.objectfifo.release @memB(Consume, 1)
+                }
+                aie.objectfifo.release @memC(Produce, 1)  // output released only after the inner loop has finished
+              }
+            }
+            aie.end
+          }
+          func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>) {  // three DMA transfers (ids 0..2) then one sync; %arg0 goes with @inA, %arg1 with @inB, %arg2 with @outC
+            aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 32, 32][1024, 32, 32]) {id = 0 : i64, metadata = @outC} : memref<1024xi32>  // %arg2 <-> @outC; bracket groups are offsets, sizes, strides (same order as the keyword args in the sibling .py)
+            aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 32, 32][0, 32, 32]) {id = 1 : i64, metadata = @inA} : memref<1024xi32>  // %arg0 <-> @inA; first stride is 0
+            aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 32, 32][32, 1024, 32]) {id = 2 : i64, metadata = @inB} : memref<1024xi32>  // %arg1 <-> @inB
+            aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}  // single sync, issued after all three transfers
+            return
           }
-          aie.objectfifo.release @memC(Produce, 1)
         }
       }
-      aie.end
-    } {link_with = "mm.o"}
-    func.func @sequence(%arg0: memref<32768xi32>, %arg1: memref<32768xi32>, %arg2: memref<32768xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][4, 4, 64, 32][8192, 32, 128]) {id = 0 : i64, metadata = @outC} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][4, 4, 64, 32][0, 32, 128]) {id = 1 : i64, metadata = @inA} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 2 : i64, metadata = @inB} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 8192][4, 4, 64, 32][0, 32, 128]) {id = 3 : i64, metadata = @inA} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 4 : i64, metadata = @inB} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 16384][4, 4, 64, 32][0, 32, 128]) {id = 5 : i64, metadata = @inA} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 6 : i64, metadata = @inB} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 24576][4, 4, 64, 32][0, 32, 128]) {id = 7 : i64, metadata = @inA} : memref<32768xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][4, 4, 64, 32][32, 8192, 128]) {id = 8 : i64, metadata = @inB} : memref<32768xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
     }
   }
-}
+  util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} {
+    // Placeholder host function: the shapes and types below deliberately do not agree with each other; it exists only so that compilation reaches serializeExecutable.
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %element_type_i8 = hal.element_type<i8> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major)  // asserts shape [1, 1], unlike the 1024x512 import on the next line
+    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource<external>{%c1}
+    %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c1} => !stream.timepoint  // %c1 is used as the resource size everywhere in this function
 
+    %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c1}) {
+      stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 {  // the only reference to the AIE executable export defined above
+        ro %arg2[%c0 for %c1] : !stream.resource<external>{%c1}
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c1}
+    %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource<external>{%c1} -> !hal.buffer_view  // exported as 1024x1024xi32, which does not match the i8 import
+    util.return %4 : !hal.buffer_view
+  }
+}
\ No newline at end of file
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py
new file mode 100644
index 000000000..4607802c1
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_multiplication_single_core.py
@@ -0,0 +1,183 @@
+from aie.dialects import arith, linalg
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+from aie.dialects.scf import *
+from aie.extras.context import mlir_mod_ctx
+
+
+def my_matmul(M, K, N):  # Emit a one-core AIE design for C (MxN) = A (MxK) x B (KxN) in f32, processed as 32x32 tiles.
+    m = 32  # rows of one A tile and of one C tile
+    k = 32  # columns of one A tile, rows of one B tile
+    n = 32  # columns of one B tile and of one C tile
+    r = 4  # r, s, t: inner block sizes, used only in the (size, stride) lists of memA/memB/outC
+    s = 8
+    t = 4
+    word_size_in = 4  # presumably bytes per input element (f32); "* word_size_in // 4" converts element counts to i32 counts
+    word_size_out = 4  # same, for output elements
+
+    A_sz_in_i32s = M * K * word_size_in // 4  # lengths of the 1-D i32 memrefs that `sequence` takes
+    B_sz_in_i32s = K * N * word_size_in // 4
+    C_sz_in_bytes = M * N * word_size_out
+    C_sz_in_i32s = C_sz_in_bytes // 4
+
+    M_div_m = M // m  # floor division: any remainder rows/columns are not covered
+    K_div_k = K // k
+    N_div_n = N // n
+    tiles = M_div_m * N_div_n  # number of mxn output tiles; trip count of the per-tile loop in core_body
+
+    # Matrix A: MxK, submatrices a: mxk
+    k_in_i32s = k * word_size_in // 4
+    K_in_i32s = K * word_size_in // 4
+
+    # Matrix B: KxN, submatrices b: kxn
+    n_in_i32s = n * word_size_in // 4
+    N_in_i32s = N * word_size_in // 4
+    k_x_N_in_i32s = k * N * word_size_in // 4
+
+    # Output Matrix C: MxN
+    n_in_i32s_out = n * word_size_out // 4
+    N_in_i32s_out = N * word_size_out // 4
+    m_x_N_in_i32s_out = m * N * word_size_out // 4
+
+    @device(AIEDevice.npu1_1col)
+    def device_body():  # everything below is declared inside the npu1_1col device
+        memref_a_ty = T.memref(m, k, T.f32())
+        memref_b_ty = T.memref(k, n, T.f32())
+        memref_c_ty = T.memref(m, n, T.f32())
+
+        # Tile declarations
+        shim_tile = tile(0, 0)
+        mem_tile = tile(0, 1)
+        compute_tile2_col, compute_tile2_row = 0, 2
+        compute_tile2 = tile(compute_tile2_col, compute_tile2_row)
+
+        # AIE-array data movement with object fifos
+        # Input A
+        inA = object_fifo("inA", shim_tile, mem_tile, 2, memref_a_ty)  # shim -> mem tile; the 2 is presumably the FIFO depth - TODO confirm
+        memA = object_fifo(
+            "memA",
+            mem_tile,
+            compute_tile2,
+            2,
+            memref_a_ty,
+            [  # (size, stride) pairs; they appear as `toStream [<size, stride>, ...]` in the checked-in .mlir
+                (m // r, r * k),  # (8, 128) with the constants above
+                (k // s, s),  # (4, 8)
+                (r, k),  # (4, 32)
+                (s, 1),  # (8, 1)
+            ],
+        )
+        object_fifo_link(inA, memA)
+
+        # Input B
+        inB = object_fifo("inB", shim_tile, mem_tile, 2, memref_b_ty)
+        memB = object_fifo(
+            "memB",
+            mem_tile,
+            compute_tile2,
+            2,
+            memref_b_ty,
+            [
+                (k // s, s * n),  # (4, 256) with the constants above
+                (n // t, t),  # (8, 4)
+                (s, n),  # (8, 32)
+                (t, 1),  # (4, 1)
+            ],
+        )
+        object_fifo_link(inB, memB)
+
+        # Output C
+        memC = object_fifo("memC", compute_tile2, mem_tile, 2, memref_c_ty)  # compute tile -> mem tile
+        outC = object_fifo(
+            "outC",
+            mem_tile,
+            shim_tile,
+            2,
+            memref_c_ty,
+            [
+                (m // r, r * n),  # (8, 128) with the constants above
+                (r, t),  # (4, 4)
+                (n // t, r * t),  # (8, 16)
+                (t, 1),  # (4, 1)
+            ],
+        )
+        object_fifo_link(memC, outC)
+
+        # Compute tile 2
+        @core(compute_tile2)
+        def core_body():  # per output tile: zero it, accumulate K_div_k A x B products into it, release it
+            for _ in for_(0xFFFFFFFF):  # outer loop with bound 2**32 - 1
+                for _ in for_(tiles):
+                    elem_out = memC.acquire(ObjectFifoPort.Produce, 1)
+                    cf0 = arith.constant(T.f32(), 0.0)
+                    linalg.fill(cf0, outs=[elem_out])  # clear the output tile before accumulating
+                    for _ in for_(K_div_k):
+                        elem_in_a = memA.acquire(ObjectFifoPort.Consume, 1)
+                        elem_in_b = memB.acquire(ObjectFifoPort.Consume, 1)
+                        linalg.matmul(elem_in_a, elem_in_b, outs=[elem_out])  # elem_out is the output operand on every iteration
+                        memA.release(ObjectFifoPort.Consume, 1)
+                        memB.release(ObjectFifoPort.Consume, 1)
+                        yield_([])
+
+                    memC.release(ObjectFifoPort.Produce, 1)  # released only after all K_div_k products are in
+                    yield_([])
+                yield_([])
+
+        # To/from AIE-array data movement
+
+        @FuncOp.from_py_func(
+            T.memref(A_sz_in_i32s, T.i32()),
+            T.memref(B_sz_in_i32s, T.i32()),
+            T.memref(C_sz_in_i32s, T.i32()),
+        )
+        def sequence(A, B, C):  # A, B, C are the flat i32 memrefs declared in the decorator above
+            # Issue at most rows_per_block tile rows of transfers before each npu_sync, so bd_ids 0..10 can be reused by the next block.
+            rows_per_block = 5
+            for tile_row_block in range(
+                (M_div_m + rows_per_block - 1) // rows_per_block  # ceil(M_div_m / rows_per_block) blocks
+            ):
+                C_row_offset_in_i32s = (
+                    tile_row_block * rows_per_block * m * N * word_size_out // 4
+                )
+                num_tile_rows = min(  # the last block may hold fewer than rows_per_block tile rows
+                    [rows_per_block, M_div_m - tile_row_block * rows_per_block]
+                )
+                npu_dma_memcpy_nd(
+                    metadata="outC",
+                    bd_id=0,  # one C transfer per block, always bd_id 0
+                    mem=C,
+                    offsets=[0, 0, 0, C_row_offset_in_i32s],
+                    sizes=[num_tile_rows, N_div_n, m, n_in_i32s_out],
+                    strides=[m_x_N_in_i32s_out, n_in_i32s_out, N_in_i32s_out],
+                )
+                for tile_row in range(num_tile_rows):
+                    A_row_offset_in_i32s = (  # start of this tile row of A, counted in i32s
+                        ((tile_row_block * rows_per_block) + tile_row)
+                        * m
+                        * K
+                        * word_size_in
+                        // 4
+                    )
+                    npu_dma_memcpy_nd(
+                        metadata="inA",
+                        bd_id=2 * tile_row + 1,  # odd ids 1, 3, ..., at most 9
+                        mem=A,
+                        offsets=[0, 0, 0, A_row_offset_in_i32s],
+                        sizes=[N_div_n, K_div_k, m, k_in_i32s],
+                        strides=[0, k_in_i32s, K_in_i32s],  # stride 0 on the N_div_n dimension; presumably re-sends the same A tile row per output column tile - TODO confirm
+                    )
+                    npu_dma_memcpy_nd(
+                        metadata="inB",
+                        bd_id=2 * tile_row + 2,  # even ids 2, 4, ..., at most 10
+                        mem=B,  # no offsets passed here; relies on whatever npu_dma_memcpy_nd defaults to
+                        sizes=[N_div_n, K_div_k, k, n_in_i32s],
+                        strides=[n_in_i32s, k_x_N_in_i32s, N_in_i32s],
+                    )
+
+                npu_sync(column=0, row=0, direction=0, channel=0)  # one sync per block, inside the tile_row_block loop
+
+
+def emit_module(M=64, K=64, N=64):  # Run my_matmul(M, K, N) inside an mlir_mod_ctx() context and return the module as text.
+    with mlir_mod_ctx() as ctx:  # ctx.module is read below, after my_matmul has run inside this context
+        my_matmul(M, K, N)  # result is ignored; presumably the ops land in ctx.module as a side effect - TODO confirm
+        return str(ctx.module)  # str() of the module object; returned from inside the with-block
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir
deleted file mode 100644
index bb2d63105..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_matrix_scalar_add.mlir
+++ /dev/null
@@ -1,40 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in0(%tile_1_0, {%tile_1_2}, 4 : i32) : !aie.objectfifo<memref<128xi32>>
-    aie.objectfifo @out0(%tile_1_2, {%tile_1_0}, 4 : i32) : !aie.objectfifo<memref<128xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @in0(Consume, 1) : !aie.objectfifosubview<memref<128xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<128xi32>> -> memref<128xi32>
-        %2 = aie.objectfifo.acquire @out0(Produce, 1) : !aie.objectfifosubview<memref<128xi32>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<128xi32>> -> memref<128xi32>
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c128 step %c1_1 {
-          %4 = memref.load %1[%arg1] : memref<128xi32>
-          %c1_i32 = arith.constant 1 : i32
-          %5 = arith.addi %4, %c1_i32 : i32
-          memref.store %5, %3[%arg1] : memref<128xi32>
-        }
-        aie.objectfifo.release @in0(Consume, 1)
-        aie.objectfifo.release @out0(Produce, 1)
-      }
-      aie.end
-    }
-    func.func @sequence(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<128xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 8, 16][1, 1, 128]) {id = 0 : i64, metadata = @out0} : memref<128xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 8, 16][1, 1, 128]) {id = 1 : i64, metadata = @in0} : memref<128xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir
deleted file mode 100644
index 0445992fa..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_dmas.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
-    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
-    aie.objectfifo.link [@in] -> [@out]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-      }
-      aie.end
-    }
-    func.func @sequence(%arg0: memref<4096xi32>, %arg1: memref<4096xi32>, %arg2: memref<4096xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<4096xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 4096][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<4096xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir
deleted file mode 100644
index 6af63f1a7..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_passthrough_kernel.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @passThroughLine(memref<128xui8>, memref<128xui8>, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<128xui8>>
-    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<128xui8>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<128xui8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<128xui8>> -> memref<128xui8>
-        %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<128xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<128xui8>> -> memref<128xui8>
-        %c128_i32 = arith.constant 128 : i32
-        func.call @passThroughLine(%3, %1, %c128_i32) : (memref<128xui8>, memref<128xui8>, i32) -> ()
-        aie.objectfifo.release @in(Consume, 1)
-        aie.objectfifo.release @out(Produce, 1)
-      }
-      aie.end
-    } {link_with = "passThrough.cc.o"}
-    func.func @sequence(%arg0: memref<128xi32>, %arg1: memref<128xi32>, %arg2: memref<128xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 128][0, 0, 0]) {id = 0 : i64, metadata = @in} : memref<128xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 128][0, 0, 0]) {id = 1 : i64, metadata = @out} : memref<128xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir
deleted file mode 100644
index 63e0d47dd..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_exp.mlir
+++ /dev/null
@@ -1,112 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @exp_bf16_1024(memref<1024xbf16>, memref<1024xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<4096xbf16>>
-    aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA2(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA3(%tile_0_1, {%tile_0_5}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA0, @memA1, @memA2, @memA3]()
-    aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC2(%tile_0_4, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC3(%tile_0_5, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<4096xbf16>>
-    aie.objectfifo.link [@memC0, @memC1, @memC2, @memC3] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA0(Consume, 1)
-          aie.objectfifo.release @memC0(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA1(Consume, 1)
-          aie.objectfifo.release @memC1(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC2(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA2(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA2(Consume, 1)
-          aie.objectfifo.release @memC2(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC3(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA3(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @exp_bf16_1024(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA3(Consume, 1)
-          aie.objectfifo.release @memC3(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir
deleted file mode 100644
index 915d6d290..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_add.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @reduce_add_vector(memref<1024xi32>, memref<1xi32>, i32)
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
-    aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<1xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-        %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xi32>> -> memref<1024xi32>
-        %c1024_i32 = arith.constant 1024 : i32
-        func.call @reduce_add_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> ()
-        aie.objectfifo.release @in(Consume, 1)
-        aie.objectfifo.release @out(Produce, 1)
-      }
-      aie.end
-    } {link_with = "reduce_add.cc.o"}
-    func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir
deleted file mode 100644
index bbda32a25..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_max.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @reduce_max_vector(memref<1024xi32>, memref<1xi32>, i32)
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
-    aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<1xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-        %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xi32>> -> memref<1024xi32>
-        %c1024_i32 = arith.constant 1024 : i32
-        func.call @reduce_max_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> ()
-        aie.objectfifo.release @in(Consume, 1)
-        aie.objectfifo.release @out(Produce, 1)
-      }
-      aie.end
-    } {link_with = "reduce_max.cc.o"}
-    func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir
deleted file mode 100644
index 3327ade5c..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_reduce_min.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @reduce_min_vector(memref<1024xi32>, memref<1xi32>, i32)
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<1024xi32>>
-    aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<1xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<1xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-        %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xi32>> -> memref<1024xi32>
-        %c1024_i32 = arith.constant 1024 : i32
-        func.call @reduce_min_vector(%3, %1, %c1024_i32) : (memref<1024xi32>, memref<1xi32>, i32) -> ()
-        aie.objectfifo.release @in(Consume, 1)
-        aie.objectfifo.release @out(Produce, 1)
-      }
-      aie.end
-    } {link_with = "reduce_min.cc.o"}
-    func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1024xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir
deleted file mode 100644
index 739fc6539..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_add.mlir
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in0(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<64xi32>>
-    aie.objectfifo @in1(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<32xi32>>
-    aie.objectfifo.link [@in0] -> [@in1]()
-    aie.objectfifo @out0(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<64xi32>>
-    aie.objectfifo @out1(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<32xi32>>
-    aie.objectfifo.link [@out1] -> [@out0]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview<memref<32xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
-        %2 = aie.objectfifo.acquire @out1(Produce, 1) : !aie.objectfifosubview<memref<32xi32>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32xi32>> -> memref<32xi32>
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %4 = memref.load %1[%arg1] : memref<32xi32>
-          %c1_i32 = arith.constant 1 : i32
-          %5 = arith.addi %4, %c1_i32 : i32
-          memref.store %5, %3[%arg1] : memref<32xi32>
-        }
-        aie.objectfifo.release @in1(Consume, 1)
-        aie.objectfifo.release @out1(Produce, 1)
-      }
-      aie.end
-    }
-    func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 0 : i64, metadata = @out0} : memref<1024xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @in0} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir
deleted file mode 100644
index 04d85bb39..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_scalar_mul.mlir
+++ /dev/null
@@ -1,45 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @vector_scalar_mul_int16_scalar(memref<128xi16>, memref<128xi16>, memref<1xi32>, i32)
-    func.func private @vector_scalar_mul_int16_vector(memref<128xi16>, memref<128xi16>, memref<1xi32>, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<128xi16>>
-    aie.objectfifo @infactor(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1xi32>>
-    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<128xi16>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @infactor(Consume, 1) : !aie.objectfifosubview<memref<1xi32>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1xi32>> -> memref<1xi32>
-        %c0_0 = arith.constant 0 : index
-        %c4 = arith.constant 4 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c4 step %c1_1 {
-          %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<128xi16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<128xi16>> -> memref<128xi16>
-          %4 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<128xi16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<128xi16>> -> memref<128xi16>
-          %c128_i32 = arith.constant 128 : i32
-          func.call @vector_scalar_mul_int16_vector(%5, %3, %1, %c128_i32) : (memref<128xi16>, memref<128xi16>, memref<1xi32>, i32) -> ()
-          aie.objectfifo.release @in(Consume, 1)
-          aie.objectfifo.release @out(Produce, 1)
-        }
-        aie.objectfifo.release @infactor(Consume, 1)
-      }
-      aie.end
-    } {link_with = "scale.o"}
-    func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<1xi32>, %arg2: memref<256xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1][0, 0, 0]) {id = 2 : i64, metadata = @infactor} : memref<1xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir
deleted file mode 100644
index 28728ca2e..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_add.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in1(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @in2(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %c0_2 = arith.constant 0 : index
-          %c16_3 = arith.constant 16 : index
-          %c1_4 = arith.constant 1 : index
-          scf.for %arg2 = %c0_2 to %c16_3 step %c1_4 {
-            %6 = memref.load %1[%arg2] : memref<16xi32>
-            %7 = memref.load %3[%arg2] : memref<16xi32>
-            %8 = arith.addi %6, %7 : i32
-            memref.store %8, %5[%arg2] : memref<16xi32>
-          }
-          aie.objectfifo.release @in1(Consume, 1)
-          aie.objectfifo.release @in2(Consume, 1)
-          aie.objectfifo.release @out(Produce, 1)
-        }
-      }
-      aie.end
-    }
-    func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in1} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<256xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir
deleted file mode 100644
index 571331d5c..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_vector_vector_mul.mlir
+++ /dev/null
@@ -1,50 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_2 = aie.tile(1, 2)
-    aie.objectfifo @in1(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @in2(%tile_1_0, {%tile_1_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    aie.objectfifo @out(%tile_1_2, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c16 = arith.constant 16 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c16 step %c1_1 {
-          %0 = aie.objectfifo.acquire @in1(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %2 = aie.objectfifo.acquire @in2(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %4 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
-          %c0_2 = arith.constant 0 : index
-          %c16_3 = arith.constant 16 : index
-          %c1_4 = arith.constant 1 : index
-          scf.for %arg2 = %c0_2 to %c16_3 step %c1_4 {
-            %6 = memref.load %1[%arg2] : memref<16xi32>
-            %7 = memref.load %3[%arg2] : memref<16xi32>
-            %8 = arith.muli %6, %7 : i32
-            memref.store %8, %5[%arg2] : memref<16xi32>
-          }
-          aie.objectfifo.release @in1(Consume, 1)
-          aie.objectfifo.release @in2(Consume, 1)
-          aie.objectfifo.release @out(Produce, 1)
-        }
-      }
-      aie.end
-    }
-    func.func @sequence(%arg0: memref<256xi32>, %arg1: memref<256xi32>, %arg2: memref<256xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 1 : i64, metadata = @in1} : memref<256xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 256][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<256xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py
new file mode 100644
index 000000000..89c2502aa
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/harness.py
@@ -0,0 +1,304 @@
+import os
+from pathlib import Path
+
+import numpy as np
+import basic_matrix_multiplication_matrix_vector
+import basic_matrix_multiplication_single_core
+
+os.environ["VITIS"] = "/opt/tools/Xilinx/Vitis/2023.2"  # NOTE(review): hard-coded; overrides any VITIS already set. Presumably read by iree.compiler — confirm.
+
+from iree.compiler import compile_file
+
+# When RUN is true the tests also load and execute the xclbin; don't forget LD_LIBRARY_PATH=/opt/xilinx/xrt/lib:/usr/lib/x86_64-linux-gnu
+RUN = True
+if RUN:  # FileLock and XCLBin are only used inside the tests' `if RUN:` branches
+    from filelock import FileLock
+    from xaiepy.xrt import XCLBin
+
+# MLIR scaffold shared by all tests: each test calls TEMPLATE.replace("MODULE", <emit_module text>), filling the "builtin.MODULE" slot.
+TEMPLATE = """
+module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} {
+  hal.executable private @dummy1 {
+    hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) {
+      hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} {
+      ^bb0(%arg0: !hal.device):
+        %x, %y, %z = flow.dispatch.workgroup_count_from_slice
+        hal.return %x, %y, %z : index, index, index
+      }
+      builtin.MODULE
+    }
+  }
+  util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} {
+    // this is all gibberish just to hit serializeExecutable
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %element_type_i8 = hal.element_type<i8> : i32
+    %dense_row_major = hal.encoding_type<dense_row_major> : i32
+    hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major)
+    %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource<external>{%c1}
+    %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource<external>{%c1} => !stream.timepoint
+
+    %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource<external>{%c1}) {
+      stream.cmd.dispatch @dummy1::@amdaie_xclbin_fb::@dummy2 {
+        ro %arg2[%c0 for %c1] : !stream.resource<external>{%c1}
+      }
+    } => !stream.timepoint
+    %3 = stream.timepoint.await %2 => %result : !stream.resource<external>{%c1}
+    %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource<external>{%c1} -> !hal.buffer_view
+    util.return %4 : !hal.buffer_view
+  }
+}
+"""
+
+
+def compile(workdir, test):
+    """Run compile_file on <workdir>/<test>.mlir for amd-aie-direct, passing workdir as the intermediates dump dir."""
+    mlir_path = workdir / (test + ".mlir")
+    dump_flag = f"--iree-hal-dump-executable-intermediates-to={workdir}"
+    compile_file(
+        str(mlir_path),
+        target_backends=["amd-aie-direct"],
+        extra_args=["--compile-mode=hal-executable", dump_flag],
+    )
+
+
+def test_matrix_vector_32_1_core():
+    """Emit, compile and (when RUN) execute the M = K = 32 matrix-vector module; the result must match A @ B."""
+    M = K = 32
+    TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_32_1_core"
+    WORKDIR = Path(__file__).parent.absolute() / TEST
+    WORKDIR.mkdir(parents=True, exist_ok=True)  # idempotent: no exists()/mkdir() race on re-runs
+    with open(WORKDIR / f"{TEST}.mlir", "w") as f:
+        f.write(
+            TEMPLATE.replace(
+                "MODULE",
+                basic_matrix_multiplication_matrix_vector.emit_module(M, K),
+            )
+        )
+
+    NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt"
+    XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin"
+    KERNEL_NAME = "dummy2"
+
+    compile(WORKDIR, TEST)
+
+    with open(NPU_INSTS_FP, "r") as f:
+        npu_insts = [int(line, 16) for line in f if line.strip()]  # one hex word per line; blank lines skipped
+
+    if RUN:
+        with FileLock("/tmp/npu.lock"):  # held for the whole run; presumably serializes device access across processes
+            xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+            views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32)
+
+            xclbin.load_npu_instructions(npu_insts)
+
+            A = np.random.randint(0, 10, (M, K)).astype(np.float32)
+            B = np.random.randint(0, 10, (K,)).astype(np.float32)
+            C = np.zeros((M,)).astype(np.float32)
+
+            wraps = list(map(np.asarray, views))
+            np.copyto(wraps[0], A, casting="no")
+            np.copyto(wraps[1], B, casting="no")
+            np.copyto(wraps[2], C, casting="no")
+
+            xclbin.sync_buffers_to_device()
+            xclbin.run()
+            print("Running kernel")
+            xclbin.wait(30)
+            xclbin.sync_buffers_from_device()
+
+            assert np.allclose(A @ B, wraps[2])
+            print(wraps[2])
+
+
+def test_matrix_vector_64_1_core():
+    """Emit, compile and (when RUN) execute the M = K = 64 matrix-vector module; the result must match A @ B."""
+    M = K = 64
+    TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_64_1_core"
+    WORKDIR = Path(__file__).parent.absolute() / TEST
+    WORKDIR.mkdir(parents=True, exist_ok=True)  # idempotent: no exists()/mkdir() race on re-runs
+    with open(WORKDIR / f"{TEST}.mlir", "w") as f:
+        f.write(
+            TEMPLATE.replace(
+                "MODULE", basic_matrix_multiplication_matrix_vector.emit_module(M, K)
+            )
+        )
+
+    NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt"
+    XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin"
+    KERNEL_NAME = "dummy2"
+
+    compile(WORKDIR, TEST)
+
+    with open(NPU_INSTS_FP, "r") as f:
+        npu_insts = [int(line, 16) for line in f if line.strip()]  # one hex word per line; blank lines skipped
+
+    if RUN:
+        with FileLock("/tmp/npu.lock"):  # held for the whole run; presumably serializes device access across processes
+            xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+            views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32)
+
+            xclbin.load_npu_instructions(npu_insts)
+
+            A = np.random.randint(0, 10, (M, K)).astype(np.float32)
+            B = np.random.randint(0, 10, (K,)).astype(np.float32)
+            C = np.zeros((M,)).astype(np.float32)
+
+            wraps = list(map(np.asarray, views))
+            np.copyto(wraps[0], A, casting="no")
+            np.copyto(wraps[1], B, casting="no")
+            np.copyto(wraps[2], C, casting="no")
+
+            xclbin.sync_buffers_to_device()
+            xclbin.run()
+            print("Running kernel")
+            xclbin.wait(30)
+            xclbin.sync_buffers_from_device()
+
+            assert np.allclose(A @ B, wraps[2])
+            print(wraps[2])
+
+
+def test_matrix_vector_2_cores():
+    """Emit (n_cores=2), compile and (when RUN) execute the M = K = 64 matrix-vector module; result must match A @ B."""
+    M = K = 64
+    TEST = basic_matrix_multiplication_matrix_vector.__name__ + "_64_2_cores"
+    WORKDIR = Path(__file__).parent.absolute() / TEST
+    WORKDIR.mkdir(parents=True, exist_ok=True)  # idempotent: no exists()/mkdir() race on re-runs
+    with open(WORKDIR / f"{TEST}.mlir", "w") as f:
+        f.write(
+            TEMPLATE.replace(
+                "MODULE",
+                basic_matrix_multiplication_matrix_vector.emit_module(M, K, n_cores=2),
+            )
+        )
+
+    NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt"
+    XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin"
+    KERNEL_NAME = "dummy2"
+
+    compile(WORKDIR, TEST)
+
+    with open(NPU_INSTS_FP, "r") as f:
+        npu_insts = [int(line, 16) for line in f if line.strip()]  # one hex word per line; blank lines skipped
+
+    if RUN:
+        with FileLock("/tmp/npu.lock"):  # held for the whole run; presumably serializes device access across processes
+            xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+            views = xclbin.mmap_buffers([(M, K), (K,), (M,)], np.float32)
+
+            xclbin.load_npu_instructions(npu_insts)
+
+            A = np.random.randint(0, 10, (M, K)).astype(np.float32)
+            B = np.random.randint(0, 10, (K,)).astype(np.float32)
+            C = np.zeros((M,)).astype(np.float32)
+
+            wraps = list(map(np.asarray, views))
+            np.copyto(wraps[0], A, casting="no")
+            np.copyto(wraps[1], B, casting="no")
+            np.copyto(wraps[2], C, casting="no")
+
+            xclbin.sync_buffers_to_device()
+            xclbin.run()
+            print("Running kernel")
+            xclbin.wait(30)
+            xclbin.sync_buffers_from_device()
+
+            assert np.allclose(A @ B, wraps[2])
+            print(wraps[2])
+
+
+def test_matmul_32():
+    """Emit, compile and (when RUN) execute the M = K = N = 32 single_core matmul module; result must match A @ B."""
+    M = K = N = 32
+    TEST = basic_matrix_multiplication_single_core.__name__ + "_32"
+    WORKDIR = Path(__file__).parent.absolute() / TEST
+    WORKDIR.mkdir(parents=True, exist_ok=True)  # idempotent: no exists()/mkdir() race on re-runs
+    with open(WORKDIR / f"{TEST}.mlir", "w") as f:
+        f.write(
+            TEMPLATE.replace(
+                "MODULE", basic_matrix_multiplication_single_core.emit_module(M, K, N)
+            )
+        )
+    NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt"
+    XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin"
+    KERNEL_NAME = "dummy2"
+
+    compile(WORKDIR, TEST)
+
+    with open(NPU_INSTS_FP, "r") as f:
+        npu_insts = [int(line, 16) for line in f if line.strip()]  # one hex word per line; blank lines skipped
+
+    if RUN:
+        with FileLock("/tmp/npu.lock"):  # held for the whole run; presumably serializes device access across processes
+            xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+            views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.float32)
+
+            xclbin.load_npu_instructions(npu_insts)
+
+            # Constant inputs on purpose: the upstream example this mirrors is reported not to be correct for arbitrary real values.
+            A = np.ones((M, K)).astype(np.float32)
+            B = 2 * np.ones((K, N)).astype(np.float32)
+            C = np.zeros((M, N)).astype(np.float32)
+
+            wraps = list(map(np.asarray, views))
+            np.copyto(wraps[0], A, casting="no")
+            np.copyto(wraps[1], B, casting="no")
+            np.copyto(wraps[2], C, casting="no")
+
+            xclbin.sync_buffers_to_device()
+            xclbin.run()
+            print("Running kernel")
+            xclbin.wait(30)
+            xclbin.sync_buffers_from_device()
+
+            assert np.allclose(A @ B, wraps[2])
+            print(wraps[2])
+
+
+def test_matmul_64():
+    """Emit, compile and (when RUN) execute the M = K = N = 64 single_core matmul module; result must match A @ B."""
+    M = K = N = 64
+    TEST = basic_matrix_multiplication_single_core.__name__ + "_64"
+    WORKDIR = Path(__file__).parent.absolute() / TEST
+    WORKDIR.mkdir(parents=True, exist_ok=True)  # idempotent: no exists()/mkdir() race on re-runs
+    with open(WORKDIR / f"{TEST}.mlir", "w") as f:
+        f.write(
+            TEMPLATE.replace(
+                "MODULE", basic_matrix_multiplication_single_core.emit_module(M, K, N)
+            )
+        )
+    NPU_INSTS_FP = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.npu.txt"
+    XCLBIN_PATH = f"{WORKDIR}/module_dummy1_amdaie_xclbin_fb/module_dummy1_amdaie_xclbin_fb.xclbin"
+    KERNEL_NAME = "dummy2"
+
+    compile(WORKDIR, TEST)
+
+    with open(NPU_INSTS_FP, "r") as f:
+        npu_insts = [int(line, 16) for line in f if line.strip()]  # one hex word per line; blank lines skipped
+
+    if RUN:
+        with FileLock("/tmp/npu.lock"):  # held for the whole run; presumably serializes device access across processes
+            xclbin = XCLBin(XCLBIN_PATH, KERNEL_NAME)
+            views = xclbin.mmap_buffers([(M, K), (K, N), (M, N)], np.float32)
+
+            xclbin.load_npu_instructions(npu_insts)
+
+            # Constant inputs on purpose: the upstream example this mirrors is reported not to be correct for arbitrary real values.
+            A = np.ones((M, K)).astype(np.float32)
+            B = 2 * np.ones((K, N)).astype(np.float32)
+            C = np.zeros((M, N)).astype(np.float32)
+
+            wraps = list(map(np.asarray, views))
+            np.copyto(wraps[0], A, casting="no")
+            np.copyto(wraps[1], B, casting="no")
+            np.copyto(wraps[2], C, casting="no")
+
+            xclbin.sync_buffers_to_device()
+            xclbin.run()
+            print("Running kernel")
+            xclbin.wait(30)
+            xclbin.sync_buffers_from_device()
+
+            assert np.allclose(A @ B, wraps[2])
+            print(wraps[2])
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir
deleted file mode 100644
index 2dfaba8ba..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_bottleneck.mlir
+++ /dev/null
@@ -1,236 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @conv2dk1_i8(memref<32x1x256xi8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32)
-    func.func private @conv2dk3_ui8(memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32)
-    func.func private @conv2dk1_skip_i8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xi8>, i32, i32, i32, i32, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    %rtpComputeTile2 = aie.buffer(%tile_0_2) {sym_name = "rtpComputeTile2"} : memref<16xi32> 
-    %rtpComputeTile3 = aie.buffer(%tile_0_3) {sym_name = "rtpComputeTile3"} : memref<16xi32> 
-    %rtpComputeTile4 = aie.buffer(%tile_0_4) {sym_name = "rtpComputeTile4"} : memref<16xi32> 
-    %rtpComputeTile5 = aie.buffer(%tile_0_5) {sym_name = "rtpComputeTile5"} : memref<16xi32> 
-    aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_2, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo<memref<32x1x256xi8>>
-    aie.objectfifo @skip_buf(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x256xi8>>
-    aie.objectfifo.link [@inOF_act_L3L2] -> [@skip_buf]()
-    aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_1}, 1 : i32) : !aie.objectfifo<memref<69632xi8>>
-    aie.objectfifo @wts_buf_00(%tile_0_1, {%tile_0_2}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo @wts_buf_01(%tile_0_1, {%tile_0_3, %tile_0_5}, 1 : i32) : !aie.objectfifo<memref<36864xi8>>
-    aie.objectfifo @wts_buf_02(%tile_0_1, {%tile_0_4}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo.link [@inOF_wts_0_L3L2] -> [@wts_buf_00, @wts_buf_01, @wts_buf_02]()
-    aie.objectfifo @act_2_3_5(%tile_0_2, {%tile_0_3, %tile_0_5}, [2 : i32, 4 : i32, 4 : i32]) : !aie.objectfifo<memref<32x1x64xui8>>
-    aie.objectfifo @act_3_4(%tile_0_3, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act_5_4(%tile_0_5, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @outOFL2L3(%tile_0_4, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<32x1x256xui8>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_00(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile2[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @inOF_act_L3L2(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xi8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<32x1x256xi8>> -> memref<32x1x256xi8>
-          %5 = aie.objectfifo.acquire @act_2_3_5(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c256_i32 = arith.constant 256 : i32
-          %c64_i32 = arith.constant 64 : i32
-          func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xi8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @inOF_act_L3L2(Consume, 1)
-          aie.objectfifo.release @act_2_3_5(Produce, 1)
-        }
-        aie.objectfifo.release @wts_buf_00(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1.o"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c64_i32_0 = arith.constant 64 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c0_i32_2 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c64_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act_3_4(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act_2_3_5(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c64_i32_14 = arith.constant 64 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c0_i32_18 = arith.constant 0 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c64_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act_2_3_5(Consume, 1)
-          aie.objectfifo.release @act_3_4(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act_3_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c64_i32_7 = arith.constant 64 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c0_i32_11 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c64_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act_2_3_5(Consume, 2)
-        aie.objectfifo.release @act_3_4(Produce, 1)
-        aie.objectfifo.release @wts_buf_01(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c64_i32_0 = arith.constant 64 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c32_i32_2 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c64_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act_5_4(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act_2_3_5(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c64_i32_14 = arith.constant 64 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c32_i32_18 = arith.constant 32 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c64_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act_2_3_5(Consume, 1)
-          aie.objectfifo.release @act_5_4(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act_2_3_5(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act_5_4(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c64_i32_7 = arith.constant 64 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c32_i32_11 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c64_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act_2_3_5(Consume, 2)
-        aie.objectfifo.release @act_5_4(Produce, 1)
-        aie.objectfifo.release @wts_buf_01(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_02(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile4[%c0_0] : memref<16xi32>
-        %c1_1 = arith.constant 1 : index
-        %3 = memref.load %rtpComputeTile4[%c1_1] : memref<16xi32>
-        %c0_2 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_3 = arith.constant 1 : index
-        scf.for %arg1 = %c0_2 to %c32 step %c1_3 {
-          %4 = aie.objectfifo.acquire @act_3_4(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %6 = aie.objectfifo.acquire @act_5_4(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %8 = aie.objectfifo.acquire @skip_buf(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xi8>>
-          %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<32x1x256xi8>> -> memref<32x1x256xi8>
-          %10 = aie.objectfifo.acquire @outOFL2L3(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c256_i32 = arith.constant 256 : i32
-          func.call @conv2dk1_skip_i8(%5, %7, %1, %11, %9, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xi8>, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @outOFL2L3(Produce, 1)
-          aie.objectfifo.release @act_3_4(Consume, 1)
-          aie.objectfifo.release @act_5_4(Consume, 1)
-          aie.objectfifo.release @skip_buf(Consume, 1)
-        }
-        aie.objectfifo.release @wts_buf_02(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_skip.o"}
-    func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<17408xi32>, %arg2: memref<65536xi32>) {
-      aiex.npu.rtp_write(0, 2, 0, 1) {buffer_sym_name = "rtpComputeTile2"}
-      aiex.npu.rtp_write(0, 3, 0, 1) {buffer_sym_name = "rtpComputeTile3"}
-      aiex.npu.rtp_write(0, 5, 0, 1) {buffer_sym_name = "rtpComputeTile5"}
-      aiex.npu.rtp_write(0, 4, 0, 1) {buffer_sym_name = "rtpComputeTile4"}
-      aiex.npu.rtp_write(0, 4, 1, 0) {buffer_sym_name = "rtpComputeTile4"}
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @inOF_wts_0_L3L2} : memref<17408xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir
deleted file mode 100644
index 8eccb2867..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @conv2dk1_i8(memref<2048xi8>, memref<4096xi8>, memref<2048xi8>, i32, i32, i32, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo @act_L2_02(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<2048xi8>>
-    aie.objectfifo.link [@inOF_act_L3L2] -> [@act_L2_02]()
-    aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_2}, 1 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo @out_02_L2(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xi8>>
-    aie.objectfifo @outOFL2L3(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo.link [@out_02_L2] -> [@outOFL2L3]()
-    %rtp2 = aie.buffer(%tile_0_2) {sym_name = "rtp2"} : memref<16xi32> 
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @inOF_wts_0_L3L2(Consume, 1) : !aie.objectfifosubview<memref<4096xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4096xi8>> -> memref<4096xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtp2[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @act_L2_02(Consume, 1) : !aie.objectfifosubview<memref<2048xi8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<2048xi8>> -> memref<2048xi8>
-          %5 = aie.objectfifo.acquire @out_02_L2(Produce, 1) : !aie.objectfifosubview<memref<2048xi8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<2048xi8>> -> memref<2048xi8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c64_i32_3 = arith.constant 64 : i32
-          func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<2048xi8>, memref<4096xi8>, memref<2048xi8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act_L2_02(Consume, 1)
-          aie.objectfifo.release @out_02_L2(Produce, 1)
-        }
-        aie.objectfifo.release @inOF_wts_0_L3L2(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_i8.o"}
-    func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<1024xi32>, %arg2: memref<16384xi32>) {
-      aiex.npu.rtp_write(0, 2, 0, 10) {buffer_sym_name = "rtp2"}
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @inOF_wts_0_L3L2} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir
deleted file mode 100644
index 701a268f0..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_conv2d_fused_relu.mlir
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @conv2dk1_i8(memref<2048xi8>, memref<4096xi8>, memref<2048xui8>, i32, i32, i32, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @inOF_act_L3L2(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo @act_L2_02(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<2048xi8>>
-    aie.objectfifo.link [@inOF_act_L3L2] -> [@act_L2_02]()
-    aie.objectfifo @inOF_wts_0_L3L2(%tile_0_0, {%tile_0_2}, 1 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo @out_02_L2(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xui8>>
-    aie.objectfifo @outOFL2L3(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<4096xui8>>
-    aie.objectfifo.link [@out_02_L2] -> [@outOFL2L3]()
-    %rtp2 = aie.buffer(%tile_0_2) {sym_name = "rtp2"} : memref<16xi32> 
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %0 = aie.objectfifo.acquire @inOF_wts_0_L3L2(Consume, 1) : !aie.objectfifosubview<memref<4096xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4096xi8>> -> memref<4096xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtp2[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @act_L2_02(Consume, 1) : !aie.objectfifosubview<memref<2048xi8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<2048xi8>> -> memref<2048xi8>
-          %5 = aie.objectfifo.acquire @out_02_L2(Produce, 1) : !aie.objectfifosubview<memref<2048xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<2048xui8>> -> memref<2048xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c64_i32_3 = arith.constant 64 : i32
-          func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<2048xi8>, memref<4096xi8>, memref<2048xui8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act_L2_02(Consume, 1)
-          aie.objectfifo.release @out_02_L2(Produce, 1)
-        }
-        aie.objectfifo.release @inOF_wts_0_L3L2(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1.o"}
-    func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<1024xi32>, %arg2: memref<16384xi32>) {
-      aiex.npu.rtp_write(0, 2, 0, 1) {buffer_sym_name = "rtp2"}
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @inOF_act_L3L2} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @inOF_wts_0_L3L2} : memref<1024xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir
deleted file mode 100644
index 4e148353c..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_add.mlir
+++ /dev/null
@@ -1,78 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @eltwise_add_bf16_scalar(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>)
-    func.func private @eltwise_add_bf16_vector(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA0, @memA1]()
-    aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memB0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memB1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inB] -> [@memB0, @memB1]()
-    aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo.link [@memC0, @memC1] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %4 = aie.objectfifo.acquire @memB0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @eltwise_add_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA0(Consume, 1)
-          aie.objectfifo.release @memB0(Consume, 1)
-          aie.objectfifo.release @memC0(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "add.o"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %4 = aie.objectfifo.acquire @memB1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @eltwise_add_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA1(Consume, 1)
-          aie.objectfifo.release @memB1(Consume, 1)
-          aie.objectfifo.release @memC1(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "add.o"}
-    func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>, %arg2: memref<65536xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<65536xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir
deleted file mode 100644
index ba05654a9..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_eltwise_mul.mlir
+++ /dev/null
@@ -1,78 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @eltwise_mul_bf16_scalar(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>)
-    func.func private @eltwise_mul_bf16_vector(memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA0, @memA1]()
-    aie.objectfifo @inB(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memB0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memB1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inB] -> [@memB0, @memB1]()
-    aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo.link [@memC0, @memC1] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %4 = aie.objectfifo.acquire @memB0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @eltwise_mul_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA0(Consume, 1)
-          aie.objectfifo.release @memB0(Consume, 1)
-          aie.objectfifo.release @memC0(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "mul.o"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %4 = aie.objectfifo.acquire @memB1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @eltwise_mul_bf16_vector(%3, %5, %1) : (memref<1024xbf16>, memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA1(Consume, 1)
-          aie.objectfifo.release @memB1(Consume, 1)
-          aie.objectfifo.release @memC1(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "mul.o"}
-    func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>, %arg2: memref<65536xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 2 : i64, metadata = @inB} : memref<65536xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir
deleted file mode 100644
index c4ffce536..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_relu.mlir
+++ /dev/null
@@ -1,66 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @bf16_relu(memref<1024xbf16>, memref<1024xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA0, @memA1]()
-    aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo.link [@memC0, @memC1] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @bf16_relu(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA0(Consume, 1)
-          aie.objectfifo.release @memC0(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "relu.o"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c32 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @bf16_relu(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA1(Consume, 1)
-          aie.objectfifo.release @memC1(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "relu.o"}
-    func.func @sequence(%arg0: memref<65536xi32>, %arg1: memref<65536xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 32768][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<65536xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir
deleted file mode 100644
index b0619f1ee..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_resnet_layers_conv2_x.mlir
+++ /dev/null
@@ -1,664 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @conv2dk1_i8(memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>, i32, i32, i32, i32)
-    func.func private @conv2dk3_ui8(memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32)
-    func.func private @conv2dk1_skip_init_i8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<32768xi8>, memref<32x1x256xui8>, memref<32x1x64xi8>, i32, i32, i32, i32, i32, i32, i32)
-    func.func private @conv2dk1_ui8(memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32)
-    func.func private @conv2dk1_skip_ui8(memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    %tile_1_0 = aie.tile(1, 0)
-    %tile_1_1 = aie.tile(1, 1)
-    %tile_1_2 = aie.tile(1, 2)
-    %tile_1_3 = aie.tile(1, 3)
-    %tile_1_4 = aie.tile(1, 4)
-    %tile_1_5 = aie.tile(1, 5)
-    %tile_2_0 = aie.tile(2, 0)
-    %tile_2_1 = aie.tile(2, 1)
-    %tile_2_2 = aie.tile(2, 2)
-    %tile_2_3 = aie.tile(2, 3)
-    %tile_2_4 = aie.tile(2, 4)
-    %tile_2_5 = aie.tile(2, 5)
-    %rtpComputeTile02 = aie.buffer(%tile_0_2) {sym_name = "rtpComputeTile02"} : memref<16xi32> 
-    %rtpComputeTile03 = aie.buffer(%tile_0_3) {sym_name = "rtpComputeTile03"} : memref<16xi32> 
-    %rtpComputeTile04 = aie.buffer(%tile_0_5) {sym_name = "rtpComputeTile04"} : memref<16xi32> 
-    %rtpComputeTile05 = aie.buffer(%tile_0_4) {sym_name = "rtpComputeTile05"} : memref<16xi32> 
-    %rtpComputeTile12 = aie.buffer(%tile_1_2) {sym_name = "rtpComputeTile12"} : memref<16xi32> 
-    %rtpComputeTile13 = aie.buffer(%tile_1_3) {sym_name = "rtpComputeTile13"} : memref<16xi32> 
-    %rtpComputeTile14 = aie.buffer(%tile_1_4) {sym_name = "rtpComputeTile14"} : memref<16xi32> 
-    %rtpComputeTile15 = aie.buffer(%tile_1_5) {sym_name = "rtpComputeTile15"} : memref<16xi32> 
-    %rtpComputeTile22 = aie.buffer(%tile_2_2) {sym_name = "rtpComputeTile22"} : memref<16xi32> 
-    %rtpComputeTile23 = aie.buffer(%tile_2_3) {sym_name = "rtpComputeTile23"} : memref<16xi32> 
-    %rtpComputeTile24 = aie.buffer(%tile_2_4) {sym_name = "rtpComputeTile24"} : memref<16xi32> 
-    %rtpComputeTile25 = aie.buffer(%tile_2_5) {sym_name = "rtpComputeTile25"} : memref<16xi32> 
-    aie.objectfifo @act1_00_02_01(%tile_0_0, {%tile_0_2, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo<memref<32x1x64xi8>>
-    aie.objectfifo @skip_0(%tile_0_1, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x64xi8>>
-    aie.objectfifo.link [@act1_00_02_01] -> [@skip_0]()
-    aie.objectfifo @act1_04_15_11(%tile_0_4, {%tile_1_5, %tile_0_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo<memref<32x1x256xui8>>
-    aie.objectfifo @skip_1(%tile_0_1, {%tile_1_3}, 2 : i32) : !aie.objectfifo<memref<32x1x256xui8>>
-    aie.objectfifo.link [@act1_04_15_11] -> [@skip_1]()
-    aie.objectfifo @act1_13_22_21(%tile_1_3, {%tile_2_2, %tile_2_1}, [2 : i32, 2 : i32, 4 : i32]) : !aie.objectfifo<memref<32x1x256xui8>>
-    aie.objectfifo @skip_2(%tile_2_1, {%tile_2_4}, 2 : i32) : !aie.objectfifo<memref<32x1x256xui8>>
-    aie.objectfifo.link [@act1_13_22_21] -> [@skip_2]()
-    aie.objectfifo @act2_02_03_05(%tile_0_2, {%tile_0_3, %tile_0_5}, 4 : i32) : !aie.objectfifo<memref<32x1x64xui8>>
-    aie.objectfifo @act3_03_04(%tile_0_3, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act3_05_04(%tile_0_5, {%tile_0_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act2_15_12_14(%tile_1_5, {%tile_1_2, %tile_1_4}, 4 : i32) : !aie.objectfifo<memref<32x1x64xui8>>
-    aie.objectfifo @act3_14_13(%tile_1_4, {%tile_1_3}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act3_12_13(%tile_1_2, {%tile_1_3}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act2_22_23_25(%tile_2_2, {%tile_2_3, %tile_2_5}, 4 : i32) : !aie.objectfifo<memref<32x1x64xui8>>
-    aie.objectfifo @act3_23_24(%tile_2_3, {%tile_2_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @act3_25_24(%tile_2_5, {%tile_2_4}, 2 : i32) : !aie.objectfifo<memref<32x1x32xui8>>
-    aie.objectfifo @wts_0_L3L2(%tile_0_0, {%tile_0_1}, 1 : i32) : !aie.objectfifo<memref<73728xi8>>
-    aie.objectfifo @wts_buf_00(%tile_0_1, {%tile_0_2}, 1 : i32) : !aie.objectfifo<memref<4096xi8>>
-    aie.objectfifo @wts_buf_01(%tile_0_1, {%tile_0_3, %tile_0_5}, 1 : i32) : !aie.objectfifo<memref<36864xi8>>
-    aie.objectfifo @wts_buf_02(%tile_0_1, {%tile_0_4}, 1 : i32) : !aie.objectfifo<memref<32768xi8>>
-    aie.objectfifo.link [@wts_0_L3L2] -> [@wts_buf_00, @wts_buf_01, @wts_buf_02]()
-    aie.objectfifo @wts_1_L3L2(%tile_1_0, {%tile_1_1}, 1 : i32) : !aie.objectfifo<memref<69632xi8>>
-    aie.objectfifo @wts_buf_10(%tile_1_1, {%tile_1_5}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo @wts_buf_11(%tile_1_1, {%tile_1_2, %tile_1_4}, 1 : i32) : !aie.objectfifo<memref<36864xi8>>
-    aie.objectfifo @wts_buf_12(%tile_1_1, {%tile_1_3}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo.link [@wts_1_L3L2] -> [@wts_buf_10, @wts_buf_11, @wts_buf_12]()
-    aie.objectfifo @wts_2_L3L2(%tile_2_0, {%tile_2_1}, 1 : i32) : !aie.objectfifo<memref<69632xi8>>
-    aie.objectfifo @wts_buf_20(%tile_2_1, {%tile_2_2}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo @wts_buf_21(%tile_2_1, {%tile_2_3, %tile_2_5}, 1 : i32) : !aie.objectfifo<memref<36864xi8>>
-    aie.objectfifo @wts_buf_22(%tile_2_1, {%tile_2_4}, 1 : i32) : !aie.objectfifo<memref<16384xi8>>
-    aie.objectfifo.link [@wts_2_L3L2] -> [@wts_buf_20, @wts_buf_21, @wts_buf_22]()
-    aie.objectfifo @outOFL2L3(%tile_2_4, {%tile_1_0}, 2 : i32) : !aie.objectfifo<memref<32x1x256xui8>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_00(Consume, 1) : !aie.objectfifosubview<memref<4096xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<4096xi8>> -> memref<4096xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile02[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @act1_00_02_01(Consume, 1) : !aie.objectfifosubview<memref<32x1x64xi8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<32x1x64xi8>> -> memref<32x1x64xi8>
-          %5 = aie.objectfifo.acquire @act2_02_03_05(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c64_i32_3 = arith.constant 64 : i32
-          func.call @conv2dk1_i8(%4, %1, %6, %c32_i32, %c64_i32, %c64_i32_3, %2) : (memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act1_00_02_01(Consume, 1)
-          aie.objectfifo.release @act2_02_03_05(Produce, 1)
-        }
-        aie.objectfifo.release @wts_buf_00(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_i8.o"}
-    %core_1_5 = aie.core(%tile_1_5) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_10(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile15[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @act1_04_15_11(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %5 = aie.objectfifo.acquire @act2_15_12_14(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c256_i32 = arith.constant 256 : i32
-          %c64_i32 = arith.constant 64 : i32
-          func.call @conv2dk1_ui8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act1_04_15_11(Consume, 1)
-          aie.objectfifo.release @act2_15_12_14(Produce, 1)
-        }
-        aie.objectfifo.release @wts_buf_10(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_ui8.o"}
-    %core_2_2 = aie.core(%tile_2_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_20(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile22[%c0_0] : memref<16xi32>
-        %c0_1 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_2 = arith.constant 1 : index
-        scf.for %arg1 = %c0_1 to %c32 step %c1_2 {
-          %3 = aie.objectfifo.acquire @act1_13_22_21(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %4 = aie.objectfifo.subview.access %3[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %5 = aie.objectfifo.acquire @act2_22_23_25(Produce, 1) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c256_i32 = arith.constant 256 : i32
-          %c64_i32 = arith.constant 64 : i32
-          func.call @conv2dk1_ui8(%4, %1, %6, %c32_i32, %c256_i32, %c64_i32, %2) : (memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act1_13_22_21(Consume, 1)
-          aie.objectfifo.release @act2_22_23_25(Produce, 1)
-        }
-        aie.objectfifo.release @wts_buf_20(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_ui8.o"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c0_i32_2 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_03_04(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_02_03_05(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c0_i32_18 = arith.constant 0 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_02_03_05(Consume, 1)
-          aie.objectfifo.release @act3_03_04(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_03_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c0_i32_11 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_02_03_05(Consume, 2)
-        aie.objectfifo.release @act3_03_04(Produce, 1)
-        aie.objectfifo.release @wts_buf_01(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_1_4 = aie.core(%tile_1_4) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c0_i32_2 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_14_13(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_15_12_14(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c0_i32_18 = arith.constant 0 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_15_12_14(Consume, 1)
-          aie.objectfifo.release @act3_14_13(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_14_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c0_i32_11 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_15_12_14(Consume, 2)
-        aie.objectfifo.release @act3_14_13(Produce, 1)
-        aie.objectfifo.release @wts_buf_11(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_2_3 = aie.core(%tile_2_3) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c0_i32_2 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c0_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_23_24(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_22_23_25(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c0_i32_18 = arith.constant 0 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c0_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_22_23_25(Consume, 1)
-          aie.objectfifo.release @act3_23_24(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_23_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c0_i32_11 = arith.constant 0 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c0_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_22_23_25(Consume, 2)
-        aie.objectfifo.release @act3_23_24(Produce, 1)
-        aie.objectfifo.release @wts_buf_21(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_01(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c32_i32_2 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_05_04(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_02_03_05(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c32_i32_18 = arith.constant 32 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_02_03_05(Consume, 1)
-          aie.objectfifo.release @act3_05_04(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_02_03_05(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_05_04(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c32_i32_11 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_02_03_05(Consume, 2)
-        aie.objectfifo.release @act3_05_04(Produce, 1)
-        aie.objectfifo.release @wts_buf_01(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_1_2 = aie.core(%tile_1_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_11(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c32_i32_2 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_12_13(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_15_12_14(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c32_i32_18 = arith.constant 32 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_15_12_14(Consume, 1)
-          aie.objectfifo.release @act3_12_13(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_15_12_14(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_12_13(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c32_i32_11 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_15_12_14(Consume, 2)
-        aie.objectfifo.release @act3_12_13(Produce, 1)
-        aie.objectfifo.release @wts_buf_11(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_2_5 = aie.core(%tile_2_5) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_21(Consume, 1) : !aie.objectfifosubview<memref<36864xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<36864xi8>> -> memref<36864xi8>
-        %2 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %4 = aie.objectfifo.subview.access %2[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %5 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32 = arith.constant 32 : i32
-        %c64_i32 = arith.constant 64 : i32
-        %c32_i32_0 = arith.constant 32 : i32
-        %c3_i32 = arith.constant 3 : i32
-        %c3_i32_1 = arith.constant 3 : i32
-        %c0_i32 = arith.constant 0 : i32
-        %c11_i32 = arith.constant 11 : i32
-        %c32_i32_2 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%3, %3, %4, %1, %6, %c32_i32, %c64_i32, %c32_i32_0, %c3_i32, %c3_i32_1, %c0_i32, %c11_i32, %c32_i32_2) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act3_25_24(Produce, 1)
-        %c0_3 = arith.constant 0 : index
-        %c30 = arith.constant 30 : index
-        %c1_4 = arith.constant 1 : index
-        scf.for %arg1 = %c0_3 to %c30 step %c1_4 {
-          %12 = aie.objectfifo.acquire @act2_22_23_25(Consume, 3) : !aie.objectfifosubview<memref<32x1x64xui8>>
-          %13 = aie.objectfifo.subview.access %12[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %14 = aie.objectfifo.subview.access %12[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %15 = aie.objectfifo.subview.access %12[2] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-          %16 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %17 = aie.objectfifo.subview.access %16[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %c32_i32_12 = arith.constant 32 : i32
-          %c64_i32_13 = arith.constant 64 : i32
-          %c32_i32_14 = arith.constant 32 : i32
-          %c3_i32_15 = arith.constant 3 : i32
-          %c3_i32_16 = arith.constant 3 : i32
-          %c1_i32 = arith.constant 1 : i32
-          %c11_i32_17 = arith.constant 11 : i32
-          %c32_i32_18 = arith.constant 32 : i32
-          func.call @conv2dk3_ui8(%13, %14, %15, %1, %17, %c32_i32_12, %c64_i32_13, %c32_i32_14, %c3_i32_15, %c3_i32_16, %c1_i32, %c11_i32_17, %c32_i32_18) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act2_22_23_25(Consume, 1)
-          aie.objectfifo.release @act3_25_24(Produce, 1)
-        }
-        %7 = aie.objectfifo.acquire @act2_22_23_25(Consume, 2) : !aie.objectfifosubview<memref<32x1x64xui8>>
-        %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %9 = aie.objectfifo.subview.access %7[1] : !aie.objectfifosubview<memref<32x1x64xui8>> -> memref<32x1x64xui8>
-        %10 = aie.objectfifo.acquire @act3_25_24(Produce, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-        %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-        %c32_i32_5 = arith.constant 32 : i32
-        %c64_i32_6 = arith.constant 64 : i32
-        %c32_i32_7 = arith.constant 32 : i32
-        %c3_i32_8 = arith.constant 3 : i32
-        %c3_i32_9 = arith.constant 3 : i32
-        %c2_i32 = arith.constant 2 : i32
-        %c11_i32_10 = arith.constant 11 : i32
-        %c32_i32_11 = arith.constant 32 : i32
-        func.call @conv2dk3_ui8(%8, %9, %9, %1, %11, %c32_i32_5, %c64_i32_6, %c32_i32_7, %c3_i32_8, %c3_i32_9, %c2_i32, %c11_i32_10, %c32_i32_11) : (memref<32x1x64xui8>, memref<32x1x64xui8>, memref<32x1x64xui8>, memref<36864xi8>, memref<32x1x32xui8>, i32, i32, i32, i32, i32, i32, i32, i32) -> ()
-        aie.objectfifo.release @act2_22_23_25(Consume, 2)
-        aie.objectfifo.release @act3_25_24(Produce, 1)
-        aie.objectfifo.release @wts_buf_21(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk3.o"}
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_02(Consume, 1) : !aie.objectfifosubview<memref<32768xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<32768xi8>> -> memref<32768xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile05[%c0_0] : memref<16xi32>
-        %c1_1 = arith.constant 1 : index
-        %3 = memref.load %rtpComputeTile05[%c1_1] : memref<16xi32>
-        %c2 = arith.constant 2 : index
-        %4 = memref.load %rtpComputeTile05[%c2] : memref<16xi32>
-        %c0_2 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_3 = arith.constant 1 : index
-        scf.for %arg1 = %c0_2 to %c32 step %c1_3 {
-          %5 = aie.objectfifo.acquire @act3_03_04(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %6 = aie.objectfifo.subview.access %5[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %7 = aie.objectfifo.acquire @act3_05_04(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %8 = aie.objectfifo.subview.access %7[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %9 = aie.objectfifo.acquire @act1_04_15_11(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %10 = aie.objectfifo.subview.access %9[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %11 = aie.objectfifo.acquire @skip_0(Consume, 1) : !aie.objectfifosubview<memref<32x1x64xi8>>
-          %12 = aie.objectfifo.subview.access %11[0] : !aie.objectfifosubview<memref<32x1x64xi8>> -> memref<32x1x64xi8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c256_i32 = arith.constant 256 : i32
-          %c64_i32_4 = arith.constant 64 : i32
-          func.call @conv2dk1_skip_init_i8(%6, %8, %1, %10, %12, %c32_i32, %c64_i32, %c256_i32, %c64_i32_4, %2, %3, %4) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<32768xi8>, memref<32x1x256xui8>, memref<32x1x64xi8>, i32, i32, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act3_03_04(Consume, 1)
-          aie.objectfifo.release @act3_05_04(Consume, 1)
-          aie.objectfifo.release @act1_04_15_11(Produce, 1)
-          aie.objectfifo.release @skip_0(Consume, 1)
-        }
-        aie.objectfifo.release @wts_buf_02(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_skip_init.o"}
-    %core_1_3 = aie.core(%tile_1_3) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_12(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile13[%c0_0] : memref<16xi32>
-        %c1_1 = arith.constant 1 : index
-        %3 = memref.load %rtpComputeTile13[%c1_1] : memref<16xi32>
-        %c0_2 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_3 = arith.constant 1 : index
-        scf.for %arg1 = %c0_2 to %c32 step %c1_3 {
-          %4 = aie.objectfifo.acquire @act3_14_13(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %6 = aie.objectfifo.acquire @act3_12_13(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %8 = aie.objectfifo.acquire @act1_13_22_21(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %10 = aie.objectfifo.acquire @skip_1(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c256_i32 = arith.constant 256 : i32
-          func.call @conv2dk1_skip_ui8(%5, %7, %1, %9, %11, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act3_14_13(Consume, 1)
-          aie.objectfifo.release @act3_12_13(Consume, 1)
-          aie.objectfifo.release @act1_13_22_21(Produce, 1)
-          aie.objectfifo.release @skip_1(Consume, 1)
-        }
-        aie.objectfifo.release @wts_buf_12(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_skip.o"}
-    %core_2_4 = aie.core(%tile_2_4) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %0 = aie.objectfifo.acquire @wts_buf_22(Consume, 1) : !aie.objectfifosubview<memref<16384xi8>>
-        %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<16384xi8>> -> memref<16384xi8>
-        %c0_0 = arith.constant 0 : index
-        %2 = memref.load %rtpComputeTile24[%c0_0] : memref<16xi32>
-        %c1_1 = arith.constant 1 : index
-        %3 = memref.load %rtpComputeTile24[%c1_1] : memref<16xi32>
-        %c0_2 = arith.constant 0 : index
-        %c32 = arith.constant 32 : index
-        %c1_3 = arith.constant 1 : index
-        scf.for %arg1 = %c0_2 to %c32 step %c1_3 {
-          %4 = aie.objectfifo.acquire @act3_23_24(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %6 = aie.objectfifo.acquire @act3_25_24(Consume, 1) : !aie.objectfifosubview<memref<32x1x32xui8>>
-          %7 = aie.objectfifo.subview.access %6[0] : !aie.objectfifosubview<memref<32x1x32xui8>> -> memref<32x1x32xui8>
-          %8 = aie.objectfifo.acquire @outOFL2L3(Produce, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %9 = aie.objectfifo.subview.access %8[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %10 = aie.objectfifo.acquire @skip_2(Consume, 1) : !aie.objectfifosubview<memref<32x1x256xui8>>
-          %11 = aie.objectfifo.subview.access %10[0] : !aie.objectfifosubview<memref<32x1x256xui8>> -> memref<32x1x256xui8>
-          %c32_i32 = arith.constant 32 : i32
-          %c64_i32 = arith.constant 64 : i32
-          %c256_i32 = arith.constant 256 : i32
-          func.call @conv2dk1_skip_ui8(%5, %7, %1, %9, %11, %c32_i32, %c64_i32, %c256_i32, %2, %3) : (memref<32x1x32xui8>, memref<32x1x32xui8>, memref<16384xi8>, memref<32x1x256xui8>, memref<32x1x256xui8>, i32, i32, i32, i32, i32) -> ()
-          aie.objectfifo.release @act3_23_24(Consume, 1)
-          aie.objectfifo.release @act3_25_24(Consume, 1)
-          aie.objectfifo.release @outOFL2L3(Produce, 1)
-          aie.objectfifo.release @skip_2(Consume, 1)
-        }
-        aie.objectfifo.release @wts_buf_22(Consume, 1)
-      }
-      aie.end
-    } {link_with = "conv2dk1_skip.o"}
-    func.func @sequence(%arg0: memref<16384xi32>, %arg1: memref<53248xi32>, %arg2: memref<65536xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 16384][0, 0, 0]) {id = 0 : i64, metadata = @act1_00_02_01} : memref<16384xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 65536][0, 0, 0]) {id = 2 : i64, metadata = @outOFL2L3} : memref<65536xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 18432][0, 0, 0]) {id = 1 : i64, metadata = @wts_0_L3L2} : memref<53248xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 18432][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @wts_1_L3L2} : memref<53248xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 35840][1, 1, 1, 17408][0, 0, 0]) {id = 1 : i64, metadata = @wts_2_L3L2} : memref<53248xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 1 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir
deleted file mode 100644
index daa031f2c..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/ml_softmax.mlir
+++ /dev/null
@@ -1,66 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @softmax_bf16_vector(memref<1024xbf16>, memref<1024xbf16>)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_1 = aie.tile(0, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    aie.objectfifo @inA(%tile_0_0, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo @memA0(%tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memA1(%tile_0_1, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo.link [@inA] -> [@memA0, @memA1]()
-    aie.objectfifo @memC0(%tile_0_2, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @memC1(%tile_0_3, {%tile_0_1}, 2 : i32) : !aie.objectfifo<memref<1024xbf16>>
-    aie.objectfifo @outC(%tile_0_1, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<2048xbf16>>
-    aie.objectfifo.link [@memC0, @memC1] -> [@outC]()
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c128 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC0(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA0(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @softmax_bf16_vector(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA0(Consume, 1)
-          aie.objectfifo.release @memC0(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0 = arith.constant 0 : index
-      %c4294967295 = arith.constant 4294967295 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c4294967295 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c128 = arith.constant 128 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c128 step %c1_1 {
-          %0 = aie.objectfifo.acquire @memC1(Produce, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          %2 = aie.objectfifo.acquire @memA1(Consume, 1) : !aie.objectfifosubview<memref<1024xbf16>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<1024xbf16>> -> memref<1024xbf16>
-          func.call @softmax_bf16_vector(%3, %1) : (memref<1024xbf16>, memref<1024xbf16>) -> ()
-          aie.objectfifo.release @memA1(Consume, 1)
-          aie.objectfifo.release @memC1(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "kernels.a"}
-    func.func @sequence(%arg0: memref<262144xi32>, %arg1: memref<262144xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 131072][0, 0, 0]) {id = 0 : i64, metadata = @outC} : memref<262144xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 131072][0, 0, 0]) {id = 1 : i64, metadata = @inA} : memref<262144xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh
new file mode 100755
index 000000000..4536ab1b8
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/reset_npu.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+# Reload the amdxdna kernel driver when an AMD NPU (PCI ID 1022:1502) is present.
+# Requires sudo for the modprobe calls; exits 0 without touching anything when
+# no matching device is found.
+set -eux
+
+# Match on the numeric vendor:device ID instead of grepping the human-readable
+# name: the "[AMD] Device 1502" text only appears while the local pci.ids
+# database has no name for the device, so it stops matching once one is added.
+NPU_ADDR=$(lspci -D -d 1022:1502 | cut -d ' ' -f1)
+
+if [ -n "$NPU_ADDR" ]; then
+  sudo modprobe -r amdxdna
+  sudo modprobe drm_shmem_helper
+  # "dyndbg==pflm": the first '=' assigns the module parameter, the second is
+  # the dynamic-debug operator that sets the flags to exactly p, f, l and m.
+  sudo modprobe amdxdna dyndbg==pflm
+
+#  if [ -f "/opt/xilinx/xrt/test/example_noop_test" ]; then
+#    /opt/xilinx/xrt/test/example_noop_test /lib/firmware/amdipu/1502/validate.xclbin
+#  fi
+else
+  echo "couldn't find npu" >&2
+fi
+
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir
deleted file mode 100644
index 99a7ceec5..000000000
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/vision_vision_passthrough.mlir
+++ /dev/null
@@ -1,39 +0,0 @@
-// RUN: iree-opt --aie-objectFifo-stateful-transform %s
-
-module {
-  aie.device(npu1) {
-    func.func private @passThroughLine(memref<512xui8>, memref<512xui8>, i32)
-    %tile_0_0 = aie.tile(0, 0)
-    %tile_0_2 = aie.tile(0, 2)
-    aie.objectfifo @in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<512xui8>>
-    aie.objectfifo @out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<512xui8>>
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0 = arith.constant 0 : index
-      %c9223372036854775807 = arith.constant 9223372036854775807 : index
-      %c1 = arith.constant 1 : index
-      scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 {
-        %c0_0 = arith.constant 0 : index
-        %c9 = arith.constant 9 : index
-        %c1_1 = arith.constant 1 : index
-        scf.for %arg1 = %c0_0 to %c9 step %c1_1 {
-          %0 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview<memref<512xui8>>
-          %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview<memref<512xui8>> -> memref<512xui8>
-          %2 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview<memref<512xui8>>
-          %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview<memref<512xui8>> -> memref<512xui8>
-          %c512_i32 = arith.constant 512 : i32
-          func.call @passThroughLine(%3, %1, %c512_i32) : (memref<512xui8>, memref<512xui8>, i32) -> ()
-          aie.objectfifo.release @in(Consume, 1)
-          aie.objectfifo.release @out(Produce, 1)
-        }
-      }
-      aie.end
-    } {link_with = "passThrough.cc.o"}
-    func.func @sequence(%arg0: memref<1152xi32>, %arg1: memref<1152xi32>, %arg2: memref<1152xi32>) {
-      aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 1, 1152][0, 0, 0]) {id = 1 : i64, metadata = @in} : memref<1152xi32>
-      aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1152][0, 0, 0]) {id = 0 : i64, metadata = @out} : memref<1152xi32>
-      aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32}
-      return
-    }
-  }
-}
-
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 54e0507e3..1890127c9 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -16,6 +16,7 @@
 #include "iree/compiler/Codegen/Common/Passes.h"
 #include "iree/compiler/Dialect/LinalgExt/Transforms/Passes.h"
 #include "iree/compiler/Utils/PassUtils.h"
+#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h"
 #include "mlir/Dialect/Affine/Passes.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Linalg/Passes.h"
@@ -436,14 +437,19 @@ void buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager) {
 
 void buildAMDAIELowerObjectFIFO(OpPassManager &variantPassManager) {
   OpPassManager &modulePassManager = variantPassManager.nest<ModuleOp>();
+  modulePassManager.addPass(createCanonicalizerPass());
+  modulePassManager.addPass(createConvertLinalgToLoopsPass());
+  modulePassManager.addPass(memref::createFoldMemRefAliasOpsPass());
   modulePassManager.addPass(xilinx::AIE::createAIECanonicalizeDevicePass());
   auto &devicePassMan = modulePassManager.nest<xilinx::AIE::DeviceOp>();
+  devicePassMan.addPass(xilinx::AIE::createAIEAssignLockIDsPass());
   devicePassMan.addPass(
       xilinx::AIE::createAIEObjectFifoStatefulTransformPass());
+  devicePassMan.addPass(xilinx::AIE::createAIEAssignBufferDescriptorIDsPass());
   devicePassMan.addPass(xilinx::AIE::createAIEAssignBufferAddressesBasicPass());
-  devicePassMan.addPass(xilinx::AIE::createAIEAssignLockIDsPass());
-  devicePassMan.addPass(xilinx::AIE::createAIEPathfinderPass());
-  devicePassMan.addPass(xilinx::AIE::createAIELocalizeLocksPass());
+  modulePassManager.addPass(createConvertSCFToCFPass());
+  modulePassManager.addNestedPass<xilinx::AIE::DeviceOp>(
+      xilinx::AIE::createAIEPathfinderPass());
 
   LLVM_DEBUG({
     llvm::dbgs() << "Using AMDAIE pass pipeline:\n";
diff --git a/tests/matmul/requirements.txt b/tests/matmul/requirements.txt
index 62116914f..f57c361bd 100644
--- a/tests/matmul/requirements.txt
+++ b/tests/matmul/requirements.txt
@@ -1,3 +1,8 @@
 PyYAML>=5.4.1
 requests>=2.28.0
-enum_tools==0.6.4
\ No newline at end of file
+enum_tools==0.6.4
+numpy
+-f https://github.com/nod-ai/prototype-aie-toolchain/releases/expanded_assets/release
+xaiepy
+-f https://github.com/Xilinx/mlir-aie/releases/expanded_assets/latest-wheels
+aie-python-bindings-debug
\ No newline at end of file