diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index c64955c3d..18d6b2a49 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -6,15 +6,14 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" +#include + #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/OpDefinition.h" -#define GET_OP_CLASSES -#include "iree-amd-aie/IR/AMDAIEOps.cpp.inc" - namespace mlir::iree_compiler::AMDAIE { void AMDAIEDialect::initializeAMDAIEOps() { @@ -24,6 +23,38 @@ void AMDAIEDialect::initializeAMDAIEOps() { >(); } +//===----------------------------------------------------------------------===// +// custom(type($async_token)) +//===----------------------------------------------------------------------===// + +/// Parses an optional list of async operands. +static ParseResult parseAsyncTokenType(OpAsmParser &parser, Type &resultType) { + if (succeeded(parser.parseOptionalKeyword("async_source"))) { + resultType = parser.getBuilder().getType(); + } else if (succeeded(parser.parseOptionalKeyword("async_target"))) { + resultType = parser.getBuilder().getType(); + } else if (succeeded(parser.parseOptionalKeyword("async"))) { + resultType = parser.getBuilder().getType(); + } + return success(); +} + +/// Prints optional async tokens with its leading keyword. +static void printAsyncTokenType(OpAsmPrinter &p, Operation *op, + Type asyncTokenType) { + if (asyncTokenType) { + if (isa(asyncTokenType)) { + p << "async_source"; + } else if (isa(asyncTokenType)) { + p << "async_target"; + } else if (isa(asyncTokenType)) { + p << "async"; + } else { + assert(false && "unsupported async token type"); + } + } +} + //===----------------------------------------------------------------------===// // AMDAIE_BdIdOp //===----------------------------------------------------------------------===// @@ -1026,6 +1057,97 @@ void NpuDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, context); } +//===----------------------------------------------------------------------===// +// AMDAIE_NpuHalfDmaCpyNdOp +//===----------------------------------------------------------------------===// + +// Build a NpuHalfDmaCpyNdOp with mixed static and dynamic entries. +void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, + TypeRange resultTypes, Value connection, + Value input, ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides, Value bdId, + Value channel) { + SmallVector staticOffsets, staticSizes, staticStrides; + SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + build(b, result, resultTypes, connection, input, dynamicOffsets, dynamicSizes, + dynamicStrides, staticOffsets, staticSizes, staticStrides, bdId, + channel); +} + +// Build a NpuHalfDmaCpyNdOp with static entries. 
+void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
+                              TypeRange resultTypes, Value connection,
+                              Value input, ArrayRef<int64_t> offsets,
+                              ArrayRef<int64_t> sizes,
+                              ArrayRef<int64_t> strides, mlir::Value bdId,
+                              Value channel) {
+  SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(llvm::map_range(
+      offsets,
+      [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
+  SmallVector<OpFoldResult> sizeValues =
+      llvm::to_vector<4>(llvm::map_range(sizes, [&](int64_t v) -> OpFoldResult {
+        return b.getI64IntegerAttr(v);
+      }));
+  SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(llvm::map_range(
+      strides,
+      [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
+  build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
+        strideValues, bdId, channel);
+}
+
+// Build a NpuHalfDmaCpyNdOp with dynamic entries.
+void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
+                              TypeRange resultTypes, Value connection,
+                              Value input, ValueRange offsets, ValueRange sizes,
+                              ValueRange strides, mlir::Value bdId,
+                              Value channel) {
+  SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
+      llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
+      llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
+      llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
+  build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
+        strideValues, bdId, channel);
+}
+
+std::optional<int64_t> NpuHalfDmaCpyNdOp::getStaticBaseOffset() {
+  int64_t baseOffset = 0;
+  SmallVector<OpFoldResult> offsets = getMixedOffsets();
+  SmallVector<OpFoldResult> strides = getMixedStrides();
+  for (auto &&[offset, stride] : llvm::zip(offsets, strides)) {
+    std::optional<int64_t> constantOffset = getConstantIntValue(offset);
+    std::optional<int64_t> constantStride = getConstantIntValue(stride);
+    // If offset is zero, we can just continue to the next one. This enables
+    // the case where the stride is dynamic.
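+    // For example (hypothetical values), offsets [0, 2] with strides [%s, 8]
+    // give a static base offset of 2 * 8 = 16: the zero offset is skipped
+    // even though its stride %s is dynamic. A non-zero offset paired with a
+    // dynamic stride (or a dynamic offset) makes the base offset
+    // non-computable, in which case std::nullopt is returned.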
+ if (constantOffset && constantOffset.value() == 0) continue; + if (constantOffset && constantStride) { + baseOffset += (constantOffset.value() * constantStride.value()); + } else { + return std::nullopt; + } + } + return baseOffset; +} + +std::optional NpuHalfDmaCpyNdOp::getAccessStaticSize() { + SmallVector sizes = getMixedSizes(); + if (sizes.size() == 0) return 0; + std::optional> staticSizes = getConstantIntValues(sizes); + if (!staticSizes) return std::nullopt; + return std::accumulate(staticSizes->begin(), staticSizes->end(), 1, + std::multiplies<>()); +} + +bool NpuHalfDmaCpyNdOp::hasDmaWaitOpUser() { + return llvm::any_of((*this)->getUsers(), + [](auto userOp) { return isa(userOp); }); +} + //===----------------------------------------------------------------------===// // AMDAIE_NpuCircularDmaCpyNdOp //===----------------------------------------------------------------------===// @@ -1254,3 +1376,10 @@ LogicalResult WorkgroupOp::verify() { return success(); } } // namespace mlir::iree_compiler::AMDAIE + +//===----------------------------------------------------------------------===// +// TableGen definitions (intentionally last) +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "iree-amd-aie/IR/AMDAIEOps.cpp.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 087dac7b4..3a4df8459 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -11,6 +11,7 @@ include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" +include "mlir/IR/BuiltinAttributes.td" include "mlir/IR/OpAsmInterface.td" include "mlir/IR/OpBase.td" include "mlir/IR/SymbolInterfaces.td" @@ -177,6 +178,9 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup", }]; let regions = (region SizedRegion<1>:$region); + let arguments = ( + ins OptionalAttr:$npu_instructions + ); let assemblyFormat = [{ regions attr-dict }]; @@ -191,7 +195,7 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup", let extraClassDeclaration = [{ // Return the control code op within this workgroup. ControlCodeOp getControlCode() { - return dyn_cast(getBody()->getTerminator()); + return cast(getBody()->getTerminator()); } // Make sure the WorkgroupOp region is well-formed with a ControlCodeOp // terminator. 
@@ -355,14 +359,15 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ ```mlir %tile = amdaie.tile(%c0, %c0) - %channel = amdaie.channel(%tile, 0, port_type = DMA) + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) ``` }]; let arguments = ( ins Index:$tile, ConfinedAttr]>:$value, - StrmSwPortTypeAttr:$port_type + StrmSwPortTypeAttr:$port_type, + DMAChannelDir:$direction ); let extraClassDeclaration = [{ @@ -370,7 +375,13 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ }]; let assemblyFormat = [{ - `(` $tile `,` $value `,` `port_type` `=` $port_type `)` attr-dict + `(` + $tile `,` + $value `,` + `port_type` `=` $port_type `,` + `direction` `=` $direction + `)` + attr-dict }]; } @@ -378,6 +389,30 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ // IREE AMDAIE Npu Ops //===----------------------------------------------------------------------===// +def AMDAIE_NpuAddressPatchOp: AMDAIE_Op<"npu.address_patch"> { + let summary = "Operation to patch the address inside a buffer descriptor"; + let description = [{ + This NPU controller operation patches the address inside the buffer + descriptor with provided ID on the specified column. This enables codegen to + provide an argument index and offset at compile time, which is then + translated to a physical address at runtime by the firmware. + + Example: + + ```mlir + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, + col = 0 : ui32, offset = 1024 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$bd_id, + UI32Attr:$arg_idx, + UI32Attr:$offset + ); + let assemblyFormat = [{ attr-dict }]; +} + def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ AttrSizedOperandSegments, DoublyStridedOpInterface]> { let summary = "The Npu uController's dma operator"; @@ -556,6 +591,151 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ let hasCanonicalizer = 1; } +def AMDAIE_NpuHalfDmaCpyNdOp + : AMDAIE_Op<"npu.half_dma_cpy_nd", [AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { + let summary = "The NPU uController's DMA operation, operating on a single port"; + let description = [{ + The NPU DMA operation represents a strided DMA operation with an unlimited + number of dimensions, executed by the NPU uController. This operation refers + to a `connection` and `input` logical objectFifo being operated on, as well + as an optionally specified BD ID and `channel` (DMA port). The `connection` + operand provides information on how to use the connection, for example + whether a packet header is needed. + + The representation supports a partially-static representation for the + `offsets`, `sizes` and `strides`. A special sentinel value + ShapedType::kDynamic encodes that the corresponding entry has a dynamic + value. + + Example: + + ```mlir + %2 = amdaie.connection(%1, %0) + : (!amdaie.logicalobjectfifo>, + !amdaie.logicalobjectfifo>) + %bd_id = amdaie.bd_id(%tile_0_0, 0) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + ... + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} + : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1] + bd_id = %bd_id channel = %channel) + ... 
+ } + ``` + }]; + + let arguments = ( + ins Index:$connection, + AnyAMDAIELogicalObjectFifoType:$input, + Variadic:$offsets, + Variadic:$sizes, + Variadic:$strides, + DenseI64ArrayAttr:$static_offsets, + DenseI64ArrayAttr:$static_sizes, + DenseI64ArrayAttr:$static_strides, + Optional:$bd_id, + Optional:$channel + ); + + let results = (outs Optional:$async_token); + + let assemblyFormat = [{ + custom(type($async_token)) + $connection + `(` + $input + custom($offsets, $static_offsets) + custom($sizes, $static_sizes) + custom($strides, $static_strides) + (`bd_id` `=` $bd_id^)? + (`channel` `=` $channel^)? + `)` + attr-dict + `:` type($input) + }]; + + let builders = [ + // Build a NpuHalfDmaCpyNdOp with mixed static and dynamic entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$input, "ArrayRef":$offsets, + "ArrayRef":$sizes, "ArrayRef":$strides, + "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + // Build a NpuHalfDmaCpyNdOp with static entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$target, "ArrayRef":$offsets, + "ArrayRef":$sizes, "ArrayRef":$strides, + "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + // Build a NpuHalfDmaCpyNdOp with dynamic entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, + "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)> + ]; + + let extraClassDeclaration = [{ + /// Return the number of leading operands before the `offsets`, `sizes` and + /// and `strides` operands. + static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; } + + /// Return the expected rank of each of the`static_offsets`, `static_sizes` + /// and `static_strides` attributes. + std::array getArrayAttrMaxRanks() { + unsigned rank = getMixedOffsets().size(); + return {rank, rank, rank}; + } + + std::optional getBdIdOp() { + return dyn_cast_if_present(getBdId().getDefiningOp()); + } + + // Return the input `amdaie.connection` operation. + std::optional getConnectionOp() { + return dyn_cast_if_present(getConnection().getDefiningOp()); + } + + std::optional getChannelOp() { + return dyn_cast_if_present(getChannel().getDefiningOp()); + } + + // Return the source memref type. This is retrieved using information from + // the input DMA operation. + MemRefType getMemrefType() { + return cast(getInput().getType()) + .getElementType(); + } + + // Return the source memory space as an attribute. + Attribute getMemorySpace() { + return cast(getInput().getType()) + .getMemorySpace(); + } + + // Helper method to return the memory space as an integer. If no memory + // space attribute, this indicates a global memory space and 0 is returned. + // Else cast the memory space attribute to an integer. + uint8_t getMemorySpaceAsUInt() { + Attribute memSpace = getMemorySpace(); + return memSpace ? cast(memSpace).getInt() : 0; + } + + // Compute and return the constant base offset if possible. + std::optional getStaticBaseOffset(); + + // Compute and return the size of the DMA access if possible. + std::optional getAccessStaticSize(); + + // Check whether this dma operation has a wait user. + bool hasDmaWaitOpUser(); + + // Check whether this operation has addressing. 
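+    // I.e. whether any offsets, sizes or strides are specified; an op of the
+    // form `(%input [] [] [])` has no addressing.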
+ bool hasAddressing() { + return !getMixedOffsets().empty() || !getMixedSizes().empty() + || !getMixedStrides().empty(); + } + }]; +} + def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [ AMDAIE_CircularDmaOp, AttrSizedOperandSegments, DoublyStridedOpInterface]>, Results<(outs Index)> { @@ -761,6 +941,85 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> { }]; } +def AMDAIE_NpuPushToQueueOp: AMDAIE_Op<"npu.push_to_queue">, + Results<(outs Optional:$async_token)> { + let summary = "Push the provided BD to the specified channel's queue."; + let description = [{ + This NPU controller operation to push a buffer descriptor with specified + `bd_id` to the queue of the (`channel`, `direction`) DMA port on tile + (`col`, `row`). The BD will be repeated for a `repeat_count` number of + times. + + Example: + + ```mlir + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, + col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, + row = 0 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + DMAChannelDir:$direction, + UI32Attr:$channel, + UI32Attr:$repeat_count, + UI32Attr:$bd_id + ); + let assemblyFormat = [{ + custom(type($async_token)) attr-dict + }]; +} + +def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> { + let summary = "Initialize the buffer descriptor with specified ID"; + let description = [{ + This NPU controller operation to initialize the `bd_id` buffer descriptor on + the (`col`, `row`) tile with the provided configurations. + + Example: + + ```mlir + amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, + buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, + iteration_current = 0 : ui32, iteration_size = 0 : ui32, + iteration_stride = 0 : ui32, lock_acq_enable = false, + lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, + lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, + packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, + paddings_before = array, row = 0 : ui32, sizes = array, + strides = array, use_next_bd = false, valid_bd = true} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + UI32Attr:$bd_id, + UI32Attr:$buffer_length, + UI32Attr:$buffer_offset, + DenseI32ArrayAttr:$sizes, + DenseI32ArrayAttr:$strides, + DenseI32ArrayAttr:$paddings_before, + DenseI32ArrayAttr:$paddings_after, + UI32Attr:$iteration_current, + UI32Attr:$iteration_size, + UI32Attr:$iteration_stride, + BoolAttr:$enable_packet, + UI32Attr:$packet_id, + UI32Attr:$packet_type, + UI32Attr:$out_of_order_id, + BoolAttr:$use_next_bd, + UI32Attr:$next_bd, + BoolAttr:$valid_bd, + BoolAttr:$lock_acq_enable, + I32Attr:$lock_rel_val, + UI32Attr:$lock_rel_id, + I32Attr:$lock_acq_val, + UI32Attr:$lock_acq_id + ); + let assemblyFormat = [{ attr-dict }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 53831b2f0..bb31f7ec9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -153,8 +153,8 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.flow({%[[CHANNEL]]} -> 
{%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 1 : ui8} func.func @flow() { @@ -162,8 +162,8 @@ func.func @flow() { %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true, packet_id = 1 : ui8} return @@ -391,6 +391,65 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM) +// CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection +func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0_0, 0) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_patch_address +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +func.func @npu_patch_address() { + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_push_to_queue +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: %{{.+}} = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} +func.func @npu_push_to_queue() { + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : 
i32, repeat_count = 1 : ui32, row = 0 : ui32} + %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_write_bd +// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +func.func @npu_write_bd() { + amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + return +} + +// ----- + // CHECK-LABEL: func.func @workgroup // CHECK: amdaie.workgroup // CHECK: amdaie.core diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index e902c7e2a..429089cdd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. 
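// The pipeline below includes the iree-amdaie-controlcode-lowering and
// iree-amdaie-controlcode-to-transaction passes, so the checked runtime
// sequence carries an `npu_instructions` attribute instead of individual
// aiex.npu.dma_memcpy_nd / aiex.npu.dma_wait ops.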
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s @@ -19,9 +19,8 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK: aie.use_lock // Check a bit of the aiex.runtime_sequence: -// CHECK: aiex.runtime_sequence @matmul_i32(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd -// CHECK-DAG: aiex.npu.dma_wait +// CHECK: aiex.runtime_sequence @matmul_i32() +// CHECK: } {npu_instructions = dense_resource : tensor<174xui32>, runtime_sequence_name = "matmul_i32"} #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp index d0440336b..1959c7a9a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp @@ -45,14 +45,16 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) { for (Value tile : sourceLogicalObjFifo.getTiles()) { uint8_t channel = generator.getProducerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA, + AMDAIE::DMAChannelDir::MM2S); 
sourceChannels.push_back(channelOp.getResult()); } SmallVector targetChannels; for (Value tile : targetLogicalObjFifo.getTiles()) { uint8_t channel = generator.getConsumerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA, + AMDAIE::DMAChannelDir::S2MM); targetChannels.push_back(channelOp.getResult()); } rewriter.replaceOpWithNewOp( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp new file mode 100644 index 000000000..2348478d9 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -0,0 +1,342 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-controlcode-lowering" + +namespace mlir::iree_compiler::AMDAIE { + +struct DmaCpyNdToHalfDmaCpyNdConverter final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + AMDAIE::NpuDmaCpyNdOp dmaOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaCpyNdOp]\n"); + AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp(); + if (!connectionOp) { + return dmaOp.emitOpError() + << "should operate on an `amdaie.connection` op"; + } + // Convert source half. + Value source = + dmaOp.getSource() ? dmaOp.getSource() : connectionOp.getSource(); + if (connectionOp.getSourceChannels().size() != 1) + return connectionOp.emitOpError() << "expected a single source channel"; + auto sourceChannelOp = dyn_cast( + connectionOp.getSourceChannels()[0].getDefiningOp()); + bool hasAsyncSourceToken = + llvm::any_of(dmaOp.getAsyncTokens(), [](Value token) { + return isa(token.getType()); + }); + SmallVector resultTypes = { + rewriter.getType()}; + TypeRange sourceResultTypes = + hasAsyncSourceToken ? TypeRange{resultTypes} : TypeRange{}; + rewriter.setInsertionPoint(dmaOp); + auto sourceDma = rewriter.create( + dmaOp.getLoc(), sourceResultTypes, connectionOp, source, + dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(), + dmaOp.getSourceMixedStrides(), dmaOp.getSourceBdId(), sourceChannelOp); + + // Convert target half. + Value target = + dmaOp.getTarget() ? dmaOp.getTarget() : connectionOp.getTarget(); + if (connectionOp.getTargetChannels().size() != 1) + return connectionOp.emitOpError() << "expected a single target channel"; + auto targetChannelOp = dyn_cast( + connectionOp.getTargetChannels()[0].getDefiningOp()); + bool hasAsyncTargetToken = + llvm::any_of(dmaOp.getAsyncTokens(), [](Value token) { + return isa(token.getType()); + }); + TypeRange targetResultTypes = + hasAsyncTargetToken ? 
TypeRange{resultTypes} : TypeRange{}; + auto targetDma = rewriter.create( + dmaOp.getLoc(), targetResultTypes, connectionOp, target, + dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(), + dmaOp.getTargetMixedStrides(), dmaOp.getTargetBdId(), targetChannelOp); + if (dmaOp.getNumResults() == 1) { + if (sourceDma.getNumResults() == 1) { + rewriter.replaceUsesWithIf( + dmaOp.getResult(0), sourceDma.getResult(0), [&](OpOperand &use) { + return isa(use.get().getType()) && + isa(use.getOwner()); + }); + } + if (targetDma.getNumResults() == 1) { + rewriter.replaceUsesWithIf( + dmaOp.getResult(0), targetDma.getResult(0), [&](OpOperand &use) { + return isa(use.get().getType()) && + isa(use.getOwner()); + }); + } + if (!dmaOp.getResult(0).use_empty()) + return dmaOp.emitOpError() << "should not have any uses anymore"; + } + rewriter.eraseOp(dmaOp); + return success(); + } +}; + +struct HalfDmaCpyNdToNpuConverter final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + HalfDmaCpyNdToNpuConverter(MLIRContext *context, + const AMDAIE::AMDAIEDeviceModel &deviceModel) + : OpConversionPattern(context), deviceModel(std::move(deviceModel)) { + minStrideBitWidth = deviceModel.getMinStrideBitWidth(); + } + + /// Insert ops to write a BD, patch the address and push it to the queue. This + /// is specific to Shim BDs for now. + FailureOr insertWriteBdOps( + AMDAIE::NpuHalfDmaCpyNdOp op, ConversionPatternRewriter &rewriter, + AMDAIE::AMDAIETileType tileType, + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjFifo, + AMDAIE::BdIdOp bdIdOp, AMDAIE::ChannelOp channelOp, int64_t bufferLength, + int64_t bufferOffset, int32_t enablePacket, int32_t packetId, + int32_t packetType, ArrayRef sizes, + ArrayRef strides) const { + uint8_t numIntraAddrDim = deviceModel.getDmaProp( + tileType, AMDAIE::AMDAIEDmaProp::NumAddrDim); + uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; + auto subspanOp = dyn_cast_if_present( + logicalObjFifo.getMemref().getDefiningOp()); + if (!subspanOp) { + return logicalObjFifo.emitOpError() + << "must operate on an `hal.interface.binding.subspan`"; + } + int64_t argIdx = subspanOp.getBinding().getZExtValue(); + MemRefType memrefType = logicalObjFifo.getMemrefType(); + int64_t elemWidthInBits = memrefType.getElementTypeBitWidth(); + std::optional maybeDmaDirection = + channelOp.getDirection(); + if (!maybeDmaDirection) { + return channelOp.emitOpError() + << "direction needed for lowering of NPU ops"; + } + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + int64_t col = getConstantIndexOrAssert(tileOp.getCol()); + int64_t row = getConstantIndexOrAssert(tileOp.getRow()); + int32_t bdId = bdIdOp.getValue(); + int32_t outOfOrderId{0}; + + SmallVector staticSizes; + SmallVector staticStrides; + // Padding is unused for now. + SmallVector paddingsBefore; + SmallVector paddingsAfter; + int32_t iterationCurrent{0}; + int32_t iterationSize{0}; + int32_t iterationStride{0}; + int32_t repeatCount{1}; + for (auto iter : llvm::enumerate(llvm::zip(sizes, strides))) { + int64_t size = getConstantIndexOrAssert(std::get<0>(iter.value())); + int64_t stride = getConstantIndexOrAssert(std::get<1>(iter.value())); + + /// Map the outer dimension to the iteration dimension if intra dimensions + /// are all used already or if the first stride == 0 as only the iteration + /// dimension supports stride == 0. 
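+      /// For example (hypothetical values), sizes [2, 32, 64] with strides
+      /// [0, 64, 1] put the outer dimension on the repeat count
+      /// (repeat_count = 2) and leave [32, 64] to the intra-BD dimensions
+      /// handled in the else-branch below.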
+ if (iter.index() == 0 && (sizes.size() == numAddrDim || stride == 0)) { + if (stride == 0) { + repeatCount = size; + } else { + iterationStride = + std::max(stride * elemWidthInBits / minStrideBitWidth, + (int64_t)1); + iterationSize = size; + if (stride == 1) + size = (size * elemWidthInBits) / minStrideBitWidth; + repeatCount = iterationSize; + } + } else { + staticStrides.push_back( + std::max(stride * elemWidthInBits / minStrideBitWidth, + (int64_t)1)); + // Innermost size needs to account for addressing granularity. + if (iter.index() == (sizes.size() - 1)) { + staticSizes.push_back(size * elemWidthInBits / + minStrideBitWidth); + } else { + staticSizes.push_back(size); + } + } + } + // Make sure sizes/strides have the correct size based on the number from + // intra addressing dimensions. + staticSizes.insert(staticSizes.begin(), + numIntraAddrDim - staticSizes.size(), 0); + staticStrides.insert(staticStrides.begin(), + numIntraAddrDim - staticStrides.size(), 0); + + bool useNextBd{false}; + int32_t nextBd{0}; + bool validBd{true}; + int32_t lockRelVal{0}; + int32_t lockRelId{0}; + bool lockAcqEnable{false}; + int32_t lockAcqVal{0}; + int32_t lockAcqId{0}; + + uint32_t bufferLengthInWords = + bufferLength * elemWidthInBits / minStrideBitWidth; + uint32_t innerBufferLength = bufferLengthInWords / repeatCount; + uint32_t bufferOffsetInBytes = bufferOffset * elemWidthInBits / 8; + + // Offset set to zero for shim as the offset is embedded in the address + // patch. + rewriter.create( + op.getLoc(), col, row, bdId, innerBufferLength, 0, staticSizes, + staticStrides, paddingsBefore, paddingsAfter, iterationCurrent, + iterationSize, iterationStride, enablePacket, packetId, packetType, + outOfOrderId, useNextBd, nextBd, validBd, lockAcqEnable, lockRelVal, + lockRelId, lockAcqVal, lockAcqId); + rewriter.create(op.getLoc(), col, bdId, argIdx, + bufferOffsetInBytes); + SmallVector resultTypes = { + rewriter.getType()}; + TypeRange resultTypeRange = + op.getAsyncToken() ? TypeRange{resultTypes} : TypeRange{}; + auto npuPushToQueueOp = rewriter.create( + op.getLoc(), resultTypeRange, col, row, maybeDmaDirection.value(), + channelOp.getValue(), repeatCount, bdId); + return npuPushToQueueOp; + } + + LogicalResult matchAndRewrite( + AMDAIE::NpuHalfDmaCpyNdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuHalfDmaCpyNdOp]\n"); + // First retrieve the connection and flow ops operated on. + // NOTE(jornt): this will logic will simplify in the future when DMA ops can + // operate directly on `amdaie.flow`. + std::optional maybeConnectionOp = + op.getConnectionOp(); + if (!maybeConnectionOp) { + return op.emitOpError() + << "expected to operate on an `amdaie.connection`"; + } + std::optional maybeFlowOp = maybeConnectionOp->getFlowOp(); + if (!maybeFlowOp) { + return maybeConnectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + } + bool enablePacket = maybeFlowOp->getIsPacketFlow(); + int32_t packetId{0}; + int32_t packetType{0}; + std::optional maybePacketId = maybeFlowOp->getPacketId(); + if (enablePacket) { + if (!maybePacketId) { + return maybeFlowOp->emitOpError() + << "packet flow enabled, but no packet ID is set"; + } + packetId = maybePacketId.value(); + } + // Only support Shim for now. 
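+    // A memory space of 0 denotes the global (shim/L3) side; half-DMA ops on
+    // any other memory space are erased here instead of being lowered.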
+ if (op.getMemorySpaceAsUInt() != 0) { + rewriter.eraseOp(op); + return success(); + } + auto logicalObjFifo = + dyn_cast_if_present( + op.getInput().getDefiningOp()); + if (!logicalObjFifo) { + return op.emitOpError() << "expected input to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } + std::optional maybeBdIdOp = op.getBdIdOp(); + if (!maybeBdIdOp) { + return op.emitOpError() << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + } + std::optional maybeChannelOp = op.getChannelOp(); + if (!maybeChannelOp) + return op.emitOpError() << "found non-`amdaie.channel` channel"; + std::optional maybeSize = op.getAccessStaticSize(); + if (!maybeSize) + return op.emitOpError() << "could not compute a static size"; + std::optional maybeOffset = op.getStaticBaseOffset(); + if (!maybeOffset) + return op.emitOpError() << "could not compute a static source offset"; + SmallVector sizes = op.getMixedSizes(); + SmallVector strides = op.getMixedStrides(); + FailureOr npuPushToQueueOp = insertWriteBdOps( + op, rewriter, AMDAIE::AMDAIETileType::SHIMNOC, logicalObjFifo, + maybeBdIdOp.value(), maybeChannelOp.value(), maybeSize.value(), + maybeOffset.value(), enablePacket, packetId, packetType, sizes, + strides); + if (failed(npuPushToQueueOp)) return failure(); + rewriter.replaceOp(op, *npuPushToQueueOp); + return success(); + } + + private: + const AMDAIE::AMDAIEDeviceModel &deviceModel; + uint8_t minStrideBitWidth; +}; + +namespace { +class AMDAIEControlCodeLoweringPass + : public impl::AMDAIEControlCodeLoweringBase< + AMDAIEControlCodeLoweringPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeLoweringPass::runOnOperation() { + Operation *parentOp = getOperation(); + MLIRContext *context = &getContext(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to lower control code " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget + .addIllegalOp(); + patterns.insert(context); + patterns.insert(context, deviceModel); + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeLoweringPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp new file mode 100644 index 000000000..4ed6d0bb0 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -0,0 +1,358 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/Iterators.h" +#include "mlir/Transforms/DialectConversion.h" + +#define DEBUG_TYPE "iree-amdaie-controlcode-to-transaction" + +#define TXN_OPC_WRITE 0x0 +#define TXN_OPC_BLOCKWRITE 0x1 +#define TXN_OPC_TCT 0x80 +#define TXN_OPC_DDR_PATCH 0x81 + +namespace mlir::iree_compiler::AMDAIE { + +class TransactionBuilder { + public: + AMDAIE::AMDAIEDeviceModel deviceModel; + TransactionBuilder(AMDAIE::AMDAIEDeviceModel deviceModel) + : deviceModel(std::move(deviceModel)) {} + + void clearAndInitialize() { + instructions.clear(); + llvm::MutableArrayRef words = reserveAndGetTail(4); + // setup txn header + words[0] = 0x06030100; + words[1] = 0x00000105; + } + + size_t getInstructionSize() const { return instructions.size(); } + + ArrayRef finalizeAndReturnInstructions() { + finalizeHeader(); + return ArrayRef(instructions.data(), instructions.size()); + } + + void dumpTransactionAsHex() const { + llvm::outs() << "Transaction: \n"; + for (uint32_t word : instructions) { + // Write hex as 0xXXXXXXXX + llvm::outs() << utohexstr(word, 8) << "\n"; + } + } + + LogicalResult appendAddressPatch(uint32_t addr, uint32_t argIdx, + uint32_t offset) { + llvm::MutableArrayRef words = reserveAndGetTail(12); + words[0] = TXN_OPC_DDR_PATCH; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[6] = addr; + words[7] = 0; + words[8] = argIdx; + words[9] = 0; + words[10] = offset; + words[11] = 0; + instructionCounter++; + return success(); + } + + LogicalResult appendTCTSync(uint32_t col, uint32_t row, uint32_t direction, + uint32_t rowNum, uint32_t colNum, + uint32_t channel) { + llvm::MutableArrayRef words = reserveAndGetTail(4); + words[0] = TXN_OPC_TCT; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[2] |= direction & 0xff; + words[2] |= (row & 0xff) << 8; + words[2] |= (col & 0xff) << 16; + + words[3] |= (rowNum & 0xff) << 8; + words[3] |= (colNum & 0xff) << 16; + words[3] |= (channel & 0xff) << 24; + instructionCounter++; + return success(); + } + + LogicalResult appendPushToQueueOp(uint32_t col, uint32_t row, + AMDAIE::DMAChannelDir direction, + uint32_t channel, uint32_t bdId, + uint32_t repeatCount, bool issueToken) { + uint32_t colShift = deviceModel.getColumnShift(); + uint32_t rowShift = deviceModel.getRowShift(); + uint32_t addr = + direction == AMDAIE::DMAChannelDir::MM2S ? 
0x1D214 : 0x1D204; + if (channel == 1) addr += 0x8; + if (col && row) { + addr |= ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | + (addr & 0xFFFFF); + } + uint32_t value = 0; + value |= bdId & 0xF; + value |= (repeatCount & 0xFF) << 16; + if (issueToken) value |= 0x80000000; + return appendWrite32Op(addr, value); + } + + LogicalResult appendWrite32Op(uint32_t addr, uint32_t value) { + llvm::MutableArrayRef words = reserveAndGetTail(6); + // XAIE_IO_WRITE + words[0] = TXN_OPC_WRITE; + words[1] = 0; + words[2] = addr; + words[3] = 0; + words[4] = value; // Value + words[5] = words.size() * sizeof(uint32_t); // Operation Size + instructionCounter++; + return success(); + } + + LogicalResult appendWriteBdOp( + uint32_t bdAddr, uint32_t bufferLength, uint32_t bufferOffset, + bool enablePacket, uint32_t outOfOrderId, uint32_t packetId, + uint32_t packetType, uint32_t d0Size, uint32_t d0Stride, uint32_t d1Size, + uint32_t d1Stride, uint32_t d2Stride, uint32_t iterationCurrent, + uint32_t iterationSize, uint32_t iterationStride, uint32_t nextBd, + bool useNextBd, bool validBd, int32_t lockRelVal, uint32_t lockRelId, + bool lockAcqEnable, int32_t lockAcqVal, uint32_t lockAcqId) { + llvm::MutableArrayRef words = reserveAndGetTail(12); + words[0] = TXN_OPC_BLOCKWRITE; + words[1] = 0; + // RegOff + words[2] = bdAddr; // ADDR + words[3] = words.size() * sizeof(uint32_t); // Operation Size + // DMA_BDX_0 + words[4] = bufferLength; + // DMA_BDX_1 + words[5] = bufferOffset; + // DMA_BDX_2 + // En Packet , OoO BD ID , Packet ID , Packet Type + words[6] |= ((int)enablePacket & 0x1) << 30; + words[6] |= (outOfOrderId & 0x3f) << 24; + words[6] |= (packetId & 0x1f) << 19; + words[6] |= (packetType & 0x7) << 16; + // DMA_BDX_3 + // TODO: Secure Access + words[7] |= (d0Size & 0x3ff) << 20; + words[7] |= d0Stride & 0xfffff; + // DMA_BDX_4 + words[8] = 0x80000000; // burst length; + words[8] |= (d1Size & 0x3ff) << 20; + words[8] |= d1Stride & 0xfffff; + // DMA_BDX_5 + // TODO: SIMID, AxCache, AXQoS + words[9] = d2Stride & 0xfffff; + // DMA_BDX_6 + words[10] |= (iterationCurrent & 0x3f) << 26; + words[10] |= (iterationSize & 0x3f) << 20; + words[10] |= iterationStride & 0xfffff; + // DMA_BDX_7 + // TODO: TLAST Suppress + words[11] |= (nextBd & 0xf) << 27; + words[11] |= ((int)useNextBd & 0x1) << 26; + words[11] |= ((int)validBd & 0x1) << 25; + words[11] |= (lockRelVal & 0xef) << 18; + words[11] |= (lockRelId & 0xf) << 13; + words[11] |= ((int)lockAcqEnable & 0x1) << 12; + words[11] |= (lockAcqVal & 0xef) << 5; + words[11] |= lockAcqId & 0xf; + instructionCounter++; + return success(); + } + + private: + void finalizeHeader() { + // Finalize txn header. 
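+    // Words 0 and 1 of the header are set in clearAndInitialize(); word 2
+    // holds the number of appended operations and word 3 the total
+    // transaction size in bytes, both of which are only known once all
+    // control code ops have been converted.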
+ instructions[2] = instructionCounter; + instructions[3] = instructions.size() * sizeof(uint32_t); + } + + llvm::MutableArrayRef reserveAndGetTail(size_t tailSize) { + auto oldSize = instructions.size(); + auto newSize = oldSize + tailSize; + instructions.resize(newSize, 0); + return llvm::MutableArrayRef(instructions.data() + oldSize, + tailSize); + } + size_t instructionCounter{0}; + std::vector instructions; +}; + +LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, + TransactionBuilder &builder) { + uint32_t col = op.getCol(); + uint32_t bdId = op.getBdId(); + uint32_t colShift = builder.deviceModel.getColumnShift(); + uint32_t addr = (col << colShift) | (0x1D004 + bdId * 0x20); + if (failed(builder.appendAddressPatch(addr, op.getArgIdx(), op.getOffset()))) + return failure(); + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { + for (Value token : op.getAsyncTokens()) { + auto pushToQueueOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!pushToQueueOp) { + return op.emitOpError() + << "should operate on an `amdaie.push_to_queue` op"; + } + if (failed(builder.appendTCTSync( + pushToQueueOp.getCol(), pushToQueueOp.getRow(), + static_cast(pushToQueueOp.getDirection()), 1, 1, + pushToQueueOp.getChannel()))) { + return failure(); + } + } + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuPushToQueueOp op, + TransactionBuilder &builder) { + uint32_t repeatCount = op.getRepeatCount() - 1; + if (failed(builder.appendPushToQueueOp(op.getCol(), op.getRow(), + op.getDirection(), op.getChannel(), + op.getBdId(), repeatCount, true))) { + return failure(); + } + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuWriteBdOp op, TransactionBuilder &builder) { + uint32_t col = op.getCol(); + uint32_t row = op.getRow(); + uint32_t bdId = op.getBdId(); + uint32_t colShift = builder.deviceModel.getColumnShift(); + uint32_t rowShift = builder.deviceModel.getRowShift(); + uint32_t bdAddr = + (col << colShift) | (row << rowShift) | (0x1D000 + bdId * 0x20); + ArrayRef sizes = op.getSizes(); + ArrayRef strides = op.getStrides(); + if (sizes.size() != 3) return op.emitOpError() << "expected 3 sizes"; + if (strides.size() != 3) return op.emitOpError() << "expected 3 strides"; + uint32_t d0Size = sizes[sizes.size() - 1]; + uint32_t d1Size = sizes[sizes.size() - 2]; + // Strides and iteration_size are encoded as `actual - 1`, but `0` should stay + // `0` as it's not supported; + uint32_t d0Stride = + std::max((int64_t)strides[strides.size() - 1] - 1, (int64_t)0); + uint32_t d1Stride = + std::max((int64_t)strides[strides.size() - 2] - 1, (int64_t)0); + uint32_t d2Stride = + std::max((int64_t)strides[strides.size() - 3] - 1, (int64_t)0); + uint32_t iterationSize = + std::max((int64_t)op.getIterationSize() - 1, (int64_t)0); + uint32_t iterationStride = + std::max((int64_t)op.getIterationStride() - 1, (int64_t)0); + if (failed(builder.appendWriteBdOp( + bdAddr, op.getBufferLength(), op.getBufferOffset(), + op.getEnablePacket(), op.getOutOfOrderId(), op.getPacketId(), + op.getPacketType(), d0Size, d0Stride, d1Size, d1Stride, d2Stride, + op.getIterationCurrent(), iterationSize, iterationStride, + op.getNextBd(), op.getUseNextBd(), op.getValidBd(), + op.getLockRelVal(), op.getLockRelId(), op.getLockAcqEnable(), + op.getLockAcqVal(), op.getLockAcqId()))) { + return failure(); + } + return success(); +} + +LogicalResult controlCodeToTransaction(IRRewriter &rewriter, + AMDAIE::ControlCodeOp controlCodeOp, + TransactionBuilder &builder) { + 
SmallVector toBeErased; + WalkResult res = controlCodeOp->walk([&](Operation *op) { + LogicalResult switchResult = + TypeSwitch(op) + .Case( + [&](auto npuOp) { + if (failed(convertOp(npuOp, builder))) return failure(); + toBeErased.push_back(npuOp); + return success(); + }) + .Default([&](Operation *) { return success(); }); + if (failed(switchResult)) return WalkResult::interrupt(); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + for (Operation *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + +namespace { + +class AMDAIEControlCodeToTransactionPass + : public impl::AMDAIEControlCodeToTransactionBase< + AMDAIEControlCodeToTransactionPass> { + public: + AMDAIEControlCodeToTransactionPass( + const AMDAIEControlCodeToTransactionOptions &options) + : AMDAIEControlCodeToTransactionBase(options) {} + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeToTransactionPass::runOnOperation() { + Operation *parentOp = getOperation(); + MLIRContext *context = &getContext(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to lower control code " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + TransactionBuilder transactionBuilder(std::move(deviceModel)); + + IRRewriter rewriter(context); + WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + transactionBuilder.clearAndInitialize(); + if (failed(controlCodeToTransaction(rewriter, workgroupOp.getControlCode(), + transactionBuilder))) { + return WalkResult::interrupt(); + } + LLVM_DEBUG(llvm::dbgs() << "Instruction size: " + << transactionBuilder.getInstructionSize() << "\n"); + ArrayRef instructions = + transactionBuilder.finalizeAndReturnInstructions(); + workgroupOp.setNpuInstructionsAttr(DenseUI32ResourceElementsAttr::get( + RankedTensorType::get( + transactionBuilder.getInstructionSize(), + IntegerType::get(&getContext(), 32, IntegerType::Unsigned)), + "npu_instructions", + HeapAsmResourceBlob::allocateAndCopyInferAlign(instructions))); + if (dumpTransaction) transactionBuilder.dumpTransactionAsHex(); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeToTransactionPass( + AMDAIEControlCodeToTransactionOptions options) { + return std::make_unique(options); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 0edeb3659..c4459f60c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -395,203 +395,6 @@ LogicalResult AIEDeviceBuilder::coreToAIE(AMDAIE::CoreOp coreOp, return success(); } -//===----------------------------------------------------------------------===// -// Convert amdaie.controlcode operation to NPU instruction func -//===----------------------------------------------------------------------===// - -/// Convert 
the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`. -LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE( - AMDAIE::NpuDmaCpyNdOp dmaOp, SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaCpyNdOp]\n"); - AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp(); - - SmallVector offsets, sizes, strides; - ArrayRef staticOffsets, staticSizes, staticStrides; - AMDAIE::BdIdOp bdIdOp; - LogicalObjectFifoFromMemrefOp logicalObjFifo; - SmallVector memOps; - AIE::PacketInfoAttr pktInfoAttr = nullptr; - // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. - if (dmaOp.getSource()) { - offsets = dmaOp.getSourceOffsets(); - sizes = dmaOp.getSourceSizes(); - strides = dmaOp.getSourceStrides(); - staticOffsets = dmaOp.getSourceStaticOffsets(); - staticSizes = dmaOp.getSourceStaticSizes(); - staticStrides = dmaOp.getSourceStaticStrides(); - bdIdOp = dmaOp.getSourceBdIdOp(); - if (!bdIdOp) { - return dmaOp.emitOpError() - << "must have a source BD ID op to lower to the AIE dialect."; - } - logicalObjFifo = dyn_cast_if_present( - dmaOp.getSource().getDefiningOp()); - if (!logicalObjFifo) { - return dmaOp.emitOpError() << "expected source to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - } - memOps = connectionToSourceTargetMemOps[connectionOp].first; - // Set the packet info attribute for MM2S DMAs, operating on a packet flow - // connection. - std::optional maybeFlowOp = connectionOp.getFlowOp(); - if (maybeFlowOp && maybeFlowOp->getPacketId()) { - pktInfoAttr = AIE::PacketInfoAttr::get( - rewriter.getContext(), - /*pkt_type*/ 0, /*pkt_id*/ maybeFlowOp->getPacketId().value()); - } - } else if (dmaOp.getTarget()) { - offsets = dmaOp.getTargetOffsets(); - sizes = dmaOp.getTargetSizes(); - strides = dmaOp.getTargetStrides(); - staticOffsets = dmaOp.getTargetStaticOffsets(); - staticSizes = dmaOp.getTargetStaticSizes(); - staticStrides = dmaOp.getTargetStaticStrides(); - bdIdOp = dmaOp.getTargetBdIdOp(); - if (!bdIdOp) { - return dmaOp.emitOpError() - << "must have a target BD ID op to lower to the AIE dialect."; - } - logicalObjFifo = dyn_cast_if_present( - dmaOp.getTarget().getDefiningOp()); - if (!logicalObjFifo) { - return dmaOp.emitOpError() << "expected target to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - } - memOps = connectionToSourceTargetMemOps[connectionOp].second; - } else { - return dmaOp.emitOpError() - << "has neither source not target memory space as L3."; - } - - Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref()); - - if (memOps.size() != 1) { - return dmaOp.emitOpError() << "only a single connection op source expected"; - } - auto shimDmaAllocOp = dyn_cast(memOps[0]); - if (!shimDmaAllocOp) { - return dmaOp.emitOpError() << "expected the source of the connection to " - "be mapped to a `AIE::ShimDMAAllocationOp`"; - } - - if (!offsets.empty() || !sizes.empty() || !strides.empty()) { - // Not doing now as better to just eliminate use of aiex dialect - // altogether. - return dmaOp.emitError() - << "Expect all source offsets, sizes, and strides to be static at " - "this point. 
Dynamic values can be supported, just need to " - "cast from 'index' to 64-bit signless integer for " - "aiex.npu.dma_memcpy_nd."; - } - - uint32_t bdId = bdIdOp.getValue(); - bool issueToken = dmaOp.hasDmaWaitOpUser(); - - rewriter.setInsertionPoint(dmaOp); - rewriter.create( - dmaOp.getLoc(), SmallVector{}, 0, 0, memref, offsets, sizes, - strides, staticOffsets, staticSizes, staticStrides, pktInfoAttr, - shimDmaAllocOp.getSymName(), bdId, issueToken); - - toBeErased.push_back(dmaOp); - return success(); -} - -/// Convert the `amdaie.npu.dma_wait` operation to `aiex.npu.dma_wait`. -LogicalResult AIEDeviceBuilder::npuDmaWaitToAIE( - AMDAIE::NpuDmaWaitOp waitOp, SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaWaitOp]\n"); - rewriter.setInsertionPoint(waitOp); - for (Value asyncToken : waitOp.getAsyncTokens()) { - auto npuDmaOp = - dyn_cast_if_present(asyncToken.getDefiningOp()); - if (!npuDmaOp) { - return waitOp.emitOpError() - << "should be operating on `amdaie.npu.dma_cpy_nd` for " - "lowering"; - } - AMDAIE::ConnectionOp connectionOp = npuDmaOp.getConnectionOp(); - if (!connectionToSourceTargetMemOps.contains(connectionOp)) { - return connectionOp.emitOpError() << "should be found in the connection " - "to source/target mem ops map"; - } - SmallVector memOps = - isa(asyncToken.getType()) - ? connectionToSourceTargetMemOps[connectionOp].first - : connectionToSourceTargetMemOps[connectionOp].second; - if (memOps.size() != 1) { - return waitOp.emitOpError() - << "only a single connection op source expected"; - } - auto shimDmaAllocOp = dyn_cast(memOps[0]); - if (!shimDmaAllocOp) { - return waitOp.emitOpError() - << "expected the source of the connection to " - "be mapped to a `AIE::ShimDMAAllocationOp`"; - } - rewriter.create(rewriter.getUnknownLoc(), - shimDmaAllocOp.getSymName()); - } - toBeErased.push_back(waitOp); - return success(); -} - -/// Insert the control code operations into the NPU instruction function. -LogicalResult AIEDeviceBuilder::controlCodeToAIE( - AMDAIE::ControlCodeOp controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp funcOp) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n"); - Block *funcBlock = &funcOp.getBody().front(); - rewriter.setInsertionPointToEnd(funcBlock); - auto insertIt = funcBlock->begin(); - auto controlCodeBegin = controlCodeOp.getBody()->begin(); - auto controlCodeEnd = controlCodeOp.getBody()->getTerminator()->getIterator(); - funcBlock->getOperations().splice(insertIt, - controlCodeOp.getBody()->getOperations(), - controlCodeBegin, controlCodeEnd); - - // Keep track of operations to be erased instead of erasing them directly as - // there are bidirectional dependencies between operations. For example, - // `amdaie.npu.dma_cpy_nd` potentially needs information from a sunsequent - // `amdaie.npu.dma_wait` operation user and vice versa. - // TODO(jornt): This is caused by differences between the `AMDAIE` dialect and - // the `AIE` dialect and can be streamlined later by adjusting (both) - // dialects. - SmallVector toBeErased; - WalkResult res = - funcOp->walk([&](Operation *op) { - if (TypeSwitch(op) - .Case([&](auto dmaOp) { - // TODO(jornt): This is temporarily handled already by - // combining with `ConnectionOp` to create `aie.objectfifo` - // until we get rid of those. 
- eraseOp(dmaOp); - return success(); - }) - .Case([&](auto dmaOp) { - return npuDmaCpyNdOpToAIE(dmaOp, toBeErased); - }) - .Case([&](auto waitOp) { - return npuDmaWaitToAIE(waitOp, toBeErased); - }) - .Case([&](auto endOp) { - eraseOp(endOp); - return success(); - }) - .Default([&](Operation *op) { - remapOperands(op); - return success(); - }) - .failed()) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - for (Operation *op : toBeErased) eraseOp(op); - return success(); -} - //===----------------------------------------------------------------------===// // Convert ops in Workgroup to AIE ops //===----------------------------------------------------------------------===// @@ -961,9 +764,8 @@ LogicalResult AIEDeviceBuilder::tileToAIE(AMDAIE::TileOp tileOp, // Convert amdaie.workgroup operation and insert into aie.device //===----------------------------------------------------------------------===// -LogicalResult AIEDeviceBuilder::workgroupToAIE( - AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp, - xilinx::AIEX::RuntimeSequenceOp npuFuncOp) { +LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, + xilinx::AIE::DeviceOp deviceOp) { OpBuilder::InsertionGuard guard(rewriter); Block *deviceBlock = &deviceOp.getRegion().front(); Block *deviceCoreBlock = rewriter.createBlock(&deviceOp.getRegion()); @@ -1003,10 +805,10 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE( return WalkResult::advance(); }) .Case([&](auto controlCodeOp) { - if (failed(controlCodeToAIE(controlCodeOp, npuFuncOp))) { - controlCodeOp.emitError("could not convert to AIEDialect ops"); - return WalkResult::interrupt(); - } + // Skip control code as it should already be translated into firmware + // code at this point. + // TODO(jornt): currently, it still contains ops that are needed in + // this translation, but don't have to be translated themselves. return WalkResult::skip(); }) .Case([&](auto coreOp) { @@ -1100,20 +902,6 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice)); Block *deviceBlock = &deviceOp.getRegion().emplaceBlock(); - - // The amdaie.controlcode operation has no operands, but the - // aiex.runtime_sequence that it lowers to, does. Create the signature - // of the aiex.runtime_sequence operation that replaces the - // amdaie.controlcode. The HAL interface bindings are used to - // order the function parameters correctly. - SmallVector subspanOps; - funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) { - subspanOps.push_back(subspanOp); - }); - llvm::sort(subspanOps, [](IREE::HAL::InterfaceBindingSubspanOp a, - IREE::HAL::InterfaceBindingSubspanOp b) { - return a.getBinding().getZExtValue() < b.getBinding().getZExtValue(); - }); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); // Create aiex.runtime_sequence inside aie.device @@ -1122,11 +910,6 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { Region &body = npuFuncOp.getBody(); body.emplaceBlock(); - for (auto &&a : llvm::enumerate(subspanOps)) { - body.addArgument(a.value().getType(), a.value().getLoc()); - bindingsMapper.map(a.value(), body.getArgument(a.index())); - } - // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
// IRMapping mapper; rewriter.setInsertionPointToStart(deviceBlock); @@ -1134,7 +917,7 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { if (isa(op)) { return WalkResult::advance(); } else if (auto workgroupOp = dyn_cast(op)) { - if (failed(workgroupToAIE(workgroupOp, deviceOp, npuFuncOp))) { + if (failed(workgroupToAIE(workgroupOp, deviceOp))) { return WalkResult::interrupt(); } return WalkResult::skip(); @@ -1147,6 +930,28 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { }); if (res.wasInterrupted()) return WalkResult::interrupt(); + SmallVector workgroupOps; + funcOp->walk([&](AMDAIE::WorkgroupOp op) { workgroupOps.push_back(op); }); + // Only a single workgroup op is supported as only a single `aie.device` is + // created. + if (workgroupOps.size() > 1) { + funcOp.emitOpError() + << "multiple `amdaie.workgroup` ops is not supported"; + return WalkResult::interrupt(); + } + if (workgroupOps.size() == 1) { + AMDAIE::WorkgroupOp workgroupOp = workgroupOps[0]; + mlir::Attribute maybeNpuInstructions = + workgroupOp.getNpuInstructionsAttr(); + // Only add attributes if the instructions attribute is found to + // facilitate simplified tests. + if (maybeNpuInstructions) { + deviceOp->setAttr("npu_instructions", maybeNpuInstructions); + deviceOp->setAttr("runtime_sequence_name", + rewriter.getStringAttr(funcOp.getSymName())); + } + } + // Move NPU instruction function to the end of the device block. rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); // After walking the FuncOp, it has been converted into a DeviceOp and can diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h index 88ec017cd..6f25c3592 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h @@ -46,14 +46,6 @@ class AIEDeviceBuilder { LogicalResult coreToAIE(AMDAIE::CoreOp coreOp, AIE::DeviceOp deviceOp, Block *deviceCoreBlock); - /// Controlcode ops conversion methods. - LogicalResult npuDmaCpyNdOpToAIE(AMDAIE::NpuDmaCpyNdOp dmaOp, - SmallVector &toBeErased); - LogicalResult npuDmaWaitToAIE(AMDAIE::NpuDmaWaitOp waitOp, - SmallVector &toBeErased); - LogicalResult controlCodeToAIE(AMDAIE::ControlCodeOp controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp funcOp); - /// Workgroup ops conversion methods. LogicalResult bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock, int &bufferId); @@ -67,8 +59,7 @@ class AIEDeviceBuilder { Block *deviceBlock); LogicalResult tileToAIE(AMDAIE::TileOp tileOp, Block *deviceBlock); LogicalResult workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, - xilinx::AIE::DeviceOp deviceOp, - xilinx::AIEX::RuntimeSequenceOp npuFuncOp); + xilinx::AIE::DeviceOp deviceOp); /// Utilities @@ -120,8 +111,6 @@ class AIEDeviceBuilder { IRRewriter rewriter; IRMapping mapper; - /// Dedicated mapper for the HAL bindings. - IRMapping bindingsMapper; /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`). /// This is used to look up and add new DMA patterns to those memory ops. 
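The `npu_instructions` transaction attribute produced earlier is handed off here from the single `amdaie.workgroup` op to the `aie.device` op, together with a `runtime_sequence_name` attribute. The patch only shows the producing side; below is a minimal, hypothetical consumer-side sketch of reading the raw instruction words back out, assuming the standard MLIR resource-attribute API (`getAttrOfType`, `tryGetAsArrayRef`). The function name is illustrative and not part of this patch.

#include <cstdint>
#include <optional>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

// Illustrative only: recover the NPU instruction words that the lowering
// attaches to `aie.device` as a dense ui32 resource attribute, plus the
// runtime sequence name that travels alongside them.
static mlir::FailureOr<llvm::SmallVector<uint32_t>> readNpuInstructions(
    mlir::Operation *deviceOp) {
  auto instrAttr =
      deviceOp->getAttrOfType<mlir::DenseUI32ResourceElementsAttr>(
          "npu_instructions");
  if (!instrAttr) return mlir::failure();
  // The underlying resource blob may not be resolvable yet, so guard the access.
  std::optional<llvm::ArrayRef<uint32_t>> words = instrAttr.tryGetAsArrayRef();
  if (!words) return mlir::failure();
  auto seqName =
      deviceOp->getAttrOfType<mlir::StringAttr>("runtime_sequence_name");
  (void)seqName;  // For example, used to name the generated runtime sequence.
  return llvm::SmallVector<uint32_t>(words->begin(), words->end());
}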
  DenseMap<Value, Operation *> tileToMemOpMap;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
index 54495abe4..a26d77e21 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
@@ -7,6 +7,7 @@
 #include "AMDAIEUtils.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Iterators.h"
 
@@ -350,6 +351,15 @@ bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp) {
   return false;
 }
 
+std::string utohexstr(uint32_t value, size_t width, bool header,
+                      bool lowercase) {
+  std::string res = "";
+  if (header) res += "0x";
+  std::string hexStr = llvm::utohexstr(value, lowercase);
+  std::string prefix(width - hexStr.size(), '0');
+  return res + prefix + hexStr;
+}
+
 /// Find the largest factor of 'num' which is not larger than 'max'.
 int detail::findLargestFactor(int num, int max) {
   assert(max > 0 && "No factors less than or equal to 0 exist");
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
index fa37b89dd..d657fff4d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
@@ -94,6 +94,10 @@ bool isMatmulInDefChain(Value operand);
 /// matmul-like op upstream in its computation tree.
 bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp);
 
+/// Utility to convert a `uint32_t` value into a hex string.
+std::string utohexstr(uint32_t value, size_t width, bool header = true,
+                      bool lowercase = false);
+
 namespace detail {
 
 // Returns the largest number that perfectly divides `num` that
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index a467ce00e..0d8c8ce85 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -58,7 +58,9 @@ iree_cc_library(
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEConnectionToFlow.cpp"
     "AMDAIEConvertToDma.cpp"
+    "AMDAIEControlCodeLowering.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
+    "AMDAIEControlCodeToTransaction.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
     "AMDAIECreateAIEWorkgroup.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 4cd5586f0..a58f6a880 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -35,6 +35,8 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
+#define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING
+#define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION
 #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR
 #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP
 #define GEN_PASS_DEF_AMDAIECREATELOGICALOBJECTFIFOLINK
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 4bc7c8bc4..ce858a7b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -621,6 +621,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
   passManager.addPass(createAMDAIEConnectionToFlowPass());
   passManager.addPass(createAMDAIEAssignPacketIdsPass());
 
+  passManager.addPass(createAMDAIEControlCodeLoweringPass());
+  passManager.addPass(createAMDAIEControlCodeToTransactionPass());
+
   addAMDAIEToAIEPasses(passManager);
 
   // Now lower using the AIE passes from MLIR-AIE.
@@ -631,7 +634,6 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) {
   {
     OpPassManager &devicePM = pm.nest<xilinx::AIE::DeviceOp>();
     devicePM.addPass(createCanonicalizerPass());
-    devicePM.addPass(createAMDAIEDmaToNpuPass());
     devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass());
     devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass());
     devicePM.addPass(createAMDAIEPathfinderPass());
@@ -834,12 +836,17 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device,
   passManager.addPass(xilinx::airrt::createAIRRtToNpuPass());
   passManager.addPass(createCanonicalizerPass());
 
+  {
+    // createAMDAIEDmaToNpuPass is only needed for AIR.
+    OpPassManager &devicePM = passManager.nest<xilinx::AIE::DeviceOp>();
+    devicePM.addPass(createCanonicalizerPass());
+    devicePM.addPass(createAMDAIEDmaToNpuPass());
+  }
+
   // Now lower using the AIE passes from MLIR-AIE.
   addMLIRAIELoweringPasses(passManager);
 }
-
-
 
 // NOTE: this runs on the top-level program module containing all hal.executable
 // ops.
 void buildAMDAIELinkingPassPipeline(OpPassManager &passManager) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index df670e19f..ded26bd18 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -111,6 +111,13 @@ std::unique_ptr<Pass> createAMDAIEConnectionToFlowPass();
 
 /// Pass to unroll the loops within the control code regions.
 std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();
 
+/// Pass to convert control code DMA operations into NPU writes and syncs.
+std::unique_ptr<Pass> createAMDAIEControlCodeLoweringPass();
+
+/// Pass to convert control code into a transaction binary.
+std::unique_ptr<Pass> createAMDAIEControlCodeToTransactionPass(
+    AMDAIEControlCodeToTransactionOptions options = {});
+
 /// Pass to convert `scf.forall` to `scf.for` within `aie.core`.
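The `utohexstr` helper added to AMDAIEUtils above renders a 32-bit word as a zero-padded, optionally `0x`-prefixed hex string, which matches the word format checked in the new controlcode_to_transaction.mlir dump test further down (e.g. `0x0001D004`). The following self-contained sketch mirrors that formatting; it assumes the value always fits into `width` hex digits (the helper itself does not guard against wider values), and the function name is illustrative rather than part of this patch.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>

// Zero-pad `value` to `width` hex digits and optionally prepend "0x",
// mirroring the behaviour of the utohexstr helper added in AMDAIEUtils.
std::string toPaddedHex(uint32_t value, size_t width, bool header = true,
                        bool lowercase = false) {
  const char *digits = lowercase ? "0123456789abcdef" : "0123456789ABCDEF";
  std::string hex;
  do {
    hex.insert(hex.begin(), digits[value & 0xF]);
    value >>= 4;
  } while (value != 0);
  assert(hex.size() <= width && "value does not fit in the requested width");
  std::string res = header ? "0x" : "";
  res += std::string(width - hex.size(), '0');
  return res + hex;
}

int main() {
  // Matches the address-patch word appearing in the dumped transaction test.
  std::cout << toPaddedHex(0x1D004u, 8) << "\n";  // prints 0x0001D004
  return 0;
}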
std::unique_ptr createAMDAIEConvertCoreForallToForPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 7c8364fed..5db0ba05a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -151,6 +151,22 @@ def AMDAIEControlCodeLoopUnroll : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoopUnrollPass()"; } +def AMDAIEControlCodeLowering : + Pass<"iree-amdaie-controlcode-lowering", ""> { + let summary = "Lower control code ops to the most basic NPU write/sync/patch instructions"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoweringPass()"; +} + +def AMDAIEControlCodeToTransaction : + Pass<"iree-amdaie-controlcode-to-transaction", ""> { + let summary = "Convert controlcode instructions into a NPU instruction transaction."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeToTransactionPass()"; + let options = [ + Option<"dumpTransaction", "dump-transaction", "bool", /*default=*/"false", + "Dump the generated transaction. (Used for tests)"> + ]; +} + def AMDAIEConvertCoreForallToFor : Pass<"iree-amdaie-convert-core-forall-to-for", ""> { let summary = "Converts `scf.forall` to `scf.for` within `aie.core`."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 570e66b83..298338e4c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -23,6 +23,8 @@ iree_lit_test_suite( "combine_strided_ops.mlir" "connection_to_flow.mlir" "controlcode_loop_unrolling.mlir" + "controlcode_lowering.mlir" + "controlcode_to_transaction.mlir" "convert_core_forall_to_for.mlir" "create_aie_workgroup.mlir" "create_reference_to_allocation.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir index 92b0f691b..1dbf73473 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir @@ -6,14 +6,14 @@ // CHECK: amdaie.workgroup // CHECK: %[[tile_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[tile_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL_0]]}) -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA) -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA) +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}) -// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, 
port_type = DMA) -// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA) +// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}) module { func.func @assign_channels(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir index 15e196d7c..4fa6e7394 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir @@ -8,8 +8,8 @@ module { amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} amdaie.controlcode { amdaie.end @@ -29,10 +29,10 @@ module { // CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA, direction = S2MM) // CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 0 : ui8} // CHECK: amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true, packet_id = 1 : ui8} @@ -46,10 +46,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %2 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = true} @@ -71,8 +71,8 
@@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir index 74691c7ba..8babcb3f6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir @@ -8,15 +8,18 @@ // CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_4:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_5:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA, direction = S2MM) // CHECK: %[[FLOW_0:.+]] = amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL]]}, flow = %[[FLOW_0]]) -// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL]]}) {is_packet_flow = true} -// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_1]]) -// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL_2]]}) {is_packet_flow = false} -// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_2]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_2]]) +// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}, flow = %[[FLOW_1]]) +// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_4]]} -> {%[[CHANNEL_5]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}, flow = %[[FLOW_2]]) module { func.func @connection_to_flow(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { %c0 = arith.constant 0 : index @@ -29,12 +32,15 @@ module { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_memref 
%arg2, {%tile_0_2} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_5 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %4 = amdaie.connection(%0 {%channel}, %1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %5 = amdaie.connection(%2 {%channel_2}, %1 {%channel_1}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.connection(%0 {%channel_3}, %1 {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.connection(%2 {%channel_5}, %1 {%channel_4}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { amdaie.end } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir new file mode 100644 index 000000000..e15cabc27 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -0,0 +1,333 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-lowering)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @no_ops +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_source +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_source() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4([] [] [], %5[] [] [] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_source_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 128 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 128 : 
ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_source_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_source_bf16 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_source_bf16() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xbf16, 1 : i32>, memref<2048xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4([] [] [], %5[] [] [] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, 
lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_source_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 64 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_source_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_target +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_target() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id 
= 0 : ui8} + %4 = amdaie.connection(%2 {%channel_3}, %0 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_target_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 128 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 1152 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 32, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + 
amdaie.npu.dma_wait(%7 : !amdaie.async_target_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_target_i8 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_target_i8() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi8, 1 : i32>, memref<2048xi8, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%2 {%channel_3}, %0 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 256 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} 
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_target_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 256 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 32 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 288 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} + %7 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 32, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_target_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, 
iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd_bf16 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd_bf16() { + %c0 = 
arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xbf16, 1 : i32>, memref<2048xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = 
true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir new file mode 100644 index 000000000..bfe6dc456 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -0,0 +1,218 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-to-transaction{dump-transaction=true})" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000000 +// CHECK: 0x00000010 +// CHECK-LABEL: @no_ops +// CHECK: npu_instructions = dense_resource : tensor<4xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000081 +// CHECK: 0x00000030 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D004 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK-LABEL: @address_patch +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @address_patch() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000028 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK-LABEL: @push_to_queue_default_values +// CHECK: npu_instructions = dense_resource : 
tensor<10xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @push_to_queue_default_values() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000028 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D21C +// CHECK: 0x00000000 +// CHECK: 0x803F0002 +// CHECK: 0x00000018 +// CHECK-LABEL: @push_to_queue +// CHECK: npu_instructions = dense_resource : tensor<10xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @push_to_queue() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.push_to_queue {bd_id = 2 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 64 : ui32, row = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000002 +// CHECK: 0x00000038 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80FF000F +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00020001 +// CHECK: 0x00010100 +// CHECK-LABEL: @async_push_to_queue_and_wait +// CHECK: npu_instructions = dense_resource : tensor<14xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @async_push_to_queue_and_wait() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000001 +// CHECK: 0x00000000 +// CHECK: 0x0001D000 +// CHECK: 0x00000030 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x02000000 +// CHECK-LABEL: @write_bd_empty +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @write_bd_empty() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = 
array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000001 +// CHECK: 0x00000000 +// CHECK: 0x0201D040 +// CHECK: 0x00000030 +// CHECK: 0x00000400 +// CHECK: 0x00000020 +// CHECK: 0x40080000 +// CHECK: 0x01000000 +// CHECK: 0x81000007 +// CHECK: 0x0000003F +// CHECK: 0x00000000 +// CHECK: 0x02000000 +// CHECK-LABEL: @write_bd_with_addressing_and_packet +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @write_bd_with_addressing_and_packet() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index e75a379a5..a036bcb5f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -46,26 +46,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK: aiex.runtime_sequence @hal_bindings -// CHECK-SAME: %{{.+}}: memref<32x1024xi32> -// CHECK-SAME: %{{.+}}: memref<1024x64xi32> -// CHECK-SAME: %{{.+}}: memref<32x64xi32> -// CHECK-NOT: memref.assume_alignment -#pipeline_layout = #hal.pipeline.layout, - , - -]> +// CHECK: module +// CHECK: aie.device +// CHECK: aiex.runtime_sequence @workgroup_with_instructions +// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "workgroup_with_instructions"} #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @hal_bindings() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024x64xi32> - memref.assume_alignment %0, 64 : memref<1024x64xi32> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x1024xi32> - memref.assume_alignment %1, 64 : memref<32x1024xi32> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> + func.func @workgroup_with_instructions() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } {npu_instructions = dense_resource : tensor<208xui32>} return } } @@ -202,8 
+194,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { @@ -282,8 +274,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { @@ -372,14 +364,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) 
amdaie.controlcode { @@ -525,14 +517,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> - %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) amdaie.controlcode { @@ -608,12 +600,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %4 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %5 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %6 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}, flow = %5) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { @@ -628,219 +620,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- 
-//===----------------------------------------------------------------------===// -// Controlcode tests -//===----------------------------------------------------------------------===// - -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @invalid_npu_dma_cpy_nd() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) - %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}} - amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK: aie.device -// CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<4096xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) { -// CHECK-SAME: id = 0 : i64 -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_with_repeat_already_on_outer_dim() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %4 = 
amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) - %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -// Test to show mix of implicit/explicit source/target addressing in amdaie.npu.dma_cpy_nd. - -// CHECK: aie.device -// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xi32> -// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32> -// CHECK: aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<4096xi32>, %[[ARG1:.+]]: memref<2048xi32>) -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: scf.forall -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: } -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @controlcode() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> - %lock_2 = amdaie.lock(%tile_0_1(0), 1) - %lock_3 = amdaie.lock(%tile_0_1(1), 0) - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, 
{%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} - %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) - %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> - %14 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_source_token) - %15 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_source_token) - scf.forall (%arg0, %arg1) in (2, 1) { - %16 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%16 : !amdaie.async_target_token) - %17 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%17 : !amdaie.async_target_token) - } - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK: aie.device -// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xf32> -// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xbf16> -// CHECK: aiex.runtime_sequence @controlcode_bf16_f32(%[[ARG0:.+]]: memref<4096xbf16>, %[[ARG1:.+]]: memref<2048xf32>) -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @controlcode_bf16_f32() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xbf16> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) 
alignment(64) offset(%c0) : memref<2048xf32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xbf16, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xf32, 1 : i32> - %lock_2 = amdaie.lock(%tile_0_1(0), 1) - %lock_3 = amdaie.lock(%tile_0_1(1), 0) - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} - %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) - %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> - %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> - %14 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_source_token) - %15 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_source_token) - %16 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%16 : !amdaie.async_target_token) - %17 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%17 : !amdaie.async_target_token) - amdaie.end - } - } - return - } -} - -// ----- - //===----------------------------------------------------------------------===// // CoreOp tests //===----------------------------------------------------------------------===// @@ -1080,11 +859,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) // CHECK: aie.end // CHECK: } -// CHECK: aiex.runtime_sequence @large_example(%[[ARG0:.*]]: memref<4096xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: } -// 
CHECK: } +// CHECK: aiex.runtime_sequence @large_example #pipeline_layout = #hal.pipeline.layout]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1115,13 +890,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile, 0, port_type = DMA) - %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA) + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) %4 = amdaie.flow({%channel} -> {%channel_13}) {is_packet_flow = false} %5 = amdaie.connection(%2 {%channel_13}, %1 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA) - %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA) - %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA) + %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA, direction = S2MM) + %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_14} -> {%channel_15, %channel_16}) {is_packet_flow = false} %7 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}, flow = %6) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) %8 = amdaie.core(%tile_1, in : [%7], out : []) { @@ -1149,9 +924,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %10 = amdaie.npu.circular_dma_cpy_nd %5([0, 0] [64, 64] [32, 1], [] [] []) %11 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [64, 64] [32, 1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %13 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_source_token) amdaie.end } } diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index 2d23678d4..6ffc1b11a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -159,6 +159,10 @@ AMDAIEDeviceModel::AMDAIEDeviceModel( TRY_XAIE_API_FATAL_ERROR(XAie_TurnEccOff, &devInst); } +uint8_t AMDAIEDeviceModel::getMinStrideBitWidth() const { + return deviceConfig.minStrideBitWidth; +} + int AMDAIEDeviceModel::rows() const { if (device == AMDAIEDevice::xcvc1902 || device == AMDAIEDevice::xcve2802) return MLIRAIELegacy::rows(*this); diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index de8c855b7..e9a227d25 100644 --- 
a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -222,6 +222,10 @@ struct AMDAIEDeviceModel {
 /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in
 /// another way before adding new fields to this struct.
 struct AMDAIEDeviceConfig {
+  /// Set default minimum stride bitwidth/addressing granularity to 32 bits as
+  /// this is the value for all current architecture versions.
+  uint8_t minStrideBitWidth{32};
+
   /// The max packet id.
   uint8_t packetIdMaxIdx{0};
   /// Currently, the max arbiter/msel is hidden inside aie-rt.
   uint8_t streamSwitchCoreArbiterMax{0};
@@ -246,6 +250,7 @@ struct AMDAIEDeviceModel {
                    AMDAIEDevice device, AMDAIEDeviceConfig deviceConfig);
+  uint8_t getMinStrideBitWidth() const;
   int rows() const;
   int columns() const;