diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index c64955c3d..18d6b2a49 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -6,15 +6,14 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" +#include + #include "iree-amd-aie/IR/AMDAIEDialect.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/OpDefinition.h" -#define GET_OP_CLASSES -#include "iree-amd-aie/IR/AMDAIEOps.cpp.inc" - namespace mlir::iree_compiler::AMDAIE { void AMDAIEDialect::initializeAMDAIEOps() { @@ -24,6 +23,38 @@ void AMDAIEDialect::initializeAMDAIEOps() { >(); } +//===----------------------------------------------------------------------===// +// custom(type($async_token)) +//===----------------------------------------------------------------------===// + +/// Parses an optional list of async operands. +static ParseResult parseAsyncTokenType(OpAsmParser &parser, Type &resultType) { + if (succeeded(parser.parseOptionalKeyword("async_source"))) { + resultType = parser.getBuilder().getType(); + } else if (succeeded(parser.parseOptionalKeyword("async_target"))) { + resultType = parser.getBuilder().getType(); + } else if (succeeded(parser.parseOptionalKeyword("async"))) { + resultType = parser.getBuilder().getType(); + } + return success(); +} + +/// Prints optional async tokens with its leading keyword. +static void printAsyncTokenType(OpAsmPrinter &p, Operation *op, + Type asyncTokenType) { + if (asyncTokenType) { + if (isa(asyncTokenType)) { + p << "async_source"; + } else if (isa(asyncTokenType)) { + p << "async_target"; + } else if (isa(asyncTokenType)) { + p << "async"; + } else { + assert(false && "unsupported async token type"); + } + } +} + //===----------------------------------------------------------------------===// // AMDAIE_BdIdOp //===----------------------------------------------------------------------===// @@ -1026,6 +1057,97 @@ void NpuDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, context); } +//===----------------------------------------------------------------------===// +// AMDAIE_NpuHalfDmaCpyNdOp +//===----------------------------------------------------------------------===// + +// Build a NpuHalfDmaCpyNdOp with mixed static and dynamic entries. +void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result, + TypeRange resultTypes, Value connection, + Value input, ArrayRef offsets, + ArrayRef sizes, + ArrayRef strides, Value bdId, + Value channel) { + SmallVector staticOffsets, staticSizes, staticStrides; + SmallVector dynamicOffsets, dynamicSizes, dynamicStrides; + dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); + dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes); + dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides); + build(b, result, resultTypes, connection, input, dynamicOffsets, dynamicSizes, + dynamicStrides, staticOffsets, staticSizes, staticStrides, bdId, + channel); +} + +// Build a NpuHalfDmaCpyNdOp with static entries. 
+void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
+                              TypeRange resultTypes, Value connection,
+                              Value input, ArrayRef<int64_t> offsets,
+                              ArrayRef<int64_t> sizes,
+                              ArrayRef<int64_t> strides, mlir::Value bdId,
+                              Value channel) {
+  SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(llvm::map_range(
+      offsets,
+      [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
+  SmallVector<OpFoldResult> sizeValues =
+      llvm::to_vector<4>(llvm::map_range(sizes, [&](int64_t v) -> OpFoldResult {
+        return b.getI64IntegerAttr(v);
+      }));
+  SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(llvm::map_range(
+      strides,
+      [&](int64_t v) -> OpFoldResult { return b.getI64IntegerAttr(v); }));
+  build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
+        strideValues, bdId, channel);
+}
+
+// Build a NpuHalfDmaCpyNdOp with dynamic entries.
+void NpuHalfDmaCpyNdOp::build(OpBuilder &b, OperationState &result,
+                              TypeRange resultTypes, Value connection,
+                              Value input, ValueRange offsets, ValueRange sizes,
+                              ValueRange strides, mlir::Value bdId,
+                              Value channel) {
+  SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
+      llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
+      llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
+  SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
+      llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
+  build(b, result, resultTypes, connection, input, offsetValues, sizeValues,
+        strideValues, bdId, channel);
+}
+
+std::optional<int64_t> NpuHalfDmaCpyNdOp::getStaticBaseOffset() {
+  int64_t baseOffset = 0;
+  SmallVector<OpFoldResult> offsets = getMixedOffsets();
+  SmallVector<OpFoldResult> strides = getMixedStrides();
+  for (auto &&[offset, stride] : llvm::zip(offsets, strides)) {
+    std::optional<int64_t> constantOffset = getConstantIntValue(offset);
+    std::optional<int64_t> constantStride = getConstantIntValue(stride);
+    // If offset is zero, we can just continue to the next one. This enables
+    // the case where the stride is dynamic.
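+    // For example (hypothetical values), offsets [0, 2] with strides [%s, 8]
+    // give a static base offset of 2 * 8 = 16: the zero offset is skipped
+    // even though its stride %s is dynamic. A non-zero offset paired with a
+    // dynamic stride (or a dynamic offset) makes the base offset
+    // non-computable, in which case std::nullopt is returned.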
+ if (constantOffset && constantOffset.value() == 0) continue; + if (constantOffset && constantStride) { + baseOffset += (constantOffset.value() * constantStride.value()); + } else { + return std::nullopt; + } + } + return baseOffset; +} + +std::optional NpuHalfDmaCpyNdOp::getAccessStaticSize() { + SmallVector sizes = getMixedSizes(); + if (sizes.size() == 0) return 0; + std::optional> staticSizes = getConstantIntValues(sizes); + if (!staticSizes) return std::nullopt; + return std::accumulate(staticSizes->begin(), staticSizes->end(), 1, + std::multiplies<>()); +} + +bool NpuHalfDmaCpyNdOp::hasDmaWaitOpUser() { + return llvm::any_of((*this)->getUsers(), + [](auto userOp) { return isa(userOp); }); +} + //===----------------------------------------------------------------------===// // AMDAIE_NpuCircularDmaCpyNdOp //===----------------------------------------------------------------------===// @@ -1254,3 +1376,10 @@ LogicalResult WorkgroupOp::verify() { return success(); } } // namespace mlir::iree_compiler::AMDAIE + +//===----------------------------------------------------------------------===// +// TableGen definitions (intentionally last) +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "iree-amd-aie/IR/AMDAIEOps.cpp.inc" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index 087dac7b4..3a4df8459 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -11,6 +11,7 @@ include "mlir/Interfaces/CopyOpInterface.td" include "mlir/Interfaces/InferTypeOpInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/Interfaces/ViewLikeInterface.td" +include "mlir/IR/BuiltinAttributes.td" include "mlir/IR/OpAsmInterface.td" include "mlir/IR/OpBase.td" include "mlir/IR/SymbolInterfaces.td" @@ -177,6 +178,9 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup", }]; let regions = (region SizedRegion<1>:$region); + let arguments = ( + ins OptionalAttr:$npu_instructions + ); let assemblyFormat = [{ regions attr-dict }]; @@ -191,7 +195,7 @@ def AMDAIE_WorkgroupOp : AMDAIE_Op<"workgroup", let extraClassDeclaration = [{ // Return the control code op within this workgroup. ControlCodeOp getControlCode() { - return dyn_cast(getBody()->getTerminator()); + return cast(getBody()->getTerminator()); } // Make sure the WorkgroupOp region is well-formed with a ControlCodeOp // terminator. 
@@ -355,14 +359,15 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ ```mlir %tile = amdaie.tile(%c0, %c0) - %channel = amdaie.channel(%tile, 0, port_type = DMA) + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) ``` }]; let arguments = ( ins Index:$tile, ConfinedAttr]>:$value, - StrmSwPortTypeAttr:$port_type + StrmSwPortTypeAttr:$port_type, + DMAChannelDir:$direction ); let extraClassDeclaration = [{ @@ -370,7 +375,13 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ }]; let assemblyFormat = [{ - `(` $tile `,` $value `,` `port_type` `=` $port_type `)` attr-dict + `(` + $tile `,` + $value `,` + `port_type` `=` $port_type `,` + `direction` `=` $direction + `)` + attr-dict }]; } @@ -378,6 +389,30 @@ def AMDAIE_ChannelOp: AMDAIE_Op<"channel", [ // IREE AMDAIE Npu Ops //===----------------------------------------------------------------------===// +def AMDAIE_NpuAddressPatchOp: AMDAIE_Op<"npu.address_patch"> { + let summary = "Operation to patch the address inside a buffer descriptor"; + let description = [{ + This NPU controller operation patches the address inside the buffer + descriptor with provided ID on the specified column. This enables codegen to + provide an argument index and offset at compile time, which is then + translated to a physical address at runtime by the firmware. + + Example: + + ```mlir + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, + col = 0 : ui32, offset = 1024 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$bd_id, + UI32Attr:$arg_idx, + UI32Attr:$offset + ); + let assemblyFormat = [{ attr-dict }]; +} + def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ AttrSizedOperandSegments, DoublyStridedOpInterface]> { let summary = "The Npu uController's dma operator"; @@ -556,6 +591,151 @@ def AMDAIE_NpuDmaCpyNdOp: AMDAIE_Op<"npu.dma_cpy_nd", [ let hasCanonicalizer = 1; } +def AMDAIE_NpuHalfDmaCpyNdOp + : AMDAIE_Op<"npu.half_dma_cpy_nd", [AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface]> { + let summary = "The NPU uController's DMA operation, operating on a single port"; + let description = [{ + The NPU DMA operation represents a strided DMA operation with an unlimited + number of dimensions, executed by the NPU uController. This operation refers + to a `connection` and `input` logical objectFifo being operated on, as well + as an optionally specified BD ID and `channel` (DMA port). The `connection` + operand provides information on how to use the connection, for example + whether a packet header is needed. + + The representation supports a partially-static representation for the + `offsets`, `sizes` and `strides`. A special sentinel value + ShapedType::kDynamic encodes that the corresponding entry has a dynamic + value. + + Example: + + ```mlir + %2 = amdaie.connection(%1, %0) + : (!amdaie.logicalobjectfifo>, + !amdaie.logicalobjectfifo>) + %bd_id = amdaie.bd_id(%tile_0_0, 0) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + ... + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} + : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %4 = amdaie.npu.half_dma_cpy_nd async %2(%0[0, 0] [32, 64] [1024, 1] + bd_id = %bd_id channel = %channel) + ... 
+ } + ``` + }]; + + let arguments = ( + ins Index:$connection, + AnyAMDAIELogicalObjectFifoType:$input, + Variadic:$offsets, + Variadic:$sizes, + Variadic:$strides, + DenseI64ArrayAttr:$static_offsets, + DenseI64ArrayAttr:$static_sizes, + DenseI64ArrayAttr:$static_strides, + Optional:$bd_id, + Optional:$channel + ); + + let results = (outs Optional:$async_token); + + let assemblyFormat = [{ + custom(type($async_token)) + $connection + `(` + $input + custom($offsets, $static_offsets) + custom($sizes, $static_sizes) + custom($strides, $static_strides) + (`bd_id` `=` $bd_id^)? + (`channel` `=` $channel^)? + `)` + attr-dict + `:` type($input) + }]; + + let builders = [ + // Build a NpuHalfDmaCpyNdOp with mixed static and dynamic entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$input, "ArrayRef":$offsets, + "ArrayRef":$sizes, "ArrayRef":$strides, + "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + // Build a NpuHalfDmaCpyNdOp with static entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$target, "ArrayRef":$offsets, + "ArrayRef":$sizes, "ArrayRef":$strides, + "::mlir::Value":$bd_id, "::mlir::Value":$channel)>, + // Build a NpuHalfDmaCpyNdOp with dynamic entries. + OpBuilder<(ins "::mlir::TypeRange":$result_types, "Value":$connection, + "::mlir::Value":$input, "ValueRange":$offsets, "ValueRange":$sizes, + "ValueRange":$strides, "::mlir::Value":$bd_id, "::mlir::Value":$channel)> + ]; + + let extraClassDeclaration = [{ + /// Return the number of leading operands before the `offsets`, `sizes` and + /// and `strides` operands. + static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; } + + /// Return the expected rank of each of the`static_offsets`, `static_sizes` + /// and `static_strides` attributes. + std::array getArrayAttrMaxRanks() { + unsigned rank = getMixedOffsets().size(); + return {rank, rank, rank}; + } + + std::optional getBdIdOp() { + return dyn_cast_if_present(getBdId().getDefiningOp()); + } + + // Return the input `amdaie.connection` operation. + std::optional getConnectionOp() { + return dyn_cast_if_present(getConnection().getDefiningOp()); + } + + std::optional getChannelOp() { + return dyn_cast_if_present(getChannel().getDefiningOp()); + } + + // Return the source memref type. This is retrieved using information from + // the input DMA operation. + MemRefType getMemrefType() { + return cast(getInput().getType()) + .getElementType(); + } + + // Return the source memory space as an attribute. + Attribute getMemorySpace() { + return cast(getInput().getType()) + .getMemorySpace(); + } + + // Helper method to return the memory space as an integer. If no memory + // space attribute, this indicates a global memory space and 0 is returned. + // Else cast the memory space attribute to an integer. + uint8_t getMemorySpaceAsUInt() { + Attribute memSpace = getMemorySpace(); + return memSpace ? cast(memSpace).getInt() : 0; + } + + // Compute and return the constant base offset if possible. + std::optional getStaticBaseOffset(); + + // Compute and return the size of the DMA access if possible. + std::optional getAccessStaticSize(); + + // Check whether this dma operation has a wait user. + bool hasDmaWaitOpUser(); + + // Check whether this operation has addressing. 
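+    // I.e. whether any offsets, sizes or strides are specified; an op of the
+    // form `(%input [] [] [])` has no addressing.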
+ bool hasAddressing() { + return !getMixedOffsets().empty() || !getMixedSizes().empty() + || !getMixedStrides().empty(); + } + }]; +} + def AMDAIE_NpuCircularDmaCpyNdOp: AMDAIE_Op<"npu.circular_dma_cpy_nd", [ AMDAIE_CircularDmaOp, AttrSizedOperandSegments, DoublyStridedOpInterface]>, Results<(outs Index)> { @@ -761,6 +941,85 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> { }]; } +def AMDAIE_NpuPushToQueueOp: AMDAIE_Op<"npu.push_to_queue">, + Results<(outs Optional:$async_token)> { + let summary = "Push the provided BD to the specified channel's queue."; + let description = [{ + This NPU controller operation to push a buffer descriptor with specified + `bd_id` to the queue of the (`channel`, `direction`) DMA port on tile + (`col`, `row`). The BD will be repeated for a `repeat_count` number of + times. + + Example: + + ```mlir + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, + col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, + row = 0 : ui32} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + DMAChannelDir:$direction, + UI32Attr:$channel, + UI32Attr:$repeat_count, + UI32Attr:$bd_id + ); + let assemblyFormat = [{ + custom(type($async_token)) attr-dict + }]; +} + +def AMDAIE_NpuWriteBdOp: AMDAIE_Op<"npu.write_bd"> { + let summary = "Initialize the buffer descriptor with specified ID"; + let description = [{ + This NPU controller operation to initialize the `bd_id` buffer descriptor on + the (`col`, `row`) tile with the provided configurations. + + Example: + + ```mlir + amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, + buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, + iteration_current = 0 : ui32, iteration_size = 0 : ui32, + iteration_stride = 0 : ui32, lock_acq_enable = false, + lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, + lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, + packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, + paddings_before = array, row = 0 : ui32, sizes = array, + strides = array, use_next_bd = false, valid_bd = true} + ``` + }]; + let arguments = ( + ins UI32Attr:$col, + UI32Attr:$row, + UI32Attr:$bd_id, + UI32Attr:$buffer_length, + UI32Attr:$buffer_offset, + DenseI32ArrayAttr:$sizes, + DenseI32ArrayAttr:$strides, + DenseI32ArrayAttr:$paddings_before, + DenseI32ArrayAttr:$paddings_after, + UI32Attr:$iteration_current, + UI32Attr:$iteration_size, + UI32Attr:$iteration_stride, + BoolAttr:$enable_packet, + UI32Attr:$packet_id, + UI32Attr:$packet_type, + UI32Attr:$out_of_order_id, + BoolAttr:$use_next_bd, + UI32Attr:$next_bd, + BoolAttr:$valid_bd, + BoolAttr:$lock_acq_enable, + I32Attr:$lock_rel_val, + UI32Attr:$lock_rel_id, + I32Attr:$lock_acq_val, + UI32Attr:$lock_acq_id + ); + let assemblyFormat = [{ attr-dict }]; +} + //===----------------------------------------------------------------------===// // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 53831b2f0..bb31f7ec9 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -153,8 +153,8 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.flow({%[[CHANNEL]]} -> 
{%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 1 : ui8} func.func @flow() { @@ -162,8 +162,8 @@ func.func @flow() { %c1 = arith.constant 1 : index %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true, packet_id = 1 : ui8} return @@ -391,6 +391,65 @@ func.func @npu_dma_cpy_nd_all_operands(%arg0: !amdaie.logicalobjectfifo> +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[BD_ID:.+]] = amdaie.bd_id(%[[TILE_0_0]], 0) +// CHECK-DAG: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM) +// CHECK-DAG: %[[CONNECTION_0:.+]] = amdaie.connection +func.func @npu_half_dma_cpy_nd(%arg0: !amdaie.logicalobjectfifo>, %arg1: !amdaie.logicalobjectfifo>) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %tile_0_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0_0, 0) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %0 = amdaie.connection(%arg0, %arg1) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: %{{.+}} = amdaie.npu.half_dma_cpy_nd async %[[CONNECTION_0]](%[[ARG0]] [] [] []) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd async %0(%arg0[] [] []) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [0] [1024] [1] bd_id = %[[BD_ID]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[0] [1024] [1] bd_id = %bd_id) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [%[[C0]], 0] [%[[C0]], 64] [%[[C0]], 1] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[%c0, 0] [%c0, 64] [%c0, 1] channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.half_dma_cpy_nd %[[CONNECTION_0]](%[[ARG0]] [] [] [] bd_id = %[[BD_ID]] channel = %[[CHANNEL]]) : !amdaie.logicalobjectfifo> + amdaie.npu.half_dma_cpy_nd %0(%arg0[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_patch_address +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +func.func @npu_patch_address() { + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_push_to_queue +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: %{{.+}} = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} +func.func @npu_push_to_queue() { + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : 
i32, repeat_count = 1 : ui32, row = 0 : ui32} + %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} + return +} + +// ----- + +// CHECK-LABEL: func.func @npu_write_bd +// CHECK: amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +func.func @npu_write_bd() { + amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + return +} + +// ----- + // CHECK-LABEL: func.func @workgroup // CHECK: amdaie.workgroup // CHECK: amdaie.core diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir index e902c7e2a..429089cdd 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Test/samples/matmul_pack_peel_objectfifo.mlir @@ -1,6 +1,6 @@ // This pipeline is obtained by going into Passes.cpp, and dumping the pass pipeline (at the end of addAMDAIEObjectFifoLoweringPasses) using `passManager.dump()`. This test is included, as it can be useful to have a reference in IR of all the passes that are run. 
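// The pipeline below includes the iree-amdaie-controlcode-lowering and
// iree-amdaie-controlcode-to-transaction passes, so the checked runtime
// sequence carries an `npu_instructions` attribute instead of individual
// aiex.npu.dma_memcpy_nd / aiex.npu.dma_wait ops.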
-// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(fold-memref-alias-ops,iree-amdaie-distribute-l1-allocations,iree-amdaie-convert-to-dma,iree-amdaie-normalize-loop-bounds,iree-amdaie-insert-cores,iree-amdaie-localize-logicalobjectfifo,cse,iree-amdaie-distribute-cores-and-objectfifos,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-split-logical-objectfifos-for-connection-reuse,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},iree-amdaie-dma-to-circular-dma,func.func(iree-amdaie-create-aie-workgroup),cse,iree-amdaie-dma-cse,iree-amdaie-hoist-logical-objectfifo,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},iree-amdaie-flatten-logicalobjectfifo,iree-amdaie-assign-logical-objectfifo-depth{l1-buffer-depth=2 l2-buffer-depth=2 l3-buffer-depth=1},iree-amdaie-access-to-acquire-release,iree-amdaie-none-access-to-temporary-buffer,iree-amdaie-assign-connection-types,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-composition{only-zero-stride-on-outer-dim=true},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-assign-npu-dma-bd-ids,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-controlcode-loop-unroll,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-dma-cse,iree-amdaie-canonicalize-doubly-strided-op{fold-single-dims=false},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-convert-core-forall-to-for,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-assign-channels,cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-objfifo-bufferization,iree-amdaie-connection-to-flow,iree-amdaie-assign-packet-ids,iree-amdaie-controlcode-lowering,iree-amdaie-controlcode-to-transaction,iree-amdaie-acquire-release-to-use-lock,iree-amdaie-canonicalize-npu-dma-cpy-nd{nb-dimensions=4},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-sink-into-core,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},iree-amdaie-lower-to-aie,iree-amdaie-remove-memoryspace)" --split-input-file %s | FileCheck %s @@ -19,9 +19,8 @@ // CHECK-DAG: aie.core(%[[TILE_1_2]]) // CHECK: aie.use_lock // Check a bit of the aiex.runtime_sequence: -// CHECK: aiex.runtime_sequence @matmul_i32(%[[ARG0:.+]]: memref<32x1024xi32>, %[[ARG1:.+]]: memref<1024x64xi32>, %[[ARG2:.+]]: memref<32x64xi32>) -// CHECK-DAG: aiex.npu.dma_memcpy_nd -// CHECK-DAG: aiex.npu.dma_wait +// CHECK: aiex.runtime_sequence @matmul_i32() +// CHECK: } {npu_instructions = dense_resource : tensor<174xui32>, runtime_sequence_name = "matmul_i32"} #pipeline_layout = #hal.pipeline.layout, diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp index d0440336b..1959c7a9a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEAssignChannels.cpp @@ -45,14 +45,16 @@ LogicalResult assignChannels(AMDAIE::WorkgroupOp workgroupOp) { for (Value tile : sourceLogicalObjFifo.getTiles()) { uint8_t channel = generator.getProducerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA, + AMDAIE::DMAChannelDir::MM2S); 
sourceChannels.push_back(channelOp.getResult()); } SmallVector targetChannels; for (Value tile : targetLogicalObjFifo.getTiles()) { uint8_t channel = generator.getConsumerDMAChannel(tile); auto channelOp = rewriter.create( - rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA); + rewriter.getUnknownLoc(), tile, channel, StrmSwPortType::DMA, + AMDAIE::DMAChannelDir::S2MM); targetChannels.push_back(channelOp.getResult()); } rewriter.replaceOpWithNewOp( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp new file mode 100644 index 000000000..2348478d9 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeLowering.cpp @@ -0,0 +1,342 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEDmaUtils.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "mlir/Transforms/DialectConversion.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +#define DEBUG_TYPE "iree-amdaie-controlcode-lowering" + +namespace mlir::iree_compiler::AMDAIE { + +struct DmaCpyNdToHalfDmaCpyNdConverter final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + AMDAIE::NpuDmaCpyNdOp dmaOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuDmaCpyNdOp]\n"); + AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp(); + if (!connectionOp) { + return dmaOp.emitOpError() + << "should operate on an `amdaie.connection` op"; + } + // Convert source half. + Value source = + dmaOp.getSource() ? dmaOp.getSource() : connectionOp.getSource(); + if (connectionOp.getSourceChannels().size() != 1) + return connectionOp.emitOpError() << "expected a single source channel"; + auto sourceChannelOp = dyn_cast( + connectionOp.getSourceChannels()[0].getDefiningOp()); + bool hasAsyncSourceToken = + llvm::any_of(dmaOp.getAsyncTokens(), [](Value token) { + return isa(token.getType()); + }); + SmallVector resultTypes = { + rewriter.getType()}; + TypeRange sourceResultTypes = + hasAsyncSourceToken ? TypeRange{resultTypes} : TypeRange{}; + rewriter.setInsertionPoint(dmaOp); + auto sourceDma = rewriter.create( + dmaOp.getLoc(), sourceResultTypes, connectionOp, source, + dmaOp.getSourceMixedOffsets(), dmaOp.getSourceMixedSizes(), + dmaOp.getSourceMixedStrides(), dmaOp.getSourceBdId(), sourceChannelOp); + + // Convert target half. + Value target = + dmaOp.getTarget() ? dmaOp.getTarget() : connectionOp.getTarget(); + if (connectionOp.getTargetChannels().size() != 1) + return connectionOp.emitOpError() << "expected a single target channel"; + auto targetChannelOp = dyn_cast( + connectionOp.getTargetChannels()[0].getDefiningOp()); + bool hasAsyncTargetToken = + llvm::any_of(dmaOp.getAsyncTokens(), [](Value token) { + return isa(token.getType()); + }); + TypeRange targetResultTypes = + hasAsyncTargetToken ? 
TypeRange{resultTypes} : TypeRange{}; + auto targetDma = rewriter.create( + dmaOp.getLoc(), targetResultTypes, connectionOp, target, + dmaOp.getTargetMixedOffsets(), dmaOp.getTargetMixedSizes(), + dmaOp.getTargetMixedStrides(), dmaOp.getTargetBdId(), targetChannelOp); + if (dmaOp.getNumResults() == 1) { + if (sourceDma.getNumResults() == 1) { + rewriter.replaceUsesWithIf( + dmaOp.getResult(0), sourceDma.getResult(0), [&](OpOperand &use) { + return isa(use.get().getType()) && + isa(use.getOwner()); + }); + } + if (targetDma.getNumResults() == 1) { + rewriter.replaceUsesWithIf( + dmaOp.getResult(0), targetDma.getResult(0), [&](OpOperand &use) { + return isa(use.get().getType()) && + isa(use.getOwner()); + }); + } + if (!dmaOp.getResult(0).use_empty()) + return dmaOp.emitOpError() << "should not have any uses anymore"; + } + rewriter.eraseOp(dmaOp); + return success(); + } +}; + +struct HalfDmaCpyNdToNpuConverter final + : OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + + HalfDmaCpyNdToNpuConverter(MLIRContext *context, + const AMDAIE::AMDAIEDeviceModel &deviceModel) + : OpConversionPattern(context), deviceModel(std::move(deviceModel)) { + minStrideBitWidth = deviceModel.getMinStrideBitWidth(); + } + + /// Insert ops to write a BD, patch the address and push it to the queue. This + /// is specific to Shim BDs for now. + FailureOr insertWriteBdOps( + AMDAIE::NpuHalfDmaCpyNdOp op, ConversionPatternRewriter &rewriter, + AMDAIE::AMDAIETileType tileType, + AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjFifo, + AMDAIE::BdIdOp bdIdOp, AMDAIE::ChannelOp channelOp, int64_t bufferLength, + int64_t bufferOffset, int32_t enablePacket, int32_t packetId, + int32_t packetType, ArrayRef sizes, + ArrayRef strides) const { + uint8_t numIntraAddrDim = deviceModel.getDmaProp( + tileType, AMDAIE::AMDAIEDmaProp::NumAddrDim); + uint8_t numAddrDim = numIntraAddrDim + kAMDAIEDmaNbInterDims; + auto subspanOp = dyn_cast_if_present( + logicalObjFifo.getMemref().getDefiningOp()); + if (!subspanOp) { + return logicalObjFifo.emitOpError() + << "must operate on an `hal.interface.binding.subspan`"; + } + int64_t argIdx = subspanOp.getBinding().getZExtValue(); + MemRefType memrefType = logicalObjFifo.getMemrefType(); + int64_t elemWidthInBits = memrefType.getElementTypeBitWidth(); + std::optional maybeDmaDirection = + channelOp.getDirection(); + if (!maybeDmaDirection) { + return channelOp.emitOpError() + << "direction needed for lowering of NPU ops"; + } + AMDAIE::TileOp tileOp = + dyn_cast_if_present(bdIdOp.getTile().getDefiningOp()); + if (!tileOp) + return bdIdOp.emitOpError() << "must operate on an `amdaie.tile`"; + int64_t col = getConstantIndexOrAssert(tileOp.getCol()); + int64_t row = getConstantIndexOrAssert(tileOp.getRow()); + int32_t bdId = bdIdOp.getValue(); + int32_t outOfOrderId{0}; + + SmallVector staticSizes; + SmallVector staticStrides; + // Padding is unused for now. + SmallVector paddingsBefore; + SmallVector paddingsAfter; + int32_t iterationCurrent{0}; + int32_t iterationSize{0}; + int32_t iterationStride{0}; + int32_t repeatCount{1}; + for (auto iter : llvm::enumerate(llvm::zip(sizes, strides))) { + int64_t size = getConstantIndexOrAssert(std::get<0>(iter.value())); + int64_t stride = getConstantIndexOrAssert(std::get<1>(iter.value())); + + /// Map the outer dimension to the iteration dimension if intra dimensions + /// are all used already or if the first stride == 0 as only the iteration + /// dimension supports stride == 0. 
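+      /// For example (hypothetical values), sizes [2, 32, 64] with strides
+      /// [0, 64, 1] put the outer dimension on the repeat count
+      /// (repeat_count = 2) and leave [32, 64] to the intra-BD dimensions
+      /// handled in the else-branch below.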
+ if (iter.index() == 0 && (sizes.size() == numAddrDim || stride == 0)) { + if (stride == 0) { + repeatCount = size; + } else { + iterationStride = + std::max(stride * elemWidthInBits / minStrideBitWidth, + (int64_t)1); + iterationSize = size; + if (stride == 1) + size = (size * elemWidthInBits) / minStrideBitWidth; + repeatCount = iterationSize; + } + } else { + staticStrides.push_back( + std::max(stride * elemWidthInBits / minStrideBitWidth, + (int64_t)1)); + // Innermost size needs to account for addressing granularity. + if (iter.index() == (sizes.size() - 1)) { + staticSizes.push_back(size * elemWidthInBits / + minStrideBitWidth); + } else { + staticSizes.push_back(size); + } + } + } + // Make sure sizes/strides have the correct size based on the number from + // intra addressing dimensions. + staticSizes.insert(staticSizes.begin(), + numIntraAddrDim - staticSizes.size(), 0); + staticStrides.insert(staticStrides.begin(), + numIntraAddrDim - staticStrides.size(), 0); + + bool useNextBd{false}; + int32_t nextBd{0}; + bool validBd{true}; + int32_t lockRelVal{0}; + int32_t lockRelId{0}; + bool lockAcqEnable{false}; + int32_t lockAcqVal{0}; + int32_t lockAcqId{0}; + + uint32_t bufferLengthInWords = + bufferLength * elemWidthInBits / minStrideBitWidth; + uint32_t innerBufferLength = bufferLengthInWords / repeatCount; + uint32_t bufferOffsetInBytes = bufferOffset * elemWidthInBits / 8; + + // Offset set to zero for shim as the offset is embedded in the address + // patch. + rewriter.create( + op.getLoc(), col, row, bdId, innerBufferLength, 0, staticSizes, + staticStrides, paddingsBefore, paddingsAfter, iterationCurrent, + iterationSize, iterationStride, enablePacket, packetId, packetType, + outOfOrderId, useNextBd, nextBd, validBd, lockAcqEnable, lockRelVal, + lockRelId, lockAcqVal, lockAcqId); + rewriter.create(op.getLoc(), col, bdId, argIdx, + bufferOffsetInBytes); + SmallVector resultTypes = { + rewriter.getType()}; + TypeRange resultTypeRange = + op.getAsyncToken() ? TypeRange{resultTypes} : TypeRange{}; + auto npuPushToQueueOp = rewriter.create( + op.getLoc(), resultTypeRange, col, row, maybeDmaDirection.value(), + channelOp.getValue(), repeatCount, bdId); + return npuPushToQueueOp; + } + + LogicalResult matchAndRewrite( + AMDAIE::NpuHalfDmaCpyNdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "matchAndRewrite[AMDAIE::NpuHalfDmaCpyNdOp]\n"); + // First retrieve the connection and flow ops operated on. + // NOTE(jornt): this will logic will simplify in the future when DMA ops can + // operate directly on `amdaie.flow`. + std::optional maybeConnectionOp = + op.getConnectionOp(); + if (!maybeConnectionOp) { + return op.emitOpError() + << "expected to operate on an `amdaie.connection`"; + } + std::optional maybeFlowOp = maybeConnectionOp->getFlowOp(); + if (!maybeFlowOp) { + return maybeConnectionOp->emitOpError() + << "expected to operate on an `amdaie.flow`"; + } + bool enablePacket = maybeFlowOp->getIsPacketFlow(); + int32_t packetId{0}; + int32_t packetType{0}; + std::optional maybePacketId = maybeFlowOp->getPacketId(); + if (enablePacket) { + if (!maybePacketId) { + return maybeFlowOp->emitOpError() + << "packet flow enabled, but no packet ID is set"; + } + packetId = maybePacketId.value(); + } + // Only support Shim for now. 
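+    // A memory space of 0 denotes the global (shim/L3) side; half-DMA ops on
+    // any other memory space are erased here instead of being lowered.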
+ if (op.getMemorySpaceAsUInt() != 0) { + rewriter.eraseOp(op); + return success(); + } + auto logicalObjFifo = + dyn_cast_if_present( + op.getInput().getDefiningOp()); + if (!logicalObjFifo) { + return op.emitOpError() << "expected input to be an " + "`amdaie.logicalobjectfifo.from_memref`"; + } + std::optional maybeBdIdOp = op.getBdIdOp(); + if (!maybeBdIdOp) { + return op.emitOpError() << "must have a BD ID op to lower to " + "`amdaie.npu.write_bd`"; + } + std::optional maybeChannelOp = op.getChannelOp(); + if (!maybeChannelOp) + return op.emitOpError() << "found non-`amdaie.channel` channel"; + std::optional maybeSize = op.getAccessStaticSize(); + if (!maybeSize) + return op.emitOpError() << "could not compute a static size"; + std::optional maybeOffset = op.getStaticBaseOffset(); + if (!maybeOffset) + return op.emitOpError() << "could not compute a static source offset"; + SmallVector sizes = op.getMixedSizes(); + SmallVector strides = op.getMixedStrides(); + FailureOr npuPushToQueueOp = insertWriteBdOps( + op, rewriter, AMDAIE::AMDAIETileType::SHIMNOC, logicalObjFifo, + maybeBdIdOp.value(), maybeChannelOp.value(), maybeSize.value(), + maybeOffset.value(), enablePacket, packetId, packetType, sizes, + strides); + if (failed(npuPushToQueueOp)) return failure(); + rewriter.replaceOp(op, *npuPushToQueueOp); + return success(); + } + + private: + const AMDAIE::AMDAIEDeviceModel &deviceModel; + uint8_t minStrideBitWidth; +}; + +namespace { +class AMDAIEControlCodeLoweringPass + : public impl::AMDAIEControlCodeLoweringBase< + AMDAIEControlCodeLoweringPass> { + public: + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeLoweringPass::runOnOperation() { + Operation *parentOp = getOperation(); + MLIRContext *context = &getContext(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to lower control code " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + + RewritePatternSet patterns(context); + ConversionTarget conversionTarget(*context); + conversionTarget.addLegalDialect(); + conversionTarget + .addIllegalOp(); + patterns.insert(context); + patterns.insert(context, deviceModel); + if (failed(applyPartialConversion(parentOp, conversionTarget, + std::move(patterns)))) { + return signalPassFailure(); + } +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeLoweringPass() { + return std::make_unique(); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp new file mode 100644 index 000000000..4ed6d0bb0 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEControlCodeToTransaction.cpp @@ -0,0 +1,358 @@ +// Copyright 2024 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree-amd-aie/IR/AMDAIEOps.h" +#include "iree-amd-aie/Transforms/AMDAIEUtils.h" +#include "iree-amd-aie/Transforms/Passes.h" +#include "iree-amd-aie/Transforms/Transforms.h" +#include "mlir/IR/AsmState.h" +#include "mlir/IR/Iterators.h" +#include "mlir/Transforms/DialectConversion.h" + +#define DEBUG_TYPE "iree-amdaie-controlcode-to-transaction" + +#define TXN_OPC_WRITE 0x0 +#define TXN_OPC_BLOCKWRITE 0x1 +#define TXN_OPC_TCT 0x80 +#define TXN_OPC_DDR_PATCH 0x81 + +namespace mlir::iree_compiler::AMDAIE { + +class TransactionBuilder { + public: + AMDAIE::AMDAIEDeviceModel deviceModel; + TransactionBuilder(AMDAIE::AMDAIEDeviceModel deviceModel) + : deviceModel(std::move(deviceModel)) {} + + void clearAndInitialize() { + instructions.clear(); + llvm::MutableArrayRef words = reserveAndGetTail(4); + // setup txn header + words[0] = 0x06030100; + words[1] = 0x00000105; + } + + size_t getInstructionSize() const { return instructions.size(); } + + ArrayRef finalizeAndReturnInstructions() { + finalizeHeader(); + return ArrayRef(instructions.data(), instructions.size()); + } + + void dumpTransactionAsHex() const { + llvm::outs() << "Transaction: \n"; + for (uint32_t word : instructions) { + // Write hex as 0xXXXXXXXX + llvm::outs() << utohexstr(word, 8) << "\n"; + } + } + + LogicalResult appendAddressPatch(uint32_t addr, uint32_t argIdx, + uint32_t offset) { + llvm::MutableArrayRef words = reserveAndGetTail(12); + words[0] = TXN_OPC_DDR_PATCH; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[6] = addr; + words[7] = 0; + words[8] = argIdx; + words[9] = 0; + words[10] = offset; + words[11] = 0; + instructionCounter++; + return success(); + } + + LogicalResult appendTCTSync(uint32_t col, uint32_t row, uint32_t direction, + uint32_t rowNum, uint32_t colNum, + uint32_t channel) { + llvm::MutableArrayRef words = reserveAndGetTail(4); + words[0] = TXN_OPC_TCT; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[2] |= direction & 0xff; + words[2] |= (row & 0xff) << 8; + words[2] |= (col & 0xff) << 16; + + words[3] |= (rowNum & 0xff) << 8; + words[3] |= (colNum & 0xff) << 16; + words[3] |= (channel & 0xff) << 24; + instructionCounter++; + return success(); + } + + LogicalResult appendPushToQueueOp(uint32_t col, uint32_t row, + AMDAIE::DMAChannelDir direction, + uint32_t channel, uint32_t bdId, + uint32_t repeatCount, bool issueToken) { + uint32_t colShift = deviceModel.getColumnShift(); + uint32_t rowShift = deviceModel.getRowShift(); + uint32_t addr = + direction == AMDAIE::DMAChannelDir::MM2S ? 
0x1D214 : 0x1D204; + if (channel == 1) addr += 0x8; + if (col && row) { + addr |= ((col & 0xff) << colShift) | ((row & 0xff) << rowShift) | + (addr & 0xFFFFF); + } + uint32_t value = 0; + value |= bdId & 0xF; + value |= (repeatCount & 0xFF) << 16; + if (issueToken) value |= 0x80000000; + return appendWrite32Op(addr, value); + } + + LogicalResult appendWrite32Op(uint32_t addr, uint32_t value) { + llvm::MutableArrayRef words = reserveAndGetTail(6); + // XAIE_IO_WRITE + words[0] = TXN_OPC_WRITE; + words[1] = 0; + words[2] = addr; + words[3] = 0; + words[4] = value; // Value + words[5] = words.size() * sizeof(uint32_t); // Operation Size + instructionCounter++; + return success(); + } + + LogicalResult appendWriteBdOp( + uint32_t bdAddr, uint32_t bufferLength, uint32_t bufferOffset, + bool enablePacket, uint32_t outOfOrderId, uint32_t packetId, + uint32_t packetType, uint32_t d0Size, uint32_t d0Stride, uint32_t d1Size, + uint32_t d1Stride, uint32_t d2Stride, uint32_t iterationCurrent, + uint32_t iterationSize, uint32_t iterationStride, uint32_t nextBd, + bool useNextBd, bool validBd, int32_t lockRelVal, uint32_t lockRelId, + bool lockAcqEnable, int32_t lockAcqVal, uint32_t lockAcqId) { + llvm::MutableArrayRef words = reserveAndGetTail(12); + words[0] = TXN_OPC_BLOCKWRITE; + words[1] = 0; + // RegOff + words[2] = bdAddr; // ADDR + words[3] = words.size() * sizeof(uint32_t); // Operation Size + // DMA_BDX_0 + words[4] = bufferLength; + // DMA_BDX_1 + words[5] = bufferOffset; + // DMA_BDX_2 + // En Packet , OoO BD ID , Packet ID , Packet Type + words[6] |= ((int)enablePacket & 0x1) << 30; + words[6] |= (outOfOrderId & 0x3f) << 24; + words[6] |= (packetId & 0x1f) << 19; + words[6] |= (packetType & 0x7) << 16; + // DMA_BDX_3 + // TODO: Secure Access + words[7] |= (d0Size & 0x3ff) << 20; + words[7] |= d0Stride & 0xfffff; + // DMA_BDX_4 + words[8] = 0x80000000; // burst length; + words[8] |= (d1Size & 0x3ff) << 20; + words[8] |= d1Stride & 0xfffff; + // DMA_BDX_5 + // TODO: SIMID, AxCache, AXQoS + words[9] = d2Stride & 0xfffff; + // DMA_BDX_6 + words[10] |= (iterationCurrent & 0x3f) << 26; + words[10] |= (iterationSize & 0x3f) << 20; + words[10] |= iterationStride & 0xfffff; + // DMA_BDX_7 + // TODO: TLAST Suppress + words[11] |= (nextBd & 0xf) << 27; + words[11] |= ((int)useNextBd & 0x1) << 26; + words[11] |= ((int)validBd & 0x1) << 25; + words[11] |= (lockRelVal & 0xef) << 18; + words[11] |= (lockRelId & 0xf) << 13; + words[11] |= ((int)lockAcqEnable & 0x1) << 12; + words[11] |= (lockAcqVal & 0xef) << 5; + words[11] |= lockAcqId & 0xf; + instructionCounter++; + return success(); + } + + private: + void finalizeHeader() { + // Finalize txn header. 
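+    // Words 0 and 1 of the header are set in clearAndInitialize(); word 2
+    // holds the number of appended operations and word 3 the total
+    // transaction size in bytes, both of which are only known once all
+    // control code ops have been converted.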
+ instructions[2] = instructionCounter; + instructions[3] = instructions.size() * sizeof(uint32_t); + } + + llvm::MutableArrayRef reserveAndGetTail(size_t tailSize) { + auto oldSize = instructions.size(); + auto newSize = oldSize + tailSize; + instructions.resize(newSize, 0); + return llvm::MutableArrayRef(instructions.data() + oldSize, + tailSize); + } + size_t instructionCounter{0}; + std::vector instructions; +}; + +LogicalResult convertOp(AMDAIE::NpuAddressPatchOp op, + TransactionBuilder &builder) { + uint32_t col = op.getCol(); + uint32_t bdId = op.getBdId(); + uint32_t colShift = builder.deviceModel.getColumnShift(); + uint32_t addr = (col << colShift) | (0x1D004 + bdId * 0x20); + if (failed(builder.appendAddressPatch(addr, op.getArgIdx(), op.getOffset()))) + return failure(); + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuDmaWaitOp op, TransactionBuilder &builder) { + for (Value token : op.getAsyncTokens()) { + auto pushToQueueOp = + dyn_cast_if_present(token.getDefiningOp()); + if (!pushToQueueOp) { + return op.emitOpError() + << "should operate on an `amdaie.push_to_queue` op"; + } + if (failed(builder.appendTCTSync( + pushToQueueOp.getCol(), pushToQueueOp.getRow(), + static_cast(pushToQueueOp.getDirection()), 1, 1, + pushToQueueOp.getChannel()))) { + return failure(); + } + } + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuPushToQueueOp op, + TransactionBuilder &builder) { + uint32_t repeatCount = op.getRepeatCount() - 1; + if (failed(builder.appendPushToQueueOp(op.getCol(), op.getRow(), + op.getDirection(), op.getChannel(), + op.getBdId(), repeatCount, true))) { + return failure(); + } + return success(); +} + +LogicalResult convertOp(AMDAIE::NpuWriteBdOp op, TransactionBuilder &builder) { + uint32_t col = op.getCol(); + uint32_t row = op.getRow(); + uint32_t bdId = op.getBdId(); + uint32_t colShift = builder.deviceModel.getColumnShift(); + uint32_t rowShift = builder.deviceModel.getRowShift(); + uint32_t bdAddr = + (col << colShift) | (row << rowShift) | (0x1D000 + bdId * 0x20); + ArrayRef sizes = op.getSizes(); + ArrayRef strides = op.getStrides(); + if (sizes.size() != 3) return op.emitOpError() << "expected 3 sizes"; + if (strides.size() != 3) return op.emitOpError() << "expected 3 strides"; + uint32_t d0Size = sizes[sizes.size() - 1]; + uint32_t d1Size = sizes[sizes.size() - 2]; + // Strides and iteration_size are encoded as `actual - 1`, but `0` should stay + // `0` as it's not supported; + uint32_t d0Stride = + std::max((int64_t)strides[strides.size() - 1] - 1, (int64_t)0); + uint32_t d1Stride = + std::max((int64_t)strides[strides.size() - 2] - 1, (int64_t)0); + uint32_t d2Stride = + std::max((int64_t)strides[strides.size() - 3] - 1, (int64_t)0); + uint32_t iterationSize = + std::max((int64_t)op.getIterationSize() - 1, (int64_t)0); + uint32_t iterationStride = + std::max((int64_t)op.getIterationStride() - 1, (int64_t)0); + if (failed(builder.appendWriteBdOp( + bdAddr, op.getBufferLength(), op.getBufferOffset(), + op.getEnablePacket(), op.getOutOfOrderId(), op.getPacketId(), + op.getPacketType(), d0Size, d0Stride, d1Size, d1Stride, d2Stride, + op.getIterationCurrent(), iterationSize, iterationStride, + op.getNextBd(), op.getUseNextBd(), op.getValidBd(), + op.getLockRelVal(), op.getLockRelId(), op.getLockAcqEnable(), + op.getLockAcqVal(), op.getLockAcqId()))) { + return failure(); + } + return success(); +} + +LogicalResult controlCodeToTransaction(IRRewriter &rewriter, + AMDAIE::ControlCodeOp controlCodeOp, + TransactionBuilder &builder) { + 
SmallVector toBeErased; + WalkResult res = controlCodeOp->walk([&](Operation *op) { + LogicalResult switchResult = + TypeSwitch(op) + .Case( + [&](auto npuOp) { + if (failed(convertOp(npuOp, builder))) return failure(); + toBeErased.push_back(npuOp); + return success(); + }) + .Default([&](Operation *) { return success(); }); + if (failed(switchResult)) return WalkResult::interrupt(); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + for (Operation *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + +namespace { + +class AMDAIEControlCodeToTransactionPass + : public impl::AMDAIEControlCodeToTransactionBase< + AMDAIEControlCodeToTransactionPass> { + public: + AMDAIEControlCodeToTransactionPass( + const AMDAIEControlCodeToTransactionOptions &options) + : AMDAIEControlCodeToTransactionBase(options) {} + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + void runOnOperation() override; +}; + +void AMDAIEControlCodeToTransactionPass::runOnOperation() { + Operation *parentOp = getOperation(); + MLIRContext *context = &getContext(); + + auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(parentOp); + std::optional maybeDevice = getConfigAMDAIEDevice(targetAttr); + if (!maybeDevice) { + parentOp->emitOpError() + << "has no AMDAIEDevice in the target attribute configuration. This " + "device-specific information is required to lower control code " + "ops."; + return signalPassFailure(); + } + AMDAIE::AMDAIEDeviceModel deviceModel = + AMDAIE::getDeviceModel(maybeDevice.value()); + TransactionBuilder transactionBuilder(std::move(deviceModel)); + + IRRewriter rewriter(context); + WalkResult res = parentOp->walk([&](AMDAIE::WorkgroupOp workgroupOp) { + transactionBuilder.clearAndInitialize(); + if (failed(controlCodeToTransaction(rewriter, workgroupOp.getControlCode(), + transactionBuilder))) { + return WalkResult::interrupt(); + } + LLVM_DEBUG(llvm::dbgs() << "Instruction size: " + << transactionBuilder.getInstructionSize() << "\n"); + ArrayRef instructions = + transactionBuilder.finalizeAndReturnInstructions(); + workgroupOp.setNpuInstructionsAttr(DenseUI32ResourceElementsAttr::get( + RankedTensorType::get( + transactionBuilder.getInstructionSize(), + IntegerType::get(&getContext(), 32, IntegerType::Unsigned)), + "npu_instructions", + HeapAsmResourceBlob::allocateAndCopyInferAlign(instructions))); + if (dumpTransaction) transactionBuilder.dumpTransactionAsHex(); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr createAMDAIEControlCodeToTransactionPass( + AMDAIEControlCodeToTransactionOptions options) { + return std::make_unique(options); +} + +} // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index 0edeb3659..c4459f60c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -395,203 +395,6 @@ LogicalResult AIEDeviceBuilder::coreToAIE(AMDAIE::CoreOp coreOp, return success(); } -//===----------------------------------------------------------------------===// -// Convert amdaie.controlcode operation to NPU instruction func -//===----------------------------------------------------------------------===// - -/// Convert 
the `amdaie.npu.dma_cpy_nd` operation to `aiex.npu.dma_memcpy_nd`. -LogicalResult AIEDeviceBuilder::npuDmaCpyNdOpToAIE( - AMDAIE::NpuDmaCpyNdOp dmaOp, SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaCpyNdOp]\n"); - AMDAIE::ConnectionOp connectionOp = dmaOp.getConnectionOp(); - - SmallVector offsets, sizes, strides; - ArrayRef staticOffsets, staticSizes, staticStrides; - AMDAIE::BdIdOp bdIdOp; - LogicalObjectFifoFromMemrefOp logicalObjFifo; - SmallVector memOps; - AIE::PacketInfoAttr pktInfoAttr = nullptr; - // Convert bidirectional `amdaie.npu.dma_cpy_nd` op into two halves. - if (dmaOp.getSource()) { - offsets = dmaOp.getSourceOffsets(); - sizes = dmaOp.getSourceSizes(); - strides = dmaOp.getSourceStrides(); - staticOffsets = dmaOp.getSourceStaticOffsets(); - staticSizes = dmaOp.getSourceStaticSizes(); - staticStrides = dmaOp.getSourceStaticStrides(); - bdIdOp = dmaOp.getSourceBdIdOp(); - if (!bdIdOp) { - return dmaOp.emitOpError() - << "must have a source BD ID op to lower to the AIE dialect."; - } - logicalObjFifo = dyn_cast_if_present( - dmaOp.getSource().getDefiningOp()); - if (!logicalObjFifo) { - return dmaOp.emitOpError() << "expected source to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - } - memOps = connectionToSourceTargetMemOps[connectionOp].first; - // Set the packet info attribute for MM2S DMAs, operating on a packet flow - // connection. - std::optional maybeFlowOp = connectionOp.getFlowOp(); - if (maybeFlowOp && maybeFlowOp->getPacketId()) { - pktInfoAttr = AIE::PacketInfoAttr::get( - rewriter.getContext(), - /*pkt_type*/ 0, /*pkt_id*/ maybeFlowOp->getPacketId().value()); - } - } else if (dmaOp.getTarget()) { - offsets = dmaOp.getTargetOffsets(); - sizes = dmaOp.getTargetSizes(); - strides = dmaOp.getTargetStrides(); - staticOffsets = dmaOp.getTargetStaticOffsets(); - staticSizes = dmaOp.getTargetStaticSizes(); - staticStrides = dmaOp.getTargetStaticStrides(); - bdIdOp = dmaOp.getTargetBdIdOp(); - if (!bdIdOp) { - return dmaOp.emitOpError() - << "must have a target BD ID op to lower to the AIE dialect."; - } - logicalObjFifo = dyn_cast_if_present( - dmaOp.getTarget().getDefiningOp()); - if (!logicalObjFifo) { - return dmaOp.emitOpError() << "expected target to be an " - "`amdaie.logicalobjectfifo.from_memref`"; - } - memOps = connectionToSourceTargetMemOps[connectionOp].second; - } else { - return dmaOp.emitOpError() - << "has neither source not target memory space as L3."; - } - - Value memref = bindingsMapper.lookup(logicalObjFifo.getMemref()); - - if (memOps.size() != 1) { - return dmaOp.emitOpError() << "only a single connection op source expected"; - } - auto shimDmaAllocOp = dyn_cast(memOps[0]); - if (!shimDmaAllocOp) { - return dmaOp.emitOpError() << "expected the source of the connection to " - "be mapped to a `AIE::ShimDMAAllocationOp`"; - } - - if (!offsets.empty() || !sizes.empty() || !strides.empty()) { - // Not doing now as better to just eliminate use of aiex dialect - // altogether. - return dmaOp.emitError() - << "Expect all source offsets, sizes, and strides to be static at " - "this point. 
Dynamic values can be supported, just need to " - "cast from 'index' to 64-bit signless integer for " - "aiex.npu.dma_memcpy_nd."; - } - - uint32_t bdId = bdIdOp.getValue(); - bool issueToken = dmaOp.hasDmaWaitOpUser(); - - rewriter.setInsertionPoint(dmaOp); - rewriter.create( - dmaOp.getLoc(), SmallVector{}, 0, 0, memref, offsets, sizes, - strides, staticOffsets, staticSizes, staticStrides, pktInfoAttr, - shimDmaAllocOp.getSymName(), bdId, issueToken); - - toBeErased.push_back(dmaOp); - return success(); -} - -/// Convert the `amdaie.npu.dma_wait` operation to `aiex.npu.dma_wait`. -LogicalResult AIEDeviceBuilder::npuDmaWaitToAIE( - AMDAIE::NpuDmaWaitOp waitOp, SmallVector &toBeErased) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::NpuDmaWaitOp]\n"); - rewriter.setInsertionPoint(waitOp); - for (Value asyncToken : waitOp.getAsyncTokens()) { - auto npuDmaOp = - dyn_cast_if_present(asyncToken.getDefiningOp()); - if (!npuDmaOp) { - return waitOp.emitOpError() - << "should be operating on `amdaie.npu.dma_cpy_nd` for " - "lowering"; - } - AMDAIE::ConnectionOp connectionOp = npuDmaOp.getConnectionOp(); - if (!connectionToSourceTargetMemOps.contains(connectionOp)) { - return connectionOp.emitOpError() << "should be found in the connection " - "to source/target mem ops map"; - } - SmallVector memOps = - isa(asyncToken.getType()) - ? connectionToSourceTargetMemOps[connectionOp].first - : connectionToSourceTargetMemOps[connectionOp].second; - if (memOps.size() != 1) { - return waitOp.emitOpError() - << "only a single connection op source expected"; - } - auto shimDmaAllocOp = dyn_cast(memOps[0]); - if (!shimDmaAllocOp) { - return waitOp.emitOpError() - << "expected the source of the connection to " - "be mapped to a `AIE::ShimDMAAllocationOp`"; - } - rewriter.create(rewriter.getUnknownLoc(), - shimDmaAllocOp.getSymName()); - } - toBeErased.push_back(waitOp); - return success(); -} - -/// Insert the control code operations into the NPU instruction function. -LogicalResult AIEDeviceBuilder::controlCodeToAIE( - AMDAIE::ControlCodeOp controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp funcOp) { - LLVM_DEBUG(llvm::dbgs() << "Convert [AMDAIE::ControlCodeOp]\n"); - Block *funcBlock = &funcOp.getBody().front(); - rewriter.setInsertionPointToEnd(funcBlock); - auto insertIt = funcBlock->begin(); - auto controlCodeBegin = controlCodeOp.getBody()->begin(); - auto controlCodeEnd = controlCodeOp.getBody()->getTerminator()->getIterator(); - funcBlock->getOperations().splice(insertIt, - controlCodeOp.getBody()->getOperations(), - controlCodeBegin, controlCodeEnd); - - // Keep track of operations to be erased instead of erasing them directly as - // there are bidirectional dependencies between operations. For example, - // `amdaie.npu.dma_cpy_nd` potentially needs information from a sunsequent - // `amdaie.npu.dma_wait` operation user and vice versa. - // TODO(jornt): This is caused by differences between the `AMDAIE` dialect and - // the `AIE` dialect and can be streamlined later by adjusting (both) - // dialects. - SmallVector toBeErased; - WalkResult res = - funcOp->walk([&](Operation *op) { - if (TypeSwitch(op) - .Case([&](auto dmaOp) { - // TODO(jornt): This is temporarily handled already by - // combining with `ConnectionOp` to create `aie.objectfifo` - // until we get rid of those. 
- eraseOp(dmaOp); - return success(); - }) - .Case([&](auto dmaOp) { - return npuDmaCpyNdOpToAIE(dmaOp, toBeErased); - }) - .Case([&](auto waitOp) { - return npuDmaWaitToAIE(waitOp, toBeErased); - }) - .Case([&](auto endOp) { - eraseOp(endOp); - return success(); - }) - .Default([&](Operation *op) { - remapOperands(op); - return success(); - }) - .failed()) { - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - if (res.wasInterrupted()) return failure(); - for (Operation *op : toBeErased) eraseOp(op); - return success(); -} - //===----------------------------------------------------------------------===// // Convert ops in Workgroup to AIE ops //===----------------------------------------------------------------------===// @@ -961,9 +764,8 @@ LogicalResult AIEDeviceBuilder::tileToAIE(AMDAIE::TileOp tileOp, // Convert amdaie.workgroup operation and insert into aie.device //===----------------------------------------------------------------------===// -LogicalResult AIEDeviceBuilder::workgroupToAIE( - AMDAIE::WorkgroupOp workgroupOp, xilinx::AIE::DeviceOp deviceOp, - xilinx::AIEX::RuntimeSequenceOp npuFuncOp) { +LogicalResult AIEDeviceBuilder::workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, + xilinx::AIE::DeviceOp deviceOp) { OpBuilder::InsertionGuard guard(rewriter); Block *deviceBlock = &deviceOp.getRegion().front(); Block *deviceCoreBlock = rewriter.createBlock(&deviceOp.getRegion()); @@ -1003,10 +805,10 @@ LogicalResult AIEDeviceBuilder::workgroupToAIE( return WalkResult::advance(); }) .Case([&](auto controlCodeOp) { - if (failed(controlCodeToAIE(controlCodeOp, npuFuncOp))) { - controlCodeOp.emitError("could not convert to AIEDialect ops"); - return WalkResult::interrupt(); - } + // Skip control code as it should already be translated into firmware + // code at this point. + // TODO(jornt): currently, it still contains ops that are needed in + // this translation, but don't have to be translated themselves. return WalkResult::skip(); }) .Case([&](auto coreOp) { @@ -1100,20 +902,6 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { rewriter.getUnknownLoc(), xilinx::AIE::AIEDeviceAttr::get(rewriter.getContext(), aieDevice)); Block *deviceBlock = &deviceOp.getRegion().emplaceBlock(); - - // The amdaie.controlcode operation has no operands, but the - // aiex.runtime_sequence that it lowers to, does. Create the signature - // of the aiex.runtime_sequence operation that replaces the - // amdaie.controlcode. The HAL interface bindings are used to - // order the function parameters correctly. - SmallVector subspanOps; - funcOp->walk([&](IREE::HAL::InterfaceBindingSubspanOp subspanOp) { - subspanOps.push_back(subspanOp); - }); - llvm::sort(subspanOps, [](IREE::HAL::InterfaceBindingSubspanOp a, - IREE::HAL::InterfaceBindingSubspanOp b) { - return a.getBinding().getZExtValue() < b.getBinding().getZExtValue(); - }); rewriter.setInsertionPoint(deviceBlock, deviceBlock->begin()); // Create aiex.runtime_sequence inside aie.device @@ -1122,11 +910,6 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { Region &body = npuFuncOp.getBody(); body.emplaceBlock(); - for (auto &&a : llvm::enumerate(subspanOps)) { - body.addArgument(a.value().getType(), a.value().getLoc()); - bindingsMapper.map(a.value(), body.getArgument(a.index())); - } - // Walk the AIE regions ops and convert ops into pure AIEDialect ops. 
// IRMapping mapper; rewriter.setInsertionPointToStart(deviceBlock); @@ -1134,7 +917,7 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { if (isa(op)) { return WalkResult::advance(); } else if (auto workgroupOp = dyn_cast(op)) { - if (failed(workgroupToAIE(workgroupOp, deviceOp, npuFuncOp))) { + if (failed(workgroupToAIE(workgroupOp, deviceOp))) { return WalkResult::interrupt(); } return WalkResult::skip(); @@ -1147,6 +930,28 @@ LogicalResult AIEDeviceBuilder::lowerToAIE(ModuleOp moduleOp) { }); if (res.wasInterrupted()) return WalkResult::interrupt(); + SmallVector workgroupOps; + funcOp->walk([&](AMDAIE::WorkgroupOp op) { workgroupOps.push_back(op); }); + // Only a single workgroup op is supported as only a single `aie.device` is + // created. + if (workgroupOps.size() > 1) { + funcOp.emitOpError() + << "multiple `amdaie.workgroup` ops is not supported"; + return WalkResult::interrupt(); + } + if (workgroupOps.size() == 1) { + AMDAIE::WorkgroupOp workgroupOp = workgroupOps[0]; + mlir::Attribute maybeNpuInstructions = + workgroupOp.getNpuInstructionsAttr(); + // Only add attributes if the instructions attribute is found to + // facilitate simplified tests. + if (maybeNpuInstructions) { + deviceOp->setAttr("npu_instructions", maybeNpuInstructions); + deviceOp->setAttr("runtime_sequence_name", + rewriter.getStringAttr(funcOp.getSymName())); + } + } + // Move NPU instruction function to the end of the device block. rewriter.moveOpBefore(npuFuncOp, deviceBlock, deviceBlock->end()); // After walking the FuncOp, it has been converted into a DeviceOp and can diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h index 88ec017cd..6f25c3592 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.h @@ -46,14 +46,6 @@ class AIEDeviceBuilder { LogicalResult coreToAIE(AMDAIE::CoreOp coreOp, AIE::DeviceOp deviceOp, Block *deviceCoreBlock); - /// Controlcode ops conversion methods. - LogicalResult npuDmaCpyNdOpToAIE(AMDAIE::NpuDmaCpyNdOp dmaOp, - SmallVector &toBeErased); - LogicalResult npuDmaWaitToAIE(AMDAIE::NpuDmaWaitOp waitOp, - SmallVector &toBeErased); - LogicalResult controlCodeToAIE(AMDAIE::ControlCodeOp controlCodeOp, - xilinx::AIEX::RuntimeSequenceOp funcOp); - /// Workgroup ops conversion methods. LogicalResult bufferToAIE(AMDAIE::BufferOp bufferOp, Block *deviceBlock, int &bufferId); @@ -67,8 +59,7 @@ class AIEDeviceBuilder { Block *deviceBlock); LogicalResult tileToAIE(AMDAIE::TileOp tileOp, Block *deviceBlock); LogicalResult workgroupToAIE(AMDAIE::WorkgroupOp workgroupOp, - xilinx::AIE::DeviceOp deviceOp, - xilinx::AIEX::RuntimeSequenceOp npuFuncOp); + xilinx::AIE::DeviceOp deviceOp); /// Utilities @@ -120,8 +111,6 @@ class AIEDeviceBuilder { IRRewriter rewriter; IRMapping mapper; - /// Dedicated mapper for the HAL bindings. - IRMapping bindingsMapper; /// Map from tile values to AIE memory op (`aie.mem` or `aie.memtile_dma`). /// This is used to look up and add new DMA patterns to those memory ops. 
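The `npu_instructions` transaction attribute produced earlier is handed off here from the single `amdaie.workgroup` op to the `aie.device` op, together with a `runtime_sequence_name` attribute. The patch only shows the producing side; below is a minimal, hypothetical consumer-side sketch of reading the raw instruction words back out, assuming the standard MLIR resource-attribute API (`getAttrOfType`, `tryGetAsArrayRef`). The function name is illustrative and not part of this patch.

#include <cstdint>
#include <optional>

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"
#include "mlir/Support/LogicalResult.h"

// Illustrative only: recover the NPU instruction words that the lowering
// attaches to `aie.device` as a dense ui32 resource attribute, plus the
// runtime sequence name that travels alongside them.
static mlir::FailureOr<llvm::SmallVector<uint32_t>> readNpuInstructions(
    mlir::Operation *deviceOp) {
  auto instrAttr =
      deviceOp->getAttrOfType<mlir::DenseUI32ResourceElementsAttr>(
          "npu_instructions");
  if (!instrAttr) return mlir::failure();
  // The underlying resource blob may not be resolvable yet, so guard the access.
  std::optional<llvm::ArrayRef<uint32_t>> words = instrAttr.tryGetAsArrayRef();
  if (!words) return mlir::failure();
  auto seqName =
      deviceOp->getAttrOfType<mlir::StringAttr>("runtime_sequence_name");
  (void)seqName;  // For example, used to name the generated runtime sequence.
  return llvm::SmallVector<uint32_t>(words->begin(), words->end());
}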
  DenseMap<Value, Operation *> tileToMemOpMap;
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
index 54495abe4..a26d77e21 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.cpp
@@ -7,6 +7,7 @@
 #include "AMDAIEUtils.h"
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringExtras.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/Iterators.h"
 
@@ -350,6 +351,15 @@ bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp) {
   return false;
 }
 
+std::string utohexstr(uint32_t value, size_t width, bool header,
+                      bool lowercase) {
+  std::string res = "";
+  if (header) res += "0x";
+  std::string hexStr = llvm::utohexstr(value, lowercase);
+  std::string prefix(width - hexStr.size(), '0');
+  return res + prefix + hexStr;
+}
+
 /// Find the largest factor of 'num' which is not larger than 'max'.
 int detail::findLargestFactor(int num, int max) {
   assert(max > 0 && "No factors less than or equal to 0 exist");
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
index fa37b89dd..d657fff4d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEUtils.h
@@ -94,6 +94,10 @@ bool isMatmulInDefChain(Value operand);
 /// matmul-like op upstream in its computation tree.
 bool isMatmulProducerOfElementwise(linalg::LinalgOp linalgOp);
 
+/// Utility to convert a `uint32_t` value into a hex string.
+std::string utohexstr(uint32_t value, size_t width, bool header = true,
+                      bool lowercase = false);
+
 namespace detail {
 
 // Returns the largest number that perfectly divides `num` that
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
index a467ce00e..0d8c8ce85 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -58,7 +58,9 @@ iree_cc_library(
     "AMDAIECombineStridedOps.cpp"
     "AMDAIEConnectionToFlow.cpp"
     "AMDAIEConvertToDma.cpp"
+    "AMDAIEControlCodeLowering.cpp"
     "AMDAIEControlCodeLoopUnroll.cpp"
+    "AMDAIEControlCodeToTransaction.cpp"
     "AMDAIEConvertCoreForallToFor.cpp"
     "AMDAIECreateAIEWorkgroup.cpp"
     "AMDAIECreateReferenceToAllocation.cpp"
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
index 4cd5586f0..a58f6a880 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -35,6 +35,8 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIECOMBINESTRIDEDOPS
 #define GEN_PASS_DEF_AMDAIECONNECTIONTOFLOW
 #define GEN_PASS_DEF_AMDAIECONTROLCODELOOPUNROLL
+#define GEN_PASS_DEF_AMDAIECONTROLCODELOWERING
+#define GEN_PASS_DEF_AMDAIECONTROLCODETOTRANSACTION
 #define GEN_PASS_DEF_AMDAIECONVERTCOREFORALLTOFOR
 #define GEN_PASS_DEF_AMDAIECREATEAIEWORKGROUP
 #define GEN_PASS_DEF_AMDAIECREATELOGICALOBJECTFIFOLINK
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 4bc7c8bc4..ce858a7b4 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -621,6 +621,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager,
   passManager.addPass(createAMDAIEConnectionToFlowPass());
   passManager.addPass(createAMDAIEAssignPacketIdsPass());
 
+  passManager.addPass(createAMDAIEControlCodeLoweringPass());
+  passManager.addPass(createAMDAIEControlCodeToTransactionPass());
+
   addAMDAIEToAIEPasses(passManager);
 
   // Now lower using the AIE passes from MLIR-AIE.
@@ -631,7 +634,6 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) {
   {
     OpPassManager &devicePM = pm.nest<xilinx::AIE::DeviceOp>();
     devicePM.addPass(createCanonicalizerPass());
-    devicePM.addPass(createAMDAIEDmaToNpuPass());
     devicePM.addPass(createAMDAIEAssignBufferDescriptorIDsPass());
     devicePM.addPass(createAMDAIEAssignBufferAddressesBasicPass());
     devicePM.addPass(createAMDAIEPathfinderPass());
@@ -834,12 +836,17 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device,
   passManager.addPass(xilinx::airrt::createAIRRtToNpuPass());
   passManager.addPass(createCanonicalizerPass());
 
+  {
+    // createAMDAIEDmaToNpuPass is only needed for AIR.
+    OpPassManager &devicePM = passManager.nest<xilinx::AIE::DeviceOp>();
+    devicePM.addPass(createCanonicalizerPass());
+    devicePM.addPass(createAMDAIEDmaToNpuPass());
+  }
+
   // Now lower using the AIE passes from MLIR-AIE.
   addMLIRAIELoweringPasses(passManager);
 }
-
-
 
 // NOTE: this runs on the top-level program module containing all hal.executable
 // ops.
 void buildAMDAIELinkingPassPipeline(OpPassManager &passManager) {
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
index df670e19f..ded26bd18 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -111,6 +111,13 @@ std::unique_ptr<Pass> createAMDAIEConnectionToFlowPass();
 
 /// Pass to unroll the loops within the control code regions.
 std::unique_ptr<Pass> createAMDAIEControlCodeLoopUnrollPass();
 
+/// Pass to convert control code DMA operations into NPU writes and syncs.
+std::unique_ptr<Pass> createAMDAIEControlCodeLoweringPass();
+
+/// Pass to convert control code into a transaction binary.
+std::unique_ptr<Pass> createAMDAIEControlCodeToTransactionPass(
+    AMDAIEControlCodeToTransactionOptions options = {});
+
 /// Pass to convert `scf.forall` to `scf.for` within `aie.core`.
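The `utohexstr` helper added to AMDAIEUtils above renders a 32-bit word as a zero-padded, optionally `0x`-prefixed hex string, which matches the word format checked in the new controlcode_to_transaction.mlir dump test further down (e.g. `0x0001D004`). The following self-contained sketch mirrors that formatting; it assumes the value always fits into `width` hex digits (the helper itself does not guard against wider values), and the function name is illustrative rather than part of this patch.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>

// Zero-pad `value` to `width` hex digits and optionally prepend "0x",
// mirroring the behaviour of the utohexstr helper added in AMDAIEUtils.
std::string toPaddedHex(uint32_t value, size_t width, bool header = true,
                        bool lowercase = false) {
  const char *digits = lowercase ? "0123456789abcdef" : "0123456789ABCDEF";
  std::string hex;
  do {
    hex.insert(hex.begin(), digits[value & 0xF]);
    value >>= 4;
  } while (value != 0);
  assert(hex.size() <= width && "value does not fit in the requested width");
  std::string res = header ? "0x" : "";
  res += std::string(width - hex.size(), '0');
  return res + hex;
}

int main() {
  // Matches the address-patch word appearing in the dumped transaction test.
  std::cout << toPaddedHex(0x1D004u, 8) << "\n";  // prints 0x0001D004
  return 0;
}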
std::unique_ptr createAMDAIEConvertCoreForallToForPass(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 7c8364fed..5db0ba05a 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -151,6 +151,22 @@ def AMDAIEControlCodeLoopUnroll : let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoopUnrollPass()"; } +def AMDAIEControlCodeLowering : + Pass<"iree-amdaie-controlcode-lowering", ""> { + let summary = "Lower control code ops to the most basic NPU write/sync/patch instructions"; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeLoweringPass()"; +} + +def AMDAIEControlCodeToTransaction : + Pass<"iree-amdaie-controlcode-to-transaction", ""> { + let summary = "Convert controlcode instructions into a NPU instruction transaction."; + let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIEControlCodeToTransactionPass()"; + let options = [ + Option<"dumpTransaction", "dump-transaction", "bool", /*default=*/"false", + "Dump the generated transaction. (Used for tests)"> + ]; +} + def AMDAIEConvertCoreForallToFor : Pass<"iree-amdaie-convert-core-forall-to-for", ""> { let summary = "Converts `scf.forall` to `scf.for` within `aie.core`."; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index 570e66b83..298338e4c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -23,6 +23,8 @@ iree_lit_test_suite( "combine_strided_ops.mlir" "connection_to_flow.mlir" "controlcode_loop_unrolling.mlir" + "controlcode_lowering.mlir" + "controlcode_to_transaction.mlir" "convert_core_forall_to_for.mlir" "create_aie_workgroup.mlir" "create_reference_to_allocation.mlir" diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir index 92b0f691b..1dbf73473 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_channels.mlir @@ -6,14 +6,14 @@ // CHECK: amdaie.workgroup // CHECK: %[[tile_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[tile_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA) +// CHECK: %[[CHANNEL_0:.+]] = amdaie.channel(%[[tile_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.+]] = amdaie.channel(%[[tile_0_1]], 0, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL_0]]}) -// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA) -// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA) +// CHECK: %[[CHANNEL_2:.+]] = amdaie.channel(%[[tile_0_0]], 1, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.+]] = amdaie.channel(%[[tile_0_1]], 1, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}) -// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, 
port_type = DMA) -// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA) +// CHECK: %[[CHANNEL_4:.+]] = amdaie.channel(%[[tile_0_0]], 2, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_5:.+]] = amdaie.channel(%[[tile_0_1]], 2, port_type = DMA, direction = S2MM) // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}) module { func.func @assign_channels(%arg0: memref<1x1x8x16xi32, 1>, %arg1: memref<8x16xi32>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir index 15e196d7c..4fa6e7394 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/assign_packet_ids.mlir @@ -8,8 +8,8 @@ module { amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} amdaie.controlcode { amdaie.end @@ -29,10 +29,10 @@ module { // CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_1]], 1, port_type = DMA, direction = S2MM) // CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = true, packet_id = 0 : ui8} // CHECK: amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true, packet_id = 1 : ui8} @@ -46,10 +46,10 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) %tile_0_2 = amdaie.tile(%c0, %c2) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %2 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = true} @@ -71,8 +71,8 
@@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.workgroup { %tile_0_0 = amdaie.tile(%c0, %c0) %tile_0_1 = amdaie.tile(%c0, %c1) - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %0 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %1 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = true} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir index 74691c7ba..8babcb3f6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/connection_to_flow.mlir @@ -8,15 +8,18 @@ // CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) // CHECK: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) // CHECK: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA) -// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA) +// CHECK: %[[CHANNEL:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_1:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_2:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_3:.*]] = amdaie.channel(%[[TILE_0_0]], 0, port_type = DMA, direction = S2MM) +// CHECK: %[[CHANNEL_4:.*]] = amdaie.channel(%[[TILE_0_1]], 0, port_type = DMA, direction = MM2S) +// CHECK: %[[CHANNEL_5:.*]] = amdaie.channel(%[[TILE_0_2]], 0, port_type = DMA, direction = S2MM) // CHECK: %[[FLOW_0:.+]] = amdaie.flow({%[[CHANNEL]]} -> {%[[CHANNEL_1]]}) {is_packet_flow = false} // CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_1]]}, %{{.+}} {%[[CHANNEL]]}, flow = %[[FLOW_0]]) -// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL]]}) {is_packet_flow = true} -// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_1]]) -// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_1]]} -> {%[[CHANNEL_2]]}) {is_packet_flow = false} -// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_2]]}, %{{.+}} {%[[CHANNEL_1]]}, flow = %[[FLOW_2]]) +// CHECK: %[[FLOW_1:.+]] = amdaie.flow({%[[CHANNEL_2]]} -> {%[[CHANNEL_3]]}) {is_packet_flow = true} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_3]]}, %{{.+}} {%[[CHANNEL_2]]}, flow = %[[FLOW_1]]) +// CHECK: %[[FLOW_2:.+]] = amdaie.flow({%[[CHANNEL_4]]} -> {%[[CHANNEL_5]]}) {is_packet_flow = false} +// CHECK: amdaie.connection(%{{.+}} {%[[CHANNEL_5]]}, %{{.+}} {%[[CHANNEL_4]]}, flow = %[[FLOW_2]]) module { func.func @connection_to_flow(%arg0: memref<8x16xi32>, %arg1: memref<1x1x8x16xi32, 1>, %arg2: memref<1x1x8x16xi32, 2>) { %c0 = arith.constant 0 : index @@ -29,12 +32,15 @@ module { %0 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_0_0} : memref<8x16xi32> -> !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_memref %arg1, {%tile_0_1} : memref<1x1x8x16xi32, 1> -> !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_memref 
%arg2, {%tile_0_2} : memref<1x1x8x16xi32, 2> -> !amdaie.logicalobjectfifo> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_2 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = S2MM) + %channel_4 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_5 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %4 = amdaie.connection(%0 {%channel}, %1 {%channel_1}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %5 = amdaie.connection(%2 {%channel_2}, %1 {%channel_1}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.connection(%0 {%channel_3}, %1 {%channel_2}) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %5 = amdaie.connection(%2 {%channel_5}, %1 {%channel_4}) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) amdaie.controlcode { amdaie.end } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir new file mode 100644 index 000000000..e15cabc27 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_lowering.mlir @@ -0,0 +1,333 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-lowering)" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @no_ops +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_source +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_source() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan 
layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4([] [] [], %5[] [] [] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_source_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 128 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 128 : 
ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_source_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_source_bf16 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_source_bf16() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xbf16, 1 : i32>, memref<2048xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4([] [] [], %5[] [] [] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, 
lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_source_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 64 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_source %4([] [] [], %5[0, 0, 0, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_source_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_target +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_target() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id 
= 0 : ui8} + %4 = amdaie.connection(%2 {%channel_3}, %0 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_target_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 128 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 1152 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %7 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 32, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + 
amdaie.npu.dma_wait(%7 : !amdaie.async_target_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @npu_dma_cpy_nd_target_i8 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @npu_dma_cpy_nd_target_i8() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi8, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi8, 1 : i32>, memref<2048xi8, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%2 {%channel_3}, %0 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo, 2>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 256 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} 
+// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_target_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 256 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 2 : ui32, iteration_stride = 32 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 288 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 0 : i32, repeat_count = 2 : ui32, row = 0 : ui32} + %7 = amdaie.npu.dma_cpy_nd async_target %4(%5[0, 0, 32, 32] [2, 4, 16, 16] [128, 64, 8, 1] bd_id = %bd_id, [] [] []) : target_type = !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_target_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd() { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xi32, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, 
iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 2048 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 0] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK-LABEL: @half_npu_dma_cpy_nd_bf16 +// CHECK: amdaie.controlcode +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @half_npu_dma_cpy_nd_bf16() { + %c0 = 
arith.constant 0 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + %tile = amdaie.tile(%c0, %c1) + %tile_0 = amdaie.tile(%c0, %c0) + %bd_id = amdaie.bd_id(%tile_0, 0) + %buffer = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %buffer_1 = amdaie.buffer(%tile) : memref<2048xbf16, 1 : i32> + %lock = amdaie.lock(%tile(4), 4) + %lock_2 = amdaie.lock(%tile(5), 0) + %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_2}) : memref<2048xbf16, 1 : i32>, memref<2048xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 2> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<64x32xi32> + %2 = amdaie.logicalobjectfifo.placeholder{%tile_0} : !amdaie.logicalobjectfifo> + %channel = amdaie.channel(%tile_0, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %3 = amdaie.flow({%channel} -> {%channel_3}) {is_packet_flow = true, packet_id = 0 : ui8} + %4 = amdaie.connection(%0 {%channel_3}, %2 {%channel}, flow = %3) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) + amdaie.controlcode { + %5 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0} : memref<64x32xi32> -> !amdaie.logicalobjectfifo> + memref.assume_alignment %1, 64 : memref<64x32xi32> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.npu.half_dma_cpy_nd %4(%5[] [] [] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 1024 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} +// CHECK: %[[TOKEN_0:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_0]] : !amdaie.async_token) + %6 = amdaie.npu.half_dma_cpy_nd async %4(%5[0] [2048] [1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%6 : !amdaie.async_token) +// CHECK: amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 512 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = 
true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} +// CHECK: amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 64 : ui32} +// CHECK: %[[TOKEN_1:.+]] = amdaie.npu.push_to_queue async {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 2 : ui32, row = 0 : ui32} +// CHECK: amdaie.npu.dma_wait(%[[TOKEN_1]] : !amdaie.async_token) + %7 = amdaie.npu.half_dma_cpy_nd async %4(%5[0, 0, 0, 32] [2, 4, 16, 16] [0, 64, 8, 1] bd_id = %bd_id channel = %channel) : !amdaie.logicalobjectfifo> + amdaie.npu.dma_wait(%7 : !amdaie.async_token) + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir new file mode 100644 index 000000000..bfe6dc456 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/controlcode_to_transaction.mlir @@ -0,0 +1,218 @@ +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-controlcode-to-transaction{dump-transaction=true})" --split-input-file --verify-diagnostics %s | FileCheck %s + +// expected-error @+1 {{op has no AMDAIEDevice in the target attribute configuration}} +module { + func.func @no_amdaie_device() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000000 +// CHECK: 0x00000010 +// CHECK-LABEL: @no_ops +// CHECK: npu_instructions = dense_resource : tensor<4xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @no_ops() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000081 +// CHECK: 0x00000030 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D004 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK-LABEL: @address_patch +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @address_patch() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.address_patch {arg_idx = 0 : ui32, bd_id = 0 : ui32, col = 0 : ui32, offset = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000028 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000018 +// CHECK-LABEL: @push_to_queue_default_values +// CHECK: npu_instructions = dense_resource : 
tensor<10xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @push_to_queue_default_values() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.push_to_queue {bd_id = 0 : ui32, channel = 0 : ui32, col = 0 : ui32, direction = 1 : i32, repeat_count = 1 : ui32, row = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000028 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D21C +// CHECK: 0x00000000 +// CHECK: 0x803F0002 +// CHECK: 0x00000018 +// CHECK-LABEL: @push_to_queue +// CHECK: npu_instructions = dense_resource : tensor<10xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @push_to_queue() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.push_to_queue {bd_id = 2 : ui32, channel = 1 : ui32, col = 3 : ui32, direction = 1 : i32, repeat_count = 64 : ui32, row = 0 : ui32} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000002 +// CHECK: 0x00000038 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x0001D214 +// CHECK: 0x00000000 +// CHECK: 0x80FF000F +// CHECK: 0x00000018 +// CHECK: 0x00000080 +// CHECK: 0x00000010 +// CHECK: 0x00020001 +// CHECK: 0x00010100 +// CHECK-LABEL: @async_push_to_queue_and_wait +// CHECK: npu_instructions = dense_resource : tensor<14xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @async_push_to_queue_and_wait() { + amdaie.workgroup { + amdaie.controlcode { + %0 = amdaie.npu.push_to_queue async {bd_id = 15 : ui32, channel = 0 : ui32, col = 2 : ui32, direction = 1 : i32, repeat_count = 256 : ui32, row = 0 : ui32} + amdaie.npu.dma_wait(%0 : !amdaie.async_token) + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000001 +// CHECK: 0x00000000 +// CHECK: 0x0001D000 +// CHECK: 0x00000030 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x80000000 +// CHECK: 0x00000000 +// CHECK: 0x00000000 +// CHECK: 0x02000000 +// CHECK-LABEL: @write_bd_empty +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @write_bd_empty() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.write_bd {bd_id = 0 : ui32, buffer_length = 0 : ui32, buffer_offset = 0 : ui32, col = 0 : ui32, enable_packet = false, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 0 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = 
array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + amdaie.end + } + } + return + } +} + +// ----- + +// CHECK: 0x06030100 +// CHECK: 0x00000105 +// CHECK: 0x00000001 +// CHECK: 0x00000040 +// CHECK: 0x00000001 +// CHECK: 0x00000000 +// CHECK: 0x0201D040 +// CHECK: 0x00000030 +// CHECK: 0x00000400 +// CHECK: 0x00000020 +// CHECK: 0x40080000 +// CHECK: 0x01000000 +// CHECK: 0x81000007 +// CHECK: 0x0000003F +// CHECK: 0x00000000 +// CHECK: 0x02000000 +// CHECK-LABEL: @write_bd_with_addressing_and_packet +// CHECK: npu_instructions = dense_resource : tensor<16xui32> +#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> +module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { + func.func @write_bd_with_addressing_and_packet() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.npu.write_bd {bd_id = 2 : ui32, buffer_length = 1024 : ui32, buffer_offset = 32 : ui32, col = 1 : ui32, enable_packet = true, iteration_current = 0 : ui32, iteration_size = 0 : ui32, iteration_stride = 0 : ui32, lock_acq_enable = false, lock_acq_id = 0 : ui32, lock_acq_val = 0 : i32, lock_rel_id = 0 : ui32, lock_rel_val = 0 : i32, next_bd = 0 : ui32, out_of_order_id = 0 : ui32, packet_id = 1 : ui32, packet_type = 0 : ui32, paddings_after = array, paddings_before = array, row = 0 : ui32, sizes = array, strides = array, use_next_bd = false, valid_bd = true} + amdaie.end + } + } + return + } +} diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir index e75a379a5..a036bcb5f 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/lower_to_aie.mlir @@ -46,26 +46,18 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- -// CHECK: aiex.runtime_sequence @hal_bindings -// CHECK-SAME: %{{.+}}: memref<32x1024xi32> -// CHECK-SAME: %{{.+}}: memref<1024x64xi32> -// CHECK-SAME: %{{.+}}: memref<32x64xi32> -// CHECK-NOT: memref.assume_alignment -#pipeline_layout = #hal.pipeline.layout, - , - -]> +// CHECK: module +// CHECK: aie.device +// CHECK: aiex.runtime_sequence @workgroup_with_instructions +// CHECK: } {npu_instructions = dense_resource : tensor<208xui32>, runtime_sequence_name = "workgroup_with_instructions"} #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @hal_bindings() { - %c0 = arith.constant 0 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<1024x64xi32> - memref.assume_alignment %0, 64 : memref<1024x64xi32> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x1024xi32> - memref.assume_alignment %1, 64 : memref<32x1024xi32> - %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<32x64xi32> - memref.assume_alignment %2, 64 : memref<32x64xi32> + func.func @workgroup_with_instructions() { + amdaie.workgroup { + amdaie.controlcode { + amdaie.end + } + } {npu_instructions = dense_resource : tensor<208xui32>} return } } @@ -202,8 
+194,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { @@ -282,8 +274,8 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_3 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) amdaie.controlcode { @@ -372,14 +364,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_2}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_3}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) 
amdaie.controlcode { @@ -525,14 +517,14 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %lock_7 = amdaie.lock(%tile_0_2(1), 0) %0 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_1}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer_2, %buffer_3}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %2 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %3 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %2) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) %4 = amdaie.logicalobjectfifo.from_buffers({%buffer_4, %buffer_5, %buffer_6, %buffer_7}, {%lock_4}, {%lock_5}) : memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32>, memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 4> %5 = amdaie.logicalobjectfifo.from_buffers({%buffer_8, %buffer_9, %buffer_10, %buffer_11}, {%lock_6}, {%lock_7}) : memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32>, memref<2048xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 4> - %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 1, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 1, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %7 = amdaie.connection(%5 {%channel_3}, %4 {%channel_2}, flow = %6) : (!amdaie.logicalobjectfifo, 4>, !amdaie.logicalobjectfifo, 4>) amdaie.controlcode { @@ -608,12 +600,12 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %0 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> %1 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) + %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA, direction = MM2S) + %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = S2MM) %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} %4 = amdaie.connection(%1 {%channel_1}, %0 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA) + %channel_2 = amdaie.channel(%tile_0_1, 0, port_type = DMA, direction = MM2S) + %channel_3 = amdaie.channel(%tile_0_2, 0, port_type = DMA, direction = S2MM) %5 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} %6 = amdaie.connection(%2 {%channel_3}, %1 {%channel_2}, flow = %5) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) amdaie.controlcode { @@ -628,219 +620,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // ----- 
-//===----------------------------------------------------------------------===// -// Controlcode tests -//===----------------------------------------------------------------------===// - -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @invalid_npu_dma_cpy_nd() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %4 = amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - // expected-error @+1 {{could not convert to AIEDialect ops}} - amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) - %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - // expected-error @+1 {{'amdaie.npu.dma_cpy_nd' op must have a source BD ID op to lower to the AIE dialect}} - amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 32] [32, 32] [64, 1]) : source_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK: aie.device -// CHECK: aiex.runtime_sequence @npu_dma_cpy_nd_with_repeat_already_on_outer_dim(%[[ARG0:.+]]: memref<4096xi32> -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][2, 1, 2, 32][2, 0, 16, 1]) { -// CHECK-SAME: id = 0 : i64 -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @npu_dma_cpy_nd_with_repeat_already_on_outer_dim() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %1 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %3 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %4 = 
amdaie.connection(%2 {%channel_1}, %1 {%channel}, flow = %3) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %5 = amdaie.npu.circular_dma_cpy_nd %4([0] [1024] [1], [] [] []) - %6 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - amdaie.npu.dma_cpy_nd %4([] [] [], %6[0, 0, 0, 32] [2, 1, 2, 32] [2, 0, 16, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.end - } - } - return - } -} - -// ----- - -// Test to show mix of implicit/explicit source/target addressing in amdaie.npu.dma_cpy_nd. - -// CHECK: aie.device -// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xi32> -// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xi32> -// CHECK: aiex.runtime_sequence @controlcode(%[[ARG0:.+]]: memref<4096xi32>, %[[ARG1:.+]]: memref<2048xi32>) -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 2048][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: scf.forall -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: } -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @controlcode() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xi32> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<2048xi32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xi32, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xi32, 1 : i32> - %lock_2 = amdaie.lock(%tile_0_1(0), 1) - %lock_3 = amdaie.lock(%tile_0_1(1), 0) - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, 
{%lock_2}, {%lock_3}) : memref<2048xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} - %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) - %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xi32> -> !amdaie.logicalobjectfifo> - %14 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_source_token) - %15 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 2048] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_source_token) - scf.forall (%arg0, %arg1) in (2, 1) { - %16 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%16 : !amdaie.async_target_token) - %17 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%17 : !amdaie.async_target_token) - } - amdaie.end - } - } - return - } -} - -// ----- - -// CHECK: aie.device -// CHECK: memref.global "public" @[[SHIM_1:.+]] : memref<2048xf32> -// CHECK: memref.global "public" @[[SHIM_0:.+]] : memref<4096xbf16> -// CHECK: aiex.runtime_sequence @controlcode_bf16_f32(%[[ARG0:.+]]: memref<4096xbf16>, %[[ARG1:.+]]: memref<2048xf32>) -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 1, 2][1, 2, 32, 16][0, 16, 32, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xbf16> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG1]][0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_1]]} : memref<2048xf32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_1]]} -#pipeline_layout = #hal.pipeline.layout]> -#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> -module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { - func.func @controlcode_bf16_f32() { - amdaie.workgroup { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c2 = arith.constant 2 : index - %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<4096xbf16> - %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) 
alignment(64) offset(%c0) : memref<2048xf32> - %tile_0_0 = amdaie.tile(%c0, %c0) - %tile_0_1 = amdaie.tile(%c0, %c1) - %bd_id_0 = amdaie.bd_id(%tile_0_0, 0) - %buffer = amdaie.buffer(%tile_0_1) : memref<4096xbf16, 1 : i32> - %lock = amdaie.lock(%tile_0_1(0), 1) - %lock_1 = amdaie.lock(%tile_0_1(1), 0) - %buffer_1 = amdaie.buffer(%tile_0_1) : memref<2048xf32, 1 : i32> - %lock_2 = amdaie.lock(%tile_0_1(0), 1) - %lock_3 = amdaie.lock(%tile_0_1(1), 0) - %2 = amdaie.logicalobjectfifo.placeholder{%tile_0_0} : !amdaie.logicalobjectfifo> - %3 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_1}) : memref<4096xbf16, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel = amdaie.channel(%tile_0_0, 0, port_type = DMA) - %channel_1 = amdaie.channel(%tile_0_1, 0, port_type = DMA) - %4 = amdaie.flow({%channel} -> {%channel_1}) {is_packet_flow = false} - %5 = amdaie.connection(%3 {%channel_1}, %2 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - %6 = amdaie.logicalobjectfifo.placeholder{%tile_0_1} : !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<2048xf32, 1 : i32> -> !amdaie.logicalobjectfifo, 1> - %channel_2 = amdaie.channel(%tile_0_0, 1, port_type = DMA) - %channel_3 = amdaie.channel(%tile_0_1, 1, port_type = DMA) - %8 = amdaie.flow({%channel_2} -> {%channel_3}) {is_packet_flow = false} - %9 = amdaie.connection(%6 {%channel_2}, %7 {%channel_3}, flow = %8) : (!amdaie.logicalobjectfifo, 1>, !amdaie.logicalobjectfifo, 1>) - amdaie.controlcode { - %10 = amdaie.npu.circular_dma_cpy_nd %5([0] [4096] [1], [] [] []) - %11 = amdaie.npu.circular_dma_cpy_nd %9([] [] [], [0] [2048] [1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile_0_0} : memref<4096xbf16> -> !amdaie.logicalobjectfifo> - %13 = amdaie.logicalobjectfifo.from_memref %1, {%tile_0_0} : memref<2048xf32> -> !amdaie.logicalobjectfifo> - %14 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 1, 2] [1, 2, 32, 16] [0, 16, 32, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%14 : !amdaie.async_source_token) - %15 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%15 : !amdaie.async_source_token) - %16 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%16 : !amdaie.async_target_token) - %17 = amdaie.npu.dma_cpy_nd async_target %9(%13[0, 0, 0, 0] [1, 1, 1, 1024] [0, 0, 0, 1] bd_id = %bd_id_0, [] [] []) : target_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%17 : !amdaie.async_target_token) - amdaie.end - } - } - return - } -} - -// ----- - //===----------------------------------------------------------------------===// // CoreOp tests //===----------------------------------------------------------------------===// @@ -1080,11 +859,7 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} // CHECK: aie.use_lock(%[[LOCK_1_2]], AcquireGreaterEqual, 1) // CHECK: aie.end // CHECK: } -// CHECK: aiex.runtime_sequence @large_example(%[[ARG0:.*]]: memref<4096xi32>) { -// CHECK: aiex.npu.dma_memcpy_nd(0, 0, %[[ARG0]][0, 0, 0, 32][1, 1, 32, 32][0, 0, 64, 1]) {id = 0 : i64, issue_token = true, metadata = @[[SHIM_0]]} : memref<4096xi32> -// CHECK: aiex.npu.dma_wait {symbol = @[[SHIM_0]]} -// CHECK: } -// 
CHECK: } +// CHECK: aiex.runtime_sequence @large_example #pipeline_layout = #hal.pipeline.layout]> #executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}> module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} { @@ -1115,13 +890,13 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} %1 = amdaie.logicalobjectfifo.placeholder{%tile} : !amdaie.logicalobjectfifo> %2 = amdaie.logicalobjectfifo.from_buffers({%buffer, %buffer_3}, {%lock}, {%lock_4}) : memref<4096xi32, 1 : i32>, memref<4096xi32, 1 : i32> -> !amdaie.logicalobjectfifo, 2> %3 = amdaie.logicalobjectfifo.from_buffers({%buffer_5, %buffer_6, %buffer_9, %buffer_10}, {%lock_7, %lock_11}, {%lock_8, %lock_12}) : memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32>, memref<4096xi32, 2 : i32> -> !amdaie.logicalobjectfifo, 2> - %channel = amdaie.channel(%tile, 0, port_type = DMA) - %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA) + %channel = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_13 = amdaie.channel(%tile_0, 0, port_type = DMA, direction = S2MM) %4 = amdaie.flow({%channel} -> {%channel_13}) {is_packet_flow = false} %5 = amdaie.connection(%2 {%channel_13}, %1 {%channel}, flow = %4) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo>) - %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA) - %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA) - %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA) + %channel_14 = amdaie.channel(%tile_0, 1, port_type = DMA, direction = MM2S) + %channel_15 = amdaie.channel(%tile_1, 0, port_type = DMA, direction = S2MM) + %channel_16 = amdaie.channel(%tile_2, 0, port_type = DMA, direction = S2MM) %6 = amdaie.flow({%channel_14} -> {%channel_15, %channel_16}) {is_packet_flow = false} %7 = amdaie.connection(%3 {%channel_15, %channel_16}, %2 {%channel_14}, flow = %6) : (!amdaie.logicalobjectfifo, 2>, !amdaie.logicalobjectfifo, 2>) %8 = amdaie.core(%tile_1, in : [%7], out : []) { @@ -1149,9 +924,6 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} amdaie.controlcode { %10 = amdaie.npu.circular_dma_cpy_nd %5([0, 0] [64, 64] [32, 1], [] [] []) %11 = amdaie.npu.circular_dma_cpy_nd %7([] [] [], [0, 1024] [64, 64] [32, 1]) - %12 = amdaie.logicalobjectfifo.from_memref %0, {%tile} : memref<4096xi32> -> !amdaie.logicalobjectfifo> - %13 = amdaie.npu.dma_cpy_nd async_source %5([] [] [], %12[0, 0, 0, 32] [1, 1, 32, 32] [0, 0, 64, 1] bd_id = %bd_id) : source_type = !amdaie.logicalobjectfifo> - amdaie.npu.dma_wait(%13 : !amdaie.async_source_token) amdaie.end } } diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc index 2d23678d4..6ffc1b11a 100644 --- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.cc @@ -159,6 +159,10 @@ AMDAIEDeviceModel::AMDAIEDeviceModel( TRY_XAIE_API_FATAL_ERROR(XAie_TurnEccOff, &devInst); } +uint8_t AMDAIEDeviceModel::getMinStrideBitWidth() const { + return deviceConfig.minStrideBitWidth; +} + int AMDAIEDeviceModel::rows() const { if (device == AMDAIEDevice::xcvc1902 || device == AMDAIEDevice::xcve2802) return MLIRAIELegacy::rows(*this); diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h index de8c855b7..e9a227d25 100644 --- 
a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -222,6 +222,10 @@ struct AMDAIEDeviceModel {
 /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in
 /// another way before adding new fields to this struct.
 struct AMDAIEDeviceConfig {
+  /// Set default minimum stride bitwidth/addressing granularity to 32 bits as
+  /// this is the value for all current architecture versions.
+  uint8_t minStrideBitWidth{32};
+
   /// The max packet id.
   uint8_t packetIdMaxIdx{0};
   /// Currently, the max arbiter/msel is hidden inside aie-rt.
   uint8_t streamSwitchCoreArbiterMax{0};
@@ -246,6 +250,7 @@ struct AMDAIEDeviceModel {
                    AMDAIEDevice device, AMDAIEDeviceConfig deviceConfig);
+  uint8_t getMinStrideBitWidth() const;
   int rows() const;
   int columns() const;