diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index 2d2fe21b9..4ab5420ee 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -7,10 +7,16 @@ #include "iree-amd-aie/IR/AMDAIEOps.h" #include +#include +#include #include "iree-amd-aie/IR/AMDAIEDialect.h" +#include "iree-amd-aie/IR/AMDAIELogicalObjFifoOpInterface.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" +#include "mlir/Dialect/Linalg/Utils/Utils.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/IR/OpDefinition.h" @@ -55,12 +61,242 @@ static void printAsyncTokenType(OpAsmPrinter &p, Operation *op, } } +namespace { + +template +SmallVector getSourceAndTarget(T op) { + return {op.getSource().getDefiningOp(), op.getTarget().getDefiningOp()}; +} + +/// Depending on what type of operation `op` is, we select a different set of +/// results/operands (consumers/producers) to traverse through in our search for +/// a compute operation. See `getComputeOpIndex` for more details. 
+SmallVector getNeighbors(Operation *op) { + auto appendUserRange = [&](Operation *op, SmallVector &ops) { + if (op) { + for (auto user : op->getUsers()) { + ops.push_back(user); + } + } + }; + + auto getUserRange = [&](Operation *op) -> SmallVector { + SmallVector ops; + appendUserRange(op, ops); + return ops; + }; + + auto isFifo = [](Operation *op) -> bool { + return op && (isa(op) || + isa(op)); + }; + + auto appendWithFifoUsers = [&](Operation *op, SmallVector &ops) { + if (op) { + ops.push_back(op); + for (auto user : op->getUsers()) { + if (isFifo(user)) { + ops.push_back(user); + } + } + } + }; + + if (auto connection = dyn_cast(op)) { + auto neighbors = getSourceAndTarget(connection); + appendUserRange(connection, neighbors); + return neighbors; + } else if (auto dma_cpy_nd_op = dyn_cast(op)) { + return getSourceAndTarget(dma_cpy_nd_op); + } else if (auto circular = dyn_cast(op)) { + return getSourceAndTarget(circular); + } else if (auto from_memref = + dyn_cast(op)) { + auto memref = from_memref.getMemref().getDefiningOp(); + if (memref) { + SmallVector neighbors = getUserRange(from_memref); + appendWithFifoUsers(memref, neighbors); + return neighbors; + } + } else if (auto from_buffers = + dyn_cast(op)) { + SmallVector neighbors = getUserRange(from_buffers); + for (auto buffer : from_buffers.getBuffers()) { + appendWithFifoUsers(buffer.getDefiningOp(), neighbors); + } + return neighbors; + } else if (auto access = dyn_cast(op)) { + return getUserRange(access); + } else if (auto reinterpret = dyn_cast(op)) { + return getUserRange(reinterpret); + } else if (auto acquire = dyn_cast(op)) { + return getUserRange(acquire); + } else if (auto buffer = dyn_cast(op)) { + return getUserRange(buffer); + } + // All other ops are currently considered `dead ends` in the graph, not + // leading to computation ops. 
+ return {}; +} + +/// For most workloads, there is a set of operations of type +/// +/// buffer & +/// from_memref & +/// from_buffers & +/// dma_cpy_nd & +/// circular_dma_cpy_nd & +/// connection & +/// access & +/// acquire & +/// etc etc. +/// +/// all of which have a 1:1 correspondence with an operand/result of the +/// high level linalg op(s) being lowered. For example for +/// matmul, most operations created during lowering are used exclusively +/// to move one of the A, B, and C tensors to/from the AIE (where the +/// matmul is C = A@B). It can be very useful to see which of the matmul +/// operands each new low-level operation corresponds to when trying to +/// understand the IR. +/// +/// This function attempts to find the operand index of the compute operation that +/// `root` corresponds to. For example if `root` is a `dma_cpy_nd` op for +/// moving a sub-tensor of B from L2 to L1, then this function will return +/// 1 (because B is operand number 1 of a matmul). +/// +/// If there is any ambiguity about which operand of the high-level operation +/// `root` corresponds to, an empty optional (nullopt) is returned. + +std::optional getComputeOpIndex(Operation *root) { + // Traverse the graph defined by `getNeighbors` to find the compute + // ops that use the buffer/memref. 
+ DenseSet visited{root}; + SmallVector toTraverse{root}; + std::optional index{}; + while (!toTraverse.empty()) { + auto nxt = toTraverse.back(); + toTraverse.pop_back(); + for (auto neighbor : getNeighbors(nxt)) { + if (neighbor) { + if (!visited.contains(neighbor)) { + visited.insert(neighbor); + toTraverse.push_back(neighbor); + } + } + } + for (auto &use : nxt->getUses()) { + Operation *owner = use.getOwner(); + if (isa(owner) || isa(owner)) { + int32_t nxtIndex = use.getOperandNumber(); + // The case where this is the first compute operation: + if (!index.has_value()) { + index = nxtIndex; + } + + // The case where this is not the first compute operation, and the + // indices used are different: + else if (index.value() != nxtIndex) { + return std::optional{}; + } + } + } + } + + return index; +} + +/// This function is a thin wrapper around `getComputeOpIndex` that returns +/// '_A' if the index is 0 +/// '_B' if the index is 1 +/// etc. +std::string getGenericOperandSuffix(Operation *root) { + std::optional maybeIndex = getComputeOpIndex(root); + + // The case where no compute operation was found, or the index is ambiguous: + if (!maybeIndex.has_value()) return ""; + + // The case where one or more compute operations were found, and all used the + // buffer/memref at the same index: + char c = ('A' + maybeIndex.value()); + std::ostringstream oss; + oss << '_' << c; + return oss.str(); +} + +/// String describing the column and row of `tileOp`. Example: if the column +/// of `tileOp` is an integer value (3) and the row is not, the returned string +/// might be `_3_r`, where the `_r` denotes that the row is not known. 
+std::string getTileSuffix(TileOp tileOp) { + std::optional iCol = getConstantIntValue(tileOp.getCol()); + std::optional iRow = getConstantIntValue(tileOp.getRow()); + std::ostringstream oss; + auto add = [&](std::optional maybeValue, char unknown) { + oss << '_'; + if (maybeValue.has_value()) { + oss << maybeValue.value(); + } else { + oss << unknown; + } + }; + + if (iCol.has_value() || iRow.has_value()) { + add(iCol, 'c'); + add(iRow, 'r'); + } + return oss.str(); +} + +std::string getMultiTileSuffix(ArrayRef tiles) { + if (tiles.empty()) return ""; + + // Denotes one or more tiles with multiple possible row/column values. + constexpr int64_t multiple{-2}; + constexpr int64_t unset{-1}; + int64_t col{unset}; + int64_t row{unset}; + + for (TileOp tile : tiles) { + if (!tile) { + col = multiple; + row = multiple; + } else { + std::optional maybeCol = getConstantIntValue(tile.getCol()); + if (!maybeCol) col = multiple; + if (col >= 0 && maybeCol.value() != col) col = multiple; + if (col == unset) col = maybeCol.value(); + + std::optional maybeRow = getConstantIntValue(tile.getRow()); + if (!maybeRow) row = multiple; + if (row >= 0 && maybeRow.value() != row) row = multiple; + if (row == unset) row = maybeRow.value(); + } + } + + std::ostringstream namestream; + namestream << '_'; + + if (col >= 0) { + namestream << col; + } else { + namestream << 'c'; + } + if (row >= 0) { + namestream << '_' << row; + } else { + namestream << '_' << 'r'; + } + return namestream.str(); +} +} // namespace + //===----------------------------------------------------------------------===// // AMDAIE_BdIdOp //===----------------------------------------------------------------------===// void BdIdOp::getAsmResultNames(function_ref setNameFn) { - setNameFn(getResult(), "bd_id"); + TileOp tileOp = getTile().getDefiningOp(); + assert(tileOp && "expected `amdaie.tile` for `amdaie.bd_id`"); + setNameFn(getResult(), "bd_id" + getTileSuffix(tileOp)); } 
//===----------------------------------------------------------------------===// @@ -69,7 +305,10 @@ void BdIdOp::getAsmResultNames(function_ref setNameFn) { void BufferOp::getAsmResultNames( function_ref setNameFn) { - setNameFn(getResult(), "buffer"); + TileOp tileOp = getTile().getDefiningOp(); + assert(tileOp && "expected `amdaie.tile` for amdaie.buffer"); + setNameFn(getResult(), "buffer" + getTileSuffix(tileOp) + + getGenericOperandSuffix(getOperation())); } //===----------------------------------------------------------------------===// @@ -78,7 +317,9 @@ void BufferOp::getAsmResultNames( void ChannelOp::getAsmResultNames( function_ref setNameFn) { - setNameFn(getResult(), "channel"); + TileOp tileOp = getTile().getDefiningOp(); + assert(tileOp && "expected `amdaie.tile` for `amdaie.channel`"); + setNameFn(getResult(), "channel" + getTileSuffix(tileOp)); } TileOp ChannelOp::getTileOp() { @@ -474,6 +715,12 @@ void CircularDmaCpyNdOp::getCanonicalizationPatterns(RewritePatternSet &results, // AMDAIE_ConnectionOp //===----------------------------------------------------------------------===// +void ConnectionOp::getAsmResultNames( + function_ref setNameFn) { + setNameFn(getResult(), + "connection" + getGenericOperandSuffix(getOperation())); +} + void ConnectionOp::build(mlir::OpBuilder &b, mlir::OperationState &result, Value target, Value source) { build(b, result, target, {}, source, {}, nullptr, nullptr); @@ -522,7 +769,9 @@ LogicalResult FlowOp::verify() { //===----------------------------------------------------------------------===// void LockOp::getAsmResultNames(function_ref setNameFn) { - setNameFn(getResult(), "lock"); + TileOp tileOp = getTile().getDefiningOp(); + assert(tileOp && "expected `amdaie.tile` for `amdaie.lock`"); + setNameFn(getResult(), "lock" + getTileSuffix(tileOp)); } //===----------------------------------------------------------------------===// @@ -556,6 +805,17 @@ void LogicalObjectFifoAcquire::build(OpBuilder &b, 
mlir::OperationState &result, // AMDAIE_LogicalObjectFifoFromBuffersOp //===----------------------------------------------------------------------===// +void LogicalObjectFifoFromBuffersOp::getAsmResultNames( + function_ref setNameFn) { + SmallVector tiles; + tiles.reserve(getTiles().size()); + for (auto index : getTiles()) { + tiles.push_back(dyn_cast(index.getDefiningOp())); + } + setNameFn(getResult(), "lof" + getMultiTileSuffix(tiles) + + getGenericOperandSuffix(getOperation())); +} + SmallVector LogicalObjectFifoFromBuffersOp::getBuffersOnTile( TileOp tileOp) { SmallVector buffers; @@ -627,54 +887,28 @@ LogicalObjectFifoFromBuffersOp::replaceWithNewTiles( void LogicalObjectFifoFromMemrefOp::getAsmResultNames( function_ref setNameFn) { - // 'lof' for 'logical object fifo' - constexpr const char *const name = "lof"; - - auto tiles = getTiles(); - - if (tiles.empty()) { - setNameFn(getResult(), name); - return; + SmallVector tiles; + tiles.reserve(getTiles().size()); + for (auto index : getTiles()) { + tiles.push_back(dyn_cast(index.getDefiningOp())); } - // Denotes one or more tiles with multiple possible row/column values. 
- constexpr int64_t multiple{-2}; - constexpr int64_t unset{-1}; - int64_t col{unset}; - int64_t row{unset}; - - for (Value index : tiles) { - TileOp tile = dyn_cast(index.getDefiningOp()); - if (!tile) { - col = multiple; - row = multiple; - } else { - std::optional maybeCol = getConstantIntValue(tile.getCol()); - if (!maybeCol) col = multiple; - if (col >= 0 && maybeCol.value() != col) col = multiple; - if (col == unset) col = maybeCol.value(); - - std::optional maybeRow = getConstantIntValue(tile.getRow()); - if (!maybeRow) row = multiple; - if (row >= 0 && maybeRow.value() != row) row = multiple; - if (row == unset) row = maybeRow.value(); + auto multiTileSuffix = getMultiTileSuffix(tiles); + auto genericOperandSuffix = getGenericOperandSuffix(getOperation()); + + if (multiTileSuffix == "_c_r" || multiTileSuffix == "") { + // rather use the memory space: + auto type = getMemref().getType(); + auto memspace = type.getMemorySpaceAsInt(); + if (memspace == 0) { + multiTileSuffix = "_L3"; + } else if (memspace == 1) { + multiTileSuffix = "_L2"; + } else if (memspace == 2) { + multiTileSuffix = "_L1"; } } - - std::ostringstream namestream; - namestream << name << '_'; - - if (col >= 0) { - namestream << col; - } else { - namestream << 'c'; - } - if (row >= 0) { - namestream << '_' << row; - } else { - namestream << '_' << 'r'; - } - setNameFn(getResult(), namestream.str()); + setNameFn(getResult(), "lof" + multiTileSuffix + genericOperandSuffix); } /// Build with an array of static tile locations. @@ -774,8 +1008,8 @@ void LogicalObjectFifoRelease::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_NpuDmaCpyNdOp //===----------------------------------------------------------------------===// -// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target and -// source BD IDs. +// Build a NpuDmaCpyNdOp with mixed static and dynamic entries and target +// and source BD IDs. 
void NpuDmaCpyNdOp::build( OpBuilder &b, OperationState &result, TypeRange resultTypes, Value connection, Value target, ArrayRef targetOffsets, @@ -1419,26 +1653,8 @@ SmallVector NpuDmaWaitOp::getDmaOps() { // AMDAIE_TileOp //===----------------------------------------------------------------------===// -// Example: if the column is an integer value (3) and the row is not, the SSA -// value might be `%tile_3_r`, where the `_r` denotes that the row is not known. void TileOp::getAsmResultNames(function_ref setNameFn) { - std::optional iCol = getConstantIntValue(getCol()); - std::optional iRow = getConstantIntValue(getRow()); - std::ostringstream name; - name << "tile"; - - auto add = [&](std::optional maybeValue, char unknown) { - if (maybeValue.has_value()) { - name << '_' << maybeValue.value(); - } else { - name << '_' << unknown; - } - }; - if (iCol.has_value() || iRow.has_value()) { - add(iCol, 'c'); - add(iRow, 'r'); - } - setNameFn(getResult(), name.str()); + setNameFn(getResult(), "tile" + getTileSuffix(*this)); } bool TileOp::hasStaticLocation() { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index efca31ebc..df8f65ef5 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -1052,7 +1052,9 @@ def AMDAIE_NpuTctSyncOp: AMDAIE_Op<"npu.tct_sync"> { //===----------------------------------------------------------------------===// def AMDAIE_ConnectionOp: AMDAIE_Op<"connection", - [Pure, CopyOpInterface, AttrSizedOperandSegments]> { + [Pure, CopyOpInterface, AttrSizedOperandSegments, + DeclareOpInterfaceMethods +]> { let summary = "A connection between two logical objectFifos."; let description = [{ Represents a connection between logical objectFifos. 
This connection can be @@ -1203,6 +1205,7 @@ def AMDAIE_LogicalObjectFifoAcquire: def AMDAIE_LogicalObjectFifoFromBuffersOp : AMDAIE_Op<"logicalobjectfifo.from_buffers", [DeclareOpInterfaceMethods, + DeclareOpInterfaceMethods, Pure, AttrSizedOperandSegments]> { let summary = "Create a logical objectFifo from a set of buffers"; let description = [{ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt index a33e459a0..887762675 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/CMakeLists.txt @@ -56,6 +56,7 @@ iree_cc_library( MLIRParser MLIRSupport MLIRViewLikeInterface + MLIRLinalgDialect PUBLIC ) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index 4b70152ef..baa4f82e8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -544,7 +544,7 @@ func.func @from_memref_known_tiles(%arg0 : memref<8xi32>, %t0 : index) { %tile_3_2 = amdaie.tile(%c3, %c2) %tile_2_2 = amdaie.tile(%c2, %c2) // logicalobjectfifo without any tiles: - // CHECK: %lof = + // CHECK: %lof_L3 = %fifo0 = amdaie.logicalobjectfifo.from_memref %arg0, {} : memref<8xi32> -> !amdaie.logicalobjectfifo> // logicalobjectfifo with one known tile: @@ -562,11 +562,11 @@ func.func @from_memref_known_tiles(%arg0 : memref<8xi32>, %t0 : index) { %fifo5 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_2_3, %tile_3_3} : memref<8xi32> -> !amdaie.logicalobjectfifo> // logicalobjectfifo with two known tiles, in different rows and columns: - // CHECK: %lof_c_r = + // CHECK: %lof_L3_0 = %fifo6 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_2_3, %tile_3_2} : memref<8xi32> -> !amdaie.logicalobjectfifo> // logicalobjectfifo with 4 tiles, spanning 2 
rows and 2 columns: - // CHECK: %lof_c_r_0 = + // CHECK: %lof_L3_1 = %fifo7 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_2_3, %tile_3_3, %tile_3_2, %tile_2_2} : memref<8xi32> -> !amdaie.logicalobjectfifo> @@ -620,11 +620,11 @@ func.func @from_memref_unknown_row_column(%arg0 : memref<8xi32>, %t0 : index) { %tile_2_2 = amdaie.tile(%c2, %c2) %tile_u_u = amdaie.tile(%t0, %t0) // logicalobjectfifo with a single tile with unknown row and column: - // CHECK: %lof_c_r = + // CHECK: %lof_L3 = %fifo1 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_u_u} : memref<8xi32> -> !amdaie.logicalobjectfifo> // logicalobjectfifo with one unknown tile, and one known tile: - // CHECK: %lof_c_r_0 = + // CHECK: %lof_L3_0 = %fifo2 = amdaie.logicalobjectfifo.from_memref %arg0, {%tile_2_2, %tile_u_u} : memref<8xi32> -> !amdaie.logicalobjectfifo> amdaie.controlcode { @@ -633,3 +633,152 @@ func.func @from_memref_unknown_row_column(%arg0 : memref<8xi32>, %t0 : index) { } return } + + +// ----- + + + +#pipeline_layout = #hal.pipeline.layout, #hal.pipeline.binding, #hal.pipeline.binding], flags = Indirect> + +func.func private @generic_matmul_0_outlined(%arg0: memref<1x1x4x8x4x8xbf16, 2 : i32>, %arg1: memref<1x1x8x4x8x4xbf16, 2 : i32>, %arg2: memref<1x1x8x8x4x4xf32, 2 : i32>) { + return +} + +// CHECK-LABEL: func.func @operand_matching +func.func @operand_matching() { + %c0 = arith.constant 0 : index + %c2 = arith.constant 2 : index + %c1 = arith.constant 1 : index + amdaie.workgroup { + // CHECK: %tile_0_1 = + %tile = amdaie.tile(%c0, %c1) + + // CHECK: %buffer_0_1_A = + %buffer = amdaie.buffer(%tile) : memref<1024xbf16, 1 : i32> + + // CHECK: %lock_0_1 = + %lock = amdaie.lock(%tile(4), 1) + %lock_0 = amdaie.lock(%tile(5), 0) + + // CHECK: %buffer_0_1_B = + %buffer_1 = amdaie.buffer(%tile) : memref<1024xbf16, 1 : i32> + %lock_2 = amdaie.lock(%tile(2), 1) + %lock_3 = amdaie.lock(%tile(3), 0) + + // CHECK: %buffer_0_1_C = + %buffer_4 = amdaie.buffer(%tile) : memref<1024xf32, 1 : i32> + 
%lock_5 = amdaie.lock(%tile(0), 1) + %lock_6 = amdaie.lock(%tile(1), 0) + + // CHECK: %lof_0_1_C = + %lof = amdaie.logicalobjectfifo.from_buffers({%buffer_4}, {%lock_5}, {%lock_6}) : memref<1024xf32, 1 : i32> -> !amdaie.logicalobjectfifo> + + // CHECK: %lof_0_1_B = + %lof_7 = amdaie.logicalobjectfifo.from_buffers({%buffer_1}, {%lock_2}, {%lock_3}) : memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo> + + // CHECK: %lof_0_1_A = + %lof_8 = amdaie.logicalobjectfifo.from_buffers({%buffer}, {%lock}, {%lock_0}) : memref<1024xbf16, 1 : i32> -> !amdaie.logicalobjectfifo> + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32x32xbf16> + memref.assume_alignment %0, 64 : memref<32x32xbf16> + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32x32xbf16> + memref.assume_alignment %1, 64 : memref<32x32xbf16> + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<32x32xf32> + memref.assume_alignment %2, 64 : memref<32x32xf32> + + // CHECK: %tile_0_0 = + %tile_9 = amdaie.tile(%c0, %c0) + %3 = amdaie.logicalobjectfifo.placeholder{%tile_9} : !amdaie.logicalobjectfifo> + + // CHECK: %channel_0_0 = + %channel = amdaie.channel(%tile_9, 0, port_type = DMA, direction = MM2S) + %channel_10 = amdaie.channel(%tile, 0, port_type = DMA, direction = S2MM) + %4 = amdaie.flow({%channel} -> {%channel_10}) {is_packet_flow = false} + + // CHECK: %connection_A = + %connection = amdaie.connection(%lof_8 {%channel_10}, %3 {%channel}, flow = %4) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_11 = amdaie.channel(%tile_9, 1, port_type = DMA, direction = MM2S) + %channel_12 = amdaie.channel(%tile, 1, port_type = DMA, direction = S2MM) + %5 = amdaie.flow({%channel_11} -> {%channel_12}) {is_packet_flow = false} + + // 
CHECK: %connection_B = + %connection_13 = amdaie.connection(%lof_7 {%channel_12}, %3 {%channel_11}, flow = %5) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile_14 = amdaie.tile(%c0, %c2) + + // CHECK: %buffer_0_2_B = + %buffer_15 = amdaie.buffer(%tile_14) : memref<1024xbf16, 2 : i32> + %lock_16 = amdaie.lock(%tile_14(4), 1) + %lock_17 = amdaie.lock(%tile_14(5), 0) + + // CHECK: %buffer_0_2_A = + %buffer_18 = amdaie.buffer(%tile_14) : memref<1024xbf16, 2 : i32> + %lock_19 = amdaie.lock(%tile_14(2), 1) + %lock_20 = amdaie.lock(%tile_14(3), 0) + + // CHECK: %buffer_0_2_C = + %buffer_21 = amdaie.buffer(%tile_14) : memref<1024xf32, 2 : i32> + %lock_22 = amdaie.lock(%tile_14(0), 1) + %lock_23 = amdaie.lock(%tile_14(1), 0) + + // CHECK: %lof_0_2_C = + %lof_24 = amdaie.logicalobjectfifo.from_buffers({%buffer_21}, {%lock_22}, {%lock_23}) : memref<1024xf32, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_25 = amdaie.logicalobjectfifo.from_buffers({%buffer_18}, {%lock_19}, {%lock_20}) : memref<1024xbf16, 2 : i32> -> !amdaie.logicalobjectfifo> + %lof_26 = amdaie.logicalobjectfifo.from_buffers({%buffer_15}, {%lock_16}, {%lock_17}) : memref<1024xbf16, 2 : i32> -> !amdaie.logicalobjectfifo> + %channel_27 = amdaie.channel(%tile, 0, port_type = DMA, direction = MM2S) + %channel_28 = amdaie.channel(%tile_14, 0, port_type = DMA, direction = S2MM) + %6 = amdaie.flow({%channel_27} -> {%channel_28}) {is_packet_flow = false} + %connection_29 = amdaie.connection(%lof_26 {%channel_28}, %lof_7 {%channel_27}, flow = %6) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_30 = amdaie.channel(%tile, 1, port_type = DMA, direction = MM2S) + %channel_31 = amdaie.channel(%tile_14, 1, port_type = DMA, direction = S2MM) + %7 = amdaie.flow({%channel_30} -> {%channel_31}) {is_packet_flow = false} + %connection_32 = amdaie.connection(%lof_25 {%channel_31}, %lof_8 {%channel_30}, flow = %7) {connection_type = 
#amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_33 = amdaie.channel(%tile_14, 0, port_type = DMA, direction = MM2S) + %channel_34 = amdaie.channel(%tile, 2, port_type = DMA, direction = S2MM) + %8 = amdaie.flow({%channel_33} -> {%channel_34}) {is_packet_flow = false} + + // CHECK: %connection_C = + %connection_35 = amdaie.connection(%lof {%channel_34}, %lof_24 {%channel_33}, flow = %8) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %9 = amdaie.core(%tile_14, in : [%connection_29, %connection_32], out : [%connection_35]) { + %cst = arith.constant 0.000000e+00 : f32 + amdaie.use_lock(%lock_22, AcquireGreaterOrEqual(1)) + %reinterpret_cast = memref.reinterpret_cast %buffer_21 to offset: [0], sizes: [1, 1, 8, 8, 4, 4], strides: [1024, 1024, 128, 16, 4, 1] : memref<1024xf32, 2 : i32> to memref<1x1x8x8x4x4xf32, 2 : i32> + linalg.fill ins(%cst : f32) outs(%reinterpret_cast : memref<1x1x8x8x4x4xf32, 2 : i32>) + amdaie.use_lock(%lock_20, AcquireGreaterOrEqual(1)) + %reinterpret_cast_41 = memref.reinterpret_cast %buffer_18 to offset: [0], sizes: [1, 1, 4, 8, 4, 8], strides: [1024, 1024, 256, 32, 8, 1] : memref<1024xbf16, 2 : i32> to memref<1x1x4x8x4x8xbf16, 2 : i32> + amdaie.use_lock(%lock_17, AcquireGreaterOrEqual(1)) + %reinterpret_cast_42 = memref.reinterpret_cast %buffer_15 to offset: [0], sizes: [1, 1, 8, 4, 8, 4], strides: [1024, 1024, 128, 32, 4, 1] : memref<1024xbf16, 2 : i32> to memref<1x1x8x4x8x4xbf16, 2 : i32> + func.call @generic_matmul_0_outlined(%reinterpret_cast_41, %reinterpret_cast_42, %reinterpret_cast) : (memref<1x1x4x8x4x8xbf16, 2 : i32>, memref<1x1x8x4x8x4xbf16, 2 : i32>, memref<1x1x8x8x4x4xf32, 2 : i32>) -> () + amdaie.use_lock(%lock_19, Release(1)) + amdaie.use_lock(%lock_16, Release(1)) + amdaie.use_lock(%lock_23, Release(1)) + amdaie.end + } + %10 = amdaie.logicalobjectfifo.placeholder{%tile_9} : !amdaie.logicalobjectfifo> + %channel_36 = amdaie.channel(%tile, 2, port_type = 
DMA, direction = MM2S) + %channel_37 = amdaie.channel(%tile_9, 0, port_type = DMA, direction = S2MM) + %11 = amdaie.flow({%channel_36} -> {%channel_37}) {is_packet_flow = false} + %connection_38 = amdaie.connection(%10 {%channel_37}, %lof {%channel_36}, flow = %11) {connection_type = #amdaie} : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %channel_39 = amdaie.channel(%tile_9, 0, port_type = CTRL, direction = MM2S) + %channel_40 = amdaie.channel(%tile_9, 0, port_type = SOUTH, direction = S2MM) + %12 = amdaie.flow({%channel_39} -> {%channel_40}) {is_packet_flow = false} + amdaie.controlcode { + memref.assume_alignment %0, 64 : memref<32x32xbf16> + memref.assume_alignment %1, 64 : memref<32x32xbf16> + memref.assume_alignment %2, 64 : memref<32x32xf32> + + // CHECK: amdaie.npu.circular_dma_cpy_nd %connection_A([0, 0] [32, 32] [32, 1], [] [] []) + %13 = amdaie.npu.circular_dma_cpy_nd %connection([0, 0] [32, 32] [32, 1], [] [] []) + + // CHECK: amdaie.npu.circular_dma_cpy_nd %connection_B([0, 0] [32, 32] [32, 1], [] [] []) + %14 = amdaie.npu.circular_dma_cpy_nd %connection_13([0, 0] [32, 32] [32, 1], [] [] []) + %15 = amdaie.npu.circular_dma_cpy_nd %connection_29([0, 0, 0] [32, 8, 4] [4, 128, 1], [0, 0] [32, 32] [32, 1]) + %16 = amdaie.npu.circular_dma_cpy_nd %connection_32([0, 0, 0] [32, 4, 8] [8, 256, 1], [0, 0] [32, 32] [32, 1]) + + // CHECK: amdaie.npu.circular_dma_cpy_nd %connection_C([0, 0] [32, 32] [32, 1], [0, 0, 0] [32, 8, 4] [4, 128, 1]) + %17 = amdaie.npu.circular_dma_cpy_nd %connection_35([0, 0] [32, 32] [32, 1], [0, 0, 0] [32, 8, 4] [4, 128, 1]) + %18 = amdaie.npu.circular_dma_cpy_nd %connection_38([] [] [], [0, 0] [32, 32] [32, 1]) + amdaie.end + } + } {npu_instructions = dense_resource : tensor<106xui32>} + return +}