Skip to content

Commit

Permalink
[AssignTiles] Adapt assign-tiles to assign same tiles for L3 LOF on d…
Browse files Browse the repository at this point in the history
…iff block

-- Required to enable the 4x8 AIE array on Strix.
-- Before this commit, the `iree-amdaie-assign-tiles` pass ended up
   allocating/distributing LHS L3 buffers across tiles (0,0) -> (7,0).
-- Since this later leads to consumer DMA channel exhaustion, this commit
   addresses it by assigning the same set of tiles to L3 buffers that are
   in different blocks.
-- As a result, the LHS L3 buffers are placed on tiles (0,0) -> (4,0).

Signed-off-by: Abhishek Varma <abhvarma@amd.com>
  • Loading branch information
Abhishek-Varma committed Jan 15, 2025
1 parent 42fa1e9 commit e29f65b
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -350,7 +350,9 @@ LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op,
};

// After filling tile candidates, find and assign a specific one.
DenseMap<MemRefType, int64_t> logicalObjFifoToTileId;
DenseMap<Value, SmallVector<AMDAIE::TileOp>> memrefToTileMap;
Block *prevBlock = nullptr;
bool pickFromBefore = false;
WalkResult res =
op->walk([&](AMDAIE::LogicalObjFifoOpInterface logicalObjectFifo) {
uint8_t memSpace = logicalObjectFifo.getMemorySpaceAsUInt();
Expand All @@ -365,9 +367,34 @@ LogicalResult assignNonLocalTiles(RewriterBase &rewriter, Operation *op,
llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) {
return dyn_cast_if_present<TileOp>(tile.getDefiningOp());
});
AMDAIE::TileOp assignedTileOp =
*std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp);
bool hasMultipleTileCandidates = tiles.size() > 1;
auto fromMemrefOp = dyn_cast<AMDAIE::LogicalObjectFifoFromMemrefOp>(
logicalObjectFifo.getOperation());
Value memref = nullptr;
if (fromMemrefOp) {
memref = fromMemrefOp.getMemref();
if (hasMultipleTileCandidates && memSpace == 0) {
if (prevBlock && logicalObjectFifo->getBlock() != prevBlock) {
pickFromBefore = true;
}
prevBlock = logicalObjectFifo->getBlock();
}
}

AMDAIE::TileOp assignedTileOp = nullptr;
if (pickFromBefore && memrefToTileMap.contains(memref)) {
SmallVector<AMDAIE::TileOp> tiles = memrefToTileMap[memref];
assignedTileOp = tiles[0];
memrefToTileMap[memref].erase(memrefToTileMap[memref].begin());
memrefToTileMap[memref].push_back(assignedTileOp);
} else {
assignedTileOp =
*std::min_element(tiles.begin(), tiles.end(), tileLocAndUsageCmp);
if (hasMultipleTileCandidates && memSpace == 0) {
if (!memrefToTileMap.contains(memref)) memrefToTileMap[memref] = {};
memrefToTileMap[memref].push_back(assignedTileOp);
}
}
// Increase usage of the chosen tile as a new logical objectFifo will be
// assigned to it. This allows distributing the logical objectFifos
// evenly across the available tile resources.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -509,3 +509,114 @@ module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb}
return
}
}

// -----

// Test demonstrating that the same tile assignment is reused across different blocks when
// an L3 logicalObjectFifo has multiple tile candidates.
//
// This is needed to enable the 4x8 AIE array on Strix.

// CHECK-LABEL: @same_tile_assg_for_l3_on_diff_block
// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index
// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
// CHECK-DAG: %[[C30:.*]] = arith.constant 30 : index
// CHECK: %[[LHS:.*]] = memref.alloc() : memref<2x3x4xi32>
// CHECK: %[[RHS:.*]] = memref.alloc() : memref<3x4x5xi32>
// CHECK: scf.forall
// CHECK: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0:.*]], %[[C0:.*]])
// CHECK: %[[LOF_LHS_L3:.*]] = amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_0_0]]} :
// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) :
// CHECK: %[[LOF_RHS_L3_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_0_0]]} :
// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) :
// CHECK: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1:.*]], %[[C0:.*]])
// CHECK: %[[LOF_RHS_L3_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[RHS]], {%[[TILE_1_0]]} :
// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) :
// CHECK: scf.for
// CHECK-NOT: amdaie.logicalobjectfifo.from_memref %[[LHS]], {%[[TILE_1_0]]} :
// CHECK: %[[LHS_L3_to_L2:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_LHS_L3]][0, 0, 0] [1, 3, 4] [12, 4, 1]) :
// CHECK: %[[RHS_L3_to_L2_0:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_RHS_L3_0]][0, 0, 0] [1, 4, 5] [20, 5, 1]) :
// CHECK: %[[RHS_L3_to_L2_1:.*]] = amdaie.dma_cpy_nd
// CHECK-SAME: %[[LOF_RHS_L3_1]][0, 0, 0] [1, 4, 5] [20, 5, 1]) :
// Input IR for @same_tile_assg_for_l3_on_diff_block.
// The L3 logicalObjectFifos %3 (LHS) and %4 (RHS) are used as DMA sources in two
// different blocks: directly in the scf.forall body and again inside the nested
// scf.for body.
#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu4", ukernels = "none"}>
module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
func.func @same_tile_assg_for_l3_on_diff_block() {
%c30 = arith.constant 30 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c0 = arith.constant 0 : index
// Buffers for each memory level. The memref memory space differs per level:
// none for L3, `1 : i32` for L2 and `2 : i32` for L1.
%alloc_4 = memref.alloc() : memref<2x3x4xi32> // LHS L3
%alloc_5 = memref.alloc() : memref<3x4x5xi32> // RHS L3
%alloc_3 = memref.alloc() : memref<3x4xi32, 1 : i32> // LHS L2
%alloc_1 = memref.alloc() : memref<4x5xi32, 1 : i32> // RHS L2
%alloc_2 = memref.alloc() : memref<4x5xi32, 1 : i32> // RHS L2
%alloc_0 = memref.alloc() : memref<4xi32, 2 : i32> // LHS L1
%alloc = memref.alloc() : memref<5xi32, 2 : i32> // RHS L1

// Tiles with coordinates (0,0), (0,1), (1,0) and (1,1).
%tile_0_0 = amdaie.tile(%c0, %c0)
%tile_0_1 = amdaie.tile(%c0, %c1)
%tile_1_0 = amdaie.tile(%c1, %c0)
%tile_1_1 = amdaie.tile(%c1, %c1)

// L2 logicalObjectFifos (%0, %1, %2) and L3 logicalObjectFifos (%3, %4), each
// created with a single tile and defined outside the scf.forall.
%0 = amdaie.logicalobjectfifo.from_memref %alloc_1, {%tile_0_1} : memref<4x5xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>
%1 = amdaie.logicalobjectfifo.from_memref %alloc_2, {%tile_0_1} : memref<4x5xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>
%2 = amdaie.logicalobjectfifo.from_memref %alloc_3, {%tile_1_1} : memref<3x4xi32, 1 : i32> -> !amdaie.logicalobjectfifo<memref<3x4xi32, 1 : i32>>
%3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {%tile_0_0} : memref<2x3x4xi32> -> !amdaie.logicalobjectfifo<memref<2x3x4xi32>>
%4 = amdaie.logicalobjectfifo.from_memref %alloc_5, {%tile_1_0} : memref<3x4x5xi32> -> !amdaie.logicalobjectfifo<memref<3x4x5xi32>>
scf.forall (%arg0, %arg1) in (2, 1) {
// L3 -> L2
// First block that reads the L3 logicalObjectFifos: one LHS copy from %3 and
// two RHS copies from %4.
%5 = amdaie.dma_cpy_nd(%2[0, 0] [3, 4] [4, 1], %3[0, 0, 0] [1, 3, 4] [12, 4, 1]) : (!amdaie.logicalobjectfifo<memref<3x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x3x4xi32>>)
%6 = amdaie.dma_cpy_nd(%0[0, 0] [4, 5] [5, 1], %4[0, 0, 0] [1, 4, 5] [20, 5, 1]) : (!amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<3x4x5xi32>>)
%7 = amdaie.dma_cpy_nd(%1[0, 0] [4, 5] [5, 1], %4[0, 0, 0] [1, 4, 5] [20, 5, 1]) : (!amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<3x4x5xi32>>)

// Tiles (0,2) and (1,2), used below by the L1 logicalObjectFifos and the cores.
%tile_0_2 = amdaie.tile(%c0, %c2)
%tile_1_2 = amdaie.tile(%c1, %c2)

// L1 logicalObjectFifos: RHS gets one per tile (%8, %9), while the LHS one
// (%10) lists both tiles.
%8 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_1_2} : memref<5xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>
%9 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile_0_2} : memref<5xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>
%10 = amdaie.logicalobjectfifo.from_memref %alloc_0, {%tile_0_2, %tile_1_2} : memref<4xi32, 2 : i32> -> !amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>>
// L2 -> L1
%11 = amdaie.dma_cpy_nd(%9[0] [5] [1], %0[0, 0] [1, 5] [5, 1]) : (!amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>)
%12 = amdaie.dma_cpy_nd(%8[0] [5] [1], %1[0, 0] [1, 5] [5, 1]) : (!amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>)
%13 = amdaie.dma_cpy_nd(%10[0] [4] [1], %2[0, 0] [1, 4] [4, 1]) : (!amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<3x4xi32, 1 : i32>>)

// One core per tile; both take the shared LHS copy %13 plus their own RHS copy.
%14 = amdaie.core(%tile_0_2, in : [%13, %11], out : []) {
%16 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>> -> memref<4xi32, 2 : i32>
%17 = amdaie.logicalobjectfifo.access(%9, Read) : !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>> -> memref<5xi32, 2 : i32>
amdaie.end
}
%15 = amdaie.core(%tile_1_2, in : [%13, %12], out : []) {
%16 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>> -> memref<4xi32, 2 : i32>
%17 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>> -> memref<5xi32, 2 : i32>
amdaie.end
}
// Second block that reads the same L3 logicalObjectFifos (%3 and %4): the
// copies and cores above are repeated inside this nested loop body.
scf.for %arg2 = %c0 to %c30 step %c1 {
// L3 -> L2
%16 = amdaie.dma_cpy_nd(%2[0, 0] [3, 4] [4, 1], %3[0, 0, 0] [1, 3, 4] [12, 4, 1]) : (!amdaie.logicalobjectfifo<memref<3x4xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<2x3x4xi32>>)
%17 = amdaie.dma_cpy_nd(%0[0, 0] [4, 5] [5, 1], %4[0, 0, 0] [1, 4, 5] [20, 5, 1]) : (!amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<3x4x5xi32>>)
%18 = amdaie.dma_cpy_nd(%1[0, 0] [4, 5] [5, 1], %4[0, 0, 0] [1, 4, 5] [20, 5, 1]) : (!amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>, !amdaie.logicalobjectfifo<memref<3x4x5xi32>>)
// L2 -> L1
%19 = amdaie.dma_cpy_nd(%9[0] [5] [1], %0[0, 0] [1, 5] [5, 1]) : (!amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>)
%20 = amdaie.dma_cpy_nd(%8[0] [5] [1], %1[0, 0] [1, 5] [5, 1]) : (!amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<4x5xi32, 1 : i32>>)
%21 = amdaie.dma_cpy_nd(%10[0] [4] [1], %2[0, 0] [1, 4] [4, 1]) : (!amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>>, !amdaie.logicalobjectfifo<memref<3x4xi32, 1 : i32>>)
%22 = amdaie.core(%tile_0_2, in : [%21, %19], out : []) {
%24 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>> -> memref<4xi32, 2 : i32>
%25 = amdaie.logicalobjectfifo.access(%9, Read) : !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>> -> memref<5xi32, 2 : i32>
amdaie.end
}
%23 = amdaie.core(%tile_1_2, in : [%21, %20], out : []) {
%24 = amdaie.logicalobjectfifo.access(%10, Read) : !amdaie.logicalobjectfifo<memref<4xi32, 2 : i32>> -> memref<4xi32, 2 : i32>
%25 = amdaie.logicalobjectfifo.access(%8, Read) : !amdaie.logicalobjectfifo<memref<5xi32, 2 : i32>> -> memref<5xi32, 2 : i32>
amdaie.end
}
}
} {mapping = [#gpu.block<y>, #gpu.block<x>]}
return
}
}

0 comments on commit e29f65b

Please sign in to comment.