Skip to content

Commit

Permalink
[SplitLogicalObjFifo] Fix split-logicalobjfifo pass to analyse unique…
Browse files Browse the repository at this point in the history
… producers/consumers ObjFifos (#1060)

Analyse the number of unique producer/consumer ObjFifos of the
ObjFifo being split, and use that number to infer the
split factor.

e2e CI tests for Matmul, both with and without ukernel, via the
`pack-peel-4-level-tiling` pipeline targeting a 4x8 array on Strix have
been added.

Signed-off-by: Abhishek Varma <abhvarma@amd.com>
  • Loading branch information
Abhishek-Varma authored Jan 31, 2025
1 parent 406debf commit ddcbe0c
Show file tree
Hide file tree
Showing 7 changed files with 285 additions and 93 deletions.
34 changes: 34 additions & 0 deletions build_tools/ci/cpu_comparison/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -1721,6 +1721,22 @@ def __init__(self):
use_chess=False,
)
)
self.register(
Matmul(
512,
512,
256,
"i32",
"i32",
name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling",
tile_pipeline="pack-peel-4-level-tiling",
run_on_target=["npu4"],
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
],
)
)

for target in ["npu1_4col", "npu4"]:
self.register(
Expand Down Expand Up @@ -1786,6 +1802,24 @@ def __init__(self):
additional_labels=["I8UKernel"],
)
)
self.register(
Matmul(
64,
64,
64,
"bf16",
"f32",
name_suffix="4rows_8cols_npu4_pack_peel_4_level_tiling_ukernel",
use_ukernel=True,
tile_pipeline="pack-peel-4-level-tiling",
run_on_target=["npu4"],
aie_compilation_flags=[
"--iree-amdaie-num-rows=4",
"--iree-amdaie-num-cols=8",
],
use_chess=True,
)
)

# Matmul test on 2(rows)x2(cols) cores
self.register(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeConsumers(
SmallVector<mlir::CopyOpInterface> copyLikOps;
for (Operation *userOp : op->getUsers()) {
if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getSource().getDefiningOp()) == op) {
copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getSource().getDefiningOp()) == op) {
copyLikOps.push_back(copyOp);
}
}
Expand All @@ -31,8 +31,8 @@ SmallVector<mlir::CopyOpInterface> getCopyLikeProducers(
SmallVector<mlir::CopyOpInterface> copyLikOps;
for (Operation *userOp : op->getUsers()) {
if (auto copyOp = dyn_cast<CopyOpInterface>(userOp);
dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getTarget().getDefiningOp()) == op) {
copyOp && dyn_cast_if_present<LogicalObjFifoOpInterface>(
copyOp.getTarget().getDefiningOp()) == op) {
copyLikOps.push_back(copyOp);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,7 @@ extern "C" {
}
matmul_combos(matmul_vectorized_c_func, 16, 8, 32)
matmul_combos(matmul_vectorized_c_func, 16, 8, 64)
matmul_combos(matmul_vectorized_c_func, 16, 16, 32)
matmul_combos(matmul_vectorized_c_func, 32, 32, 32)
matmul_combos(matmul_vectorized_c_func, 32, 32, 64)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,46 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
return splitStride;
}

/// Given a list of copy ops, count the unique LogicalObjectFifos found on one
/// side (source or target) of those ops. The caller uses this count to work
/// out the split factor for the LogicalObjectFifo being split.
///
/// `OperateOn` selects which operand of each copy op is inspected:
/// `CopyOpOperateOn::Target` inspects the op defining the copy's target, any
/// other value inspects the op defining the copy's source.
///
/// An example case which necessitated this feature:
///      %lhs = LOF_on_L2
///      %a = LOF_on_L1_0
///      %b = LOF_on_L1_1
///      %c = LOF_on_L1_2
///      DMA(%a, %lhs)
///      DMA(%b, %lhs)
///      DMA(%c, %lhs)
///      DMA(%b, %lhs)
///      DMA(%c, %lhs)
///
/// In the above snippet, assume we want to split %lhs. It has 5 DMA ops, but
/// only 3 of them are unique: (%lhs -> %a), (%lhs -> %b), (%lhs -> %c).
/// Therefore this function returns 3, which the caller uses to derive the
/// split factor.
///
/// Returns failure, after emitting an error on the offending copy op, if the
/// inspected operand of any copy op is not defined by a
/// `LogicalObjectFifoFromMemrefOp`. An empty list yields 0.
template <CopyOpOperateOn OperateOn>
static FailureOr<int64_t> fetchTotalUniqueLogicalObjFifoUsers(
    ArrayRef<CopyOpInterface> copyLikeOps) {
  // Taking an ArrayRef avoids copying the caller's vector; a SmallVector
  // argument (including a temporary) converts to it implicitly.
  DenseSet<Operation *> uniqueLofs;
  for (CopyOpInterface copyOp : copyLikeOps) {
    AMDAIE::LogicalObjectFifoFromMemrefOp lof = nullptr;
    if constexpr (OperateOn == CopyOpOperateOn::Target) {
      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
          copyOp.getTarget().getDefiningOp());
    } else {
      lof = dyn_cast_if_present<AMDAIE::LogicalObjectFifoFromMemrefOp>(
          copyOp.getSource().getDefiningOp());
    }
    if (!lof) {
      return copyOp.emitOpError()
             << "could not retrieve source/target objectFifo";
    }
    // The set de-duplicates on the defining operation, so several copies
    // to/from the same objectFifo are counted once.
    uniqueLofs.insert(lof.getOperation());
  }
  return static_cast<int64_t>(uniqueLofs.size());
}

/// Find the logical objectFifo and DMA source/target splitting dimensions for
/// each DMA and objectFifo pair.
///
Expand All @@ -138,6 +178,9 @@ FailureOr<int64_t> getSplitStride(ArrayRef<AMDAIE::DmaCpyNdOp> dmaOps,
/// that has product size larger than the other side's product size after
/// splitting because that's the number of elements that should be
/// produced/consumed on the respective sides before splitting.
/// Towards the end, fetch the number of unique producers (or consumers) of the
/// objectFifo which will be split. The split factor is the std::gcd of that
/// number and the total no. of columns; if it does not evenly divide both the
/// source and target sizes, the std::gcd of source/target size is used instead.
LogicalResult collectSplittingDims(
const SmallVector<DmaObjFifoPairT> &dmaObjFifoPairs,
DenseMap<AMDAIE::DmaCpyNdOp, DmaSplitInfo> &dmaSplitInfoMap,
Expand Down Expand Up @@ -218,17 +261,31 @@ LogicalResult collectSplittingDims(
// Calculate the new source stride to be used for splitting the DMA.
int64_t newSourceStride =
splitStride != 1 ? splitDimSize / splitStride : 1;
FailureOr<int64_t> maybeNumUniqueConsumers =
fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Target>(
objFifo.getCopyLikeConsumers());
if (failed(maybeNumUniqueConsumers)) {
objFifo.emitOpError() << "could not retrieve the total number of "
"unique consumer objFifos";
}
int64_t splitFactor = std::gcd(*maybeNumUniqueConsumers, numCols);
int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
int64_t targetSize = (*targetSizes)[targetSplitDim];
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
splitFactor = std::gcd(sourceSize, targetSize);
}
LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "newSourceStride: " << newSourceStride << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "objFifoSplitDim: " << objFifoSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
dmaSplitInfoMap[dmaOp] = {sourceSplitDim, newSourceStride, targetSplitDim,
1, numCols};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
1, splitFactor};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
splitStride};
} else if (dmaOp.getSourceObjectFifo() == objFifo) {
// Find outermost dimension in the access pattern that has stride ==
// sizeAfterSplit and size != 1.
Expand Down Expand Up @@ -274,17 +331,31 @@ LogicalResult collectSplittingDims(
// Calculate the new target stride to be used for splitting the DMA.
int64_t newTargetStride =
splitStride != 1 ? splitDimSize / splitStride : 1;
FailureOr<int64_t> maybeNumUniqueProducers =
fetchTotalUniqueLogicalObjFifoUsers<CopyOpOperateOn::Source>(
objFifo.getCopyLikeProducers());
if (failed(maybeNumUniqueProducers)) {
objFifo.emitOpError() << "could not retrieve the total number of "
"unique producer objFifos";
}
int64_t splitFactor = std::gcd(*maybeNumUniqueProducers, numCols);
int64_t sourceSize = (*sourceSizes)[sourceSplitDim];
int64_t targetSize = (*targetSizes)[targetSplitDim];
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
splitFactor = std::gcd(sourceSize, targetSize);
}
LLVM_DEBUG(llvm::dbgs() << "sourceSplitDim: " << sourceSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "targetSplitDim: " << targetSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "newTargetStride: " << newTargetStride << "\n");
LLVM_DEBUG(llvm::dbgs()
<< "objFifoSplitDim: " << objFifoSplitDim << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitStride: " << splitStride << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << numCols << "\n");
LLVM_DEBUG(llvm::dbgs() << "splitFactor: " << splitFactor << "\n");
dmaSplitInfoMap[dmaOp] = {sourceSplitDim, 1, targetSplitDim,
newTargetStride, numCols};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, numCols, splitStride};
newTargetStride, splitFactor};
objFifoSplitInfoMap[objFifo] = {objFifoSplitDim, splitFactor,
splitStride};
}
}
return success();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -755,7 +755,7 @@ LogicalResult splitLogicalObjectFifo(IRRewriter &rewriter,
LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim, size_t targetSplitDim,
std::optional<size_t> maybeSplitFactor,
int64_t splitFactor,
int64_t sourceSplitStride,
int64_t targetSplitStride) {
if (!op->use_empty())
Expand Down Expand Up @@ -800,15 +800,6 @@ LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
}
int64_t sourceSize = maybeSourceSize.value();
int64_t targetSize = maybeTargetSize.value();
int64_t splitFactor = maybeSplitFactor.has_value()
? maybeSplitFactor.value()
: std::gcd(sourceSize, targetSize);
if (sourceSize % splitFactor != 0 || targetSize % splitFactor != 0) {
int64_t newSplitFactor = std::gcd(sourceSize, targetSize);
LLVM_DEBUG(llvm::dbgs() << "split factor has been changed from "
<< splitFactor << " to " << newSplitFactor);
splitFactor = newSplitFactor;
}

int64_t newSourceSize = sourceSize / splitFactor;
int64_t newTargetSize = targetSize / splitFactor;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,14 @@ LogicalResult splitLogicalObjectFifo(
int64_t splitStride = 1);

/// Split doubly strided operations on a source and target split dimension with
/// the provided split factor. If no split factor is provided, the doubly
/// strided operation will be split on the size of the dimension being split.
LogicalResult splitDoublyStridedOp(
IRRewriter &rewriter, AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim = 0, size_t targetSplitDim = 0,
std::optional<size_t> splitFactor = std::nullopt,
int64_t sourceSplitStride = 1, int64_t targetSplitStride = 1);
/// the provided split factor.
LogicalResult splitDoublyStridedOp(IRRewriter &rewriter,
AMDAIE::DoublyStridedOpInterface op,
size_t sourceSplitDim = 0,
size_t targetSplitDim = 0,
int64_t splitFactor = 1,
int64_t sourceSplitStride = 1,
int64_t targetSplitStride = 1);

} // namespace mlir::iree_compiler::AMDAIE

Expand Down
Loading

0 comments on commit ddcbe0c

Please sign in to comment.