Skip to content

Commit

Permalink
fix buffer descriptor ID allocator; improve stride/wrap verification
Browse files Browse the repository at this point in the history
  • Loading branch information
andrej committed Aug 5, 2024
1 parent 3a581fa commit 50db8c5
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 25 deletions.
4 changes: 2 additions & 2 deletions include/aie/Dialect/AIE/IR/AIETargetModel.h
Original file line number Diff line number Diff line change
Expand Up @@ -333,9 +333,9 @@ class AIE2TargetModel : public AIETargetModel {
return true;
} else {
if ((channel & 1) == 0) { // even channel number
return channel < 24;
return bd_id < 24;
} else {
return channel >= 24;
return bd_id >= 24;
}
}
}
Expand Down
10 changes: 5 additions & 5 deletions lib/Dialect/AIE/Transforms/AIEAssignBufferDescriptorIDs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,14 @@ BdIdGenerator::BdIdGenerator(int col, int row,

std::optional<uint32_t> BdIdGenerator::nextBdId(int channelIndex) {
uint32_t bd_id = 0;
uint32_t max_bd_id = targetModel.getNumBDs(col, row) - 1;
// Find the next free BD ID. This is not an efficient algorithm, but doesn't
// need to be since BD IDs are small numbers.
// FIXME: Specify WireBundle
for (; bdIdAlreadyAssigned(bd_id) ||
!targetModel.isBdChannelAccessible(col, row, bd_id, channelIndex);
for (; (bdIdAlreadyAssigned(bd_id) ||
!targetModel.isBdChannelAccessible(col, row, bd_id, channelIndex)) &&
bd_id <= max_bd_id;
bd_id++)
;
if (bd_id >= targetModel.getNumBDs(col, row)) {
if (bd_id > max_bd_id) {
return std::nullopt;
}
assignBdId(bd_id);
Expand Down
50 changes: 43 additions & 7 deletions lib/Dialect/AIEX/IR/AIEXDialect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,16 +83,24 @@ getHardwareStridesWraps(const AIE::AIETargetModel &targetModel,
llvm::SmallVector<int64_t, 4> sizes = llvm::SmallVector<int64_t, 4>(4, 0);

if (inputSizes[0] == 0) {
// Illegal input, this won't transfer anything at all.
// Leave it to the verification functions to complain to the user.
return std::make_pair(sizes, strides);
}

// d0_size, d0_stride
sizes[0] = inputSizes[0] * elemWidth / addressGranularity;
strides[0] = inputStrides[0] * elemWidth / addressGranularity - 1;
if (strides[0] < 0) {
// For the case that stride < addressGranularity, but size >
// addressGranularity, we can allow this. This is verified in verify().
if (inputStrides[0] * elemWidth < addressGranularity) {
// While the hardware cannot transfer less than addressGranularity bits at
// a time, the user may expresses a contiguous transfer of multiple
// elements with a stride smaller than addressGranularity. We can thus set
// the stride to 1 (encoded in hardware as 0) here to allow such transfers.
// The verification function should ensure that
// inputStrides[0] * elemWidth < addressGranularity
// iff. inputSize[0] * elemWidth > addressGranularity.
strides[0] = 0;
} else {
strides[0] = inputStrides[0] * elemWidth / addressGranularity - 1;
}

// d1_size, d1_stride
Expand All @@ -110,10 +118,18 @@ getHardwareStridesWraps(const AIE::AIETargetModel &targetModel,
}

// iteration_size, iteration_stride
// strides[3] doesn't need to lower to hardware if sizes[3] is one
if (inputSizes[3] > 1) {
// Stride only matters if we have more than one iteration.
sizes[3] = inputSizes[3] - 1;
strides[3] = inputStrides[3] * elemWidth / addressGranularity - 1;
// Note that the iteration_stride must be positive, just like the other
// dimensions. However, one can encode a zero-stride "repeat" of the same
// transfer by setting a positive repeat_count on the pushToQueue instr,
// and setting the size here to 1. This causes the BD to "wrap" at every
// single iteration, effectively never adding the specified stride, in turn
// equalling a repeat without stride.
if (inputStrides[3] > 0) {
strides[3] = inputStrides[3] * elemWidth / addressGranularity - 1;
}
}

return std::make_pair(sizes, strides);
Expand All @@ -130,6 +146,26 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
auto addressGranularity = targetModel.getAddressGenGranularity();
auto elemWidth = referencedBufType.getElementTypeBitWidth();

for (int i = 0; i < 4; i++) {
if (inputSizes[i] <= 0) {
return forOp->emitOpError("Size ") << i << " must be a positive integer.";
}
}
for (int i = 0; i < 3; i++) {
if (inputSizes[i] > 1 && inputStrides[i] < 1) {
// If inputSize[i] == 1, anything is allowable in the stride, since that
// stride will never be applied. For any larger size, we must verify that
// the stride is positive.
return forOp->emitOpError("Stride ")
<< i << " must be a postive integer.";
}
}
// A value of zero is allowable for the fourth-dimension stride, as such a
// "repeat" can be accomplished by setting size==1 and repeat_count=size.
if (inputSizes[3] > 1 && inputStrides[3] < 0) {
return forOp->emitOpError("Stride 3 must be a non-negative integer.");
}

for (int i = 0; i < 4; i++) {
// strides[0] == 1 is ok iff the tranfer size is a multiple of
// addressGranularity, which is checked below
Expand Down Expand Up @@ -186,7 +222,7 @@ verifyStridesWraps(mlir::Operation *forOp, mlir::MemRefType referencedBufType,
"] range.");
// strides[3] exceeding the range is ok iff the sizes[3] is one, which is
// checked below
if (hardwareStrides[3] > (1 << step_bits) && hardwareSizes[3] != 1)
if (hardwareStrides[3] > (1 << step_bits) && hardwareSizes[3] > 0)

Check warning on line 225 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 225 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 225 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 225 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
return forOp->emitOpError("Stride 3 exceeds the [1:" +
std::to_string(1 << step_bits) + "] range.");
if (hardwareStrides[2] > (1 << step_bits))

Check warning on line 228 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 228 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=ON rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 228 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=ON

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)

Check warning on line 228 in lib/Dialect/AIEX/IR/AIEXDialect.cpp

View workflow job for this annotation

GitHub Actions / windows-2019 msvc assert=OFF rtti=OFF

'<<': result of 32-bit shift implicitly converted to 64 bits (was 64-bit shift intended?)
Expand Down
26 changes: 15 additions & 11 deletions lib/Dialect/AIEX/Transforms/AIEDmaToNpu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,14 +353,21 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {
// d2_stride
d2_stride = IntegerAttr::get(i32ty, strides[2]);

// iteration_current

// iteration_size
// strides[3] doesn't need to lower to hardware if sizes[3] is one
iteration_size = IntegerAttr::get(i32ty, sizes[3]);

// iteration_stride
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
// iteration_current, iteration_size, iteration_stride, repeat_count
if (inputSizes[3] > 1) {
if (inputStrides[3] > 0) {
iteration_size = IntegerAttr::get(i32ty, sizes[3]);
iteration_stride = IntegerAttr::get(i32ty, strides[3]);
} else {
// We allow users to encode the repeat_count as a dimension 3 stride of
// 0. This must lower to a iteration wrap of 0, so no stride is ever
// added. We then repeat the BD using the repeat_count in
// NpuPushQueueOp.
iteration_size = zero;
iteration_stride = zero;
}
}
repeat_count = IntegerAttr::get(i32ty, sizes[3]);

// next_bd

Expand All @@ -379,9 +386,6 @@ struct DmaToNpuPattern : OpConversionPattern<NpuDmaMemcpyNdOp> {

// lock_acq_id

// repeat_count
repeat_count = IntegerAttr::get(i32ty, sizes[3]);

// Set the issue_token
issue_token = BoolAttr::get(ctx, op.getIssueToken());
// Earlier, all S2MM channels were implicitly assumed to issue a token.
Expand Down

0 comments on commit 50db8c5

Please sign in to comment.