diff --git a/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
index d9968222c..6bf43c709 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aievec/CMakeLists.txt
@@ -83,6 +83,9 @@ iree_cc_library(
     MLIREmitCDialect
     ::AIEVecDialectIR
     ::AIEVecXLLVMOpsGen
+    iree-amd-aie::aie_runtime::iree_aie_runtime_static
+    iree::compiler::Dialect::HAL::IR
+    iree::compiler::Dialect::HAL::IR::HALDialect
 )
 
 add_subdirectory(test)
diff --git a/compiler/plugins/target/AMD-AIE/aievec/Passes.h b/compiler/plugins/target/AMD-AIE/aievec/Passes.h
index c1dabcf67..01b7d9232 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/Passes.h
+++ b/compiler/plugins/target/AMD-AIE/aievec/Passes.h
@@ -22,15 +22,17 @@ namespace mlir::iree_compiler::aievec {
 
 /**
- * Append pass(es) for canonicalizing operations in the vector dialect to a form
+ * Append passes for canonicalizing operations in the vector dialect to a form
  * that can be lowered to the AIEVec dialect.
  */
 void buildCanonicalizeVectorForAIEVec(mlir::OpPassManager &);
 
 /**
- * A pass containing patterns for canonicalizing operations in the vector
+ * A pass containing some patterns for canonicalizing operations in the vector
  * dialect to a form that can be lowered to the AIEVec dialect. This pass is
- * named `canonicalize-vector-for-aievec`.
+ * named `canonicalize-vector-for-aievec`. To ensure all required vector dialect
+ * canonicalizations take place, PassManagers should use
+ * `buildCanonicalizeVectorForAIEVec`.
  */
 std::unique_ptr<::mlir::Pass> createCanonicalizeVectorForAIEVecPass();
 
@@ -39,6 +41,54 @@ std::unique_ptr<::mlir::Pass> createCanonicalizeVectorForAIEVecPass();
  */
 void registerCanonicalizeVectorForAIEVecPass();
 
+/**
+ * This pass ensures that reads from AIE tile memory are aligned according to
+ * hardware constraints. For example, suppose we have 128 bytes in tile memory,
+ * represented in hex as:
+ *
+ *     0x00 0x01 ... 0x7E 0x7F
+ *
+ * On AIE-2, the (vector) read instructions from the tile memory into registers
+ * must be aligned to 256 bits (32 bytes). So if we want to read 64 bytes
+ * starting from 0x00 that is fine, but if we want to read 64 bytes starting
+ * from 0x01, then we cannot use a vector read instruction directly. To work
+ * around this constraint, we do the following:
+ *
+ * 1. Perform a wider read that loads 128 bytes (2x as many as we want),
+ *    starting from 0x00, into a larger register. That is, bytes 0x00-0x7F are
+ *    loaded, so we have 1 'junk' byte at the beginning and 63 'junk' bytes at
+ *    the end.
+ *
+ * 2. Extract the target bytes 0x01 ... 0x40 from the larger register into a
+ *    smaller register in 2 steps, using 2 AIE-specific instructions:
+ *
+ *    a) Extract:
+ *       https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorconv__elem.html
+ *
+ *    b) Shift:
+ *       https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
+ *
+ * First, we use the extract instruction to split the 128 bytes read into two
+ * halves, 0x00-0x3F and 0x40-0x7F, each in its own 64-byte register. Then, we
+ * use a shift operation to combine the upper 63 bytes from the first half
+ * and the lower 1 byte from the second half into a new 64-byte register.
+ * This new register contains exactly the 64 bytes we want to read, starting
+ * from 0x01.
+ *
+ * If we want to read 32 bytes starting from 0x01, we can use a similar
+ * approach. The only consideration is that the shift operation requires
+ * 64-byte inputs, so the order of the shift and extracts is reversed.
+ *
+ * We do not currently support unaligned reads of vectors which are not
+ * 32 bytes or 64 bytes in length.
+ *
+ * TODO(newling) use this same approach to align writes to unaligned memory.
+ */
+
+std::unique_ptr<::mlir::Pass> createAlignTransferReadsPass();
+
+void registerAlignTransferReadsPass();
+
 /**
  * Append pass(es) for lowering operations in the vector dialect to the AIEVec
  * dialect. Vector dialect ops are expected to be in a canonical form
@@ -48,7 +98,7 @@ void buildLowerVectorToAIEVec(mlir::OpPassManager &pm);
 
 /**
  * A pass containing patterns for lowering operations in the vector dialect to
- * the AIEVec dialect. The pass is currently named `test-lower-vector-to-aievec`.
+ * the AIEVec dialect. The pass is currently named `test-lower-vector-to-aievec`
  */
 static std::unique_ptr<::mlir::Pass> createLowerVectorToAIEVec();
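The recombination described in the comment above can be sanity-checked with a scalar model. Below is a minimal standalone sketch (not part of the patch) in which byte arrays stand in for vector registers; `ext` and `shift` are hypothetical stand-ins for the AIE extract and shift intrinsics linked above.

```cpp
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>

// ext(v, i): extract the i-th 64-byte half of a 128-byte register.
static std::array<uint8_t, 64> ext(const std::array<uint8_t, 128> &v,
                                   size_t i) {
  std::array<uint8_t, 64> out;
  for (size_t j = 0; j < 64; ++j) out[j] = v[64 * i + j];
  return out;
}

// shift(lhs, rhs, n): bytes n .. n+63 of the 128-byte concatenation lhs++rhs,
// i.e. the upper 64-n bytes of lhs followed by the lower n bytes of rhs.
static std::array<uint8_t, 64> shift(const std::array<uint8_t, 64> &lhs,
                                     const std::array<uint8_t, 64> &rhs,
                                     size_t n) {
  std::array<uint8_t, 64> out;
  for (size_t j = 0; j < 64; ++j)
    out[j] = (j + n < 64) ? lhs[j + n] : rhs[j + n - 64];
  return out;
}

int main() {
  // 128 bytes of tile memory, mem[i] = i.
  std::array<uint8_t, 128> mem;
  for (size_t i = 0; i < 128; ++i) mem[i] = static_cast<uint8_t>(i);

  // Unaligned request: 64 bytes starting at 0x01. The aligned base is 0x00
  // and the offset is 1, so perform one wide, aligned, 128-byte read ...
  std::array<uint8_t, 128> wide = mem;

  // ... then recombine: upper 63 bytes of the low half, lower 1 byte of the
  // high half.
  std::array<uint8_t, 64> result = shift(ext(wide, 0), ext(wide, 1), 1);

  // result now holds exactly bytes 0x01 .. 0x40.
  for (size_t j = 0; j < 64; ++j) assert(result[j] == j + 1);
  return 0;
}
```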
diff --git a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
index ecb036674..8e7801773 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
+++ b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp
@@ -15,6 +15,9 @@
 #include
 #include "Passes.h"
+#include "aievec/AIEVecOps.h"
+#include "iree-amd-aie/Transforms/AMDAIEUtils.h"
+#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"
 #include "llvm/ADT/STLExtras.h"
 #include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h"
@@ -300,7 +303,6 @@ class FlattenContiguousRowMajorTransferWritePattern
 }  // namespace copied_from_mlir
-
 static bool isGemmBTransposedContractionOp(vector::ContractionOp op) {
   if (op.getKind() != vector::CombiningKind::ADD) return false;
@@ -897,6 +899,7 @@ struct CanonicalizeVectorForAIEVecPass
     populateBubbleSignExtensionsLate(patterns);
     (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
   }
+
   {
     RewritePatternSet patterns(context);
     patterns
@@ -914,6 +917,7 @@ struct CanonicalizeVectorForAIEVecPass
     mlir::vector::populateVectorBroadcastLoweringPatterns(patterns);
     (void)applyPatternsAndFoldGreedily(op, std::move(patterns));
   }
+
   {
     // These must run after 'populateFlattenVectorTransferPatterns' because
     // vector.shape_casts are introduced. Merging into a single pass creates
@@ -925,6 +929,170 @@ struct CanonicalizeVectorForAIEVecPass
   }
 };
 
+/// Returns one of:
+/// 1) failure, if there is definitely an error that should be propagated.
+/// 2) a new transfer_read operation that is sufficiently aligned, if the old
+///    transfer_read is determined to be insufficiently aligned and it is
+///    possible to create a new transfer_read.
+/// 3) the original transfer_read operation, otherwise.
+FailureOr<Value> getAlignedTransferRead(
+    vector::TransferReadOp readOp, IRRewriter &rewriter,
+    const AMDAIE::AMDAIEDeviceModel &deviceModel) {
+  uint32_t vectorLoadStoreAlignmentBits =
+      deviceModel.getVectorLoadStoreAlignmentBits();
+  uint32_t maxVectorSizeBits = deviceModel.getMaxVectorSizeBits();
+  uint32_t shiftOperandBits = deviceModel.getShiftOperandBits();
+
+  // Check that it's not a splat transfer read.
+  if (readOp.getPermutationMap().isConstant()) return readOp.getVector();
+
+  MLIRContext *ctx = readOp.getContext();
+  VectorType shortType = readOp.getVectorType();
+  Location loc = readOp.getLoc();
+  Value padding = readOp.getPadding();
+  ShapedType sourceType = readOp.getSource().getType();
+  Type elementType = shortType.getElementType();
+
+  if (sourceType.getRank() != 1 || shortType.getRank() != 1) {
+    return readOp.emitOpError(
+        "does not have rank-1 source and rank-1 vector type.");
+  }
+
+  uint32_t elementBits = elementType.getIntOrFloatBitWidth();
+  int64_t shortLength = shortType.getShape().back();
+  int64_t shortBits = shortLength * elementBits;
+  uint32_t alignElements = vectorLoadStoreAlignmentBits / elementBits;
+
+  rewriter.setInsertionPoint(readOp);
+
+  AffineMap moduloMap =
+      AffineMap::get(1, 0, getAffineDimExpr(0, ctx) % alignElements);
+
+  Value oldIndex = readOp.getIndices().back();
+
+  Value offset = rewriter.createOrFold<affine::AffineApplyOp>(
+      loc, moduloMap, SmallVector<Value>{oldIndex});
+
+  // If the offset is constant and zero, the read is already aligned.
+  if (auto offsetConstantOp = offset.getDefiningOp<arith::ConstantIndexOp>())
+    if (offsetConstantOp.getValue() == 0) return readOp.getVector();
+
+  // Verify that we can load a vector 2x as long as the original vector.
+  int64_t longBits = 2 * shortBits;
+  int64_t longLength = 2 * shortLength;
+  VectorType longType = VectorType::get(longLength, elementType);
+  if (longBits > maxVectorSizeBits) {
+    // Not returning failure, as it is possible that the read is already
+    // aligned, and we just couldn't prove it.
+    readOp.emitWarning()
+        << "`transfer_read` can't be aligned with a read twice "
+        << "as large because " << longBits
+        << " bits is greater than the maximum vector size of "
+        << maxVectorSizeBits << " bits.";
+
+    return readOp.getVector();
+  }
+
+  SmallVector<bool> inBounds = readOp.getInBoundsValues();
+  bool allInBounds =
+      std::all_of(inBounds.begin(), inBounds.end(), [](bool b) { return b; });
+
+  if (shortBits != shiftOperandBits / 2 && shortBits != shiftOperandBits) {
+    // Not returning failure, as it is possible that the read is already
+    // aligned, and we just couldn't prove it.
+    readOp.emitWarning() << "`transfer_read` doesn't have a vector with "
+                         << shiftOperandBits / 2 << " or " << shiftOperandBits
+                         << " bits. This case is not currently "
+                         << "handled.";
+    return readOp.getVector();
+  }
+
+  Value newIndex = rewriter.createOrFold<arith::SubIOp>(loc, oldIndex, offset);
+
+  // Create the aligned transfer read for a vector 2x as long that covers the
+  // elements of the unaligned vector.
+  Value longVec = rewriter.create<vector::TransferReadOp>(
+      loc, longType, readOp.getSource(), SmallVector<Value>{newIndex}, padding,
+      SmallVector<bool>{allInBounds});
+
+  Value elementBytes =
+      rewriter.create<arith::ConstantIndexOp>(loc, elementBits / 8);
+
+  Value offsetBytes =
+      rewriter.createOrFold<arith::MulIOp>(loc, offset, elementBytes);
+
+  Value offsetBytes_i32 = rewriter.createOrFold<arith::IndexCastOp>(
+      loc, rewriter.getIntegerType(32), offsetBytes);
+
+  Value replacement;
+  if (shortBits == shiftOperandBits) {
+    // - Extract lower 64 bytes
+    // - Extract upper 64 bytes
+    // - Apply shift to obtain new 64 bytes
+    Value low = rewriter.create<aievec::ExtOp>(loc, shortType, longVec,
+                                               rewriter.getI8IntegerAttr(0));
+    Value upp = rewriter.create<aievec::ExtOp>(loc, shortType, longVec,
+                                               rewriter.getI8IntegerAttr(1));
+    replacement = rewriter.createOrFold<aievec::ShiftOp>(loc, shortType, low,
+                                                         upp, offsetBytes_i32);
+  } else if (shortBits == shiftOperandBits / 2) {
+    // - Apply shift to obtain new 64 bytes, bottom 32 being the required ones
+    // - Extract lower 32 bytes
+    Value shift = rewriter.createOrFold<aievec::ShiftOp>(
+        loc, longType, longVec, longVec, offsetBytes_i32);
+    replacement = rewriter.create<aievec::ExtOp>(loc, shortType, shift,
+                                                 rewriter.getI8IntegerAttr(0));
+  } else {
+    assert(false &&
+           "unreachable: already checked that shortBits is equal to, or half "
+           "of, shiftOperandBits");
+  }
+
+  rewriter.replaceOp(readOp, replacement);
+
+  return replacement;
+}
+
+struct AlignTransferReadsPass
+    : public PassWrapper<AlignTransferReadsPass, OperationPass<>> {
+  StringRef getArgument() const final { return "align-transfer-reads"; }
+
+  StringRef getDescription() const final {
+    return "Align `vector.transfer_read` operations.";
+  }
+
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<affine::AffineDialect, aievec::AIEVecDialect,
+                    arith::ArithDialect, vector::VectorDialect>();
+  }
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+
+    auto targetAttr = IREE::HAL::ExecutableTargetAttr::lookup(op);
+    std::optional<AMDAIE::AMDAIEDevice> maybeDevice =
+        mlir::iree_compiler::AMDAIE::getConfigAMDAIEDevice(targetAttr);
+    if (!maybeDevice) {
+      op->emitOpError()
+          << "has no AMDAIEDevice in the target attribute configuration. This "
+             "device-specific information is required to determine what vector "
+             "sizes and alignments are supported.";
+      return signalPassFailure();
+    }
+    AMDAIE::AMDAIEDeviceModel deviceModel =
+        AMDAIE::getDeviceModel(maybeDevice.value());
+
+    IRRewriter rewriter(&getContext());
+    op->walk([&](vector::TransferReadOp transferReadOp) {
+      if (failed(
+              getAlignedTransferRead(transferReadOp, rewriter, deviceModel))) {
+        signalPassFailure();
+      }
+    });
+  }
+};
+
 struct DetectNonCanonicalOpsPass
     : public PassWrapper<DetectNonCanonicalOpsPass, OperationPass<>> {
   StringRef getArgument() const final {
@@ -943,7 +1111,7 @@ struct DetectNonCanonicalOpsPass
   }
 
   void runOnOperation() override {
-    auto op = getOperation();
+    Operation *op = getOperation();
     MLIRContext *context = &getContext();
     RewritePatternSet patterns(context);
     ConversionTarget target(*context);
@@ -955,8 +1123,8 @@ struct DetectNonCanonicalOpsPass
 };
 
 void buildCanonicalizeVectorForAIEVec(OpPassManager &pm) {
-  // TODO: Add passes to split vectors that won't fit in registers
   pm.addPass(createCanonicalizeVectorForAIEVecPass());
+  pm.addPass(createAlignTransferReadsPass());
   pm.addPass(mlir::createCanonicalizerPass());
   pm.addPass(std::make_unique<DetectNonCanonicalOpsPass>());
 }
@@ -971,4 +1139,14 @@ void registerCanonicalizeVectorForAIEVecPass() {
   });
 }
 
+std::unique_ptr<::mlir::Pass> createAlignTransferReadsPass() {
+  return std::make_unique<AlignTransferReadsPass>();
+}
+
+void registerAlignTransferReadsPass() {
+  ::mlir::registerPass([]() -> std::unique_ptr<Pass> {
+    return std::make_unique<AlignTransferReadsPass>();
+  });
+}
+
 }  // namespace mlir::iree_compiler::aievec
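Read in isolation, the index computation that `getAlignedTransferRead` materializes as `affine.apply`, `arith.subi`, and `arith.muli` ops is a round-down to the alignment boundary plus a byte remainder for the shift. A minimal sketch, assuming the bf16 configuration exercised by the tests below (256-bit alignment, 16-bit elements); the variable names mirror the pass, and the concrete index 40 is arbitrary:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vectorLoadStoreAlignmentBits = 256;
  const uint32_t elementBits = 16;  // bf16
  const uint32_t alignElements =
      vectorLoadStoreAlignmentBits / elementBits;  // 16 elements

  const int64_t oldIndex = 40;                      // unaligned element index
  const int64_t offset = oldIndex % alignElements;  // affine.apply (d0 mod 16) -> 8
  const int64_t newIndex = oldIndex - offset;       // arith.subi -> 32, aligned
  const int64_t offsetBytes =
      offset * elementBits / 8;  // arith.muli -> 16, the aievec.shift amount

  assert(newIndex % alignElements == 0);  // wide read starts on a boundary
  assert(newIndex + offset == oldIndex);  // shifting recovers the request
  assert(offsetBytes == 16);
  return 0;
}
```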
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
index 9560a848f..d9e7c36cb 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/CMakeLists.txt
@@ -1,4 +1,13 @@
-file(GLOB _mlir_files *.mlir)
+set(_mlir_files
+  align-transfer-reads.mlir
+  fold-ops.mlir
+  matmul.mlir
+  precanonicalization-aieml-llvmir.mlir
+  test-mac_elem.mlir
+  test-shuffle.mlir
+  test-srs.mlir
+  test-ups.mlir
+)
 
 iree_lit_test_suite(
   NAME
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir
new file mode 100644
index 000000000..b312b0022
--- /dev/null
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/align-transfer-reads.mlir
@@ -0,0 +1,163 @@
+// RUN: iree-opt %s --align-transfer-reads --verify-diagnostics -split-input-file | FileCheck %s
+
+// check for the affine_map that is used to determine the slice index of the
+// original transfer_read operation:
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0, s1, s2] -> (s0 * 192 + s1 * 48 + s2 * 8)>
+
+// check for the affine_map that is used to round the original slice index down
+// to the nearest multiple of the acceptable alignment (256 bits = 32 bytes =
+// 16 bf16 elements). This map computes the remainder after rounding down.
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 mod 16)>
+
+// CHECK: func.func @test_bf16_0
+// CHECK-DAG: %[[CST:.*]] = arith.constant 0.000000e+00 : bf16
+// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<576xbf16>
+
+// check for the application of MAP0:
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[MAP0]]
+
+// check for the application of MAP1 on the result of MAP0. This is the
+// remainder (number of bf16 elements).
+// CHECK: %[[REM_BF16:.*]] = affine.apply #[[MAP1]](%[[APPLY0]])
+
+// get the index of the new transfer_read. This is the original index, minus the remainder:
+// CHECK: %[[NEW_INDEX:.*]] = arith.subi %[[APPLY0]], %[[REM_BF16]]
+
+// the new transfer_read (to a vector of 64 bf16s).
+// CHECK: %[[NEW_READ:.*]] = vector.transfer_read
+// CHECK-SAME: %[[ALLOC]][%[[NEW_INDEX]]], %[[CST]]
+// CHECK-SAME: {in_bounds = [true]}
+// CHECK-SAME: memref<576xbf16>, vector<64xbf16>
+
+// compute the remainder in bytes. This is needed in aievec.shift.
+// CHECK: %[[BYTES_PER_BF16:.*]] = arith.constant 2 : index
+// CHECK: %[[REM_BYTES_INDEX:.*]] = arith.muli %[[REM_BF16]], %[[BYTES_PER_BF16]] : index
+// CHECK: %[[REM_BYTES_I32:.*]] = arith.index_cast %[[REM_BYTES_INDEX]] : index to i32
+
+// the extraction ops that copy the lower and upper halves of the 1024-bit
+// vector to two 512-bit vectors.
+// CHECK-DAG: %[[EXT0:.*]] = aievec.ext %[[NEW_READ]] {index = 0 : i8} : vector<64xbf16>, vector<32xbf16>
+// CHECK-DAG: %[[EXT1:.*]] = aievec.ext %[[NEW_READ]] {index = 1 : i8} : vector<64xbf16>, vector<32xbf16>
+
+// the shift op, which concatenates the upper bytes of EXT0 with the lower
+// bytes of EXT1.
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[EXT0]], %[[EXT1]], %[[REM_BYTES_I32]]
+// CHECK-SAME: {isAcc = false} : vector<32xbf16>, vector<32xbf16>, i32, vector<32xbf16>
+
+#map = affine_map<()[s0, s1, s2] -> (s0 * 192 + s1 * 48 + s2 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_bf16_0(%arg0: index, %arg1: index, %arg2: index) -> vector<32xbf16> {
+  %cst = arith.constant 0.000000e+00 : bf16
+  %alloc = memref.alloc() : memref<576xbf16>
+  %0 = affine.apply #map()[%arg0, %arg2, %arg1]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xbf16>, vector<32xbf16>
+  return %1 : vector<32xbf16>
+}
+}
+
+
+// -----
+
+// An equivalent test to the above, but this time with i8 elements.
+
+// CHECK-DAG: #[[MAP0:.*]] = affine_map<()[s0] -> (s0 * 8)>
+// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (d0 mod 32)>
+// CHECK: func.func @test_i8_64bytes
+// CHECK-DAG: %[[C0_I8:.*]] = arith.constant 0 : i8
+// CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<576xi8>
+// CHECK: %[[APPLY0:.*]] = affine.apply #[[MAP0]]
+// CHECK: %[[REM:.*]] = affine.apply #[[MAP1]](%[[APPLY0]])
+// CHECK: %[[NEW_INDEX:.*]] = arith.subi %[[APPLY0]], %[[REM]]
+// CHECK: %[[NEW_READ:.*]] = vector.transfer_read
+// CHECK-SAME: %[[ALLOC]][%[[NEW_INDEX]]], %[[C0_I8]]
+// CHECK-SAME: {in_bounds = [true]}
+// CHECK-SAME: memref<576xi8>, vector<128xi8>
+// CHECK-DAG: %[[EXT0:.*]] = aievec.ext %[[NEW_READ]] {index = 0 : i8} : vector<128xi8>, vector<64xi8>
+// CHECK-DAG: %[[EXT1:.*]] = aievec.ext %[[NEW_READ]] {index = 1 : i8} : vector<128xi8>, vector<64xi8>
+// CHECK-DAG: %[[REM_I32:.*]] = arith.index_cast %[[REM]] : index to i32
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[EXT0]], %[[EXT1]], %[[REM_I32]]
+// CHECK-SAME: {isAcc = false} : vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
+// CHECK: return %[[SHIFT]] : vector<64xi8>
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_64bytes(%arg0: index) -> vector<64xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<64xi8>
+  return %1 : vector<64xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector only has 32 i8s
+// (32 bytes). In this case, the order of the extraction op and shift op is
+// reversed, because shift expects 64-byte input vectors.
+
+// CHECK-LABEL: func.func @test_i8_32bytes
+// CHECK: %[[READ:.*]] = vector.transfer_read
+// CHECK: %[[SHIFT:.*]] = aievec.shift %[[READ]], %[[READ]]
+// CHECK-SAME: vector<64xi8>, vector<64xi8>, i32, vector<64xi8>
+// CHECK: %[[EXT:.*]] = aievec.ext %[[SHIFT]] {index = 0 : i8}
+// CHECK-SAME: vector<64xi8>, vector<32xi8>
+// CHECK: return %[[EXT]] : vector<32xi8>
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_32bytes(%arg0: index) -> vector<32xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<32xi8>
+  return %1 : vector<32xi8>
+}
+}
+
+// -----
+
+#map = affine_map<()[s0] -> (s0 * 8)>
+// expected-error @+1 {{'builtin.module' op has no AMDAIEDevice in the target attribute configuration.}}
+module {
+func.func @test_i8_32bytes(%arg0: index) -> vector<32xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  %0 = affine.apply #map()[%arg0]
+  %1 = vector.transfer_read %alloc[%0], %cst {in_bounds = [true]} : memref<576xi8>, vector<32xi8>
+  return %1 : vector<32xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector only has 16 i8s
+// (16 bytes). We currently don't support this.
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_16bytes(%arg0: index) -> vector<16xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  // expected-warning @+1 {{`transfer_read` doesn't have a vector with 256 or 512 bits. This case is not currently handled}}
+  %1 = vector.transfer_read %alloc[%arg0], %cst {in_bounds = [true]} : memref<576xi8>, vector<16xi8>
+  return %1 : vector<16xi8>
+}
+}
+
+// -----
+
+// An equivalent test to the above, but this time the vector has 128 bytes.
+
+#executable_target_amdaie_xclbin_fb = #hal.executable.target<"amd-aie", "amdaie-xclbin-fb", {target_device = "npu1_4col", ukernels = "none"}>
+module attributes {hal.executable.target = #executable_target_amdaie_xclbin_fb} {
+func.func @test_i8_128bytes(%arg0: index) -> vector<128xi8> {
+  %cst = arith.constant 0 : i8
+  %alloc = memref.alloc() : memref<576xi8>
+  // expected-warning @+1 {{`transfer_read` can't be aligned with a read twice as large because 2048 bits is greater than the maximum vector size of 1024 bits.}}
+  %1 = vector.transfer_read %alloc[%arg0], %cst {in_bounds = [true]} : memref<576xi8>, vector<128xi8>
+  return %1 : vector<128xi8>
+}
+}
+
+
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
index 9fe88cd64..1e6a56605 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/PluginRegistration.cpp
@@ -35,6 +35,7 @@ struct AMDAIESession
     AMDAIE::registerAIRConversionPasses();
     AMDAIE::registerAIRTransformPasses();
     aievec::registerConvertAIEVecToLLVMPass();
+    aievec::registerAlignTransferReadsPass();
     aievec::registerCanonicalizeVectorForAIEVecPass();
     aievec::registerLowerVectorToAIEVecPass();
   }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 481591212..d30d93e3d 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -479,8 +479,6 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(mlir::createLinalgFoldUnitExtentDimsPass(opts));
 
   // Vectorization passes
-  // FIXME(newling) https://github.com/nod-ai/iree-amd-aie/issues/820
-  enableVectorizationPasses = false;
   appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses);
   funcPassManager.addPass(createCanonicalizerPass());
 
diff --git a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
index e9a227d25..b00c184e7 100644
--- a/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
+++ b/runtime/src/iree-amd-aie/aie_runtime/iree_aie_runtime.h
@@ -17,11 +17,13 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
 #include "macros.h"
+
 // clang-format off
 #include "iree-amd-aie/aie_runtime/AMDAIEEnums.h"
 // clang-format on
 
 extern "C" {
+
 #include "xaie_hwcfg.h"
 #include "xaiengine.h"
 #include "xaiengine/xaie_device_aieml.h"
@@ -204,7 +206,9 @@ enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 };
  * This struct is meant to be a thin wrapper around aie-rt, which provides
  * the canonical representation/metadata for AIE devices; attributes like number
 * of locks, bds per tile, whether certain switch connections are legal or not,
- * etc.
+ * etc. In addition, this struct is meant to contain generation-specific AIE
+ * VLIW processor constants, such as the sizes of vectors supported for
+ * load/store/matmul, etc.
  *
  * This representation is parameterized by platform specific constants
  * (BASE_ADDR, COL/ROW shift, NUM_MEM_TILE_ROWS, etc.) which are available in
@@ -219,8 +223,9 @@ enum class AIEArch : uint8_t { AIE1 = 1, AIE2 = 2 };
  */
 struct AMDAIEDeviceModel {
   /// Contains additional device config parameters that can't be retrieved from
-  /// aie-rt for whatever reason. Make sure the parameters can't be retrieved in
-  /// another way before adding new fields to this struct.
+  /// aie-rt or elsewhere for whatever reason. Make sure the parameters can't be
+  /// retrieved in another way before adding new fields to this struct.
+
   struct AMDAIEDeviceConfig {
     /// Set default minimum stride bitwidth/addressing granularity to 32 bits as
     /// this is the value for all current architecture versions.
@@ -234,8 +239,26 @@ struct AMDAIEDeviceModel {
     uint8_t streamSwitchMemTileMSelMax{0};
     uint8_t streamSwitchShimArbiterMax{0};
     uint8_t streamSwitchShimMSelMax{0};
+
+    //////////////////////////////
+    // VLIW processor constants //
+    //////////////////////////////
+    /// The number of bits that L1 memory must be aligned by in order to be
+    /// loaded/stored into a register with a vector instruction. See for
+    /// example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__intr__loadstore.html
+    uint32_t vectorLoadStoreAlignmentBits{256};
+    /// The largest vector size supported. See for example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__group__datatype__vector.html
+    uint32_t maxVectorSizeBits{1024};
+    /// The number of bits that each of the two vector operands of the shift
+    /// intrinsic must have. See for example:
+    /// https://www.xilinx.com/htmldocs/xilinx2024_1/aiengine_ml_intrinsics/intrinsics/group__intr__gpvectorop__shift.html
+    uint32_t shiftOperandBits{512};
+
     AMDAIEDeviceConfig() = default;
   };
+
   XAie_Config configPtr;
   XAie_DevInst devInst;
   AMDAIEDeviceConfig deviceConfig;
@@ -328,6 +351,14 @@ struct AMDAIEDeviceModel {
   uint8_t getStreamSwitchArbiterMax(uint8_t col, uint8_t row) const;
   uint8_t getStreamSwitchMSelMax(uint8_t col, uint8_t row) const;
 
+  uint32_t getVectorLoadStoreAlignmentBits() const {
+    return deviceConfig.vectorLoadStoreAlignmentBits;
+  }
+
+  uint32_t getMaxVectorSizeBits() const {
+    return deviceConfig.maxVectorSizeBits;
+  }
+
+  uint32_t getShiftOperandBits() const { return deviceConfig.shiftOperandBits; }
+
   /// Return a map from channels to valid BD ids for the requested tile type.
   /// TODO(jornt): find these ranges in the device model.
   DenseMap> getChannelToValidBdIds(
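For context, a sketch of how client code might consume the new device-model getters; it mirrors the checks in `getAlignedTransferRead` above. `canRealign` is a hypothetical helper written for illustration (it is not part of the patch), and the namespace alias and include path follow the conventions used elsewhere in this diff.

```cpp
#include <cstdint>

#include "iree-amd-aie/aie_runtime/iree_aie_runtime.h"

namespace AMDAIE = mlir::iree_compiler::AMDAIE;

/// Returns true if a rank-1 vector of `numElements` elements, each of
/// `elementBits` bits, can be realigned with the wide-read + extract + shift
/// strategy implemented by the align-transfer-reads pass.
bool canRealign(const AMDAIE::AMDAIEDeviceModel &deviceModel,
                uint32_t numElements, uint32_t elementBits) {
  uint32_t shortBits = numElements * elementBits;
  uint32_t shiftBits = deviceModel.getShiftOperandBits();
  // The doubled (2x) read must fit in the largest vector register, and the
  // requested vector must be exactly one shift operand, or half of one.
  return 2 * shortBits <= deviceModel.getMaxVectorSizeBits() &&
         (shortBits == shiftBits || shortBits == shiftBits / 2);
}
```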