From 0b2030a23f1c17e003ca5f283088245b0b5465cf Mon Sep 17 00:00:00 2001 From: makslevental Date: Sun, 16 Jun 2024 14:13:58 -0500 Subject: [PATCH] refactor --- .../target/AMD-AIE/aie/AIEDmaToNpu.cpp | 298 ++++++++++++++---- .../plugins/target/AMD-AIE/aie/AIEPass.cpp | 208 ++---------- .../plugins/target/AMD-AIE/aie/AIETargets.cpp | 211 ------------- .../plugins/target/AMD-AIE/aie/AIETargets.h | 3 - compiler/plugins/target/AMD-AIE/aie/Passes.h | 6 - .../plugins/target/AMD-AIE/aie/XCLBinGen.cpp | 43 +-- .../aie_passes/aiex_standard_lowering.mlir | 22 -- .../AMD-AIE/aie/aie_passes/push_to_queue.mlir | 4 +- 8 files changed, 299 insertions(+), 496 deletions(-) delete mode 100644 compiler/plugins/target/AMD-AIE/aie/aie_passes/aiex_standard_lowering.mlir diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEDmaToNpu.cpp b/compiler/plugins/target/AMD-AIE/aie/AIEDmaToNpu.cpp index 30377caf4..cbe4c906a 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEDmaToNpu.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIEDmaToNpu.cpp @@ -1,4 +1,5 @@ -//===- AIEDmaToNpu.cpp ------------------------------------------*- C++ -*-===// +//===- AMDAIEDmaToNpu.cpp ------------------------------------------*- C++ +//-*-===// // // This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,37 +8,210 @@ // (c) Copyright 2023 Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// +//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2023 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include +#include "AIETargets.h" #include "Passes.h" #include "aie/Dialect/AIE/IR/AIEDialect.h" #include "aie/Dialect/AIEX/IR/AIEXDialect.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Format.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Pass/Pass.h" +#include "mlir/Tools/mlir-translate/MlirTranslateMain.h" #include "mlir/Transforms/DialectConversion.h" using namespace mlir; using namespace xilinx; +using namespace xilinx::AIE; using namespace xilinx::AIEX; +#define TXN_OPC_WRITE 0x0 +#define TXN_OPC_BLOCKWRITE 0x1 +#define TXN_OPC_TCT 0x80 +#define TXN_OPC_DDR_PATCH 0x81 + namespace { +// Example: +// - instructions = {3,4,5} +// - tailSize = 2 +// instructions becomes {3,4,5,0,0} and +// a mutable reference to the tail {0,0} is returned. +llvm::MutableArrayRef reserveAndGetTail( + std::vector &instructions, uint64_t tailSize) { + auto oldSize = instructions.size(); + auto newSize = oldSize + tailSize; + instructions.resize(newSize, 0); + return llvm::MutableArrayRef(instructions.data() + oldSize, + tailSize); +} + +void appendSync(std::vector &instructions, NpuSyncOp op) { + auto words = reserveAndGetTail(instructions, 4); + + // XAIE_IO_CUSTOM_OP_TCT + words[0] = TXN_OPC_TCT; + + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[2] |= static_cast(op.getDirection()) & 0xff; + words[2] |= (op.getRow() & 0xff) << 8; + words[2] |= (op.getColumn() & 0xff) << 16; + + words[3] |= (op.getRowNum() & 0xff) << 8; + words[3] |= (op.getColumnNum() & 0xff) << 16; + words[3] |= (op.getChannel() & 0xff) << 24; +} + +void appendWrite32(std::vector &instructions, NpuWrite32Op op) { + auto words = reserveAndGetTail(instructions, 6); + const AIETargetModel &tm = op->getParentOfType().getTargetModel(); + + // XAIE_IO_WRITE + words[0] = TXN_OPC_WRITE; + words[1] = 0; + words[2] = op.getAddress(); + auto col = op.getColumn(); + auto row = op.getRow(); + if (col && row) + words[2] = ((*col & 0xff) << tm.getColumnShift()) | + ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF); + words[3] = 0; + words[4] = op.getValue(); // Value + words[5] = words.size() * sizeof(uint32_t); // Operation Size +} + +void appendAddressPatch(std::vector &instructions, + NpuAddressPatchOp op) { + auto words = reserveAndGetTail(instructions, 12); + + // XAIE_IO_CUSTOM_OP_DDR_PATCH + words[0] = TXN_OPC_DDR_PATCH; + words[1] = words.size() * sizeof(uint32_t); // Operation Size + + words[6] = op.getAddr(); + words[7] = 0; + + words[8] = op.getArgIdx(); + words[9] = 0; + + words[10] = op.getArgPlus(); + words[11] = 0; +} + +void appendWriteBdShimTile(std::vector &instructions, + NpuWriteBdOp op) { + auto words = reserveAndGetTail(instructions, 12); + const AIETargetModel &tm = op->getParentOfType().getTargetModel(); + + // XAIE_IO_BLOCKWRITE + words[0] = TXN_OPC_BLOCKWRITE; + words[1] = 0; + + // RegOff + auto bd_id = op.getBdId(); + uint32_t bd_addr = (op.getColumn() << tm.getColumnShift()) | + (op.getRow() << tm.getRowShift()) | + (0x1D000 + bd_id * 0x20); + words[2] = bd_addr; // ADDR + words[3] = words.size() * sizeof(uint32_t); // Operation Size + + // DMA_BDX_0 + words[4] = op.getBufferLength(); + + // DMA_BDX_1 + words[5] = op.getBufferOffset(); + + // DMA_BDX_2 + // En Packet , OoO BD ID , Packet ID , Packet Type + words[6] |= (op.getEnablePacket() & 0x1) << 30; + words[6] |= (op.getOutOfOrderId() & 0x3f) << 24; + words[6] |= (op.getPacketId() & 0x1f) << 19; + words[6] |= (op.getPacketType() & 0x7) << 16; + + // DMA_BDX_3 + // TODO: Secure Access + words[7] |= (op.getD0Size() & 0x3ff) << 20; + words[7] |= op.getD0Stride() & 0xfffff; + + // DMA_BDX_4 + words[8] = 0x80000000; // burst length; + words[8] |= (op.getD1Size() & 0x3ff) << 20; + words[8] |= op.getD1Stride() & 0xfffff; + + // DMA_BDX_5 + // TODO: SIMID, AxCache, AXQoS + words[9] = op.getD2Stride() & 0xfffff; + + // DMA_BDX_6 + words[10] |= (op.getIterationCurrent() & 0x3f) << 26; + words[10] |= (op.getIterationSize() & 0x3f) << 20; + words[10] |= op.getIterationStride() & 0xfffff; + + // DMA_BDX_7 + // TODO: TLAST Suppress + words[11] |= (op.getNextBd() & 0xf) << 27; + words[11] |= (op.getUseNextBd() & 0x1) << 26; + words[11] |= (op.getValidBd() & 0x1) << 25; + words[11] |= (op.getLockRelVal() & 0xef) << 18; + words[11] |= (op.getLockRelId() & 0xf) << 13; + words[11] |= (op.getLockAcqEnable() & 0x1) << 12; + words[11] |= (op.getLockAcqVal() & 0xef) << 5; + words[11] |= op.getLockAcqId() & 0xf; +} + +} // namespace + +template +class ConvertNpuOp : public OpConversionPattern { + public: + std::vector &instructions; + using AppenderTy = function_ref &, SourceOp)>; + AppenderTy appender; + uint32_t &count; + ConvertNpuOp(MLIRContext *ctx, std::vector &instructions, + AppenderTy appender, uint32_t &count) + : OpConversionPattern(ctx), + instructions(instructions), + appender(appender), + count(count) {} + + LogicalResult matchAndRewrite( + SourceOp op, typename SourceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + appender(instructions, op); + rewriter.eraseOp(op); + count++; + return success(); + } +}; + +namespace { // Helper class to get a ShimDMAAllocationOp for a given // pair. An object of this class is invalidated if, for any symbol_name, a -// ShimDMAAllocationOp that uses it changes, as the cache is not updated in this -// case. +// ShimDMAAllocationOp that uses it changes, as the cache is not updated in +// this case. struct ShimDMAllocationGetter { - public: - // Return the first ShimDMAAllocationOp nested inside the DeviceOp 'dev' that - // uses the symbol 'sym_name' + // Return the first ShimDMAAllocationOp nested inside the DeviceOp 'dev' + // that uses the symbol 'sym_name' std::optional get(AIE::DeviceOp dev, StringRef sym_name) { - auto key = std::make_pair(dev, sym_name); auto it = allocGetter.find(key); - if (it != allocGetter.end()) - return it->second; + if (it != allocGetter.end()) return it->second; auto allocOp = cachelessGet(dev, sym_name); allocGetter[key] = allocOp; @@ -51,13 +225,12 @@ struct ShimDMAllocationGetter { // Finding the ShimDMAAllocationOp for a given pair // can be slow when the symbol is used in many places. This version of the - // function is only called when the cache does not have a ShimDMAAllocationOp - // stored from a previous lookup. + // function is only called when the cache does not have a + // ShimDMAAllocationOp stored from a previous lookup. std::optional cachelessGet(AIE::DeviceOp dev, StringRef sym_name) { auto *sym = dev.lookupSymbol(sym_name); - if (!sym) - return std::nullopt; + if (!sym) return std::nullopt; auto uses = SymbolTable::getSymbolUses(sym, dev); for (auto use : *uses) @@ -67,28 +240,25 @@ struct ShimDMAllocationGetter { return std::nullopt; } }; -} // namespace +} // namespace struct PushToNpuPattern : OpConversionPattern { - public: using OpConversionPattern::OpConversionPattern; PushToNpuPattern(MLIRContext *context, PatternBenefit benefit = 1) : OpConversionPattern(context, benefit) {} - LogicalResult - matchAndRewrite(NpuPushQueueOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - + LogicalResult matchAndRewrite( + NpuPushQueueOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { // the offset of the task queue register in the tile uint32_t queue_offset; if (op.getDirection() == AIE::DMAChannelDir::MM2S) queue_offset = 0x1D214; else queue_offset = 0x1D204; - if (op.getChannel() == 1) - queue_offset += 0x8; + if (op.getChannel() == 1) queue_offset += 0x8; // the value to write uint32_t bd_id = op.getBdId(); @@ -96,8 +266,7 @@ struct PushToNpuPattern : OpConversionPattern { uint32_t cmd = 0; cmd |= bd_id & 0xF; cmd |= (repeat_cnt & 0xFF) << 16; - if (op.getIssueToken()) - cmd |= 0x80000000; + if (op.getIssueToken()) cmd |= 0x80000000; auto i32ty = IntegerType::get(op->getContext(), 32); auto column = IntegerAttr::get(i32ty, op.getColumn()); @@ -119,17 +288,16 @@ struct DmaToNpuPattern : OpConversionPattern { PatternBenefit benefit = 1) : OpConversionPattern(context, benefit), allocGetter(getter) {} - LogicalResult - matchAndRewrite(NpuDmaMemcpyNdOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { + LogicalResult matchAndRewrite( + NpuDmaMemcpyNdOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { auto *ctx = op->getContext(); auto i32ty = IntegerType::get(ctx, 32); auto zero = IntegerAttr::get(i32ty, 0); auto memref = adaptor.getMemref(); auto dev = op->getParentOfType(); - if (!dev) - return failure(); + if (!dev) return failure(); auto infoOp = allocGetter.get(dev, op.getMetadata()); if (!infoOp) { @@ -193,19 +361,14 @@ struct DmaToNpuPattern : OpConversionPattern { break; } } - if (arg_idx < 0) - return failure(); + if (arg_idx < 0) return failure(); ddr_id = IntegerAttr::get(i32ty, arg_idx); // bd_id bd_id = IntegerAttr::get(i32ty, op.getId()); // buffer_length - int32_t repeat_length = 0; - for (int32_t index_3d = 0; index_3d < sizes[2]; index_3d++) - for (int32_t index_2d = 0; index_2d < sizes[1]; index_2d++) - repeat_length += sizes[0]; - buffer_length = IntegerAttr::get(i32ty, repeat_length); + buffer_length = IntegerAttr::get(i32ty, sizes[2] * sizes[1] * sizes[0]); // buffer_offset size_t stride = 1; @@ -232,33 +395,27 @@ struct DmaToNpuPattern : OpConversionPattern { // packet_type // d0_size - if (strides[0]) - d0_size = IntegerAttr::get(i32ty, sizes[0]); + if (strides[0]) d0_size = IntegerAttr::get(i32ty, sizes[0]); // d0_stride d0_stride = IntegerAttr::get(i32ty, 0); // d1_size - if (strides[1]) - d1_size = IntegerAttr::get(i32ty, sizes[1]); + if (strides[1]) d1_size = IntegerAttr::get(i32ty, sizes[1]); // d1_stride - if (strides[0]) - d1_stride = IntegerAttr::get(i32ty, strides[0] - 1); + if (strides[0]) d1_stride = IntegerAttr::get(i32ty, strides[0] - 1); // d2_stride - if (strides[1]) - d2_stride = IntegerAttr::get(i32ty, strides[1] - 1); + if (strides[1]) d2_stride = IntegerAttr::get(i32ty, strides[1] - 1); // iteration_current // iteration_size - if (strides[2]) - iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1); + if (strides[2]) iteration_size = IntegerAttr::get(i32ty, sizes[3] - 1); // iteration_stride - if (strides[2]) - iteration_stride = IntegerAttr::get(i32ty, strides[2] - 1); + if (strides[2]) iteration_stride = IntegerAttr::get(i32ty, strides[2] - 1); // next_bd @@ -284,8 +441,7 @@ struct DmaToNpuPattern : OpConversionPattern { issue_token = BoolAttr::get(ctx, op.getIssueToken()); // Earlier, all S2MM channels were implicitly assumed to issue a token. // This logic is kept for now for backward compatibility. - if (!isMM2S) - issue_token = BoolAttr::get(ctx, true); + if (!isMM2S) issue_token = BoolAttr::get(ctx, true); rewriter.create( op->getLoc(), column, ddr_id, bd_id, buffer_length, buffer_offset, @@ -314,7 +470,6 @@ struct DmaToNpuPattern : OpConversionPattern { /// information from the ShimDMAAllocationOp referenced through the /// symbol argument of this op. struct DmaWaitToNpuPattern : OpConversionPattern { - private: ShimDMAllocationGetter &allocGetter; @@ -325,12 +480,11 @@ struct DmaWaitToNpuPattern : OpConversionPattern { PatternBenefit benefit = 1) : OpConversionPattern(context, benefit), allocGetter(getter) {} - LogicalResult - matchAndRewrite(NpuDmaWaitOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { + LogicalResult matchAndRewrite( + NpuDmaWaitOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { AIE::DeviceOp dev = op->getParentOfType(); - if (!dev) - return op->emitError("couldn't find parent of type DeviceOp"); + if (!dev) return op->emitError("couldn't find parent of type DeviceOp"); std::optional shimDmaAllocOp = allocGetter.get(dev, op.getSymbol()); @@ -350,7 +504,6 @@ struct DmaWaitToNpuPattern : OpConversionPattern { struct AIEDmaToNpuPass : xilinx::AIEX::impl::AIEDmaToNpuBase { void runOnOperation() override { - ShimDMAllocationGetter cachingGetter; AIE::DeviceOp device = getOperation(); @@ -371,6 +524,39 @@ struct AIEDmaToNpuPass : xilinx::AIEX::impl::AIEDmaToNpuBase { if (failed(applyPartialConversion(device, target, std::move(patterns)))) signalPassFailure(); + + patterns.clear(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + + std::vector instructions; + auto words = reserveAndGetTail(instructions, 4); + + // setup txn header + words[0] = 0x06030100; + words[1] = 0x00000105; + uint32_t count = 0; + + patterns.insert>(&getContext(), instructions, + appendSync, count); + patterns.insert>(&getContext(), instructions, + appendWrite32, count); + patterns.insert>( + &getContext(), instructions, appendAddressPatch, count); + patterns.insert>(&getContext(), instructions, + appendWriteBdShimTile, count); + + if (failed(applyPartialConversion(device, target, std::move(patterns)))) + signalPassFailure(); + + instructions[2] = count; + instructions[3] = instructions.size() * sizeof(uint32_t); + std::vector signedInstructions(instructions.begin(), + instructions.end()); + device->setAttr("npu_instructions", + DenseI32ArrayAttr::get(&getContext(), signedInstructions)); } }; diff --git a/compiler/plugins/target/AMD-AIE/aie/AIEPass.cpp b/compiler/plugins/target/AMD-AIE/aie/AIEPass.cpp index e3abd7699..eddbcf05e 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIEPass.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIEPass.cpp @@ -46,8 +46,9 @@ #define DEBUG_TYPE "aie-pass" using namespace mlir; -using namespace xilinx; +using namespace mlir::vector; using namespace xilinx::AIE; +using namespace xilinx::AIEX; const std::map _WIRE_BUNDLE_TO_STRM_SW_PORT_TYPE = { @@ -88,59 +89,46 @@ xilinx::AIE::WireBundle STRM_SW_PORT_TYPE_TO_WIRE_BUNDLE(StrmSwPortType s) { template class AIEAssignBufferAddressesPassBasicBase - : public ::mlir::OperationPass { + : public mlir::OperationPass { public: using Base = AIEAssignBufferAddressesPassBasicBase; AIEAssignBufferAddressesPassBasicBase() - : ::mlir::OperationPass(::mlir::TypeID::get()) {} + : mlir::OperationPass(mlir::TypeID::get()) {} AIEAssignBufferAddressesPassBasicBase( const AIEAssignBufferAddressesPassBasicBase &other) - : ::mlir::OperationPass(other) {} - AIEAssignBufferAddressesPassBasicBase &operator=( - const AIEAssignBufferAddressesPassBasicBase &) = delete; - AIEAssignBufferAddressesPassBasicBase( - AIEAssignBufferAddressesPassBasicBase &&) = delete; - AIEAssignBufferAddressesPassBasicBase &operator=( - AIEAssignBufferAddressesPassBasicBase &&) = delete; - ~AIEAssignBufferAddressesPassBasicBase() = default; + : mlir::OperationPass(other) {} - /// Returns the command-line argument attached to this pass. - static constexpr ::llvm::StringLiteral getArgumentName() { - return ::llvm::StringLiteral("aie-assign-buffer-addresses-basic"); + static constexpr llvm::StringLiteral getArgumentName() { + return llvm::StringLiteral("aie-assign-buffer-addresses-basic"); } - ::llvm::StringRef getArgument() const override { + + llvm::StringRef getArgument() const override { return "aie-assign-buffer-addresses-basic"; } - ::llvm::StringRef getDescription() const override { + llvm::StringRef getDescription() const override { return "Assign memory locations for buffers in each tile"; } - /// Returns the derived pass name. - static constexpr ::llvm::StringLiteral getPassName() { - return ::llvm::StringLiteral("AIEAssignBufferAddressesBasic"); + static constexpr llvm::StringLiteral getPassName() { + return llvm::StringLiteral("AIEAssignBufferAddressesBasic"); } - ::llvm::StringRef getName() const override { + + llvm::StringRef getName() const override { return "AIEAssignBufferAddressesBasic"; } - /// Support isa/dyn_cast functionality for the derived pass class. - static bool classof(const ::mlir::Pass *pass) { - return pass->getTypeID() == ::mlir::TypeID::get(); + static bool classof(const mlir::Pass *pass) { + return pass->getTypeID() == mlir::TypeID::get(); } - /// A clone method to create a copy of this pass. - std::unique_ptr<::mlir::Pass> clonePass() const override { + std::unique_ptr clonePass() const override { return std::make_unique(*static_cast(this)); } - /// Register the dialects that must be loaded in the context before this pass. - void getDependentDialects(::mlir::DialectRegistry ®istry) const override {} + void getDependentDialects(mlir::DialectRegistry ®istry) const override {} - /// Explicitly declare the TypeID for this class. We declare an explicit - /// private instantiation because Pass classes should only be visible by the - /// current library. MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( AIEAssignBufferAddressesPassBasicBase) }; @@ -208,24 +196,6 @@ struct AIEAssignBufferAddressesPassBasic if (address > maxDataMemorySize) { InFlightDiagnostic error = tile.emitOpError("allocated buffers exceeded available memory\n"); - auto ¬e = error.attachNote() << "MemoryMap:\n"; - auto printbuffer = [&](StringRef name, int address, int size) { - note << "\t" << name << " \t" - << ": 0x" << llvm::utohexstr(address) << "-0x" - << llvm::utohexstr(address + size - 1) << " \t(" << size - << " bytes)\n"; - }; - if (stacksize > 0) - printbuffer("(stack)", 0, stacksize); - else - error << "(no stack allocated)\n"; - - for (auto buffer : buffers) { - assert(buffer.getAddress().has_value() && - "buffer must have address assigned"); - printbuffer(buffer.name(), buffer.getAddress().value(), - buffer.getAllocationSize()); - } return signalPassFailure(); } } @@ -233,7 +203,7 @@ struct AIEAssignBufferAddressesPassBasic }; std::unique_ptr> -AIE::createAIEAssignBufferAddressesBasicPass() { +xilinx::AIE::createAIEAssignBufferAddressesBasicPass() { return std::make_unique(); } @@ -330,6 +300,7 @@ struct AIEAssignBufferDescriptorIDsPass bd.setBdId(gen.nextBdId(blockChannelMap[&block])); } } + for (TileElement memOp : memOps) { DenseMap blockBdIdMap; for (Block &block : memOp.getOperation()->getRegion(0)) { @@ -362,7 +333,7 @@ struct AIEAssignBufferDescriptorIDsPass }; std::unique_ptr> -AIE::createAIEAssignBufferDescriptorIDsPass() { +xilinx::AIE::createAIEAssignBufferDescriptorIDsPass() { return std::make_unique(); } @@ -455,7 +426,8 @@ struct AIEAssignLockIDsPass } }; -std::unique_ptr> AIE::createAIEAssignLockIDsPass() { +std::unique_ptr> +xilinx::AIE::createAIEAssignLockIDsPass() { return std::make_unique(); } @@ -470,11 +442,6 @@ std::unique_ptr> AIE::createAIEAssignLockIDsPass() { // //===----------------------------------------------------------------------===// -using namespace mlir; -using namespace mlir::vector; -using namespace xilinx; -using namespace xilinx::AIE; - static StringRef getArchIntrinsicString(AIEArch arch) { return "aie2"; } typedef std::tuple, std::vector> @@ -522,47 +489,6 @@ static void declareAIEIntrinsics(AIEArch arch, OpBuilder &builder) { registerIntrinsics(getAIE2Intrinsics(builder)); } -template -struct AIEOpRemoval : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - using OpAdaptor = typename MyAIEOp::Adaptor; - ModuleOp &module; - - AIEOpRemoval(MLIRContext *context, ModuleOp &m, PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), module(m) {} - - LogicalResult matchAndRewrite( - MyAIEOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - rewriter.eraseOp(op); - return success(); - } -}; - -struct AIEDebugOpToStdLowering : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - ModuleOp &module; - - AIEDebugOpToStdLowering(MLIRContext *context, ModuleOp &m, - PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), module(m) {} - - LogicalResult matchAndRewrite( - DebugOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - std::string funcName = "debug_i32"; - auto func = module.lookupSymbol(funcName); - if (!func) - return op.emitOpError("Could not find the intrinsic function ") - << funcName; - SmallVector args; - args.push_back(op.getArg()); - rewriter.create(rewriter.getUnknownLoc(), func, args); - rewriter.eraseOp(op); - return success(); - } -}; - struct AIEPutStreamToStdLowering : OpConversionPattern { using OpConversionPattern::OpConversionPattern; ModuleOp &module; @@ -906,9 +832,6 @@ struct AIECoreToStandardPass return signalPassFailure(); } DeviceOp device = *m.getOps().begin(); - AMDAIENPUDeviceModel &targetModel = - mlir::iree_compiler::AMDAIE::getDeviceModel(); - // Ensure that we don't have an incorrect target triple. This may override // some bogus target triple in the original mlir. m->setAttr(LLVM::LLVMDialect::getTargetTripleAttrName(), @@ -936,8 +859,8 @@ struct AIECoreToStandardPass RewritePatternSet patterns(&getContext()); patterns.add(m.getContext(), m); + AIEUseLockToStdLowering, AIEEventOpToStdLowering>( + m.getContext(), m); patterns.add(m.getContext(), m, /*benefit*/ 1, tileCol, tileRow); @@ -956,21 +879,14 @@ struct AIECoreToStandardPass outlineOps(device); outlineOps(device); - RewritePatternSet removepatterns(&getContext()); - removepatterns.add< - AIEOpRemoval, AIEOpRemoval, AIEOpRemoval, - AIEOpRemoval, AIEOpRemoval, AIEOpRemoval, - AIEOpRemoval, AIEOpRemoval, AIEOpRemoval, - AIEOpRemoval, AIEOpRemoval, - AIEOpRemoval, AIEOpRemoval>( - m.getContext(), m); - - if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) - return signalPassFailure(); + MLIRContext &context = getContext(); + IRRewriter rewriter(&context); + rewriter.eraseOp(device); } }; -std::unique_ptr> AIE::createAIECoreToStandardPass() { +std::unique_ptr> +xilinx::AIE::createAIECoreToStandardPass() { return std::make_unique(); } @@ -1476,7 +1392,8 @@ struct AIELocalizeLocksPass } }; -std::unique_ptr> AIE::createAIELocalizeLocksPass() { +std::unique_ptr> +xilinx::AIE::createAIELocalizeLocksPass() { return std::make_unique(); } @@ -2810,7 +2727,7 @@ struct AIEObjectFifoStatefulTransformPass }; std::unique_ptr> -AIE::createAIEObjectFifoStatefulTransformPass() { +xilinx::AIE::createAIEObjectFifoStatefulTransformPass() { return std::make_unique(); } @@ -3265,60 +3182,6 @@ std::optional> Pathfinder::findPaths( return routingSolution; } -//===- AIEXToStandard.cpp ---------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// Copyright (C) 2023, Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -using namespace xilinx::AIEX; - -template -struct AIEXOpRemoval : OpConversionPattern { - using OpConversionPattern::OpConversionPattern; - using OpAdaptor = typename MyAIEXOp::Adaptor; - ModuleOp &module; - - AIEXOpRemoval(MLIRContext *context, ModuleOp &m, PatternBenefit benefit = 1) - : OpConversionPattern(context, benefit), module(m) {} - - LogicalResult matchAndRewrite( - MyAIEXOp op, OpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - Operation *Op = op.getOperation(); - rewriter.eraseOp(Op); - return success(); - } -}; - -struct AIEXToStandardPass - : xilinx::AIEX::impl::AIEXToStandardBase { - void runOnOperation() override { - ModuleOp m = getOperation(); - ConversionTarget target(getContext()); - RewritePatternSet removepatterns(&getContext()); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - removepatterns.add>(m.getContext(), m); - - if (failed(applyPartialConversion(m, target, std::move(removepatterns)))) - signalPassFailure(); - } -}; - -std::unique_ptr> AIEX::createAIEXToStandardPass() { - return std::make_unique(); -} - namespace mlir::iree_compiler::AMDAIE { void registerAIETransformPasses() { xilinx::AIE::registerAIEAssignLockIDs(); @@ -3332,8 +3195,5 @@ void registerAIETransformPasses() { } // namespace mlir::iree_compiler::AMDAIE namespace mlir::iree_compiler::AMDAIE { -void registerAIEXTransformPasses() { - xilinx::AIEX::registerAIEXToStandard(); - xilinx::AIEX::registerAIEDmaToNpu(); -} +void registerAIEXTransformPasses() { xilinx::AIEX::registerAIEDmaToNpu(); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargets.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETargets.cpp index 97ac3a72b..87bd5979d 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIETargets.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargets.cpp @@ -301,214 +301,3 @@ SECTIONS } return success(); } -//===- AIETargetNPU.cpp -----------------------------------------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -// (c) Copyright 2023 Advanced Micro Devices, Inc. -// -//===----------------------------------------------------------------------===// - -#include - -#include "AIETargets.h" -#include "aie/Dialect/AIE/IR/AIEDialect.h" -#include "aie/Dialect/AIEX/IR/AIEXDialect.h" -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/TypeSwitch.h" -#include "llvm/Support/Format.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Tools/mlir-translate/MlirTranslateMain.h" - -using namespace mlir; -using namespace xilinx; -using namespace xilinx::AIE; -using namespace xilinx::AIEX; - -#define TXN_OPC_WRITE 0x0 -#define TXN_OPC_BLOCKWRITE 0x1 -#define TXN_OPC_TCT 0x80 -#define TXN_OPC_DDR_PATCH 0x81 - -namespace { - -// Example: -// - instructions = {3,4,5} -// - tailSize = 2 -// instructions becomes {3,4,5,0,0} and -// a mutable reference to the tail {0,0} is returned. -llvm::MutableArrayRef reserveAndGetTail( - std::vector &instructions, uint64_t tailSize) { - auto oldSize = instructions.size(); - auto newSize = oldSize + tailSize; - instructions.resize(newSize, 0); - return llvm::MutableArrayRef(instructions.data() + oldSize, - tailSize); -} - -void appendSync(std::vector &instructions, NpuSyncOp op) { - auto words = reserveAndGetTail(instructions, 4); - - // XAIE_IO_CUSTOM_OP_TCT - words[0] = TXN_OPC_TCT; - - words[1] = words.size() * sizeof(uint32_t); // Operation Size - - words[2] |= static_cast(op.getDirection()) & 0xff; - words[2] |= (op.getRow() & 0xff) << 8; - words[2] |= (op.getColumn() & 0xff) << 16; - - words[3] |= (op.getRowNum() & 0xff) << 8; - words[3] |= (op.getColumnNum() & 0xff) << 16; - words[3] |= (op.getChannel() & 0xff) << 24; -} - -void appendWrite32(std::vector &instructions, NpuWrite32Op op) { - auto words = reserveAndGetTail(instructions, 6); - const AIETargetModel &tm = op->getParentOfType().getTargetModel(); - - // XAIE_IO_WRITE - words[0] = TXN_OPC_WRITE; - words[1] = 0; - words[2] = op.getAddress(); - auto col = op.getColumn(); - auto row = op.getRow(); - if (col && row) - words[2] = ((*col & 0xff) << tm.getColumnShift()) | - ((*row & 0xff) << tm.getRowShift()) | (words[2] & 0xFFFFF); - words[3] = 0; - words[4] = op.getValue(); // Value - words[5] = words.size() * sizeof(uint32_t); // Operation Size -} - -void appendAddressPatch(std::vector &instructions, - NpuAddressPatchOp op) { - auto words = reserveAndGetTail(instructions, 12); - - // XAIE_IO_CUSTOM_OP_DDR_PATCH - words[0] = TXN_OPC_DDR_PATCH; - words[1] = words.size() * sizeof(uint32_t); // Operation Size - - words[6] = op.getAddr(); - words[7] = 0; - - words[8] = op.getArgIdx(); - words[9] = 0; - - words[10] = op.getArgPlus(); - words[11] = 0; -} - -void appendWriteBdShimTile(std::vector &instructions, - NpuWriteBdOp op) { - auto words = reserveAndGetTail(instructions, 12); - const AIETargetModel &tm = op->getParentOfType().getTargetModel(); - - // XAIE_IO_BLOCKWRITE - words[0] = TXN_OPC_BLOCKWRITE; - words[1] = 0; - - // RegOff - auto bd_id = op.getBdId(); - uint32_t bd_addr = (op.getColumn() << tm.getColumnShift()) | - (op.getRow() << tm.getRowShift()) | - (0x1D000 + bd_id * 0x20); - words[2] = bd_addr; // ADDR - words[3] = words.size() * sizeof(uint32_t); // Operation Size - - // DMA_BDX_0 - words[4] = op.getBufferLength(); - - // DMA_BDX_1 - words[5] = op.getBufferOffset(); - - // DMA_BDX_2 - // En Packet , OoO BD ID , Packet ID , Packet Type - words[6] |= (op.getEnablePacket() & 0x1) << 30; - words[6] |= (op.getOutOfOrderId() & 0x3f) << 24; - words[6] |= (op.getPacketId() & 0x1f) << 19; - words[6] |= (op.getPacketType() & 0x7) << 16; - - // DMA_BDX_3 - // TODO: Secure Access - words[7] |= (op.getD0Size() & 0x3ff) << 20; - words[7] |= op.getD0Stride() & 0xfffff; - - // DMA_BDX_4 - words[8] = 0x80000000; // burst length; - words[8] |= (op.getD1Size() & 0x3ff) << 20; - words[8] |= op.getD1Stride() & 0xfffff; - - // DMA_BDX_5 - // TODO: SIMID, AxCache, AXQoS - words[9] = op.getD2Stride() & 0xfffff; - - // DMA_BDX_6 - words[10] |= (op.getIterationCurrent() & 0x3f) << 26; - words[10] |= (op.getIterationSize() & 0x3f) << 20; - words[10] |= op.getIterationStride() & 0xfffff; - - // DMA_BDX_7 - // TODO: TLAST Suppress - words[11] |= (op.getNextBd() & 0xf) << 27; - words[11] |= (op.getUseNextBd() & 0x1) << 26; - words[11] |= (op.getValidBd() & 0x1) << 25; - words[11] |= (op.getLockRelVal() & 0xef) << 18; - words[11] |= (op.getLockRelId() & 0xf) << 13; - words[11] |= (op.getLockAcqEnable() & 0x1) << 12; - words[11] |= (op.getLockAcqVal() & 0xef) << 5; - words[11] |= op.getLockAcqId() & 0xf; -} - -} // namespace - -std::vector xilinx::AIE::AIETranslateToNPU(ModuleOp module) { - std::vector instructions; - - auto words = reserveAndGetTail(instructions, 4); - - // setup txn header - words[0] = 0x06030100; - words[1] = 0x00000105; - - DeviceOp deviceOp = *module.getOps().begin(); - auto funcOps = deviceOp.getOps(); - int count = 0; - for (auto f : funcOps) { - if (f.isDeclaration()) continue; - Block &entry = f.getRegion().front(); - for (auto &o : entry) { - llvm::TypeSwitch(&o) - .Case([&](auto op) { - count++; - appendSync(instructions, op); - }) - .Case([&](auto op) { - count++; - appendWrite32(instructions, op); - }) - .Case([&](auto op) { - count++; - appendAddressPatch(instructions, op); - }) - .Case([&](auto op) { - count++; - appendWriteBdShimTile(instructions, op); - }); - } - } - - // write size fields of the txn header - instructions[2] = count; - instructions[3] = instructions.size() * sizeof(uint32_t); - return instructions; -} - -LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module, - raw_ostream &output) { - auto instructions = AIETranslateToNPU(module); - for (auto w : instructions) output << llvm::format("%08X\n", w); - return success(); -} \ No newline at end of file diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETargets.h b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h index d11a378c5..28ca62759 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIETargets.h +++ b/compiler/plugins/target/AMD-AIE/aie/AIETargets.h @@ -16,9 +16,6 @@ namespace xilinx { namespace AIE { -mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module, - llvm::raw_ostream &output); -std::vector AIETranslateToNPU(mlir::ModuleOp); mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, llvm::raw_ostream &output, int tileCol, int tileRow); diff --git a/compiler/plugins/target/AMD-AIE/aie/Passes.h b/compiler/plugins/target/AMD-AIE/aie/Passes.h index fa1f291c9..9e01feab9 100644 --- a/compiler/plugins/target/AMD-AIE/aie/Passes.h +++ b/compiler/plugins/target/AMD-AIE/aie/Passes.h @@ -134,12 +134,6 @@ inline void registerAIEDmaToNpu() { }); } -inline void registerAIEXToStandard() { - ::mlir::registerPass([]() -> std::unique_ptr<::mlir::Pass> { - return xilinx::AIEX::createAIEXToStandardPass(); - }); -} - } // namespace xilinx::AIEX namespace mlir::iree_compiler::AMDAIE { diff --git a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp index 6ed200f72..e0281e0bb 100644 --- a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp @@ -61,8 +61,6 @@ namespace { // manager. These control when (if ever) and what IR gets printed between // passes, and whether the pass manager uses multi-theading. void applyConfigToPassManager(XCLBinGenConfig &TK, PassManager &pm) { - // pm.getContext()->disableMultithreading(TK.DisableThreading); - bool printBefore = TK.PrintIRBeforeAll; auto shouldPrintBeforePass = [printBefore](Pass *, Operation *) { return printBefore; @@ -692,7 +690,6 @@ static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, pm.addNestedPass(AIE::createAIELocalizeLocksPass()); pm.addPass(AIE::createAIECoreToStandardPass()); - pm.addPass(AIEX::createAIEXToStandardPass()); // Convert specific vector dialect ops (like vector.contract) to the AIEVec // dialect @@ -820,26 +817,30 @@ LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, return moduleOp.emitOpError() << "Unexpected target architecture: " << TK.TargetArch; - // generateNPUInstructions - { - PassManager pm(ctx, moduleOp.getOperationName()); - applyConfigToPassManager(TK, pm); - - pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); - ModuleOp copy = moduleOp.clone(); - if (failed(pm.run(copy))) - return moduleOp.emitOpError(": NPU Instruction pipeline failed"); - - std::string errorMessage; - auto output = openOutputFile(OutputNPU, &errorMessage); - if (!output) return moduleOp.emitOpError(errorMessage); + PassManager pm(ctx, moduleOp.getOperationName()); + applyConfigToPassManager(TK, pm); - if (failed(AIE::AIETranslateToNPU(copy, output->os()))) - return moduleOp.emitOpError(": NPU Instruction translation failed"); + pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); + if (failed(pm.run(moduleOp))) + return moduleOp.emitOpError(": NPU Instruction pipeline failed"); + moduleOp->dump(); + + // TODO(max): should be using UI32 resource or something like that... + ArrayRef signedNpuInstructionsAttr = + cast( + (*moduleOp.getOps().begin()) + ->getAttr("npu_instructions")) + .asArrayRef(); + std::vector unsignedNpuInstructions( + signedNpuInstructionsAttr.begin(), signedNpuInstructionsAttr.end()); + for (const auto &item : unsignedNpuInstructions) std::cerr << item << "\n"; - output->keep(); - copy->erase(); - } + std::string errorMessage; + auto output = openOutputFile(OutputNPU, &errorMessage); + if (!output) return moduleOp.emitOpError(errorMessage); + for (auto w : unsignedNpuInstructions) + output->os() << llvm::format("%08X\n", w); + output->keep(); SmallString<64> object(TK.TempDir); sys::path::append(object, "input.o"); diff --git a/compiler/plugins/target/AMD-AIE/aie/aie_passes/aiex_standard_lowering.mlir b/compiler/plugins/target/AMD-AIE/aie/aie_passes/aiex_standard_lowering.mlir deleted file mode 100644 index edc031c40..000000000 --- a/compiler/plugins/target/AMD-AIE/aie/aie_passes/aiex_standard_lowering.mlir +++ /dev/null @@ -1,22 +0,0 @@ - -// RUN: iree-opt --aiex-standard-lowering %s | FileCheck %s - -// CHECK-LABEL: aie.device(npu1_4col) { -// CHECK: memref.global "public" @toMem : memref<16xi32> -// CHECK: func.func @dma_and_wait(%[[ARG0:.*]]: memref<16xi32>, %[[ARG1:.*]]: memref<16xi32>) { -// CHECK: return -// CHECK: } -// CHECK: aie.shim_dma_allocation @toMem(MM2S, 1, 1) -// CHECK: } - -module { - aie.device(npu1_4col) { - memref.global "public" @toMem : memref<16xi32> - func.func @dma_and_wait(%arg0: memref<16xi32>, %arg1: memref<16xi32>) { - aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 0, 0, 0][1, 1, 16, 16][0, 0, 64]) { metadata = @toMem, id = 1 : i64 } : memref<16xi32> - aiex.npu.dma_wait {symbol = @toMem} - return - } - aie.shim_dma_allocation @toMem (MM2S, 1, 1) - } -} diff --git a/compiler/plugins/target/AMD-AIE/aie/aie_passes/push_to_queue.mlir b/compiler/plugins/target/AMD-AIE/aie/aie_passes/push_to_queue.mlir index 511e8c4f5..cbab26092 100644 --- a/compiler/plugins/target/AMD-AIE/aie/aie_passes/push_to_queue.mlir +++ b/compiler/plugins/target/AMD-AIE/aie/aie_passes/push_to_queue.mlir @@ -3,11 +3,9 @@ // CHECK-LABEL: aie.device(npu1_4col) { // CHECK: func.func @sequence() { -// CHECK: aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483651 : ui32} -// CHECK: aiex.npu.write32 {address = 119316 : ui32, column = 2 : i32, row = 0 : i32, value = 196610 : ui32} // CHECK: return // CHECK: } -// CHECK: } +// CHECK: } {npu_instructions = array} module { aie.device(npu1_4col) {