diff --git a/compiler/plugins/target/AMD-AIE/aie/AIETransformPasses.cpp b/compiler/plugins/target/AMD-AIE/aie/AIETransformPasses.cpp index 5ba832707..21c59ec8b 100644 --- a/compiler/plugins/target/AMD-AIE/aie/AIETransformPasses.cpp +++ b/compiler/plugins/target/AMD-AIE/aie/AIETransformPasses.cpp @@ -6,14 +6,6 @@ #include "AIEAssignBufferAddressesBasic.h" #include "aie/Dialect/AIE/Transforms/AIEPasses.h" -#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" -#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" -#include "mlir/Conversion/IndexToLLVM/IndexToLLVM.h" -#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" -#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" -#include "mlir/Dialect/Func/Extensions/AllExtensions.h" -#include "mlir/Target/LLVMIR/Dialect/All.h" namespace { #define GEN_PASS_REGISTRATION @@ -33,15 +25,5 @@ void registerAIETransformPasses() { registerAIEObjectFifoRegisterProcess(); registerAIEObjectFifoStatefulTransform(); registerAIERoutePacketFlows(); - // convert to llvm - DialectRegistry registry; - registerAllToLLVMIRTranslations(registry); - arith::registerConvertArithToLLVMInterface(registry); - cf::registerConvertControlFlowToLLVMInterface(registry); - func::registerAllExtensions(registry); - registerConvertFuncToLLVMInterface(registry); - index::registerConvertIndexToLLVMInterface(registry); - registerConvertMathToLLVMInterface(registry); - registerConvertMemRefToLLVMInterface(registry); } } // namespace mlir::iree_compiler::AMDAIE diff --git a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt index 48974bee6..8ec9a4f6a 100644 --- a/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/aie/CMakeLists.txt @@ -93,6 +93,67 @@ iree_tablegen_library( -gen-enum-defs Dialect/AIEVec/IR/AIEVecEnums.cpp.inc ) +iree_tablegen_library( + NAME + AIEVecOpsGen + TD_FILE + 
"${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/IR/AIEVecOps.td" + OUTS + -gen-op-decls Dialect/AIEVec/IR/AIEVecOps.h.inc + -gen-op-defs Dialect/AIEVec/IR/AIEVecOps.cpp.inc +) + +iree_tablegen_library( + NAME + AIEVecConversionPassIncGen + TD_FILE + "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Conversion/Passes.td" + OUTS + # this doesn't follow the correct naming convention but it's burned in downstream + -gen-pass-decls Conversion/Passes.h.inc + -gen-enum-decls Conversion/PassesEnums.h.inc + -gen-enum-defs Conversion/PassesEnums.cpp.inc +) + +iree_tablegen_library( + NAME + AIEVecPassIncGen + TD_FILE + "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/Transforms/Passes.td" + OUTS + -gen-pass-decls Dialect/AIEVec/Transforms/Passes.h.inc +) + +iree_tablegen_library( + NAME + AIEVecAnalysisPassesIncGen + TD_FILE + "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/Analysis/Passes.td" + OUTS + -gen-pass-decls Dialect/AIEVec/Analysis/Passes.h.inc +) + +iree_tablegen_library( + NAME + AIEVecXLLVMOpsGen + TD_FILE + "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/XLLVM/IR/XLLVMOps.td" + OUTS + -gen-dialect-decls -dialect=xllvm Dialect/XLLVM/IR/XLLVMDialect.h.inc + -gen-dialect-defs -dialect=xllvm Dialect/XLLVM/IR/XLLVMDialect.cpp.inc + -gen-op-decls Dialect/XLLVM/IR/XLLVMOps.h.inc + -gen-op-defs Dialect/XLLVM/IR/XLLVMOps.cpp.inc +) + +iree_tablegen_library( + NAME + AIEVecXLLVMConversionPassIncGen + TD_FILE + "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/XLLVM/IR/XLLVMOps.td" + OUTS + -gen-llvmir-conversions Dialect/XLLVM/IR/XLLVMConversions.inc +) + iree_tablegen_library( NAME AIEInterfacesGen @@ -169,6 +230,7 @@ iree_cc_library( SRCS ${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/IR/AIEVecOps.cpp ${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/IR/AIEVecTypes.cpp + ${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/XLLVM/XLLVMOps.cpp DEPS ::defs ::AIEVecOpsGen @@ -177,14 +239,30 @@ iree_cc_library( MLIRIR ) -iree_tablegen_library( +iree_cc_library( NAME - 
AIEVecOpsGen - TD_FILE - "${IREE_MLIR_AIE_SOURCE_DIR}/include/aie/Dialect/AIEVec/IR/AIEVecOps.td" - OUTS - -gen-op-decls Dialect/AIEVec/IR/AIEVecOps.h.inc - -gen-op-defs Dialect/AIEVec/IR/AIEVecOps.cpp.inc + AIEVecConvertToLLVM + SRCS + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Conversion/AIEVecToLLVM/AIEVecToLLVM.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/IntervalReuse.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/AIEVectorize.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/ConvertVectorToAIEVec.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/VectorToVectorConversions.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/VectorToAIEVecConversions.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/AIEVecOptimizations.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/FoldMulAddChainToConvOp.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/CopyRemoval.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Transforms/DynamicSizeNoImplicitBroadcast.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Dialect/AIEVec/Utils/Utils.cpp" + DEPS + ::defs + ::AIEVecDialectIR + ::AIEVecAnalysisPassesIncGen + ::AIEVecConversionPassIncGen + ::AIEVecPassIncGen + ::AIEVecXLLVMConversionPassIncGen + ::AIEVecXLLVMOpsGen ) ############################################################################### @@ -276,20 +354,35 @@ iree_cc_library( ::defs ::AIEXDialectIR ::AIEXTransformPassHeaders - MLIRToLLVMIRTranslationRegistration - MLIRFuncAllExtensions ) ############################################################################### # AIE CDO Generation ############################################################################### +if(MSVC) + set(UUID "Rpcrt4.lib") +else() + find_library (UUID uuid REQUIRED) +endif() + iree_cc_library( NAME - AIETargetCDODirect + AIETargets SRCS + 
"XCLBinGen.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetBCF.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetLdScript.cpp" + "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetNPU.cpp" "${IREE_MLIR_AIE_SOURCE_DIR}/lib/Targets/AIETargetCDODirect.cpp" DEPS ::AIEDialectIR + ::AIEDialectIR + ::AIETransformPasses + ::AIEVecDialectIR + ::AIEVecConvertToLLVM + MLIRToLLVMIRTranslationRegistration + MLIRFuncAllExtensions + ${UUID} iree-amd-aie::runtime::iree_aie_runtime_static -) \ No newline at end of file +) diff --git a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp new file mode 100644 index 000000000..8acb1dfe6 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.cpp @@ -0,0 +1,908 @@ +//===- XCLBinGen.cpp -------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Xilinx Inc. 
+// +//===---------------------------------------------------------------------===// + +#include "XCLBinGen.h" + +#include +#include +#include +#include + +#include "aie/AIEAssignBufferAddressesBasic.h" +#include "aie/Conversion/AIEVecToLLVM/AIEVecToLLVM.h" +#include "aie/Dialect/AIE/Transforms/AIEPasses.h" +#include "aie/Dialect/AIEVec/Pipelines/Passes.h" +#include "aie/Dialect/AIEX/Transforms/AIEXPasses.h" +#include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" +#include "aie/Targets/AIETargets.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/JSON.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Program.h" +#include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/AffineToStandard/AffineToStandard.h" +#include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" +#include "mlir/Conversion/ControlFlowToLLVM/ControlFlowToLLVM.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" +#include "mlir/Conversion/MathToLLVM/MathToLLVM.h" +#include "mlir/Conversion/MemRefToLLVM/MemRefToLLVM.h" +#include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVMPass.h" +#include "mlir/Dialect/MemRef/Transforms/Passes.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Pass/PassManager.h" +#include "mlir/Support/FileUtilities.h" +#include "mlir/Target/LLVMIR/Export.h" +#include "mlir/Transforms/Passes.h" + +#ifdef _WIN32 +#include "windows.h" +// For UUID stuff +#include "rpcdce.h" + +#define setenv(name, var, ignore) _putenv_s(name, var) +#else +#include +#endif + +using namespace llvm; +using namespace mlir; +using namespace xilinx; + +namespace { + +// Apply the pass manager specific options of the XCLBinGenConfig to the pass +// manager. These control when (if ever) and what IR gets printed between +// passes, and whether the pass manager uses multi-theading. 
+void applyConfigToPassManager(XCLBinGenConfig &TK, PassManager &pm) { + // pm.getContext()->disableMultithreading(TK.DisableThreading); + + bool printBefore = TK.PrintIRBeforeAll; + auto shouldPrintBeforePass = [printBefore](Pass *, Operation *) { + return printBefore; + }; + + bool printAfter = TK.PrintIRAfterAll; + auto shouldPrintAfterPass = [printAfter](Pass *, Operation *) { + return printAfter; + }; + + pm.enableIRPrinting(shouldPrintBeforePass, shouldPrintAfterPass, + TK.PrintIRModuleScope); + + bool timing = TK.Timing; + if (timing) pm.enableTiming(); +} +} // namespace + +LogicalResult xilinx::findVitis(XCLBinGenConfig &TK) { + const char *env_vitis = ::getenv("VITIS"); + if (env_vitis == nullptr) { + if (auto vpp = sys::findProgramByName("v++")) { + SmallString<64> real_vpp; + std::error_code err = sys::fs::real_path(vpp.get(), real_vpp); + if (!err) { + sys::path::remove_filename(real_vpp); + sys::path::remove_filename(real_vpp); + ::setenv("VITIS", real_vpp.c_str(), 1); + dbgs() << "Found Vitis at " << real_vpp.c_str() << "\n"; + } + } + } + env_vitis = ::getenv("VITIS"); + if (env_vitis != nullptr) { + SmallString<64> vitis_path(env_vitis); + SmallString<64> vitis_bin_path(vitis_path); + sys::path::append(vitis_bin_path, "bin"); + + SmallString<64> aietools_path(vitis_path); + sys::path::append(aietools_path, "aietools"); + if (!sys::fs::exists(aietools_path)) { + aietools_path = vitis_path; + sys::path::append(aietools_path, "cardano"); + } + TK.AIEToolsDir = std::string(aietools_path); + ::setenv("AIETOOLS", TK.AIEToolsDir.c_str(), 1); + + SmallString<64> aietools_bin_path(aietools_path); + sys::path::append(aietools_bin_path, "bin", "unwrapped", "lnx64.o"); + const char *env_path = ::getenv("PATH"); + if (env_path == nullptr) env_path = ""; + SmallString<128> new_path(env_path); + if (new_path.size()) new_path += sys::EnvPathSeparator; + new_path += aietools_bin_path; + new_path += sys::EnvPathSeparator; + new_path += vitis_bin_path; + + 
SmallString<64> chessccPath(aietools_path); + sys::path::append(chessccPath, "tps", "lnx64", "target"); + sys::path::append(chessccPath, "bin", "LNa64bin"); + new_path += sys::EnvPathSeparator; + new_path += chessccPath; + + ::setenv("PATH", new_path.c_str(), 1); + + SmallString<64> lnx64o(TK.AIEToolsDir); + sys::path::append(lnx64o, "lib", "lnx64.o"); + SmallString<64> dotLib(TK.AIEToolsDir); + sys::path::append(dotLib, "lnx64", "tools", "dot", "lib"); + SmallString<64> ldLibraryPath(::getenv("LD_LIBRARY_PATH")); + ::setenv( + "LD_LIBRARY_PATH", + (lnx64o + std::string{sys::EnvPathSeparator} + dotLib + ldLibraryPath) + .str() + .c_str(), + 1); + + SmallString<64> rdiDataDir_(TK.AIEToolsDir); + sys::path::append(rdiDataDir_, "data"); + ::setenv("RDI_DATADIR", rdiDataDir_.c_str(), 1); + + return success(); + } else { + return failure(); + } +} + +static std::string getUUIDString() { + std::string val; +#ifdef _WIN32 + UUID *uuid; + RPC_STATUS status; + status = UuidCreate(uuid); + if (status != RPC_S_OK) errs() << "Failed to create UUID\n"; + RPC_CSTR *uuidstring; + status = UuidToStringA(uuid, uuidstring); + if (status != RPC_S_OK) errs() << "Failed to convert UUID to string\n"; + val = std::string((char *)uuidstring); + status = RpcStringFreeA(uuidstring); + if (status != RPC_S_OK) errs() << "Failed to free UUID string\n"; +#else + uuid_t binuuid; + uuid_generate_random(binuuid); + char uuid[37]; + uuid_unparse_lower(binuuid, uuid); + val = std::string(uuid); +#endif + return val; +} +static void addAIELoweringPasses(OpPassManager &pm) { + pm.addPass(createLowerAffinePass()); + pm.addPass(AIE::createAIECanonicalizeDevicePass()); + OpPassManager &devicePM = pm.nest(); + devicePM.addPass(AIE::createAIEAssignLockIDsPass()); + devicePM.addPass(AIE::createAIEAssignBufferDescriptorIDsPass()); + devicePM.addPass(AIE::createAIEObjectFifoRegisterProcessPass()); + devicePM.addPass(AIE::createAIEObjectFifoStatefulTransformPass()); + 
devicePM.addPass(AIEX::createAIEBroadcastPacketPass()); + devicePM.addPass(AIE::createAIERoutePacketFlowsPass()); + devicePM.addPass(AIEX::createAIELowerMulticastPass()); + devicePM.addPass(AIE::createAIEAssignBufferAddressesBasicPass()); + pm.addPass(createConvertSCFToCFPass()); +} + +static void addLowerToLLVMPasses(OpPassManager &pm) { + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(xilinx::aievec::createConvertAIEVecToLLVMPass()); + + pm.addPass(createConvertVectorToLLVMPass()); + pm.addPass(memref::createExpandStridedMetadataPass()); + pm.addPass(createLowerAffinePass()); + pm.addPass(createConvertMathToLLVMPass()); + pm.addPass(createArithToLLVMConversionPass()); + pm.addPass(createFinalizeMemRefToLLVMConversionPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + ConvertFuncToLLVMPassOptions opts; + opts.useBarePtrCallConv = true; + pm.addPass(createConvertFuncToLLVMPass(opts)); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); + pm.addPass(createConvertControlFlowToLLVMPass()); + pm.addPass(createCanonicalizerPass()); + pm.addPass(createCSEPass()); +} + +int runTool(StringRef Program, ArrayRef Args, bool Verbose, + std::optional> Env = std::nullopt) { + if (Verbose) { + llvm::outs() << "Run:"; + if (Env) + for (auto &s : *Env) llvm::outs() << " " << s; + llvm::outs() << " " << Program; + for (auto &s : Args) llvm::outs() << " " << s; + llvm::outs() << "\n"; + } + std::string err_msg; + sys::ProcessStatistics stats; + std::optional opt_stats(stats); + SmallVector PArgs = {Program}; + PArgs.append(Args.begin(), Args.end()); + int result = sys::ExecuteAndWait(Program, PArgs, Env, {}, 0, 0, &err_msg, + nullptr, &opt_stats); + if (Verbose) + llvm::outs() << (result == 0 ? 
"Succeeded " : "Failed ") << "in " + << std::chrono::duration_cast>( + stats.TotalTime) + .count() + << " code: " << result << "\n"; + return result; +} + +const char *_CHESS_INTRINSIC_WRAPPER_LL = R"chess( +; ModuleID = 'aie_runtime_lib/AIE2/chess_intrinsic_wrapper.cpp' +source_filename = "aie_runtime_lib/AIE2/chess_intrinsic_wrapper.cpp" + +%struct.ipd.custom_type.uint2_t.uint2_t = type { i2 } + +; Function Attrs: mustprogress nounwind +define dso_local void @llvm___aie2___acquire(i32 noundef %0, i32 noundef %1) local_unnamed_addr addrspace(1) #0 { + tail call addrspace(1) void @llvm.chess_memory_fence() + tail call addrspace(1) void @_Z25chess_separator_schedulerv() #4 + tail call x86_regcallcc addrspace(1) void @__regcall3__chessintr_void_acquire_guarded___uint___uint(i32 zeroext %0, i32 zeroext %1) #4 + tail call addrspace(1) void @_Z25chess_separator_schedulerv() #4 + tail call addrspace(1) void @llvm.chess_memory_fence() + ret void +} + +; Function Attrs: mustprogress nounwind +define dso_local void @llvm___aie2___release(i32 noundef %0, i32 noundef %1) local_unnamed_addr addrspace(1) #0 { + tail call addrspace(1) void @llvm.chess_memory_fence() + tail call addrspace(1) void @_Z25chess_separator_schedulerv() #4 + tail call x86_regcallcc addrspace(1) void @__regcall3__chessintr_void_release_guarded___uint___sint(i32 zeroext %0, i32 signext %1) #4 + tail call addrspace(1) void @_Z25chess_separator_schedulerv() #4 + tail call addrspace(1) void @llvm.chess_memory_fence() + ret void +} + +; Function Attrs: nounwind +define dso_local void @llvm___aie___event0() local_unnamed_addr addrspace(1) #1 { + tail call x86_regcallcc addrspace(1) void @__regcall3__chessintr_void_event_uint2_t(%struct.ipd.custom_type.uint2_t.uint2_t zeroinitializer) #4 + ret void +} + +; Function Attrs: nounwind +define dso_local void @llvm___aie___event1() local_unnamed_addr addrspace(1) #1 { + tail call x86_regcallcc addrspace(1) void 
@__regcall3__chessintr_void_event_uint2_t(%struct.ipd.custom_type.uint2_t.uint2_t { i2 1 }) #4 + ret void +} + +; Function Attrs: mustprogress nounwind willreturn +declare void @llvm.chess_memory_fence() addrspace(1) #2 + +; Function Attrs: inaccessiblememonly nounwind +declare dso_local void @_Z25chess_separator_schedulerv() local_unnamed_addr addrspace(1) #3 + +; Function Attrs: inaccessiblememonly nounwind +declare dso_local x86_regcallcc void @__regcall3__chessintr_void_acquire_guarded___uint___uint(i32 zeroext, i32 zeroext) local_unnamed_addr addrspace(1) #3 + +; Function Attrs: inaccessiblememonly nounwind +declare dso_local x86_regcallcc void @__regcall3__chessintr_void_release_guarded___uint___sint(i32 zeroext, i32 signext) local_unnamed_addr addrspace(1) #3 + +; Function Attrs: inaccessiblememonly nounwind +declare dso_local x86_regcallcc void @__regcall3__chessintr_void_event_uint2_t(%struct.ipd.custom_type.uint2_t.uint2_t) local_unnamed_addr addrspace(1) #3 + +attributes #0 = { mustprogress nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtin-memcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #1 = { nounwind "frame-pointer"="all" "min-legal-vector-width"="0" "no-builtin-memcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #2 = { mustprogress nounwind willreturn } +attributes #3 = { inaccessiblememonly nounwind "frame-pointer"="all" "no-builtin-memcpy" "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +attributes #4 = { inaccessiblememonly nounwind "no-builtin-memcpy" } + +!llvm.linker.options = !{} +!llvm.module.flags = !{!0, !1} + +!0 = !{i32 1, !"wchar_size", i32 4} +!1 = !{i32 7, !"frame-pointer", i32 2} +)chess"; + +std::vector chessArgs(const std::string &AIEToolsDir, + std::string workDir) { + SmallString<64> chessClang(AIEToolsDir); + sys::path::append(chessClang, "tps", "lnx64", "target"); + sys::path::append(chessClang, "bin", "LNa64bin", "chess-clang"); + 
SmallString<64> procModelLib(AIEToolsDir); + sys::path::append(procModelLib, "data", "aie_ml", "lib"); + return { + "+P", + "4", // parallel compilation (function + file level) + "-p", + "me", // parallel compilation (function level only) + "-C", + "Release_LLVM", // configuration + "-D__AIENGINE__", + "-D__AIE_ARCH__=20", + "-D__AIEARCH__=20", + "-Y", + "clang=" + chessClang.str().str(), + "-P", + procModelLib.str().str(), // processor model directory + "-d", // disassemble output + "-f", // use LLVM frontend + "+w", + std::move(workDir), + }; +} + +// Generate the elf files for the core +static LogicalResult generateCoreElfFiles(ModuleOp moduleOp, + const StringRef objFile, + XCLBinGenConfig &TK) { + auto deviceOps = moduleOp.getOps(); + if (!llvm::hasSingleElement(deviceOps)) + return moduleOp.emitOpError("expected a single device op"); + + AIE::DeviceOp deviceOp = *deviceOps.begin(); + auto tileOps = deviceOp.getOps(); + + std::string errorMessage; + + for (auto tileOp : tileOps) { + int col = tileOp.colIndex(); + int row = tileOp.rowIndex(); + auto coreOp = tileOp.getCoreOp(); + if (!coreOp) continue; + + std::string elfFileName; + if (auto fileAttr = coreOp.getElfFileAttr()) { + elfFileName = std::string(fileAttr.getValue()); + } else { + elfFileName = std::string("core_") + std::to_string(col) + "_" + + std::to_string(row) + ".elf"; + coreOp.setElfFile(elfFileName); + } + + SmallString<64> elfFile(TK.TempDir); + sys::path::append(elfFile, elfFileName); + + // Use xbridge (to remove any peano dependency with use-chess option) + SmallString<64> bcfPath(TK.TempDir); + sys::path::append(bcfPath, elfFileName + ".bcf"); + + { + auto bcfOutput = openOutputFile(bcfPath, &errorMessage); + if (!bcfOutput) return coreOp.emitOpError(errorMessage); + + if (failed(AIE::AIETranslateToBCF(moduleOp, bcfOutput->os(), col, row))) + return coreOp.emitOpError("Failed to generate BCF"); + bcfOutput->keep(); + } + + std::vector extractedIncludes; + { + auto bcfFileIn = 
openInputFile(bcfPath, &errorMessage); + if (!bcfFileIn) moduleOp.emitOpError(errorMessage); + + std::string bcfFile = std::string(bcfFileIn->getBuffer()); + std::regex r("_include _file (.*)"); + auto begin = std::sregex_iterator(bcfFile.begin(), bcfFile.end(), r); + auto end = std::sregex_iterator(); + for (std::sregex_iterator i = begin; i != end; ++i) + extractedIncludes.push_back(i->str(1)); + } + + SmallString<64> chessExe(TK.AIEToolsDir); + sys::path::append(chessExe, "bin", "unwrapped", "lnx64.o", "xchesscc"); + SmallString<64> chessworkDir(TK.TempDir); + sys::path::append(chessworkDir, "chesswork"); + SmallVector flags{"+l", std::string(bcfPath), + "-o", std::string(elfFile), + "-f", std::string(objFile)}; + for (const auto &inc : extractedIncludes) flags.push_back(inc); + auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str()); + chessArgs_.insert(chessArgs_.end(), flags.begin(), flags.end()); + if (runTool(chessExe, chessArgs_, TK.Verbose) != 0) + coreOp.emitOpError("Failed to link with xbridge"); + } + return success(); +} + +static LogicalResult generateCDO(MLIRContext *context, ModuleOp moduleOp, + XCLBinGenConfig &TK) { + ModuleOp copy = moduleOp.clone(); + std::string errorMessage; + // This corresponds to `process_host_cgen`, which is listed as host + // compilation in aiecc.py... not sure we need this. 
+ PassManager passManager(context, ModuleOp::getOperationName()); + applyConfigToPassManager(TK, passManager); + + passManager.addNestedPass(AIE::createAIEPathfinderPass()); + passManager.addNestedPass( + AIEX::createAIEBroadcastPacketPass()); + passManager.addNestedPass( + AIE::createAIERoutePacketFlowsPass()); + passManager.addNestedPass(AIEX::createAIELowerMulticastPass()); + if (failed(passManager.run(copy))) + return moduleOp.emitOpError( + "failed to run passes to prepare of XCLBin generation"); + + if (failed(AIE::AIETranslateToCDODirect(copy, TK.TempDir))) + return moduleOp.emitOpError("failed to emit CDO"); + + copy->erase(); + return success(); +} + +static json::Object makeKernelJSON(std::string name, std::string id, + std::string instance) { + return json::Object{ + {"name", name}, + {"type", "dpu"}, + {"extended-data", + json::Object{ + {"subtype", "DPU"}, {"functional", "0"}, {"dpu_kernel_id", id}}}, + {"arguments", json::Array{json::Object{{"name", "opcode"}, + {"address-qualifier", "SCALAR"}, + {"type", "uint64_t"}, + {"offset", "0x00"}}, + json::Object{{"name", "instr"}, + {"memory-connection", "SRAM"}, + {"address-qualifier", "GLOBAL"}, + {"type", "char *"}, + {"offset", "0x08"}}, + json::Object{{"name", "ninstr"}, + {"address-qualifier", "SCALAR"}, + {"type", "uint32_t"}, + {"offset", "0x10"}}, + json::Object{{"name", "bo0"}, + {"memory-connection", "HOST"}, + {"address-qualifier", "GLOBAL"}, + {"type", "void*"}, + {"offset", "0x14"}}, + json::Object{{"name", "bo1"}, + {"memory-connection", "HOST"}, + {"address-qualifier", "GLOBAL"}, + {"type", "void*"}, + {"offset", "0x1c"}}, + json::Object{{"name", "bo2"}, + {"memory-connection", "HOST"}, + {"address-qualifier", "GLOBAL"}, + {"type", "void*"}, + {"offset", "0x24"}}, + json::Object{{"name", "bo3"}, + {"memory-connection", "HOST"}, + {"address-qualifier", "GLOBAL"}, + {"type", "void*"}, + {"offset", "0x2c"}}, + json::Object{{"name", "bo4"}, + {"memory-connection", "HOST"}, + {"address-qualifier", 
"GLOBAL"}, + {"type", "void*"}, + {"offset", "0x34"}}}}, + {"instances", json::Array{json::Object{{"name", instance}}}}}; +} + +static LogicalResult generateXCLBin(MLIRContext *context, ModuleOp moduleOp, + XCLBinGenConfig &TK, + const StringRef &Output) { + std::string errorMessage; + // Create mem_topology.json. + SmallString<64> memTopologyJsonFile(TK.TempDir); + sys::path::append(memTopologyJsonFile, "mem_topology.json"); + { + auto memTopologyJsonOut = + openOutputFile(memTopologyJsonFile, &errorMessage); + if (!memTopologyJsonOut) return moduleOp.emitOpError(errorMessage); + + std::string mem_topology_data = R"({ + "mem_topology": { + "m_count": "2", + "m_mem_data": [ + { + "m_type": "MEM_DRAM", + "m_used": "1", + "m_sizeKB": "0x10000", + "m_tag": "HOST", + "m_base_address": "0x4000000" + }, + { + "m_type": "MEM_DRAM", + "m_used": "1", + "m_sizeKB": "0xc000", + "m_tag": "SRAM", + "m_base_address": "0x4000000" + } + ] + } + })"; + memTopologyJsonOut->os() << mem_topology_data; + memTopologyJsonOut->keep(); + } + + // Create aie_partition.json. 
+ SmallString<64> aiePartitionJsonFile(TK.TempDir); + sys::path::append(aiePartitionJsonFile, "aie_partition.json"); + { + auto aiePartitionJsonOut = + openOutputFile(aiePartitionJsonFile, &errorMessage); + if (!aiePartitionJsonOut) return moduleOp.emitOpError(errorMessage); + + std::string uuid_str = getUUIDString(); + std::string aie_partition_json_data = R"( + { + "aie_partition": { + "name": "QoS", + "operations_per_cycle": "2048", + "inference_fingerprint": "23423", + "pre_post_fingerprint": "12345", + "partition": { + "column_width": 4, + "start_columns": [ + 1 + ] + }, + "PDIs": [ + { + "uuid": ")" + uuid_str + R"(", + "file_name": "./design.pdi", + "cdo_groups": [ + { + "name": "DPU", + "type": "PRIMARY", + "pdi_id": "0x01", + "dpu_kernel_ids": [ + "0x901" + ], + "pre_cdo_groups": [ + "0xC1" + ] + } + ] + } + ] + } + } + )"; + aiePartitionJsonOut->os() << aie_partition_json_data; + aiePartitionJsonOut->keep(); + } + + // Create kernels.json. + SmallString<64> kernelsJsonFile(TK.TempDir); + sys::path::append(kernelsJsonFile, "kernels.json"); + { + auto kernelsJsonOut = openOutputFile(kernelsJsonFile, &errorMessage); + if (!kernelsJsonOut) return moduleOp.emitOpError(errorMessage); + + json::Object kernels_data{ + {"ps-kernels", + json::Object{ + {"kernels", + json::Array{// TODO: Support for multiple kernels + makeKernelJSON(TK.XCLBinKernelName, TK.XCLBinKernelID, + TK.XCLBinInstanceName)}}}}}; + kernelsJsonOut->os() << formatv("{0:2}", + json::Value(std::move(kernels_data))); + kernelsJsonOut->keep(); + } + // Create design.bif. 
+ SmallString<64> designBifFile(TK.TempDir); + sys::path::append(designBifFile, "design.bif"); + { + auto designBifOut = openOutputFile(designBifFile, &errorMessage); + if (!designBifOut) return moduleOp.emitOpError(errorMessage); + + designBifOut->os() << "all:\n" + << "{\n" + << " id_code = 0x14ca8093\n" + << " extended_id_code = 0x01\n" + << " image\n" + << " {\n" + << " name=aie_image, id=0x1c000000\n" + << " { type=cdo\n" + << " file=" << TK.TempDir << "/aie_cdo_elfs.bin\n" + << " file=" << TK.TempDir << "/aie_cdo_init.bin\n" + << " file=" << TK.TempDir << "/aie_cdo_enable.bin\n" + << " }\n" + << " }\n" + << "}"; + designBifOut->keep(); + } + + // Execute the bootgen command. + SmallString<64> designPdiFile(TK.TempDir); + sys::path::append(designPdiFile, "design.pdi"); + { + SmallVector flags{"-arch", "versal", + "-image", std::string(designBifFile), + "-o", std::string(designPdiFile), + "-w"}; + + // use ./Xilinx/Vitis/2023.2/bin/bootgen for now (will link to lib soon) + + if (auto bootgen = sys::findProgramByName("bootgen")) { + if (runTool(*bootgen, flags, TK.Verbose) != 0) + return moduleOp.emitOpError("failed to execute bootgen"); + } else { + return moduleOp.emitOpError("could not find bootgen"); + } + } + + // Execute the xclbinutil command. 
+ { + std::string memArg = + "MEM_TOPOLOGY:JSON:" + std::string(memTopologyJsonFile); + std::string partArg = + "AIE_PARTITION:JSON:" + std::string(aiePartitionJsonFile); + SmallVector flags{"--add-replace-section", + memArg, + "--add-kernel", + std::string(kernelsJsonFile), + "--add-replace-section", + partArg, + "--force", + "--output", + std::string(Output)}; + + if (auto xclbinutil = sys::findProgramByName("xclbinutil")) { + if (runTool(*xclbinutil, flags, TK.Verbose) != 0) + return moduleOp.emitOpError("failed to execute xclbinutil"); + } else { + return moduleOp.emitOpError("could not find xclbinutil"); + } + } + return success(); +} + +static std::string chesshack(const std::string &input) { + std::string result(input); + static const std::unordered_map substitutions{ // keys are regexes: literal parentheses must be escaped + {"memory\\(none\\)", "readnone"}, + {"memory\\(read\\)", "readonly"}, + {"memory\\(write\\)", "writeonly"}, + {"memory\\(argmem: readwrite\\)", "argmemonly"}, + {"memory\\(argmem: read\\)", "argmemonly readonly"}, + {"memory\\(argmem: write\\)", "argmemonly writeonly"}, + {"memory\\(inaccessiblemem: write\\)", "inaccessiblememonly writeonly"}, + {"memory\\(inaccessiblemem: readwrite\\)", "inaccessiblememonly"}, + {"memory\\(inaccessiblemem: read\\)", "inaccessiblememonly readonly"}, + {"memory\\(argmem: readwrite, inaccessiblemem: readwrite\\)", + "inaccessiblemem_or_argmemonly"}, + {"memory\\(argmem: read, inaccessiblemem: read\\)", + "inaccessiblemem_or_argmemonly readonly"}, + {"memory\\(argmem: write, inaccessiblemem: write\\)", + "inaccessiblemem_or_argmemonly writeonly"}, + }; + for (const auto &pair : substitutions) + result = std::regex_replace(result, std::regex(pair.first), pair.second); + return result; +} + +// A pass which removes the alignment attribute from llvm load operations, if +// the alignment is less than 4 (2 or 1). 
+// +// Example replaces: +// +// ``` +// %113 = llvm.load %112 {alignment = 2 : i64} : !llvm.ptr -> vector<32xbf16> +// ``` +// +// with +// +// ``` +// %113 = llvm.load %112 : !llvm.ptr -> vector<32xbf16> +// ``` +// +// If this pass is not included in the pipeline, there is an alignment error +// later in the compilation. This is a temporary workaround while a better +// solution is found: propagation of memref.assume_alignment is one option. See +// also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589 +namespace { +struct RemoveAlignment2FromLLVMLoadPass + : public PassWrapper> { + void runOnOperation() override { + getOperation().walk([](Operation *op) { + if (auto loadOp = dyn_cast(op)) { + auto alignmentAttr = loadOp.getAlignmentAttr(); + if (alignmentAttr) { + int alignmentVal = alignmentAttr.getValue().getSExtValue(); + if (alignmentVal == 2 || alignmentVal == 1) { + loadOp.setAlignment(std::optional()); + } + } + } + }); + } + + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + RemoveAlignment2FromLLVMLoadPass); +}; +} // namespace + +static LogicalResult generateObject(MLIRContext *context, ModuleOp moduleOp, + XCLBinGenConfig &TK, + const std::string &outputFile) { + PassManager pm(context, moduleOp.getOperationName()); + applyConfigToPassManager(TK, pm); + + pm.addNestedPass(AIE::createAIELocalizeLocksPass()); + pm.addNestedPass(AIE::createAIENormalizeAddressSpacesPass()); + pm.addPass(AIE::createAIECoreToStandardPass()); + pm.addPass(AIEX::createAIEXToStandardPass()); + + // Convert specific vector dialect ops (like vector.contract) to the AIEVec + // dialect + { + xilinx::aievec::ConvertVectorToAIEVecOptions vectorToAIEVecOptions{}; + + std::string optionsString = [&]() { + std::ostringstream optionsStringStream; + optionsStringStream << "target-backend="; + optionsStringStream << (TK.UseChess ? 
"cpp" : "llvmir"); + optionsStringStream << ' ' << "aie-target=aieml"; + return optionsStringStream.str(); + }(); + + if (failed(vectorToAIEVecOptions.parseFromString(optionsString))) { + return moduleOp.emitOpError("Failed to parse options from '") + << optionsString + << "': Failed to construct ConvertVectorToAIEVecOptions."; + } + xilinx::aievec::buildConvertVectorToAIEVec(pm, vectorToAIEVecOptions); + } + + addLowerToLLVMPasses(pm); + pm.addPass(std::make_unique()); + + if (TK.Verbose) { + llvm::outs() << "Running: "; + pm.printAsTextualPipeline(llvm::outs()); + llvm::outs() << "\n"; + } + + ModuleOp copy = moduleOp.clone(); + if (failed(pm.run(copy))) + return moduleOp.emitOpError("Failed to lower to LLVM"); + + SmallString<64> LLVMIRFile(TK.TempDir); + sys::path::append(LLVMIRFile, "input.ll"); + + llvm::LLVMContext llvmContext; + auto llvmModule = translateModuleToLLVMIR(copy, llvmContext); + if (!llvmModule) + return moduleOp.emitOpError("Failed to translate module to LLVMIR"); + + std::string errorMessage; + { + auto output = openOutputFile(LLVMIRFile, &errorMessage); + if (!output) return moduleOp.emitOpError(errorMessage); + llvmModule->print(output->os(), nullptr); + output->keep(); + } + + SmallString<64> chessExe(TK.AIEToolsDir); + sys::path::append(chessExe, "bin", "unwrapped", "lnx64.o", "xchesscc"); + SmallString<64> chessworkDir(TK.TempDir); + sys::path::append(chessworkDir, "chesswork"); + SmallString<64> chessIntrinsicsLL(TK.InstallDir); + sys::path::append(chessIntrinsicsLL, "chess_intrinsic_wrapper.ll"); + { + auto chessIntrinsicWrapperLlFile = openOutputFile(chessIntrinsicsLL, &errorMessage); // capture the reason so the diagnostic is not empty + if (!chessIntrinsicWrapperLlFile) return moduleOp.emitOpError(errorMessage); + + chessIntrinsicWrapperLlFile->os() << _CHESS_INTRINSIC_WRAPPER_LL; + chessIntrinsicWrapperLlFile->keep(); + } + + std::string llvmirString; + { + raw_string_ostream llvmirStream(llvmirString); + llvmModule->print(llvmirStream, nullptr); + } + + SmallString<64> chesslinkedFile(TK.TempDir); + 
sys::path::append(chesslinkedFile, "input.chesslinked.ll"); + SmallString<64> chessLlvmLinkBin(TK.AIEToolsDir); + sys::path::append(chessLlvmLinkBin, "tps", "lnx64", "target"); + sys::path::append(chessLlvmLinkBin, "bin", "LNa64bin", "chess-llvm-link"); + + if (runTool(chessLlvmLinkBin, + {std::string(LLVMIRFile), std::string(chessIntrinsicsLL), + "--opaque-pointers=1", "-S", "-o", std::string(chesslinkedFile)}, + TK.Verbose) != 0) + moduleOp.emitOpError("Couldn't link in the intrinsics"); + + std::string mungedLLVMIR; + { + auto chesslinkedIn = openInputFile(chesslinkedFile, &errorMessage); + if (!chesslinkedIn) moduleOp.emitOpError(errorMessage); + + mungedLLVMIR = std::string(chesslinkedIn->getBuffer()); + mungedLLVMIR = chesshack(mungedLLVMIR); + } + { + auto chesslinkedOut = openOutputFile(chesslinkedFile); + if (!chesslinkedOut) moduleOp.emitOpError(errorMessage); + + chesslinkedOut->os() << mungedLLVMIR; + chesslinkedOut->keep(); + } + + auto chessArgs_ = chessArgs(TK.AIEToolsDir, chessworkDir.str().str()); + chessArgs_.push_back("-c"); + chessArgs_.push_back(std::string(chesslinkedFile)); + chessArgs_.push_back("-o"); + chessArgs_.push_back(std::string(outputFile)); + + if (runTool(chessExe, chessArgs_, TK.Verbose) != 0) + return moduleOp.emitOpError("Failed to assemble with chess"); + copy->erase(); + return success(); +} + +LogicalResult xilinx::aie2xclbin(MLIRContext *ctx, ModuleOp moduleOp, + XCLBinGenConfig &TK, StringRef OutputNPU, + StringRef OutputXCLBin) { + if (failed(xilinx::findVitis(TK))) moduleOp.emitOpError("VITIS not found"); + + PassManager pm(ctx, moduleOp.getOperationName()); + applyConfigToPassManager(TK, pm); + + addAIELoweringPasses(pm); + + if (TK.Verbose) { + llvm::outs() << "Running: "; + pm.printAsTextualPipeline(llvm::outs()); + llvm::outs() << "\n"; + } + + if (failed(pm.run(moduleOp))) + return moduleOp.emitOpError("AIE lowering pipline failed"); + + TK.TargetArch = StringRef(TK.TargetArch).trim(); + + std::regex 
target_regex("AIE.?"); + if (!std::regex_search(TK.TargetArch, target_regex)) + return moduleOp.emitOpError() + << "Unexpected target architecture: " << TK.TargetArch; + + // generateNPUInstructions + { + PassManager pm(ctx, moduleOp.getOperationName()); + applyConfigToPassManager(TK, pm); + + pm.addNestedPass(AIEX::createAIEDmaToNpuPass()); + ModuleOp copy = moduleOp.clone(); + if (failed(pm.run(copy))) + return moduleOp.emitOpError("NPU Instruction pipeline failed"); + + std::string errorMessage; + auto output = openOutputFile(OutputNPU, &errorMessage); + if (!output) { + llvm::errs() << errorMessage << "\n"; + return moduleOp.emitOpError(""); + } + + if (failed(AIE::AIETranslateToNPU(copy, output->os()))) + return moduleOp.emitOpError("NPU Instruction translation failed"); + + output->keep(); + copy->erase(); + } + + SmallString<64> object(TK.TempDir); + sys::path::append(object, "input.o"); + if (failed(generateObject(ctx, moduleOp, TK, std::string(object)))) + return moduleOp.emitOpError("Failed to generate object"); + + if (failed(generateCoreElfFiles(moduleOp, object, TK))) + return moduleOp.emitOpError("Failed to generate core ELF file(s)"); + + if (failed(generateCDO(ctx, moduleOp, TK))) + return moduleOp.emitOpError("Failed to generate CDO"); + + if (failed(generateXCLBin(ctx, moduleOp, TK, OutputXCLBin))) + return moduleOp.emitOpError("Failed to generate XCLBin"); + + return success(); +} diff --git a/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.h b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.h new file mode 100644 index 000000000..c58d7ce64 --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/aie/XCLBinGen.h @@ -0,0 +1,47 @@ +//===- XCLBinGen.h ---------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Xilinx Inc. 
+// +//===---------------------------------------------------------------------===// + +#include + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/Support/LogicalResult.h" + +#pragma once + +namespace xilinx { + +struct XCLBinGenConfig { + std::string TargetArch; + std::string PeanoDir; + std::string InstallDir; + std::string AIEToolsDir; + std::string TempDir; + bool Verbose; + std::string HostArch; + std::string XCLBinKernelName; + std::string XCLBinKernelID; + std::string XCLBinInstanceName; + bool UseChess = false; + bool PrintIRAfterAll = false; + bool PrintIRBeforeAll = false; + bool PrintIRModuleScope = false; + bool Timing = false; +}; + +mlir::LogicalResult findVitis(XCLBinGenConfig &TK); + +mlir::LogicalResult aie2xclbin(mlir::MLIRContext *ctx, mlir::ModuleOp moduleOp, + XCLBinGenConfig &TK, llvm::StringRef OutputNPU, + llvm::StringRef OutputXCLBin); + +} // namespace xilinx diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp index fdf299c2b..ae2497796 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETargetDirect.cpp @@ -9,8 +9,12 @@ #include #include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIEVec/IR/AIEVecDialect.h" #include "aie/Dialect/AIEX/IR/AIEXDialect.h" +#include "aie/Dialect/XLLVM/XLLVMDialect.h" #include "aie/Passes.h" +#include "aie/Target/LLVMIR/Dialect/XLLVM/XLLVMToLLVMIRTranslation.h" +#include "aie/XCLBinGen.h" #include "air/Dialect/AIR/AIRDialect.h" #include "air/Dialect/AIRRt/AIRRtDialect.h" #include "iree-amd-aie/IR/AMDAIEDialect.h" @@ -29,15 +33,22 @@ #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/ToolOutputFile.h" +#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVM.h" 
#include "mlir/Conversion/Passes.h" #include "mlir/Dialect/Affine/Passes.h" +#include "mlir/Dialect/DLTI/DLTI.h" +#include "mlir/Dialect/EmitC/IR/EmitC.h" +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" #include "mlir/Dialect/Func/IR/FuncOps.h" #include "mlir/Dialect/Linalg/Passes.h" +#include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/Transforms/Passes.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Vector/IR/VectorOps.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" #include "mlir/Support/FileUtilities.h" +#include "mlir/Target/LLVMIR/Dialect/All.h" #include "mlir/Target/LLVMIR/Export.h" #include "mlir/Transforms/Passes.h" #include "runtime/plugins/AMD-AIE/iree-amd-aie/schemas/xrt_executable_def_builder.h" @@ -126,7 +137,22 @@ class AIETargetDirectBackend final : public IREE::HAL::TargetBackend { IREE::LinalgExt::IREELinalgExtDialect, transform::TransformDialect, xilinx::AIE::AIEDialect, xilinx::AIEX::AIEXDialect, xilinx::air::airDialect, - xilinx::airrt::AIRRtDialect>(); + xilinx::xllvm::XLLVMDialect, xilinx::aievec::AIEVecDialect, + emitc::EmitCDialect, LLVM::LLVMDialect, func::FuncDialect, + cf::ControlFlowDialect, DLTIDialect, arith::ArithDialect, + memref::MemRefDialect, math::MathDialect, + vector::VectorDialect, xilinx::airrt::AIRRtDialect>(); + + registerBuiltinDialectTranslation(registry); + registerLLVMDialectTranslation(registry); + xilinx::xllvm::registerXLLVMDialectTranslation(registry); + arith::registerConvertArithToLLVMInterface(registry); + cf::registerConvertControlFlowToLLVMInterface(registry); + func::registerAllExtensions(registry); + registerConvertFuncToLLVMInterface(registry); + index::registerConvertIndexToLLVMInterface(registry); + registerConvertMathToLLVMInterface(registry); + registerConvertMemRefToLLVMInterface(registry); } void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr, @@ -153,10 +179,6 @@ LogicalResult 
AIETargetDirectBackend::serializeExecutable( llvm::join_items("_", serOptions.dumpBaseName, variantOp.getName()); auto maybeWorkDir = [&]() -> FailureOr> { - // If a path for intermediates has been specified, assume it is common for - // all executables compiling in parallel, and so create an - // executable-specific subdir to keep this executable's intermediates - // separate. if (!serOptions.dumpIntermediatesPath.empty()) { SmallString<128> workDir{serOptions.dumpIntermediatesPath}; llvm::sys::path::append(workDir, basename); @@ -169,9 +191,6 @@ LogicalResult AIETargetDirectBackend::serializeExecutable( return workDir; } - // No path for intermediates: make a temporary directory for this - // executable that is certain to be distinct from the dir of any other - // executable. SmallString<128> workDirFromScratch; auto err = llvm::sys::fs::createUniqueDirectory( /* prefix = */ variantOp.getName(), workDirFromScratch); @@ -187,47 +206,75 @@ LogicalResult AIETargetDirectBackend::serializeExecutable( if (failed(maybeWorkDir)) return failure(); auto workDir = maybeWorkDir.value(); - ModuleOp coreMod = moduleOp.clone(); - PassManager passManager(coreMod->getContext(), ModuleOp::getOperationName()); - passManager.addPass(xilinx::AIE::createAIECoreToStandardPass()); - passManager.addPass(xilinx::AIEX::createAIEXToStandardPass()); - // convert to LLVM dialect - passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createCSEPass()); - passManager.addPass(createConvertVectorToLLVMPass()); - passManager.addPass(createIREEExpandStridedMetadataPass()); - passManager.addPass(createLowerAffinePass()); - passManager.addPass(createConvertMathToLLVMPass()); - passManager.addPass(createArithToLLVMConversionPass()); - passManager.addPass(createIREEExpandStridedMetadataPass()); - passManager.addPass(createFinalizeMemRefToLLVMConversionPass()); - ConvertFuncToLLVMPassOptions funcToLlvmPassOptions; - funcToLlvmPassOptions.useBarePtrCallConv = true; - 
passManager.addPass(createConvertFuncToLLVMPass(funcToLlvmPassOptions)); - passManager.addPass(createConvertControlFlowToLLVMPass()); - passManager.addPass(createCanonicalizerPass()); - passManager.addPass(createCSEPass()); - - if (failed(passManager.run(coreMod))) { - variantOp.emitError() << "failed to run translation of source " - "executable to target executable for backend " - << variantOp.getTarget(); - return failure(); + xilinx::XCLBinGenConfig TK; + TK.TempDir = workDir.str(); + TK.TargetArch = "AIE2"; + TK.UseChess = true; + TK.Verbose = true; + + SmallVector entryPointNames; + for (auto exportOp : variantOp.getExportOps()) { + entryPointNames.emplace_back(exportOp.getSymName().substr(0, 48)); } - std::string llvmir; - llvm::raw_string_ostream os(llvmir); - llvm::LLVMContext llvmContext; - auto llvmModule = translateModuleToLLVMIR(coreMod, llvmContext); - llvmModule->dump(); - llvmModule->print(os, nullptr); + if (entryPointNames.size() != 1) { + return moduleOp.emitOpError("Expected a single entry point"); + } - if (failed(xilinx::AIE::AIETranslateToCDODirect(moduleOp, workDir))) + TK.XCLBinKernelName = entryPointNames[0]; + TK.XCLBinKernelID = "0x101"; + TK.XCLBinInstanceName = "FOO"; + SmallString<128> xclbinPath(workDir); + llvm::sys::path::append(xclbinPath, basename + ".xclbin"); + SmallString<128> npuInstPath(workDir); + llvm::sys::path::append(npuInstPath, basename + ".npu.txt"); + + if (failed(aie2xclbin(variantOp->getContext(), moduleOp, TK, npuInstPath, + xclbinPath))) return failure(); - moduleOp.emitOpError( - "unimplemented AIETargetDirectBackend::serializeExecutable"); - return failure(); + std::vector npuInstrs; + + std::ifstream instrFile(static_cast(npuInstPath)); + std::string line; + while (std::getline(instrFile, line)) { + std::istringstream iss(line); + uint32_t a; + if (!(iss >> std::hex >> a)) { + return moduleOp.emitOpError("Unable to parse instruction file"); + } + npuInstrs.push_back(a); + } + + std::string errorMessage; + auto 
xclbinIn = openInputFile(xclbinPath, &errorMessage); + if (!xclbinIn) { + moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage; + } + + // Serialize the executable to flatbuffer format + FlatbufferBuilder builder; + iree_amd_aie_hal_xrt_ExecutableDef_start_as_root(builder); + auto entryPointsRef = builder.createStringVec(entryPointNames); + + iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef); + iree_amd_aie_hal_xrt_AsmInstDef_vec_start(builder); + auto npuInstrsVec = builder.createInt32Vec(npuInstrs); + iree_amd_aie_hal_xrt_AsmInstDef_vec_push_create(builder, npuInstrsVec); + auto npuInstrsRef = iree_amd_aie_hal_xrt_AsmInstDef_vec_end(builder); + iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_add(builder, npuInstrsRef); + auto xclbinStringRef = builder.createString(xclbinIn->getBuffer()); + iree_amd_aie_hal_xrt_ExecutableDef_xclbins_add(builder, xclbinStringRef); + iree_amd_aie_hal_xrt_ExecutableDef_end_as_root(builder); + + auto binaryOp = executableBuilder.create( + variantOp.getLoc(), variantOp.getSymName(), + variantOp.getTarget().getFormat(), + builder.getBufferAttr(executableBuilder.getContext())); + binaryOp.setMimeTypeAttr( + executableBuilder.getStringAttr("application/x-flatbuffers")); + + return success(); } std::shared_ptr createTargetDirect( diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt index 2b1f31265..8be489156 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/CMakeLists.txt @@ -19,7 +19,7 @@ iree_cc_library( iree::compiler::Dialect::HAL::Target iree::target::amd-aie::IR::AMDAIEDialect iree::target::amd-aie::Transforms - iree::target::amd-aie::aie::AIETargetCDODirect + iree::target::amd-aie::aie::AIETargets iree::target::amd-aie::air::AIRDialectIR iree::base::internal::flatcc::building 
iree::base::internal::flatcc::parsing diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/CMakeLists.txt index efd23fb3c..09879d9d6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/CMakeLists.txt @@ -8,6 +8,7 @@ iree_lit_test_suite( NAME lit SRCS + buffers_xclbin.mlir basic_dma_transpose.mlir basic_matrix_multiplication_matrix_vector.mlir basic_matrix_multiplication_single_core.mlir diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir index a2605c077..4b5a2cb94 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/basic_dma_transpose.mlir @@ -1,6 +1,7 @@ -// RUN: not iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct %s 2>&1 | FileCheck %s +// RUN: iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct %s | FileCheck %s // CHECK: Generating:{{.*}}aie_cdo_elfs.bin +// CHECK: Successfully wrote{{.*}}module_dummy1_amdaie_xclbin_fb.xclbin module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} { hal.executable private @dummy1 { hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/buffers_xclbin.mlir 
b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/buffers_xclbin.mlir new file mode 100644 index 000000000..27f636bcd --- /dev/null +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/tests/lower_objfifo/buffers_xclbin.mlir @@ -0,0 +1,148 @@ +// RUN: iree-compile --compile-mode=hal-executable --iree-hal-target-backends=amd-aie-direct %s --iree-hal-dump-executable-files-to %T +// RUN: FileCheck %s --input-file=%T/module_dummy1_amdaie_xclbin_fb/kernels.json + +// CHECK: { +// CHECK: "ps-kernels": { +// CHECK: "kernels": [ +// CHECK: { +// CHECK: "arguments": [ +// CHECK: { +// CHECK: "address-qualifier": "SCALAR", +// CHECK: "name": "opcode", +// CHECK: "offset": "0x00", +// CHECK: "type": "uint64_t" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "SRAM", +// CHECK: "name": "instr", +// CHECK: "offset": "0x08", +// CHECK: "type": "char *" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "SCALAR", +// CHECK: "name": "ninstr", +// CHECK: "offset": "0x10", +// CHECK: "type": "uint32_t" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "HOST", +// CHECK: "name": "bo0", +// CHECK: "offset": "0x14", +// CHECK: "type": "void*" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "HOST", +// CHECK: "name": "bo1", +// CHECK: "offset": "0x1c", +// CHECK: "type": "void*" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "HOST", +// CHECK: "name": "bo2", +// CHECK: "offset": "0x24", +// CHECK: "type": "void*" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "HOST", +// CHECK: "name": "bo3", +// CHECK: "offset": "0x2c", +// CHECK: "type": "void*" +// CHECK: }, +// CHECK: { +// CHECK: "address-qualifier": "GLOBAL", +// CHECK: "memory-connection": "HOST", +// CHECK: "name": "bo4", +// CHECK: 
"offset": "0x34", +// CHECK: "type": "void*" +// CHECK: } +// CHECK: ], +// CHECK: "extended-data": { +// CHECK: "dpu_kernel_id": "0x101", +// CHECK: "functional": "0", +// CHECK: "subtype": "DPU" +// CHECK: }, +// CHECK: "instances": [ +// CHECK: { +// CHECK: "name": "FOO" +// CHECK: } +// CHECK: ], +// CHECK: "name": "dummy2", +// CHECK: "type": "dpu" +// CHECK: } +// CHECK: ] +// CHECK: } +// CHECK: } + + + +module attributes {hal.device.targets = [#hal.device.target<"amd-aie-direct", [#hal.executable.target<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>]>]} { + hal.executable private @dummy1 { + hal.executable.variant public @amdaie_xclbin_fb target(<"amd-aie-direct", "amdaie-xclbin-fb", {target_arch = "chip-tbd", ukernels = "none"}>) { + hal.executable.export public @dummy2 ordinal(0) layout(#hal.pipeline.layout]>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>]} { + ^bb0(%arg0: !hal.device): + %x, %y, %z = flow.dispatch.workgroup_count_from_slice + hal.return %x, %y, %z : index, index, index + } + builtin.module { + aie.device(npu1_4col) { + memref.global "public" @in0 : memref<1024xi32> + memref.global "public" @out0 : memref<1024xi32> + memref.global "public" @in1 : memref<1024xi32> + memref.global "public" @out1 : memref<1024xi32> + memref.global "public" @in2 : memref<1024xi32> + memref.global "public" @out2 : memref<1024xi32> + %02 = aie.tile(0, 2) + %12 = aie.tile(1, 2) + %22 = aie.tile(2, 2) + + aie.core(%12) { + aie.end + } + aie.shim_dma_allocation @in0(MM2S, 0, 0) + aie.shim_dma_allocation @out0(S2MM, 0, 0) + aie.shim_dma_allocation @in1(MM2S, 1, 0) + aie.shim_dma_allocation @out1(S2MM, 1, 0) + aie.shim_dma_allocation @in2(MM2S, 2, 0) + aie.shim_dma_allocation @out2(S2MM, 2, 0) + + func.func @sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>, %arg2: memref<1024xi32>, %arg3: memref<1024xi32>, %arg4: memref<1024xi32>, %arg5: memref<1024xi32>) { + aiex.npu.dma_memcpy_nd(0, 0, %arg0[0, 
0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 0 : i64, metadata = @in0} : memref<1024xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg1[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 1 : i64, metadata = @out0} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg2[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in1} : memref<1024xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out1} : memref<1024xi32> + aiex.npu.sync {channel = 1 : i32, column = 0 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + aiex.npu.dma_memcpy_nd(0, 0, %arg4[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 2 : i64, metadata = @in2} : memref<1024xi32> + aiex.npu.dma_memcpy_nd(0, 0, %arg5[0, 0, 0, 0][1, 1, 1, 1024][0, 0, 0]) {id = 3 : i64, metadata = @out2} : memref<1024xi32> + aiex.npu.sync {channel = 0 : i32, column = 2 : i32, column_num = 1 : i32, direction = 0 : i32, row = 0 : i32, row_num = 1 : i32} + return + } + } + } + } + } + util.func public @dummy3(%arg0: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = ""}} { + // this is all gibberish just to hit serializeExecutable + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %element_type_i8 = hal.element_type : i32 + %dense_row_major = hal.encoding_type : i32 + hal.buffer_view.assert<%arg0 : !hal.buffer_view> message("input0") shape([%c1, %c1]) type(%element_type_i8) encoding(%dense_row_major) + %0 = stream.tensor.import %arg0 : !hal.buffer_view -> tensor<1024x512xi8> in !stream.resource{%c1} + %result, %result_timepoint = stream.resource.alloca uninitialized : !stream.resource{%c1} => !stream.timepoint + + %2 = stream.cmd.execute await(%result_timepoint) => with(%0 as %arg2: !stream.resource{%c1}) { + stream.cmd.dispatch 
@dummy1::@amdaie_xclbin_fb::@dummy2 { + ro %arg2[%c0 for %c1] : !stream.resource{%c1} + } + } => !stream.timepoint + %3 = stream.timepoint.await %2 => %result : !stream.resource{%c1} + %4 = stream.tensor.export %3 : tensor<1024x1024xi32> in !stream.resource{%c1} -> !hal.buffer_view + util.return %4 : !hal.buffer_view + } +} \ No newline at end of file diff --git a/lit.cfg.py b/lit.cfg.py index cc344cbed..4c4236f66 100644 --- a/lit.cfg.py +++ b/lit.cfg.py @@ -12,6 +12,7 @@ import os import tempfile +from pathlib import Path import lit.formats @@ -28,6 +29,12 @@ } ) +config.environment["VITIS"] = "/opt/Xilinx/Vitis/2023.2" +XILINXD_LICENSE_FILE = Path( + os.getenv("XILINXD_LICENSE_FILE", Path.home() / ".Xilinx/aie.lic") +).absolute() +config.environment["XILINXD_LICENSE_FILE"] = str(XILINXD_LICENSE_FILE) + # Use the most preferred temp directory. config.test_exec_root = ( os.environ.get("TEST_UNDECLARED_OUTPUTS_DIR")