diff --git a/include/aie-c/Translation.h b/include/aie-c/Translation.h index 52a0790b29..d51cd21499 100644 --- a/include/aie-c/Translation.h +++ b/include/aie-c/Translation.h @@ -19,7 +19,8 @@ MLIR_CAPI_EXPORTED MlirStringRef aieTranslateAIEVecToCpp(MlirOperation op, bool aie2); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateModuleToLLVMIR(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToNPU(MlirOperation op); -MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToControlPackets(MlirOperation op); +MLIR_CAPI_EXPORTED MlirStringRef +AIETranslateControlPacketsToUI32Vec(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToXAIEV2(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToHSA(MlirOperation op); MLIR_CAPI_EXPORTED MlirStringRef aieTranslateToBCF(MlirOperation op, int col, @@ -34,6 +35,9 @@ aieTranslateToCDODirect(MlirOperation moduleOp, MlirStringRef workDirPath, MLIR_CAPI_EXPORTED MlirLogicalResult aieTranslateToTxn( MlirOperation moduleOp, MlirStringRef outputFile, MlirStringRef workDirPath, bool aieSim, bool xaieDebug, bool enableCores); +MLIR_CAPI_EXPORTED MlirLogicalResult aieTranslateToCtrlpkt( + MlirOperation moduleOp, MlirStringRef outputFile, MlirStringRef workDirPath, + bool aieSim, bool xaieDebug, bool enableCores); MLIR_CAPI_EXPORTED MlirOperation aieTranslateBinaryToTxn(MlirContext ctx, MlirStringRef binary); diff --git a/include/aie/Targets/AIETargets.h b/include/aie/Targets/AIETargets.h index ea756994e6..4dd1581591 100644 --- a/include/aie/Targets/AIETargets.h +++ b/include/aie/Targets/AIETargets.h @@ -37,10 +37,11 @@ mlir::LogicalResult AIETranslateGraphXPE(mlir::ModuleOp module, mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp module, llvm::raw_ostream &output); mlir::LogicalResult AIETranslateToNPU(mlir::ModuleOp, std::vector &); -mlir::LogicalResult AIETranslateToControlPackets(mlir::ModuleOp module, - llvm::raw_ostream &output); -mlir::LogicalResult AIETranslateToControlPackets(mlir::ModuleOp, - 
std::vector &); +mlir::LogicalResult +AIETranslateControlPacketsToUI32Vec(mlir::ModuleOp module, + llvm::raw_ostream &output); +mlir::LogicalResult +AIETranslateControlPacketsToUI32Vec(mlir::ModuleOp, std::vector &); mlir::LogicalResult AIETranslateToLdScript(mlir::ModuleOp module, llvm::raw_ostream &output, int tileCol, int tileRow); @@ -63,6 +64,11 @@ AIETranslateToTxn(mlir::ModuleOp m, llvm::raw_ostream &output, llvm::StringRef workDirPath, bool outputBinary = false, bool aieSim = false, bool xaieDebug = false, bool enableCores = true); +mlir::LogicalResult +AIETranslateToControlPackets(mlir::ModuleOp m, llvm::raw_ostream &output, + llvm::StringRef workDirPath, + bool outputBinary = false, bool aieSim = false, + bool xaieDebug = false, bool enableCores = true); #ifdef AIE_ENABLE_AIRBIN mlir::LogicalResult AIETranslateToAirbin(mlir::ModuleOp module, @@ -77,6 +83,10 @@ mlir::LogicalResult AIETranslateToTargetArch(mlir::ModuleOp module, std::optional AIETranslateBinaryToTxn(mlir::MLIRContext *ctx, std::vector &binary); +std::optional +AIETranslateBinaryToCtrlpkt(mlir::MLIRContext *ctx, + std::vector &binary); + } // namespace AIE namespace aievec { diff --git a/lib/CAPI/Translation.cpp b/lib/CAPI/Translation.cpp index d33aee084e..f101e7e221 100644 --- a/lib/CAPI/Translation.cpp +++ b/lib/CAPI/Translation.cpp @@ -114,6 +114,45 @@ MlirLogicalResult aieTranslateToTxn(MlirOperation moduleOp, return wrap(status); } +/// Translate `moduleOp` to control-packet MLIR written to `outputFile`. +/// The output file is kept only when the translation succeeds. +MlirLogicalResult aieTranslateToCtrlpkt(MlirOperation moduleOp, + MlirStringRef outputFile, + MlirStringRef workDirPath, bool aieSim, + bool xaieDebug, bool enableCores) { + ModuleOp mod = llvm::cast(unwrap(moduleOp)); + bool outputBinary = false; + + std::string errorMessage; + auto output = openOutputFile(StringRef(outputFile.data, outputFile.length), + &errorMessage); + if (!output) { + llvm::errs() << errorMessage << "\n"; + return wrap(failure()); + } + + // Register the handler before translating so diagnostics emitted by the + // translation are recorded. Returning failure() marks them as unhandled so + // handlers registered earlier (e.g. by the caller) still receive them. + std::vector diagnostics; + ScopedDiagnosticHandler handler(mod.getContext(), [&](Diagnostic &d) { + llvm::raw_string_ostream(diagnostics.emplace_back()) + << d.getLocation() << ": " << d; + return failure(); + }); + + auto status = AIETranslateToControlPackets( + mod, output->os(), llvm::StringRef(workDirPath.data, 
+ workDirPath.length), + outputBinary, aieSim, xaieDebug, enableCores); + + if (failed(status)) + for (const auto &diagnostic : diagnostics) + std::cerr << diagnostic << "\n"; + else + output->keep(); + return wrap(status); +} + MlirOperation aieTranslateBinaryToTxn(MlirContext ctx, MlirStringRef binary) { std::vector binaryData(binary.data, binary.data + binary.length); auto mod = AIETranslateBinaryToTxn(unwrap(ctx), binaryData); @@ -133,11 +172,11 @@ MlirStringRef aieTranslateToNPU(MlirOperation moduleOp) { return mlirStringRefCreate(cStr, npu.size()); } -MlirStringRef aieTranslateToControlPackets(MlirOperation moduleOp) { +MlirStringRef AIETranslateControlPacketsToUI32Vec(MlirOperation moduleOp) { std::string npu; llvm::raw_string_ostream os(npu); ModuleOp mod = llvm::cast(unwrap(moduleOp)); - if (failed(AIETranslateToControlPackets(mod, os))) + if (failed(AIETranslateControlPacketsToUI32Vec(mod, os))) return mlirStringRefCreate(nullptr, 0); char *cStr = static_cast(malloc(npu.size())); npu.copy(cStr, npu.size()); diff --git a/lib/Targets/AIETargetCDODirect.cpp b/lib/Targets/AIETargetCDODirect.cpp index d652fdf1b3..0e7a501dcb 100644 --- a/lib/Targets/AIETargetCDODirect.cpp +++ b/lib/Targets/AIETargetCDODirect.cpp @@ -1189,6 +1189,115 @@ xilinx::AIE::AIETranslateBinaryToTxn(mlir::MLIRContext *ctx, return module; } +std::optional +xilinx::AIE::AIETranslateBinaryToCtrlpkt(mlir::MLIRContext *ctx, + std::vector &binary) { + + // parse the binary + std::vector operations; + auto c = parseTransactionBinary(binary, operations); + if (!c) { + llvm::errs() << "Failed to parse binary\n"; + return std::nullopt; + } + int columns = *c; + + auto loc = mlir::UnknownLoc::get(ctx); + + // create a new ModuleOp and set the insertion point + auto module = ModuleOp::create(loc); + 
OpBuilder builder(module.getBodyRegion()); + builder.setInsertionPointToStart(module.getBody()); + + // create aie.device + std::vector devices{AIEDevice::npu1_1col, AIEDevice::npu1_2col, + AIEDevice::npu1_3col, AIEDevice::npu1_4col, + AIEDevice::npu1}; + auto device = builder.create(loc, devices[columns - 1]); + device.getRegion().emplaceBlock(); + builder.setInsertionPointToStart(device.getBody()); + + // for each blockwrite in the binary, create a GlobalOp with the data + std::vector global_data; + for (auto &op : operations) { + if (op.cmd.Opcode != XAIE_IO_BLOCKWRITE) { + global_data.push_back(nullptr); + continue; + } + uint32_t size = op.cmd.Size / 4; + const uint32_t *d = reinterpret_cast(op.cmd.DataPtr); + std::vector data32(d, d + size); + + int id = 0; + std::string name = "blockwrite_data"; + while (device.lookupSymbol(name)) + name = "blockwrite_data_" + std::to_string(id++); + + MemRefType memrefType = MemRefType::get({size}, builder.getI32Type()); + TensorType tensorType = RankedTensorType::get({size}, builder.getI32Type()); + auto global = builder.create( + loc, name, builder.getStringAttr("private"), memrefType, + DenseElementsAttr::get(tensorType, data32), true, nullptr); + global_data.push_back(global); + } + + // create aiex.runtime_sequence + auto seq = builder.create(loc, nullptr); + seq.getBody().push_back(new Block); + + // create the txn ops + builder.setInsertionPointToStart(&seq.getBody().front()); + for (auto p : llvm::zip(operations, global_data)) { + auto op = std::get<0>(p); + memref::GlobalOp payload = std::get<1>(p); + + if (op.cmd.Opcode == XAie_TxnOpcode::XAIE_IO_WRITE) { + builder.create( + loc, builder.getUI32IntegerAttr(op.cmd.RegOff), nullptr, + /*opcode*/ builder.getI32IntegerAttr(0), + /*stream_id*/ builder.getI32IntegerAttr(0), + DenseI32ArrayAttr::get(ctx, ArrayRef(op.cmd.Value))); + } else if (op.cmd.Opcode == XAie_TxnOpcode::XAIE_IO_BLOCKWRITE) { + if (!std::get<1>(p).getInitialValue()) + continue; + auto blockWriteData 
= + dyn_cast(*std::get<1>(p).getInitialValue()); + if (!blockWriteData) { + payload.emitError( + "Global symbol initial value is not a dense int array"); + return std::nullopt; // fail like the unhandled-opcode path; do not return a partial module + } + auto blockWriteDataValues = blockWriteData.getValues(); + // Split block write data into beats of 4 or less, in int32_t. + uint32_t currAddr = op.cmd.RegOff; // unsigned: only ever passed to getUI32IntegerAttr + for (size_t i = 0; i < blockWriteDataValues.size(); i += 4) { + auto last = std::min(blockWriteDataValues.size(), i + 4); + SmallVector splitData = + SmallVector(blockWriteDataValues.begin() + i, + blockWriteDataValues.begin() + last); + builder.create( + loc, builder.getUI32IntegerAttr(currAddr), nullptr, + /*opcode*/ builder.getI32IntegerAttr(0), + /*stream_id*/ builder.getI32IntegerAttr(0), + DenseI32ArrayAttr::get(ctx, ArrayRef(splitData))); + currAddr += splitData.size() * sizeof(int32_t); + } + + } else if (op.cmd.Opcode == XAie_TxnOpcode::XAIE_IO_MASKWRITE) { + builder.create( + loc, builder.getUI32IntegerAttr(op.cmd.RegOff), nullptr, + /*opcode*/ builder.getI32IntegerAttr(0), + /*stream_id*/ builder.getI32IntegerAttr(0), + DenseI32ArrayAttr::get(ctx, ArrayRef(op.cmd.Value))); + } else { + llvm::errs() << "Unhandled txn opcode: " << op.cmd.Opcode << "\n"; + return std::nullopt; + } + } + + return module; +} + LogicalResult xilinx::AIE::AIETranslateToTxn(ModuleOp m, llvm::raw_ostream &output, llvm::StringRef workDirPath, @@ -1208,8 +1317,27 @@ LogicalResult xilinx::AIE::AIETranslateToTxn(ModuleOp m, auto new_module = AIETranslateBinaryToTxn(m.getContext(), bin); if (!new_module) return failure(); - new_module->print(output); + return success(); +} +LogicalResult xilinx::AIE::AIETranslateToControlPackets( + ModuleOp m, llvm::raw_ostream &output, llvm::StringRef workDirPath, + bool outputBinary, bool enableSim, bool xaieDebug, bool enableCores) { + std::vector bin; + auto result = + translateToTxn(m, bin, workDirPath, enableSim, xaieDebug, enableCores); + if (failed(result)) + return result; + + if (outputBinary) { + 
output.write(reinterpret_cast(bin.data()), bin.size()); + return success(); + } + + auto new_module = AIETranslateBinaryToCtrlpkt(m.getContext(), bin); + if (!new_module) + return failure(); + new_module->print(output); return success(); } diff --git a/lib/Targets/AIETargetNPU.cpp b/lib/Targets/AIETargetNPU.cpp index 641ed255d2..0439c427e5 100644 --- a/lib/Targets/AIETargetNPU.cpp +++ b/lib/Targets/AIETargetNPU.cpp @@ -250,9 +250,8 @@ LogicalResult xilinx::AIE::AIETranslateToNPU(ModuleOp module, return success(); } -LogicalResult -xilinx::AIE::AIETranslateToControlPackets(ModuleOp module, - std::vector &instructions) { +LogicalResult xilinx::AIE::AIETranslateControlPacketsToUI32Vec( + ModuleOp module, std::vector &instructions) { DeviceOp deviceOp = *module.getOps().begin(); auto sequenceOps = deviceOp.getOps(); @@ -291,10 +290,11 @@ xilinx::AIE::AIETranslateToControlPackets(ModuleOp module, return success(); } -LogicalResult xilinx::AIE::AIETranslateToControlPackets(ModuleOp module, - raw_ostream &output) { +LogicalResult +xilinx::AIE::AIETranslateControlPacketsToUI32Vec(ModuleOp module, + raw_ostream &output) { std::vector instructions; - auto r = AIETranslateToControlPackets(module, instructions); + auto r = AIETranslateControlPacketsToUI32Vec(module, instructions); if (failed(r)) return r; for (auto w : instructions) diff --git a/lib/Targets/AIETargets.cpp b/lib/Targets/AIETargets.cpp index f7a90b457d..7638c45aaf 100644 --- a/lib/Targets/AIETargets.cpp +++ b/lib/Targets/AIETargets.cpp @@ -375,14 +375,32 @@ void registerAIETranslations() { [](ModuleOp module, raw_ostream &output) { if (outputBinary == true) { std::vector instructions; - auto r = AIETranslateToControlPackets(module, instructions); + auto r = AIETranslateControlPacketsToUI32Vec(module, instructions); if (failed(r)) return r; output.write(reinterpret_cast(instructions.data()), instructions.size() * sizeof(uint32_t)); return success(); } - return AIETranslateToControlPackets(module, output); + 
return AIETranslateControlPacketsToUI32Vec(module, output); + }, + registerDialects); + TranslateFromMLIRRegistration registrationCDOWithCtrlpkt( + "aie-generate-ctrlpkt", + "Generate control packet configuration. Use --aie-output-binary to " + "select between mlir (default) and binary output", + [](ModuleOp module, raw_ostream &output) { + SmallString<128> workDirPath_; + if (workDirPath.getNumOccurrences() == 0) { + if (llvm::sys::fs::current_path(workDirPath_)) + llvm::report_fatal_error( + "couldn't get cwd to use as work-dir-path"); + } else + workDirPath_ = workDirPath.getValue(); + LLVM_DEBUG(llvm::dbgs() << "work-dir-path: " << workDirPath_ << "\n"); + return AIETranslateToControlPackets(module, output, workDirPath_, + outputBinary, cdoAieSim, + cdoXaieDebug, cdoEnableCores); }, registerDialects); } diff --git a/python/AIEMLIRModule.cpp b/python/AIEMLIRModule.cpp index ea5d9dd457..481451fe73 100644 --- a/python/AIEMLIRModule.cpp +++ b/python/AIEMLIRModule.cpp @@ -132,6 +132,23 @@ PYBIND11_MODULE(_aie, m) { "module"_a, "output_file"_a, "work_dir_path"_a, "aiesim"_a = false, "xaie_debug"_a = false, "enable_cores"_a = true); + m.def( + "generate_ctrlpkt", + [](MlirOperation op, const std::string &outputFile, + const std::string &workDirPath, bool aieSim, bool xaieDebug, + bool enableCores) { + mlir::python::CollectDiagnosticsToStringScope scope( + mlirOperationGetContext(op)); + if (mlirLogicalResultIsFailure(aieTranslateToCtrlpkt( + op, {outputFile.data(), outputFile.size()}, + {workDirPath.data(), workDirPath.size()}, aieSim, xaieDebug, + enableCores))) + throw py::value_error("Failed to generate control packets because: " + + scope.takeMessage()); + }, + "module"_a, "output_file"_a, "work_dir_path"_a, "aiesim"_a = false, + "xaie_debug"_a = false, "enable_cores"_a = true); + m.def( "transaction_binary_to_mlir", [](MlirContext ctx, py::bytes bytes) { @@ -156,7 +173,8 @@ PYBIND11_MODULE(_aie, m) { m.def( "generate_control_packets", [&stealCStr](MlirOperation 
op) { - py::str ctrlPackets = stealCStr(aieTranslateToControlPackets(op)); + py::str ctrlPackets = + stealCStr(AIETranslateControlPacketsToUI32Vec(op)); auto individualInstructions = ctrlPackets.attr("split")().cast(); for (size_t i = 0; i < individualInstructions.size(); ++i) diff --git a/python/compiler/aiecc/cl_arguments.py b/python/compiler/aiecc/cl_arguments.py index a0a5656025..343d3cf045 100644 --- a/python/compiler/aiecc/cl_arguments.py +++ b/python/compiler/aiecc/cl_arguments.py @@ -252,6 +252,14 @@ def parse_args(args=None): const=True, help="Generate txn binary for configuration", ) + parser.add_argument( + "--aie-generate-ctrlpkt", + dest="ctrlpkt", + default=False, + action="store_const", + const=True, + help="Generate control packets for configuration", + ) parser.add_argument( "--aie-generate-xclbin", dest="xcl", diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py index cf4b38ed30..dddb65a910 100644 --- a/python/compiler/aiecc/main.py +++ b/python/compiler/aiecc/main.py @@ -583,6 +583,26 @@ async def process_txn(self): txn_file = os.path.join(self.tmpdirname, "txn.mlir") generate_txn(input_physical.operation, txn_file, self.tmpdirname) + async def process_ctrlpkt(self): + from aie.dialects.aie import generate_ctrlpkt + + with Context(), Location.unknown(): + for elf in glob.glob("*.elf"): + try: + shutil.copy(elf, self.tmpdirname) + except shutil.SameFileError: + pass + for elf_map in glob.glob("*.elf.map"): + try: + shutil.copy(elf_map, self.tmpdirname) + except shutil.SameFileError: + pass + input_physical = Module.parse( + await read_file_async(self.prepend_tmp("input_physical.mlir")) + ) + ctrlpkt_file = os.path.join(self.tmpdirname, "ctrlpkt.mlir") + generate_ctrlpkt(input_physical.operation, ctrlpkt_file, self.tmpdirname) + async def process_xclbin_gen(self): if opts.progress: task = self.progress_bar.add_task( @@ -1125,6 +1145,9 @@ async def run_flow(self): if opts.txn and opts.execute: await self.process_txn() + if 
opts.ctrlpkt and opts.execute: + await self.process_ctrlpkt() + def dumpprofile(self): sortedruntimes = sorted( self.runtimes.items(), key=lambda item: item[1], reverse=True diff --git a/python/dialects/aie.py b/python/dialects/aie.py index ae9273ab2c..09f2451e30 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -23,6 +23,7 @@ generate_bcf, generate_cdo, generate_txn, + generate_ctrlpkt, generate_xaie, generate_control_packets, npu_instgen, diff --git a/test/npu-xrt/ctrl_packet_reconfig/aie.mlir b/test/npu-xrt/ctrl_packet_reconfig/aie.mlir deleted file mode 100644 index ec7a9bcc30..0000000000 --- a/test/npu-xrt/ctrl_packet_reconfig/aie.mlir +++ /dev/null @@ -1,415 +0,0 @@ -module { - aie.device(npu1_1col) { - memref.global "public" @ctrlpkt0 : memref<1024xi32> - memref.global "public" @objFifo_out0 : memref<64x64xi8> - %tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} - %tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} - %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} - - aie.packet_flow(0) { - aie.packet_source<%tile_0_1, DMA : 0> - aie.packet_dest<%tile_0_2, DMA : 0> - } - aie.packet_flow(1) { - aie.packet_source<%tile_0_2, DMA : 0> - aie.packet_dest<%tile_0_1, DMA : 1> - } - aie.packet_flow(2) { - aie.packet_source<%tile_0_1, DMA : 1> - aie.packet_dest<%tile_0_0, DMA : 0> - } - aie.packet_flow(3) { - aie.packet_source<%tile_0_0, DMA : 0> - aie.packet_dest<%tile_0_1, DMA : 0> - } - aie.packet_flow(4) { - aie.packet_source<%tile_0_0, Ctrl : 0> - aie.packet_dest<%tile_0_0, South : 0> - } {keep_pkt_header = true} - // TODO: make shim tile ctrl packet flow part of the column control overlay - // aie.packet_flow(4) { - // aie.packet_source<%tile_0_0, DMA : 0> - // aie.packet_dest<%tile_0_0, Ctrl : 0> - // } {keep_pkt_header = true} - aie.packet_flow(5) { - aie.packet_source<%tile_0_0, DMA : 0> - aie.packet_dest<%tile_0_1, Ctrl : 0> - } {keep_pkt_header = true} - aie.packet_flow(6) { - aie.packet_source<%tile_0_0, 
DMA : 0> - aie.packet_dest<%tile_0_2, Ctrl : 0> - } {keep_pkt_header = true} - aie.shim_dma_allocation @ctrlpkt0(MM2S, 0, 0) - aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) - aiex.runtime_sequence(%arg0: memref<64x64xi8>, %arg2: memref<64x64xi8>, %arg3: memref<1024xi32>) { - - // Reset core (0,2) - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 0][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // Reset DMA channels (leads to deadlock) - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 2][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 4][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 6][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 8][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // Load core tile (0,2) program memory - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 10][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, 
metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 14][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 18][1, 1, 1, 4][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 22][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 27][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 32][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 37][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 42][1, 1, 1, 5][0, 0, 
0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 47][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 52][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 57][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 62][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 67][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 72][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 77][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 82][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 87][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 92][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 97][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 102][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 107][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, 
direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 112][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 117][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 122][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 127][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 132][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 137][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 142][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync 
{channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 147][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 152][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 157][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 162][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 167][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 172][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 177][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = 
true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 182][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 187][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 192][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 197][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 202][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 207][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 212][1, 1, 
1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 217][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 222][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 227][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 232][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 237][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 242][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : 
i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 247][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 252][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 257][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 262][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 267][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 272][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 277][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 
1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 282][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 287][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 292][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 297][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 302][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 307][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 312][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 317][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 322][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 327][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 332][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 337][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 339][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 341][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, 
issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 343][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 345][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 347][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // Core tile (0,2) locks - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 349][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 351][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 353][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 355][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 357][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 359][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 361][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 363][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 365][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 367][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : 
i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 369][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 371][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 373][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 375][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 377][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 379][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 381][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - 
aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 383][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 385][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 387][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // memtile (0,1) locks - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 389][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 391][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 393][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 395][1, 1, 1, 2][0, 0, 0, 1], 
packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // core tile bds - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 397][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 402][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 405][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 410][1, 1, 1, 3][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 413][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 415][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : 
i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 417][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 419][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // memtile bds - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 421][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 426][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 431][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 436][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 441][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : 
i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 446][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 451][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 456][1, 1, 1, 5][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 461][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 463][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 465][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 467][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : 
memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 469][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 471][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 473][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 475][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // shim tile (0,0) bds - // TODO: make shim tile ctrl packet flow part of the column control overlay - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 477][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 479][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row 
= 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 481][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 483][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 485][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 487][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 489][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 491][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 493][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - 
// aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // memtile stream switches - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 495][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 497][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 499][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 501][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 503][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 505][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 507][1, 1, 1, 2][0, 0, 0, 
1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 509][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 511][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 513][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 515][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 517][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 519][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - 
aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 521][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 523][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 525][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 527][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 529][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // core tile stream switches - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 531][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 533][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, 
column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 535][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 537][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 539][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 541][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 543][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 545][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 547][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = 
@ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // shim tile stream switches - // TODO: make shim tile ctrl packet flow part of the column control overlay - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 549][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - // aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 551][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - // aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - aiex.npu.dma_memcpy_nd(0, 0, %arg3[0, 0, 0, 553][1, 1, 1, 2][0, 0, 0, 1], packet = ) {id = 0 : i64, issue_token = true, metadata = @ctrlpkt0} : memref<1024xi32> - aiex.npu.sync {channel = 0 : i32, column = 0 : i32, column_num = 1 : i32, direction = 1 : i32, row = 0 : i32, row_num = 1 : i32} - - // AIE design's instructions - %c0_i64 = arith.constant 0 : i64 - %c1_i64 = arith.constant 1 : i64 - %c56_i64 = arith.constant 56 : i64 - %c61_i64 = arith.constant 61 : i64 - %c64_i64 = arith.constant 64 : i64 - aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @ctrlpkt0} : memref<64x64xi8> - aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> - aiex.npu.dma_wait { symbol = @objFifo_out0 } - - } - - } -} - diff --git a/test/npu-xrt/ctrl_packet_reconfig/aie1.mlir b/test/npu-xrt/ctrl_packet_reconfig/aie1.mlir new 
file mode 100644 index 0000000000..fe8c67d906 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig/aie1.mlir @@ -0,0 +1,17 @@ +//===- aie1.mlir -----------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + } +} diff --git a/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir b/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir new file mode 100644 index 0000000000..cc5bc29473 --- /dev/null +++ b/test/npu-xrt/ctrl_packet_reconfig/aie2.mlir @@ -0,0 +1,129 @@ +//===- aie2.mlir -----------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates +// +//===----------------------------------------------------------------------===// + +module { + aie.device(npu1_1col) { + memref.global "public" @objFifo_in0 : memref<56x56xi8> + memref.global "public" @objFifo_out0 : memref<64x64xi8> + + %tile_0_0 = aie.tile(0, 0) + %tile_0_1 = aie.tile(0, 1) + %tile_0_2 = aie.tile(0, 2) + + %objFifo_in1_cons_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_0"} : memref<64x64xi8> + %objFifo_in1_cons_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_in1_cons_buff_1"} : memref<64x64xi8> + %objFifo_out1_buff_0 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_0"} : memref<64x64xi8> + %objFifo_out1_buff_1 = aie.buffer(%tile_0_2) {sym_name = "objFifo_out1_buff_1"} : memref<64x64xi8> + + %objFifo_in1_cons_prod_lock = aie.lock(%tile_0_2, 0) {init = 1 : i32, sym_name = "objFifo_in1_cons_prod_lock"} + %objFifo_in1_cons_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "objFifo_in1_cons_cons_lock"} + %objFifo_out1_prod_lock = aie.lock(%tile_0_2, 2) {init = 1 : i32, sym_name = "objFifo_out1_prod_lock"} + %objFifo_out1_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "objFifo_out1_cons_lock"} + + aie.packet_flow(0) { + aie.packet_source<%tile_0_1, DMA : 0> + aie.packet_dest<%tile_0_2, DMA : 0> + } + aie.packet_flow(1) { + aie.packet_source<%tile_0_2, DMA : 0> + aie.packet_dest<%tile_0_1, DMA : 1> + } + aie.packet_flow(2) { + aie.packet_source<%tile_0_1, DMA : 1> + aie.packet_dest<%tile_0_0, DMA : 0> + } + aie.packet_flow(3) { + aie.packet_source<%tile_0_0, DMA : 0> + aie.packet_dest<%tile_0_1, DMA : 0> + } + + %core_0_2 = aie.core(%tile_0_2) { + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c12_i8 = arith.constant 12 : i8 + %c2 = arith.constant 2 : index + %c64 = arith.constant 64 : index + aie.use_lock(%objFifo_in1_cons_cons_lock, AcquireGreaterEqual, 1) + aie.use_lock(%objFifo_out1_prod_lock, AcquireGreaterEqual, 1) 
+ scf.for %arg1 = %c0 to %c64 step %c1 { + scf.for %arg2 = %c0 to %c64 step %c1 { + %0 = memref.load %objFifo_in1_cons_buff_0[%arg1, %arg2] : memref<64x64xi8> + %1 = arith.addi %0, %c12_i8 : i8 + memref.store %1, %objFifo_out1_buff_0[%arg1, %arg2] : memref<64x64xi8> + } + } + aie.use_lock(%objFifo_in1_cons_prod_lock, Release, 1) + aie.use_lock(%objFifo_out1_cons_lock, Release, 1) + aie.end + } + + aie.shim_dma_allocation @objFifo_in0(MM2S, 0, 0) + + aiex.runtime_sequence(%arg0: memref<64x64xi8>, %arg1: memref<32xi8>, %arg2: memref<64x64xi8>) { + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %c56_i64 = arith.constant 56 : i64 + %c61_i64 = arith.constant 61 : i64 + %c64_i64 = arith.constant 64 : i64 + aiex.npu.dma_memcpy_nd (0, 0, %arg0[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64], packet = ) {id = 0 : i64, metadata = @objFifo_in0} : memref<64x64xi8> + aiex.npu.dma_memcpy_nd (0, 0, %arg2[%c0_i64, %c0_i64, %c0_i64, %c0_i64][%c1_i64, %c1_i64, %c64_i64, %c64_i64][%c0_i64, %c0_i64, %c64_i64, %c1_i64]) {id = 1 : i64, metadata = @objFifo_out0, issue_token = true} : memref<64x64xi8> + aiex.npu.dma_wait { symbol = @objFifo_out0 } + } + + %memtile_dma_0_1 = aie.memtile_dma(%tile_0_1) { + %objFifo_in0_cons_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_0"} : memref<64x64xi8> + %objFifo_in0_cons_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_in0_cons_buff_1"} : memref<64x64xi8> + %objFifo_out0_buff_0 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_0"} : memref<64x64xi8> + %objFifo_out0_buff_1 = aie.buffer(%tile_0_1) {sym_name = "objFifo_out0_buff_1"} : memref<64x64xi8> + %objFifo_in0_cons_prod_lock = aie.lock(%tile_0_1, 0) {init = 1 : i32, sym_name = "objFifo_in0_cons_prod_lock"} + %objFifo_in0_cons_cons_lock = aie.lock(%tile_0_1, 1) {init = 0 : i32, sym_name = "objFifo_in0_cons_cons_lock"} + %objFifo_out0_prod_lock = aie.lock(%tile_0_1, 2) {init = 1 : i32, 
sym_name = "objFifo_out0_prod_lock"} + %objFifo_out0_cons_lock = aie.lock(%tile_0_1, 3) {init = 0 : i32, sym_name = "objFifo_out0_cons_lock"} + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in0_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_in0_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_in0_cons_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in0_cons_buff_0 : memref<64x64xi8>) {packet = #aie.packet_info} + aie.use_lock(%objFifo_in0_cons_prod_lock, Release, 1) + }] + %2 = aie.dma(MM2S, 1) [{ + aie.use_lock(%objFifo_out0_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<64x64xi8>) {packet = #aie.packet_info} + aie.use_lock(%objFifo_out0_prod_lock, Release, 1) + }] + %3 = aie.dma(S2MM, 1) [{ + aie.use_lock(%objFifo_out0_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out0_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_out0_cons_lock, Release, 1) + }] + aie.end + } + + aie.shim_dma_allocation @objFifo_out0(S2MM, 0, 0) + + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma(S2MM, 0) [{ + aie.use_lock(%objFifo_in1_cons_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_in1_cons_buff_0 : memref<64x64xi8>) + aie.use_lock(%objFifo_in1_cons_cons_lock, Release, 1) + }] + %1 = aie.dma(MM2S, 0) [{ + aie.use_lock(%objFifo_out1_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%objFifo_out1_buff_0 : memref<64x64xi8>) {packet = #aie.packet_info} + aie.use_lock(%objFifo_out1_prod_lock, Release, 1) + }] + aie.end + } + } +} diff --git a/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt b/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt deleted file mode 100644 index e470486471..0000000000 --- a/test/npu-xrt/ctrl_packet_reconfig/ctrl_pkts.txt +++ /dev/null @@ -1,555 +0,0 @@ -00032000 -00000000 -8001DE10 -00000002 -0001DE18 -00000002 -0001DE00 -00000002 -8001DE08 -00000002 -00204400 -00000260 -00000000 -00000000 -0020440C 
-00000000 -00000000 -00000000 -00204418 -00000000 -00000000 -00000008 -00320000 -38001043 -000001C3 -08000055 -00550000 -80320010 -00000C00 -16310799 -40400195 -0001C000 -80320020 -00010001 -00010001 -FC7FF855 -7659FFFF -00320030 -782F1F30 -04B20000 -00000062 -00000000 -80320040 -DC8C764D -0001DFF0 -00010001 -00010001 -00320050 -14190001 -00011000 -00010001 -00010001 -00320060 -14B10899 -20400195 -0001C000 -00010001 -80320070 -00010001 -183E7659 -244B2003 -00000000 -80320080 -70000115 -00010000 -00010001 -00010001 -00320090 -88000115 -06990001 -00011830 -00010001 -003200A0 -880003C0 -00000003 -00000000 -00000000 -803200B0 -00010001 -00010001 -00010001 -10000819 -003200C0 -00010001 -00010001 -00010001 -00000019 -803200D0 -68000095 -00010000 -00010001 -00010001 -803200E0 -38032019 -0FFFC299 -10000115 -C93B0001 -003200F0 -062827FF -FFECC000 -00010001 -00010001 -80320100 -10000115 -C8430001 -064827FF -00010000 -00320110 -00010001 -00000137 -00000000 -00000000 -00320120 -38B010BB -08000000 -805507E6 -000008E3 -80320130 -00680055 -10BB0007 -01C89A00 -00000000 -00320140 -100003C0 -000118A8 -00000000 -00000000 -80320150 -7801FD1D -000102A6 -00010001 -00010001 -80320160 -880003C0 -00000003 -00000000 -000022A0 -00320170 -380003C0 -0000064F -78000000 -000002A6 -00320180 -880003C0 -00000003 -00000000 -00000000 -80320190 -880003C0 -00000003 -00000000 -00000000 -803201A0 -880003C0 -00000003 -00000000 -00000000 -003201B0 -880003C0 -00000003 -00000000 -00000000 -803201C0 -880003C0 -00000003 -00000000 -00000000 -003201D0 -16308C19 -000C9E6D -000122A0 -00010001 -003201E0 -01150001 -00012000 -2003C843 -00000608 -803201F0 -00010001 -8EBB0001 -00000003 -00000000 -80320200 -07FFC2D9 -07FF6659 -20000095 -00010001 -00320210 -00010001 -C83B0001 -10280067 -07FFE000 -00320220 -10121219 -10001819 -00010001 -00010001 -80320230 -880003C0 -00000003 -00000000 -00000000 -00320240 -10101219 -10001819 -00010001 -00010001 -80320250 -880003C0 -00000003 -00000000 -00000000 -80320260 -00000055 -00550000 
-00000C00 -16320799 -00320270 -70400195 -2019C801 -E4193803 -E2190FFE -00320280 -20030FFF -C002273B -2003FFEC -50039E0B -80320290 -007FFFC8 -00020000 -0000004C -00000000 -803202A0 -070386D9 -00010001 -00010001 -00010001 -003202B0 -10001419 -1C8E7659 -00010001 -00010001 -803202C0 -14F12899 -50400195 -0001C001 -00010001 -003202D0 -782F0001 -00380000 -00000040 -00000000 -003202E0 -07FE42D9 -07FEE459 -07FFE259 -07FF6659 -803202F0 -00010001 -18190001 -96451000 -07FFE738 -00320300 -00010001 -8EBB0001 -00000003 -00000000 -80320310 -1A1010B7 -200001D0 -FFA04803 -62001077 -80320320 -C28001D0 -00508FFF -A31B2843 -00004108 -00320330 -19B8113B -40000003 -2003FFE9 -0002001B -80320340 -36590051 -36591C4B -36591C8A -28431CC5 -00320350 -2F1A849B -28BB0004 -3106A41B -00000006 -00320360 -280003C0 -60F798CB -00000005 -00000000 -80320370 -07038ED9 -07FB86D9 -00010001 -00010001 -80320380 -54190001 -00011000 -00010001 -00000019 -00320390 -280003C0 -0002E00B -00000000 -00000000 -003203A0 -15AD8C19 -180B9659 -00010001 -00010001 -803203B0 -07FFC2D9 -07FF4259 -00010001 -00010001 -003203C0 -18190001 -00011000 -15602019 -4CCB2843 -803203D0 -00292101 -CC4B28B7 -00254102 -00000000 -803203E0 -280003C0 -C1000C0B -00000002 -FFFC0000 -8001DE10 -00000000 -0001DE18 -00000000 -0001DE00 -00000000 -8001DE08 -00000000 -00032000 -00000002 -00032000 -00000000 -0001F000 -00000000 -8001F010 -00000000 -8001F020 -00000000 -0001F030 -00000000 -8001F040 -00000000 -0001F050 -00000000 -0001F060 -00000000 -8001F070 -00000000 -8001F080 -00000000 -0001F090 -00000000 -0001F0A0 -00000000 -8001F0B0 -00000000 -0001F0C0 -00000000 -8001F0D0 -00000000 -8001F0E0 -00000000 -0001F0F0 -00000000 -0001F000 -00000001 -8001F010 -00000000 -8001F020 -00000001 -0001F030 -00000000 -800C0000 -00000001 -000C0010 -00000000 -000C0020 -00000001 -800C0030 -00000000 -8031D000 -00400400 -00000000 -00000000 -00000000 -8011D010 -00000000 -06043FE0 -0031D020 -02400400 -40080000 -00000000 -00000000 -0011D030 -00000000 -0E045FE3 -8001DE04 -00000000 
-0001DE00 -00000001 -0001DE14 -00000001 -8001DE10 -00000001 -803A0000 -00000400 -000A0000 -00000000 -00000000 -003A0010 -00000000 -00000000 -00000000 -8141FF40 -003A0020 -80000400 -001A0000 -00000000 -00000000 -803A0030 -00000000 -00000000 -00000000 -8140FF41 -803A0300 -81000400 -018A0800 -00000000 -00000000 -003A0310 -00000000 -00000000 -00000000 -8142FF43 -003A0320 -00000400 -019A0800 -00000000 -00000000 -803A0330 -00000000 -00000000 -00000000 -8143FF42 -000A0604 -00000000 -800A0600 -00000001 -000A0634 -00000001 -800A0630 -00000001 -800A063C -00000018 -000A0638 -00000001 -800A060C -00000019 -000A0608 -00000001 -0003F010 -C0000088 -0003F008 -C000000A -0003F040 -C0000009 -0003F114 -C0000000 -0003F250 -00180101 -0003F100 -C0000000 -0003F200 -041F0102 -8003F140 -C0000000 -8003F300 -021F0100 -000B0000 -C000008B -800B0004 -C0000089 -000B0024 -C000000A -000B0030 -C0000008 -000B003C -C000000D -000B0018 -C000000C -000B012C -C0000000 -000B02B0 -061F0105 -000B012C -C0000000 -800B02B4 -051F0104 -000B012C -C0000000 -800B02B8 -031F0103 -000B0104 -C0000000 -000B0210 -021F0102 -000B0134 -C0000000 -000B02D0 -011F0101 -800B0100 -C0000000 -800B0200 -001F0100 -0003F004 -C0000088 -8003F014 -C0000009 -8003F00C -C000000A -0003F124 -C0000000 -0003F290 -061F0102 -8003F104 -C0000000 -8003F210 -011F0101 -0003F118 -C0000000 -0003F260 -001F0100 -0001F000 -00000400 -8001F004 -00000010 -00032000 -00000001 diff --git a/test/npu-xrt/ctrl_packet_reconfig/run.lit b/test/npu-xrt/ctrl_packet_reconfig/run.lit index c215cea20c..bd0094f177 100644 --- a/test/npu-xrt/ctrl_packet_reconfig/run.lit +++ b/test/npu-xrt/ctrl_packet_reconfig/run.lit @@ -3,9 +3,12 @@ // // REQUIRES: ryzen_ai // -// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie.xclbin --npu-insts-name=insts.txt %S/aie.mlir -// RUN: cp %S/ctrl_pkts.txt . 
+// RUN: %python aiecc.py --no-aiesim --aie-generate-cdo --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --xclbin-name=aie1.xclbin --npu-insts-name=insts1.txt %S/aie1.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-ctrlpkt --aie-generate-npu --no-compile-host --basic-alloc-scheme --generate-ctrl-pkt-overlay --npu-insts-name=insts2.txt %S/aie2.mlir +// RUN: aie-translate -aie-ctrlpkt-to-bin aie2.mlir.prj/ctrlpkt.mlir -o ctrlpkt.txt +// RUN: aie-opt -aie-ctrl-packet-infer-tiles -aie-generate-column-control-overlay="route-shim-to-tile-ctrl=true" -aie-ctrl-packet-to-dma aie2.mlir.prj/ctrlpkt.mlir > aie3.mlir +// RUN: %python aiecc.py --no-aiesim --aie-only-generate-npu --no-compile-host --generate-ctrl-pkt-overlay --xclbin-name=aie3.xclbin --npu-insts-name=insts3.txt aie3.mlir + // RUN: clang %S/test.cpp -o test.exe -std=c++11 -Wall %xrt_flags -lrt -lstdc++ -lboost_program_options -lboost_filesystem -// RUN: %run_on_npu ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.txt | FileCheck %s +// RUN: %run_on_npu ./test.exe | FileCheck %s // CHECK: PASS! 
-// XFAIL: * diff --git a/test/npu-xrt/ctrl_packet_reconfig/test.cpp b/test/npu-xrt/ctrl_packet_reconfig/test.cpp index 32bb72fbf9..381c3b81ac 100644 --- a/test/npu-xrt/ctrl_packet_reconfig/test.cpp +++ b/test/npu-xrt/ctrl_packet_reconfig/test.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include #include @@ -15,6 +16,7 @@ #include #include +#include "experimental/xrt_kernel.h" #include "xrt/xrt_bo.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" @@ -42,8 +44,12 @@ std::vector load_instr_sequence(std::string instr_path) { } int main(int argc, const char *argv[]) { - std::vector instr_v = load_instr_sequence("insts.txt"); - std::vector ctrlPackets = load_instr_sequence("ctrl_pkts.txt"); + // AIE design's data streams + std::vector instr2_v = load_instr_sequence("insts2.txt"); + // AIE configuration as control packet streams + std::vector instr3_cfg_v = load_instr_sequence("insts3.txt"); + // AIE configuration control packets' raw data + std::vector ctrlPackets = load_instr_sequence("ctrlpkt.txt"); // Start the XRT test code // Get a device handle @@ -51,7 +57,8 @@ int main(int argc, const char *argv[]) { auto device = xrt::device(device_index); // Load the xclbin - auto xclbin = xrt::xclbin("aie.xclbin"); + // Skeleton xclbin containing only the control packet network + auto xclbin = xrt::xclbin("aie1.xclbin"); std::string Node = "MLIR_AIE"; @@ -73,14 +80,16 @@ int main(int argc, const char *argv[]) { // get a kernel handle auto kernel = xrt::kernel(context, kernelName); - auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), - XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_instr3 = xrt::bo(device, instr3_cfg_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_instr2 = xrt::bo(device, instr2_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(IN_DATATYPE), 
XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(OUT_DATATYPE), XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4)); auto bo_ctrlpkt = xrt::bo(device, CTRL_IN_SIZE * sizeof(int32_t), - XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3)); IN_DATATYPE *bufInA = bo_inA.map(); std::vector srcVecA; @@ -88,24 +97,53 @@ int main(int argc, const char *argv[]) { srcVecA.push_back(1); memcpy(bufInA, srcVecA.data(), (srcVecA.size() * sizeof(IN_DATATYPE))); - void *bufInstr = bo_instr.map(); - memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + void *bufInstr2 = bo_instr2.map(); + memcpy(bufInstr2, instr2_v.data(), instr2_v.size() * sizeof(int)); + + void *bufInstr3 = bo_instr3.map(); + memcpy(bufInstr3, instr3_cfg_v.data(), instr3_cfg_v.size() * sizeof(int)); void *bufctrlpkt = bo_ctrlpkt.map(); memcpy(bufctrlpkt, ctrlPackets.data(), ctrlPackets.size() * sizeof(int)); - bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); - bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + // Synchronizing BOs + bo_instr3.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_instr2.sync(XCL_BO_SYNC_BO_TO_DEVICE); bo_ctrlpkt.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE); + unsigned int opcode = 3; - auto run = - kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_out, bo_ctrlpkt); - ert_cmd_state r = run.wait(); - if (r != ERT_CMD_STATE_COMPLETED) { - std::cout << "Kernel did not complete. 
Returned status: " << r << "\n"; - return 1; - } + + // Creating a runlist to contain two separate runs + xrt::runlist runlist = xrt::runlist(context); + + // Run 0: configuration + auto run0 = xrt::run(kernel); + run0.set_arg(0, opcode); + run0.set_arg(1, bo_instr3); + run0.set_arg(2, instr3_cfg_v.size()); + run0.set_arg(3, bo_ctrlpkt); + run0.set_arg(4, 0); + run0.set_arg(5, 0); + run0.set_arg(6, 0); + run0.set_arg(7, 0); + // Run 1: the design + auto run1 = xrt::run(kernel); + run1.set_arg(0, opcode); + run1.set_arg(1, bo_instr2); + run1.set_arg(2, instr2_v.size()); + run1.set_arg(3, bo_inA); + run1.set_arg(4, 0); + run1.set_arg(5, bo_out); + run1.set_arg(6, 0); + run1.set_arg(7, 0); + + // Executing and waiting on the runlist + runlist.add(run0); + runlist.add(run1); + runlist.execute(); + runlist.wait(); bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);