merge kernels in existing XCLBIN #418

Status: Draft. Wants to merge 5 commits into main. Showing changes from 4 commits.

2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -103,7 +103,7 @@ jobs:
run: |
python3 -m venv .venv
source .venv/bin/activate
pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061222+3ac9566-py3-none-manylinux_2_35_x86_64.whl
pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061622+18c8815-py3-none-manylinux_2_35_x86_64.whl

pip install -r tests/matmul/requirements.txt

3 changes: 3 additions & 0 deletions build_tools/ci/cpu_comparison/run_test.sh
@@ -301,6 +301,9 @@ function run_test() {
run_test \
--test_file ${THIS_DIR}/test_files/matmul_int32.mlir

run_test \
--test_file ${THIS_DIR}/test_files/three_matmuls.mlir

run_test \
--name_prefix "matmul" \
--lhs_rhs_type "bf16" \
31 changes: 31 additions & 0 deletions build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir
@@ -0,0 +1,31 @@
// This test exercises arbitrary matmuls that have producer-consumer relationships
// across different dispatches, running on CI.

// These lines are strictly required by the script which generates input data:
//
// input 32x32xf32
// input 32x32xf32
// input 32x4xf32
// input 4x32xf32

!A_TYPE = tensor<32x32xf32>
!B_TYPE = tensor<32x4xf32>
!C_TYPE = tensor<4x32xf32>
!D_TYPE = tensor<4x4xf32>
func.func @two_mm(%lhs : !A_TYPE,
%rhs : !A_TYPE, %rhs_2 : !B_TYPE, %lhs_2 : !C_TYPE) -> !D_TYPE {
%empty = tensor.empty() : !A_TYPE
%empty_2 = tensor.empty() : !B_TYPE
%empty_3 = tensor.empty() : !D_TYPE
%cst = arith.constant 0.0 : f32
%fill = linalg.fill ins(%cst : f32) outs(%empty : !A_TYPE) -> !A_TYPE
%fill_2 = linalg.fill ins(%cst : f32) outs(%empty_2 : !B_TYPE) -> !B_TYPE
%fill_3 = linalg.fill ins(%cst : f32) outs(%empty_3 : !D_TYPE) -> !D_TYPE
%2 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !A_TYPE)
outs(%fill : !A_TYPE) -> !A_TYPE
%3 = linalg.matmul ins(%2, %rhs_2 : !A_TYPE, !B_TYPE)
outs(%fill_2 : !B_TYPE) -> !B_TYPE
%4 = linalg.matmul ins(%lhs_2, %3 : !C_TYPE, !B_TYPE)
outs(%fill_3 : !D_TYPE) -> !D_TYPE
return %4 : !D_TYPE
}
69 changes: 52 additions & 17 deletions compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -268,6 +268,8 @@ LogicalResult AIETargetBackend::serializeExecutable(
SmallVector<uint32_t> xclbinIndices(ordinalCount);
SmallVector<uint32_t> asmInstrIndices(ordinalCount);

SmallVector<SmallString<128>> xclbinPaths;

for (size_t i = 0; i < entryPointNames.size(); i++) {
uint64_t ordinal = entryPointOrdinals.at(entryPointNames[i]);

@@ -300,18 +302,34 @@
llvm::sys::path::append(npuInstPath,
entryPointNamesFb[ordinal] + ".npu.txt");

SmallVector<StringRef> cmdArgs{aie2xclbin,
inputMlirPath,
"--peano",
options.peanoInstallDir,
"--xclbin-name",
xclbinPath,
"--npu-insts-name",
npuInstPath,
"--xclbin-kernel-name",
entryPointNamesFb[ordinal],
"--tmpdir",
entryPointWorkDir};
// Convert ordinal to hexadecimal string for xclbin kernel id.
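// For example, ordinal 0 becomes "0xa" and ordinal 6 becomes "0x10".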
std::stringstream ss;
ss << "0x" << std::hex << ordinal + 10;
std::string ordinalHex = ss.str();

SmallVector<StringRef> cmdArgs;
SmallVector<StringRef> cmdArgsBase{aie2xclbin,
inputMlirPath,
"--peano",
options.peanoInstallDir,
"--xclbin-name",
xclbinPath,
"--npu-insts-name",
npuInstPath,
"--xclbin-kernel-name",
entryPointNamesFb[ordinal],
"--tmpdir",
entryPointWorkDir,
"--xclbin-kernel-id",
ordinalHex};
cmdArgs = cmdArgsBase;
bool AttemptingMerge = false;
if (i > 0) {
cmdArgs.push_back("--input-xclbin-name");
cmdArgs.push_back(xclbinPaths.back());
AttemptingMerge = true;
}
xclbinPaths.push_back(xclbinPath);

auto addOpt = [&](StringRef arg, bool value) {
if (value) cmdArgs.push_back(arg);
@@ -350,11 +368,24 @@
{
SmallVector<StringRef> cmdEnvRefs{cmdEnv.begin(), cmdEnv.end()};
int result = llvm::sys::ExecuteAndWait(cmdArgs[0], cmdArgs, cmdEnvRefs);
if (result != 0)
if (result != 0 && AttemptingMerge) {
Review comment (Contributor):

Can we detect beforehand if the same kernel appears twice? Just because it'll be better to know why we failed. My concern is that we'll stop being able to merge for some other reason, and not know about it.

@nirvedhmeshram (Contributor, Author) replied on Jun 18, 2024:

I am not sure I correctly understand the case you are thinking about, but if I understand correctly: some kernel cannot be merged into the existing XCLBIN, and if a later kernel is the same as the one that couldn't be merged, we won't be able to merge that one either?

If my interpretation is correct, here are a few reasons why this is not a concern.

  1. IREE has deduplication passes after dispatch formation, so the same kernel never appears twice.
  2. The success of the merge is independent of the content of the kernel. The only case I am aware of where it will fail is when we exceed the number of PDIs that an xclbin supports. We have one PDI per kernel for now, until we get to super kernels, and I believe the PDI limit is 16. So the idea is that the 17th kernel will fail to merge and start a new xclbin, with the process repeating for the 33rd kernel and so on. As far as the xclbin utility is concerned, all 33 of these kernels (PDIs) could have the same content and it wouldn't matter. This is a bit hard to test, though, and we aren't at that scale yet, so I haven't worked on testing it.
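
A minimal standalone sketch (not part of this PR) of the grouping behaviour described above, with a hypothetical tryBuildXclbin standing in for the aie2xclbin invocation and the PDI limit assumed to be 16: each kernel first tries to merge into the most recent xclbin and falls back to starting a fresh one when the merge fails.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical stand-in for an aie2xclbin invocation; returns false when a
// merge would exceed the assumed PDI limit (16) of the existing xclbin.
static bool tryBuildXclbin(const std::string &kernel, int kernelsAlreadyIn,
                           bool merge) {
  (void)kernel;  // kernel content does not affect merge success
  return !merge || kernelsAlreadyIn < 16;
}

int main() {
  // Kernels are processed in order; each one tries to merge into the most
  // recent xclbin and starts a new xclbin when the merge fails, so kernels
  // partition into groups of at most 16 (one PDI per kernel).
  std::vector<int> kernelsPerXclbin;
  for (int kernel = 0; kernel < 33; ++kernel) {
    bool merged = !kernelsPerXclbin.empty() &&
                  tryBuildXclbin("k" + std::to_string(kernel),
                                 kernelsPerXclbin.back(), /*merge=*/true);
    if (merged)
      ++kernelsPerXclbin.back();
    else
      kernelsPerXclbin.push_back(1);  // the 1st, 17th, 33rd kernel each start one
  }
  for (size_t i = 0; i < kernelsPerXclbin.size(); ++i)
    std::cout << "xclbin " << i << ": " << kernelsPerXclbin[i] << " kernels\n";
  // Prints 16, 16, 1 -- matching the 17th/33rd-kernel example above.
  return 0;
}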

// We failed to create the xclbin, but perhaps we failed because we were
// trying to merge the kernel into an existing xclbin; try again without
// merging to see if that succeeds.
AttemptingMerge = false;
result =
llvm::sys::ExecuteAndWait(cmdArgsBase[0], cmdArgsBase, cmdEnvRefs);
xclbinPaths.push_back(xclbinPath);
}
if (result != 0) {
return moduleOp.emitOpError(
"Failed to produce an XCLBin with external tool.");
}
// delete the previous xclbin if we were able to merge as the new one now
// will have all the kernels from the previous one.
if (AttemptingMerge) xclbinPaths.erase(xclbinPaths.end() - 2);
xclbinIndices[ordinal] = xclbinPaths.size() - 1;
}

std::ifstream instrFile(static_cast<std::string>(npuInstPath));
std::string line;
while (std::getline(instrFile, line)) {
@@ -369,17 +400,21 @@
asmInstrIndices[ordinal] = asmInstrRefs.size();
asmInstrRefs.push_back(
iree_amd_aie_hal_xrt_AsmInstDef_create(builder, npuInstrsVec));

}
// Write out the final xclbins to flatbuffer.
for (auto xclbinPath : xclbinPaths) {
llvm::outs() << "writing xclbin from path: " << xclbinPath << "\n";
std::string errorMessage;
xclbinIn = openInputFile(xclbinPath, &errorMessage);
if (!xclbinIn) {
moduleOp.emitOpError() << "Failed to open xclbin file: " << errorMessage;
}
auto xclbinStringRef = builder.createString(xclbinIn->getBuffer());
xclbinIndices[ordinal] = xclbinRefs.size();
xclbinRefs.push_back(
iree_amd_aie_hal_xrt_XclbinDef_create(builder, xclbinStringRef));
}
// Serialize the executable to flatbuffer format

// Serialize the executable to flatbuffer format.
auto entryPointsRef = builder.createStringVec(entryPointNamesFb);

iree_amd_aie_hal_xrt_ExecutableDef_entry_points_add(builder, entryPointsRef);
26 changes: 18 additions & 8 deletions runtime/src/iree-amd-aie/driver/xrt/native_executable.cc
@@ -128,6 +128,9 @@ iree_status_t iree_hal_xrt_native_executable_create(
iree_amd_aie_hal_xrt_XclbinDef_vec_t xclbins_vec =
iree_amd_aie_hal_xrt_ExecutableDef_xclbins_get(executable_def);

iree_host_size_t number_xclbin =
iree_amd_aie_hal_xrt_XclbinDef_vec_len(xclbins_vec);

iree_amd_aie_hal_xrt_AsmInstDef_vec_t asm_instrs_vec =
iree_amd_aie_hal_xrt_ExecutableDef_asm_instrs_get(executable_def);

@@ -163,17 +166,15 @@
&executable->resource);
executable->host_allocator = host_allocator;
executable->entry_point_count = entry_point_count;
for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count;
entry_ordinal++) {
const char* entry_name =
flatbuffers_string_vec_at(entry_points_vec, entry_ordinal);
uint32_t xclbin_index =
flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal);
// Collect all the hardware contexts first, as multiple entry points can map
// to the same context; this way we don't need to keep reloading them.
std::vector<xrt::hw_context> contexts;
for (iree_host_size_t xclbin_index = 0; xclbin_index < number_xclbin;
xclbin_index++) {
iree_amd_aie_hal_xrt_XclbinDef_table_t xclbin_def =
iree_amd_aie_hal_xrt_XclbinDef_vec_at(xclbins_vec, xclbin_index);
flatbuffers_string_t xclbin_fb =
iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def);

// The XRT API needs this vector and can't actually read a void*.
std::vector<char> xclbinVector(
xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb));
@@ -186,6 +187,14 @@
}
device.register_xclbin(xclbin);
xrt::hw_context context(device, xclbin.get_uuid());
contexts.push_back(context);
}
for (iree_host_size_t entry_ordinal = 0; entry_ordinal < entry_point_count;
entry_ordinal++) {
const char* entry_name =
flatbuffers_string_vec_at(entry_points_vec, entry_ordinal);
uint32_t xclbin_index =
flatbuffers_uint32_vec_at(xclbin_indices_vec, entry_ordinal);
uint32_t asm_instr_index =
flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal);
iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def =
Expand All @@ -196,7 +205,8 @@ iree_status_t iree_hal_xrt_native_executable_create(
std::unique_ptr<xrt::kernel> kernel;
std::unique_ptr<xrt::bo> instr;
try {
kernel = std::make_unique<xrt::kernel>(context, entry_name);
kernel =
std::make_unique<xrt::kernel>(contexts[xclbin_index], entry_name);
// XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction
// buffer that resides in instr_memory. This buffer is always passed as
// the second argument to the kernel and we can use the