Numerical test for strix compiled with peano (#1077)

The main change here is an update to the pass that unsets the load and store alignments. Now it unsets _all_ llvm.loads and llvm.stores (see comments in pass). Without this, peano tries to scalarize an llvm.load and gets into an infinite/quadratic loop. This PR adds an end-to-end numerical test of peano on strix. One unrelated change in AMDAIELowerToAIE.cpp : while experimenting with adding alignment/noalias attributes to the function signature in function outlining, noticed that the attributes were being dropped in this pass.
nod-ai · Feb 5, 2025 · 4c43795 · 4c43795
1 parent f13c033
commit 4c43795
Show file tree

Hide file tree

Showing 11 changed files with 148 additions and 83 deletions.
diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
@@ -1871,7 +1871,7 @@ def __init__(self):
         )
 
         # Matmul tests on a single core.
-        for device in ["npu1_4col"]:
+        for device in ["npu1_4col", "npu4"]:
             self.register(
                 Matmul(
                     128,

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadAlignmentReset.cpp
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadStoreAlignmentReset.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELoadStoreAlignmentReset.cpp
@@ -0,0 +1,104 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "iree-amd-aie/IR/AMDAIEDialect.h"
+#include "iree-amd-aie/Transforms/Passes.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
+#include "mlir/Dialect/SCF/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/Utils/Utils.h"
+#include "mlir/Pass/Pass.h"
+#define DEBUG_TYPE "iree-amdaie-loads-store-alignment-reset"
+
+namespace mlir::iree_compiler::AMDAIE {
+
+using namespace mlir;
+
+namespace {
+
+/// A pass that removes the alignment attribute from llvm.load and llvm.store
+/// operations. As example, this pass will replace
+///
+///  ```
+///  %20 = llvm.load %19 {alignment = 1 : i64} : !llvm.ptr -> vector<32xi8>
+///  ```
+///
+///  with
+///
+///  ```
+///  %20 = llvm.load %19 : !llvm.ptr -> vector<32xi8>
+///  ```
+///
+/// The motivation for this is that the alignment on the llvm.load operation,
+/// which is assigned in the pass `convert-vector-to-llvm` is currently too
+/// conservative and if left as is, results in poor (and sometimes invalid)
+/// scalarized code in peano.
+///
+/// The pass `convert-vector-to-llvm` does not seem to be doing any analysis to
+/// choose the highest possible alignment. It lowers
+///
+/// ```
+/// %11 = vector.transfer_read %collapse_shape[%10], %c0_i8 {in_bounds = [true]}
+/// : memref<1024xi8>, vector<32xi8>
+/// ```
+///
+/// to
+///
+/// ```
+/// %20 = llvm.load %19 {alignment = 1 : i64} : !llvm.ptr -> vector<32xi8>
+/// ```
+///
+/// even when it can be inferred that %10 is a multiple ot 32. The pass
+/// seems to always just use the alignment of the element type.
+///
+/// By resetting the alignments that the `convert-vector-to-llvm` pass assigns,
+/// the lowering/translation to LLVMIR assigns new alignments. In other words it
+/// seems that the translation does not modify existing alignments on llvm.load
+/// or llvm.store, but if there is no alignment present it assigns one.
+///
+/// Whereas the `convert-vector-to-llvm` pass assigns an alignment based on
+/// the element-type, the translation to LLVMIR assigns an alignment based on
+/// the vector width. For example, for a vector of 32 i8 values, the alignment
+/// assigned is 32.
+///
+/// I can imagine cases where the llvm.load actually loads with an alignment
+/// less than the vector width, for example if you're loading overlapping
+/// vectors (for a convolution say):
+///
+/// iteration 1: load bytes 0-8.
+/// iteration 2: load bytes 4-12.
+/// iteration 3: load bytes 8-16.
+///
+/// in this case using the width of the vector (8 bytes) as the alignment would
+/// be incorrect, as the loads are only 4-byte aligned. Future work: check if
+/// LLVMIR lowering correctly handles this case, if not implement an analysis.
+///
+/// See also https://jira.xilinx.com/projects/AIECC/issues/AIECC-589
+
+class AMDAIELoadStoreAlignmentReset
+    : public impl::AMDAIELoadStoreAlignmentResetBase<
+          AMDAIELoadStoreAlignmentReset> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AMDAIEDialect>();
+  }
+
+  void runOnOperation() override {
+    getOperation()->walk([](Operation *op) {
+      if (auto loadOp = dyn_cast<LLVM::LoadOp>(op)) {
+        loadOp.setAlignment(std::optional<uint64_t>());
+      } else if (auto storeOp = dyn_cast<LLVM::StoreOp>(op)) {
+        storeOp.setAlignment(std::optional<uint64_t>());
+      }
+    });
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<Pass> createAMDAIELoadStoreAlignmentResetPass() {
+  return std::make_unique<AMDAIELoadStoreAlignmentReset>();
+}
+
+}  // namespace mlir::iree_compiler::AMDAIE
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp
@@ -423,6 +423,7 @@ LogicalResult AIEDeviceBuilder::coreFuncCallOpToAIE(
   StringRef fnName = oldCallOp.getCallee();
   auto fnDecl = dyn_cast_if_present<func::FuncOp>(
       SymbolTable::lookupSymbolIn(moduleOp, fnName));
+
   assert(fnDecl && "expected function declaration");
   // Check the mapper to see if we've already created a new function declaration
   // with the new function type. If not, create the same. We need to create a
@@ -437,14 +438,20 @@ LogicalResult AIEDeviceBuilder::coreFuncCallOpToAIE(
     SymbolTable::setSymbolVisibility(newFnDecl,
                                      SymbolTable::Visibility::Private);
     newFnDecl->setAttr("llvm.bareptr", rewriter.getBoolAttr(true));
+
     fnDecl.getBody().cloneInto(&(newFnDecl.getBody()), mapper);
+    if (ArrayAttr oldAttrs = fnDecl.getAllArgAttrs()) {
+      newFnDecl.setAllArgAttrs(oldAttrs);
+    }
+
     mapper.map(fnDecl.getOperation(), newFnDecl.getOperation());
     fnDecl = newFnDecl;
   }
   // Fetch the new function declaration and create the new func.call op.
   auto newFnDecl = cast<func::FuncOp>(mapper.lookupOrDefault(fnDecl));
   rewriter.create<func::CallOp>(oldCallOp->getLoc(), newFnDecl, newArgs);
   toBeErased.push_back(oldCallOp);
+
   return success();
 }
 

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/CMakeLists.txt
@@ -86,7 +86,7 @@ iree_cc_library(
     "AMDAIEInsertLoopsForVectorization.cpp"
     "AMDAIELinkExecutables.cpp"
     "AMDAIELocalizeLogicalObjectFifo.cpp"
-    "AMDAIELoadAlignmentReset.cpp"
+    "AMDAIELoadStoreAlignmentReset.cpp"
     "AMDAIELowerExecutableTarget.cpp"
     "AMDAIELowerFuncArgs.cpp"
     "AMDAIELowerToAIE.cpp"

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/PassDetail.h
@@ -67,7 +67,7 @@ namespace mlir::iree_compiler::AMDAIE {
 #define GEN_PASS_DEF_AMDAIEINSERTINFINITELOOPAROUNDCOREBLOCK
 #define GEN_PASS_DEF_AMDAIEINSERTLOOPSFORVECTORIZATION
 #define GEN_PASS_DEF_AMDAIELINKEXECUTABLES
-#define GEN_PASS_DEF_AMDAIELOADALIGNMENTRESET
+#define GEN_PASS_DEF_AMDAIELOADSTOREALIGNMENTRESET
 #define GEN_PASS_DEF_AMDAIELOCALIZELOGICALOBJECTFIFO
 #define GEN_PASS_DEF_AMDAIELOWEREXECUTABLETARGET
 #define GEN_PASS_DEF_AMDAIELOWERINGSTRATEGY

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -929,7 +929,7 @@ void addMLIRAIELoweringPasses(OpPassManager &pm) {
   pm.addPass(createArithToLLVMConversionPass());
   pm.addPass(createCanonicalizerPass());
   pm.addPass(createCSEPass());
-  pm.addPass(createAMDAIELoadAlignmentResetPass());
+  pm.addPass(createAMDAIELoadStoreAlignmentResetPass());
   pm.addPass(createCanonicalizerPass());
 }
 

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h
@@ -338,7 +338,7 @@ std::unique_ptr<Pass> createAMDAIETileAndFusePass(
 std::unique_ptr<Pass> createAMDAIEPropagateDataLayoutPass();
 
 /// Create pass to reset the alignment of LLVM load operations.
-std::unique_ptr<Pass> createAMDAIELoadAlignmentResetPass();
+std::unique_ptr<Pass> createAMDAIELoadStoreAlignmentResetPass();
 
 void registerAMDAIEPasses();
 

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td
@@ -321,8 +321,8 @@ def AMDAIELinalgFunctionOutlining :
                    "All ops are outlined."),
         clEnumValN(mlir::iree_compiler::AMDAIE::OutliningStrategy::Balanced, "balanced",
                    "A strategy that tries to achieve a balanced tradeoff between performance and program size.")
-      )}]>,
-  ];
+      )}]>
+    ];
 }
 
 def AMDAIEFoldDmaWaits :
@@ -467,14 +467,14 @@ def AMDAIELinkExecutables :
   let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELinkExecutablesPass()";
 }
 
-def AMDAIELoadAlignmentReset :
-    Pass<"iree-amdaie-load-alignment-reset", ""> {
+def AMDAIELoadStoreAlignmentReset :
+    Pass<"iree-amdaie-load-store-alignment-reset", ""> {
     let summary = "Reset the alignment of the LLVM load operations.";
-    let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELoadAlignmentResetPass()";
+    let constructor = "mlir::iree_compiler::AMDAIE::createAMDAIELoadStoreAlignmentResetPass()";
     let description = [{
-      Reset the alignment of the LLVM load operations to the 'unset'
-      optional value. This is a workaround for an issue in peano, and
-      should eventually be removed.
+      Reset the alignment of the LLVM load and store operations to the 'unset'
+      optional value. This is a workaround for an issue where alignments assigned
+      are too low/conservative in the vector-to-llvm pass.
     }];
 }
 

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt
@@ -57,6 +57,7 @@ iree_lit_test_suite(
     "insert_dma_bd_chain.mlir"
     "insert_infinite_loop_around_core_block.mlir"
     "insert_loops_for_vectorization.mlir"
+    "load_store_alignment_reset.mlir"
     "localize_logical_objectfifo.mlir"
     "lower_func_args.mlir"
     "lower_to_aie.mlir"

diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/load_store_alignment_reset.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/load_store_alignment_reset.mlir
@@ -0,0 +1,23 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-amdaie-load-store-alignment-reset))" %s | FileCheck %s
+
+
+// CHECK-LABEL: func @alignmentsWillBeRemoved(%arg0: !llvm.ptr)
+func.func  @alignmentsWillBeRemoved(%arg0: !llvm.ptr) {
+
+  // CHECK:      %[[L1:.+]] = llvm.load %arg0 : !llvm.ptr -> vector<32xi8>
+  // CHECK-NEXT: %[[L2:.+]] = llvm.load %arg0 : !llvm.ptr -> vector<32xi8>
+  // CHECK-NEXT: %[[L4:.+]] = llvm.load %arg0 : !llvm.ptr -> vector<32xi8>
+  %l1 = llvm.load %arg0 {alignment = 1 : i64} : !llvm.ptr -> vector<32xi8>
+  %l2 = llvm.load %arg0 {alignment = 2 : i64} : !llvm.ptr -> vector<32xi8>
+  %l4 = llvm.load %arg0 {alignment = 4 : i64} : !llvm.ptr -> vector<32xi8>
+
+  // CHECK-NEXT: llvm.store %[[L1]], %arg0 : vector<32xi8>, !llvm.ptr
+  // CHECK-NEXT: llvm.store %[[L1]], %arg0 : vector<32xi8>, !llvm.ptr
+  // CHECK-NEXT: llvm.store %[[L1]], %arg0 : vector<32xi8>, !llvm.ptr
+  llvm.store %l1, %arg0 {alignment = 1 : i64} : vector<32xi8>, !llvm.ptr
+  llvm.store %l1, %arg0 {alignment = 2 : i64} : vector<32xi8>, !llvm.ptr
+  llvm.store %l1, %arg0 {alignment = 4 : i64} : vector<32xi8>, !llvm.ptr
+
+  // CHECK-NEXT: return
+  return
+}