From dbee22b72f6859e6c9cfb40e9796166472043ee7 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Tue, 4 Feb 2025 18:28:12 +0000 Subject: [PATCH 1/8] WIP i8->i8 Strix --- .../matmul_trunci_MxK_KxN.mlir | 13 ++ build_tools/ci/cpu_comparison/run.py | 121 +++++++++++++++++- .../aievec/VectorToVectorConversions.cpp | 20 +-- .../AMD-AIE/iree-amd-aie/Target/mm_npu4.cc | 4 + .../Transforms/AMDAIEInsertCores.cpp | 4 +- .../iree-amd-aie/Transforms/Passes.cpp | 1 + 6 files changed, 150 insertions(+), 13 deletions(-) create mode 100644 build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir diff --git a/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir new file mode 100644 index 000000000..b6fe8e361 --- /dev/null +++ b/build_tools/ci/cpu_comparison/matmul_template/matmul_trunci_MxK_KxN.mlir @@ -0,0 +1,13 @@ +// input ${M}x${K}x${TYPE1} +// input ${K}x${N}x${TYPE1} + +func.func @matmul_trunci(%arg0: tensor<${M}x${K}x${TYPE1}>, %arg1: tensor<${K}x${N}x${TYPE1}>) -> tensor<${M}x${N}x${TYPE1}> +{ + %cst = arith.constant ${ZERO} : ${TYPE2} + %0 = tensor.empty() : tensor<${M}x${N}x${TYPE2}> + %1 = linalg.fill ins(%cst : ${TYPE2}) outs(%0 : tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}> + %2 = linalg.matmul ins(%arg0, %arg1 : tensor<${M}x${K}x${TYPE1}>, tensor<${K}x${N}x${TYPE1}>) + outs(%1: tensor<${M}x${N}x${TYPE2}>) -> tensor<${M}x${N}x${TYPE2}> + %3 = arith.trunci %2 : tensor<${M}x${N}x${TYPE2}> to tensor<${M}x${N}x${TYPE1}> + return %3: tensor<${M}x${N}x${TYPE1}> +} diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index e4d1eac3c..09a264e23 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -669,6 +669,88 @@ def _execute(self, config): return True +class MatmulTrunci(BaseMatmul): + """ + A test of the form matmul(A,B) + trunci(C) where A:MxK, B:KxM and C:MxM + """ + + def __init__( + self, + M, + K, + input_type, + acc_type, + lhs, + rhs, + expected_out, + run_on_target=["npu1_4col"], + tile_pipeline="pack-peel", + aie_compilation_flags=None, + use_ukernel=False, + n_repeats=1, + use_chess=False, + ): + super().__init__( + run_on_target=run_on_target, + aie_compilation_flags=aie_compilation_flags, + M=M, + N=M, + K=K, + input_type=input_type, + acc_type=acc_type, + tile_pipeline=tile_pipeline, + n_repeats=n_repeats, + use_ukernel=use_ukernel, + use_chess=use_chess, + ) + self.labels.append("MatmulTrunci") + + # Assertions on shapes: Check that lhs is MxK, rhs is KxM, and expected_out is MxM + assert lhs.shape == (M, K) + assert rhs.shape == (K, M) + assert expected_out.shape == (M, M) + + self.name = f"matmul_trunci_{M}_{K}_{input_type}_{acc_type}" + if tile_pipeline == "pack-peel-4-level-tiling": + self.name += "_4_level_tiling" + self.lhs = lhs + self.rhs = rhs + self.expected_out = expected_out + + def _execute(self, config): + matmul_template_dir = config.file_dir / "matmul_template" + template_name = matmul_template_dir / "matmul_trunci_MxK_KxN.mlir" + self.generate(config, template_name) + filename = self.get_filename(config) + input_args = generate_inputs( + filename, self.get_dir(config), 1, {1: self.lhs, 2: self.rhs} + ) + """ + Currently without function outlining, we run out of program memory. 
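+        Outlining the matmul into one function shared by the cores is
+        expected to bring each core's program back under the limit
+        (an inference from the flag used below).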
+ """ + self.add_aie_compilation_flags( + ["--iree-amdaie-enable-function-outlining=balanced"] + ) + aie_vs_baseline( + config=config, + aie_compilation_flags=self.aie_compilation_flags, + test_file=self.get_filename(config), + input_args=input_args, + baseline_value=self.expected_out, + use_ukernel=self.use_ukernel, + tile_pipeline=self.tile_pipeline, + function_name=None, + seed=1, + rtol=0, + atol=0, + lower_to_aie_pipeline=self.lower_to_aie_pipeline, + n_repeats=self.n_repeats, + output_type=get_output_type(self.get_filename(config)), + ) + + return True + + def find_executable(install_dir: Path, executable_name): """ Search for an executable in the given directory and its subdirectories @@ -1450,6 +1532,25 @@ def __init__(self): self.existing_names = [] self.tests = [] + self.register( + MatmulTrunci( + 256, + 32, + "i8", + "i32", + 1 * np.ones([256, 32], dtype=np.int8), + 1 * np.ones([32, 256], dtype=np.int8), + 32 * np.ones([256, 256], dtype=np.int8), + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + ], + use_chess=True, + use_ukernel=True, + ) + ) # Matmul with truncf test(s): for tile_pipeline in ["pack-peel", "pack-peel-4-level-tiling"]: self.register( @@ -1628,9 +1729,10 @@ def __init__(self): ) self.register( Matmul( - 512, - 512, + 128, + 128, 256, + "i8", "i32", "i32", test_params=TestParams( @@ -1664,6 +1766,21 @@ def __init__(self): ) ) + # self.register( + # Matmul( + # 64, + # 64, + # 64, + # "bf16", + # "f32", + # use_ukernel=True, + # use_chess=True, + # run_on_target=["npu4"], + # aie_compilation_flags=[ + # "--mlir-print-ir-before-all", + # ], + # ) + # ) self.register( Matmul( 64, diff --git a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp index b7e8cf744..2cde4b91e 100644 --- a/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp +++ b/compiler/plugins/target/AMD-AIE/aievec/VectorToVectorConversions.cpp @@ -802,10 +802,11 @@ struct ToMinorIdentityTransferReadPattern /// %1 = arith.truncf %0 : vector<6xf32> to vector<6xbf16> /// %2 = vector.shape_cast %1 : vector<6xbf16> to vector<2x3xbf16> // clang-format on -struct FlattenArithTruncFOpPattern : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; +template +struct FlattenArithTruncOpPattern : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(arith::TruncFOp op, + LogicalResult matchAndRewrite(TruncOpTy op, PatternRewriter &rewriter) const override { // Get old shape type. auto oldShapedType = dyn_cast(op.getType()); @@ -826,7 +827,7 @@ struct FlattenArithTruncFOpPattern : public OpRewritePattern { Value newInputVector = rewriter.create( op.getLoc(), newVectorTypeForInput, origInputOfTruncFOp); // Create new base operation with the linearized input/output. - Value newTruncFOp = rewriter.create( + Value newTruncFOp = rewriter.create( op.getLoc(), newVectorTypeForOutput, newInputVector); // Delinearize the output back to the original type. 
     rewriter.replaceOpWithNewOp<vector::ShapeCastOp>(op, op.getType(),
@@ -1054,11 +1055,12 @@ struct CanonicalizeVectorForAIEVecPass
 
     RewritePatternSet patterns(context);
 
-    patterns
-        .add<FlattenArithTruncFOpPattern, ToMinorIdentityTransferReadPattern,
-             ToMinorIdentityTransferWritePattern,
-             ConvertLeadingUnitDimInsertToReshapePattern>(context);
+    patterns.add<FlattenArithTruncOpPattern<arith::TruncFOp>,
+                 FlattenArithTruncOpPattern<arith::TruncIOp>,
+                 ToMinorIdentityTransferReadPattern,
+                 ToMinorIdentityTransferWritePattern,
+                 ConvertLeadingUnitDimInsertToReshapePattern>(context);
     patterns.add<ConvertSplatTransferReadToBroadcastPattern>(context);
     patterns
         .add(
-        op);
+        arith::TruncFOp, arith::TruncIOp, vector::TransferReadOp,
+        vector::TransferWriteOp>(op);
 }
 
 /// Utility to map the parallel mapping attributes to the corresponding
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
index 7c04e74f5..64a09c9bd 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp
@@ -369,6 +369,7 @@ void addPackPeel4LevelTilingBasedPassPipeline(
     AMDAIETileAndFuseOptions tileFuseOptions;
     tileFuseOptions.tilingLevel = 1;
     tileFuseOptions.useSCFFor = false;
+    tileFuseOptions.tileElementwise = false;
     funcPassManager.addPass(createAMDAIETileAndFusePass(tileFuseOptions));
   }
   funcPassManager.addPass(createAMDAIECleanupPass());

From a3f9a72d396160e410eafc7f7ad6131a2c772507 Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Thu, 6 Feb 2025 15:10:34 +0000
Subject: [PATCH 2/8] Phoenix ukernel + vectorization support

---
 build_tools/ci/cpu_comparison/run.py          |  39 +++++
 .../precanonicalization-aieml-llvmir.mlir     |  13 ++
 .../AMD-AIE/iree-amd-aie/Target/mm_npu1.cc    | 159 +++++++++++++++++-
 .../Transforms/AMDAIEVectorization.cpp        |   2 +-
 .../Transforms/test/vectorization.mlir        |  18 +-
 5 files changed, 227 insertions(+), 4 deletions(-)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 09a264e23..f937bcf1b 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1532,6 +1532,45 @@ def __init__(self):
         self.existing_names = []
         self.tests = []
 
+        # Tests Matmul + Trunci.
+        # Phoenix : Ukernel + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                32,
+                "i8",
+                "i32",
+                1 * np.ones([256, 32], dtype=np.int8),
+                1 * np.ones([32, 256], dtype=np.int8),
+                32 * np.ones([256, 256], dtype=np.int8),
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu1_4col"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=4",
+                ],
+                use_ukernel=True,
+            )
+        )
+        # Phoenix : Vectorization + Peano.
+        self.register(
+            MatmulTrunci(
+                256,
+                32,
+                "i8",
+                "i32",
+                1 * np.ones([256, 32], dtype=np.int8),
+                1 * np.ones([32, 256], dtype=np.int8),
+                32 * np.ones([256, 256], dtype=np.int8),
+                tile_pipeline="pack-peel-4-level-tiling",
+                run_on_target=["npu1_4col"],
+                aie_compilation_flags=[
+                    "--iree-amdaie-num-rows=4",
+                    "--iree-amdaie-num-cols=4",
+                ],
+            )
+        )
+        # Strix : Ukernel + Chess.
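+        # (npu4, i.e. Strix, exposes a 4x8 core array, hence num-rows=4
+        # and num-cols=8 in this test's flags.)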
         self.register(
             MatmulTrunci(
                 256,
diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
index 1317069bd..97ab9cbae 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
@@ -169,6 +169,19 @@ func.func @arith_truncf(%inp: vector<2x3xf32>) -> vector<2x3xbf16> {
 
 // -----
 
+// CHECK-LABEL: @arith_trunci(
+// CHECK-SAME: %[[INP:.*]]: vector<2x3xi32>)
+func.func @arith_trunci(%inp: vector<2x3xi32>) -> vector<2x3xi8> {
+  // CHECK: %[[LINEARIZE:.*]] = vector.shape_cast %[[INP]] : vector<2x3xi32> to vector<6xi32>
+  // CHECK: %[[TRUNCI:.*]] = arith.trunci %[[LINEARIZE]] : vector<6xi32> to vector<6xi8>
+  // CHECK: %[[DELINEARIZE:.*]] = vector.shape_cast %[[TRUNCF]] : vector<6xi8> to vector<2x3xi8>
+  // CHECK: return %[[DELINEARIZE]]
+  %0 = arith.trunci %inp : vector<2x3xi32> to vector<2x3xi8>
+  return %0 : vector<2x3xi8>
+}
+
+// -----
+
 // CHECK: #map = affine_map<()[s0] -> (s0 * 256 + 96)>
 // CHECK-LABEL: @trivial_read_access
 // CHECK-SAME: (%[[ARG0:.*]]: memref<4x8x4x8xbf16, strided<[256, 32, 8, 1]>>,
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu1.cc b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu1.cc
index c50012951..c4c69e744 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu1.cc
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/mm_npu1.cc
@@ -262,6 +262,137 @@ void matmul_vectorized(const T_in *__restrict pA, unsigned offsetA,
   event1();
 }
 
+// Compute a rowA x colB grid of r x t output tiles, processing a 4 (row) x
+// 2 (column) block of aie::mmul accumulators per inner iteration.
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+static inline void matmul_vectorized_4x2(const T_in *__restrict pA,
+                                         unsigned offsetA,
+                                         const T_in *__restrict pB,
+                                         unsigned offsetB,
+                                         T_out *__restrict pC,
+                                         unsigned offsetC) {
+
+  using MMUL = aie::mmul<r, s, t, T_in, T_in, accauto>;
+
+  event0();
+
+  for (unsigned z = 0; z < rowA; z += 4)
+    chess_prepare_for_pipelining chess_loop_range(4, ) {
+      T_out *__restrict pC1 = pC + offsetC + (z * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC2 = pC + offsetC + ((z + 1) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC3 = pC + offsetC + ((z + 2) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + offsetC + ((z + 3) * colB + 0) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 2)
+#ifdef OPT_PERF_ENABLED
+        chess_flatten_loop
+#endif
+        {
+          const T_in *__restrict pA1 = pA + offsetA + (z * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA2 = pA + offsetA + ((z + 1) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + offsetA + ((z + 2) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + offsetA + ((z + 3) * colA + 0) * MMUL::size_A;
+
+          const T_in *__restrict pB1 = pB + offsetB + (0 * colB + j) * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + offsetB + (0 * colB + (j + 1)) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A01 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A11 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A21 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A31 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B01 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += (MMUL::size_B * colB);
+          aie::vector<T_in, MMUL::size_B> B11 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += (MMUL::size_B * colB);
+
+          // Load the current 4x2 block of C accumulator tiles.
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+
+          C00.mac(A01, B01);
+          C01.mac(A01, B11);
+          C10.mac(A11, B01);
+          C11.mac(A11, B11);
+          C20.mac(A21, B01);
+          C21.mac(A21, B11);
+          C30.mac(A31, B01);
+          C31.mac(A31, B11);
+
+          // Multiply-accumulate over the remaining K tiles.
+          for (unsigned i = 1; i < colA; i += 1)
+#ifdef OPT_PERF_ENABLED
+            chess_flatten_loop
+#endif
+            {
+              A01 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A11 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              A21 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += MMUL::size_A;
+              A31 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += MMUL::size_A;
+              B01 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += (MMUL::size_B * colB);
+              B11 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += (MMUL::size_B * colB);
+
+              C00.mac(A01, B01);
+              C01.mac(A01, B11);
+              C10.mac(A11, B01);
+              C11.mac(A11, B11);
+              C20.mac(A21, B01);
+              C21.mac(A21, B11);
+              C30.mac(A31, B01);
+              C31.mac(A31, B11);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+        }
+    }
+
+  event1();
+}
+
 template <unsigned m, unsigned k, unsigned n>
 void matmul_vectorized_4x8x4_bf16_bf16_bf16(const bfloat16 *__restrict pA,
                                             unsigned offsetA,
@@ -295,15 +426,35 @@ void matmul_vectorized_4x8x4_bf16_bf16_f32(const bfloat16 *__restrict pA,
                                            pA, offsetA, pB, offsetB, pC, offsetC);
 }
 
+template <unsigned m, unsigned k, unsigned n>
+void matmul_vectorized_4x8x8_i8_i8_i32(const int8 *__restrict pA,
+                                       unsigned offsetA,
+                                       const int8 *__restrict pB,
+                                       unsigned offsetB, int32 *__restrict pC,
+                                       unsigned offsetC) {
+  constexpr int r = 4;
+  constexpr int s = 8;
+  constexpr int t = 8;
+  static_assert(m % (4 * r) == 0); // 'm' dimension
+  static_assert(k % s == 0);       // 'k' dimension
+  static_assert(n % (2 * t) == 0); // 'n' dimension
+  return matmul_vectorized_4x2<int8, int32, m / r, k / s, n / t, r, s, t>(
+      pA, offsetA, pB, offsetB, pC, offsetC);
+}
+
 extern "C" {
 
 #define matmul_combos(X, M, N, K)                                     \
   X(bfloat16, bf16, bfloat16, bf16, bfloat16, bf16, M, N, K, 4, 8, 4) \
   X(bfloat16, bf16, bfloat16, bf16, float, f32, M, N, K, 4, 8, 4)
 
+#define matmul_combos_i8(X, M, N, K)                                  \
+  X(int8, i8, int8, i8, int32, i32, M, N, K, 4, 8, 8)
+
 #define zero_fill_combos(X, M, N) \
   X(bfloat16, bf16, M, N, N/2)    \
-  X(float, f32, M, N, N/2)
+  X(float, f32, M, N, N/2)        \
+  X(int32, i32, M, N, N/2)
 
 #define matmul_vectorized_c_func(lhs_ctype_in, lhs_mlir_type_in,  \
                                  rhs_ctype_in, rhs_mlir_type_in,  \
@@ -324,6 +475,12 @@ matmul_combos(matmul_vectorized_c_func, 16, 16, 32)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 32)
 matmul_combos(matmul_vectorized_c_func, 64, 64, 64)
 matmul_combos(matmul_vectorized_c_func, 32, 32, 64)
+matmul_combos_i8(matmul_vectorized_c_func, 16, 16, 32)
+matmul_combos_i8(matmul_vectorized_c_func, 32, 32, 8)
+matmul_combos_i8(matmul_vectorized_c_func, 32, 32, 16)
+matmul_combos_i8(matmul_vectorized_c_func, 32, 32, 32)
+matmul_combos_i8(matmul_vectorized_c_func, 32, 32, 64)
+matmul_combos_i8(matmul_vectorized_c_func, 64, 64, 64)
 
 zero_fill_combos(zero_vectorized_c_func, 16, 16)
zero_fill_combos(zero_vectorized_c_func, 32, 32)
 
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEVectorization.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEVectorization.cpp
index be3c47e42..dafea8ee1 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEVectorization.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEVectorization.cpp
@@ -89,7 +89,7 @@ void AMDAIEVectorizationPass::runOnOperation() {
       // gap between this pass and vector-to-aievec.
       for (Operation &innerOps :
            cast<linalg::GenericOp>(op).getBody()->getOperations()) {
-        if (!isa<arith::TruncFOp>(innerOps)) {
+        if (!isa<arith::TruncFOp, arith::TruncIOp>(innerOps)) {
           op->emitRemark() << "not vectorizing linalg elementwise op";
           return;
         }
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
index 4f6c95dfe..241eb4da6 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
@@ -64,9 +64,9 @@ func.func @fillAndCopy() -> tensor<8xbf16> {
 }
 
 
-// CHECK-LABEL: @matmul_elementwise
+// CHECK-LABEL: @matmul_elementwise_truncf
 // CHECK-SAME: (%[[ARG0:.*]]: tensor<4240x160xf32>, %[[ARG1:.*]]: tensor<4240x160xbf16>)
-func.func @matmul_elementwise(%arg0: tensor<4240x160xf32>, %arg1: tensor<4240x160xbf16>) -> tensor<4240x160xbf16> {
+func.func @matmul_elementwise_truncf(%arg0: tensor<4240x160xf32>, %arg1: tensor<4240x160xbf16>) -> tensor<4240x160xbf16> {
   %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0: tensor<4240x160xf32>) outs(%arg1 : tensor<4240x160xbf16>) {
   ^bb0(%in: f32, %out: bf16):
     %1 = arith.truncf %in : f32 to bf16
@@ -77,3 +77,17 @@ func.func @matmul_elementwise(%arg0: tensor<4240x160xf32>, %arg1: tensor<4240x16
 // CHECK: %[[VEC_OPERAND_0:.*]] = vector.transfer_read %[[ARG0]]{{.*}} vector<4240x160xf32>
 // CHECK: %[[TRUNCF:.*]] = arith.truncf %[[VEC_OPERAND_0]]
 // CHECK: vector.transfer_write %[[TRUNCF]], %[[ARG1]]
+
+// CHECK-LABEL: @matmul_elementwise_trunci
+// CHECK-SAME: (%[[ARG0:.*]]: tensor<4240x160xi32>, %[[ARG1:.*]]: tensor<4240x160xi8>)
+func.func @matmul_elementwise_trunci(%arg0: tensor<4240x160xi32>, %arg1: tensor<4240x160xi8>) -> tensor<4240x160xi8> {
+  %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0: tensor<4240x160xi32>) outs(%arg1 : tensor<4240x160xi8>) {
+  ^bb0(%in: i32, %out: i8):
+    %1 = arith.trunci %in : i32 to bf16
+    linalg.yield %1 : bf16
+  } -> tensor<4240x160xi8>
+  return %0 : tensor<4240x160xi8>
+}
+// CHECK: %[[VEC_OPERAND_0:.*]] = vector.transfer_read %[[ARG0]]{{.*}} vector<4240x160xi32>
+// CHECK: %[[TRUNCI:.*]] = arith.trunci %[[VEC_OPERAND_0]]
+// CHECK: vector.transfer_write %[[TRUNCF]], %[[ARG1]]

From e1bb675a5a02b906d2f6eb4e92129a5aa64d81a3 Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Thu, 6 Feb 2025 15:13:44 +0000
Subject: [PATCH 3/8] Revert previous tests

---
 build_tools/ci/cpu_comparison/run.py | 32 ++++++++++++----------------
 1 file changed, 14 insertions(+), 18 deletions(-)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index f937bcf1b..7770e80fe 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -1616,6 +1612,18 @@ def __init__(self):
                 test_params=TestParams(tile_pipeline=tile_pipeline),
             )
         )
+        self.register(
+            MatmulTruncf(
+                128,
+                256,
+                "bf16",
+                "f32",
+                2 * np.ones([128, 256]),
+                3 * np.ones([256, 128]),
+                1536 * np.ones([128, 128]),
+                tile_pipeline=tile_pipeline,
+            )
+        )
 
         # BatchMatmul test(s):
         # TODO(jornt): BatchMatmul tests with the pack-peel-4-level-tiling pipeline result in intermittent
@@ -1768,10 +1780,9 @@ def __init__(self):
         )
         self.register(
             Matmul(
-                128,
-                128,
+                512,
+                512,
                 256,
-                "i8",
                 "i32",
                 "i32",
                 test_params=TestParams(
@@ -1805,21 +1816,6 @@ def __init__(self):
             )
         )
 
-        # self.register(
-        #     Matmul(
-        #         64,
-        #         64,
-        #         64,
-        #         "bf16",
-        #         "f32",
-        #         use_ukernel=True,
-        #         use_chess=True,
-        #         run_on_target=["npu4"],
-        #         aie_compilation_flags=[
-        #             "--mlir-print-ir-before-all",
-        #         ],
-        #     )
-        # )
         self.register(
             Matmul(
                 64,

From 51c86dad9b0c5aa2ba7431f2360634a9b9841a5b Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Thu, 6 Feb 2025 15:19:15 +0000
Subject: [PATCH 4/8] Add insert-cores lit test

---
 .../precanonicalization-aieml-llvmir.mlir     |  2 +-
 .../Transforms/test/insert_cores.mlir         | 25 +++++++++++++++++++
 .../Transforms/test/vectorization.mlir        |  6 ++---
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir b/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
index 97ab9cbae..bae91b9c4 100644
--- a/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
+++ b/compiler/plugins/target/AMD-AIE/aievec/test/precanonicalization-aieml-llvmir.mlir
@@ -174,7 +174,7 @@ func.func @arith_trunci(%inp: vector<2x3xi32>) -> vector<2x3xi8> {
   // CHECK: %[[LINEARIZE:.*]] = vector.shape_cast %[[INP]] : vector<2x3xi32> to vector<6xi32>
   // CHECK: %[[TRUNCI:.*]] = arith.trunci %[[LINEARIZE]] : vector<6xi32> to vector<6xi8>
-  // CHECK: %[[DELINEARIZE:.*]] = vector.shape_cast %[[TRUNCF]] : vector<6xi8> to vector<2x3xi8>
+  // CHECK: %[[DELINEARIZE:.*]] = vector.shape_cast %[[TRUNCI]] : vector<6xi8> to vector<2x3xi8>
   // CHECK: return %[[DELINEARIZE]]
   %0 = arith.trunci %inp : vector<2x3xi32> to vector<2x3xi8>
   return %0 : vector<2x3xi8>
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
index 808f6d188..09352b518 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/insert_cores.mlir
@@ -325,3 +325,28 @@ module {
     return
   }
 }
+
+// -----
+
+// CHECK-LABEL: @insert_trunci_within_core
+// CHECK:       scf.forall
+// CHECK:       amdaie.tile
+// CHECK:       amdaie.core
+// CHECK:       vector.transfer_read
+// CHECK:       arith.trunci
+// CHECK:       vector.transfer_write
+// CHECK:       amdaie.end
+module {
+  func.func @insert_trunci_within_core(%arg0: memref<10x10xi32, 2 : i32>, %arg1: memref<10x10xi8, 2 : i32>) {
+    %cst = arith.constant 0 : i32
+    %c1 = arith.constant 1 : index
+    %c3 = arith.constant 3 : index
+    %c0 = arith.constant 0 : index
+    scf.forall (%arg3, %arg4) in (2, 2) {
+      %read = vector.transfer_read %arg0[%c0, %c1], %cst {in_bounds = [true, true]} : memref<10x10xi32, 2 : i32>, vector<1x1xi32>
+      %trunci = arith.trunci %read : vector<1x1xi32> to vector<1x1xi8>
+      vector.transfer_write %trunci, %arg1[%c0, %c1] {in_bounds = [true, true]} : vector<1x1xi8>, memref<10x10xi8, 2 : i32>
+    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
+    return
+  }
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
index 241eb4da6..72d0be752 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/vectorization.mlir
@@ -83,11 +83,11 @@ func.func @matmul_elementwise_trunci(%arg0: tensor<4240x160xi32>, %arg1: tensor<4240x160xi8>) -> tensor<4240x160xi8> {
   %0 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg0: tensor<4240x160xi32>) outs(%arg1 : tensor<4240x160xi8>) {
   ^bb0(%in: i32, %out: i8):
-    %1 = arith.trunci %in : i32 to bf16
-    linalg.yield %1 : bf16
+    %1 = arith.trunci %in : i32 to i8
+    linalg.yield %1 : i8
   } -> tensor<4240x160xi8>
   return %0 : tensor<4240x160xi8>
 }
 // CHECK: %[[VEC_OPERAND_0:.*]] = vector.transfer_read %[[ARG0]]{{.*}} vector<4240x160xi32>
 // CHECK: %[[TRUNCI:.*]] = arith.trunci %[[VEC_OPERAND_0]]
-// CHECK: vector.transfer_write %[[TRUNCF]], %[[ARG1]]
+// CHECK: vector.transfer_write %[[TRUNCI]], %[[ARG1]]

From 6d29a033eef9dd04ae35bf4214a5eb5f7668227b Mon Sep 17 00:00:00 2001
From: Abhishek Varma
Date: Mon, 10 Feb 2025 07:50:18 +0000
Subject: [PATCH 5/8] Refactored e2e test rebase

---
 build_tools/ci/cpu_comparison/run.py | 76 +++++++++++-----------------
 1 file changed, 30 insertions(+), 46 deletions(-)

diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py
index 7770e80fe..5778eec15 100755
--- a/build_tools/ci/cpu_comparison/run.py
+++ b/build_tools/ci/cpu_comparison/run.py
@@ -683,25 +683,18 @@ def __init__(
         self,
         M,
         K,
         input_type,
         acc_type,
         lhs,
         rhs,
         expected_out,
-        run_on_target=["npu1_4col"],
-        tile_pipeline="pack-peel",
-        aie_compilation_flags=None,
-        use_ukernel=False,
         n_repeats=1,
-        use_chess=False,
+        running_params: RunningParams = RunningParams(),
     ):
         super().__init__(
-            run_on_target=run_on_target,
-            aie_compilation_flags=aie_compilation_flags,
+            name=f"matmul_trunci_{M}_{K}_{input_type}_{acc_type}",
+            running_params=running_params,
             M=M,
             N=M,
             K=K,
             input_type=input_type,
             acc_type=acc_type,
-            tile_pipeline=tile_pipeline,
             n_repeats=n_repeats,
-            use_ukernel=use_ukernel,
-            use_chess=use_chess,
         )
         self.labels.append("MatmulTrunci")
 
@@ -710,9 +703,6 @@ def __init__(
         assert rhs.shape == (K, M)
         assert expected_out.shape == (M, M)
 
-        self.name = f"matmul_trunci_{M}_{K}_{input_type}_{acc_type}"
-        if tile_pipeline == "pack-peel-4-level-tiling":
-            self.name += "_4_level_tiling"
         self.lhs = lhs
         self.rhs = rhs
         self.expected_out = expected_out
@@ -1543,13 +1533,15 @@ def __init__(self):
                 1 * np.ones([256, 32], dtype=np.int8),
                 1 * np.ones([32, 256], dtype=np.int8),
                 32 * np.ones([256, 256], dtype=np.int8),
-                tile_pipeline="pack-peel-4-level-tiling",
-                run_on_target=["npu1_4col"],
-                aie_compilation_flags=[
-                    "--iree-amdaie-num-rows=4",
-                    "--iree-amdaie-num-cols=4",
-                ],
-                use_ukernel=True,
+                running_params=RunningParams(
+                    tile_pipeline="pack-peel-4-level-tiling",
+                    run_on_target=["npu1_4col"],
+                    aie_compilation_flags=[
+                        "--iree-amdaie-num-rows=4",
+                        "--iree-amdaie-num-cols=4",
+                    ],
+                    use_ukernel=True,
+                ),
             )
         )
         # Phoenix : Vectorization + Peano.
@@ -1562,12 +1554,14 @@ def __init__(self): 1 * np.ones([256, 32], dtype=np.int8), 1 * np.ones([32, 256], dtype=np.int8), 32 * np.ones([256, 256], dtype=np.int8), - tile_pipeline="pack-peel-4-level-tiling", - run_on_target=["npu1_4col"], - aie_compilation_flags=[ - "--iree-amdaie-num-rows=4", - "--iree-amdaie-num-cols=4", - ], + running_params=RunningParams( + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu1_4col"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=4", + ], + ), ) ) # Strix : Ukernel + Chess. @@ -1580,14 +1574,16 @@ def __init__(self): 1 * np.ones([256, 32], dtype=np.int8), 1 * np.ones([32, 256], dtype=np.int8), 32 * np.ones([256, 256], dtype=np.int8), - tile_pipeline="pack-peel-4-level-tiling", - run_on_target=["npu4"], - aie_compilation_flags=[ - "--iree-amdaie-num-rows=4", - "--iree-amdaie-num-cols=8", - ], - use_chess=True, - use_ukernel=True, + running_params=RunningParams( + tile_pipeline="pack-peel-4-level-tiling", + run_on_target=["npu4"], + aie_compilation_flags=[ + "--iree-amdaie-num-rows=4", + "--iree-amdaie-num-cols=8", + ], + use_chess=True, + use_ukernel=True, + ), ) ) # Matmul with truncf test(s): @@ -1616,18 +1612,6 @@ def __init__(self): test_params=TestParams(tile_pipeline=tile_pipeline), ) ) - self.register( - MatmulTruncf( - 128, - 256, - "bf16", - "f32", - 2 * np.ones([128, 256]), - 3 * np.ones([256, 128]), - 1536 * np.ones([128, 128]), - tile_pipeline=tile_pipeline, - ) - ) # BatchMatmul test(s): # TODO(jornt): BatchMatmul tests with the pack-peel-4-level-tiling pipeline result in intermittent From aed3430a5cb4b2517a55692f8c2bfc62a58cf491 Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Mon, 10 Feb 2025 11:33:35 +0000 Subject: [PATCH 6/8] Update for not running ukernel test if Vitis is not found --- build_tools/ci/cpu_comparison/run.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 5778eec15..910c8fdfc 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -159,6 +159,8 @@ def run(self, config): # does not). if self.use_chess and not config.vitis_dir: return False + if self.use_ukernel and not config.vitis_dir: + return False # If use_chess=0, and config has not provided a valid # path to peano, then bail: a path to peano must be provided. 
@@ -684,11 +686,11 @@ def __init__( rhs, expected_out, n_repeats=1, - running_params: RunningParams = RunningParams(), + test_params=None, ): super().__init__( name=f"matmul_trunci_{M}_{K}_{input_type}_{acc_type}", - running_params=running_params, + test_params=test_params, M=M, N=M, K=K, @@ -1533,7 +1535,7 @@ def __init__(self): 1 * np.ones([256, 32], dtype=np.int8), 1 * np.ones([32, 256], dtype=np.int8), 32 * np.ones([256, 256], dtype=np.int8), - running_params=RunningParams( + test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu1_4col"], aie_compilation_flags=[ @@ -1554,7 +1556,7 @@ def __init__(self): 1 * np.ones([256, 32], dtype=np.int8), 1 * np.ones([32, 256], dtype=np.int8), 32 * np.ones([256, 256], dtype=np.int8), - running_params=RunningParams( + test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu1_4col"], aie_compilation_flags=[ @@ -1574,7 +1576,7 @@ def __init__(self): 1 * np.ones([256, 32], dtype=np.int8), 1 * np.ones([32, 256], dtype=np.int8), 32 * np.ones([256, 256], dtype=np.int8), - running_params=RunningParams( + test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu4"], aie_compilation_flags=[ From 7563854623a7f496ce8156f3866a52f21b8ad1ca Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 12 Feb 2025 06:17:44 +0000 Subject: [PATCH 7/8] Fix n_repeats trivial issue --- build_tools/ci/cpu_comparison/run.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 910c8fdfc..3c27acb74 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -685,7 +685,6 @@ def __init__( lhs, rhs, expected_out, - n_repeats=1, test_params=None, ): super().__init__( @@ -696,7 +695,6 @@ def __init__( K=K, input_type=input_type, acc_type=acc_type, - n_repeats=n_repeats, ) self.labels.append("MatmulTrunci") From 53921245ccea1e4a181a26eb3d42d604ca2d8b7b Mon Sep 17 00:00:00 2001 From: Abhishek Varma Date: Wed, 12 Feb 2025 09:52:11 +0000 Subject: [PATCH 8/8] Review comments v1.0 --- build_tools/ci/cpu_comparison/run.py | 40 +++++++++++----------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index 3c27acb74..683e23bc6 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -645,12 +645,6 @@ def _execute(self, config): input_args = generate_inputs( filename, self.get_dir(config), 1, {1: self.lhs, 2: self.rhs} ) - """ - Currently without function outlining, we run out of program memory. 
- """ - self.add_aie_compilation_flags( - ["--iree-amdaie-enable-function-outlining=balanced"] - ) aie_vs_baseline( config=config, aie_compilation_flags=self.aie_compilation_flags, @@ -673,12 +667,13 @@ def _execute(self, config): class MatmulTrunci(BaseMatmul): """ - A test of the form matmul(A,B) + trunci(C) where A:MxK, B:KxM and C:MxM + A test of the form matmul(A,B) + trunci(C) where A:MxK, B:KxN and C:MxN """ def __init__( self, M, + N, K, input_type, acc_type, @@ -688,20 +683,20 @@ def __init__( test_params=None, ): super().__init__( - name=f"matmul_trunci_{M}_{K}_{input_type}_{acc_type}", + name=f"matmul_trunci_{M}_{N}_{K}_{input_type}_{acc_type}", test_params=test_params, M=M, - N=M, + N=N, K=K, input_type=input_type, acc_type=acc_type, ) self.labels.append("MatmulTrunci") - # Assertions on shapes: Check that lhs is MxK, rhs is KxM, and expected_out is MxM + # Assertions on shapes: Check that lhs is MxK, rhs is KxN, and expected_out is MxN assert lhs.shape == (M, K) - assert rhs.shape == (K, M) - assert expected_out.shape == (M, M) + assert rhs.shape == (K, N) + assert expected_out.shape == (M, N) self.lhs = lhs self.rhs = rhs @@ -715,12 +710,6 @@ def _execute(self, config): input_args = generate_inputs( filename, self.get_dir(config), 1, {1: self.lhs, 2: self.rhs} ) - """ - Currently without function outlining, we run out of program memory. - """ - self.add_aie_compilation_flags( - ["--iree-amdaie-enable-function-outlining=balanced"] - ) aie_vs_baseline( config=config, aie_compilation_flags=self.aie_compilation_flags, @@ -1527,12 +1516,13 @@ def __init__(self): self.register( MatmulTrunci( 256, + 128, 32, "i8", "i32", 1 * np.ones([256, 32], dtype=np.int8), - 1 * np.ones([32, 256], dtype=np.int8), - 32 * np.ones([256, 256], dtype=np.int8), + 1 * np.ones([32, 128], dtype=np.int8), + 32 * np.ones([256, 128], dtype=np.int8), test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu1_4col"], @@ -1548,12 +1538,13 @@ def __init__(self): self.register( MatmulTrunci( 256, + 128, 32, "i8", "i32", 1 * np.ones([256, 32], dtype=np.int8), - 1 * np.ones([32, 256], dtype=np.int8), - 32 * np.ones([256, 256], dtype=np.int8), + 1 * np.ones([32, 128], dtype=np.int8), + 32 * np.ones([256, 128], dtype=np.int8), test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu1_4col"], @@ -1568,12 +1559,13 @@ def __init__(self): self.register( MatmulTrunci( 256, + 128, 32, "i8", "i32", 1 * np.ones([256, 32], dtype=np.int8), - 1 * np.ones([32, 256], dtype=np.int8), - 32 * np.ones([256, 256], dtype=np.int8), + 1 * np.ones([32, 128], dtype=np.int8), + 32 * np.ones([256, 128], dtype=np.int8), test_params=TestParams( tile_pipeline="pack-peel-4-level-tiling", run_on_target=["npu4"],