Add performance test
yzhang93 committed Feb 11, 2025
1 parent 1281e8d commit 11d610c
Showing 2 changed files with 102 additions and 10 deletions.
17 changes: 17 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
@@ -0,0 +1,17 @@
// input ${M}x${K}x32x64xbf16
// input ${N}x${K}x64x32xbf16

func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64xbf16>, %arg1: tensor<${N}x${K}x64x32xbf16>) -> tensor<${N}x${M}x32x32xf32> {
%cst = arith.constant 0.0 : f32
%0 = tensor.empty() : tensor<${N}x${M}x32x32xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<${N}x${M}x32x32xf32>) -> tensor<${N}x${M}x32x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64xbf16>, tensor<${N}x${K}x64x32xbf16>) outs(%1 : tensor<${N}x${M}x32x32xf32>) {
^bb0(%in: bf16, %in_1: bf16, %out: f32):
%12 = arith.extf %in : bf16 to f32
%13 = arith.extf %in_1 : bf16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<${N}x${M}x32x32xf32>
return %2 : tensor<${N}x${M}x32x32xf32>
}
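For reference, the computation expressed by this linalg.generic is equivalent to the NumPy sketch below (illustrative only, not part of the commit; the bf16 inputs are modeled as arrays cast to f32 before accumulation, matching the extf/mulf/addf body above):

import numpy as np

def matmul4d_reference(A, B):
    # A: (M, K, 32, 64), B: (N, K, 64, 32) -> C: (N, M, 32, 32), where
    # C[n, m, m0, n0] = sum over k, k0 of A[m, k, m0, k0] * B[n, k, k0, n0].
    return np.einsum("mkac,nkcb->nmab",
                     A.astype(np.float32), B.astype(np.float32))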
95 changes: 85 additions & 10 deletions build_tools/ci/cpu_comparison/run.py
@@ -19,7 +19,6 @@
from input_generator import (
generate_inputs,
verify_determinism,
load_input,
get_output_type,
np_from_binfile,
)
@@ -473,6 +472,61 @@ def _execute(self, config):
return self.vs_cpu(config)


class Matmul4d(BaseMatmul):
"""
A test of linalg.generic with 4d inputs and output implementing form:
C += matmul4d(A,B) where A:MxKxM0xK0, B:NxKxK0xN0, C:NxMxM0xN0
Note that the order of the input dims for this operation corresponds to
the L2 shapes of a standard matmul op after the first level packing.
For comparison purpose, the input values of inner dims M0/N0/K0 are
fixed as 32/32/64 currently.
TODO(vivian): Generalize the class and the template.
"""

def __init__(
self,
M,
N,
K,
input_type,
acc_type,
additional_labels=None,
n_repeats=1,
n_kernel_runs=1,
test_params=None,
):
super().__init__(
name=f"matmul4d_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
function_name="matmul4d",
n_repeats=n_repeats,
n_kernel_runs=n_kernel_runs,
)
self.labels.append("Matmul4d")
if additional_labels:
self.labels += additional_labels
if self.run_benchmark:
self.aie_compilation_flags += [
"--iree-amdaie-enable-infinite-loop-around-core-block=true"
]
self.labels.append("Matmul4dBenchmark")

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul4d_MxKxM0xK0_NxKxK0xN0.mlir"
self.generate(config, template_name)
if self.run_benchmark:
return self.benchmark(config)

return self.vs_cpu(config)
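# Illustrative usage sketch (not part of this commit): the new test can be
# constructed directly, e.g.
#
#   Matmul4d(M=16, N=128, K=8, input_type="bf16", acc_type="f32",
#            additional_labels=["Performance"], n_repeats=2, n_kernel_runs=100)
#
# The additional_labels / n_repeats / n_kernel_runs values here are
# hypothetical; only M/N/K and the bf16/f32 types mirror the template and
# the performance test entry added further below.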


class MatmulThinBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:N
@@ -489,9 +543,11 @@ def __init__(
):
super().__init__(
name=f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air"),
test_params=(
test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air")
),
M=M,
N=N,
K=K,
@@ -530,9 +586,11 @@ def __init__(
):
super().__init__(
name=f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air"),
test_params=(
test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air")
),
M=M,
N=N,
K=K,
@@ -552,8 +610,7 @@ def _execute(self, config):
"--iree-amdaie-num-cols=2",
]
)
self.vs_cpu(config)
return True
return self.vs_cpu(config)


class BatchMatmul(BaseMatmul):
@@ -1977,6 +2034,21 @@ def __init__(self):
"transpose_b": False,
"tile_pipeline": "pack-peel-4-level-tiling",
},
# matmul4d test where the input M/N/K are the outer dim values.
# The total sizes correspond to the standard matmul from the test
# above: M:512, N:4096, K:512.
{
"M": 16,
"N": 128,
"K": 8,
"use_ukernel": True,
"peano_opt_level": 3,
"outline": "balanced",
"transpose_a": False,
"transpose_b": False,
"matmul4d": True,
"tile_pipeline": "pack-peel-4-level-tiling",
},
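# Worked check of the comment above, with the inner dims fixed by the
# template at M0=32, N0=32, K0=64:
#   M: 16 * 32 = 512,  N: 128 * 32 = 4096,  K: 8 * 64 = 512.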
# Test where the compute is omitted; this should help triangulate
# how much performance gain can be obtained with better matmul
# on core vs data movement.
@@ -2069,6 +2141,7 @@ def __init__(self):
transpose_a = test["transpose_a"]
transpose_b = test["transpose_b"]
tile_pipeline = test["tile_pipeline"]
matmul4d = test["matmul4d"] if "matmul4d" in test else False
run_on_target = (
test["run_on_target"] if "run_on_target" in test else "npu1_4col"
)
@@ -2106,7 +2179,9 @@ def __init__(self):
else:
name_suffix += "_outline"

if (transpose_a, transpose_b) == (False, False):
if matmul4d:
TestClass = Matmul4d
elif (transpose_a, transpose_b) == (False, False):
TestClass = Matmul
elif (transpose_a, transpose_b) == (True, False):
TestClass = MatmulTransposeA