Add performance test
yzhang93 committed Feb 11, 2025
1 parent 1281e8d commit 11d610c
Showing 2 changed files with 102 additions and 10 deletions.
17 changes: 17 additions & 0 deletions build_tools/ci/cpu_comparison/matmul_template/matmul4d_MxKxM0xK0_NxKxK0xN0.mlir
@@ -0,0 +1,17 @@
// input ${M}x${K}x32x64xbf16
// input ${N}x${K}x64x32xbf16

func.func @matmul4d(%arg0: tensor<${M}x${K}x32x64xbf16>, %arg1: tensor<${N}x${K}x64x32xbf16>) -> tensor<${N}x${M}x32x32xf32> {
%cst = arith.constant 0.0 : f32
%0 = tensor.empty() : tensor<${N}x${M}x32x32xf32>
%1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<${N}x${M}x32x32xf32>) -> tensor<${N}x${M}x32x32xf32>
%2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d0, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<${M}x${K}x32x64xbf16>, tensor<${N}x${K}x64x32xbf16>) outs(%1 : tensor<${N}x${M}x32x32xf32>) {
^bb0(%in: bf16, %in_1: bf16, %out: f32):
%12 = arith.extf %in : bf16 to f32
%13 = arith.extf %in_1 : bf16 to f32
%14 = arith.mulf %12, %13 : f32
%15 = arith.addf %out, %14 : f32
linalg.yield %15 : f32
} -> tensor<${N}x${M}x32x32xf32>
return %2 : tensor<${N}x${M}x32x32xf32>
}
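For reference, the computation expressed by this linalg.generic is equivalent to the NumPy sketch below (illustrative only, not part of the commit; the bf16 inputs are modeled as arrays cast to f32 before accumulation, matching the extf/mulf/addf body above):

import numpy as np

def matmul4d_reference(A, B):
    # A: (M, K, 32, 64), B: (N, K, 64, 32) -> C: (N, M, 32, 32), where
    # C[n, m, m0, n0] = sum over k, k0 of A[m, k, m0, k0] * B[n, k, k0, n0].
    return np.einsum("mkac,nkcb->nmab",
                     A.astype(np.float32), B.astype(np.float32))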
95 changes: 85 additions & 10 deletions build_tools/ci/cpu_comparison/run.py
@@ -19,7 +19,6 @@
from input_generator import (
generate_inputs,
verify_determinism,
load_input,
get_output_type,
np_from_binfile,
)
@@ -473,6 +472,61 @@ def _execute(self, config):
return self.vs_cpu(config)


class Matmul4d(BaseMatmul):
"""
A test of linalg.generic with 4d inputs and output implementing form:
C += matmul4d(A,B) where A:MxKxM0xK0, B:NxKxK0xN0, C:NxMxM0xN0
Note that the order of the input dims for this operation corresponds to
the L2 shapes of a standard matmul op after the first level packing.
For comparison purpose, the input values of inner dims M0/N0/K0 are
fixed as 32/32/64 currently.
TODO(vivian): Generalize the class and the template.
"""

def __init__(
self,
M,
N,
K,
input_type,
acc_type,
additional_labels=None,
n_repeats=1,
n_kernel_runs=1,
test_params=None,
):
super().__init__(
name=f"matmul4d_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params,
M=M,
N=N,
K=K,
input_type=input_type,
acc_type=acc_type,
function_name="matmul4d",
n_repeats=n_repeats,
n_kernel_runs=n_kernel_runs,
)
self.labels.append("Matmul4d")
if additional_labels:
self.labels += additional_labels
if self.run_benchmark:
self.aie_compilation_flags += [
"--iree-amdaie-enable-infinite-loop-around-core-block=true"
]
self.labels.append("Matmul4dBenchmark")

def _execute(self, config):
matmul_template_dir = config.file_dir / "matmul_template"
template_name = matmul_template_dir / "matmul4d_MxKxM0xK0_NxKxK0xN0.mlir"
self.generate(config, template_name)
if self.run_benchmark:
return self.benchmark(config)

return self.vs_cpu(config)
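# Illustrative usage sketch (not part of this commit): the new test can be
# constructed directly, e.g.
#
#   Matmul4d(M=16, N=128, K=8, input_type="bf16", acc_type="f32",
#            additional_labels=["Performance"], n_repeats=2, n_kernel_runs=100)
#
# The additional_labels / n_repeats / n_kernel_runs values here are
# hypothetical; only M/N/K and the bf16/f32 types mirror the template and
# the performance test entry added further below.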


class MatmulThinBias(BaseMatmul):
"""
A test of the form matmul(A,B) + C where A:MxK, B:KxN, C:N
@@ -489,9 +543,11 @@ def __init__(
):
super().__init__(
name=f"matmul_thin_bias_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air"),
test_params=(
test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air")
),
M=M,
N=N,
K=K,
@@ -530,9 +586,11 @@ def __init__(
):
super().__init__(
name=f"matmul_full_bias_{M}_{N}_{K}_{input_type}_{acc_type}",
test_params=test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air"),
test_params=(
test_params
if test_params is not None
else TestParams(lower_to_aie_pipeline="air")
),
M=M,
N=N,
K=K,
@@ -552,8 +610,7 @@ def _execute(self, config):
"--iree-amdaie-num-cols=2",
]
)
self.vs_cpu(config)
return True
return self.vs_cpu(config)


class BatchMatmul(BaseMatmul):
@@ -1977,6 +2034,21 @@ def __init__(self):
"transpose_b": False,
"tile_pipeline": "pack-peel-4-level-tiling",
},
# matmul4d test where the input M/N/K are the outer dim values.
# The total sizes correspond to the standard matmul from the test
# above: M:512, N:4096, K:512.
{
"M": 16,
"N": 128,
"K": 8,
"use_ukernel": True,
"peano_opt_level": 3,
"outline": "balanced",
"transpose_a": False,
"transpose_b": False,
"matmul4d": True,
"tile_pipeline": "pack-peel-4-level-tiling",
},
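# Worked check of the comment above, with the inner dims fixed by the
# template at M0=32, N0=32, K0=64:
#   M: 16 * 32 = 512,  N: 128 * 32 = 4096,  K: 8 * 64 = 512.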
# Test where the compute is omitted; this should help triangulate
# how much performance gain can be obtained with better matmul
# on core vs data movement.
@@ -2069,6 +2141,7 @@ def __init__(self):
transpose_a = test["transpose_a"]
transpose_b = test["transpose_b"]
tile_pipeline = test["tile_pipeline"]
matmul4d = test["matmul4d"] if "matmul4d" in test else False
run_on_target = (
test["run_on_target"] if "run_on_target" in test else "npu1_4col"
)
@@ -2106,7 +2179,9 @@ def __init__(self):
else:
name_suffix += "_outline"

if (transpose_a, transpose_b) == (False, False):
if matmul4d:
TestClass = Matmul4d
elif (transpose_a, transpose_b) == (False, False):
TestClass = Matmul
elif (transpose_a, transpose_b) == (True, False):
TestClass = MatmulTransposeA