From b375644e638db9852eb54e64b77de9cfc46d180e Mon Sep 17 00:00:00 2001 From: Benoit Jacob Date: Tue, 28 May 2024 14:42:43 -0400 Subject: [PATCH] Revert "Data tiling: transpose narrow-N into narrow-M (#17446)" This reverts commit 16bdaa90e2f02769db2a8949ab88e58c9443392d. --- .../Common/CPU/CPUMaterializeEncodingPass.cpp | 14 +- .../test/llvmcpu_materialize_encoding.mlir | 183 +++++++----------- .../compiler/Codegen/Common/EncodingUtils.cpp | 107 ++-------- .../compiler/Codegen/Common/EncodingUtils.h | 4 - .../MaterializeEncodingIntoPackUnPack.cpp | 99 ++++------ tests/e2e/linalg/BUILD.bazel | 4 - tests/e2e/linalg/CMakeLists.txt | 4 - tests/e2e/linalg/narrow_n_matmuls.mlir | 126 ------------ 8 files changed, 130 insertions(+), 411 deletions(-) delete mode 100644 tests/e2e/linalg/narrow_n_matmuls.mlir diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp index c1948b941801..85973609efd1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/CPUMaterializeEncodingPass.cpp @@ -498,8 +498,20 @@ materializeEncodingForTarget(RankedTensorType tensorType, if (enumeratedTileMxNxK.empty()) { return failure(); } + // Check if the encoding specifies static narrow sizes for the M/N dimensions. + // This can be used to choose a correspondingly narrow tile shape. + // With microkernels, we keep this logic in sync with the set of actual + // optimized microkernel tile functions to avoid a tile shape specialization + // causing a fallback to a slow generic tile function. At the moment, + // microkernel tile functions are only specialized for narrow M, not for narrow + // N. Accordingly, we leave matmulNarrowN as 0 (default) when microkernels are + // used. Generally it would be best to deal with narrow-N cases by transposing + // the whole matmul and swapping LHS<->RHS, reducing the narrow-N case to + // narrow-M. int64_t matmulNarrowM = getIntOrZero(encoding.getMatmulNarrow_M()); - int64_t matmulNarrowN = getIntOrZero(encoding.getMatmulNarrow_N()); + int64_t matmulNarrowN = hasUkernel(targetAttr, "mmt4d") + ? 0 : getIntOrZero(encoding.getMatmulNarrow_N()); // Choose a final matmul TileMxNxK from the above-enumerated tile shapes, // taking narrow dimensions into account. 
TileMxNxK chosenTileMxNxK = diff --git a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir index 64a87b07f4da..eaa1661535ea 100644 --- a/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir +++ b/compiler/src/iree/compiler/Codegen/Common/CPU/test/llvmcpu_materialize_encoding.mlir @@ -254,45 +254,6 @@ func.func @pack_gemm_fill_dynamic(%arg0 : tensor, %arg1 : tensor (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matvec_shaped_matmul_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { - hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> -} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> - %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16x1xf32> - %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16x1xf32> - %padded = tensor.pad %0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x16xf32> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x1xf32> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x1xf32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %6 = linalg.matmul ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0, 0] [16, 1] [1, 1] : tensor to tensor<16x1xf32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<16x1xf32> -> !hal.buffer_view - func.return %8 : !hal.buffer_view -} -// 
CHECK-LABEL: func @matvec_shaped_matmul_lowering_f32f32f32_aarch64( -// CHECK: %[[MMT4D:.+]] = linalg.mmt4d -// CHECK-SAME: ins({{.*}} : tensor<1x16x1x1xf32>, tensor<2x16x8x1xf32>) -// CHECK-SAME: outs({{.*}} : tensor<1x2x1x8xf32>) -> tensor<1x2x1x8xf32> - -// ----- - #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -356,45 +317,6 @@ func.func @matmul_lowering_f32f32f32_aarch64() attributes { // ----- -#map = affine_map<(d0, d1, d2) -> (d0, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d2, d1)> -#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> -func.func @matvec_lowering_f32f32f32_aarch64(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { - hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="aarch64-xyz-xyz"}> -} { - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<16x16xf32> - %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<16xf32> - %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<16xf32> - %padded = tensor.pad %0 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor<16x16xf32> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %padded_0 = tensor.pad %1 low[0] high[%c0] { - ^bb0(%arg3: index): - tensor.yield %cst : f32 - } : tensor<16xf32> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %padded_1 = tensor.pad %2 low[0] high[%c0] { - ^bb0(%arg3: index): - tensor.yield %cst : f32 - } : tensor<16xf32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %6 = linalg.matvec ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0] [16] [1] : tensor to tensor<16xf32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<16xf32> -> !hal.buffer_view - func.return %8 : !hal.buffer_view -} -// CHECK-LABEL: func @matvec_lowering_f32f32f32_aarch64( -// CHECK: %[[MMT4D:.+]] = linalg.mmt4d -// CHECK-SAME: ins({{.*}} : tensor<1x16x1x1xf32>, tensor<2x16x8x1xf32>) -// CHECK-SAME: outs({{.*}} : tensor<1x2x1x8xf32>) -> 
tensor<1x2x1x8xf32> - -// ----- - #map = affine_map<(d0, d1, d2) -> (d0, d2)> #map1 = affine_map<(d0, d1, d2) -> (d2, d1)> #map2 = affine_map<(d0, d1, d2) -> (d0, d1)> @@ -434,18 +356,18 @@ func.func @matvec_lowering_f32f32f32_aarch64() attributes { // CHECK: %[[RHS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(1) // CHECK-SAME: !flow.dispatch.tensor> // CHECK: %[[OUTS_BINDING:.+]] = hal.interface.binding.subspan set(0) binding(2) -// CHECK-SAME: !flow.dispatch.tensor> +// CHECK-SAME: !flow.dispatch.tensor> // CHECK: %[[LHS:.+]] = flow.dispatch.tensor.load %[[LHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 16, 8, 1], strides = [1, 1, 1, 1] // CHECK: %[[RHS:.+]] = flow.dispatch.tensor.load %[[RHS_BINDING]] // CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 16, 1, 1], strides = [1, 1, 1, 1] // CHECK: %[[OUTS:.+]] = flow.dispatch.tensor.load %[[OUTS_BINDING]] -// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 2, 1, 8], strides = [1, 1, 1, 1] +// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 1, 8, 1], strides = [1, 1, 1, 1] // CHECK: %[[MMT4D:.+]] = linalg.mmt4d -// CHECK-SAME: ins(%[[RHS]], %[[LHS]] : +// CHECK-SAME: ins(%[[LHS]], %[[RHS]] : // CHECK-SAME: outs(%[[OUTS]] : // CHECK: flow.dispatch.tensor.store %[[MMT4D]], %[[OUTS_BINDING]] -// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [1, 2, 1, 8], strides = [1, 1, 1, 1] +// CHECK-SAME: offsets = [0, 0, 0, 0], sizes = [2, 1, 8, 1], strides = [1, 1, 1, 1] // ----- @@ -2246,10 +2168,10 @@ func.func @matvec(%arg0: tensor<11008x128xi8>, %arg1: tensor<128xi8>) -> tensor< // CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32 // CHECK: %[[INIT_FILL:.+]] = tensor.empty() : tensor<688x16xi32> // CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0, 1], [2, 3]] output_shape [1, 64, 1, 2] : tensor<64x2xi32> into tensor<1x64x1x2xi32> -// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 688, 1, 16] : tensor<688x16xi32> into tensor<1x688x1x16xi32> -// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x688x1x16xi32>) -> tensor<1x688x1x16xi32> -// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[EXPAND_RHS]], %[[LHS_EXT]] : tensor<1x64x1x2xi32>, tensor<688x64x16x2xi32>) outs(%[[FILL]] : tensor<1x688x1x16xi32>) -> tensor<1x688x1x16xi32> -// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x688x1x16xi32> into tensor<688x16xi32> +// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [688, 1, 16, 1] : tensor<688x16xi32> into tensor<688x1x16x1xi32> +// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<688x1x16x1xi32>) -> tensor<688x1x16x1xi32> +// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<688x64x16x2xi32>, tensor<1x64x1x2xi32>) outs(%[[FILL]] : tensor<688x1x16x1xi32>) -> tensor<688x1x16x1xi32> +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<688x1x16x1xi32> into tensor<688x16xi32> // CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<11008xi32> // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<688x16xi32> -> tensor<11008xi32> // CHECK: return %[[UNPACK]] @@ -2320,10 +2242,10 @@ func.func @matvec_with_narrow_M(%arg0: tensor<15x128xi8>, %arg1: tensor<128xi8>) // CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32 // CHECK: 
%[[INIT_FILL:.+]] = tensor.empty() : tensor<1x16xi32> // CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0, 1], [2, 3]] output_shape [1, 64, 1, 2] : tensor<64x2xi32> into tensor<1x64x1x2xi32> -// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 1, 1, 16] : tensor<1x16xi32> into tensor<1x1x1x16xi32> -// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x1x1x16xi32>) -> tensor<1x1x1x16xi32> -// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[EXPAND_RHS]], %[[LHS_EXT]] : tensor<1x64x1x2xi32>, tensor<1x64x16x2xi32>) outs(%[[FILL]] : tensor<1x1x1x16xi32>) -> tensor<1x1x1x16xi32> -// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x1x1x16xi32> into tensor<1x16xi32> +// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0, 1], [2, 3]] output_shape [1, 1, 16, 1] : tensor<1x16xi32> into tensor<1x1x16x1xi32> +// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<1x1x16x1xi32>) -> tensor<1x1x16x1xi32> +// CHECK: %[[MMT4D:.+]] = linalg.mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<1x64x16x2xi32>, tensor<1x64x1x2xi32>) outs(%[[FILL]] : tensor<1x1x16x1xi32>) -> tensor<1x1x16x1xi32> +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0, 1], [2, 3]] : tensor<1x1x16x1xi32> into tensor<1x16xi32> // CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<15xi32> // CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<1x16xi32> -> tensor<15xi32> // CHECK: return %[[UNPACK]] @@ -2404,38 +2326,77 @@ func.func @batch_vecmat(%arg0: tensor<32x128xi8>, %arg1: tensor<32x128x11008xi8> // ----- -func.func @batch_matvec(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes { +#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +func.func @batch_matvec(%arg0: tensor<32x11008x128xi8>, %arg1: tensor<32x128xi8>) -> tensor<32x11008xi32> attributes { hal.executable.target = #hal.executable.target<"xyz", "xyz", {target_triple="x86_64-xyz-xyz", cpu_features="+avx512vnni"}> } { + %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index - %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c128 = arith.constant 128 : index + %c11008 = arith.constant 11008 : index %c0_i8 = arith.constant 0 : i8 - %0 = hal.tensor.import %arg0 "input0" : !hal.buffer_view -> tensor<32x11008x128xi8> - %1 = hal.tensor.import %arg1 "input1" : !hal.buffer_view -> tensor<32x128xi8> - %2 = hal.tensor.import %arg2 "input2" : !hal.buffer_view -> tensor<32x11008xi32> - %padded = tensor.pad %0 low[0, 0, 0] high[%c0, %c0, %c0] { - ^bb0(%arg3: index, %arg4: index, %arg5: index): + %c0_i32 = arith.constant 0 : i32 + %padded = tensor.pad %arg0 low[0, 0, 0] high[%c0, %c0, %c0] { + ^bb0(%arg2: index, %arg3: index, %arg4: index): tensor.yield %c0_i8 : i8 } : tensor<32x11008x128xi8> to tensor - %3 = iree_encoding.set_encoding %padded : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_0 = tensor.pad %1 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): + %4 = iree_encoding.set_encoding %padded : tensor -> tensor, 
user_indexing_maps = [#map, #map1, #map2]>> + %5 = tensor.empty(%c32, %c11008, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> + %6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%5 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + ^bb0(%in: i8, %out: i32): + %17 = arith.extsi %in : i8 to i32 + linalg.yield %17 : i32 + } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> + %padded_0 = tensor.pad %arg1 low[0, 0] high[%c0, %c0] { + ^bb0(%arg2: index, %arg3: index): tensor.yield %c0_i8 : i8 } : tensor<32x128xi8> to tensor - %4 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %padded_1 = tensor.pad %2 low[0, 0] high[%c0, %c0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %c0_i32 : i32 - } : tensor<32x11008xi32> to tensor - %5 = iree_encoding.set_encoding %padded_1 : tensor -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %6 = linalg.batch_matvec ins(%3, %4 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>, tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) outs(%5 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>>) -> tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> - %7 = iree_encoding.unset_encoding %6 : tensor, matmul_narrow_N = 1 : index, user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>]>> -> tensor - %extracted_slice = tensor.extract_slice %7[0, 0] [32, 11008] [1, 1] : tensor to tensor<32x11008xi32> - %8 = hal.tensor.export %extracted_slice "output0" : tensor<32x11008xi32> -> !hal.buffer_view - func.return %8 : !hal.buffer_view + %7 = iree_encoding.set_encoding %padded_0 : tensor -> tensor, user_indexing_maps = [#map, #map1, #map2]>> + %8 = tensor.empty(%c32, %c128) : tensor, user_indexing_maps = [#map, #map1, #map2]>> + %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%7 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%8 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) { + ^bb0(%in: i8, %out: i32): + %17 = arith.extsi %in : i8 to i32 + linalg.yield %17 : i32 + } -> tensor, user_indexing_maps = [#map, #map1, #map2]>> + %10 = tensor.empty(%c32, %c11008) : tensor, user_indexing_maps = [#map, #map1, #map2]>> + %11 = linalg.fill ins(%c0_i32 : i32) outs(%10 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> + %12 = linalg.batch_matvec ins(%6, %9 : tensor, user_indexing_maps = 
[#map, #map1, #map2]>>, tensor, user_indexing_maps = [#map, #map1, #map2]>>) outs(%11 : tensor, user_indexing_maps = [#map, #map1, #map2]>>) -> tensor, user_indexing_maps = [#map, #map1, #map2]>> + %13 = iree_encoding.unset_encoding %12 : tensor, user_indexing_maps = [#map, #map1, #map2]>> -> tensor + %extracted_slice = tensor.extract_slice %13[0, 0] [32, 11008] [1, 1] : tensor to tensor<32x11008xi32> + return %extracted_slice : tensor<32x11008xi32> } +// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> +// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> // CHECK-LABEL: func.func @batch_matvec( +// CHECK-SAME: %[[LHS:.+]]: tensor<32x11008x128xi8>, %[[RHS:.+]]: tensor<32x128xi8>) -> tensor<32x11008xi32> +// CHECK-DAG: %[[C0_I32:.+]] = arith.constant 0 : i32 +// CHECK-DAG: %[[INIT_LHS_PACK:.+]] = tensor.empty() : tensor<32x688x64x16x2xi8> +// CHECK-DAG: %[[LHS_PACK:.+]] = tensor.pack %[[LHS]] outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 2] into %[[INIT_LHS_PACK]] : tensor<32x11008x128xi8> -> tensor<32x688x64x16x2xi8> +// CHECK-DAG: %[[INIT_LHS_EXT:.+]] = tensor.empty() : tensor<32x688x64x16x2xi32> +// CHECK: %[[LHS_EXT:.+]] = linalg.generic {indexing_maps = [#[[$MAP0]], #[[$MAP0]]], iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"]} ins(%[[LHS_PACK]] : tensor<32x688x64x16x2xi8>) outs(%[[INIT_LHS_EXT]] : tensor<32x688x64x16x2xi32>) { +// CHECK-NEXT: ^bb0(%[[LHS_EXT_ARG_IN:.+]]: i8, %[[LHS_EXT_ARG_OUT:.+]]: i32): +// CHECK-NEXT: %[[LHS_EXT_OP:.+]] = arith.extsi %[[LHS_EXT_ARG_IN]] : i8 to i32 +// CHECK-NEXT: linalg.yield %[[LHS_EXT_OP]] : i32 +// CHECK-DAG: %[[INIT_RHS_PACK:.+]] = tensor.empty() : tensor<32x64x2xi8> +// CHECK-DAG: %[[RHS_PACK:.+]] = tensor.pack %[[RHS]] outer_dims_perm = [0, 1] inner_dims_pos = [1] inner_tiles = [2] into %[[INIT_RHS_PACK]] : tensor<32x128xi8> -> tensor<32x64x2xi8> +// CHECK-DAG: %[[INIT_RHS_EXT:.+]] = tensor.empty() : tensor<32x64x2xi32> +// CHECK: %[[RHS_EXT:.+]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP1]]], iterator_types = ["parallel", "parallel", "parallel"]} ins(%[[RHS_PACK]] : tensor<32x64x2xi8>) outs(%[[INIT_RHS_EXT]] : tensor<32x64x2xi32>) { +// CHECK-NEXT: ^bb0(%[[RHS_EXT_ARG_IN:.+]]: i8, %[[RHS_EXT_ARG_OUT:.+]]: i32): +// CHECK-NEXT: %[[RHS_EXT_OP:.+]] = arith.extsi %[[RHS_EXT_ARG_IN]] : i8 to i32 +// CHECK-NEXT: linalg.yield %[[RHS_EXT_OP]] : i32 +// CHECK: %[[INIT_FILL:.+]] = tensor.empty() : tensor<32x688x16xi32> +// CHECK: %[[EXPAND_RHS:.+]] = tensor.expand_shape %[[RHS_EXT]] {{\[}}[0], [1, 2], [3, 4]] output_shape [32, 1, 64, 1, 2] : tensor<32x64x2xi32> into tensor<32x1x64x1x2xi32> +// CHECK: %[[EXPAND_INIT:.+]] = tensor.expand_shape %[[INIT_FILL:.+]] {{\[}}[0], [1, 2], [3, 4]] output_shape [32, 688, 1, 16, 1] : tensor<32x688x16xi32> into tensor<32x688x1x16x1xi32> +// CHECK: %[[FILL:.+]] = linalg.fill ins(%[[C0_I32]] : i32) outs(%[[EXPAND_INIT]] : tensor<32x688x1x16x1xi32>) -> tensor<32x688x1x16x1xi32> +// CHECK: %[[MMT4D:.+]] = linalg.batch_mmt4d ins(%[[LHS_EXT]], %[[EXPAND_RHS]] : tensor<32x688x64x16x2xi32>, tensor<32x1x64x1x2xi32>) outs(%[[FILL]] : tensor<32x688x1x16x1xi32>) -> tensor<32x688x1x16x1xi32> +// CHECK: %[[COLLAPSED:.+]] = tensor.collapse_shape %[[MMT4D]] {{\[}}[0], [1, 2], [3, 4]] : tensor<32x688x1x16x1xi32> into tensor<32x688x16xi32> +// CHECK: %[[INIT_UNPACK:.+]] = tensor.empty() : tensor<32x11008xi32> +// CHECK: %[[UNPACK:.+]] = tensor.unpack %[[COLLAPSED]] outer_dims_perm = [0, 1] inner_dims_pos = [1] 
inner_tiles = [16] into %[[INIT_UNPACK]] : tensor<32x688x16xi32> -> tensor<32x11008xi32> +// CHECK: return %[[UNPACK]] // ----- diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp index 83874316f868..6cea37ab912e 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.cpp @@ -9,8 +9,6 @@ #include "mlir/Dialect/Tensor/IR/Tensor.h" #include "mlir/Dialect/Utils/IndexingUtils.h" -#include - namespace mlir::iree_compiler { using IREE::Encoding::EncodingAttr; @@ -18,98 +16,23 @@ using IREE::Encoding::EncodingRole; using IREE::Encoding::getEncodingAttr; using IREE::Encoding::getEncodingContractionDims; -// If tensorType has the encoding of a matmul RESULT with narrow N, returns -// the transposed type. Otherwise, just returns tensorType. -static RankedTensorType transposeIfNarrowNResult(RankedTensorType tensorType) { - auto encoding = - llvm::dyn_cast_or_null(tensorType.getEncoding()); - if (!encoding) { - return tensorType; - } - if (!isNarrowNResult(encoding)) { - return tensorType; - } - auto newRole = encoding.getRole().getValue(); - TypeAttr originalTypeAttr = encoding.getOriginalType(); - RankedTensorType originalType = tensorType; - if (originalTypeAttr) { - originalType = - llvm::dyn_cast(originalTypeAttr.getValue()); - } - SmallVector newOriginalShape(originalType.getShape()); - auto userIndexingMaps = encoding.getUserIndexingMaps(); - SmallVector maps; - for (auto a : userIndexingMaps) { - maps.push_back(cast(a).getAffineMap()); - } - auto cDims = linalg::inferContractionDims(maps); - SmallVector newShape(tensorType.getShape()); - SmallVector permIndices(maps[0].getNumDims()); - std::iota(std::begin(permIndices), std::end(permIndices), 0); - // Matrix case: there are both M and N dimensions. Transposing means swapping - // them. - if (cDims->m.size() == 1 && cDims->n.size() == 1) { - int m = cDims->m[0]; - int n = cDims->n[0]; - std::swap(permIndices[m], permIndices[n]); - int mDim = encoding.mapDimToRoleIndex(m); - int nDim = encoding.mapDimToRoleIndex(n); - std::swap(newShape[mDim], newShape[nDim]); - std::swap(newOriginalShape[mDim], newOriginalShape[nDim]); - } - // Vector case: there is no N dimension to swap the M dimension with. We - // swap the maps themselves. 
- if (cDims->n.empty()) { - std::swap(maps[0], maps[1]); - } - - // auto newRoundDimsTo = encoding.getRoundDimsToArray(); - SmallVector newRoundDimsTo(encoding.getRoundDimsToArray()); - assert(newRoundDimsTo.size() == 0 || newRoundDimsTo.size() == 3); - if (newRoundDimsTo.size() != 0) - std::swap(newRoundDimsTo[0], newRoundDimsTo[1]); - - auto context = tensorType.getContext(); - AffineMap permutation = AffineMap::getPermutationMap(permIndices, context); - for (auto &map : maps) { - map = map.compose(permutation); - } - SmallVector newMaps; - for (auto map : maps) { - newMaps.push_back(AffineMapAttr::get(map)); - } - ArrayAttr newIndexingMaps = ArrayAttr::get(context, newMaps); - auto elemType = tensorType.getElementType(); - OpBuilder builder(context); - - auto newEncoding = IREE::Encoding::EncodingAttr::get( - context, IREE::Encoding::EncodingRoleAttr::get(context, newRole), - encoding.getElementTypes(), - TypeAttr::get(RankedTensorType::get(newOriginalShape, elemType)), - encoding.getMatmulNarrow_N(), encoding.getMatmulNarrow_M(), - newIndexingMaps, DenseI64ArrayAttr::get(context, newRoundDimsTo)); - return RankedTensorType::get(newShape, elemType, newEncoding); -} - /// For a given tensor type with an encoding, return the materialized /// type to use for it. If no encoding is set, then return the tensor type /// itself. static RankedTensorType getMaterializedType(RankedTensorType tensorType, MaterializeEncodingFn materializeEncodingFn) { - RankedTensorType maybeTransposedTensorType = - transposeIfNarrowNResult(tensorType); FailureOr materializeEncodingInfo = - materializeEncodingFn(maybeTransposedTensorType); + materializeEncodingFn(tensorType); if (failed(materializeEncodingInfo)) { return dropEncoding(tensorType); } - return cast(tensor::PackOp::inferPackedType( - getOriginalTypeWithEncoding(maybeTransposedTensorType) - .clone(tensorType.getElementType()), - materializeEncodingInfo->innerTileSizes, - materializeEncodingInfo->innerDimsPos, - materializeEncodingInfo->outerDimsPerm)); + return cast( + tensor::PackOp::inferPackedType(getOriginalTypeWithEncoding(tensorType) + .clone(tensorType.getElementType()), + materializeEncodingInfo->innerTileSizes, + materializeEncodingInfo->innerDimsPos, + materializeEncodingInfo->outerDimsPerm)); } MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( @@ -119,9 +42,10 @@ MaterializeEncodingTypeConverter::MaterializeEncodingTypeConverter( addConversion([](IndexType indexType) { return indexType; }); addConversion([](FloatType floatType) { return floatType; }); addConversion([](MemRefType memrefType) { return memrefType; }); - addConversion([=](RankedTensorType t) -> RankedTensorType { - return getMaterializedType(t, materializeEncodingFn); - }); + addConversion( + [materializeEncodingFn](RankedTensorType t) -> RankedTensorType { + return getMaterializedType(t, materializeEncodingFn); + }); } MaterializeEncodingConversionTarget::MaterializeEncodingConversionTarget( @@ -203,13 +127,4 @@ MaterializeEncodingInfo getEncodingInfoForMatmul(EncodingAttr encoding, return encodingInfo; } -bool isNarrowNResult(EncodingAttr encoding) { - if (encoding.getRole().getValue() != EncodingRole::RESULT) { - return false; - } - IntegerAttr narrowM = encoding.getMatmulNarrow_M(); - IntegerAttr narrowN = encoding.getMatmulNarrow_N(); - return narrowN && (!narrowM || narrowM.getInt() > narrowN.getInt()); -} - } // namespace mlir::iree_compiler diff --git a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h 
b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h index 42b4438f63a6..cf9e9a632bea 100644 --- a/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h +++ b/compiler/src/iree/compiler/Codegen/Common/EncodingUtils.h @@ -99,10 +99,6 @@ void populateMaterializeEncodingIntoPackUnPackPatterns( void populateMaterializeUpperBoundTileSizePatterns( RewritePatternSet &patterns, MaterializeEncodingFn materializeEncodingFn); -// Returns true if `encoding` represents a narrow-N matmul RESULT, e.g. the -// result of a matvec. -bool isNarrowNResult(IREE::Encoding::EncodingAttr encoding); - } // namespace mlir::iree_compiler #endif // IREE_COMPILER_SRC_IREE_COMPILER_CODEGEN_COMMON_ENCODINGUTILS_H_ diff --git a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp index 689e88c98db7..b6443d346ca4 100644 --- a/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/MaterializeEncodingIntoPackUnPack.cpp @@ -18,7 +18,6 @@ #include "llvm/ADT/SmallVectorExtras.h" #include "mlir/Dialect/Affine/IR/AffineOps.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h" #include "mlir/Dialect/MemRef/Transforms/Transforms.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -142,12 +141,11 @@ static Value createElementWiseExtUIOp(RewriterBase &rewriter, Value input, /// the canonical mmt4d input shape. If the input element type is unsigned, /// create a producer Linalg::GenericOp on the input that unsigned extends the /// input to the output element type. This extension is required to keep the -/// unsignedness information on the input for ukernels. If `transpose` is true, -/// the `linalgOp`'s indexing maps are transposed. -static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, - bool transpose, RewriterBase &rewriter, - SmallVectorImpl &ri, - ArrayRef elemTypes, int operandIdx) { +/// unsignedness information on the input for ukernels. +Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, + RewriterBase &rewriter, + SmallVectorImpl &ri, + ArrayRef elemTypes, int operandIdx) { assert(linalgOp.getNumDpsInputs() == 2); assert(linalgOp.getNumDpsInits() == 1); auto cDims = linalg::inferContractionDims(linalgOp); @@ -160,7 +158,7 @@ static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, auto type = cast(value.getType()); RankedTensorType newType = getExpandedType( type, /*isBatched=*/!cDims->batch.empty(), - /*isTransposed=*/operandIdx == 2 && (transpose ^ cDims->n.empty()), ri); + /*isTransposed=*/operandIdx == 2 && cDims->n.empty(), ri); expandedValue = rewriter.create(loc, newType, value, ri); } @@ -171,22 +169,6 @@ static Value getMmt4dOperand(Value value, linalg::LinalgOp linalgOp, return expandedValue; } -static void transposeInPlace(MaterializeEncodingInfo &info) { - // Vector cases: nothing to do. - if (info.innerTileSizes.size() < 2) { - return; - } - // Not a vector case, so all three arrays in `info` have size at least 2, - // outerDimsPerm may have size 3 if there is a batch dimension, but in all - // cases, the last 2 entries of each array are M and N, not batch. 
- auto transpose = [](SmallVector &a) { - std::swap(a[a.size() - 2], a[a.size() - 1]); - }; - transpose(info.innerDimsPos); - transpose(info.innerTileSizes); - transpose(info.outerDimsPerm); -} - //===---------------------------------------------------------------------===// // Methods to convert `set_encoding` and `unset_encoding` operations // to `pack` and `unpack` operations respectively. @@ -218,18 +200,11 @@ static FailureOr lowerSetEncodingOpToPackOp( MaterializeEncodingFn materializeEncodingFn, MaterializeEncodingValueFn materializeEncodingValueFn) { RankedTensorType resultType = encodingOp.getResultType(); - auto encoding = getEncodingAttr(resultType); - if (!encoding) { - return failure(); - } FailureOr materializeEncodingInfo = materializeEncodingFn(resultType); if (failed(materializeEncodingInfo)) { return rewriter.notifyMatchFailure(encodingOp, "unhandled result encoding"); } - if (isNarrowNResult(encoding)) { - transposeInPlace(*materializeEncodingInfo); - } // Create `tensor.empty` operation for the result of the pack operation. Location loc = encodingOp.getLoc(); FailureOr> innerTileSizesOfr = @@ -239,6 +214,7 @@ static FailureOr lowerSetEncodingOpToPackOp( return rewriter.notifyMatchFailure( encodingOp, "failed to generate runtime tile size query"); } + auto encoding = getEncodingAttr(resultType); if (!encoding) { return failure(); } @@ -275,9 +251,6 @@ static FailureOr lowerUnsetEncodingToUnpackOp( if (failed(materializeEncodingInfo)) { return rewriter.notifyMatchFailure(encodingOp, "unhandled source encoding"); } - if (isNarrowNResult(getEncodingAttr(sourceType))) { - transposeInPlace(*materializeEncodingInfo); - } // Create an `tensor.empty` for the result of the unpack operation. Location loc = encodingOp.getLoc(); SmallVector resultDims = @@ -366,22 +339,22 @@ lowerContractionOpWithEncoding(RewriterBase &rewriter, operands.take_front(inputs.size()), operands.drop_front(inputs.size())); } else { - bool transpose = isNarrowNResult(resultEncoding); auto elemTypes = llvm::map_to_vector( lhsEncoding.getElementTypes().getValue(), [](Attribute a) { return cast(a).getValue(); }); SmallVector ri; - Value newLhs = getMmt4dOperand(operands[0], linalgOp, transpose, rewriter, - ri, elemTypes, /*operandIdx=*/0); - Value newRhs = getMmt4dOperand(operands[1], linalgOp, transpose, rewriter, - ri, elemTypes, /*operandIdx=*/1); + Value newLhs = + getMmt4dOperand(operands[0], linalgOp, rewriter, ri, elemTypes, + /*operandIdx=*/0); + Value newRhs = + getMmt4dOperand(operands[1], linalgOp, rewriter, ri, elemTypes, + /*operandIdx=*/1); Value newResult = - getMmt4dOperand(operands[2], linalgOp, transpose, rewriter, ri, - elemTypes, /*operandIdx=*/2); - if (transpose) { - std::swap(newLhs, newRhs); - } + getMmt4dOperand(operands[2], linalgOp, rewriter, ri, elemTypes, + /*operandIdx=*/2); + Type newResultType = newResult.getType(); + auto cDims = IREE::Encoding::getEncodingContractionDims(lhsEncoding); if (cDims->batch.empty()) { result = rewriter.create( @@ -418,9 +391,7 @@ lowerOpWithEncoding(RewriterBase &rewriter, tensor::EmptyOp emptyOp, loc, emptyOp.getMixedSizes(), resultType.getElementType()); return newEmptyOp; } - if (isNarrowNResult(getEncodingAttr(emptyType))) { - transposeInPlace(*materializeEncodingInfo); - } + FailureOr> innerTileSizesOfr = getInnerTileSizesOfr(rewriter, loc, resultType, *materializeEncodingInfo, materializeEncodingValueFn); @@ -436,6 +407,7 @@ lowerOpWithEncoding(RewriterBase &rewriter, tensor::EmptyOp emptyOp, materializeEncodingInfo->outerDimsPerm); Operation 
*newEmptyOp = rewriter.create( loc, newShape, resultType.getElementType()); + return newEmptyOp; } @@ -527,9 +499,6 @@ static FailureOr> getPackedDimsForDispatchTensor( if (failed(encodingInfo)) { return failure(); } - if (isNarrowNResult(getEncodingAttr(boundTensorType))) { - transposeInPlace(*encodingInfo); - } SmallVector targetShape = getMixedValues(originalTensorType.getShape(), dynamicDims, builder); @@ -741,10 +710,10 @@ struct SetEncodingOpToPackOpConversion LogicalResult matchAndRewrite(SetEncodingOp encodingOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto converter = static_cast( - getTypeConverter()); MaterializeEncodingFn materializeEncodingFn = - converter->getMaterializeEncodingFn(); + static_cast( + getTypeConverter()) + ->getMaterializeEncodingFn(); auto packOp = lowerSetEncodingOpToPackOp( rewriter, encodingOp, adaptor.getSource(), materializeEncodingFn, this->materializeEncodingValueFn); @@ -773,10 +742,10 @@ struct UnsetEncodingOpToUnPackOpConversion LogicalResult matchAndRewrite(UnsetEncodingOp encodingOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto converter = static_cast( - this->getTypeConverter()); MaterializeEncodingFn materializeEncodingFn = - converter->getMaterializeEncodingFn(); + static_cast( + this->getTypeConverter()) + ->getMaterializeEncodingFn(); auto unpackOp = lowerUnsetEncodingToUnpackOp( rewriter, encodingOp, adaptor.getSource(), materializeEncodingFn, this->materializeEncodingValueFn); @@ -833,10 +802,10 @@ struct MaterializeDPSOperation : public OpMaterializeEncodingPattern { LogicalResult matchAndRewrite(OpTy dpsOp, typename OpTy::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto converter = static_cast( - this->getTypeConverter()); MaterializeEncodingFn materializeEncodingFn = - converter->getMaterializeEncodingFn(); + static_cast( + this->getTypeConverter()) + ->getMaterializeEncodingFn(); FailureOr convertedOp = lowerOpWithEncoding( rewriter, dpsOp, adaptor.getInputs(), adaptor.getOutputs(), materializeEncodingFn, this->materializeEncodingValueFn); @@ -856,10 +825,10 @@ struct MaterializeOperation : public OpMaterializeEncodingPattern { LogicalResult matchAndRewrite(OpTy op, typename OpTy::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { - auto converter = static_cast( - this->getTypeConverter()); MaterializeEncodingFn materializeEncodingFn = - converter->getMaterializeEncodingFn(); + static_cast( + this->getTypeConverter()) + ->getMaterializeEncodingFn(); FailureOr convertedOp = lowerOpWithEncoding( rewriter, op, adaptor.getOperands(), materializeEncodingFn, this->materializeEncodingValueFn); @@ -899,10 +868,10 @@ class MaterializeContractionOp : public OpInterfaceConversionPattern< matchAndRewrite(mlir::linalg::ContractionOpInterface op, ArrayRef operands, ConversionPatternRewriter &rewriter) const override { - auto converter = static_cast( - this->getTypeConverter()); MaterializeEncodingFn materializeEncodingFn = - converter->getMaterializeEncodingFn(); + static_cast( + this->getTypeConverter()) + ->getMaterializeEncodingFn(); auto linalgOp = dyn_cast(op.getOperation()); if (!linalgOp || operands.size() != 3) { return failure(); diff --git a/tests/e2e/linalg/BUILD.bazel b/tests/e2e/linalg/BUILD.bazel index 791fd2879b9d..73645d2a5158 100644 --- a/tests/e2e/linalg/BUILD.bazel +++ b/tests/e2e/linalg/BUILD.bazel @@ -23,7 +23,6 @@ LLVM_SRCS = enforce_glob( [ "conv2d.mlir", "fp_to_subbyte.mlir", - "narrow_n_matmuls.mlir", 
"subbyte_to_fp.mlir", ], include = ["*.mlir"], @@ -46,7 +45,6 @@ iree_check_single_backend_test_suite( VMVX_SRCS = enforce_glob( [ "conv2d.mlir", - "narrow_n_matmuls.mlir", ], include = ["*.mlir"], exclude = [ @@ -67,7 +65,6 @@ VULKAN_SRCS = enforce_glob( [ "conv2d.mlir", "subbyte_to_fp.mlir", - "narrow_n_matmuls.mlir", ], include = ["*.mlir"], exclude = [ @@ -115,7 +112,6 @@ CUDA_SRCS = enforce_glob( "subbyte_to_fp.mlir", # currently only enabled on cuda as it can be slow on other backends. "large_linalg_matmul.mlir", - "narrow_n_matmuls.mlir", ], include = ["*.mlir"], exclude = [ diff --git a/tests/e2e/linalg/CMakeLists.txt b/tests/e2e/linalg/CMakeLists.txt index 9794387a9691..fdd9c04f718e 100644 --- a/tests/e2e/linalg/CMakeLists.txt +++ b/tests/e2e/linalg/CMakeLists.txt @@ -16,7 +16,6 @@ iree_check_single_backend_test_suite( SRCS "conv2d.mlir" "fp_to_subbyte.mlir" - "narrow_n_matmuls.mlir" "subbyte_to_fp.mlir" TARGET_BACKEND "llvm-cpu" @@ -31,7 +30,6 @@ iree_check_single_backend_test_suite( check_vmvx_local-task SRCS "conv2d.mlir" - "narrow_n_matmuls.mlir" TARGET_BACKEND "vmvx" DRIVER @@ -43,7 +41,6 @@ iree_check_single_backend_test_suite( check_vulkan-spirv_vulkan SRCS "conv2d.mlir" - "narrow_n_matmuls.mlir" "subbyte_to_fp.mlir" TARGET_BACKEND "vulkan-spirv" @@ -84,7 +81,6 @@ iree_check_single_backend_test_suite( "conv2d.mlir" "fp_to_subbyte.mlir" "large_linalg_matmul.mlir" - "narrow_n_matmuls.mlir" "subbyte_to_fp.mlir" TARGET_BACKEND "cuda" diff --git a/tests/e2e/linalg/narrow_n_matmuls.mlir b/tests/e2e/linalg/narrow_n_matmuls.mlir deleted file mode 100644 index 578d7f72a632..000000000000 --- a/tests/e2e/linalg/narrow_n_matmuls.mlir +++ /dev/null @@ -1,126 +0,0 @@ -// Test various forms of matmuls with narrow N, in particual matvec/batch_matvec -// (implicitly N=1) and matmuls with N=1 and N=2. -// -// The reason why this needs extensive e2e testing is the transposition of -// narrow N to narrow M in data tiling (around CPUMaterializeEncodingPass). -// It doesn't hurt to enable this case on all backends though. 
- -func.func @matvec() { - %lhs = util.unfoldable_constant dense<[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ]> : tensor<3x4xi8> - %rhs = util.unfoldable_constant dense<[-2, 3, 4, -1]> : tensor<4xi8> - %acc = util.unfoldable_constant dense<[1, 2, 3]> : tensor<3xi32> - %result = linalg.matvec ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4xi8>) outs(%acc : tensor<3xi32>) -> tensor<3xi32> - check.expect_eq_const(%result, dense< - [0, 7, -17] - > : tensor<3xi32>) : tensor<3xi32> - return -} - -func.func @batch_matvec() { - %lhs = util.unfoldable_constant dense<[[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ], [ - [-3, 1, 4, 2], - [-1, 0, 6, -1], - [1, -2, 3, -4] - ]]> : tensor<2x3x4xi8> - %rhs = util.unfoldable_constant dense<[ - [-2, 3, 4, -1], - [1, 2, -5, 3] - ]> : tensor<2x4xi8> - %acc = util.unfoldable_constant dense<[[1, 2, 3], [4, 5, 6]]> : tensor<2x3xi32> - %result = linalg.batch_matvec ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4xi8>) outs(%acc : tensor<2x3xi32>) -> tensor<2x3xi32> - check.expect_eq_const(%result, dense<[ - [0, 7, -17], - [-11, -29, -24] - ]> : tensor<2x3xi32>) : tensor<2x3xi32> - return -} - -func.func @matmul_narrow_n_1() { - %lhs = util.unfoldable_constant dense<[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ]> : tensor<3x4xi8> - %rhs = util.unfoldable_constant dense<[[-2], [3], [4], [-1]]> : tensor<4x1xi8> - %acc = util.unfoldable_constant dense<[[1], [2], [3]]> : tensor<3x1xi32> - %result = linalg.matmul ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4x1xi8>) outs(%acc : tensor<3x1xi32>) -> tensor<3x1xi32> - check.expect_eq_const(%result, dense< - [[0], [7], [-17]] - > : tensor<3x1xi32>) : tensor<3x1xi32> - return -} - -func.func @batch_matmul_narrow_n_1() { - %lhs = util.unfoldable_constant dense<[[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ], [ - [-3, 1, 4, 2], - [-1, 0, 6, -1], - [1, -2, 3, -4] - ]]> : tensor<2x3x4xi8> - %rhs = util.unfoldable_constant dense<[ - [[-2], [3], [4], [-1]], - [[1], [2], [-5], [3]] - ]> : tensor<2x4x1xi8> - %acc = util.unfoldable_constant dense<[ - [[1], [2], [3]], - [[4], [5], [6]] - ]> : tensor<2x3x1xi32> - %result = linalg.batch_matmul ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4x1xi8>) outs(%acc : tensor<2x3x1xi32>) -> tensor<2x3x1xi32> - check.expect_eq_const(%result, dense<[ - [[0], [7], [-17]], - [[-11], [-29], [-24]] - ]> : tensor<2x3x1xi32>) : tensor<2x3x1xi32> - return -} - -func.func @matmul_narrow_n_2() { - %lhs = util.unfoldable_constant dense<[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ]> : tensor<3x4xi8> - %rhs = util.unfoldable_constant dense<[[-2, 1], [3, -1], [4, 0], [-1, 2]]> : tensor<4x2xi8> - %acc = util.unfoldable_constant dense<[[1, -1], [2, 0], [3, 1]]> : tensor<3x2xi32> - %result = linalg.matmul ins(%lhs, %rhs : tensor<3x4xi8>, tensor<4x2xi8>) outs(%acc : tensor<3x2xi32>) -> tensor<3x2xi32> - check.expect_eq_const(%result, dense< - [[0, 8], [7, -7], [-17, 0]] - > : tensor<3x2xi32>) : tensor<3x2xi32> - return -} - -func.func @batch_matmul_narrow_n_2() { - %lhs = util.unfoldable_constant dense<[[ - [1, 2, 0, 5], - [3, 4, -1, -3], - [5, 6, -7, 0] - ], [ - [-3, 1, 4, 2], - [-1, 0, 6, -1], - [1, -2, 3, -4] - ]]> : tensor<2x3x4xi8> - %rhs = util.unfoldable_constant dense<[ - [[-2, 0], [3, 1], [4, -1], [-1, 2]], - [[1, -2], [2, 3], [-5, -3], [3, 0]] - ]> : tensor<2x4x2xi8> - %acc = util.unfoldable_constant dense<[ - [[1, -1], [2, 0], [3, 1]], - [[4, 2], [5, 1], [6, -1]] - ]> : tensor<2x3x2xi32> - %result = linalg.batch_matmul ins(%lhs, %rhs : tensor<2x3x4xi8>, tensor<2x4x2xi8>) 
outs(%acc : tensor<2x3x2xi32>) -> tensor<2x3x2xi32> - check.expect_eq_const(%result, dense<[ - [[0, 11], [7, -1], [-17, 14]], - [[-11, -1], [-29, -15], [-24, -18]] - ]> : tensor<2x3x2xi32>) : tensor<2x3x2xi32> - return -}
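
For reference, the comment restored in CPUMaterializeEncodingPass.cpp and the deleted narrow_n_matmuls.mlir test both describe reducing a narrow-N matmul to a narrow-M one by transposing the whole matmul and swapping LHS<->RHS, which is the transformation the reverted change (#17446) performed around encoding materialization. Below is a minimal MLIR sketch of that equivalence; it is not part of this patch, and the function names and static shapes are assumptions chosen only for illustration.

// Narrow-N form: the result tensor<3x1xf32> has N = 1.
func.func @narrow_n_form(%lhs: tensor<3x4xf32>, %rhs: tensor<4x1xf32>,
                         %acc: tensor<3x1xf32>) -> tensor<3x1xf32> {
  %0 = linalg.matmul ins(%lhs, %rhs : tensor<3x4xf32>, tensor<4x1xf32>)
                     outs(%acc : tensor<3x1xf32>) -> tensor<3x1xf32>
  return %0 : tensor<3x1xf32>
}

// Narrow-M form of the same computation: since (A * B)^T = B^T * A^T,
// transposing both operands and swapping them yields the transposed result
// tensor<1x3xf32>, whose narrow dimension is now M = 1. The accumulator is
// likewise taken in transposed layout.
func.func @narrow_m_form(%lhs: tensor<3x4xf32>, %rhs: tensor<4x1xf32>,
                         %acc_t: tensor<1x3xf32>) -> tensor<1x3xf32> {
  %lhs_init = tensor.empty() : tensor<4x3xf32>
  %lhs_t = linalg.transpose ins(%lhs : tensor<3x4xf32>)
                            outs(%lhs_init : tensor<4x3xf32>) permutation = [1, 0]
  %rhs_init = tensor.empty() : tensor<1x4xf32>
  %rhs_t = linalg.transpose ins(%rhs : tensor<4x1xf32>)
                            outs(%rhs_init : tensor<1x4xf32>) permutation = [1, 0]
  %0 = linalg.matmul ins(%rhs_t, %lhs_t : tensor<1x4xf32>, tensor<4x3xf32>)
                     outs(%acc_t : tensor<1x3xf32>) -> tensor<1x3xf32>
  return %0 : tensor<1x3xf32>
}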