[LLVMGPU] Support i8 MFMA intrinsics in GPUTileAndFuse pipeline (#18104)
This PR simply adds `MFMA_I8_16x16x32_I32` to the list of allowed enums
in `MMAAttr::populateOperandOffsetsSizesStrides`. This enables the logic
that packs and lowers a contraction to this intrinsic with the
`GPUTileAndFuse` pipeline. The layout for `MFMA_I8_16x16x32_I32` is very
similar to the layout for `MFMA_F16_16x16x16_F32`, which is already
supported, so the pipeline needs no further changes to handle the i8
intrinsic. Numerical correctness has been verified on an MI300 card for
a 256x256x256 matmul.
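
For reference, a minimal sketch of the kind of contraction this enables
(the function name and the 256x256x256 shape are illustrative, taken from
the verification run; this snippet is not part of the commit):

```mlir
// An i8 x i8 -> i32 matmul; linalg.matmul sign-extends the i8 operands
// before accumulating into i32, and GPUTileAndFuse can now lower the
// contraction to MFMA_I8_16x16x32_I32.
func.func @matmul_256x256x256_i8(%lhs: tensor<256x256xi8>, %rhs: tensor<256x256xi8>,
                                 %acc: tensor<256x256xi32>) -> tensor<256x256xi32> {
  %0 = linalg.matmul ins(%lhs, %rhs : tensor<256x256xi8>, tensor<256x256xi8>)
                     outs(%acc : tensor<256x256xi32>) -> tensor<256x256xi32>
  return %0 : tensor<256x256xi32>
}
```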

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
Max191 authored Aug 6, 2024
1 parent 3a29039 commit 1c50edd
Showing 2 changed files with 68 additions and 3 deletions.
@@ -714,7 +714,11 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
     Value laneId, ArrayRef<int64_t> permutation,
     SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
     SmallVector<OpFoldResult> &strides) const {
-  if (getIntrinsic().getValue() != MMAIntrinsic::MFMA_F16_16x16x16_F32) {
+  switch (getIntrinsic().getValue()) {
+  case MMAIntrinsic::MFMA_F16_16x16x16_F32:
+  case MMAIntrinsic::MFMA_I8_16x16x32_I32:
+    break;
+  default:
     return failure();
   }

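The layout similarity called out in the commit message is visible in the
distribution maps checked by the tests below: both intrinsics send lane l to
row l mod 16, and only the per-lane K slice changes (4 f16 elements for
K = 16 vs. 8 i8 elements for K = 32). A side-by-side paraphrase of those
maps (the alias names are illustrative, not from the commit):

```mlir
// Row index of the per-lane LHS/RHS slice, shared by both intrinsics.
#row   = affine_map<(lane) -> (lane mod 16)>
// K offset for MFMA_F16_16x16x16_F32: 4 f16 elements per lane, K = 16.
#k_f16 = affine_map<(lane) -> ((lane floordiv 16) * 4 - ((lane floordiv 16) floordiv 4) * 16)>
// K offset for MFMA_I8_16x16x32_I32: 8 i8 elements per lane, K = 32.
#k_i8  = affine_map<(lane) -> ((lane floordiv 16) * 8 - ((lane floordiv 16) floordiv 4) * 32)>
```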
@@ -5,7 +5,7 @@
   affine_map<(i, j, k) -> (k, j)>,
   affine_map<(i, j, k) -> (i, j)>
 ]
-func.func @distribute_multi_mma_16x16x16(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
+func.func @distribute_multi_mma_F16_16x16x16_F32(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
   %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
     indexing_maps = #contraction_accesses,
     iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
@@ -30,7 +30,7 @@ module attributes { transform.with_named_sequence } {
 
 // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
 // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
-// CHECK-LABEL: func @distribute_multi_mma_16x16x16
+// CHECK-LABEL: func @distribute_multi_mma_F16_16x16x16_F32
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32>
@@ -49,3 +49,64 @@ module attributes { transform.with_named_sequence } {
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]]
 // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xf32> into tensor<2x2x16x16xf32>
 // CHECK: mapping = [#iree_gpu.lane_id<0>]
+
+// -----
+
+#contraction_accesses = [
+  affine_map<(i, j, k) -> (i, k)>,
+  affine_map<(i, j, k) -> (j, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+func.func @distribute_multi_mma_I8_16x16x32_I32(%lhs: tensor<2x2x16x32xi8>, %rhs: tensor<2x2x16x32xi8>, %acc: tensor<2x2x16x16xi32>) -> tensor<2x2x16x16xi32> {
+  %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
+    kind = #iree_gpu.mma_layout<MFMA_I8_16x16x32_I32>,
+    rhs_permutation = array<i64: 1, 0>
+  } : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32>
+  return %0 : tensor<2x2x16x16xi32>
+}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %multi_mma = transform.structured.match ops{["iree_gpu.multi_mma"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.iree.distribute_multi_mma %multi_mma : (!transform.any_op) -> !transform.any_op
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.apply_cse to %func : !transform.any_op
+
+    transform.yield
+  }
+}
+#map = affine_map<(d0) -> (d0 mod 16)>
+#map1 = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+#map2 = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+// CHECK-LABEL: func @distribute_multi_mma_I8_16x16x32_I32
+// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32>
+// CHECK: scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xi32>)
+// CHECK: %[[ID:.+]] = affine.apply #[[$MAP]](%[[LANE_ID]])
+// CHECK: %[[ID1:.+]] = affine.apply #[[$MAP1]](%[[LANE_ID]])
+// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[ID2:.+]] = affine.apply #[[$MAP2]](%[[LANE_ID]])
+// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xi32> to tensor<2x2x4x1xi32>
+// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
+// CHECK-SAME: : tensor<2x2x1x8xi8>, tensor<2x2x1x8xi8> into tensor<2x2x4x1xi32>
+// CHECK: scf.forall.in_parallel
+// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xi32> into tensor<2x2x16x16xi32>
+// CHECK: mapping = [#iree_gpu.lane_id<0>]
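
As a sanity check on the new CHECK lines (the arithmetic below is editorial,
not part of the commit), evaluating the three maps for one concrete lane
reproduces the slice shapes in the test:

```mlir
// Worked example for lane 17 of the 64-lane subgroup:
//   #map(17)  = 17 mod 16                                               = 1  (row)
//   #map1(17) = (17 floordiv 16) * 8 - ((17 floordiv 16) floordiv 4) * 32 = 8  (i8 K offset)
//   #map2(17) = (17 floordiv 16) * 4 - ((17 floordiv 16) floordiv 4) * 16 = 4  (i32 M offset)
// Lane 17 reads the 2x2x1x8xi8 LHS/RHS slices at row 1, K 8..15, and
// reads/writes the 2x2x4x1xi32 accumulator slice at rows 4..7, column 1.
```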
