[LLVMGPU] Support i8 MFMA intrinsics in GPUTileAndFuse pipeline (iree-org#18104)

This PR simply adds `MFMA_I8_16x16x32_I32` to the list of allowed enums
in `MMAAttr::populateOperandOffsetsSizesStrides`. This enables the logic
to pack and lower a contraction to this intrinsic with the
`GPUTileAndFuse` pipeline. The layout for `MFMA_I8_16x16x32_I32` is very
similar to the layout for `MFMA_F16_16x16x16_F32`, which is already
supported, so the pipeline can already handle the i8 intrinsic case.
Numerical correctness has been verified on an MI300 card for a
256x256x256 matmul.

Signed-off-by: Max Dawkins <max.dawkins@gmail.com>
Max191 authored Aug 6, 2024
1 parent 3a29039 commit 1c50edd
Showing 2 changed files with 68 additions and 3 deletions.
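
As context for the diff below, here is a minimal sketch of the kind of input this change targets: an i8 x i8 -> i32 contraction that `GPUTileAndFuse` can now pack and lower through `MFMA_I8_16x16x32_I32`. The function name is illustrative, the shape matches the 256x256x256 verification run mentioned above, and the example assumes the standard sign-extending `linalg.matmul` form; whether the intrinsic is actually selected depends on the target and lowering configuration.

// Illustrative i8 matmul (not part of this change). With this PR, the
// GPUTileAndFuse pipeline can pack this contraction and lower it to the
// MFMA_I8_16x16x32_I32 intrinsic on MI300-class targets.
func.func @matmul_i8_example(%lhs: tensor<256x256xi8>, %rhs: tensor<256x256xi8>,
                             %acc: tensor<256x256xi32>) -> tensor<256x256xi32> {
  // linalg.matmul sign-extends the i8 operands before the i32 multiply-accumulate.
  %0 = linalg.matmul ins(%lhs, %rhs : tensor<256x256xi8>, tensor<256x256xi8>)
                     outs(%acc : tensor<256x256xi32>) -> tensor<256x256xi32>
  return %0 : tensor<256x256xi32>
}

The new test added below checks the resulting per-lane distribution: each lane reads a 1x8 i8 slice of each operand and updates a 4x1 i32 slice of the accumulator.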
@@ -714,7 +714,11 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
     Value laneId, ArrayRef<int64_t> permutation,
     SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
     SmallVector<OpFoldResult> &strides) const {
-  if (getIntrinsic().getValue() != MMAIntrinsic::MFMA_F16_16x16x16_F32) {
+  switch (getIntrinsic().getValue()) {
+  case MMAIntrinsic::MFMA_F16_16x16x16_F32:
+  case MMAIntrinsic::MFMA_I8_16x16x32_I32:
+    break;
+  default:
     return failure();
   }

@@ -5,7 +5,7 @@
   affine_map<(i, j, k) -> (k, j)>,
   affine_map<(i, j, k) -> (i, j)>
 ]
-func.func @distribute_multi_mma_16x16x16(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
+func.func @distribute_multi_mma_F16_16x16x16_F32(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
   %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
     indexing_maps = #contraction_accesses,
     iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
@@ -30,7 +30,7 @@ module attributes { transform.with_named_sequence } {

 // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
 // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
-// CHECK-LABEL: func @distribute_multi_mma_16x16x16
+// CHECK-LABEL: func @distribute_multi_mma_F16_16x16x16_F32
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32>
@@ -49,3 +49,64 @@ module attributes { transform.with_named_sequence } {
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]]
 // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xf32> into tensor<2x2x16x16xf32>
 // CHECK: mapping = [#iree_gpu.lane_id<0>]
+
+// -----
+
+#contraction_accesses = [
+  affine_map<(i, j, k) -> (i, k)>,
+  affine_map<(i, j, k) -> (j, k)>,
+  affine_map<(i, j, k) -> (i, j)>
+]
+func.func @distribute_multi_mma_I8_16x16x32_I32(%lhs: tensor<2x2x16x32xi8>, %rhs: tensor<2x2x16x32xi8>, %acc: tensor<2x2x16x16xi32>) -> tensor<2x2x16x16xi32> {
+  %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
+    kind = #iree_gpu.mma_layout<MFMA_I8_16x16x32_I32>,
+    rhs_permutation = array<i64: 1, 0>
+  } : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32>
+  return %0 : tensor<2x2x16x16xi32>
+}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %multi_mma = transform.structured.match ops{["iree_gpu.multi_mma"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.iree.distribute_multi_mma %multi_mma : (!transform.any_op) -> !transform.any_op
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.apply_cse to %func : !transform.any_op
+
+    transform.yield
+  }
+}
+#map = affine_map<(d0) -> (d0 mod 16)>
+#map1 = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+#map2 = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+// CHECK-LABEL: func @distribute_multi_mma_I8_16x16x32_I32
+// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32>
+// CHECK: scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xi32>)
+// CHECK: %[[ID:.+]] = affine.apply #[[$MAP]](%[[LANE_ID]])
+// CHECK: %[[ID1:.+]] = affine.apply #[[$MAP1]](%[[LANE_ID]])
+// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[ID2:.+]] = affine.apply #[[$MAP2]](%[[LANE_ID]])
+// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xi32> to tensor<2x2x4x1xi32>
+// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
+// CHECK-SAME: : tensor<2x2x1x8xi8>, tensor<2x2x1x8xi8> into tensor<2x2x4x1xi32>
+// CHECK: scf.forall.in_parallel
+// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xi32> into tensor<2x2x16x16xi32>
+// CHECK: mapping = [#iree_gpu.lane_id<0>]
