diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 0cfe17e3f96d..c5f99c0ff031 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -714,7 +714,11 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
     Value laneId, ArrayRef<int64_t> permutation,
     SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
     SmallVector<OpFoldResult> &strides) const {
-  if (getIntrinsic().getValue() != MMAIntrinsic::MFMA_F16_16x16x16_F32) {
+  switch (getIntrinsic().getValue()) {
+  case MMAIntrinsic::MFMA_F16_16x16x16_F32:
+  case MMAIntrinsic::MFMA_I8_16x16x32_I32:
+    break;
+  default:
     return failure();
   }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir
index a723155b8bf7..ef8ca4b58e1d 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/distribute_multi_mma.mlir
@@ -5,7 +5,7 @@
  affine_map<(i, j, k) -> (k, j)>,
  affine_map<(i, j, k) -> (i, j)>
 ]
-func.func @distribute_multi_mma_16x16x16(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
+func.func @distribute_multi_mma_F16_16x16x16_F32(%lhs: tensor<2x2x16x16xf16>, %rhs: tensor<2x2x16x16xf16>, %acc: tensor<2x2x16x16xf32>) -> tensor<2x2x16x16xf32> {
   %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
     indexing_maps = #contraction_accesses,
     iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
@@ -30,7 +30,7 @@ module attributes { transform.with_named_sequence } {
 
 // CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
 // CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
-// CHECK-LABEL: func @distribute_multi_mma_16x16x16
+// CHECK-LABEL: func @distribute_multi_mma_F16_16x16x16_F32
 // CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x16xf16>
 // CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xf32>
@@ -49,3 +49,64 @@ module attributes { transform.with_named_sequence } {
 // CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID1]], %[[ID]]]
 // CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xf32> into tensor<2x2x16x16xf32>
 // CHECK: mapping = [#iree_gpu.lane_id<0>]
+
+// -----
+
+#contraction_accesses = [
+ affine_map<(i, j, k) -> (i, k)>,
+ affine_map<(i, j, k) -> (j, k)>,
+ affine_map<(i, j, k) -> (i, j)>
+]
+func.func @distribute_multi_mma_I8_16x16x32_I32(%lhs: tensor<2x2x16x32xi8>, %rhs: tensor<2x2x16x32xi8>, %acc: tensor<2x2x16x16xi32>) -> tensor<2x2x16x16xi32> {
+  %0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
+    indexing_maps = #contraction_accesses,
+    iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
+    kind = #iree_gpu.mma_layout<MFMA_I8_16x16x32_I32>,
+    rhs_permutation = array<i64: 1, 0>
+  } : tensor<2x2x16x32xi8>, tensor<2x2x16x32xi8> into tensor<2x2x16x16xi32>
+  return %0 : tensor<2x2x16x16xi32>
+}
+
+module attributes { transform.with_named_sequence } {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
+    %multi_mma = transform.structured.match ops{["iree_gpu.multi_mma"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.iree.distribute_multi_mma %multi_mma : (!transform.any_op) -> !transform.any_op
+    %func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %func {
+      transform.apply_patterns.canonicalization
+    } : !transform.any_op
+    transform.apply_cse to %func : !transform.any_op
+
+    transform.yield
+  }
+}
+#map = affine_map<(d0) -> (d0 mod 16)>
+#map1 = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+#map2 = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map4 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
+
+// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0) -> (d0 mod 16)>
+// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 8 - ((d0 floordiv 16) floordiv 4) * 32)>
+// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0) -> ((d0 floordiv 16) * 4 - ((d0 floordiv 16) floordiv 4) * 16)>
+// CHECK-LABEL: func @distribute_multi_mma_I8_16x16x32_I32
+// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<2x2x16x32xi8>
+// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: tensor<2x2x16x16xi32>
+// CHECK: scf.forall (%[[LANE_ID:.+]]) in (64) shared_outs(%[[ITER_ARG:.+]] = %[[ACC]]) -> (tensor<2x2x16x16xi32>)
+// CHECK: %[[ID:.+]] = affine.apply #[[$MAP]](%[[LANE_ID]])
+// CHECK: %[[ID1:.+]] = affine.apply #[[$MAP1]](%[[LANE_ID]])
+// CHECK: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]], %[[ID1]]]
+// CHECK-SAME: [2, 2, 1, 8] [1, 1, 1, 1] : tensor<2x2x16x32xi8> to tensor<2x2x1x8xi8>
+// CHECK: %[[ID2:.+]] = affine.apply #[[$MAP2]](%[[LANE_ID]])
+// CHECK: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x16x16xi32> to tensor<2x2x4x1xi32>
+// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
+// CHECK-SAME: : tensor<2x2x1x8xi8>, tensor<2x2x1x8xi8> into tensor<2x2x4x1xi32>
+// CHECK: scf.forall.in_parallel
+// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ITER_ARG]][0, 0, %[[ID2]], %[[ID]]]
+// CHECK-SAME: [2, 2, 4, 1] [1, 1, 1, 1] : tensor<2x2x4x1xi32> into tensor<2x2x16x16xi32>
+// CHECK: mapping = [#iree_gpu.lane_id<0>]