From 49a8ee7f28990e68e5379d28da78222a2fd3113a Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Sun, 22 Sep 2024 00:59:41 -0400
Subject: [PATCH] [Codegen][GPU] Make operand promotion controlled by lowering
 config

Promoting the operands of a matmul is optional and is best controlled
through the lowering config rather than through on-the-fly analysis. This
gives greater flexibility for adding support for other operations as well
(e.g. promoting another kind of contraction or a convolution-like op)
without having to extend this pass every time.
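
For example, promotion of both matmul operands is requested by listing
their indices on the op's lowering config (the tile sizes below are
illustrative, mirroring the updated tests):

  #config = #iree_gpu.lowering_config<{
    workgroup = [64, 64, 0],
    reduction = [0, 0, 4],
    thread = [8, 4],
    promote_operands = [0, 1]
  }>

  %mm = linalg.matmul {lowering_config = #config}
      ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>)
      outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>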
---
 .../Common/GPU/GPUPromoteMatmulOperands.cpp   | 48 +++++--------
 .../GPU/test/gpu_promote_matmul_operands.mlir | 31 ++++++---
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp   | 24 +++++--
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td    |  3 +
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 36 ++++++++++
 .../test/ROCDL/config_tile_and_fuse.mlir      |  3 +
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    | 67 +++++++++++++------
 7 files changed, 144 insertions(+), 68 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index 5024d9cf7748a..d5a4a280e83ad 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -64,47 +64,29 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
   op->setOperand(index, copy.getResult(0));
 }
 
-bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
-  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
-  FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
-      mlir::linalg::inferContractionDims(linalgOp);
-  if (failed(contractionDims)) {
-    return false;
-  }
-
-  if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
-      contractionDims->n.size() < 1) {
-    return false;
-  }
-
-  auto getElementCount = [&](ArrayRef<unsigned> dims) {
-    int64_t acc = 1;
-    for (auto mDim : dims) {
-      int64_t size = bounds[mDim];
-      if (ShapedType::isDynamic(size)) {
-        return size;
-      }
-      acc *= size;
-    }
-    return acc;
-  };
-  return getElementCount(contractionDims->m) != 1 &&
-         getElementCount(contractionDims->n) != 1;
-}
-
 struct GPUPromoteMatmulOperandsPass final
     : impl::GPUPromoteMatmulOperandsPassBase<GPUPromoteMatmulOperandsPass> {
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
 
     OpBuilder builder(funcOp);
-    funcOp.walk([&](linalg::LinalgOp linalgOp) {
-      if (!isNonMatvecContraction(linalgOp)) {
+    funcOp.walk([&](Operation *op) {
+      auto loweringConfig =
+          getLoweringConfig<IREE::GPU::LoweringConfigAttr>(op);
+      if (!loweringConfig) {
+        return;
+      }
+
+      std::optional<SmallVector<int64_t>> promotedOperands =
+          loweringConfig.getPromotedOperandList();
+      if (!promotedOperands) {
         return;
       }
-      builder.setInsertionPoint(linalgOp);
-      promoteOperand(builder, linalgOp, 0);
-      promoteOperand(builder, linalgOp, 1);
+
+      builder.setInsertionPoint(op);
+      for (auto operand : promotedOperands.value()) {
+        promoteOperand(builder, op, operand);
+      }
     });
   }
 };
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index 2873140477535..5ec02698451a5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -1,10 +1,13 @@
 // RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-promote-matmul-operands))" | FileCheck %s
 
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>
+
 func.func @matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<32x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32>
-  %mm = linalg.matmul ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+      ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>
   return %mm : tensor<32x128xf32>
 }
 
@@ -13,33 +16,40 @@ func.func @matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<3
 // CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
 // CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
 // CHECK-DAG: %[[PB:.+]] = linalg.copy {{.*}} ins(%[[B]] : tensor<1024x128xf32>)
-// CHECK: linalg.matmul ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
+// CHECK: linalg.matmul {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
 
 // -----
 
-func.func @matvec(%a: tensor<1x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<1x128xf32> {
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = []}>
+
+func.func @empty_config(%a: tensor<1x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<1x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<1x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x128xf32>) -> tensor<1x128xf32>
-  %mm = linalg.matmul ins(%a, %b : tensor<1x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<1x128xf32>) -> tensor<1x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+      ins(%a, %b : tensor<1x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<1x128xf32>) -> tensor<1x128xf32>
   return %mm : tensor<1x128xf32>
 }
 
-// Verify that no copies are generated for matvec operations.
-// CHECK-LABEL: func.func @matvec
+// Verify that no copies are generated with an empty promote_operands list.
+// CHECK-LABEL: func.func @empty_config
 // CHECK-NOT: linalg.copy
 // CHECK: return
 
 // -----
 
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}>
+
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @generic_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
+func.func @lhs_only_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<32x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32>
-  %mm = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+  %mm = linalg.generic {
+      indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"],
+      lowering_config = #lowering_config}
     ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %7 = arith.mulf %in, %in_0 : f32
@@ -49,9 +59,8 @@ func.func @generic_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) ->
   return %mm : tensor<32x128xf32>
 }
 
-// CHECK-LABEL: func.func @generic_matmul
+// CHECK-LABEL: func.func @lhs_only_matmul
 // CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<32x1024xf32>
 // CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
 // CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
-// CHECK-DAG: %[[PB:.+]] = linalg.copy {{.*}} ins(%[[B]] : tensor<1024x128xf32>)
-// CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
+// CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[B]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 07cd27df23e87..10a5c65605d0d 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -1312,17 +1312,20 @@ static StringRef getTilingLevelName(GPU::TilingLevel level) {
   return StringAttr();
 }
 
-static SmallVector<int64_t> getTileSizes(DictionaryAttr config,
-                                         GPU::TilingLevel level) {
-  auto sizes = config.getAs<ArrayAttr>(getTilingLevelName(level));
-  if (!sizes || !llvm::all_of(sizes.getValue(), llvm::IsaPred<IntegerAttr>)) {
+static SmallVector<int64_t> getIntegerList(ArrayAttr array) {
+  if (!array || !llvm::all_of(array.getValue(), llvm::IsaPred<IntegerAttr>)) {
     return {};
   }
-  return llvm::map_to_vector(sizes.getValue(), [](Attribute s) -> int64_t {
+  return llvm::map_to_vector(array.getValue(), [](Attribute s) -> int64_t {
     return cast<IntegerAttr>(s).getInt();
   });
 }
 
+static SmallVector<int64_t> getTileSizes(DictionaryAttr config,
+                                         GPU::TilingLevel level) {
+  return getIntegerList(config.getAs<ArrayAttr>(getTilingLevelName(level)));
+}
+
 SmallVector<int64_t> LoweringConfigAttr::getWorkgroupTileSizes() const {
   return getTileSizes(getAttributes(), GPU::TilingLevel::Workgroup);
 }
@@ -1366,6 +1369,17 @@ IREE::GPU::MmaInterfaceAttr LoweringConfigAttr::getMmaKind() const {
   return getAttributes().getAs<IREE::GPU::MmaInterfaceAttr>(kMmaKindName);
 }
 
+constexpr StringLiteral kPromoteOperandsName = "promote_operands";
+
+std::optional<SmallVector<int64_t>>
+LoweringConfigAttr::getPromotedOperandList() const {
+  auto array = getAttributes().getAs<ArrayAttr>(kPromoteOperandsName);
+  if (!array) {
+    return std::nullopt;
+  }
+  return getIntegerList(array);
+}
+
 //===----------------------------------------------------------------------===//
 // DerivedThreadConfigAttr
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index be04d1925a38d..d2b69166748ea 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -59,6 +59,9 @@ def IREEGPU_LoweringConfigAttr :
   let extraClassDeclaration = [{
     /// Helper to retrieve a target mma intrinsic if present.
    ::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr getMmaKind() const;
+
+    /// Helper to retrieve a list of operand indices to promote.
+    std::optional<SmallVector<int64_t>> getPromotedOperandList() const;
   }];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 9b6180b7d4d88..4318cd2dc316e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -199,6 +199,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   attrs.emplace_back(StringAttr::get(context, "subgroup"),
                      b.getI64ArrayAttr(subgroupTileSizes));
   attrs.emplace_back(StringAttr::get(context, "mma_kind"), mmaKind);
+  attrs.emplace_back(StringAttr::get(context, "promote_operands"),
+                     b.getI64ArrayAttr({0, 1}));
   auto configDict = DictionaryAttr::get(context, attrs);
   auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
 
@@ -220,6 +222,35 @@
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+/// Helper to identify contraction-like operations for operand promotion.
+static bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
+  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
+      mlir::linalg::inferContractionDims(linalgOp);
+  if (failed(contractionDims)) {
+    return false;
+  }
+
+  if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
+      contractionDims->n.size() < 1) {
+    return false;
+  }
+
+  auto getElementCount = [&](ArrayRef<unsigned> dims) {
+    int64_t acc = 1;
+    for (auto mDim : dims) {
+      int64_t size = bounds[mDim];
+      if (ShapedType::isDynamic(size)) {
+        return size;
+      }
+      acc *= size;
+    }
+    return acc;
+  };
+  return getElementCount(contractionDims->m) != 1 &&
+         getElementCount(contractionDims->n) != 1;
+}
+
 LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
                                            mlir::FunctionOpInterface entryPoint,
                                            Operation *op) {
@@ -439,6 +470,11 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   attrs.emplace_back(StringAttr::get(context, "thread"),
                      b.getI64ArrayAttr(threadTileSizes));
 
+  if (isNonMatvecContraction(linalgOp)) {
+    attrs.emplace_back(StringAttr::get(context, "promote_operands"),
+                       b.getI64ArrayAttr({0, 1}));
+  }
+
   // Heuristic value chosen to limit maximum vector sizes when tiling below.
   const unsigned maxVectorSize = 32;
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 93736c704412b..9a6362a5ee002 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -35,6 +35,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4]
 // CHECK-SAME: subgroup = [0, 0, 4, 1, 0]
 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
 
@@ -59,6 +60,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 2]
 // CHECK-SAME: subgroup = [4, 4, 0]
 // CHECK-SAME: workgroup = [128, 128, 0]
 
@@ -100,6 +102,7 @@ module {
 // CHECK-LABEL: func.func @matmul_dynamic_dim
 // CHECK-SAME: #iree_codegen.translation_info
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 4]
 // CHECK-SAME: thread = [1, 1, 0]
 // CHECK-SAME: workgroup = [1, 64, 0]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 61c1c248ce87b..46c174e05107e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -6,7 +6,12 @@
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 4], thread = [8, 4]}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 4],
+  thread = [8, 4],
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout) {
@@ -65,7 +70,13 @@ hal.executable public @main {
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 2], subgroup = [2, 2], mma_kind = #iree_gpu.mma_layout}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 2],
+  subgroup = [2, 2],
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b_mfma ordinal(0) layout(#pipeline_layout) {
@@ -124,7 +135,13 @@ hal.executable public @main {
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 2], subgroup = [1, 2, 2], mma_kind = #iree_gpu.mma_layout}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [1, 64, 64, 0],
+  reduction = [0, 0, 0, 2],
+  subgroup = [1, 2, 2],
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable private @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @conv_igemm_im2col ordinal(0) layout(#pipeline_layout) {
@@ -211,7 +228,9 @@ hal.executable private @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b_wmma ordinal(0) layout(#pipeline_layout) {
@@ -274,7 +293,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f32
 !aeltype = f32
@@ -326,7 +347,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f8E4M3FNUZ
 !aeltype = f32
@@ -378,7 +401,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = i8
 !aeltype = i32
@@ -430,7 +455,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f16
 !aeltype = f16
@@ -474,9 +501,9 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  reduction = [0 : index, 0 : index, 0 : index, 0 : index, 1 : index, 3 : index, 3 : index],
-  thread = [1 : index, 1 : index, 1 : index, 1 : index, 0 : index, 0 : index, 0 : index],
-  workgroup = [1 : index, 1 : index, 4 : index, 8 : index, 0 : index, 0 : index, 0 : index]
+  reduction = [0, 0, 0, 0, 1, 3, 3],
+  thread = [1, 1, 1, 1, 0, 0, 0],
+  workgroup = [1, 1, 4, 8, 0, 0, 0]
 }>
 
 #translation_info = #iree_codegen.translation_info
@@ -536,9 +563,10 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  reduction = [0 : index, 0 : index, 4 : index],
-  thread = [1 : index, 4 : index, 0 : index],
-  workgroup = [4 : index, 32 : index, 0 : index]
+  reduction = [0, 0, 4],
+  thread = [1, 4, 0],
+  workgroup = [4, 32, 0],
+  promote_operands = [0, 1]
 }>
 
 #translation_info = #iree_codegen.translation_info
@@ -620,9 +648,11 @@ hal.executable public @main {
 
 #lowering_config = #iree_gpu.lowering_config<{
   mma_kind = #iree_gpu.mma_layout,
-  reduction = [0 : index, 0 : index, 4 : index],
-  subgroup = [2 : index, 4 : index, 0 : index],
-  workgroup = [64 : index, 128 : index, 0 : index]}>
+  reduction = [0, 0, 4],
+  subgroup = [2, 4, 0],
+  workgroup = [64, 128, 0],
+  promote_operands = [0, 1]
+}>
 
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
@@ -678,8 +708,7 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  thread = [1 : index, 1 : index],
-  workgroup = [1 : index, 1 : index]
+  thread = [1, 1], workgroup = [1, 1]
 }>
 
 #translation_info = #iree_codegen.translation_info