From 49a8ee7f28990e68e5379d28da78222a2fd3113a Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Sun, 22 Sep 2024 00:59:41 -0400
Subject: [PATCH] [Codegen][GPU] Make operand promotion controlled by lowering
 config

Promoting the operands of a matmul is optional and is best controlled
through the lowering config rather than through on-the-fly analysis. This
gives greater flexibility for adding support for other operations as well
(e.g. promoting another kind of contraction or a convolution-like op)
without having to extend this pass every time.
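
For example, promotion of both matmul operands is requested by listing
their indices on the op's lowering config (the tile sizes below are
illustrative, mirroring the updated tests):

  #config = #iree_gpu.lowering_config<{
    workgroup = [64, 64, 0],
    reduction = [0, 0, 4],
    thread = [8, 4],
    promote_operands = [0, 1]
  }>

  %mm = linalg.matmul {lowering_config = #config}
      ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>)
      outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>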
---
 .../Common/GPU/GPUPromoteMatmulOperands.cpp   | 48 +++++--------
 .../GPU/test/gpu_promote_matmul_operands.mlir | 31 ++++++---
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp   | 24 +++++--
 .../Codegen/Dialect/GPU/IR/IREEGPUAttrs.td    |  3 +
 .../Dialect/GPU/TargetUtils/ConfigUtils.cpp   | 36 ++++++++++
 .../test/ROCDL/config_tile_and_fuse.mlir      |  3 +
 .../test/ROCDL/pipeline_tile_and_fuse.mlir    | 67 +++++++++++++------
 7 files changed, 144 insertions(+), 68 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index 5024d9cf7748a..d5a4a280e83ad 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -64,47 +64,29 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
   op->setOperand(index, copy.getResult(0));
 }
 
-bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
-  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
-  FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
-      mlir::linalg::inferContractionDims(linalgOp);
-  if (failed(contractionDims)) {
-    return false;
-  }
-
-  if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
-      contractionDims->n.size() < 1) {
-    return false;
-  }
-
-  auto getElementCount = [&](ArrayRef<unsigned> dims) {
-    int64_t acc = 1;
-    for (auto mDim : dims) {
-      int64_t size = bounds[mDim];
-      if (ShapedType::isDynamic(size)) {
-        return size;
-      }
-      acc *= size;
-    }
-    return acc;
-  };
-  return getElementCount(contractionDims->m) != 1 &&
-         getElementCount(contractionDims->n) != 1;
-}
-
 struct GPUPromoteMatmulOperandsPass final
     : impl::GPUPromoteMatmulOperandsPassBase<GPUPromoteMatmulOperandsPass> {
   void runOnOperation() override {
     FunctionOpInterface funcOp = getOperation();
 
     OpBuilder builder(funcOp);
-    funcOp.walk([&](linalg::LinalgOp linalgOp) {
-      if (!isNonMatvecContraction(linalgOp)) {
+    funcOp.walk([&](Operation *op) {
+      auto loweringConfig =
+          getLoweringConfig<IREE::GPU::LoweringConfigAttr>(op);
+      if (!loweringConfig) {
+        return;
+      }
+
+      std::optional<SmallVector<int64_t>> promotedOperands =
+          loweringConfig.getPromotedOperandList();
+      if (!promotedOperands) {
         return;
       }
-      builder.setInsertionPoint(linalgOp);
-      promoteOperand(builder, linalgOp, 0);
-      promoteOperand(builder, linalgOp, 1);
+
+      builder.setInsertionPoint(op);
+      for (auto operand : promotedOperands.value()) {
+        promoteOperand(builder, op, operand);
+      }
     });
   }
 };
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index 2873140477535..5ec02698451a5 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -1,10 +1,13 @@
 // RUN: iree-opt %s --split-input-file --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-promote-matmul-operands))" | FileCheck %s
 
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1]}>
+
 func.func @matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<32x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32>
-  %mm = linalg.matmul ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+      ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) -> tensor<32x128xf32>
   return %mm : tensor<32x128xf32>
 }
 
@@ -13,33 +16,40 @@ func.func @matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<3
 // CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
 // CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
 // CHECK-DAG: %[[PB:.+]] = linalg.copy {{.*}} ins(%[[B]] : tensor<1024x128xf32>)
-// CHECK: linalg.matmul ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
+// CHECK: linalg.matmul {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
 
 // -----
 
-func.func @matvec(%a: tensor<1x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<1x128xf32> {
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = []}>
+
+func.func @empty_config(%a: tensor<1x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<1x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<1x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x128xf32>) -> tensor<1x128xf32>
-  %mm = linalg.matmul ins(%a, %b : tensor<1x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<1x128xf32>) -> tensor<1x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+      ins(%a, %b : tensor<1x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<1x128xf32>) -> tensor<1x128xf32>
   return %mm : tensor<1x128xf32>
 }
 
-// Verify that no copies are generated for matvec operations.
-// CHECK-LABEL: func.func @matvec
+// Verify that no copies are generated with an empty promote_operands list.
+// CHECK-LABEL: func.func @empty_config
 // CHECK-NOT: linalg.copy
 // CHECK: return
 
 // -----
 
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}>
+
 #map = affine_map<(d0, d1, d2) -> (d0, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
-func.func @generic_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
+func.func @lhs_only_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) -> tensor<32x128xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %empty = tensor.empty() : tensor<32x128xf32>
   %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<32x128xf32>) -> tensor<32x128xf32>
-  %mm = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]}
+  %mm = linalg.generic {
+      indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"],
+      lowering_config = #lowering_config}
     ins(%a, %b : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%fill : tensor<32x128xf32>) {
   ^bb0(%in: f32, %in_0: f32, %out: f32):
     %7 = arith.mulf %in, %in_0 : f32
@@ -49,9 +59,8 @@ func.func @generic_matmul(%a: tensor<32x1024xf32>, %b: tensor<1024x128xf32>) ->
   return %mm : tensor<32x128xf32>
 }
 
-// CHECK-LABEL: func.func @generic_matmul
+// CHECK-LABEL: func.func @lhs_only_matmul
 // CHECK-SAME: %[[A:[A-Za-z0-9]+]]: tensor<32x1024xf32>
 // CHECK-SAME: %[[B:[A-Za-z0-9]+]]: tensor<1024x128xf32>
 // CHECK-DAG: %[[PA:.+]] = linalg.copy {{.*}} ins(%[[A]] : tensor<32x1024xf32>)
-// CHECK-DAG: %[[PB:.+]] = linalg.copy {{.*}} ins(%[[B]] : tensor<1024x128xf32>)
-// CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[PB]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
+// CHECK: linalg.generic {{.*}} ins(%[[PA]], %[[B]] : tensor<32x1024xf32>, tensor<1024x128xf32>)
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
index 07cd27df23e87..10a5c65605d0d 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp
@@ -1312,17 +1312,20 @@ static StringRef getTilingLevelName(GPU::TilingLevel level) {
   return StringAttr();
 }
 
-static SmallVector<int64_t> getTileSizes(DictionaryAttr config,
-                                         GPU::TilingLevel level) {
-  auto sizes = config.getAs<ArrayAttr>(getTilingLevelName(level));
-  if (!sizes || !llvm::all_of(sizes.getValue(), llvm::IsaPred<IntegerAttr>)) {
+static SmallVector<int64_t> getIntegerList(ArrayAttr array) {
+  if (!array || !llvm::all_of(array.getValue(), llvm::IsaPred<IntegerAttr>)) {
     return {};
   }
-  return llvm::map_to_vector(sizes.getValue(), [](Attribute s) -> int64_t {
+  return llvm::map_to_vector(array.getValue(), [](Attribute s) -> int64_t {
     return cast<IntegerAttr>(s).getInt();
   });
 }
 
+static SmallVector<int64_t> getTileSizes(DictionaryAttr config,
+                                         GPU::TilingLevel level) {
+  return getIntegerList(config.getAs<ArrayAttr>(getTilingLevelName(level)));
+}
+
 SmallVector<int64_t> LoweringConfigAttr::getWorkgroupTileSizes() const {
   return getTileSizes(getAttributes(), GPU::TilingLevel::Workgroup);
 }
@@ -1366,6 +1369,17 @@ IREE::GPU::MmaInterfaceAttr LoweringConfigAttr::getMmaKind() const {
   return getAttributes().getAs<IREE::GPU::MmaInterfaceAttr>(kMmaKindName);
 }
 
+constexpr StringLiteral kPromoteOperandsName = "promote_operands";
+
+std::optional<SmallVector<int64_t>>
+LoweringConfigAttr::getPromotedOperandList() const {
+  auto array = getAttributes().getAs<ArrayAttr>(kPromoteOperandsName);
+  if (!array) {
+    return std::nullopt;
+  }
+  return getIntegerList(array);
+}
+
 //===----------------------------------------------------------------------===//
 // DerivedThreadConfigAttr
 //===----------------------------------------------------------------------===//
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
index be04d1925a38d..d2b69166748ea 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td
@@ -59,6 +59,9 @@ def IREEGPU_LoweringConfigAttr :
   let extraClassDeclaration = [{
     /// Helper to retrieve a target mma intrinsic if present.
    ::mlir::iree_compiler::IREE::GPU::MmaInterfaceAttr getMmaKind() const;
+
+    /// Helper to retrieve a list of operand indices to promote.
+    std::optional<SmallVector<int64_t>> getPromotedOperandList() const;
   }];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
index 9b6180b7d4d88..4318cd2dc316e 100644
--- a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
+++ b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -199,6 +199,8 @@ LogicalResult setMatmulLoweringConfig(IREE::GPU::TargetAttr target,
   attrs.emplace_back(StringAttr::get(context, "subgroup"),
                      b.getI64ArrayAttr(subgroupTileSizes));
   attrs.emplace_back(StringAttr::get(context, "mma_kind"), mmaKind);
+  attrs.emplace_back(StringAttr::get(context, "promote_operands"),
+                     b.getI64ArrayAttr({0, 1}));
   auto configDict = DictionaryAttr::get(context, attrs);
   auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
 
@@ -220,6 +222,35 @@
       workgroupSize, targetSubgroupSize, pipelineConfig);
 }
 
+/// Helper to identify contraction-like operations for operand promotion.
+static bool isNonMatvecContraction(linalg::LinalgOp linalgOp) {
+  SmallVector<int64_t> bounds = linalgOp.getStaticLoopRanges();
+  FailureOr<mlir::linalg::ContractionDimensions> contractionDims =
+      mlir::linalg::inferContractionDims(linalgOp);
+  if (failed(contractionDims)) {
+    return false;
+  }
+
+  if (contractionDims->k.size() < 1 || contractionDims->m.size() < 1 ||
+      contractionDims->n.size() < 1) {
+    return false;
+  }
+
+  auto getElementCount = [&](ArrayRef<unsigned> dims) {
+    int64_t acc = 1;
+    for (auto mDim : dims) {
+      int64_t size = bounds[mDim];
+      if (ShapedType::isDynamic(size)) {
+        return size;
+      }
+      acc *= size;
+    }
+    return acc;
+  };
+  return getElementCount(contractionDims->m) != 1 &&
+         getElementCount(contractionDims->n) != 1;
+}
+
 LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
                                            mlir::FunctionOpInterface entryPoint,
                                            Operation *op) {
@@ -439,6 +470,11 @@ LogicalResult setTileAndFuseLoweringConfig(IREE::GPU::TargetAttr target,
   attrs.emplace_back(StringAttr::get(context, "thread"),
                      b.getI64ArrayAttr(threadTileSizes));
 
+  if (isNonMatvecContraction(linalgOp)) {
+    attrs.emplace_back(StringAttr::get(context, "promote_operands"),
+                       b.getI64ArrayAttr({0, 1}));
+  }
+
   // Heuristic value chosen to limit maximum vector sizes when tiling below.
   const unsigned maxVectorSize = 32;
 
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
index 93736c704412b..9a6362a5ee002 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir
@@ -35,6 +35,7 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 // CHECK: linalg.generic {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 0, 0, 4]
 // CHECK-SAME: subgroup = [0, 0, 4, 1, 0]
 // CHECK-SAME: workgroup = [1, 1, 64, 64, 0]
 
@@ -59,6 +60,7 @@ func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // CHECK-SAME: mma_kind = #iree_gpu.mma_layout
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 2]
 // CHECK-SAME: subgroup = [4, 4, 0]
 // CHECK-SAME: workgroup = [128, 128, 0]
 
@@ -100,6 +102,7 @@ module {
 // CHECK-LABEL: func.func @matmul_dynamic_dim
 // CHECK-SAME: #iree_codegen.translation_info
 // CHECK: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
+// CHECK-SAME: promote_operands = [0, 1]
 // CHECK-SAME: reduction = [0, 0, 4]
 // CHECK-SAME: thread = [1, 1, 0]
 // CHECK-SAME: workgroup = [1, 64, 0]
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
index 61c1c248ce87b..46c174e05107e 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_tile_and_fuse.mlir
@@ -6,7 +6,12 @@
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 4], thread = [8, 4]}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 4],
+  thread = [8, 4],
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b ordinal(0) layout(#pipeline_layout) {
@@ -65,7 +70,13 @@ hal.executable public @main {
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [64, 64, 0], reduction = [0, 0, 2], subgroup = [2, 2], mma_kind = #iree_gpu.mma_layout}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [64, 64, 0],
+  reduction = [0, 0, 2],
+  subgroup = [2, 2],
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b_mfma ordinal(0) layout(#pipeline_layout) {
@@ -124,7 +135,13 @@ hal.executable public @main {
   #hal.pipeline.binding,
   #hal.pipeline.binding
 ]>
-#config = #iree_gpu.lowering_config<{workgroup = [1, 64, 64, 0], reduction = [0, 0, 0, 2], subgroup = [1, 2, 2], mma_kind = #iree_gpu.mma_layout}>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [1, 64, 64, 0],
+  reduction = [0, 0, 0, 2],
+  subgroup = [1, 2, 2],
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable private @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @conv_igemm_im2col ordinal(0) layout(#pipeline_layout) {
@@ -211,7 +228,9 @@ hal.executable private @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
     hal.executable.export public @matmul_transpose_b_wmma ordinal(0) layout(#pipeline_layout) {
@@ -274,7 +293,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f32
 !aeltype = f32
@@ -326,7 +347,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f8E4M3FNUZ
 !aeltype = f32
@@ -378,7 +401,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = i8
 !aeltype = i32
@@ -430,7 +455,9 @@ hal.executable public @main {
   workgroup = [64, 64, 0],
   reduction = [0, 0, 2],
   subgroup = [2, 2],
-  mma_kind = #iree_gpu.mma_layout}>
+  mma_kind = #iree_gpu.mma_layout,
+  promote_operands = [0, 1]
+}>
 
 !eltype = f16
 !aeltype = f16
@@ -474,9 +501,9 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  reduction = [0 : index, 0 : index, 0 : index, 0 : index, 1 : index, 3 : index, 3 : index],
-  thread = [1 : index, 1 : index, 1 : index, 1 : index, 0 : index, 0 : index, 0 : index],
-  workgroup = [1 : index, 1 : index, 4 : index, 8 : index, 0 : index, 0 : index, 0 : index]
+  reduction = [0, 0, 0, 0, 1, 3, 3],
+  thread = [1, 1, 1, 1, 0, 0, 0],
+  workgroup = [1, 1, 4, 8, 0, 0, 0]
 }>
 
 #translation_info = #iree_codegen.translation_info
@@ -536,9 +563,10 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  reduction = [0 : index, 0 : index, 4 : index],
-  thread = [1 : index, 4 : index, 0 : index],
-  workgroup = [4 : index, 32 : index, 0 : index]
+  reduction = [0, 0, 4],
+  thread = [1, 4, 0],
+  workgroup = [4, 32, 0],
+  promote_operands = [0, 1]
 }>
 
 #translation_info = #iree_codegen.translation_info
@@ -620,9 +648,11 @@ hal.executable public @main {
 
 #lowering_config = #iree_gpu.lowering_config<{
   mma_kind = #iree_gpu.mma_layout,
-  reduction = [0 : index, 0 : index, 4 : index],
-  subgroup = [2 : index, 4 : index, 0 : index],
-  workgroup = [64 : index, 128 : index, 0 : index]}>
+  reduction = [0, 0, 4],
+  subgroup = [2, 4, 0],
+  workgroup = [64, 128, 0],
+  promote_operands = [0, 1]
+}>
 
 hal.executable public @main {
   hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
@@ -678,8 +708,7 @@ hal.executable public @main {
 // -----
 
 #lowering_config = #iree_gpu.lowering_config<{
-  thread = [1 : index, 1 : index],
-  workgroup = [1 : index, 1 : index]
+  thread = [1, 1], workgroup = [1, 1]
 }>
 
 #translation_info = #iree_codegen.translation_info