[Codegen] Replace LICM with a version that checks trip count (#18679)
The upstream LICM pass does not verify that it is safe to hoist an op out of
a loop (i.e., that the loop has a trip count >= 1). This replaces all uses of
LICM in codegen with a version that performs this verification.
Other phases of the compiler probably should switch as well.
qedawkins authored Oct 15, 2024
1 parent a3d8ad6 commit 3ccd4f1
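As a rough sketch of what the trip-count check means in practice (this mirrors the lit tests added below; it is not an excerpt from the change itself): when the bounds are unknown, the loop may execute zero times, so nothing is hoisted out of it.

  // Hypothetical input: %lb/%ub are function arguments, so lb < ub is not provable.
  scf.for %i = %lb to %ub step %c1 {
    %v0 = arith.addf %cf7, %cf8 : f32  // loop-invariant, but left in place
  }

Loops whose bounds prove at least one iteration (e.g. constant bounds 4 and 6) are still hoisted as before; see the new iree_loop_invariant_code_motion.mlir tests below.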
Showing 13 changed files with 213 additions and 19 deletions.
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/BUILD.bazel
@@ -117,6 +117,7 @@ iree_compiler_cc_library(
"HoistUnrolledVectorExtractInsertSlice.cpp",
"IREEComprehensiveBufferizePass.cpp",
"IREEExpandStridedMetadata.cpp",
"IREELoopInvariantCodeMotion.cpp",
"InstrumentMemoryAccesses.cpp",
"LowerExecutableUsingTransformDialect.cpp",
"LowerUKernelsToCalls.cpp",
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/CMakeLists.txt
@@ -108,6 +108,7 @@ iree_cc_library(
"HoistUnrolledVectorExtractInsertSlice.cpp"
"IREEComprehensiveBufferizePass.cpp"
"IREEExpandStridedMetadata.cpp"
"IREELoopInvariantCodeMotion.cpp"
"InstrumentMemoryAccesses.cpp"
"LowerExecutableUsingTransformDialect.cpp"
"LowerUKernelsToCalls.cpp"
28 changes: 28 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/IREELoopInvariantCodeMotion.cpp
@@ -0,0 +1,28 @@
// Copyright 2024 The IREE Authors
//
// Licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include "iree/compiler/Codegen/Common/Passes.h"
#include "iree/compiler/Codegen/Transforms/Transforms.h"

namespace mlir::iree_compiler {

#define GEN_PASS_DEF_IREELOOPINVARIANTCODEMOTIONPASS
#include "iree/compiler/Codegen/Common/Passes.h.inc"

namespace {
/// IREE loop invariant code motion (LICM) pass.
struct IREELoopInvariantCodeMotionPass
    : public impl::IREELoopInvariantCodeMotionPassBase<
          IREELoopInvariantCodeMotionPass> {
  void runOnOperation() override;
};
} // namespace

void IREELoopInvariantCodeMotionPass::runOnOperation() {
  moveLoopInvariantCodeFromGuaranteedLoops(getOperation());
}

} // namespace mlir::iree_compiler
15 changes: 15 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/Passes.td
@@ -456,6 +456,21 @@ def RemoveSingleIterationLoopPass :
let summary = "Remove distributed loop with single iteration.";
}

// TODO: Replace with upstream: https://github.com/iree-org/iree/issues/18759
def IREELoopInvariantCodeMotionPass :
    Pass<"iree-loop-invariant-code-motion", ""> {
  let summary = "Performs LICM on loops guaranteed to have >= 1 trip";
  let description = [{
    This is a mirror of the upstream LICM pass that restricts hoisting to
    loops that are guaranteed to have at least one trip. It currently only
    supports loops that expose a lower and upper bound, as the generic
    loop-like interface does not expose a way to query the trip count.

    Additionally, code motion of `scf.forall` ops with mappings is always
    unsafe and is explicitly disabled.
  }];
}

def SplitFullPartialTransferPass :
InterfacePass<"iree-codegen-split-full-partial-transfer", "mlir::FunctionOpInterface"> {
let summary =
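To illustrate the `scf.forall` restriction described above (an illustrative sketch, not one of the tests added in this commit): even with statically known bounds, a mapped forall is skipped entirely, so the invariant `arith.addf` stays in its body.

  func.func @mapped_forall_is_skipped() {
    %cf7 = arith.constant 7.0 : f32
    %cf8 = arith.constant 8.0 : f32
    // Trip count is statically 4, but the #gpu.thread mapping means hoisting
    // across this loop is unsafe until it is resolved, so it is left alone.
    scf.forall (%i) in (4) {
      %v0 = arith.addf %cf7, %cf8 : f32
      scf.forall.in_parallel {
      }
    } {mapping = [#gpu.thread<x>]}
    return
  }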
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/test/BUILD.bazel
@@ -49,6 +49,7 @@ iree_lit_test_suite(
"hoist_unrolled_vector_extract_insert_slice.mlir",
"iree_comprehensive_bufferize.mlir",
"iree_expand_strided_metadata.mlir",
"iree_loop_invariant_code_motion.mlir",
"lower_ukernel_to_calls.mlir",
"materialize_encoding_into_nop.mlir",
"materialize_user_configs.mlir",
1 change: 1 addition & 0 deletions compiler/src/iree/compiler/Codegen/Common/test/CMakeLists.txt
@@ -45,6 +45,7 @@ iree_lit_test_suite(
"hoist_unrolled_vector_extract_insert_slice.mlir"
"iree_comprehensive_bufferize.mlir"
"iree_expand_strided_metadata.mlir"
"iree_loop_invariant_code_motion.mlir"
"lower_ukernel_to_calls.mlir"
"materialize_encoding_into_nop.mlir"
"materialize_user_configs.mlir"
95 changes: 95 additions & 0 deletions compiler/src/iree/compiler/Codegen/Common/test/iree_loop_invariant_code_motion.mlir
@@ -0,0 +1,95 @@
// RUN: iree-opt %s -split-input-file --iree-loop-invariant-code-motion | FileCheck %s

func.func @nested_loops_code_invariant_to_both() {
  %m = memref.alloc() : memref<10xf32>
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32

  affine.for %arg0 = 0 to 10 {
    affine.for %arg1 = 0 to 10 {
      %v0 = arith.addf %cf7, %cf8 : f32
    }
  }
  return
}

// CHECK-LABEL: @nested_loops_code_invariant_to_both
// CHECK: memref.alloc() : memref<10xf32>
// CHECK-NEXT: arith.constant 7
// CHECK-NEXT: arith.constant 8
// CHECK-NEXT: arith.addf

// -----

func.func @do_not_hoist_with_unknown_trip_count(%lb: index, %ub: index) {
  affine.for %arg1 = %lb to %ub {
    affine.for %arg0 = 0 to 10 {
    }
  }
  return
}

// CHECK-LABEL: @do_not_hoist_with_unknown_trip_count
// CHECK-NEXT: affine.for
// CHECK-NEXT: affine.for
// CHECK-NEXT: }
// CHECK-NEXT: }

// -----

func.func @do_not_hoist_scf_for_with_unknown_trip_count(%lb: index, %ub: index) {
  %c1 = arith.constant 1 : index
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  scf.for %arg0 = %lb to %ub step %c1 {
    %v0 = arith.addf %cf7, %cf8 : f32
  }
  return
}

// CHECK-LABEL: @do_not_hoist_scf_for_with_unknown_trip_count
// CHECK: scf.for
// CHECK-NEXT: arith.addf
// CHECK-NEXT: }

// -----

func.func @do_hoist_scf_for_with_known_trip_count() {
  %c4 = arith.constant 4 : index
  %c6 = arith.constant 6 : index
  %c1 = arith.constant 1 : index
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  scf.for %arg0 = %c4 to %c6 step %c1 {
    %v0 = arith.addf %cf7, %cf8 : f32
  }
  return
}

// CHECK-LABEL: @do_hoist_scf_for_with_known_trip_count
// CHECK: arith.addf
// CHECK: scf.for

// -----

func.func @do_not_hoist_scf_while() {
  %c4 = arith.constant 4 : index
  %c0 = arith.constant 0 : index
  %cf7 = arith.constant 7.0 : f32
  %cf8 = arith.constant 8.0 : f32
  scf.while (%iter = %c0) : (index) -> (index) {
    %cond = arith.cmpi slt, %iter, %c4 : index
    scf.condition(%cond) %iter : index
  } do {
  ^bb0(%arg1: index):
    %v0 = arith.addf %cf7, %cf8 : f32
    scf.yield %arg1 : index
  }
  return
}

// CHECK-LABEL: @do_not_hoist_scf_while
// CHECK: scf.while
// CHECK: scf.condition
// CHECK: arith.addf
// CHECK: scf.yield
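For the @do_hoist_scf_for_with_known_trip_count case above, the output looks roughly as follows (a sketch; exact SSA names and constant formatting may differ). The invariant add moves in front of the loop and the now-empty loop body remains:

  func.func @do_hoist_scf_for_with_known_trip_count() {
    %c4 = arith.constant 4 : index
    %c6 = arith.constant 6 : index
    %c1 = arith.constant 1 : index
    %cf7 = arith.constant 7.000000e+00 : f32
    %cf8 = arith.constant 8.000000e+00 : f32
    %0 = arith.addf %cf7, %cf8 : f32
    scf.for %arg0 = %c4 to %c6 step %c1 {
    }
    return
  }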
26 changes: 13 additions & 13 deletions compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -262,7 +262,7 @@ void addGPUVectorizationPassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createGPUDistributePass());

// Post bufferization optimizations.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
@@ -395,7 +395,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
// TODO: This LICM instance is load bearing due to brittleness of the
// hoisting and fusion pass, as well as a lack of a fallback distribution
// pass.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
{
OptimizeTensorInsertExtractSlicesPassOptions options;
options.foldIdentitySlices = true;
@@ -408,7 +408,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createGPUGreedilyDistributeToThreadsPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(IREE::GPU::createCombineBarrierRegionsPass());

// Step 6. Lower special ops and vectorize.
@@ -438,7 +438,7 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
// Step 9. Remaining post-bufferization optimizations/lowerings.
funcPassManager.addPass(IREE::GPU::createLowerIREEGPUOpsPass());
funcPassManager.addPass(createUnrollAnnotatedLoopsPass());
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
if (pipelineOptions.enableReduceSharedMemoryBankConflicts) {
GPUReduceBankConflictsPassOptions options = {};
options.paddingBits = 64;
@@ -492,7 +492,7 @@ void addGPUWinogradVectorizePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createGPUDistributeScfForPass(options));

// Post bufferization optimizations.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
@@ -562,7 +562,7 @@ void addGPUMatmulSimtPassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createOptimizeVectorTransferPass());

// Hoist loop invariant code to avoid pipelining it.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
// Pipeline memory operations.
funcPassManager.addPass(createGPUPipeliningPass());
}
@@ -625,7 +625,7 @@ void addGPUMatmulTensorCorePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createCSEPass());

// Hoist loop invariant code to avoid pipelining it.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
// Pipeline memory operations.
GPUPipeliningPassOptions pipelieningOptions = {};
pipelieningOptions.epiloguePeeling = false;
@@ -692,7 +692,7 @@ void addGPUMatmulTensorCoreMmaSyncPassPipeline(
funcPassManager.addPass(createCSEPass());

// Hoist loop invariant code to avoid pipelining it.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
// Pipeline memory operations.
GPUPipeliningPassOptions pipelieningOptions = {};
pipelieningOptions.epiloguePeeling = false;
@@ -855,7 +855,7 @@ void addGPUVectorDistributePassPipeline(OpPassManager &funcPassManager,
// Set anchors at tensor level for vector distribution later and hoist out
// loop invariant anchors.
funcPassManager.addPass(createLLVMGPUConfigureTensorLayoutsPass());
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());

// Generalize all named ops so that we can fold away unit extent dims. By this
// point, all tiling is finished so the tiling configurations on those ops can
@@ -936,7 +936,7 @@ void addGPUWarpReductionPassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
}
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

@@ -945,7 +945,7 @@
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
funcPassManager.addPass(createOptimizeVectorTransferPass());
funcPassManager.addPass(createOptimizeTensorInsertExtractSlicesPass());
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
funcPassManager.addPass(createForOpCanonicalizationPass());
@@ -1040,13 +1040,13 @@ addLowerAndOptimizeAddressComputationPasses(FunctionLikeNest &funcPassManager) {
.addPass(memref::createExpandStridedMetadataPass)
// Hoist loop invariant variables to give affine decomposition pass the
// right loop dependencies.
-.addPass(createLoopInvariantCodeMotionPass)
+.addPass(createIREELoopInvariantCodeMotionPass)
// Decompose affine ops.
.addPass(createDecomposeAffineOpsPass)
// Get rid of the redundant computations.
.addPass(createCSEPass)
// Hoist the resulting decompositions.
-.addPass(createLoopInvariantCodeMotionPass)
+.addPass(createIREELoopInvariantCodeMotionPass)
.addPass(createLowerAffinePass);
}

@@ -639,10 +639,9 @@ hal.executable public @main {
// the producer's (convolution's) distributed scf.forall loop.
// CHECK-LABEL: func @conv_nchw_fused
// CHECK: %[[ALLOCA:.+]] = memref.alloca() : memref<1x1x1x1xf32, #gpu.address_space<private>>
-// CHECK: %[[ALLOCA2:.+]] = memref.alloca() : memref<1x1x1x1xf32, #gpu.address_space<private>>
// CHECK: scf.for %{{.*}} = %c0 to %c64 step %c1
// CHECK: linalg.conv_2d_nchw_fchw
-// CHECK-SAME: outs(%[[ALLOCA2]] : memref<1x1x1x1xf32, #gpu.address_space<private>>)
+// CHECK-SAME: outs(%[[ALLOCA]] : memref<1x1x1x1xf32, #gpu.address_space<private>>)
// CHECK: arith.addf
// CHECK: arith.cmpf
// CHECK: arith.select
6 changes: 3 additions & 3 deletions compiler/src/iree/compiler/Codegen/SPIRV/Passes.cpp
@@ -530,7 +530,7 @@ void addSPIRVMatmulPromoteVectorizePassPipeline(OpPassManager &funcPassManager,
funcPassManager.addPass(createOptimizeVectorTransferPass());

// Hoist loop invariant code to avoid pipelining it.
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
PipeliningSchedulingStrategy schedule =
storeStage == 0 ? PipeliningSchedulingStrategy::loadStoreStage0
: PipeliningSchedulingStrategy::loadGlobalStage0;
@@ -572,7 +572,7 @@ void addSPIRVSubgroupReducePassPipeline(OpPassManager &funcPassManager) {
funcPassManager.addPass(createCSEPass());
}

-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());

@@ -587,7 +587,7 @@

// Simplify the IR for vector distribution.
funcPassManager.addPass(memref::createFoldMemRefAliasOpsPass());
-funcPassManager.addPass(createLoopInvariantCodeMotionPass());
+funcPassManager.addPass(createIREELoopInvariantCodeMotionPass());
funcPassManager.addPass(createCanonicalizerPass());
funcPassManager.addPass(createCSEPass());
funcPassManager.addPass(createForOpCanonicalizationPass());
47 changes: 47 additions & 0 deletions compiler/src/iree/compiler/Codegen/Transforms/Transforms.cpp
@@ -30,6 +30,7 @@
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/ValueBoundsOpInterface.h"
#include "mlir/Transforms/LoopInvariantCodeMotionUtils.h"

#define DEBUG_TYPE "iree-codegen-transforms"

@@ -492,6 +493,52 @@ LogicalResult lowerWorkgroupCountFromSliceOp(
maxWorkgroupParallelDims);
}

//===---------------------------------------------------------------------===//
// Helper to perform LICM on loops that are guaranteed at least one trip.
//===---------------------------------------------------------------------===//

void moveLoopInvariantCodeFromGuaranteedLoops(Operation *target) {
  // Walk through all loops in a function in innermost-loop-first order. This
  // way, we first LICM from the inner loop and place the ops in the outer
  // loop, which in turn can be further LICM'ed.
  //
  // Hoisting is only performed on loops with guaranteed non-zero trip counts.
  // `scf.forall` ops with mapping attributes can never be proven to have a
  // non-zero trip count until the loop is resolved, so they are
  // unconditionally skipped here.
  target->walk([&](LoopLikeOpInterface loopLike) {
    if (auto forallOp = dyn_cast<scf::ForallOp>(*loopLike)) {
      if (forallOp.getMapping()) {
        return;
      }
    }

    // Skip loops without lower/upper bounds. There is no generic way to
    // verify whether a loop has at least one trip, so support for new loop
    // types of interest can be added as needed. For example, `scf.while`
    // needs non-trivial analysis of its condition region to know that it has
    // at least one trip.
    std::optional<SmallVector<OpFoldResult>> maybeLowerBounds =
        loopLike.getLoopLowerBounds();
    std::optional<SmallVector<OpFoldResult>> maybeUpperBounds =
        loopLike.getLoopUpperBounds();
    if (!maybeLowerBounds || !maybeUpperBounds) {
      return;
    }

    // If any lower/upper bound pair cannot be definitively verified as
    // lb < ub, then the loop may have a zero trip count.
    for (auto [lb, ub] :
         llvm::zip_equal(*maybeLowerBounds, *maybeUpperBounds)) {
      if (!ValueBoundsConstraintSet::compare(lb, ValueBoundsConstraintSet::LT,
                                             ub)) {
        return;
      }
    }

    moveLoopInvariantCode(loopLike);
  });
}

//===---------------------------------------------------------------------===//
// Patterns to fold tensor.expand/collapse_shape into
// `hal.interface.binding.subspan`