[LICM] Fold associative binary ops to promote code hoisting #81608
Conversation
@llvm/pr-subscribers-backend-powerpc @llvm/pr-subscribers-llvm-transforms

Author: Ricardo Jesus (rj-jesus)

Changes

Perform the transformation "(LV op C1) op C2" ==> "LV op (C1 op C2)" where op is an associative binary op, LV is a loop variant, and C1 and C2 are loop invariants, and hoist (C1 op C2) into the preheader.

Similar patterns could be folded (commented on hoistBOAssociation) but this one seems to be the most impactful.

I've benchmarked this with RAJAPerf on a Grace system and see an overall performance uplift of 2.5% across the whole benchmark, mainly from ~70% uplifts in three kernels. The worst I saw was a 2% regression on a single kernel, which I will look into later. If needed we can make this more restrictive.

Full diff: https://github.com/llvm/llvm-project/pull/81608.diff

3 Files Affected:
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 9ec9c31e48e0b6..34de5cdff0771b 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -113,6 +113,8 @@ STATISTIC(NumFPAssociationsHoisted, "Number of invariant FP expressions "
STATISTIC(NumIntAssociationsHoisted,
"Number of invariant int expressions "
"reassociated and hoisted out of the loop");
+STATISTIC(NumBOAssociationsHoisted, "Number of invariant BinaryOp expressions "
+ "reassociated and hoisted out of the loop");
/// Memory promotion is enabled by default.
static cl::opt<bool>
@@ -2754,6 +2756,75 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L,
return true;
}
+/// Reassociate general associative binary expressions of the form
+///
+/// 1. "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+///
+/// where op is an associative binary op, LV is a loop variant, and C1 and C2
+/// are loop invariants.
+///
+/// TODO: This can be extended to more cases such as
+/// 2. "C1 op (C2 op LV)" ==> "(C1 op C2) op LV"
+/// 3. "(C1 op LV) op C2" ==> "LV op (C1 op C2)" if op is commutative
+/// 4. "C1 op (LV op C2)" ==> "(C1 op C2) op LV" if op is commutative
+static bool hoistBOAssociation(Instruction &I, Loop &L,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater &MSSAU, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!isa<BinaryOperator>(I))
+ return false;
+
+ Instruction::BinaryOps Opcode = dyn_cast<BinaryOperator>(&I)->getOpcode();
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+
+ auto ClearSubclassDataAfterReassociation = [](Instruction &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+ };
+
+ if (I.isAssociative()) {
+ // Transform: "(LV op C1) op C2" ==> "LV op (C1 op C2)"
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *LV = Op0->getOperand(0);
+ Value *C1 = Op0->getOperand(1);
+ Value *C2 = I.getOperand(1);
+
+ if (L.isLoopInvariant(LV) || !L.isLoopInvariant(C1) ||
+ !L.isLoopInvariant(C2))
+ return false;
+
+ bool singleUseOp0 = Op0->hasOneUse();
+
+ // Conservatively clear all optional flags since they may not be
+ // preserved by the reassociation, but preserve fast-math flags where
+ // applicable.
+ ClearSubclassDataAfterReassociation(I);
+
+ auto *Preheader = L.getLoopPreheader();
+ assert(Preheader && "Loop is not in simplify form?");
+ IRBuilder<> Builder(Preheader->getTerminator());
+ Value *V = Builder.CreateBinOp(Opcode, C1, C2, "invariant.op");
+ I.setOperand(0, LV);
+ I.setOperand(1, V);
+
+ // Note: (LV op C1) might not be erased if it has more than one use.
+ if (singleUseOp0)
+ eraseInstruction(cast<Instruction>(*Op0), SafetyInfo, MSSAU);
+
+ return true;
+ }
+ }
+
+ return false;
+}
+
static bool hoistArithmetics(Instruction &I, Loop &L,
ICFLoopSafetyInfo &SafetyInfo,
MemorySSAUpdater &MSSAU, AssumptionCache *AC,
@@ -2791,6 +2862,12 @@ static bool hoistArithmetics(Instruction &I, Loop &L,
return true;
}
+ if (hoistBOAssociation(I, L, SafetyInfo, MSSAU, AC, DT)) {
+ ++NumHoisted;
+ ++NumBOAssociationsHoisted;
+ return true;
+ }
+
return false;
}
diff --git a/llvm/test/Transforms/LICM/hoist-binop.ll b/llvm/test/Transforms/LICM/hoist-binop.ll
new file mode 100644
index 00000000000000..a9281039c9a39c
--- /dev/null
+++ b/llvm/test/Transforms/LICM/hoist-binop.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=licm < %s | FileCheck %s
+
+; Adapted from
+; for(long i = 0; i < n; ++i)
+; a[i] = (i+1) * v;
+define void @test1(i64 %n) {
+; CHECK-LABEL: define void @test1(
+; CHECK-SAME: i64 [[N:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[FOR_PH:%.*]]
+; CHECK: for.ph:
+; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[VSCALE_2:%.*]] = shl nuw nsw i64 [[VSCALE]], 1
+; CHECK-NEXT: [[VSCALE_4:%.*]] = shl nuw nsw i64 [[VSCALE]], 2
+; CHECK-NEXT: [[VEC_INIT:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VSCALE_2]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: [[INVARIANT_OP:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[INVARIANT_OP1:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[DOTSPLAT]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[FOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[VEC_INIT]], [[FOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ADD1:%.*]] = add nuw nsw <vscale x 2 x i64> [[VEC_IND]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[ADD2:%.*]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP]]
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD1]])
+; CHECK-NEXT: call void @use(<vscale x 2 x i64> [[ADD2]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VSCALE_4]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[INVARIANT_OP1]]
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK: for.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ br label %for.ph
+
+for.ph:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %vscale.2 = shl nuw nsw i64 %vscale, 1
+ %vscale.4 = shl nuw nsw i64 %vscale, 2
+ %vec.init = insertelement <vscale x 2 x i64> zeroinitializer, i64 1, i64 1
+ %.splatinsert = insertelement <vscale x 2 x i64> poison, i64 %vscale.2, i64 0
+ %.splat = shufflevector <vscale x 2 x i64> %.splatinsert, <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+ br label %for.body
+
+for.body:
+ %index = phi i64 [ 0, %for.ph ], [ %index.next, %for.body ]
+ %vec.ind = phi <vscale x 2 x i64> [ %vec.init, %for.ph ], [ %vec.ind.next, %for.body ]
+ %step.add = add <vscale x 2 x i64> %vec.ind, %.splat
+ %add1 = add nuw nsw <vscale x 2 x i64> %vec.ind, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ %add2 = add nuw nsw <vscale x 2 x i64> %step.add, shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+ call void @use(<vscale x 2 x i64> %add1)
+ call void @use(<vscale x 2 x i64> %add2)
+ %index.next = add nuw i64 %index, %vscale.4
+ %vec.ind.next = add <vscale x 2 x i64> %step.add, %.splat
+ %cmp = icmp eq i64 %index.next, %n
+ br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+ ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare void @use(<vscale x 2 x i64>)
diff --git a/llvm/test/Transforms/LICM/sink-foldable.ll b/llvm/test/Transforms/LICM/sink-foldable.ll
index 38577a5a12563b..4f853a8f065d9f 100644
--- a/llvm/test/Transforms/LICM/sink-foldable.ll
+++ b/llvm/test/Transforms/LICM/sink-foldable.ll
@@ -97,7 +97,7 @@ define ptr @test2(i32 %j, ptr readonly %P, ptr readnone %Q) {
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds ptr, ptr [[ADD_PTR]], i64 [[IDX2_EXT]]
; CHECK-NEXT: [[L1:%.*]] = load ptr, ptr [[ARRAYIDX2]], align 8
; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt ptr [[L1]], [[Q]]
-; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD_I]], 1
+; CHECK-NEXT: [[ADD]] = add i32 [[I_ADDR]], 2
; CHECK-NEXT: br i1 [[CMP2]], label [[LOOPEXIT2:%.*]], label [[FOR_COND]]
; CHECK: loopexit0:
; CHECK-NEXT: [[P0:%.*]] = phi ptr [ null, [[FOR_COND]] ]
@amy-kwan @chenzheng1030 Could you please have a look at the failing PowerPC tests below? I can make the transform more restrictive but I'd appreciate some feedback before doing so!
Would you please upload the diff for these two cases? They were both generated by scripts, so it should not be hard to update them. Thanks.
Hi, of course, just did that. Thanks very much for having a look at this!
; CHECK-NEXT: .cfi_offset r30, -16
; CHECK-NEXT: .cfi_offset cr2, 8
; CHECK-NEXT: .cfi_offset cr3, 8
; CHECK-NEXT: .cfi_offset cr4, 8
; CHECK-NEXT: std r30, 32(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r29, 40(r1) # 8-byte Folded Spill
; CHECK-NEXT: std r30, 48(r1) # 8-byte Folded Spill
This case shows a negative impact: after the reassociation, more registers are used. That could be an issue for later passes like register allocation or the scheduler. I don't think it should be a blocker for this patch, though. What do you think? @amy-kwan
Any updates on this?
Can you please add a test for AArch64?
✅ With the latest revision this PR passed the C/C++ code formatter.
@madhur13490 The test would overlap with the one already present. Since the fold is architecture-agnostic, I suggest we avoid adding a new (mostly duplicated) test case and keep the one we already have. What do you think?
Any updates on this?
The idea makes sense, and the implementation looks good to me too.
But please wait a couple of days to give other reviewers a last chance to look at this, and merge early next week if there are no further comments.
Also, please have a look at the formatting and other pre-merge checks that failed.
Perform the transformation "(LV op C1) op C2" ==> "LV op (C1 op C2)" where op is an associative binary op, LV is a loop variant, and C1 and C2 are loop invariants to hoist (C1 op C2) into the preheader.
* Simplify test Transforms/LICM/hoist-binop.ll
* Create new instructions instead of modifying in-place
* Replacing isa + dyn_cast with single dyn_cast
* Early exit
* Add test for single use case
* Minor formatting change
* Updated comment
Force-pushed from 722d3fb to dfc3fc7
Thanks, I will do!
Thanks, I've addressed the formatting check, and I believe the other checks that failed were unrelated to this change and have disappeared with the most recent commit.
I've reverted this due to the incorrect flag preservation, but generally the test coverage on this PR leaves a lot to be desired. Please see https://llvm.org/docs/InstCombineContributorGuide.html#tests for some testing guidelines.
In particular, things I would note are:
- This implements a fairly generic fold using isAssociative(), but then only tests adds. Please also add tests for some other opcodes -- in particular also for FP operations. Note that for FP operations two binary operators with the same opcode may not both be associative! Only one of them might have the reassoc flag, in which case the transform is not legal.
- It's okay to include a test that is close to your original motivation, but please keep the bulk of the test coverage simple. Don't test on vectors when scalars would do. This reduces test noise and makes proof checking faster.
- There is a complete lack of negative tests. You have a bunch of conditions that abort the transform, but none of them are tested.
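To make the FP legality point concrete, here is a minimal sketch (my own, not code from this PR) of the kind of guard the fold needs. `Instruction::isAssociative()` only returns true for fadd/fmul when the appropriate fast-math flags are present on that instruction, so it has to be queried on the inner operator as well as the outer one:

```cpp
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"

using namespace llvm;

// Sketch of the legality check: both the outer instruction I and the inner
// binary operator Op0 must be associative. For integer adds this is a
// property of the opcode alone, but for fadd/fmul isAssociative() also
// depends on the fast-math flags of *that* instruction, so checking only I
// would wrongly accept e.g. "(x fadd C1) fadd reassoc C2".
static bool bothAssociative(const Instruction &I, const BinaryOperator *Op0) {
  return Op0 && Op0->getOpcode() == I.getOpcode() && I.isAssociative() &&
         Op0->isAssociative();
}
```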
auto *NewBO =
    BinaryOperator::Create(Opcode, LV, Inv, BO->getName() + ".reass", BO);
NewBO->copyIRFlags(BO);
This is incorrect. You can't preserve flags like this during a reassociation. It's possible to preserve flags in some cases, but this depends on the flags of both original instructions, the opcode and possibly also other operand characteristics like non-zero-ness or non-negative-ness.
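For illustration only, one conservative option (an assumption on my part, not the actual follow-up fix) is to drop the poison-generating flags on the new instruction entirely and keep at most the intersection of the fast-math flags of the two original operators:

```cpp
#include "llvm/IR/FMF.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Conservative flag handling for "NewBO = LV op (C1 op C2)": do not copy
// nuw/nsw/exact from the old instructions, since reassociation does not
// preserve them in general. For FP ops, keep only the fast-math flags
// common to both original operators. Assumes NewBO was freshly created and
// therefore carries no optional flags yet.
static void setFlagsConservatively(BinaryOperator *NewBO,
                                   const BinaryOperator *OuterBO,
                                   const BinaryOperator *InnerBO) {
  if (!isa<FPMathOperator>(NewBO))
    return; // Integer case: leave all optional flags cleared.

  FastMathFlags FMF = OuterBO->getFastMathFlags();
  FMF &= InnerBO->getFastMathFlags();
  NewBO->setFastMathFlags(FMF);
}
```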
Thanks for catching this, I'll fix this and increase the test coverage.
/cherry-pick b48819d
/pull-request #100094
…lvm#81608)" This reverts commit f2ccf80. The flag propagation code is incorrect. (cherry picked from commit b48819d)
Perform the transformation "(LV op C1) op C2" ==> "LV op (C1 op C2)" where op is an associative binary op, LV is a loop variant, and C1 and C2 are loop invariants to hoist. Similar patterns could be folded (left in comment) but this one seems to be the most impactful.
…81608)" Summary: This reverts commit f2ccf80. The flag propagation code is incorrect. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251147
…81608)" (#100377) This reapplies a more strict version of f2ccf80. Perform the transformation "(LV op C1) op C2" ==> "LV op (C1 op C2)" where op is an associative binary op, LV is a loop variant, and C1 and C2 are loop invariants, and hoist (C1 op C2) into the preheader. For now this fold is restricted to ADDs.
Perform the transformation "(LV op C1) op C2" ==> "LV op (C1 op C2)", where op is an associative binary op, LV is a loop variant, and C1 and C2 are loop invariants, and hoist (C1 op C2) into the preheader.

Similar patterns could be folded (commented on hoistBOAssociation) but this one seems to be the most impactful. The impact of this change is exemplified here: https://godbolt.org/z/1986Tdz4M

I've benchmarked this with RAJAPerf on a Grace system and see an overall performance uplift of 2.5% across the whole benchmark, mainly from ~70% uplifts in three kernels. The worst I saw was a 2% regression on a single kernel, which I will look into later. If needed we can make this more restrictive.
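As a source-level illustration of the effect (a hypothetical example, not taken from the PR or from RAJAPerf):

```cpp
#include <cstddef>

// Before the fold: the outer add's left operand (x[i] + C1) is loop-variant,
// so LICM cannot hoist either add on its own.
void before(long *a, const long *x, long C1, long C2, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    a[i] = (x[i] + C1) + C2; // "(LV op C1) op C2"
}

// After the fold: reassociating exposes the loop-invariant (C1 + C2), which
// is computed once in the preheader.
void after(long *a, const long *x, long C1, long C2, std::size_t n) {
  long Inv = C1 + C2; // hoisted "C1 op C2"
  for (std::size_t i = 0; i < n; ++i)
    a[i] = x[i] + Inv; // "LV op (C1 op C2)"
}
```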