[VPlan] Delay adding canonical IV increment and exit branches. #82270

Open · wants to merge 3 commits into main
41 changes: 19 additions & 22 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7549,6 +7549,15 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
   // cost model is complete for better cost estimates.
   VPlanTransforms::unrollByUF(BestVPlan, BestUF,
                               OrigLoop->getHeader()->getContext());
+
+  TailFoldingStyle Style = CM.getTailFoldingStyle(
+      !isIndvarOverflowCheckKnownFalse(&CM, BestVF, BestUF));
+  // When not folding the tail, we know that the induction increment will not
+  // overflow.
Collaborator: Otherwise, when folding the tail, the induction increment may always overflow? Perhaps consider the isIndvarOverflowCheckKnownFalse() above?

Contributor Author: The current code just moves the existing logic. Put up #111758 to improve this separately.
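To make the thread above concrete, here is a minimal standalone sketch (toy stand-ins for the LLVM types; only the two boolean derivations mirror this hunk) of how the tail-folding style maps to the flags passed to lowerCanonicalIV, plus a check of why the increment cannot wrap when the tail is not folded: the vector trip count is rounded down to a multiple of VF * UF, so index.next never steps past it.

    #include <cassert>
    #include <cstdint>

    // Toy stand-in for llvm::TailFoldingStyle; names are illustrative only.
    enum class TailFoldingStyle {
      None,
      Data,
      DataAndControlFlow,
      DataAndControlFlowWithoutRuntimeCheck
    };

    int main() {
      // Mirrors the flag derivation in this hunk.
      auto Style = TailFoldingStyle::None;
      bool HasNUW = Style == TailFoldingStyle::None;
      bool WithoutRuntimeCheck =
          Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
      assert(HasNUW && !WithoutRuntimeCheck);

      // No tail folding: the vector loop runs to the trip count rounded down
      // to a multiple of VF * UF, so the increment stays in bounds (nuw).
      const uint64_t TripCount = 10, VFxUF = 4;
      const uint64_t VectorTripCount = TripCount - (TripCount % VFxUF); // 8
      for (uint64_t Index = 0; Index != VectorTripCount; Index += VFxUF)
        assert(Index + VFxUF <= VectorTripCount); // never steps past the end
      return 0;
    }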

+  bool HasNUW = Style == TailFoldingStyle::None;
+  bool WithoutRuntimeCheck =
+      Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
+  VPlanTransforms::lowerCanonicalIV(BestVPlan, HasNUW, WithoutRuntimeCheck);
   VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);
 
   LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
@@ -8664,36 +8673,31 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
       if (CM.foldTailWithEVL() &&
           !VPlanTransforms::tryAddExplicitVectorLength(*Plan))
         break;
-      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
       VPlans.push_back(std::move(Plan));
     }
     VF = SubRange.End;
   }
 }
 
-// Add the necessary canonical IV and branch recipes required to control the
-// loop.
-static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
-                                  DebugLoc DL) {
+// Add the required canonical IV.
+static void addCanonicalIV(VPlan &Plan, Type *IdxTy, DebugLoc DL) {
   Value *StartIdx = ConstantInt::get(IdxTy, 0);
   auto *StartV = Plan.getOrAddLiveIn(StartIdx);
 
   // Add a VPCanonicalIVPHIRecipe starting at 0 to the header.
+  // TODO: Introduce a separate scalar phi recipe that can be used for codegen,
+  // turning VPCanonicalIVPHIRecipe into an 'abstract' recipe which cannot be
+  // executed directly.
   auto *CanonicalIVPHI = new VPCanonicalIVPHIRecipe(StartV, DL);
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
   VPBasicBlock *Header = TopRegion->getEntryBasicBlock();
   Header->insert(CanonicalIVPHI, Header->begin());
 
-  VPBuilder Builder(TopRegion->getExitingBasicBlock());
-  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
-  auto *CanonicalIVIncrement = Builder.createOverflowingOp(
-      Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
-      "index.next");
-  CanonicalIVPHI->addOperand(CanonicalIVIncrement);
-
   // Add the BranchOnCount VPInstruction to the latch.
+  VPBuilder Builder(TopRegion->getExitingBasicBlock());
+  // TODO: introduce branch-on-count during VPlan final (pre-codegen) lowering.
   Builder.createNaryOp(VPInstruction::BranchOnCount,
-                       {CanonicalIVIncrement, &Plan.getVectorTripCount()}, DL);
+                       {CanonicalIVPHI, &Plan.getVectorTripCount()}, DL);
 }
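The review thread further down suggests this early BranchOnCount could conceptually test the pre-bumped IV against the trip count minus one step. A small worked check of that reading (toy arithmetic based on the suggestion, not code from this patch) shows the two exit conditions agree:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t VectorTripCount = 12, Step = 4; // Step models VF * UF.
      for (uint64_t IV = 0;; IV += Step) {
        // Final form after lowering: branch on the bumped IV.
        bool ExitOnIncrement = IV + Step == VectorTripCount;
        // Conceptual placeholder form: branch on the un-bumped phi.
        bool ExitOnPhi = IV == VectorTripCount - Step;
        assert(ExitOnIncrement == ExitOnPhi);
        if (ExitOnIncrement)
          break;
      }
      return 0;
    }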

// Collect VPIRInstructions for phis in the original exit block that are modeled
@@ -8943,10 +8947,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {

   DebugLoc DL = getDebugLocFromInstOrOperands(Legal->getPrimaryInduction());
   TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
-  // When not folding the tail, we know that the induction increment will not
-  // overflow.
-  bool HasNUW = Style == TailFoldingStyle::None;
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
+  addCanonicalIV(*Plan, Legal->getWidestInductionType(), DL);
 
   VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
 
@@ -9179,11 +9180,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
       Plan->getVectorLoopRegion()->getExitingBasicBlock()->getTerminator();
   Term->eraseFromParent();
 
-  // Tail folding is not supported for outer loops, so the induction increment
-  // is guaranteed to not wrap.
-  bool HasNUW = true;
-  addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
-                        DebugLoc());
+  addCanonicalIV(*Plan, Legal->getWidestInductionType(), DebugLoc());
   assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
   return Plan;
 }
3 changes: 3 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -21,6 +21,7 @@
#include "VPlanCFG.h"
#include "VPlanDominatorTree.h"
#include "VPlanPatternMatch.h"
#include "VPlanVerifier.h"
#include "VPlanTransforms.h"
#include "VPlanUtils.h"
#include "llvm/ADT/PostOrderIterator.h"
@@ -1018,6 +1019,8 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
 /// Assumes a single pre-header basic-block was created for this. Introduce
 /// additional basic-blocks as needed, and fill them all.
 void VPlan::execute(VPTransformState *State) {
+  assert(verifyVPlanIsValid(*this) && "VPlan is invalid");
+
   // Initialize CFG state.
   State->CFG.PrevVPBB = nullptr;
   State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
3 changes: 2 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
@@ -2991,7 +2991,8 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {

   VPCanonicalIVPHIRecipe *clone() override {
     auto *R = new VPCanonicalIVPHIRecipe(getOperand(0), getDebugLoc());
-    R->addOperand(getBackedgeValue());
+    if (getNumOperands() == 2)
+      R->addOperand(getBackedgeValue());
     return R;
   }

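To see why the new guard is needed, consider a minimal toy model (a simplified stand-in, not the VPlan recipe classes): until lowerCanonicalIV introduces the increment, the canonical IV phi carries only its start operand, so clone() must not assume a backedge operand exists.

    #include <cassert>
    #include <vector>

    // Simplified stand-in for the recipe's operand list.
    struct ToyCanonicalIVPhi {
      std::vector<int> Operands; // [start] pre-lowering, [start, backedge] after.
      explicit ToyCanonicalIVPhi(int Start) : Operands{Start} {}

      ToyCanonicalIVPhi clone() const {
        ToyCanonicalIVPhi R(Operands[0]);
        if (Operands.size() == 2) // Backedge value exists only after lowering.
          R.Operands.push_back(Operands[1]);
        return R;
      }
    };

    int main() {
      ToyCanonicalIVPhi Abstract(0);   // Pre-lowering: start value only.
      assert(Abstract.clone().Operands.size() == 1);
      Abstract.Operands.push_back(42); // Lowering appends the increment.
      assert(Abstract.clone().Operands.size() == 2);
      return 0;
    }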
139 changes: 85 additions & 54 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1201,19 +1201,12 @@ void VPlanTransforms::optimize(VPlan &Plan) {
 // %Negated = Not %ALM
 // branch-on-cond %Negated
 //
-static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
-    VPlan &Plan, bool DataAndControlFlowWithoutRuntimeCheck) {
+static VPActiveLaneMaskPHIRecipe *createActiveLaneMaskPhi(VPlan &Plan) {
   VPRegionBlock *TopRegion = Plan.getVectorLoopRegion();
-  VPBasicBlock *EB = TopRegion->getExitingBasicBlock();
   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
 
-  auto *CanonicalIVIncrement =
-      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
-  // TODO: Check if dropping the flags is needed if
-  // !DataAndControlFlowWithoutRuntimeCheck.
-  CanonicalIVIncrement->dropPoisonGeneratingFlags();
-  DebugLoc DL = CanonicalIVIncrement->getDebugLoc();
+  DebugLoc DL = CanonicalIVPHI->getDebugLoc();
Collaborator: This is for an increment placed in the preheader, so is it OK to use the DL of the phi instead of that of the in-loop/backedge-value increment?

Contributor Author: Yes, I think so; the DL from the CanonicalIV should be the closest accurate debug location.

   // We can't use StartV directly in the ActiveLaneMask VPInstruction, since
   // we have to take unrolling into account. Each part needs to start at
   // Part * VF
@@ -1223,21 +1216,6 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch(
   // Create the ActiveLaneMask instruction using the correct start values.
   VPValue *TC = Plan.getTripCount();
 
-  VPValue *TripCount, *IncrementValue;
-  if (!DataAndControlFlowWithoutRuntimeCheck) {
-    // When the loop is guarded by a runtime overflow check for the loop
-    // induction variable increment by VF, we can increment the value before
-    // the get.active.lane mask and use the unmodified tripcount.
-    IncrementValue = CanonicalIVIncrement;
-    TripCount = TC;
-  } else {
-    // When avoiding a runtime check, the active.lane.mask inside the loop
-    // uses a modified trip count and the induction variable increment is
-    // done after the active.lane.mask intrinsic is called.
-    IncrementValue = CanonicalIVPHI;
-    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
-                                     {TC}, DL);
-  }
   auto *EntryIncrement = Builder.createOverflowingOp(
       VPInstruction::CanonicalIVIncrementForPart, {StartV}, {false, false}, DL,
       "index.part.next");
@@ -1251,24 +1229,6 @@
   // preheader ActiveLaneMask instruction.
   auto *LaneMaskPhi = new VPActiveLaneMaskPHIRecipe(EntryALM, DebugLoc());
   LaneMaskPhi->insertAfter(CanonicalIVPHI);
-
-  // Create the active lane mask for the next iteration of the loop before the
-  // original terminator.
-  VPRecipeBase *OriginalTerminator = EB->getTerminator();
-  Builder.setInsertPoint(OriginalTerminator);
-  auto *InLoopIncrement =
-      Builder.createOverflowingOp(VPInstruction::CanonicalIVIncrementForPart,
-                                  {IncrementValue}, {false, false}, DL);
-  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
-                                   {InLoopIncrement, TripCount}, DL,
-                                   "active.lane.mask.next");
-  LaneMaskPhi->addOperand(ALM);
-
-  // Replace the original terminator with BranchOnCond. We have to invert the
-  // mask here because a true condition means jumping to the exit block.
-  auto *NotMask = Builder.createNot(ALM, DL);
-  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
-  OriginalTerminator->eraseFromParent();
   return LaneMaskPhi;
 }

@@ -1334,8 +1294,7 @@ void VPlanTransforms::addActiveLaneMask(
       cast<VPWidenCanonicalIVRecipe>(*FoundWidenCanonicalIVUser);
   VPSingleDefRecipe *LaneMask;
   if (UseActiveLaneMaskForControlFlow) {
-    LaneMask = addVPLaneMaskPhiAndUpdateExitBranch(
-        Plan, DataAndControlFlowWithoutRuntimeCheck);
+    LaneMask = createActiveLaneMaskPhi(Plan);
   } else {
     VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV);
     LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask,
@@ -1451,6 +1410,7 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {

   auto *CanonicalIVPHI = Plan.getCanonicalIV();
   VPValue *StartV = CanonicalIVPHI->getStartValue();
+  VPBasicBlock *Latch = Plan.getVectorLoopRegion()->getExitingBasicBlock();
 
   // Create the ExplicitVectorLengthPhi recipe in the main loop.
   auto *EVLPhi = new VPEVLBasedIVPHIRecipe(StartV, DebugLoc());
@@ -1464,30 +1424,26 @@ bool VPlanTransforms::tryAddExplicitVectorLength(VPlan &Plan) {
       new VPInstruction(VPInstruction::ExplicitVectorLength, AVL, DebugLoc());
   VPEVL->insertAfter(AVL);
 
-  auto *CanonicalIVIncrement =
-      cast<VPInstruction>(CanonicalIVPHI->getBackedgeValue());
   VPSingleDefRecipe *OpVPEVL = VPEVL;
+  VPRecipeBase *LatchTerm = Latch->getTerminator();
   if (unsigned IVSize = CanonicalIVPHI->getScalarType()->getScalarSizeInBits();
       IVSize != 32) {
     OpVPEVL = new VPScalarCastRecipe(IVSize < 32 ? Instruction::Trunc
                                                  : Instruction::ZExt,
                                      OpVPEVL, CanonicalIVPHI->getScalarType());
-    OpVPEVL->insertBefore(CanonicalIVIncrement);
+    OpVPEVL->insertBefore(LatchTerm);
   }
   auto *NextEVLIV =
-      new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi},
-                        {CanonicalIVIncrement->hasNoUnsignedWrap(),
-                         CanonicalIVIncrement->hasNoSignedWrap()},
-                        CanonicalIVIncrement->getDebugLoc(), "index.evl.next");
-  NextEVLIV->insertBefore(CanonicalIVIncrement);
+      new VPInstruction(Instruction::Add, {OpVPEVL, EVLPhi}, {false, false},
Collaborator: Wrap flags initiated to false may later be turned on?

Contributor Author: Yes, at this point we don't know whether it wraps or not. Without #111758, the flags for the EVL increment would always be false, I think, as it always folds the tail. With it, we could update them when introducing the increment (currently not done).

CanonicalIVPHI->getDebugLoc(), "index.evl.next");
NextEVLIV->insertBefore(LatchTerm);
EVLPhi->addOperand(NextEVLIV);

transformRecipestoEVLRecipes(Plan, *VPEVL);

// Replace all uses of VPCanonicalIVPHIRecipe by
// VPEVLBasedIVPHIRecipe except for the canonical IV increment.
// VPEVLBasedIVPHIRecipe.
CanonicalIVPHI->replaceAllUsesWith(EVLPhi);
CanonicalIVIncrement->setOperand(0, CanonicalIVPHI);
// TODO: support unroll factor > 1.
Plan.setUF(1);
return true;
@@ -1664,3 +1620,78 @@ void VPlanTransforms::createInterleaveGroups(
     }
   }
 }
+
+void VPlanTransforms::lowerCanonicalIV(
+    VPlan &Plan, bool HasNUW, bool DataAndControlFlowWithoutRuntimeCheck) {
+  auto *CanIV = Plan.getCanonicalIV();
+
+  VPBasicBlock *EB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
+  auto *Term = EB->getTerminator();
+  VPBuilder Builder(Term);
+  DebugLoc DL = CanIV->getDebugLoc();
+  // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
+  auto *CanonicalIVIncrement =
+      Builder.createOverflowingOp(Instruction::Add, {CanIV, &Plan.getVFxUF()},
+                                  {HasNUW, false}, DL, "index.next");
+
+  CanIV->addOperand(CanonicalIVIncrement);
+
+  auto FoundLaneMaskPhi = find_if(
+      Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis(),
+      [](VPRecipeBase &P) { return isa<VPActiveLaneMaskPHIRecipe>(P); });
+
+  if (FoundLaneMaskPhi ==
+      Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis().end()) {
+
+    // Update the BranchOnCount VPInstruction in the latch to use the increment.
+    // TODO: Should have separate opcodes for separate semantics.
+    Term->setOperand(0, CanonicalIVIncrement);
+    return;
+  }
+
+  // Now introduce a conditional branch to control the loop until the lane mask
+  // is exhausted.
+  auto *LaneMaskPhi = cast<VPActiveLaneMaskPHIRecipe>(&*FoundLaneMaskPhi);
+  auto *VecPreheader =
+      cast<VPBasicBlock>(Plan.getVectorLoopRegion()->getSinglePredecessor());
+  Builder.setInsertPoint(VecPreheader);
+
+  VPValue *TC = Plan.getTripCount();
+
+  // TODO: Check if dropping the flags is needed if
+  // !DataAndControlFlowWithoutRuntimeCheck.
+  CanonicalIVIncrement->dropPoisonGeneratingFlags();
+  VPValue *TripCount, *IncrementValue;
+  if (!DataAndControlFlowWithoutRuntimeCheck) {
+    // When the loop is guarded by a runtime overflow check for the loop
+    // induction variable increment by VF, we can increment the value before
+    // the get.active.lane.mask and use the unmodified tripcount.
+    IncrementValue = CanonicalIVIncrement;
+    TripCount = TC;
+  } else {
+    // When avoiding a runtime check, the active.lane.mask inside the loop
+    // uses a modified trip count and the induction variable increment is
+    // done after the active.lane.mask intrinsic is called.
+    IncrementValue = CanIV;
+    TripCount = Builder.createNaryOp(VPInstruction::CalculateTripCountMinusVF,
+                                     {TC}, DL);
+  }
+  // Create the active lane mask for the next iteration of the loop before the
+  // original terminator.
+  Builder.setInsertPoint(EB);
+  auto *InLoopIncrement = Plan.getUF() > 1
+                              ? Builder.createOverflowingOp(
+                                    VPInstruction::CanonicalIVIncrementForPart,
+                                    {IncrementValue}, {false, false}, DL)
+                              : IncrementValue;
+  auto *ALM = Builder.createNaryOp(VPInstruction::ActiveLaneMask,
+                                   {InLoopIncrement, TripCount}, DL,
+                                   "active.lane.mask.next");
+  LaneMaskPhi->addOperand(ALM);
+
+  // Replace the original terminator with BranchOnCond. We have to invert the
Collaborator: This now introduces the terminator rather than replacing it.

Perhaps a terminal branch-on-count recipe should be introduced along with the abstract canonical IV (it could conceptually check the pre-bumped IV against TC - step), delaying only the introduction of the canonical IV's increment between them for later? The canonical IV would still remain abstract until this increment is added, but the VPlan would continue to be "valid" without updating verify().

Contributor Author: Updated to initially still introduce the branch early. At the moment it still uses the same opcode, but we would probably need to introduce a separate one for the different semantics (or define them conditionally on whether lowering has been finalized).

+  // mask here because a true condition means jumping to the exit block.
+  auto *NotMask = Builder.createNot(ALM, DL);
+  Builder.createNaryOp(VPInstruction::BranchOnCond, {NotMask}, DL);
+  Term->eraseFromParent();
+}
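As a scalar sanity check of the two lane-mask schemes lowerCanonicalIV selects between (a toy model under this sketch's assumptions, not LLVM code): bumping the IV first and testing it against the unmodified trip count yields the same next-iteration mask as testing the un-bumped IV against the trip count clamped down by VF, which is the sub/icmp ugt/select sequence visible in the test diff below.

    #include <cassert>
    #include <cstdint>

    // Lane i of active.lane.mask(Base, N) is set iff Base + i < N.
    static bool laneActive(uint64_t Base, uint64_t Lane, uint64_t N) {
      return Base + Lane < N; // Toy model; values stay far below UINT64_MAX.
    }

    int main() {
      const uint64_t TripCount = 10, VF = 4;
      // CalculateTripCountMinusVF clamps at zero (sub + icmp ugt + select).
      const uint64_t TCMinusVF = TripCount > VF ? TripCount - VF : 0;
      for (uint64_t IV = 0; IV < TripCount; IV += VF)
        for (uint64_t Lane = 0; Lane < VF; ++Lane) {
          // With a runtime overflow check: bump first, unmodified count.
          bool WithCheck = laneActive(IV + VF, Lane, TripCount);
          // Without the runtime check: un-bumped IV, clamped count.
          bool WithoutCheck = laneActive(IV, Lane, TCMinusVF);
          assert(WithCheck == WithoutCheck);
        }
      return 0;
    }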
5 changes: 5 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -121,6 +121,11 @@ struct VPlanTransforms {

   /// Remove dead recipes from \p Plan.
   static void removeDeadRecipes(VPlan &Plan);
+
+  /// Finalize \p Plan by introducing explicit increments for the canonical
+  /// induction.
+  static void lowerCanonicalIV(VPlan &Plan, bool HasNUW,
+                               bool DataAndControlFlowWithoutRuntimeCheck);
 };
 
 } // namespace llvm
2 changes: 1 addition & 1 deletion llvm/lib/Transforms/Vectorize/VPlanUtils.cpp
@@ -91,7 +91,7 @@ bool vputils::isUniformAcrossVFsAndUFs(VPValue *V) {

   auto *CanonicalIV = R->getParent()->getPlan()->getCanonicalIV();
   // Canonical IV chain is uniform.
-  if (V == CanonicalIV || V == CanonicalIV->getBackedgeValue())
+  if (V == CanonicalIV)
     return true;
 
   return TypeSwitch<const VPRecipeBase *, bool>(R)
@@ -141,7 +141,6 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) #0 {
 ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -653,11 +652,11 @@ define void @fadd_strict_interleave(ptr noalias nocapture readonly %a, ptr noali
 ; CHECK-ORDERED-TF-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4
 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 4
 ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = sub i64 [[TMP2]], [[TMP9]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP11:%.*]] = icmp ugt i64 [[TMP2]], [[TMP9]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i64 [[TMP10]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP2]])
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -915,7 +914,6 @@ define float @fadd_of_sum(ptr noalias nocapture readonly %a, ptr noalias nocaptu
 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = sub i64 [[N]], [[TMP7]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = icmp ugt i64 [[N]], [[TMP7]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
@@ -1154,7 +1152,6 @@ define float @fadd_conditional(ptr noalias nocapture readonly %a, ptr noalias no
 ; CHECK-ORDERED-TF-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]]
 ; CHECK-ORDERED-TF-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0
-; CHECK-ORDERED-TF-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]])
 ; CHECK-ORDERED-TF-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-ORDERED-TF: vector.body:
 ; CHECK-ORDERED-TF-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]