From 263080528a0dc74a1e827ea6db6d6d9e9d3bfb30 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Apr 2024 12:18:44 +0100 Subject: [PATCH] [VPlan] Add new VPInstruction ocpode for header mask. This patch adds a new VPInstruction::HeaderMask opcode to model the abstract header-mask used for tail-folding. It will be lowered depending on target preference (either using active-lane-mask, explicit-vector-length or a wide compare of the canonical IV and the backedge taken count) Similarly to https://github.com/llvm/llvm-project/pull/82270, it would be good to clarify/agree on the terminology w.r.t. to recipes/opcodes that cannot be code-gen'd directly (i.e. require further gradual lowering). NOTE: some tests are failing or needed updating, due to widened IVs being replaced by scalar-steps, as their only use was the earlier wide compare. This could be fixed by either adding a suitable wide canonical IV as operand to the header-mask recipe and exactly preserve the original behavior. Alternatively we could keep the current behavior of the patch and update the tests. Or introduce a wide induction PHI instead of VPWidenCanonicalIVReicpe; currently we *only* use a wide IV for VPWidenCanonicalIVRecipe, if there was a suitable IV in the original loop, *even* if the mask compare is the *only* wide use. Either never or always using a wide PHI would be more consistent (or eventually make a more informed cost-based decision). --- .../Transforms/Vectorize/LoopVectorize.cpp | 16 +- llvm/lib/Transforms/Vectorize/VPlan.h | 3 + .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 1 + .../Transforms/Vectorize/VPlanTransforms.cpp | 194 ++++++-------- .../Transforms/Vectorize/VPlanTransforms.h | 4 + .../dont-fold-tail-for-divisible-TC.ll | 39 +-- .../LoopVectorize/reduction-inloop-pred.ll | 243 ++++++++++-------- .../LoopVectorize/reduction-predselect.ll | 65 +++-- .../vplan-sink-scalars-and-merge.ll | 12 +- 9 files changed, 277 insertions(+), 300 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 33c4decd58a6c2..70beb4b1bfd415 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8035,21 +8035,14 @@ void VPRecipeBuilder::createHeaderMask() { return; } - // Introduce the early-exit compare IV <= BTC to form header block mask. - // This is used instead of IV < TC because TC may wrap, unlike BTC. Start by - // constructing the desired canonical IV in the header block as its first - // non-phi instructions. - + // Introduce an abstract header-mask VPInstruction. This will be lowered later + // depending on target preference. VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi(); - auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); - HeaderVPBB->insert(IV, NewInsertionPoint); - VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint); - VPValue *BlockMask = nullptr; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + VPValue *BlockMask = + Builder.createNaryOp(VPInstruction::HeaderMask, {Plan.getCanonicalIV()}); BlockMaskCache[Header] = BlockMask; } @@ -8555,6 +8548,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, // TODO: try to put it close to addActiveLaneMask(). if (CM.foldTailWithEVL()) VPlanTransforms::addExplicitVectorLength(*Plan); + VPlanTransforms::lowerRecipes(*Plan); assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid"); VPlans.push_back(std::move(Plan)); } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index c74329a0bcc4ac..bd472d770803e0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1180,6 +1180,9 @@ class VPInstruction : public VPRecipeWithIRFlags { // operand). Only generates scalar values (either for the first lane only or // for all lanes, depending on its uses). PtrAdd, + // An abstract representation of the vector loops header mask, to be lowered + // later depending on target preference. + HeaderMask, }; private: diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 9ec422ec002c82..87bcea50b21572 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -132,6 +132,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPInstruction::CalculateTripCountMinusVF: case VPInstruction::CanonicalIVIncrementForPart: case VPInstruction::PtrAdd: + case VPInstruction::HeaderMask: return false; default: return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 007dc3f89b3fb9..9c56e4b8d2b21f 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -434,44 +434,6 @@ static void removeRedundantInductionCasts(VPlan &Plan) { } } -/// Try to replace VPWidenCanonicalIVRecipes with a widened canonical IV -/// recipe, if it exists. -static void removeRedundantCanonicalIVs(VPlan &Plan) { - VPCanonicalIVPHIRecipe *CanonicalIV = Plan.getCanonicalIV(); - VPWidenCanonicalIVRecipe *WidenNewIV = nullptr; - for (VPUser *U : CanonicalIV->users()) { - WidenNewIV = dyn_cast(U); - if (WidenNewIV) - break; - } - - if (!WidenNewIV) - return; - - VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *WidenOriginalIV = dyn_cast(&Phi); - - if (!WidenOriginalIV || !WidenOriginalIV->isCanonical() || - WidenOriginalIV->getScalarType() != WidenNewIV->getScalarType()) - continue; - - // Replace WidenNewIV with WidenOriginalIV if WidenOriginalIV provides - // everything WidenNewIV's users need. That is, WidenOriginalIV will - // generate a vector phi or all users of WidenNewIV demand the first lane - // only. - if (any_of(WidenOriginalIV->users(), - [WidenOriginalIV](VPUser *U) { - return !U->usesScalars(WidenOriginalIV); - }) || - vputils::onlyFirstLaneUsed(WidenNewIV)) { - WidenNewIV->replaceAllUsesWith(WidenOriginalIV); - WidenNewIV->eraseFromParent(); - return; - } - } -} - /// Returns true if \p R is dead and can be removed. static bool isDeadRecipe(VPRecipeBase &R) { using namespace llvm::PatternMatch; @@ -1086,7 +1048,6 @@ void VPlanTransforms::truncateToMinimalBitwidths( } void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) { - removeRedundantCanonicalIVs(Plan); removeRedundantInductionCasts(Plan); simplifyRecipes(Plan, SE.getContext()); @@ -1203,52 +1164,32 @@ static VPActiveLaneMaskPHIRecipe *addVPLaneMaskPhiAndUpdateExitBranch( return LaneMaskPhi; } -/// Collect all VPValues representing a header mask through the (ICMP_ULE, -/// WideCanonicalIV, backedge-taken-count) pattern. -/// TODO: Introduce explicit recipe for header-mask instead of searching -/// for the header-mask pattern manually. -static SmallVector collectAllHeaderMasks(VPlan &Plan) { - SmallVector WideCanonicalIVs; - auto *FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa(U); }); - assert(count_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa(U); }) <= - 1 && - "Must have at most one VPWideCanonicalIVRecipe"); - if (FoundWidenCanonicalIVUser != Plan.getCanonicalIV()->users().end()) { - auto *WideCanonicalIV = - cast(*FoundWidenCanonicalIVUser); - WideCanonicalIVs.push_back(WideCanonicalIV); - } - - // Also include VPWidenIntOrFpInductionRecipes that represent a widened - // version of the canonical induction. +/// Return the header mask recipe of the VPlan, if there is one. +static VPInstruction *getHeaderMask(VPlan &Plan) { VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); - for (VPRecipeBase &Phi : HeaderVPBB->phis()) { - auto *WidenOriginalIV = dyn_cast(&Phi); - if (WidenOriginalIV && WidenOriginalIV->isCanonical()) - WideCanonicalIVs.push_back(WidenOriginalIV); - } + auto R = find_if(*HeaderVPBB, [](VPRecipeBase &R) { + using namespace llvm::VPlanPatternMatch; + return match(&R, m_VPInstruction(m_VPValue())); + }); + return R == HeaderVPBB->end() ? nullptr : cast(&*R); +} - // Walk users of wide canonical IVs and collect to all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count). - SmallVector HeaderMasks; - VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); - for (auto *Wide : WideCanonicalIVs) { - for (VPUser *U : SmallVector(Wide->users())) { - auto *HeaderMask = dyn_cast(U); - if (!HeaderMask || HeaderMask->getOpcode() != Instruction::ICmp || - HeaderMask->getPredicate() != CmpInst::ICMP_ULE || - HeaderMask->getOperand(1) != BTC) - continue; +static VPValue *getOrCreateWideCanonicalIV(VPlan &Plan, + VPRecipeBase *InsertPt) { - assert(HeaderMask->getOperand(0) == Wide && - "WidenCanonicalIV must be the first operand of the compare"); - HeaderMasks.push_back(HeaderMask); - } + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + for (VPRecipeBase &R : HeaderVPBB->phis()) { + auto *WideIV = dyn_cast(&R); + if (!WideIV || !WideIV->isCanonical() || + Plan.getCanonicalIV()->getScalarType() != WideIV->getScalarType()) + continue; + return WideIV; + break; } - return HeaderMasks; + + auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV()); + IV->insertBefore(InsertPt); + return IV; } void VPlanTransforms::addActiveLaneMask( @@ -1258,30 +1199,23 @@ void VPlanTransforms::addActiveLaneMask( UseActiveLaneMaskForControlFlow) && "DataAndControlFlowWithoutRuntimeCheck implies " "UseActiveLaneMaskForControlFlow"); - - auto FoundWidenCanonicalIVUser = - find_if(Plan.getCanonicalIV()->users(), - [](VPUser *U) { return isa(U); }); - assert(FoundWidenCanonicalIVUser && - "Must have widened canonical IV when tail folding!"); - auto *WideCanonicalIV = - cast(*FoundWidenCanonicalIVUser); + VPValue *HeaderMask = getHeaderMask(Plan); + assert(HeaderMask && "Active-lane-mask not needed?"); VPSingleDefRecipe *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( Plan, DataAndControlFlowWithoutRuntimeCheck); } else { - VPBuilder B = VPBuilder::getToInsertAfter(WideCanonicalIV); - LaneMask = B.createNaryOp(VPInstruction::ActiveLaneMask, - {WideCanonicalIV, Plan.getTripCount()}, nullptr, - "active.lane.mask"); + VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock(); + VPBuilder B; + B.setInsertPoint(HeaderVPBB, HeaderVPBB->getFirstNonPhi()); + LaneMask = B.createNaryOp( + VPInstruction::ActiveLaneMask, + {getOrCreateWideCanonicalIV(Plan, &*HeaderVPBB->getFirstNonPhi()), + Plan.getTripCount()}, + nullptr, "active.lane.mask"); } - - // Walk users of WideCanonicalIV and replace all compares of the form - // (ICMP_ULE, WideCanonicalIV, backedge-taken-count) with an - // active-lane-mask. - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) - HeaderMask->replaceAllUsesWith(LaneMask); + HeaderMask->replaceAllUsesWith(LaneMask); } /// Add a VPEVLBasedIVPHIRecipe and related recipes to \p Plan and @@ -1307,6 +1241,10 @@ void VPlanTransforms::addActiveLaneMask( /// ... /// void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { + VPValue *HeaderMask = getHeaderMask(Plan); + if (!HeaderMask) + return; + VPBasicBlock *Header = Plan.getVectorLoopRegion()->getEntryBasicBlock(); auto *CanonicalIVPHI = Plan.getCanonicalIV(); VPValue *StartV = CanonicalIVPHI->getStartValue(); @@ -1336,31 +1274,30 @@ void VPlanTransforms::addExplicitVectorLength(VPlan &Plan) { NextEVLIV->insertBefore(CanonicalIVIncrement); EVLPhi->addOperand(NextEVLIV); - for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { - for (VPUser *U : collectUsersRecursively(HeaderMask)) { - auto *MemR = dyn_cast(U); - if (!MemR) - continue; - assert(!MemR->isReverse() && - "Reversed memory operations not supported yet."); - VPValue *OrigMask = MemR->getMask(); - assert(OrigMask && "Unmasked widen memory recipe when folding tail"); - VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask; - if (auto *L = dyn_cast(MemR)) { - auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); - N->insertBefore(L); - L->replaceAllUsesWith(N); - L->eraseFromParent(); - } else if (auto *S = dyn_cast(MemR)) { - auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); - N->insertBefore(S); - S->eraseFromParent(); - } else { - llvm_unreachable("unsupported recipe"); - } + for (VPUser *U : collectUsersRecursively(HeaderMask)) { + auto *MemR = dyn_cast(U); + if (!MemR) + continue; + assert(!MemR->isReverse() && + "Reversed memory operations not supported yet."); + VPValue *OrigMask = MemR->getMask(); + assert(OrigMask && "Unmasked widen memory recipe when folding tail"); + VPValue *NewMask = HeaderMask == OrigMask ? nullptr : OrigMask; + if (auto *L = dyn_cast(MemR)) { + auto *N = new VPWidenLoadEVLRecipe(L, VPEVL, NewMask); + N->insertBefore(L); + L->replaceAllUsesWith(N); + L->eraseFromParent(); + } else if (auto *S = dyn_cast(MemR)) { + auto *N = new VPWidenStoreEVLRecipe(S, VPEVL, NewMask); + N->insertBefore(S); + S->eraseFromParent(); + } else { + llvm_unreachable("unsupported recipe"); } - recursivelyDeleteDeadRecipes(HeaderMask); } + recursivelyDeleteDeadRecipes(HeaderMask); + // Replace all uses of VPCanonicalIVPHIRecipe by // VPEVLBasedIVPHIRecipe except for the canonical IV increment. CanonicalIVPHI->replaceAllUsesWith(EVLPhi); @@ -1465,3 +1402,16 @@ void VPlanTransforms::dropPoisonGeneratingRecipes( } } } + +void VPlanTransforms::lowerRecipes(VPlan &Plan) { + VPInstruction *HeaderMask = getHeaderMask(Plan); + if (!HeaderMask) + return; + + VPValue *IV = getOrCreateWideCanonicalIV(Plan, HeaderMask); + VPBuilder Builder(HeaderMask); + VPValue *BTC = Plan.getOrCreateBackedgeTakenCount(); + VPValue *M = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC); + HeaderMask->replaceAllUsesWith(M); + HeaderMask->eraseFromParent(); +} diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 0cbc70713d9c10..607c43458c1b40 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -105,6 +105,10 @@ struct VPlanTransforms { /// VPCanonicalIVPHIRecipe is only used to control the loop after /// this transformation. static void addExplicitVectorLength(VPlan &Plan); + + /// Lower abstract VPInstruction recipes to a concrete sequence of recipes for + /// which code can be generated. + static void lowerRecipes(VPlan &Plan); }; } // namespace llvm diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll index 76ca2507b914cf..cc015ce465c472 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -37,7 +37,7 @@ define dso_local void @alignTC(ptr noalias nocapture %A, i32 %n) optsize { ; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[ALIGNEDTC]] -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -158,13 +158,15 @@ define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i3 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE8:%.*]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IV]], [[BROADCAST_SPLAT2]] ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: @@ -174,31 +176,30 @@ define dso_local void @cannotProveAlignedTC(ptr noalias nocapture %A, i32 %p, i3 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1 -; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]] -; CHECK: pred.store.if1: +; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] +; CHECK: pred.store.if3: ; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP5]] ; CHECK-NEXT: store i32 13, ptr [[TMP6]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]] -; CHECK: pred.store.continue2: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] +; CHECK: pred.store.continue4: ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]] -; CHECK: pred.store.if3: +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6:%.*]] +; CHECK: pred.store.if5: ; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP8]] ; CHECK-NEXT: store i32 13, ptr [[TMP9]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]] -; CHECK: pred.store.continue4: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] +; CHECK: pred.store.continue6: ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.if5: +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.if7: ; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 3 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] ; CHECK-NEXT: store i32 13, ptr [[TMP12]], align 1 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] -; CHECK: pred.store.continue6: +; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] +; CHECK: pred.store.continue8: ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll index e79f0983a63785..c18c2f27e0f437 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-pred.ll @@ -11,9 +11,11 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP26:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -57,7 +59,6 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP24]]) ; CHECK-NEXT: [[TMP26]] = add i32 [[TMP25]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -96,11 +97,13 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -115,8 +118,8 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 @@ -124,13 +127,13 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 @@ -138,13 +141,13 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 @@ -152,11 +155,11 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP40]]) ; CHECK-NEXT: [[TMP42:%.*]] = add i32 [[TMP41]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> zeroinitializer @@ -166,8 +169,7 @@ define i32 @reduction_sum(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP46]]) ; CHECK-NEXT: [[TMP48]] = add i32 [[TMP47]], [[TMP45]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: @@ -212,9 +214,11 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -261,7 +265,6 @@ define i32 @reduction_sum_const(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP27]]) ; CHECK-NEXT: [[TMP29]] = add i32 [[TMP28]], [[TMP26]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -301,11 +304,13 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1, [[VECTOR_PH]] ], [ [[TMP48:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -320,8 +325,8 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 @@ -329,13 +334,13 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 @@ -343,13 +348,13 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 @@ -357,11 +362,11 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP40:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND]], <4 x i32> ; CHECK-NEXT: [[TMP41:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP40]]) ; CHECK-NEXT: [[TMP42:%.*]] = mul i32 [[TMP41]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP43:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP38]], <4 x i32> @@ -371,8 +376,7 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP46]]) ; CHECK-NEXT: [[TMP48]] = mul i32 [[TMP47]], [[TMP45]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP49:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP49]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -416,11 +420,13 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE8:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[VEC_IND1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT2:%.*]], [[PRED_LOAD_CONTINUE8]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP46:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -435,8 +441,8 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP8:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ poison, [[VECTOR_BODY]] ], [ [[TMP7]], [[PRED_LOAD_IF]] ] ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i64 1 -; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] -; CHECK: pred.load.if3: +; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_LOAD_IF1:%.*]], label [[PRED_LOAD_CONTINUE2:%.*]] +; CHECK: pred.load.if1: ; CHECK-NEXT: [[TMP11:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 @@ -444,13 +450,13 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP16]], i64 1 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] -; CHECK: pred.load.continue4: -; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF3]] ] -; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE2]] +; CHECK: pred.load.continue2: +; CHECK-NEXT: [[TMP18:%.*]] = phi <4 x i32> [ [[TMP8]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP14]], [[PRED_LOAD_IF1]] ] +; CHECK-NEXT: [[TMP19:%.*]] = phi <4 x i32> [ [[TMP9]], [[PRED_LOAD_CONTINUE]] ], [ [[TMP17]], [[PRED_LOAD_IF1]] ] ; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x i1> [[TMP0]], i64 2 -; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6:%.*]] -; CHECK: pred.load.if5: +; CHECK-NEXT: br i1 [[TMP20]], label [[PRED_LOAD_IF3:%.*]], label [[PRED_LOAD_CONTINUE4:%.*]] +; CHECK: pred.load.if3: ; CHECK-NEXT: [[TMP21:%.*]] = or disjoint i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 @@ -458,13 +464,13 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP21]] ; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x i32> [[TMP19]], i32 [[TMP26]], i64 2 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] -; CHECK: pred.load.continue6: -; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP24]], [[PRED_LOAD_IF5]] ] -; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP27]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE4]] +; CHECK: pred.load.continue4: +; CHECK-NEXT: [[TMP28:%.*]] = phi <4 x i32> [ [[TMP18]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP24]], [[PRED_LOAD_IF3]] ] +; CHECK-NEXT: [[TMP29:%.*]] = phi <4 x i32> [ [[TMP19]], [[PRED_LOAD_CONTINUE2]] ], [ [[TMP27]], [[PRED_LOAD_IF3]] ] ; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x i1> [[TMP0]], i64 3 -; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF7:%.*]], label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.if7: +; CHECK-NEXT: br i1 [[TMP30]], label [[PRED_LOAD_IF5:%.*]], label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.if5: ; CHECK-NEXT: [[TMP31:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[TMP32]], align 4 @@ -472,20 +478,19 @@ define i32 @reduction_mix(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP31]] ; CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[TMP35]], align 4 ; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x i32> [[TMP29]], i32 [[TMP36]], i64 3 -; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE8]] -; CHECK: pred.load.continue8: -; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP34]], [[PRED_LOAD_IF7]] ] -; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE6]] ], [ [[TMP37]], [[PRED_LOAD_IF7]] ] +; CHECK-NEXT: br label [[PRED_LOAD_CONTINUE6]] +; CHECK: pred.load.continue6: +; CHECK-NEXT: [[TMP38:%.*]] = phi <4 x i32> [ [[TMP28]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP34]], [[PRED_LOAD_IF5]] ] +; CHECK-NEXT: [[TMP39:%.*]] = phi <4 x i32> [ [[TMP29]], [[PRED_LOAD_CONTINUE4]] ], [ [[TMP37]], [[PRED_LOAD_IF5]] ] ; CHECK-NEXT: [[TMP40:%.*]] = mul nsw <4 x i32> [[TMP39]], [[TMP38]] -; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND1]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[VEC_IND]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = add i32 [[TMP42]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP44:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP44]]) ; CHECK-NEXT: [[TMP46]] = add i32 [[TMP45]], [[TMP43]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], -; CHECK-NEXT: [[VEC_IND_NEXT2]] = add <4 x i32> [[VEC_IND1]], +; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP47]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -530,9 +535,11 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 19, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -595,7 +602,6 @@ define i32 @reduction_mul(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[TMP43]]) ; CHECK-NEXT: [[TMP45]] = mul i32 [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: @@ -638,9 +644,11 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ -1, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -703,7 +711,6 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP43]]) ; CHECK-NEXT: [[TMP45]] = and i32 [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: @@ -746,9 +753,11 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -809,7 +818,6 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP41]]) ; CHECK-NEXT: [[TMP43]] = or i32 [[TMP42]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: @@ -852,9 +860,11 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -915,7 +925,6 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP42:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP41]]) ; CHECK-NEXT: [[TMP43]] = xor i32 [[TMP42]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: @@ -958,9 +967,11 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP43:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1021,7 +1032,6 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP42:%.*]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP39]], <4 x float> zeroinitializer ; CHECK-NEXT: [[TMP43]] = call fast float @llvm.vector.reduce.fadd.v4f32(float [[TMP41]], <4 x float> [[TMP42]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP44:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP44]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: @@ -1064,9 +1074,11 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP45:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1129,7 +1141,6 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP44:%.*]] = call fast float @llvm.vector.reduce.fmul.v4f32(float 1.000000e+00, <4 x float> [[TMP43]]) ; CHECK-NEXT: [[TMP45]] = fmul fast float [[TMP44]], [[TMP42]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP46:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP46]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: @@ -1172,9 +1183,11 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1218,7 +1231,6 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> [[TMP24]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.smin.i32(i32 [[TMP25]], i32 [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: @@ -1259,9 +1271,11 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i64> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 1000, [[VECTOR_PH]] ], [ [[RDX_MINMAX:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i64> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i64> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1305,7 +1319,6 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> [[TMP24]]) ; CHECK-NEXT: [[RDX_MINMAX]] = call i32 @llvm.umax.i32(i32 [[TMP25]], i32 [[VEC_PHI]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: @@ -1438,9 +1451,11 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP30:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = and <4 x i32> [[VEC_PHI]], ; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] @@ -1488,7 +1503,6 @@ define i8 @reduction_add_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP29:%.*]] = zext <4 x i8> [[TMP28]] to <4 x i32> ; CHECK-NEXT: [[TMP30]] = add nuw nsw <4 x i32> [[TMP1]], [[TMP29]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] ; CHECK: middle.block: @@ -1534,9 +1548,11 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP29:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -1583,7 +1599,6 @@ define i8 @reduction_and_trunc(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP28:%.*]] = zext <4 x i8> [[TMP27]] to <4 x i32> ; CHECK-NEXT: [[TMP29]] = and <4 x i32> [[VEC_PHI]], [[TMP28]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP30:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP30]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll index 7fd762c7b735a0..e7f750a9a80433 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-predselect.ll @@ -11,9 +11,11 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -56,7 +58,6 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[TMP24:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP23]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25]] = add <4 x i32> [[VEC_PHI]], [[TMP24]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -65,7 +66,7 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[TMP27]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -203,9 +204,11 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -265,7 +268,6 @@ define i32 @reduction_prod(ptr noalias nocapture %A, ptr noalias nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = mul <4 x i32> [[TMP40]], [[TMP39]] ; CHECK-NEXT: [[TMP42]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP41]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: @@ -308,9 +310,11 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -370,7 +374,6 @@ define i32 @reduction_and(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> ; CHECK-NEXT: [[TMP42]] = and <4 x i32> [[VEC_PHI]], [[TMP41]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: @@ -413,9 +416,11 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -475,7 +480,6 @@ define i32 @reduction_or(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP42]] = or <4 x i32> [[VEC_PHI]], [[TMP41]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: @@ -518,9 +522,11 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -580,7 +586,6 @@ define i32 @reduction_xor(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP40]], <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP42]] = xor <4 x i32> [[VEC_PHI]], [[TMP41]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: @@ -623,9 +628,11 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -685,7 +692,6 @@ define float @reduction_fadd(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = fadd fast <4 x float> [[TMP40]], [[TMP39]] ; CHECK-NEXT: [[TMP42]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP41]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: @@ -728,9 +734,11 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[TMP42:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -790,7 +798,6 @@ define float @reduction_fmul(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP41:%.*]] = fmul fast <4 x float> [[TMP40]], [[TMP39]] ; CHECK-NEXT: [[TMP42]] = select fast <4 x i1> [[TMP0]], <4 x float> [[TMP41]], <4 x float> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP43:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP43]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: @@ -833,9 +840,11 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -878,7 +887,6 @@ define i32 @reduction_min(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP24:%.*]] = call <4 x i32> @llvm.smin.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]) ; CHECK-NEXT: [[TMP25]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: @@ -919,9 +927,11 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_LOAD_CONTINUE6:%.*]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_LOAD_CONTINUE6]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP25:%.*]], [[PRED_LOAD_CONTINUE6]] ] -; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IND]], +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[INDEX]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VEC_IV:%.*]] = or disjoint <4 x i32> [[BROADCAST_SPLAT]], +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult <4 x i32> [[VEC_IV]], ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i64 0 ; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_LOAD_IF:%.*]], label [[PRED_LOAD_CONTINUE:%.*]] ; CHECK: pred.load.if: @@ -964,7 +974,6 @@ define i32 @reduction_max(ptr nocapture %A, ptr nocapture %B) { ; CHECK-NEXT: [[TMP24:%.*]] = call <4 x i32> @llvm.umax.v4i32(<4 x i32> [[VEC_PHI]], <4 x i32> [[TMP23]]) ; CHECK-NEXT: [[TMP25]] = select <4 x i1> [[TMP0]], <4 x i32> [[TMP24]], <4 x i32> [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], 260 ; CHECK-NEXT: br i1 [[TMP26]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll index 108b78a70fa1a9..20e3366b652517 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -26,8 +26,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: EMIT vp<[[WIDEN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK: pred.store: { @@ -759,8 +759,8 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: EMIT vp<[[WIDEN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: Successor(s): pred.store ; CHECK-EMPTY: ; CHECK-NEXT: pred.store: { @@ -826,10 +826,10 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION -; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next, ir<1> ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for> = phi ir<0>, vp<[[PRED:%.+]]> ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule ir<%iv>, vp<[[BTC]]> +; CHECK-NEXT: EMIT vp<[[WIDEN_IV:%.+]]> = WIDEN-CANONICAL-INDUCTION vp<[[CAN_IV]]> +; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = icmp ule vp<[[WIDEN_IV]]>, vp<[[BTC]]> ; CHECK-NEXT: REPLICATE ir<%gep.a> = getelementptr inbounds ir<@a>, ir<0>, vp<[[STEPS]]> ; CHECK-NEXT: Successor(s): pred.load ; CHECK-EMPTY: