Skip to content

Commit

Permalink
[VPlan] Remove unneeded State.UF after 8ec4067 (NFC).
Browse files Browse the repository at this point in the history
State.UF is not needed any longer after 8ec4067
(#95842). Clean it up,
simplifying ::execute of existing recipes.
  • Loading branch information
fhahn committed Sep 22, 2024
1 parent 76cffc2 commit 06c3a7d
Show file tree
Hide file tree
Showing 4 changed files with 510 additions and 706 deletions.
30 changes: 6 additions & 24 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7440,7 +7440,7 @@ static void createAndCollectMergePhiForReduction(
const RecurrenceDescriptor &RdxDesc = PhiR->getRecurrenceDescriptor();

Value *FinalValue =
State.get(RedResult, VPIteration(State.UF - 1, VPLane::getFirstLane()));
State.get(RedResult, VPIteration(0, VPLane::getFirstLane()));
auto *ResumePhi =
dyn_cast<PHINode>(PhiR->getStartValue()->getUnderlyingValue());
if (VectorizingEpilogue && RecurrenceDescriptor::isAnyOfRecurrenceKind(
Expand Down Expand Up @@ -9453,24 +9453,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
}

if (IsUniform) {
// If the recipe is uniform across all parts (instead of just per VF), only
// generate a single instance.
if ((isa<LoadInst>(UI) || isa<StoreInst>(UI)) &&
all_of(operands(),
[](VPValue *Op) { return Op->isDefinedOutsideLoopRegions(); })) {
State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
if (user_begin() != user_end()) {
for (unsigned Part = 1; Part < State.UF; ++Part)
State.set(this, State.get(this, VPIteration(0, 0)),
VPIteration(Part, 0));
}
return;
}

// Uniform within VL means we need to generate lane 0 only for each
// unrolled copy.
for (unsigned Part = 0; Part < State.UF; ++Part)
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, 0), State);
// Uniform within VL means we need to generate lane 0.
State.ILV->scalarizeInstruction(UI, this, VPIteration(0, 0), State);
return;
}

Expand All @@ -9479,17 +9463,15 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
if (isa<StoreInst>(UI) &&
vputils::isUniformAfterVectorization(getOperand(1))) {
auto Lane = VPLane::getLastLaneForVF(State.VF);
State.ILV->scalarizeInstruction(UI, this, VPIteration(State.UF - 1, Lane),
State);
State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
return;
}

// Generate scalar instances for all VF lanes of all UF parts.

This comment has been minimized.

Copy link
@ayalz

ayalz Sep 24, 2024

Collaborator

Drop "of all UF parts".
Worth searching for "UF" and "part" throughout documentation.

This comment has been minimized.

Copy link
@fhahn

fhahn Sep 24, 2024

Author Contributor

Removed in 31ac400, some more references have been removed in 3fbf6f8, thanks!

assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
const unsigned EndLane = State.VF.getKnownMinValue();
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(UI, this, VPIteration(Part, Lane), State);
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(UI, this, VPIteration(0, Lane), State);
}

// Determine how to lower the scalar epilogue, which depends on 1) optimising
Expand Down
40 changes: 8 additions & 32 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,7 @@ VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
DominatorTree *DT, IRBuilderBase &Builder,
InnerLoopVectorizer *ILV, VPlan *Plan)
: VF(VF), UF(UF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
: VF(VF), CFG(DT), LI(LI), Builder(Builder), ILV(ILV), Plan(Plan),
LVer(nullptr), TypeAnalysis(Plan->getCanonicalIV()->getScalarType()) {}

Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
Expand Down Expand Up @@ -772,9 +772,6 @@ void VPRegionBlock::execute(VPTransformState *State) {

// Enter replicating mode.
State->Instance = VPIteration(0, 0);

for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
State->Instance->Part = Part;
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");

This comment has been minimized.

Copy link
@ayalz

ayalz Sep 24, 2024

Collaborator

Indentation should be fixed.

This comment has been minimized.

Copy link
@fhahn

fhahn Sep 24, 2024

Author Contributor

Should be fixed as part of 3fbf6f8, thanks!

for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
Expand All @@ -784,7 +781,6 @@ void VPRegionBlock::execute(VPTransformState *State) {
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
}
}

// Exit replicating mode.
Expand Down Expand Up @@ -963,16 +959,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
// FIXME: Model VF * UF computation completely in VPlan.
assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
unsigned UF = getUF();
if (VF.getNumUsers()) {
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);
VF.setUnderlyingValue(RuntimeVF);
VFxUF.setUnderlyingValue(
State.UF > 1
? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, State.UF))
: RuntimeVF);
UF > 1 ? Builder.CreateMul(RuntimeVF, ConstantInt::get(TCTy, UF))
: RuntimeVF);
} else {
VFxUF.setUnderlyingValue(
createStepForVF(Builder, TCTy, State.VF, State.UF));
VFxUF.setUnderlyingValue(createStepForVF(Builder, TCTy, State.VF, UF));
}

// When vectorizing the epilogue loop, the canonical induction start value
Expand Down Expand Up @@ -1019,10 +1014,6 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
// Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
// the VPlan.
// TODO: Remove State::UF and all uses.
State->UF = 1;
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
Expand Down Expand Up @@ -1106,28 +1097,13 @@ void VPlan::execute(VPTransformState *State) {
}

auto *PhiR = cast<VPHeaderPHIRecipe>(&R);
// For canonical IV, first-order recurrences and in-order reduction phis,
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
bool SinglePartNeeded =
isa<VPCanonicalIVPHIRecipe>(PhiR) ||
isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
bool NeedsScalar =
isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;

for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
Value *Phi = State->get(PhiR, Part, NeedsScalar);
Value *Val =
State->get(PhiR->getBackedgeValue(),
SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
Value *Phi = State->get(PhiR, 0, NeedsScalar);
Value *Val = State->get(PhiR->getBackedgeValue(), 0, NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}

State->CFG.DTU.flush();
Expand Down
14 changes: 6 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,7 +256,6 @@ struct VPTransformState {

/// The chosen Vectorization and Unroll Factors of the loop being vectorized.

This comment has been minimized.

Copy link
@ayalz

ayalz Sep 24, 2024

Collaborator

"and Unroll Factors" >> "Factor"

This comment has been minimized.

Copy link
@fhahn

fhahn Sep 24, 2024

Author Contributor

Should be updated in 3fbf6f8 together with some more instances, thanks!

ElementCount VF;
unsigned UF;

/// Hold the indices to generate specific scalar instructions. Null indicates
/// that all instances are to be generated, using either scalar or vector
Expand Down Expand Up @@ -309,7 +308,7 @@ struct VPTransformState {
assert((VF.isScalar() || V->getType()->isVectorTy()) &&
"scalar values must be stored as (Part, 0)");
if (!Data.PerPartOutput.count(Def)) {
DataState::PerPartValuesTy Entry(UF);
DataState::PerPartValuesTy Entry(1);
Data.PerPartOutput[Def] = Entry;
}
Data.PerPartOutput[Def][Part] = V;
Expand Down Expand Up @@ -1306,11 +1305,10 @@ class VPInstruction : public VPRecipeWithIRFlags,
/// needed.
bool canGenerateScalarForFirstLane() const;

/// Utility methods serving execute(): generates a single instance of the
/// modeled instruction for a given part. \returns the generated value for \p
/// Part. In some cases an existing value is returned rather than a generated
/// one.
Value *generatePerPart(VPTransformState &State, unsigned Part);
/// Utility methods serving execute(): generates a single vector instance of
/// the modeled instruction. \returns the generated value. . In some cases an
/// existing value is returned rather than a generated one.
Value *generate(VPTransformState &State);

/// Utility methods serving execute(): generates a scalar single instance of
/// the modeled instruction for a given lane. \returns the scalar generated
Expand Down Expand Up @@ -1616,7 +1614,7 @@ class VPScalarCastRecipe : public VPSingleDefRecipe {

Type *ResultTy;

Value *generate(VPTransformState &State, unsigned Part);
Value *generate(VPTransformState &State);

public:
VPScalarCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy)
Expand Down
Loading

4 comments on commit 06c3a7d

@mikaelholmen
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @fhahn,

There is something funny going on.

Starting with this NFC patch, the following
clang -O2 -ffast-math -g bbi-99425.c
fails with

invalid fpmath accuracy!
  %vec.ind.next = fadd fast <2 x float> %step.add, <float 2.000000e+00, float 2.000000e+00>, !dbg !25, !fpmath !25
clang-20: ../lib/Transforms/Vectorize/LoopVectorize.cpp:10207: bool llvm::LoopVectorizePass::processLoop(llvm::Loop *): Assertion `!verifyFunction(*L->getHeader()->getParent(), &dbgs())' failed.

bbi-99425.c.gz

@mikaelholmen
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Reduced opt reproducer:
opt -passes=loop-vectorize bbi-99425.ll -o /dev/null
Removing the debug info from
%arrayidx.real = phi float [ 0.000000e+00, %entry ], [ %add.r, %for.body ], !dbg !4
makes the failure disappear.
bbi-99425.ll.gz

@mikaelholmen
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The problem is fixed with 040bb37

[VPlan] Fix incorrect argument for CreateBinOp after 06c3a7d2d764.

06c3a7d2d764 incorrectly updated CreateBinOp to pass the debug location,
which gets interpreted as FPMath node. Remove the argument.

@fhahn
Copy link
Contributor Author

@fhahn fhahn commented on 06c3a7d Sep 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mikaelholmen yes that should be the one, thanks for the reproducer

Please sign in to comment.