Skip to content

Commit

Permalink
[VPlan] Implement unrolling as VPlan-to-VPlan transform. (#95842)
Browse files Browse the repository at this point in the history
This patch implements explicit unrolling by UF as a VPlan transform. In
follow-up patches this will allow simplifying VPTransformState (no need
to store unrolled parts) as well as recipe execution (no need to
generate code for multiple parts in each recipe). It also allows for
more general optimizations (e.g. avoid generating code for recipes that
are uniform across parts).

It also unifies the logic dealing with unrolled parts in a single place,
rather than spreading it out across multiple places (e.g. VPlan post
processing for header-phi recipes previously.)

In the initial implementation, a number of recipes still take the
unrolled part as an additional, optional argument, if their execution
depends on the unrolled part.

The computation for start/step values for scalable inductions changed
slightly. Previously the step would be computed as scalar and then
splatted, now vscale gets splatted and multiplied by the step in a
vector mul.

This has been split off from #94339,
which also includes changes to simplify VPTransformState and recipes'
::execute.

The current version mostly leaves existing ::execute untouched and
instead sets VPTransformState::UF to 1.

A follow-up patch will clean up all references to VPTransformState::UF.

Another follow-up patch will simplify VPTransformState to only store a
single vector value per VPValue.

PR: #95842
  • Loading branch information
fhahn authored Sep 21, 2024
1 parent 6032fee commit 8ec4067
Show file tree
Hide file tree
Showing 41 changed files with 1,262 additions and 562 deletions.
1 change: 1 addition & 0 deletions llvm/lib/Transforms/Vectorize/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ add_llvm_component_library(LLVMVectorize
VPlanRecipes.cpp
VPlanSLP.cpp
VPlanTransforms.cpp
VPlanUnroll.cpp
VPlanVerifier.cpp
VPlanUtils.cpp

Expand Down
15 changes: 15 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,15 @@ class VPBuilder {
DebugLoc DL, const Twine &Name = "") {
return createInstruction(Opcode, Operands, DL, Name);
}
/// Create an n-ary VPInstruction with opcode \p Opcode over \p Operands.
/// When \p FMFs holds a value, the instruction is constructed with those
/// fast-math flags and handed to tryInsertInstruction; otherwise creation is
/// delegated to createInstruction.
VPInstruction *createNaryOp(unsigned Opcode,
                            std::initializer_list<VPValue *> Operands,
                            std::optional<FastMathFlags> FMFs = {},
                            DebugLoc DL = {}, const Twine &Name = "") {
  // No flags requested: use the plain creation path.
  if (!FMFs)
    return createInstruction(Opcode, Operands, DL, Name);

  auto *FlaggedVPI = new VPInstruction(Opcode, Operands, *FMFs, DL, Name);
  return tryInsertInstruction(FlaggedVPI);
}

VPInstruction *createOverflowingOp(unsigned Opcode,
std::initializer_list<VPValue *> Operands,
Expand All @@ -164,6 +173,7 @@ class VPBuilder {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
}

VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
Expand Down Expand Up @@ -223,6 +233,11 @@ class VPBuilder {
return tryInsertInstruction(new VPScalarCastRecipe(Opcode, Op, ResultTy));
}

/// Create a VPWidenCastRecipe applying cast \p Opcode to \p Op with result
/// type \p ResultTy and hand it to tryInsertInstruction.
VPWidenCastRecipe *createWidenCast(Instruction::CastOps Opcode, VPValue *Op,
                                   Type *ResultTy) {
  auto *WidenCast = new VPWidenCastRecipe(Opcode, Op, ResultTy);
  return tryInsertInstruction(WidenCast);
}

VPScalarIVStepsRecipe *
createScalarIVSteps(Instruction::BinaryOps InductionOpcode,
FPMathOperator *FPBinOp, VPValue *IV, VPValue *Step) {
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7507,6 +7507,10 @@ LoopVectorizationPlanner::executePlan(
"expanded SCEVs to reuse can only be used during epilogue vectorization");
(void)IsEpilogueVectorization;

// TODO: Move to VPlan transform stage once the transition to the VPlan-based
// cost model is complete for better cost estimates.
VPlanTransforms::unrollByUF(BestVPlan, BestUF,
OrigLoop->getHeader()->getModule()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);

LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
Expand Down Expand Up @@ -7625,7 +7629,7 @@ LoopVectorizationPlanner::executePlan(
if (MiddleTerm->isConditional() &&
hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
// Assume that `Count % VectorTripCount` is equally distributed.
unsigned TripCount = State.UF * State.VF.getKnownMinValue();
unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue();
assert(TripCount > 0 && "trip count should not be zero");
const uint32_t Weights[] = {1, TripCount - 1};
setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false);
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,7 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
->shouldEmitDebugInfoForProfiling() &&
!EnableFSDiscriminator) {
// FIXME: For scalable vectors, assume vscale=1.
unsigned UF = Plan->getUF();
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
if (NewDIL)
Expand Down Expand Up @@ -1018,6 +1019,10 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
// Set UF to 1, as the unrollByUF VPlan transform already explicitly unrolled
// the VPlan.
// TODO: Remove State::UF and all uses.
State->UF = 1;
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
Expand Down Expand Up @@ -1093,6 +1098,10 @@ void VPlan::execute(VPTransformState *State) {
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());

// Use the steps for the last part as backedge value for the induction.
if (auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R))
Inc->setOperand(0, State->get(IV->getLastUnrolledPartOperand(), 0));
continue;
}

Expand Down
84 changes: 76 additions & 8 deletions llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -532,6 +532,7 @@ class VPBlockBase {
VPBlocksTy &getSuccessors() { return Successors; }

iterator_range<VPBlockBase **> successors() { return Successors; }
/// Returns an iterator range over the entries of Predecessors, mirroring
/// successors().
iterator_range<VPBlockBase **> predecessors() { return Predecessors; }

const VPBlocksTy &getPredecessors() const { return Predecessors; }
VPBlocksTy &getPredecessors() { return Predecessors; }
Expand Down Expand Up @@ -724,6 +725,11 @@ class VPLiveOut : public VPUser {

PHINode *getPhi() const { return Phi; }

/// Live-outs are marked as only using the first part during the transition
/// to unrolling directly on VPlan. Unconditionally returns true; \p Op is not
/// inspected.
/// TODO: Remove after unroller transition.
bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the VPLiveOut to \p O.
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
Expand Down Expand Up @@ -1226,11 +1232,24 @@ class VPRecipeWithIRFlags : public VPSingleDefRecipe {
#endif
};

/// Helper to access the operand that contains the unroll part for this recipe
/// after unrolling. Recipe classes use it as an additional base class and
/// pick \p PartOpIdx per recipe type.
/// NOTE(review): \p PartOpIdx is presumably the index of the operand that
/// holds the unroll part; the member definitions are not visible here, so
/// confirm against the out-of-line implementation.
template <unsigned PartOpIdx> class VPUnrollPartAccessor {
protected:
/// Return the VPValue operand containing the unroll part or null if there is
/// no such operand.
VPValue *getUnrollPartOperand(VPUser &U) const;

/// Return the unroll part.
/// NOTE(review): the result when \p U has no unroll-part operand is decided
/// by the out-of-line definition; verify before relying on it.
unsigned getUnrollPart(VPUser &U) const;
};

/// This is a concrete Recipe that models a single VPlan-level instruction.
/// While as any Recipe it may generate a sequence of IR instructions when
/// executed, these instructions would always form a single-def expression as
/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeWithIRFlags {
class VPInstruction : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
friend class VPlanSlp;

public:
Expand Down Expand Up @@ -1764,7 +1783,8 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// A recipe to compute the pointers for widened memory accesses of IndexTy for
/// all parts. If IsReverse is true, compute pointers for accessing the input in
/// reverse order per part.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
Type *IndexedTy;
bool IsReverse;

Expand All @@ -1789,7 +1809,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
/// Returns true for every operand \p Op of this recipe: only the first part
/// of each operand is used. \p Op must be one of the recipe's operands.
bool onlyFirstPartUsed(const VPValue *Op) const override {
  assert(is_contained(operands(), Op) &&
         "Op must be an operand of the recipe");
  // The recipe may carry an optional second operand in addition to the
  // pointer (presumably the unroll part, given the VPUnrollPartAccessor<1>
  // base), so a strict single-operand check would be wrong here.
  assert(getNumOperands() <= 2 && "must have at most two operands");
  return true;
}

Expand Down Expand Up @@ -1948,6 +1968,12 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
VPValue *getVFValue() { return getOperand(2); }
const VPValue *getVFValue() const { return getOperand(2); }

/// Returns operand 3 when the recipe has the 5 operands it carries after
/// unrolling, and nullptr otherwise.
VPValue *getSplatVFValue() {
  // Only an unrolled recipe (5 operands) has the extra VPValue for the
  // induction increment.
  if (getNumOperands() != 5)
    return nullptr;
  return getOperand(3);
}

/// Returns the first defined value as TruncInst, if it is one or nullptr
/// otherwise.
TruncInst *getTruncInst() { return Trunc; }
Expand All @@ -1967,9 +1993,17 @@ class VPWidenIntOrFpInductionRecipe : public VPHeaderPHIRecipe {
Type *getScalarType() const {
return Trunc ? Trunc->getType() : IV->getType();
}

/// Returns the VPValue for the value of this induction at the last unrolled
/// part. That value is stored as operand 4 once the recipe has 5 operands;
/// if unrolling did not take place, the recipe itself is returned.
VPValue *getLastUnrolledPartOperand() {
  if (getNumOperands() == 5)
    return getOperand(4);
  return this;
}
};

class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe,
public VPUnrollPartAccessor<3> {
const InductionDescriptor &IndDesc;

bool IsScalarAfterVectorization;
Expand Down Expand Up @@ -2006,6 +2040,13 @@ class VPWidenPointerInductionRecipe : public VPHeaderPHIRecipe {
/// Returns the induction descriptor for the recipe.
const InductionDescriptor &getInductionDescriptor() const { return IndDesc; }

/// Returns the VPValue for the value of this induction at the first unrolled
/// part: the recipe itself when its own unroll part is 0, otherwise
/// operand 2.
VPValue *getFirstUnrolledPartOperand() {
  if (getUnrollPart(*this) != 0)
    return getOperand(2);
  return this;
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -2088,7 +2129,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
/// A recipe for handling reduction phis. The start value is the first operand
/// of the recipe and the incoming value from the backedge is the second
/// operand.
class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
class VPReductionPHIRecipe : public VPHeaderPHIRecipe,
public VPUnrollPartAccessor<2> {
/// Descriptor for the reduction.
const RecurrenceDescriptor &RdxDesc;

Expand Down Expand Up @@ -2907,7 +2949,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
~VPActiveLaneMaskPHIRecipe() override = default;

/// Create a new VPActiveLaneMaskPHIRecipe with the same debug location and
/// the same operands as this recipe. The caller receives the newly allocated
/// recipe.
VPActiveLaneMaskPHIRecipe *clone() override {
  auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
  // The constructor only takes the first operand; carry over the second one
  // when present so the clone does not silently lose it.
  if (getNumOperands() == 2)
    R->addOperand(getOperand(1));
  return R;
}

VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
Expand Down Expand Up @@ -2966,7 +3011,8 @@ class VPEVLBasedIVPHIRecipe : public VPHeaderPHIRecipe {
};

/// A Recipe for widening the canonical induction variable of the vector loop.
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe,
public VPUnrollPartAccessor<1> {
public:
VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV)
: VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {}
Expand Down Expand Up @@ -3052,7 +3098,8 @@ class VPDerivedIVRecipe : public VPSingleDefRecipe {

/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their scalar values.
class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<2> {
Instruction::BinaryOps InductionOpcode;

public:
Expand Down Expand Up @@ -3548,6 +3595,11 @@ class VPlan {

bool hasUF(unsigned UF) const { return UFs.empty() || UFs.contains(UF); }

/// Returns the unroll factor of this plan. Asserts that exactly one UF is
/// recorded in UFs and returns that entry.
unsigned getUF() const {
assert(UFs.size() == 1 && "Expected a single UF");
return UFs[0];
}

void setUF(unsigned UF) {
assert(hasUF(UF) && "Cannot set the UF not already in plan");
UFs.clear();
Expand Down Expand Up @@ -3732,6 +3784,22 @@ class VPBlockUtils {
connectBlocks(BlockPtr, NewBlock);
}

/// Insert the disconnected block \p NewBlock before \p BlockPtr. \p NewBlock
/// takes the parent of \p BlockPtr. Each predecessor of \p BlockPtr is
/// disconnected from \p BlockPtr and connected to \p NewBlock instead, and
/// finally \p NewBlock is connected as predecessor of \p BlockPtr.
static void insertBlockBefore(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
  assert(NewBlock->getSuccessors().empty() &&
         NewBlock->getPredecessors().empty() &&
         "Can't insert new block with predecessors or successors.");
  NewBlock->setParent(BlockPtr->getParent());

  // Walk a copy of the predecessor list, presumably because disconnecting
  // edges modifies the list owned by BlockPtr.
  auto OldPreds = to_vector(BlockPtr->predecessors());
  for (VPBlockBase *OldPred : OldPreds) {
    disconnectBlocks(OldPred, BlockPtr);
    connectBlocks(OldPred, NewBlock);
  }

  connectBlocks(NewBlock, BlockPtr);
}

/// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
/// BlockPtr. Add \p IfTrue and \p IfFalse as succesors of \p BlockPtr and \p
/// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,10 @@ struct UnaryRecipe_match {
return DefR && match(DefR);
}

bool match(const VPSingleDefRecipe *R) {
return match(static_cast<const VPRecipeBase *>(R));
}

bool match(const VPRecipeBase *R) {
if (!detail::MatchRecipeAndOpcode<Opcode, RecipeTys...>::match(R))
return false;
Expand Down
Loading

0 comments on commit 8ec4067

Please sign in to comment.