Skip to content

Commit

Permalink
[VPlan] Implement interleaving as VPlan-to-VPlan transform.
Browse files Browse the repository at this point in the history
This patch implements explicit interleaving as a VPlan transform. In
follow-up patches this will allow simplifying VPTransformState
(no need to store unrolled parts) as well as recipe execution (no
need to generate code for multiple parts in each recipe). It also
allows for more general optimizations (e.g. avoid generating code for
recipes that are uniform across parts).

In the initial implementation, a number of recipes still take the
unrolled part as an additional, optional argument if their execution
depends on the unrolled part.

The computation for start/step values for scalable inductions changed
slightly. Previously the step would be computed as scalar and then
splatted, now vscale gets splatted and multiplied by the step in a
vector mul.

This has been split off from llvm#94339,
which also includes changes to simplify VPTransformState and recipes'
::execute.

The current version mostly leaves the existing ::execute implementations
untouched and instead sets VPTransformState::UF to 1.
  • Loading branch information
fhahn committed Aug 14, 2024
1 parent 00ab8a6 commit 0c3c293
Show file tree
Hide file tree
Showing 27 changed files with 917 additions and 428 deletions.
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,15 @@ class VPBuilder {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
}

/// Build a VPInstruction for \p Opcode over \p Operands that carries the
/// fast-math flags \p FMFs, debug location \p DL and name \p Name, pass it to
/// tryInsertInstruction, and return whatever that call returns.
VPInstruction *createFPOp(unsigned Opcode,
                          std::initializer_list<VPValue *> Operands,
                          DebugLoc DL = {}, const Twine &Name = "",
                          FastMathFlags FMFs = {}) {
  return tryInsertInstruction(
      new VPInstruction(Opcode, Operands, FMFs, DL, Name));
}

VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
Expand Down
83 changes: 83 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7392,6 +7392,8 @@ LoopVectorizationPlanner::executePlan(
"expanded SCEVs to reuse can only be used during epilogue vectorization");
(void)IsEpilogueVectorization;

VPlanTransforms::interleave(BestVPlan, BestUF,
OrigLoop->getHeader()->getModule()->getContext());
VPlanTransforms::optimizeForVFAndUF(BestVPlan, BestVF, BestUF, PSE);

LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF
Expand Down Expand Up @@ -9228,6 +9230,87 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
VPlanTransforms::clearReductionWrapFlags(*Plan);
}

/// Emit the IR for a widened pointer induction: obtain the pointer phi (create
/// it, or reuse one reachable through operand 3), emit the phi's increment GEP
/// when this instance created the phi, and emit the vector GEP that is
/// recorded as this recipe's value in \p State.
///
/// The number of operands selects the behaviour:
///  - 2 operands: the unroll factor used for the increment is 1.
///  - more than 2 operands: operand 2 is a constant holding the unroll factor.
///  - exactly 5 operands: operand 3 must have been generated as a GEP on the
///    pointer phi, and operand 4 is a constant holding the part index.
void VPWidenPointerInductionRecipe::execute(VPTransformState &State) {
assert(IndDesc.getKind() == InductionDescriptor::IK_PtrInduction &&
"Not a pointer induction according to InductionDescriptor!");
assert(cast<PHINode>(getUnderlyingInstr())->getType()->isPointerTy() &&
"Unexpected type.");
assert(!onlyScalarsGenerated(State.VF.isScalable()) &&
"Recipe should have been replaced");

// The canonical IV phi is only used as the insertion position for a newly
// created pointer phi.
auto *IVR = getParent()->getPlan()->getCanonicalIV();
PHINode *CanonicalIV = cast<PHINode>(State.get(IVR, 0, /*IsScalar*/ true));
// Part index handled by this recipe instance; stays 0 unless a fifth operand
// (a constant live-in) provides it.
unsigned CurrentPart = 0;
if (getNumOperands() == 5)
CurrentPart =
cast<ConstantInt>(getOperand(4)->getLiveInIRValue())->getZExtValue();
// Integer type of the induction step; used for all offset arithmetic below.
Type *PhiType = IndDesc.getStep()->getType();

// Build a pointer phi
Value *ScalarStartValue = getStartValue()->getLiveInIRValue();
Type *ScStValueType = ScalarStartValue->getType();
PHINode *NewPointerPhi = nullptr;

BasicBlock *VectorPH = State.CFG.getPreheaderBBFor(this);
if (getNumOperands() == 5) {
// Reuse the existing phi: operand 3 was generated as a GEP whose pointer
// operand is that phi.
auto *GEP = cast<GetElementPtrInst>(State.get(getOperand(3), 0));
NewPointerPhi = cast<PHINode>(GEP->getPointerOperand());
} else {
// Create the phi next to the canonical IV and add its preheader incoming
// value; the second incoming value is added further down.
NewPointerPhi =
PHINode::Create(ScStValueType, 2, "pointer.phi", CanonicalIV);
NewPointerPhi->addIncoming(ScalarStartValue, VectorPH);
}

// A pointer induction, performed by using a gep
BasicBlock::iterator InductionLoc = State.Builder.GetInsertPoint();
// Unroll factor for the increment: 1 with two operands, otherwise the
// constant stored in operand 2.
unsigned UF = getNumOperands() == 2
? 1
: cast<ConstantInt>(getOperand(2)->getLiveInIRValue())
->getZExtValue();

Value *ScalarStepValue = State.get(getOperand(1), VPIteration(0, 0));
Value *RuntimeVF = getRuntimeVF(State.Builder, PhiType, State.VF);
// RuntimeVF * UF, i.e. the number of elements the phi advances per iteration
// before scaling by the step.
Value *NumUnrolledElems =
State.Builder.CreateMul(RuntimeVF, ConstantInt::get(PhiType, UF));
// Add induction update using an incorrect block temporarily. The phi node
// will be fixed after VPlan execution. Note that at this point the latch
// block cannot be used, as it does not exist yet.
// TODO: Model increment value in VPlan, by turning the recipe into a
// multi-def and a subclass of VPHeaderPHIRecipe.
// Only an instance that created the phi above (operand count != 5) emits the
// increment and the second incoming value.
if (getNumOperands() != 5) {
Value *InductionGEP = GetElementPtrInst::Create(
State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(ScalarStepValue, NumUnrolledElems), "ptr.ind",
InductionLoc);

NewPointerPhi->addIncoming(InductionGEP, VectorPH);
}

// Create UF many actual address geps that use the pointer
// phi as base and a vectorized version of the step value
// (<step*0, ..., step*N>) as offset.
// NOTE(review): the start offset below is computed from CurrentPart, not from
// the loop counter Part. VPlan::execute sets State->UF to 1 in this change, so
// the loop presumably runs exactly once -- confirm before relying on UF > 1.
for (unsigned Part = 0; Part < State.UF; ++Part) {
Type *VecPhiType = VectorType::get(PhiType, State.VF);
// Splat of RuntimeVF * CurrentPart: the first lane's element index.
Value *StartOffsetScalar = State.Builder.CreateMul(
RuntimeVF, ConstantInt::get(PhiType, CurrentPart));
Value *StartOffset =
State.Builder.CreateVectorSplat(State.VF, StartOffsetScalar);
// Create a vector of consecutive numbers from zero to VF.
StartOffset = State.Builder.CreateAdd(
StartOffset, State.Builder.CreateStepVector(VecPhiType));

assert(ScalarStepValue == State.get(getOperand(1), VPIteration(Part, 0)) &&
"scalar step must be the same across all parts");
// i8-typed GEP: the offset vector is (element index * step) per lane.
Value *GEP = State.Builder.CreateGEP(
State.Builder.getInt8Ty(), NewPointerPhi,
State.Builder.CreateMul(
StartOffset,
State.Builder.CreateVectorSplat(State.VF, ScalarStepValue),
"vector.gep"));
State.set(this, GEP, Part);
}
}

void VPDerivedIVRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "VPDerivedIVRecipe being replicated.");

Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -931,6 +931,10 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
// FIXME: Model VF * UF computation completely in VPlan.
VFxUF.setUnderlyingValue(
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
if (VF.getNumUsers() > 0) {
VF.setUnderlyingValue(
createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
}

// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
Expand Down Expand Up @@ -974,6 +978,7 @@ static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
/// Assumes a single pre-header basic-block was created for this. Introduce
/// additional basic-blocks as needed, and fill them all.
void VPlan::execute(VPTransformState *State) {
State->UF = 1;
// Initialize CFG state.
State->CFG.PrevVPBB = nullptr;
State->CFG.ExitBB = State->CFG.PrevBB->getSingleSuccessor();
Expand Down Expand Up @@ -1048,6 +1053,9 @@ void VPlan::execute(VPTransformState *State) {
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
Inc->setOperand(0, State->get(R.getOperand(3), 0));

Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
continue;
}
Expand Down Expand Up @@ -1418,6 +1426,10 @@ void VPlanIngredient::print(raw_ostream &O) const {

template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);

/// Returns true if this value has no defining recipe, or if the parent of the
/// block holding its defining recipe is null (i.e. that block is not nested
/// inside another block).
bool VPValue::isDefinedOutsideVectorRegions() const {
  // Values without a defining recipe trivially qualify.
  if (!hasDefiningRecipe())
    return true;

  // Otherwise inspect the parent of the block that contains the recipe.
  const auto *EnclosingParent = getDefiningRecipe()->getParent()->getParent();
  return !EnclosingParent;
}

/// Replace every use of this value with \p New by delegating to
/// replaceUsesWithIf with a predicate that accepts every (user, index) pair.
void VPValue::replaceAllUsesWith(VPValue *New) {
  auto AcceptEveryUse = [](VPUser &, unsigned) { return true; };
  replaceUsesWithIf(New, AcceptEveryUse);
}
Expand Down
50 changes: 49 additions & 1 deletion llvm/lib/Transforms/Vectorize/VPlan.h
Original file line number Diff line number Diff line change
Expand Up @@ -727,6 +727,8 @@ class VPLiveOut : public VPUser {

PHINode *getPhi() const { return Phi; }

/// Override that unconditionally answers true, regardless of which operand
/// \p Op is queried.
bool onlyFirstPartUsed(const VPValue *Op) const override { return true; }

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the VPLiveOut to \p O.
void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
Expand Down Expand Up @@ -1397,6 +1399,9 @@ class VPInstruction : public VPRecipeWithIRFlags {
/// Returns true if this VPInstruction's operands are single scalars and the
/// result is also a single scalar.
bool isSingleScalar() const;

/// Return the interleave count from the VPInstruction's last argument.
unsigned getInterleaveCount() const;
};

/// VPWidenRecipe is a recipe for producing a widened instruction using the
Expand Down Expand Up @@ -1686,6 +1691,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags {
isInBounds(), getDebugLoc());
}

/// Return the current part for this vector pointer.
unsigned getPartForRecipe() const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -2026,6 +2034,9 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {

/// Returns true, if the phi is part of an in-loop reduction.
bool isInLoop() const { return IsInLoop; }

/// Return the current part for this reduction phi recipe.
unsigned getPartForRecipe() const;
};

/// A recipe for vectorizing a phi-node as a sequence of mask-based select
Expand Down Expand Up @@ -2736,6 +2747,9 @@ class VPCanonicalIVPHIRecipe : public VPHeaderPHIRecipe {
/// Generate the canonical scalar induction phi of the vector loop.
void execute(VPTransformState &State) override;

/// Return the current part for this canonical IV phi recipe.
unsigned getPartForRecipe() const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -2780,7 +2794,9 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe {
~VPActiveLaneMaskPHIRecipe() override = default;

/// Create a copy of this recipe. The copy is constructed from operand 0 and
/// this recipe's debug location; a second operand is forwarded to the copy
/// only when this recipe actually has one.
VPActiveLaneMaskPHIRecipe *clone() override {
  auto *R = new VPActiveLaneMaskPHIRecipe(getOperand(0), getDebugLoc());
  // The constructor call above only passes operand 0. Guard on the operand
  // count so that cloning a recipe whose second operand has not been added
  // yet does not read a non-existent operand.
  if (getNumOperands() == 2)
    R->addOperand(getOperand(1));
  return R;
}

VP_CLASSOF_IMPL(VPDef::VPActiveLaneMaskPHISC)
Expand Down Expand Up @@ -2858,6 +2874,9 @@ class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe {
/// step = <VF*UF, VF*UF, ..., VF*UF>.
void execute(VPTransformState &State) override;

/// Return the current part for this widened canonical IV recipe.
unsigned getPartForRecipe() const;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
Expand Down Expand Up @@ -2970,6 +2989,9 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags {
"Op must be an operand of the recipe");
return true;
}

/// Return the current part for this scalar step.
unsigned getPartForRecipe() const;
};

/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
Expand Down Expand Up @@ -3294,6 +3316,8 @@ class VPlan {
/// Represents the loop-invariant VF * UF of the vector loop region.
VPValue VFxUF;

VPValue VF;

/// Holds a mapping between Values and their corresponding VPValue inside
/// VPlan.
Value2VPValueTy Value2VPValue;
Expand Down Expand Up @@ -3388,6 +3412,7 @@ class VPlan {

/// Returns VF * UF of the vector loop region.
VPValue &getVFxUF() { return VFxUF; }
/// Returns the VPValue stored in the plan's VF member (the counterpart of
/// getVFxUF() without the UF factor).
VPValue &getVF() { return VF; }

void addVF(ElementCount VF) { VFs.insert(VF); }

Expand Down Expand Up @@ -3825,6 +3850,29 @@ inline bool isUniformAfterVectorization(const VPValue *VPV) {

/// Return true if \p V is a header mask in \p Plan.
bool isHeaderMask(const VPValue *V, VPlan &Plan);

/// Checks if \p V is uniform across all VFs and UFs. \p V is considered as
/// such when one of the following holds:
///  - it is defined by a VPInstruction that is the backedge value of the
///    plan's canonical IV (any other VPInstruction is rejected),
///  - it is a VPCanonicalIVPHIRecipe, VPDerivedIVRecipe or VPExpandSCEVRecipe,
///  - it is a uniform VPReplicateRecipe wrapping a load or store whose
///    operands are all defined outside vector regions,
///  - it is a VPScalarCastRecipe that is either defined outside vector regions
///    or casts a VPDerivedIVRecipe / VPCanonicalIVPHIRecipe.
inline bool isUniformAcrossVFsAndUFs(VPValue *V) {
  auto *DefR = V->getDefiningRecipe();

  // VPInstructions: only the canonical IV increment qualifies.
  if (auto *VPI = dyn_cast_or_null<VPInstruction>(DefR)) {
    auto *CanonicalIV = VPI->getParent()->getPlan()->getCanonicalIV();
    return VPI == CanonicalIV->getBackedgeValue();
  }

  // Recipes that are uniform by construction.
  if (isa<VPCanonicalIVPHIRecipe>(V) || isa<VPDerivedIVRecipe>(V) ||
      isa<VPExpandSCEVRecipe>(V))
    return true;

  // Uniform replicated loads/stores fed exclusively from outside the vector
  // regions.
  if (isa<VPReplicateRecipe>(V)) {
    bool IsUniformMemOp = cast<VPReplicateRecipe>(V)->isUniform() &&
                          isa<LoadInst, StoreInst>(V->getUnderlyingValue());
    if (IsUniformMemOp && all_of(DefR->operands(), [](VPValue *Op) {
          return Op->isDefinedOutsideVectorRegions();
        }))
      return true;
  }

  // Scalar casts: uniform if defined outside the vector regions or if they
  // cast one of the uniform IV recipes.
  auto *ScalarCast = dyn_cast_or_null<VPScalarCastRecipe>(DefR);
  if (!ScalarCast)
    return false;
  if (ScalarCast->isDefinedOutsideVectorRegions())
    return true;
  auto *CastSrc = ScalarCast->getOperand(0);
  return isa<VPDerivedIVRecipe>(CastSrc) ||
         isa<VPCanonicalIVPHIRecipe>(CastSrc);
}

} // end namespace vputils

} // end namespace llvm
Expand Down
Loading

0 comments on commit 0c3c293

Please sign in to comment.