[VPlan] Implement interleaving as VPlan-to-VPlan transform.
This patch implements explicit interleaving as a VPlan transform, thus
simplifying VPTransformState (no need to store unrolled parts) as well
as recipe execution (no need to generate code for multiple parts in
each recipe). It also allows for more general optimizations (e.g.
avoiding code generation for recipes that are uniform across parts).
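As a rough sketch of what "interleaving as a VPlan-to-VPlan transform" means
(the Recipe/Plan types and the unrollByUF function below are invented
stand-ins for illustration, not LLVM's VPlan API): each recipe is cloned once
per unrolled part before execution, so a recipe's execute() only ever emits
code for a single part.

// Toy model only; Recipe and Plan are made-up stand-ins, not VPRecipeBase/VPlan.
#include <memory>
#include <vector>

struct Recipe {
  unsigned Part = 0; // which unrolled part this copy of the recipe covers
  void execute() { /* emit IR for exactly one part */ }
};

struct Plan {
  std::vector<std::unique_ptr<Recipe>> Recipes;
};

// Replicate every recipe UF times up front. Execution afterwards never
// loops over parts, and no per-part state needs to be kept around.
void unrollByUF(Plan &P, unsigned UF) {
  std::vector<std::unique_ptr<Recipe>> Unrolled;
  for (auto &R : P.Recipes)
    for (unsigned Part = 0; Part != UF; ++Part) {
      auto Copy = std::make_unique<Recipe>(*R);
      Copy->Part = Part;
      Unrolled.push_back(std::move(Copy));
    }
  P.Recipes = std::move(Unrolled);
}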

In the initial implementation, a number of recipes still take the
unrolled part as an additional, optional argument if their execution
depends on it.

The computation of start/step values for scalable inductions changed
slightly. Previously the step would be computed as a scalar and then
splatted; now vscale is splatted and multiplied by the step in a
vector mul.
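A minimal sketch of the new materialization, assuming an integer induction
and an IRBuilder positioned in the preheader (the helper name is invented;
this is not code from the patch, it only mirrors the description above):

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Sketch: vector step for a scalable induction. Previously (vscale * Step)
// was computed as a scalar and then splatted; now vscale is splatted and
// multiplied by the splatted step in a vector mul.
static Value *emitScalableStepVector(IRBuilder<> &B, ElementCount VF,
                                     Value *Step) {
  Type *Ty = Step->getType();
  Value *VScale = B.CreateVScale(ConstantInt::get(Ty, 1)); // runtime vscale
  Value *VScaleSplat = B.CreateVectorSplat(VF, VScale);    // splat vscale
  Value *StepSplat = B.CreateVectorSplat(VF, Step);        // splat step
  return B.CreateMul(VScaleSplat, StepSplat);              // vector mul
}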

Depends on #93396.
fhahn committed Jun 4, 2024
1 parent abf67b8 commit 737bbfc
Showing 30 changed files with 1,363 additions and 1,056 deletions.
9 changes: 9 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -161,6 +161,15 @@ class VPBuilder {
return tryInsertInstruction(
new VPInstruction(Opcode, Operands, WrapFlags, DL, Name));
}

VPInstruction *createFPOp(unsigned Opcode,
std::initializer_list<VPValue *> Operands,
DebugLoc DL = {}, const Twine &Name = "",
FastMathFlags FMFs = {}) {
auto *Op = new VPInstruction(Opcode, Operands, FMFs, DL, Name);
return tryInsertInstruction(Op);
}

VPValue *createNot(VPValue *Operand, DebugLoc DL = {},
const Twine &Name = "") {
return createInstruction(VPInstruction::Not, {Operand}, DL, Name);
367 changes: 167 additions & 200 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Large diffs are not rendered by default.

170 changes: 84 additions & 86 deletions llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -223,47 +223,47 @@ VPTransformState::VPTransformState(ElementCount VF, unsigned UF, LoopInfo *LI,
LVer(nullptr),
TypeAnalysis(Plan->getCanonicalIV()->getScalarType(), Ctx) {}

Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
Value *VPTransformState::get(VPValue *Def, const VPLane &Lane) {
if (Def->isLiveIn())
return Def->getLiveInIRValue();

if (hasScalarValue(Def, Instance)) {
return Data
.PerPartScalars[Def][Instance.Part][Instance.Lane.mapToCacheIndex(VF)];
if (hasScalarValue(Def, Lane)) {
return Data.Scalars[Def][Lane.mapToCacheIndex(VF)];
}

assert(hasVectorValue(Def, Instance.Part));
auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
assert(hasVectorValue(Def));
auto *VecPart = Data.Output[Def];
if (!VecPart->getType()->isVectorTy()) {
assert(Instance.Lane.isFirstLane() && "cannot get lane > 0 for scalar");
assert(Lane.isFirstLane() && "cannot get lane > 0 for scalar");
return VecPart;
}
// TODO: Cache created scalar values.
Value *Lane = Instance.Lane.getAsRuntimeExpr(Builder, VF);
auto *Extract = Builder.CreateExtractElement(VecPart, Lane);
// set(Def, Extract, Instance);
Value *LaneV = Lane.getAsRuntimeExpr(Builder, VF);
auto *Extract = Builder.CreateExtractElement(VecPart, LaneV);
// set(Def, Extract, Lane);
return Extract;
}

Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) {
if (NeedsScalar) {
assert((VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def, Part) ||
(hasScalarValue(Def, VPIteration(Part, 0)) &&
Data.PerPartScalars[Def][Part].size() == 1)) &&
"Trying to access a single scalar per part but has multiple scalars "
"per part.");
return get(Def, VPIteration(Part, 0));
assert(
(VF.isScalar() || Def->isLiveIn() || hasVectorValue(Def) ||
(hasScalarValue(Def, VPLane(0)) && Data.Scalars[Def].size() == 1)) &&
"Trying to access a single scalar per part but has multiple scalars "
"per part.");
return get(Def, VPLane(0));
}

// If Values have been set for this Def return the one relevant for \p Part.
if (hasVectorValue(Def, Part))
return Data.PerPartOutput[Def][Part];
if (hasVectorValue(Def))
return Data.Output[Def];

auto GetBroadcastInstrs = [this, Def](Value *V) {
bool SafeToHoist = Def->isDefinedOutsideVectorRegions();
if (VF.isScalar())
return V;
// Place the code for broadcasting invariant variables in the new preheader.
// Place the code for broadcasting invariant variables in the new
// preheader.
IRBuilder<>::InsertPointGuard Guard(Builder);
if (SafeToHoist) {
BasicBlock *LoopVectorPreHeader = CFG.VPBB2IRBB[cast<VPBasicBlock>(
@@ -272,56 +272,55 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
}

// Place the code for broadcasting invariant variables in the new preheader.
// Broadcast the scalar into all locations in the vector.
// Place the code for broadcasting invariant variables in the new
// preheader. Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");

return Shuf;
};

if (!hasScalarValue(Def, {Part, 0})) {
if (!hasScalarValue(Def, VPLane(0))) {
assert(Def->isLiveIn() && "expected a live-in");
if (Part != 0)
return get(Def, 0);
Value *IRV = Def->getLiveInIRValue();
Value *B = GetBroadcastInstrs(IRV);
set(Def, B, Part);
set(Def, B);
return B;
}

Value *ScalarValue = get(Def, {Part, 0});
Value *ScalarValue = get(Def, VPLane(0));
// If we aren't vectorizing, we can just copy the scalar map values over
// to the vector map.
if (VF.isScalar()) {
set(Def, ScalarValue, Part);
set(Def, ScalarValue);
return ScalarValue;
}

bool IsUniform = vputils::isUniformAfterVectorization(Def);

unsigned LastLane = IsUniform ? 0 : VF.getKnownMinValue() - 1;
VPLane LastLane = VPLane(IsUniform ? 0 : VF.getKnownMinValue() - 1);
// Check if there is a scalar value for the selected lane.
if (!hasScalarValue(Def, {Part, LastLane})) {
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes and
// VPExpandSCEVRecipes can also be uniform.
if (!hasScalarValue(Def, LastLane)) {
// At the moment, VPWidenIntOrFpInductionRecipes, VPScalarIVStepsRecipes
// and VPExpandSCEVRecipes can also be uniform.
assert((isa<VPWidenIntOrFpInductionRecipe>(Def->getDefiningRecipe()) ||
isa<VPScalarIVStepsRecipe>(Def->getDefiningRecipe()) ||
isa<VPExpandSCEVRecipe>(Def->getDefiningRecipe())) &&
"unexpected recipe found to be invariant");
IsUniform = true;
LastLane = 0;
LastLane = VPLane(0);
}

auto *LastInst = cast<Instruction>(get(Def, {Part, LastLane}));
// Set the insert point after the last scalarized instruction or after the
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
// will directly follow the scalar definitions.
auto OldIP = Builder.saveIP();
auto NewIP =
isa<PHINode>(LastInst)
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
: std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
if (auto *LastInst = dyn_cast<Instruction>(get(Def, LastLane))) {
// Set the insert point after the last scalarized instruction or after the
// last PHI, if LastInst is a PHI. This ensures the insertelement sequence
// will directly follow the scalar definitions.
auto NewIP =
isa<PHINode>(LastInst)
? BasicBlock::iterator(LastInst->getParent()->getFirstNonPHI())
: std::next(BasicBlock::iterator(LastInst));
Builder.SetInsertPoint(&*NewIP);
}

// However, if we are vectorizing, we need to construct the vector values.
// If the value is known to be uniform after vectorization, we can just
@@ -332,15 +331,16 @@ Value *VPTransformState::get(VPValue *Def, unsigned Part, bool NeedsScalar) {
Value *VectorValue = nullptr;
if (IsUniform) {
VectorValue = GetBroadcastInstrs(ScalarValue);
set(Def, VectorValue, Part);
set(Def, VectorValue);
} else {
// Initialize packing with insertelements to start from undef.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Undef = PoisonValue::get(VectorType::get(LastInst->getType(), VF));
set(Def, Undef, Part);
Value *Undef =
PoisonValue::get(VectorType::get(ScalarValue->getType(), VF));
set(Def, Undef);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
packScalarIntoVectorValue(Def, {Part, Lane});
VectorValue = get(Def, Part);
packScalarIntoVectorValue(Def, Lane);
VectorValue = get(Def);
}
Builder.restoreIP(OldIP);
return VectorValue;
@@ -392,12 +392,12 @@ void VPTransformState::setDebugLocFrom(DebugLoc DL) {
}

void VPTransformState::packScalarIntoVectorValue(VPValue *Def,
const VPIteration &Instance) {
Value *ScalarInst = get(Def, Instance);
Value *VectorValue = get(Def, Instance.Part);
VectorValue = Builder.CreateInsertElement(
VectorValue, ScalarInst, Instance.Lane.getAsRuntimeExpr(Builder, VF));
set(Def, VectorValue, Instance.Part);
const VPLane &Lane) {
Value *ScalarInst = get(Def, Lane);
Value *VectorValue = get(Def);
VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
Lane.getAsRuntimeExpr(Builder, VF));
set(Def, VectorValue);
}

BasicBlock *
@@ -453,7 +453,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) {
}

void VPBasicBlock::execute(VPTransformState *State) {
bool Replica = State->Instance && !State->Instance->isFirstIteration();
bool Replica = State->Lane && !State->Lane->isFirstLane();
VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
VPBlockBase *SingleHPred = nullptr;
BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
@@ -724,27 +724,24 @@ void VPRegionBlock::execute(VPTransformState *State) {
return;
}

assert(!State->Instance && "Replicating a Region with non-null instance.");
assert(!State->Lane && "Replicating a Region with non-null instance.");

// Enter replicating mode.
State->Instance = VPIteration(0, 0);

for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
State->Instance->Part = Part;
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
State->Instance->Lane = VPLane(Lane, VPLane::Kind::First);
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
State->Lane = VPLane(0);

assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
State->Lane = VPLane(Lane, VPLane::Kind::First);
// Visit the VPBlocks connected to \p this, starting from it.
for (VPBlockBase *Block : RPOT) {
LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
Block->execute(State);
}
}

// Exit replicating mode.
State->Instance.reset();
State->Lane.reset();
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Expand Down Expand Up @@ -816,10 +813,15 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
// FIXME: Model VF * UF computation completely in VPlan.
VFxUF.setUnderlyingValue(
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
if (VF.getNumUsers() > 0) {
VF.setUnderlyingValue(
createStepForVF(Builder, TripCountV->getType(), State.VF, 1));
}

// When vectorizing the epilogue loop, the canonical induction start value
// needs to be changed from zero to the value after the main vector loop.
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
// FIXME: Improve modeling for canonical IV start values in the epilogue
// loop.
if (CanonicalIVStartValue) {
VPValue *VPV = getOrAddLiveIn(CanonicalIVStartValue);
auto *IV = getCanonicalIV();
@@ -871,12 +873,12 @@ void VPlan::execute(VPTransformState *State) {
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
PHINode *Phi = nullptr;
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
Phi = cast<PHINode>(State->get(R.getVPSingleValue()));
} else {
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
assert(!WidenPhi->onlyScalarsGenerated(State->VF.isScalable()) &&
"recipe generating only scalars should have been replaced");
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi));
Phi = cast<PHINode>(GEP->getPointerOperand());
}

@@ -885,6 +887,9 @@
// Move the last step to the end of the latch block. This ensures
// consistent placement of all induction updates.
Instruction *Inc = cast<Instruction>(Phi->getIncomingValue(1));
if (isa<VPWidenIntOrFpInductionRecipe>(&R) && R.getNumOperands() == 4)
Inc->setOperand(0, State->get(R.getOperand(3)));

Inc->moveBefore(VectorLatchBB->getTerminator()->getPrevNode());
continue;
}
@@ -894,24 +899,13 @@
// only a single part is generated, which provides the last part from the
// previous iteration. For non-ordered reductions all UF parts are
// generated.
bool SinglePartNeeded =
isa<VPCanonicalIVPHIRecipe>(PhiR) ||
isa<VPFirstOrderRecurrencePHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
bool NeedsScalar =
isa<VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe>(PhiR) ||
(isa<VPReductionPHIRecipe>(PhiR) &&
cast<VPReductionPHIRecipe>(PhiR)->isInLoop());
unsigned LastPartForNewPhi = SinglePartNeeded ? 1 : State->UF;

for (unsigned Part = 0; Part < LastPartForNewPhi; ++Part) {
Value *Phi = State->get(PhiR, Part, NeedsScalar);
Value *Val =
State->get(PhiR->getBackedgeValue(),
SinglePartNeeded ? State->UF - 1 : Part, NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}
Value *Phi = State->get(PhiR, NeedsScalar);
Value *Val = State->get(PhiR->getBackedgeValue(), NeedsScalar);
cast<PHINode>(Phi)->addIncoming(Val, VectorLatchBB);
}

State->CFG.DTU.flush();
@@ -1249,6 +1243,10 @@ void VPlanIngredient::print(raw_ostream &O) const {

template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);

bool VPValue::isDefinedOutsideVectorRegions() const {
return !hasDefiningRecipe() || !getDefiningRecipe()->getParent()->getParent();
}

void VPValue::replaceAllUsesWith(VPValue *New) {
replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
}