From 0169d980c6d44a8cf16019412b95e4ce02b0b105 Mon Sep 17 00:00:00 2001
From: Graham Hunter
Date: Tue, 14 May 2024 13:19:32 +0000
Subject: [PATCH] [LSR] Recognize vscale-relative immediates

---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  | 539 ++++++++++++------
 .../AArch64/vscale-fixups.ll                  |  23 +-
 2 files changed, 389 insertions(+), 173 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index eb1904ccaff352..80f70f3f20a383 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -197,6 +197,14 @@ static cl::opt<bool> AllowDropSolutionIfLessProfitable(
     "lsr-drop-solution", cl::Hidden, cl::init(false),
     cl::desc("Attempt to drop solution if it is less profitable"));
 
+static cl::opt<bool> EnableVScaleImmediates(
+    "lsr-enable-vscale-immediates", cl::Hidden, cl::init(true),
+    cl::desc("Enable analysis of vscale-relative immediates in LSR"));
+
+static cl::opt<bool> DropScaledForVScale(
+    "lsr-drop-scaled-reg-for-vscale", cl::Hidden, cl::init(true),
+    cl::desc("Avoid using scaled registers with vscale-relative addressing"));
+
 STATISTIC(NumTermFold,
           "Number of terminating condition fold recognized and performed");
 
@@ -247,6 +255,68 @@ class RegSortData {
   void dump() const;
 };
 
+// An offset from an address that is either scalable or fixed. Used for
+// per-target optimizations of addressing modes.
+class Immediate : public details::FixedOrScalableQuantity<Immediate, int64_t> {
+  constexpr Immediate(ScalarTy MinVal, bool Scalable)
+      : FixedOrScalableQuantity(MinVal, Scalable) {}
+
+  constexpr Immediate(const FixedOrScalableQuantity<Immediate, int64_t> &V)
+      : FixedOrScalableQuantity(V) {}
+
+public:
+  constexpr Immediate() : FixedOrScalableQuantity() {}
+
+  static constexpr Immediate getFixed(ScalarTy MinVal) {
+    return Immediate(MinVal, false);
+  }
+  static constexpr Immediate getScalable(ScalarTy MinVal) {
+    return Immediate(MinVal, true);
+  }
+  static constexpr Immediate get(ScalarTy MinVal, bool Scalable) {
+    return Immediate(MinVal, Scalable);
+  }
+
+  constexpr bool isLessThanZero() const { return Quantity < 0; }
+
+  constexpr bool isGreaterThanZero() const { return Quantity > 0; }
+
+  constexpr bool isMin() const {
+    return Quantity == std::numeric_limits<int64_t>::min();
+  }
+
+  constexpr bool isMax() const {
+    return Quantity == std::numeric_limits<int64_t>::max();
+  }
+};
+
+// This is needed for the Compare type of std::map when Immediate is used
+// as a key. We don't need it to be fully correct against any value of vscale,
+// just to make sure that vscale-related terms in the map are considered against
+// each other rather than being mixed up and potentially missing opportunities.
+struct KeyOrderTargetImmediate {
+  bool operator()(const Immediate &LHS, const Immediate &RHS) const {
+    if (LHS.isScalable() && !RHS.isScalable())
+      return false;
+    if (!LHS.isScalable() && RHS.isScalable())
+      return true;
+    return LHS.getKnownMinValue() < RHS.getKnownMinValue();
+  }
+};
+
+// This would be nicer if we could be generic instead of directly using size_t,
+// but there doesn't seem to be a type trait for is_orderable or
+// is_lessthan_comparable or similar.
+struct KeyOrderSizeTAndImmediate {
+  bool operator()(const std::pair<size_t, Immediate> &LHS,
+                  const std::pair<size_t, Immediate> &RHS) const {
+    size_t LSize = LHS.first;
+    size_t RSize = RHS.first;
+    if (LSize != RSize)
+      return LSize < RSize;
+    return KeyOrderTargetImmediate()(LHS.second, RHS.second);
+  }
+};
 } // end anonymous namespace
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -357,7 +427,7 @@ struct Formula {
   GlobalValue *BaseGV = nullptr;
 
   /// Base offset for complex addressing.
-  int64_t BaseOffset = 0;
+  Immediate BaseOffset;
 
   /// Whether any complex addressing has a base register.
   bool HasBaseReg = false;
@@ -388,7 +458,7 @@ struct Formula {
   /// An additional constant offset which added near the use. This requires a
   /// temporary register, but the offset itself can live in an add immediate
   /// field rather than a register.
-  int64_t UnfoldedOffset = 0;
+  Immediate UnfoldedOffset;
 
   Formula() = default;
 
@@ -628,7 +698,7 @@ void Formula::print(raw_ostream &OS) const {
     if (!First) OS << " + "; else First = false;
     BaseGV->printAsOperand(OS, /*PrintType=*/false);
   }
-  if (BaseOffset != 0) {
+  if (BaseOffset.isNonZero()) {
     if (!First) OS << " + "; else First = false;
     OS << BaseOffset;
   }
@@ -652,7 +722,7 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
-  if (UnfoldedOffset != 0) {
+  if (UnfoldedOffset.isNonZero()) {
     if (!First) OS << " + ";
     OS << "imm(" << UnfoldedOffset << ')';
   }
@@ -798,28 +868,34 @@ static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
 /// If S involves the addition of a constant integer value, return that integer
 /// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
     if (C->getAPInt().getSignificantBits() <= 64) {
       S = SE.getConstant(C->getType(), 0);
-      return C->getValue()->getSExtValue();
+      return Immediate::getFixed(C->getValue()->getSExtValue());
     }
   } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(Add->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddExpr(NewOps);
     return Result;
   } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
     SmallVector<const SCEV *, 8> NewOps(AR->operands());
-    int64_t Result = ExtractImmediate(NewOps.front(), SE);
-    if (Result != 0)
+    Immediate Result = ExtractImmediate(NewOps.front(), SE);
+    if (Result.isNonZero())
       S = SE.getAddRecExpr(NewOps, AR->getLoop(),
                            // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                            SCEV::FlagAnyWrap);
     return Result;
-  }
-  return 0;
+  } else if (EnableVScaleImmediates)
+    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+      if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+        if (isa<SCEVVScale>(M->getOperand(1))) {
+          S = SE.getConstant(M->getType(), 0);
+          return Immediate::getScalable(C->getValue()->getSExtValue());
+        }
+  return Immediate();
 }
 
 /// If S involves the addition of a GlobalValue address, return that symbol, and
@@ -1134,7 +1210,7 @@ struct LSRFixup {
   /// A constant offset to be added to the LSRUse expression. This allows
   /// multiple fixups to share the same LSRUse with different offsets, for
   /// example in an unrolled loop.
-  int64_t Offset = 0;
+  Immediate Offset;
 
   LSRFixup() = default;
 
@@ -1197,8 +1273,10 @@ class LSRUse {
   SmallVector<LSRFixup, 8> Fixups;
 
   /// Keep track of the min and max offsets of the fixups.
- int64_t MinOffset = std::numeric_limits::max(); - int64_t MaxOffset = std::numeric_limits::min(); + Immediate MinOffset = + Immediate::getFixed(std::numeric_limits::max()); + Immediate MaxOffset = + Immediate::getFixed(std::numeric_limits::min()); /// This records whether all of the fixups using this LSRUse are outside of /// the loop, in which case some special-case heuristics may be used. @@ -1234,9 +1312,9 @@ class LSRUse { void pushFixup(LSRFixup &f) { Fixups.push_back(f); - if (f.Offset > MaxOffset) + if (Immediate::isKnownGT(f.Offset, MaxOffset)) MaxOffset = f.Offset; - if (f.Offset < MinOffset) + if (Immediate::isKnownLT(f.Offset, MinOffset)) MinOffset = f.Offset; } @@ -1254,7 +1332,7 @@ class LSRUse { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, Instruction *Fixup = nullptr); @@ -1310,7 +1388,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg, // addressing. if (AMK == TTI::AMK_PreIndexed) { if (auto *Step = dyn_cast(AR->getStepRecurrence(*SE))) - if (Step->getAPInt() == F.BaseOffset) + if (Step->getAPInt() == F.BaseOffset.getFixedValue()) LoopCost = 0; } else if (AMK == TTI::AMK_PostIndexed) { const SCEV *LoopStep = AR->getStepRecurrence(*SE); @@ -1401,24 +1479,29 @@ void Cost::RateFormula(const Formula &F, // allows to fold 2 registers. C.NumBaseAdds += NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F))); - C.NumBaseAdds += (F.UnfoldedOffset != 0); + C.NumBaseAdds += (F.UnfoldedOffset.isNonZero()); // Accumulate non-free scaling amounts. C.ScaleCost += *getScalingFactorCost(*TTI, LU, F, *L).getValue(); // Tally up the non-zero immediates. for (const LSRFixup &Fixup : LU.Fixups) { - int64_t O = Fixup.Offset; - int64_t Offset = (uint64_t)O + F.BaseOffset; + // FIXME: We probably want to noticeably increase the cost if the + // two offsets differ in scalability? + bool Scalable = Fixup.Offset.isScalable() || F.BaseOffset.isScalable(); + int64_t O = Fixup.Offset.getKnownMinValue(); + Immediate Offset = Immediate::get( + (uint64_t)(O) + F.BaseOffset.getKnownMinValue(), Scalable); if (F.BaseGV) C.ImmCost += 64; // Handle symbolic values conservatively. // TODO: This should probably be the pointer size. - else if (Offset != 0) - C.ImmCost += APInt(64, Offset, true).getSignificantBits(); + else if (Offset.isNonZero()) + C.ImmCost += + APInt(64, Offset.getKnownMinValue(), true).getSignificantBits(); // Check with target if this offset with this instruction is // specifically not supported. 
- if (LU.Kind == LSRUse::Address && Offset != 0 && + if (LU.Kind == LSRUse::Address && Offset.isNonZero() && !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV, Offset, F.HasBaseReg, F.Scale, Fixup.UserInst)) C.NumBaseAdds++; @@ -1546,7 +1629,7 @@ void LSRFixup::print(raw_ostream &OS) const { PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false); } - if (Offset != 0) + if (Offset.isNonZero()) OS << ", Offset=" << Offset; } @@ -1673,14 +1756,19 @@ LLVM_DUMP_METHOD void LSRUse::dump() const { static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale, - Instruction *Fixup/*= nullptr*/) { + Instruction *Fixup /*= nullptr*/) { switch (Kind) { - case LSRUse::Address: - return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset, - HasBaseReg, Scale, AccessTy.AddrSpace, Fixup); - + case LSRUse::Address: { + int64_t FixedOffset = + BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue(); + int64_t ScalableOffset = + BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0; + return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, FixedOffset, + HasBaseReg, Scale, AccessTy.AddrSpace, + Fixup, ScalableOffset); + } case LSRUse::ICmpZero: // There's not even a target hook for querying whether it would be legal to // fold a GV into an ICmp. @@ -1688,7 +1776,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, return false; // ICmp only has two operands; don't allow more than two non-trivial parts. - if (Scale != 0 && HasBaseReg && BaseOffset != 0) + if (Scale != 0 && HasBaseReg && BaseOffset.isNonZero()) return false; // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by @@ -1698,7 +1786,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, // If we have low-level target information, ask the target if it can fold an // integer immediate on an icmp. - if (BaseOffset != 0) { + if (BaseOffset.isNonZero()) { // We have one of: // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset @@ -1706,8 +1794,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, if (Scale == 0) // The cast does the right thing with // std::numeric_limits::min(). - BaseOffset = -(uint64_t)BaseOffset; - return TTI.isLegalICmpImmediate(BaseOffset); + BaseOffset = BaseOffset.getFixed(-(uint64_t)BaseOffset.getFixedValue()); + return TTI.isLegalICmpImmediate(BaseOffset.getFixedValue()); } // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg @@ -1715,30 +1803,35 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, case LSRUse::Basic: // Only handle single-register values. - return !BaseGV && Scale == 0 && BaseOffset == 0; + return !BaseGV && Scale == 0 && BaseOffset.isZero(); case LSRUse::Special: // Special case Basic to handle -1 scales. 
- return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0; + return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset.isZero(); } llvm_unreachable("Invalid LSRUse Kind!"); } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { + if (BaseOffset.isNonZero() && + (BaseOffset.isScalable() != MinOffset.isScalable() || + BaseOffset.isScalable() != MaxOffset.isScalable())) + return false; // Check for overflow. - if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) != - (MinOffset > 0)) + int64_t Base = BaseOffset.getKnownMinValue(); + int64_t Min = MinOffset.getKnownMinValue(); + int64_t Max = MaxOffset.getKnownMinValue(); + if (((int64_t)((uint64_t)Base + Min) > Base) != (Min > 0)) return false; - MinOffset = (uint64_t)BaseOffset + MinOffset; - if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) != - (MaxOffset > 0)) + MinOffset = Immediate::get((uint64_t)Base + Min, MinOffset.isScalable()); + if (((int64_t)((uint64_t)Base + Max) > Base) != (Max > 0)) return false; - MaxOffset = (uint64_t)BaseOffset + MaxOffset; + MaxOffset = Immediate::get((uint64_t)Base + Max, MaxOffset.isScalable()); return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset, HasBaseReg, Scale) && @@ -1747,7 +1840,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, - int64_t MinOffset, int64_t MaxOffset, + Immediate MinOffset, Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F, const Loop &L) { // For the purpose of isAMCompletelyFolded either having a canonical formula @@ -1763,10 +1856,10 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, } /// Test whether we know how to expand the current formula. -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, GlobalValue *BaseGV, - int64_t BaseOffset, bool HasBaseReg, int64_t Scale) { + Immediate BaseOffset, bool HasBaseReg, int64_t Scale) { // We know how to expand completely foldable formulae. return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale) || @@ -1777,13 +1870,21 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, BaseGV, BaseOffset, true, 0)); } -static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, +static bool isLegalUse(const TargetTransformInfo &TTI, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const Formula &F) { return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale); } +static bool isLegalAddImmediate(const TargetTransformInfo &TTI, + Immediate Offset) { + if (Offset.isScalable()) + return TTI.isLegalAddScalableImmediate(Offset.getKnownMinValue()); + + return TTI.isLegalAddImmediate(Offset.getFixedValue()); +} + static bool isAMCompletelyFolded(const TargetTransformInfo &TTI, const LSRUse &LU, const Formula &F) { // Target may want to look at the user instructions. 
@@ -1816,14 +1917,20 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, switch (LU.Kind) { case LSRUse::Address: { // Check the scaling factor cost with both the min and max offsets. + int64_t ScalableMin = 0, ScalableMax = 0, FixedMin = 0, FixedMax = 0; + if (F.BaseOffset.isScalable()) { + ScalableMin = (F.BaseOffset + LU.MinOffset).getKnownMinValue(); + ScalableMax = (F.BaseOffset + LU.MaxOffset).getKnownMinValue(); + } else { + FixedMin = (F.BaseOffset + LU.MinOffset).getFixedValue(); + FixedMax = (F.BaseOffset + LU.MaxOffset).getFixedValue(); + } InstructionCost ScaleCostMinOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, - StackOffset::getFixed(F.BaseOffset + LU.MinOffset), F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMin, ScalableMin), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); InstructionCost ScaleCostMaxOffset = TTI.getScalingFactorCost( - LU.AccessTy.MemTy, F.BaseGV, - StackOffset::getFixed(F.BaseOffset + LU.MaxOffset), F.HasBaseReg, - F.Scale, LU.AccessTy.AddrSpace); + LU.AccessTy.MemTy, F.BaseGV, StackOffset::get(FixedMax, ScalableMax), + F.HasBaseReg, F.Scale, LU.AccessTy.AddrSpace); assert(ScaleCostMinOffset.isValid() && ScaleCostMaxOffset.isValid() && "Legal addressing mode has an illegal cost!"); @@ -1842,10 +1949,11 @@ static InstructionCost getScalingFactorCost(const TargetTransformInfo &TTI, static bool isAlwaysFoldable(const TargetTransformInfo &TTI, LSRUse::KindType Kind, MemAccessTy AccessTy, - GlobalValue *BaseGV, int64_t BaseOffset, + GlobalValue *BaseGV, Immediate BaseOffset, bool HasBaseReg) { // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; // Conservatively, create an address with an immediate and a // base and a scale. @@ -1858,13 +1966,22 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, HasBaseReg = true; } + // FIXME: Try with + without a scale? Maybe based on TTI? + // I think basereg + scaledreg + immediateoffset isn't a good 'conservative' + // default for many architectures, not just AArch64 SVE. More investigation + // needed later to determine if this should be used more widely than just + // on scalable types. + if (HasBaseReg && BaseOffset.isNonZero() && Kind != LSRUse::ICmpZero && + AccessTy.MemTy && AccessTy.MemTy->isScalableTy() && DropScaledForVScale) + Scale = 0; + return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale); } static bool isAlwaysFoldable(const TargetTransformInfo &TTI, - ScalarEvolution &SE, int64_t MinOffset, - int64_t MaxOffset, LSRUse::KindType Kind, + ScalarEvolution &SE, Immediate MinOffset, + Immediate MaxOffset, LSRUse::KindType Kind, MemAccessTy AccessTy, const SCEV *S, bool HasBaseReg) { // Fast-path: zero is always foldable. @@ -1872,14 +1989,18 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI, // Conservatively, create an address with an immediate and a // base and a scale. - int64_t BaseOffset = ExtractImmediate(S, SE); + Immediate BaseOffset = ExtractImmediate(S, SE); GlobalValue *BaseGV = ExtractSymbol(S, SE); // If there's anything else involved, it's not foldable. if (!S->isZero()) return false; // Fast-path: zero is always foldable. - if (BaseOffset == 0 && !BaseGV) return true; + if (BaseOffset.isZero() && !BaseGV) + return true; + + if (BaseOffset.isScalable()) + return false; // Conservatively, create an address with an immediate and a // base and a scale. 
@@ -2028,11 +2149,11 @@ class LSRInstance { using UseMapTy = DenseMap; UseMapTy UseMap; - bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg, + bool reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy); - std::pair getUse(const SCEV *&Expr, LSRUse::KindType Kind, - MemAccessTy AccessTy); + std::pair getUse(const SCEV *&Expr, LSRUse::KindType Kind, + MemAccessTy AccessTy); void DeleteUse(LSRUse &LU, size_t LUIdx); @@ -2058,7 +2179,7 @@ class LSRInstance { void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl &Worklist, + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg = false); void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base); void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base); @@ -2566,11 +2687,11 @@ LSRInstance::OptimizeLoopTermCond() { /// Determine if the given use can accommodate a fixup at the given offset and /// other details. If so, update the use and return true. -bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, +bool LSRInstance::reconcileNewOffset(LSRUse &LU, Immediate NewOffset, bool HasBaseReg, LSRUse::KindType Kind, MemAccessTy AccessTy) { - int64_t NewMinOffset = LU.MinOffset; - int64_t NewMaxOffset = LU.MaxOffset; + Immediate NewMinOffset = LU.MinOffset; + Immediate NewMaxOffset = LU.MaxOffset; MemAccessTy NewAccessTy = AccessTy; // Check for a mismatched kind. It's tempting to collapse mismatched kinds to @@ -2590,18 +2711,25 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, } // Conservatively assume HasBaseReg is true for now. - if (NewOffset < LU.MinOffset) { + if (Immediate::isKnownLT(NewOffset, LU.MinOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, LU.MaxOffset - NewOffset, HasBaseReg)) return false; NewMinOffset = NewOffset; - } else if (NewOffset > LU.MaxOffset) { + } else if (Immediate::isKnownGT(NewOffset, LU.MaxOffset)) { if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr, NewOffset - LU.MinOffset, HasBaseReg)) return false; NewMaxOffset = NewOffset; } + // FIXME: We should be able to handle some level of scalable offset support + // for 'void', but in order to get basic support up and running this is + // being left out. + if (NewAccessTy.MemTy && NewAccessTy.MemTy->isVoidTy() && + (NewMinOffset.isScalable() || NewMaxOffset.isScalable())) + return false; + // Update the use. LU.MinOffset = NewMinOffset; LU.MaxOffset = NewMaxOffset; @@ -2612,17 +2740,17 @@ bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, /// Return an LSRUse index and an offset value for a fixup which needs the given /// expression, with the given kind and optional access type. Either reuse an /// existing use or create a new one, as needed. -std::pair LSRInstance::getUse(const SCEV *&Expr, - LSRUse::KindType Kind, - MemAccessTy AccessTy) { +std::pair LSRInstance::getUse(const SCEV *&Expr, + LSRUse::KindType Kind, + MemAccessTy AccessTy) { const SCEV *Copy = Expr; - int64_t Offset = ExtractImmediate(Expr, SE); + Immediate Offset = ExtractImmediate(Expr, SE); // Basic uses can't accept any offset, for example. 
if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr, Offset, /*HasBaseReg=*/ true)) { Expr = Copy; - Offset = 0; + Offset = Immediate::getFixed(0); } std::pair P = @@ -2683,7 +2811,7 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, F.BaseGV == OrigF.BaseGV && F.Scale == OrigF.Scale && F.UnfoldedOffset == OrigF.UnfoldedOffset) { - if (F.BaseOffset == 0) + if (F.BaseOffset.isZero()) return &LU; // This is the formula where all the registers and symbols matched; // there aren't going to be any others. Since we declined it, we @@ -3174,7 +3302,7 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst, MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand); int64_t IncOffset = IncConst->getValue()->getSExtValue(); if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr, - IncOffset, /*HasBaseReg=*/false)) + Immediate::getFixed(IncOffset), /*HasBaseReg=*/false)) return false; return true; @@ -3380,9 +3508,9 @@ void LSRInstance::CollectFixupsAndInitialFormulae() { } // Get or create an LSRUse. - std::pair P = getUse(S, Kind, AccessTy); + std::pair P = getUse(S, Kind, AccessTy); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; // Record the fixup. @@ -3572,10 +3700,10 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() { continue; } - std::pair P = getUse( - S, LSRUse::Basic, MemAccessTy()); + std::pair P = + getUse(S, LSRUse::Basic, MemAccessTy()); size_t LUIdx = P.first; - int64_t Offset = P.second; + Immediate Offset = P.second; LSRUse &LU = Uses[LUIdx]; LSRFixup &LF = LU.getNewFixup(); LF.UserInst = const_cast(UserInst); @@ -3733,11 +3861,13 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, // Add the remaining pieces of the add back into the new formula. const SCEVConstant *InnerSumSC = dyn_cast(InnerSum); + // FIXME: Scalable immediates!!! if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + InnerSumSC->getValue()->getZExtValue())) { F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + InnerSumSC->getValue()->getZExtValue()); if (IsScaledReg) F.ScaledReg = nullptr; else @@ -3750,10 +3880,11 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx, // Add J as its own register, or an unfolded immediate. const SCEVConstant *SC = dyn_cast(*J); if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 && - TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset + + TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset.getFixedValue() + SC->getValue()->getZExtValue())) F.UnfoldedOffset = - (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue(); + Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() + + SC->getValue()->getZExtValue()); else F.BaseRegs.push_back(*J); // We may have changed the number of register in base regs, adjust the @@ -3794,7 +3925,8 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base) { // This method is only interesting on a plurality of registers. 
if (Base.BaseRegs.size() + (Base.Scale == 1) + - (Base.UnfoldedOffset != 0) <= 1) + (Base.UnfoldedOffset.isNonZero()) <= + 1) return; // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before @@ -3845,9 +3977,9 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx, // registers collected. if (NewBase.UnfoldedOffset) { assert(CombinedIntegerType && "Missing a type for the unfolded offset"); - Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset, - true)); - NewBase.UnfoldedOffset = 0; + Ops.push_back(SE.getConstant(CombinedIntegerType, + NewBase.UnfoldedOffset.getFixedValue(), true)); + NewBase.UnfoldedOffset = Immediate::getFixed(0); GenerateFormula(SE.getAddExpr(Ops)); } } @@ -3887,15 +4019,26 @@ void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, /// Helper function for LSRInstance::GenerateConstantOffsets. void LSRInstance::GenerateConstantOffsetsImpl( LSRUse &LU, unsigned LUIdx, const Formula &Base, - const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { + const SmallVectorImpl &Worklist, size_t Idx, bool IsScaledReg) { - auto GenerateOffset = [&](const SCEV *G, int64_t Offset) { + auto GenerateOffset = [&](const SCEV *G, Immediate Offset) { Formula F = Base; - F.BaseOffset = (uint64_t)Base.BaseOffset - Offset; + if (Base.BaseOffset.isScalable() != Offset.isScalable() && + Base.BaseOffset.isNonZero() && Offset.isNonZero()) + return; + bool Scalable = Base.BaseOffset.isScalable() || Offset.isScalable(); + F.BaseOffset = Immediate::get((uint64_t)Base.BaseOffset.getKnownMinValue() - + Offset.getKnownMinValue(), + Scalable); if (isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) { // Add the offset to the base register. - const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G); + const SCEV *NewOffset = + SE.getConstant(G->getType(), Offset.getKnownMinValue()); + if (Scalable) + NewOffset = + SE.getMulExpr(NewOffset, SE.getVScale(NewOffset->getType())); + const SCEV *NewG = SE.getAddExpr(NewOffset, G); // If it cancelled out, drop the base register, otherwise update it. if (NewG->isZero()) { if (IsScaledReg) { @@ -3931,21 +4074,24 @@ void LSRInstance::GenerateConstantOffsetsImpl( int64_t Step = StepInt.isNegative() ? StepInt.getSExtValue() : StepInt.getZExtValue(); - for (int64_t Offset : Worklist) { - Offset -= Step; + for (Immediate Offset : Worklist) { + Offset = Immediate::getFixed(Offset.getFixedValue() - Step); GenerateOffset(G, Offset); } } } } - for (int64_t Offset : Worklist) + for (Immediate Offset : Worklist) GenerateOffset(G, Offset); - int64_t Imm = ExtractImmediate(G, SE); - if (G->isZero() || Imm == 0) + Immediate Imm = ExtractImmediate(G, SE); + if (G->isZero() || Imm.isZero() || + Base.BaseOffset.isScalable() != Imm.isScalable()) return; Formula F = Base; - F.BaseOffset = (uint64_t)F.BaseOffset + Imm; + F.BaseOffset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Imm.isScalable()); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F)) return; if (IsScaledReg) { @@ -3964,7 +4110,7 @@ void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base) { // TODO: For now, just add the min and max offset, because it usually isn't // worthwhile looking at everything inbetween. 
- SmallVector Worklist; + SmallVector Worklist; Worklist.push_back(LU.MinOffset); if (LU.MaxOffset != LU.MinOffset) Worklist.push_back(LU.MaxOffset); @@ -4004,27 +4150,29 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, if (!ConstantInt::isValueValidForType(IntTy, Factor)) continue; // Check that the multiplication doesn't overflow. - if (Base.BaseOffset == std::numeric_limits::min() && Factor == -1) + if (Base.BaseOffset.isMin() && Factor == -1) continue; - int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor; + Immediate NewBaseOffset = + Immediate::getFixed((uint64_t)Base.BaseOffset.getFixedValue() * Factor); assert(Factor != 0 && "Zero factor not expected!"); - if (NewBaseOffset / Factor != Base.BaseOffset) + if (NewBaseOffset.getFixedValue() / Factor != + Base.BaseOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, NewBaseOffset)) + !ConstantInt::isValueValidForType(IntTy, NewBaseOffset.getFixedValue())) continue; // Check that multiplying with the use offset doesn't overflow. - int64_t Offset = LU.MinOffset; - if (Offset == std::numeric_limits::min() && Factor == -1) + Immediate Offset = LU.MinOffset; + if (Offset.isMin() && Factor == -1) continue; - Offset = (uint64_t)Offset * Factor; - if (Offset / Factor != LU.MinOffset) + Offset = Immediate::getFixed((uint64_t)Offset.getFixedValue() * Factor); + if (Offset.getFixedValue() / Factor != LU.MinOffset.getFixedValue()) continue; // If the offset will be truncated at this use, check that it is in bounds. if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, Offset)) + !ConstantInt::isValueValidForType(IntTy, Offset.getFixedValue())) continue; Formula F = Base; @@ -4035,7 +4183,9 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, continue; // Compensate for the use having MinOffset built into it. - F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset; + F.BaseOffset = Immediate::getFixed((uint64_t)F.BaseOffset.getFixedValue() + + Offset.getFixedValue() - + LU.MinOffset.getFixedValue()); const SCEV *FactorS = SE.getConstant(IntTy, Factor); @@ -4054,16 +4204,17 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, } // Check that multiplying with the unfolded offset doesn't overflow. - if (F.UnfoldedOffset != 0) { - if (F.UnfoldedOffset == std::numeric_limits::min() && - Factor == -1) + if (F.UnfoldedOffset.isNonZero()) { + if (F.UnfoldedOffset.isMin() && Factor == -1) continue; - F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor; - if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset) + F.UnfoldedOffset = Immediate::getFixed( + (uint64_t)F.UnfoldedOffset.getFixedValue() * Factor); + if (F.UnfoldedOffset.getFixedValue() / Factor != + Base.UnfoldedOffset.getFixedValue()) continue; // If the offset will be truncated, check that it is in bounds. - if (!IntTy->isPointerTy() && - !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset)) + if (!IntTy->isPointerTy() && !ConstantInt::isValueValidForType( + IntTy, F.UnfoldedOffset.getFixedValue())) continue; } @@ -4106,8 +4257,8 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) { } // For an ICmpZero, negating a solitary base register won't lead to // new solutions. 
- if (LU.Kind == LSRUse::ICmpZero && - !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV) + if (LU.Kind == LSRUse::ICmpZero && !Base.HasBaseReg && + Base.BaseOffset.isZero() && !Base.BaseGV) continue; // For each addrec base reg, if its loop is current loop, apply the scale. for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { @@ -4233,10 +4384,10 @@ namespace { /// structures moving underneath it. struct WorkItem { size_t LUIdx; - int64_t Imm; + Immediate Imm; const SCEV *OrigReg; - WorkItem(size_t LI, int64_t I, const SCEV *R) + WorkItem(size_t LI, Immediate I, const SCEV *R) : LUIdx(LI), Imm(I), OrigReg(R) {} void print(raw_ostream &OS) const; @@ -4260,14 +4411,14 @@ LLVM_DUMP_METHOD void WorkItem::dump() const { /// opportunities between them. void LSRInstance::GenerateCrossUseConstantOffsets() { // Group the registers by their value without any added constant offset. - using ImmMapTy = std::map; + using ImmMapTy = std::map; DenseMap Map; DenseMap UsedByIndicesMap; SmallVector Sequence; for (const SCEV *Use : RegUses) { const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify. - int64_t Imm = ExtractImmediate(Reg, SE); + Immediate Imm = ExtractImmediate(Reg, SE); auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy())); if (Pair.second) Sequence.push_back(Reg); @@ -4279,7 +4430,8 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // a list of work to do and do the work in a separate step so that we're // not adding formulae and register counts while we're searching. SmallVector WorkItems; - SmallSet, 32> UniqueItems; + SmallSet, 32, KeyOrderSizeTAndImmediate> + UniqueItems; for (const SCEV *Reg : Sequence) { const ImmMapTy &Imms = Map.find(Reg)->second; @@ -4298,7 +4450,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { J != JE; ++J) { const SCEV *OrigReg = J->second; - int64_t JImm = J->first; + Immediate JImm = J->first; const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg); if (!isa(OrigReg) && @@ -4310,22 +4462,39 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // Conservatively examine offsets between this orig reg a few selected // other orig regs. - int64_t First = Imms.begin()->first; - int64_t Last = std::prev(Imms.end())->first; + Immediate First = Imms.begin()->first; + Immediate Last = std::prev(Imms.end())->first; + if (First.isScalable() != Last.isScalable() && First.isNonZero() && + Last.isNonZero()) { + LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg + << "\n"); + continue; + } + // Only scalable if both terms are scalable, or if one is scalable and + // the other is 0. + bool Scalable = First.isScalable() || Last.isScalable(); + int64_t FI = First.getKnownMinValue(); + int64_t LI = Last.getKnownMinValue(); // Compute (First + Last) / 2 without overflow using the fact that // First + Last = 2 * (First + Last) + (First ^ Last). - int64_t Avg = (First & Last) + ((First ^ Last) >> 1); - // If the result is negative and First is odd and Last even (or vice versa), + int64_t Avg = (FI & LI) + ((FI ^ LI) >> 1); + // If the result is negative and FI is odd and LI even (or vice versa), // we rounded towards -inf. Add 1 in that case, to round towards 0. 
- Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63)); + Avg = Avg + ((FI ^ LI) & ((uint64_t)Avg >> 63)); ImmMapTy::const_iterator OtherImms[] = { Imms.begin(), std::prev(Imms.end()), - Imms.lower_bound(Avg)}; + Imms.lower_bound(Immediate::get(Avg, Scalable))}; for (const auto &M : OtherImms) { if (M == J || M == JE) continue; + if (JImm.isScalable() != M->first.isScalable() && JImm.isNonZero() && + M->first.isNonZero()) + continue; // Compute the difference between the two. - int64_t Imm = (uint64_t)JImm - M->first; + bool Scalable = JImm.isScalable() || M->first.isScalable(); + Immediate Imm = Immediate::get((uint64_t)JImm.getKnownMinValue() - + M->first.getKnownMinValue(), + Scalable); for (unsigned LUIdx : UsedByIndices.set_bits()) // Make a memo of this use, offset, and register tuple. if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second) @@ -4343,11 +4512,14 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { for (const WorkItem &WI : WorkItems) { size_t LUIdx = WI.LUIdx; LSRUse &LU = Uses[LUIdx]; - int64_t Imm = WI.Imm; + Immediate Imm = WI.Imm; const SCEV *OrigReg = WI.OrigReg; Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); - const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm)); + const SCEV *NegImmS = + SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm.getKnownMinValue())); + if (Imm.isScalable()) + NegImmS = SE.getMulExpr(NegImmS, SE.getVScale(NegImmS->getType())); unsigned BitWidth = SE.getTypeSizeInBits(IntTy); // TODO: Use a more targeted data structure. @@ -4360,10 +4532,20 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { F.unscale(); // Use the immediate in the scaled register. if (F.ScaledReg == OrigReg) { - int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale; + if (F.BaseOffset.isScalable() != Imm.isScalable() && + F.BaseOffset.isNonZero() && Imm.isNonZero()) + continue; + bool Scalable = F.BaseOffset.isScalable() || Imm.isScalable(); + Immediate Offset = + Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue() * (uint64_t)F.Scale, + Scalable); // Don't create 50 + reg(-50). - if (F.referencesReg(SE.getSCEV( - ConstantInt::get(IntTy, -(uint64_t)Offset)))) + const SCEV *S = SE.getSCEV( + ConstantInt::get(IntTy, -(uint64_t)Offset.getKnownMinValue())); + if (Scalable) + S = SE.getMulExpr(S, SE.getVScale(S->getType())); + if (F.referencesReg(S)) continue; Formula NewF = F; NewF.BaseOffset = Offset; @@ -4376,9 +4558,10 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // value to the immediate would produce a value closer to zero than the // immediate itself, then the formula isn't worthwhile. if (const SCEVConstant *C = dyn_cast(NewF.ScaledReg)) - if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) && + if (C->getValue()->isNegative() != + (NewF.BaseOffset.isLessThanZero()) && (C->getAPInt().abs() * APInt(BitWidth, F.Scale)) - .ule(std::abs(NewF.BaseOffset))) + .ule(std::abs(NewF.BaseOffset.getFixedValue()))) continue; // OK, looks good. 
@@ -4391,16 +4574,27 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { if (BaseReg != OrigReg) continue; Formula NewF = F; - NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm; + if (NewF.BaseOffset.isScalable() != Imm.isScalable() && + NewF.BaseOffset.isNonZero() && Imm.isNonZero()) + continue; + bool Scalable = NewF.BaseOffset.isScalable() || Imm.isScalable(); + NewF.BaseOffset = + Immediate::get((uint64_t)NewF.BaseOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Scalable); if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, NewF)) { if (AMK == TTI::AMK_PostIndexed && mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE)) continue; - if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm)) + Immediate NewUnfoldedOffset = Immediate::get( + (uint64_t)NewF.UnfoldedOffset.getKnownMinValue() + + Imm.getKnownMinValue(), + Scalable); + if (!isLegalAddImmediate(TTI, NewUnfoldedOffset)) continue; NewF = F; - NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm; + NewF.UnfoldedOffset = NewUnfoldedOffset; } NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg); @@ -4409,11 +4603,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() { // zero than the immediate itself, then the formula isn't worthwhile. for (const SCEV *NewReg : NewF.BaseRegs) if (const SCEVConstant *C = dyn_cast(NewReg)) - if ((C->getAPInt() + NewF.BaseOffset) + if ((C->getAPInt() + NewF.BaseOffset.getFixedValue()) .abs() - .slt(std::abs(NewF.BaseOffset)) && - (C->getAPInt() + NewF.BaseOffset).countr_zero() >= - (unsigned)llvm::countr_zero(NewF.BaseOffset)) + .slt(std::abs(NewF.BaseOffset.getFixedValue())) && + (C->getAPInt() + NewF.BaseOffset.getFixedValue()) + .countr_zero() >= + (unsigned)llvm::countr_zero( + NewF.BaseOffset.getFixedValue())) goto skip_formula; // Ok, looks good. @@ -4607,7 +4803,9 @@ void LSRInstance::NarrowSearchSpaceByDetectingSupersets() { Formula NewF = F; //FIXME: Formulas should store bitwidth to do wrapping properly. // See PR41034. - NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue(); + NewF.BaseOffset = + Immediate::getFixed(NewF.BaseOffset.getFixedValue() + + (uint64_t)C->getValue()->getSExtValue()); NewF.BaseRegs.erase(NewF.BaseRegs.begin() + (I - F.BaseRegs.begin())); if (LU.HasFormulaWithSameRegs(NewF)) { @@ -4663,7 +4861,7 @@ void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() { for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { LSRUse &LU = Uses[LUIdx]; for (const Formula &F : LU.Formulae) { - if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1)) + if (F.BaseOffset.isZero() || (F.Scale != 0 && F.Scale != 1)) continue; LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU); @@ -5488,31 +5686,48 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, Ops.push_back(SE.getUnknown(FullV)); } + // FIXME: Are we sure we won't get a mismatch here? Is there a way to bail + // out at this point, or should we generate a SCEV adding together mixed + // offsets? + assert((F.BaseOffset.isScalable() == LF.Offset.isScalable() || + F.BaseOffset.isZero() || LF.Offset.isZero()) && + "Expanding mismatched offsets\n"); + bool Scalable = F.BaseOffset.isScalable() || LF.Offset.isScalable(); // Expand the immediate portion. 
- int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset; - if (Offset != 0) { + Immediate Offset = Immediate::get((uint64_t)F.BaseOffset.getKnownMinValue() + + LF.Offset.getKnownMinValue(), + Scalable); + if (Offset.isNonZero()) { if (LU.Kind == LSRUse::ICmpZero) { // The other interesting way of "folding" with an ICmpZero is to use a // negated immediate. if (!ICmpScaledV) - ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset); + ICmpScaledV = + ConstantInt::get(IntTy, -(uint64_t)Offset.getFixedValue()); else { Ops.push_back(SE.getUnknown(ICmpScaledV)); - ICmpScaledV = ConstantInt::get(IntTy, Offset); + ICmpScaledV = ConstantInt::get(IntTy, Offset.getFixedValue()); } } else { // Just add the immediate values. These again are expected to be matched // as part of the address. - Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset))); + const SCEV *SU = SE.getUnknown( + ConstantInt::getSigned(IntTy, Offset.getKnownMinValue())); + if (Scalable) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); + Ops.push_back(SU); } } // Expand the unfolded offset portion. - int64_t UnfoldedOffset = F.UnfoldedOffset; - if (UnfoldedOffset != 0) { + Immediate UnfoldedOffset = F.UnfoldedOffset; + if (UnfoldedOffset.isNonZero()) { + const SCEV *SU = SE.getUnknown( + ConstantInt::getSigned(IntTy, UnfoldedOffset.getKnownMinValue())); + if (UnfoldedOffset.isScalable()) + SU = SE.getMulExpr(SU, SE.getVScale(SU->getType())); // Just add the immediate values. - Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, - UnfoldedOffset))); + Ops.push_back(SU); } // Emit instructions summing all the operands. @@ -5548,7 +5763,7 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF, "ICmp does not support folding a global value and " "a scale at the same time!"); Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), - -(uint64_t)Offset); + -(uint64_t)Offset.getFixedValue()); if (C->getType() != OpTy) { C = ConstantFoldCastOperand( CastInst::getCastOpcode(C, false, OpTy, false), C, OpTy, diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll index 4652d0d83919c8..de640fa2516a60 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll @@ -10,15 +10,16 @@ define void @mulvl123_addressing(ptr %src, ptr %dst, i64 %count) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul nuw nsw i64 [[VSCALE]], 48 ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[VSCALE]], 80 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[VSCALE]], 5 +; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[VSCALE]], 48 +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[VSCALE]], 5 +; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[VSCALE]], 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[FOR_BODY]] ], [ [[SRC]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[LSR_IV]], align 16 -; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP0]] +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = load , ptr [[SCEVGEP3]], align 16 ; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[LSR_IV]], i64 [[TMP3]] ; 
CHECK-NEXT: [[TMP7:%.*]] = load , ptr [[SCEVGEP2]], align 16 @@ -73,12 +74,9 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i ; CHECK-NEXT: entry: ; CHECK-NEXT: [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[MUL:%.*]] = shl i64 [[VSCALE]], 5 -; CHECK-NEXT: [[TMP0:%.*]] = shl nuw nsw i64 [[VSCALE]], 4 ; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[STRIDE]] -; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[TMP0]] -; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[VSCALE]], 3 -; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[VSCALE]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[VSCALE]], 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[LSR_IV10:%.*]] = phi i64 [ [[LSR_IV_NEXT11:%.*]], [[FOR_BODY]] ], [ [[COUNT]], [[ENTRY:%.*]] ] @@ -86,11 +84,13 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i ; CHECK-NEXT: [[SCEVGEP6:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[LSR_IV]] ; CHECK-NEXT: [[SCEVGEP9:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[LSR_IV]] ; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[SCEVGEP6]], align 16 -; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[SRC_ROWS]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = load , ptr [[SCEVGEP5]], align 16 ; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]] ; CHECK-NEXT: [[TMP4:%.*]] = load , ptr [[SCEVGEP3]], align 16 -; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP5:%.*]] = load , ptr [[SCEVGEP2]], align 16 ; CHECK-NEXT: [[TMP6:%.*]] = add [[TMP2]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = add [[TMP3]], [[TMP5]] @@ -98,7 +98,8 @@ define void @many_mulvl1_addressing(ptr %src_rows, ptr %dst_rows, i64 %stride, i ; CHECK-NEXT: [[TMP9:%.*]] = trunc [[TMP8]] to ; CHECK-NEXT: store [[TMP9]], ptr [[SCEVGEP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = bitcast [[TMP7]] to -; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP7:%.*]] = getelementptr i8, ptr [[DST_ROWS]], i64 [[LSR_IV]] +; CHECK-NEXT: [[SCEVGEP8:%.*]] = getelementptr i8, ptr [[SCEVGEP7]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP11:%.*]] = trunc [[TMP10]] to ; CHECK-NEXT: store [[TMP11]], ptr [[SCEVGEP8]], align 8 ; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], [[MUL]]
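
Note (outside the patch itself): the comparators introduced above only need to keep fixed and vscale-relative immediates from being interleaved, not to be correct for every runtime value of vscale. A minimal standalone sketch of that ordering follows, using a simplified `Imm` struct and `KeyOrder` comparator as stand-ins for the patch's `Immediate` and `KeyOrderTargetImmediate`, not LLVM's actual types:

#include <cstdint>
#include <cstdio>
#include <map>

// Simplified stand-in for the patch's Immediate: a known-minimum value plus a
// flag saying whether the value is multiplied by vscale at runtime.
struct Imm {
  int64_t MinVal;
  bool Scalable;
};

// Mirrors KeyOrderTargetImmediate: every fixed offset orders before every
// scalable one; within each group, order by the known-minimum value.
struct KeyOrder {
  bool operator()(const Imm &L, const Imm &R) const {
    if (L.Scalable != R.Scalable)
      return !L.Scalable;
    return L.MinVal < R.MinVal;
  }
};

int main() {
  std::map<Imm, const char *, KeyOrder> Imms;
  Imms[{16, false}] = "fixed 16";
  Imms[{-32, true}] = "-32 x vscale";
  Imms[{16, true}] = "16 x vscale";
  Imms[{0, false}] = "fixed 0";

  // Iteration visits all fixed offsets first, then all scalable ones, so
  // cross-use reuse only ever compares vscale-relative terms against each
  // other (or against zero).
  for (const auto &[Key, Name] : Imms)
    std::printf("%s%lld -> %s\n", Key.Scalable ? "vscale x " : "",
                (long long)Key.MinVal, Name);
  return 0;
}

This grouping is what lets GenerateCrossUseConstantOffsets take `Imms.begin()` and `std::prev(Imms.end())` and simply skip the register when the two ends differ in scalability, as the added debug message shows.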
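Similarly, a sketch of the fixed/scalable split performed in the LSRUse::Address case of isAMCompletelyFolded before calling TTI.isLegalAddressingMode(..., Fixup, ScalableOffset). The `Imm` struct and the `splitForAddressingMode` helper are hypothetical names used for illustration only; they are not part of the patch or of the TTI API:

#include <cstdint>
#include <cstdio>
#include <utility>

// Simplified stand-in for the patch's Immediate class.
struct Imm {
  int64_t MinVal;
  bool Scalable;
  bool isScalable() const { return Scalable; }
  int64_t getKnownMinValue() const { return MinVal; }
  int64_t getFixedValue() const { return MinVal; } // only meaningful if !Scalable
};

// Hypothetical helper showing the split: the fixed part goes into the usual
// BaseOffset argument, while the scalable part is passed separately as
// ScalableOffset so the target can check it against its vscale-scaled
// addressing forms.
static std::pair<int64_t, int64_t> splitForAddressingMode(Imm BaseOffset) {
  int64_t FixedOffset = BaseOffset.isScalable() ? 0 : BaseOffset.getFixedValue();
  int64_t ScalableOffset =
      BaseOffset.isScalable() ? BaseOffset.getKnownMinValue() : 0;
  return {FixedOffset, ScalableOffset};
}

int main() {
  auto [FixedA, ScalableA] = splitForAddressingMode({16, /*Scalable=*/true});
  auto [FixedB, ScalableB] = splitForAddressingMode({16, /*Scalable=*/false});
  std::printf("16 x vscale -> fixed %lld, scalable %lld\n", (long long)FixedA,
              (long long)ScalableA);
  std::printf("fixed 16    -> fixed %lld, scalable %lld\n", (long long)FixedB,
              (long long)ScalableB);
  return 0;
}

Keeping the scalable part out of the fixed BaseOffset is what allows an SVE target to treat a 16*vscale-byte displacement as a VL-scaled immediate (the `mul vl` forms exercised by the test above) rather than rejecting it as an out-of-range fixed offset.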