diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 86eb259c090152..9cc6d9b9fa715f 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -223,8 +223,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                   BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                   SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
+  // Iterate over the RegMask raw to avoid constructing a BitVector, which is
+  // expensive as it implies dynamically allocating memory.
+  //
+  // We also stop early once the bit index goes past the last register.
+  const unsigned NumRegs = TRI.getNumRegs();
+  const unsigned MaskWords = (NumRegs + 31) / 32;
+  for (unsigned K = 0; K < MaskWords; ++K) {
+    // We want to set the bits that aren't in RegMask, so flip it.
+    uint32_t Word = ~Mask[K];
+
+    // Iterate all set bits, starting from the right.
+    while (Word) {
+      const unsigned SetBitIdx = countr_zero(Word);
+
+      // The bits are numbered from the LSB in each word.
+      const unsigned PhysReg = (K * 32) + SetBitIdx;
+
+      // Clear the bit at SetBitIdx. Doing it this way appears to generate
+      // fewer instructions on x86. This works because negating a number will
+      // flip all the bits after SetBitIdx. So (Word & -Word) ==
+      // (1 << SetBitIdx), but faster.
+      Word ^= Word & -Word;
+
+      if (PhysReg == NumRegs)
+        return;
+
+      if (PhysReg) {
+        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+          RUs.set(*RUI);
+      }
+    }
+  }
+}
+
 /// Examine the instruction for potential LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
@@ -448,7 +485,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
       continue;
     }
 
@@ -460,16 +497,22 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");
 
     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
-        // If it's using a non-loop-invariant register, then it's obviously not
-        // safe to hoist.
-        HasNonInvariantUse = true;
+      if (!HasNonInvariantUse) {
+        for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
+            HasNonInvariantUse = true;
+            break;
+          }
+        }
+      }
       continue;
     }
 
     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -488,19 +531,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
       // If we have already seen another instruction that defines the same
       // register, then this is not safe. Two defs are indicated by setting a
       // PhysRegClobbers bit.
-      for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-        if (PhysRegDefs.test(*AS))
-          PhysRegClobbers.set(*AS);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI)) {
+          RUClobbers.set(*RUI);
+          RuledOut = true;
+        } else if (RUClobbers.test(*RUI)) {
+          // MI defined register is seen defined by another instruction in
+          // the loop, it cannot be a LICM candidate.
+          RuledOut = true;
+        }
+
+        RUDefs.set(*RUI);
       }
-      // Need a second loop because MCRegAliasIterator can visit the same
-      // register twice.
-      for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-        PhysRegDefs.set(*AS);
-
-      if (PhysRegClobbers.test(Reg))
-        // MI defined register is seen defined by another instruction in
-        // the loop, it cannot be a LICM candidate.
-        RuledOut = true;
     }
 
     // Only consider reloads for now and remats which do not have register
@@ -521,9 +563,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;
 
-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
 
   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,22 +582,21 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     // FIXME: That means a reload that's reused in successor block(s) will not
    // be LICM'ed.
     for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
+      for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
+        RUDefs.set(*RUI);
     }
 
     // Funclet entry blocks will clobber all registers
     if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }
 
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -564,8 +605,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
 
@@ -583,24 +624,36 @@
    if (StoredFIs.count(Candidate.FI))
      continue;
 
    unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
+    }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;
           break;
         }
       }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+
+      if (!Safe)
+        break;
     }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
  }
 }
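Note on the regmask walk in applyBitsNotInRegMaskToRegUnitsMask: a regmask operand carries one bit per physical register, and a set bit means the register is preserved across the call, so the clobbered registers are exactly the zero bits; that is why each word is flipped with ~Mask[K] before scanning. The standalone sketch below shows the same word-at-a-time scan outside LLVM. It is illustrative only: std::countr_zero stands in for llvm::countr_zero, and NumRegs and the mask contents are invented values.

    #include <bit>
    #include <cstdint>
    #include <cstdio>

    int main() {
      // One bit per "physical register"; a SET bit means the register is
      // preserved by the call, so the clobbered registers are the ZERO bits.
      constexpr unsigned NumRegs = 40;
      constexpr unsigned MaskWords = (NumRegs + 31) / 32; // == 2
      const uint32_t Mask[MaskWords] = {0xFFFF00FFu, 0x000000FFu};

      for (unsigned K = 0; K < MaskWords; ++K) {
        uint32_t Word = ~Mask[K]; // flip: visit the registers NOT preserved
        while (Word) {
          const unsigned SetBitIdx = std::countr_zero(Word);
          const unsigned PhysReg = K * 32 + SetBitIdx;
          Word ^= Word & -Word; // clears exactly the lowest set bit, SetBitIdx
          if (PhysReg >= NumRegs)
            break; // bits past NumRegs in the last mask word are only padding
          std::printf("physreg %u is clobbered\n", PhysReg);
        }
      }
      return 0;
    }

Built with any C++20 compiler, this prints physregs 8 through 15 as clobbered, and the second mask word exercises the early exit once the bit index runs past the last register; in the patch, each such register's units would then be set in RUClobbers.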
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb03..da8aa544698355 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
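Note on why a single pass now suffices in ProcessMI: every physical register covers a fixed set of register units, and two registers alias exactly when their unit sets intersect. Tracking RUDefs/RUClobbers per unit therefore replaces the old pair of MCRegAliasIterator loops (a second loop was needed because that iterator can visit the same register twice) with one MCRegUnitIterator loop, and it shrinks the BitVectors from getNumRegs() to getNumRegUnits() entries. The toy model below illustrates the def/clobber bookkeeping with invented registers and unit assignments; it does not use the LLVM API.

    #include <bitset>
    #include <cstdio>

    // Two registers alias exactly when their unit sets intersect.
    constexpr int NumUnits = 2;
    using Units = std::bitset<NumUnits>;

    // Mirrors the def-recording loop in ProcessMI: a unit already present in
    // RUDefs is promoted to RUClobbers (a second def), and every unit of the
    // def is then recorded in RUDefs.
    void recordDef(const char *Name, Units Reg, Units &RUDefs,
                   Units &RUClobbers) {
      for (int U = 0; U < NumUnits; ++U) {
        if (!Reg.test(U))
          continue;
        if (RUDefs.test(U))
          RUClobbers.set(U);
        RUDefs.set(U);
      }
      std::printf("def %-3s -> RUDefs=%s RUClobbers=%s\n", Name,
                  RUDefs.to_string().c_str(), RUClobbers.to_string().c_str());
    }

    int main() {
      const Units AL("01"), EAX("11"); // pretend EAX covers AL's unit plus one
      Units RUDefs, RUClobbers;
      recordDef("AL", AL, RUDefs, RUClobbers);   // first def: nothing clobbered
      recordDef("EAX", EAX, RUDefs, RUClobbers); // overlaps unit 0: clobber it
      return 0;
    }

The second def overlaps the first on unit 0, so that unit lands in RUClobbers in the same loop that records the def; this is the per-unit analogue of the old two-loop PhysRegDefs/PhysRegClobbers update that the patch removes.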