[CodeGen][MachineLICM] Use RegUnits in HoistRegionPostRA #94608

Merged
Pierre-vh merged 4 commits into llvm:main from perf/machinelicm-regunits on Jun 11, 2024

Conversation

Pierre-vh
Contributor

Those BitVectors get expensive on targets like AMDGPU with thousands of registers, and RegAliasIterator is also expensive.

We can move all liveness calculations to use RegUnits instead to speed it up.

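For context, here is a minimal illustrative sketch (not code from this patch; the helper names are made up) of the two query styles involved: the old register-indexed BitVectors are probed through MCRegAliasIterator, which can visit a very large number of aliases for AMDGPU's wide tuple registers, while regunit-indexed BitVectors only need to visit a register's few regunits.

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Old style: a register-indexed BitVector, probed through MCRegAliasIterator.
static bool conflictsViaAliases(MCRegister Reg, const BitVector &PhysRegDefs,
                                const TargetRegisterInfo *TRI) {
  for (MCRegAliasIterator AI(Reg, TRI, /*IncludeSelf=*/true); AI.isValid(); ++AI)
    if (PhysRegDefs.test(*AI))
      return true;
  return false;
}

// New style: a regunit-indexed BitVector, probed through MCRegUnitIterator.
// A register has only a handful of regunits, so this stays cheap even when
// the register has many aliases.
static bool conflictsViaRegUnits(MCRegister Reg, const BitVector &RUDefs,
                                 const TargetRegisterInfo *TRI) {
  for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
    if (RUDefs.test(*RUI))
      return true;
  return false;
}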
@Pierre-vh Pierre-vh requested review from jayfoad and arsenm June 6, 2024 12:41
@llvmbot
Collaborator

llvmbot commented Jun 6, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: Pierre van Houtryve (Pierre-vh)

Changes

Those BitVectors get expensive on targets like AMDGPU with thousands of registers, and RegAliasIterator is also expensive.

We can move all liveness calculations to use RegUnits instead to speed it up.


Full diff: https://github.com/llvm/llvm-project/pull/94608.diff

2 Files Affected:

  • (modified) llvm/lib/CodeGen/MachineLICM.cpp (+72-44)
  • (modified) llvm/test/CodeGen/AMDGPU/indirect-call.ll (+2-2)
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 86eb259c09015..d802867f1fe8e 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -223,8 +223,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                   BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                   SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -423,10 +423,25 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
+  // FIXME: Use RUs better here
+  BitVector MaskedRegs(TRI->getNumRegs());
+  MaskedRegs.setBitsNotInMask(Mask);
+  for (const auto &Set : MaskedRegs.set_bits()) {
+    if (!Set)
+      continue;
+
+    for (MCRegUnitIterator RUI(Set, TRI); RUI.isValid(); ++RUI)
+      RUs.set(*RUI);
+  }
+}
+
 /// Examine the instruction for potentai LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
@@ -448,7 +463,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(TRI, RUClobbers, MO.getRegMask());
       continue;
     }
 
@@ -460,16 +475,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");
 
     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
         // If it's using a non-loop-invariant register, then it's obviously not
         // safe to hoist.
-        HasNonInvariantUse = true;
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI))
+          HasNonInvariantUse = true;
+      }
       continue;
     }
 
     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -488,19 +505,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // If we have already seen another instruction that defines the same
     // register, then this is not safe.  Two defs is indicated by setting a
     // PhysRegClobbers bit.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-      if (PhysRegDefs.test(*AS))
-        PhysRegClobbers.set(*AS);
+    for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+      if (RUDefs.test(*RUI)) {
+        RUClobbers.set(*RUI);
+        RuledOut = true;
+      } else if (RUClobbers.test(*RUI)) {
+        // MI defined register is seen defined by another instruction in
+        // the loop, it cannot be a LICM candidate.
+        RuledOut = true;
+      }
+
+      RUDefs.set(*RUI);
     }
-    // Need a second loop because MCRegAliasIterator can visit the same
-    // register twice.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-      PhysRegDefs.set(*AS);
-
-    if (PhysRegClobbers.test(Reg))
-      // MI defined register is seen defined by another instruction in
-      // the loop, it cannot be a LICM candidate.
-      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -521,9 +537,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;
 
-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
 
   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,22 +556,22 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     // FIXME: That means a reload that're reused in successor block(s) will not
     // be LICM'ed.
     for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
+      for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
+        RUDefs.set(*RUI);
     }
 
     // Funclet entry blocks will clobber all registers
-    if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+    if (const uint32_t *Mask = BB->getBeginClobberMask(TRI)) {
+      applyBitsNotInRegMaskToRegUnitsMask(TRI, RUClobbers, Mask);
+    }
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }
 
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -564,8 +580,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
 
@@ -583,24 +599,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       continue;
 
     unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
+    }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;
           break;
         }
       }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+
+      if (!Safe)
+        break;
     }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
   }
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 7799b9509ceb0..da8aa54469835 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s62, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s63, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s62, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s63, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]

; GCN-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_readfirstlane_b32 s8, v0
; GCN-NEXT: v_readfirstlane_b32 s9, v1
; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[10:11], vcc
; GCN-NEXT: s_movk_i32 s4, 0x7b
Contributor Author

This is the only regression I observed. I haven't investigated it in depth yet, but I plan to spend some time on it to figure it out.
Any hints or help are welcome, of course.


github-actions bot commented Jun 6, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

// FIXME: Use RUs better here
BitVector MaskedRegs(TRI->getNumRegs());
MaskedRegs.setBitsNotInMask(Mask);
for (const auto &Set : MaskedRegs.set_bits()) {
Contributor

Suggested change
for (const auto &Set : MaskedRegs.set_bits()) {
for (unsigned SetBitIndex : MaskedRegs.set_bits()) {

@@ -460,16 +475,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
assert(Reg.isPhysical() && "Not expecting virtual register!");

if (!MO.isDef()) {
if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
Contributor

Could pre-commit removal of the redundant !Reg check


MachineInstr *MI = Candidate.MI;
for (const MachineOperand &MO : MI->all_uses()) {
if (!MO.getReg())
Contributor

This misses regmasks for some reason, but it also missed them before?

Comment on lines +559 to +560
for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
RUDefs.set(*RUI);
Contributor

This isn't accounting for the lane masks in the liveins; that could possibly be the reason for the regression.

Contributor Author

But it didn't account for them before either, no?
I'm trying to keep this as close to NFC as possible, so I'd rather add that as a follow-up change.

Contributor

I'm not sure exactly when lane liveins are introduced, or how they interact with aliases. This is another reason why tracking liveins by regunits would be easier to understand.
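As a rough illustration of the lane-mask-aware handling being discussed (a sketch only, not part of this PR; the helper name is hypothetical), the live-in loop could use MCRegUnitMaskIterator to set only the regunits actually covered by each live-in's lane mask:

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical lane-mask-aware live-in handling: only mark the regunits that
// the live-in's LaneMask actually covers, instead of every unit of PhysReg.
static void addLiveInRegUnits(const MachineBasicBlock &BB,
                              const TargetRegisterInfo *TRI,
                              BitVector &RUDefs) {
  for (const MachineBasicBlock::RegisterMaskPair &LI : BB.liveins()) {
    for (MCRegUnitMaskIterator RUMI(LI.PhysReg, TRI); RUMI.isValid(); ++RUMI) {
      auto [Unit, UnitLaneMask] = *RUMI;
      if ((UnitLaneMask & LI.LaneMask).any())
        RUDefs.set(Unit);
    }
  }
}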

@@ -448,7 +463,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
// We can't hoist an instruction defining a physreg that is clobbered in
// the loop.
if (MO.isRegMask()) {
PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
applyBitsNotInRegMaskToRegUnitsMask(TRI, RUClobbers, MO.getRegMask());
Contributor Author

The code change originates from here. The SI_CALL has a regmask with SGPR4:

dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr8_sgpr9, 0, <regmask $sgpr_null $sgpr_null_hi $src_private_base $src_private_base_hi $src_private_base_lo $src_private_limit $src_private_limit_hi $src_private_limit_lo $src_shared_base $src_shared_base_hi $src_shared_base_lo $src_shared_limit $src_shared_limit_hi $src_shared_limit_lo $sgpr4 $sgpr5 $sgpr6 $sgpr7 $sgpr8 $sgpr9 $sgpr10 $sgpr11 $sgpr12 $sgpr13 $sgpr14 $sgpr15 $sgpr16 $sgpr17 $sgpr18 $sgpr19 $sgpr20 $sgpr21 $sgpr22 and 1139 more...>, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4

I think the regression is actually a bugfix. We can't hoist a def of s4 because the indirect call may clobber anything.

Contributor

The regmask includes sgpr4, which means the call preserves s4, so we can hoist a def of s4.
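To make the regmask convention explicit (a hedged sketch, not code from this PR; the helper name is hypothetical): a set bit in a call's regmask means that register is preserved, which is exactly what MachineOperand::clobbersPhysReg tests.

#include <cassert>

#include "llvm/CodeGen/MachineOperand.h"

using namespace llvm;

// Hypothetical helper: does the call carrying this regmask operand clobber
// Reg? A set bit in the mask means "preserved across the call", so a register
// listed in the mask (like $sgpr4 in the SI_CALL above) is not clobbered and
// a def of it may still be hoisted past the call.
static bool regMaskClobbers(const MachineOperand &MaskMO, MCRegister Reg) {
  assert(MaskMO.isRegMask() && "expected a regmask operand");
  return MachineOperand::clobbersPhysReg(MaskMO.getRegMask(), Reg);
}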

@Pierre-vh Pierre-vh requested a review from arsenm June 11, 2024 09:12
@@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
return false;
}

static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo *TRI,
Contributor Author

This is a bit longer than just creating a BitVector, but it avoids a dynamic allocation, which I think is worth it.
Otherwise we'd instantiate a BitVector for pretty much every instruction.

@Pierre-vh
Contributor Author

It seems to be causing tiny regressions in some cases, but I think the change is worth it. I did some small optimizations in the previous commit to see if they help.

At worst we get +0.1%, but at best we get -1% in large AMDGPU programs, due to how expensive RegAliasIterator is.

@Pierre-vh Pierre-vh merged commit d4b8b72 into llvm:main Jun 11, 2024
4 of 6 checks passed
@Pierre-vh Pierre-vh deleted the perf/machinelicm-regunits branch June 11, 2024 12:27
Comment on lines +535 to +544
if (RUDefs.test(*RUI)) {
RUClobbers.set(*RUI);
RuledOut = true;
} else if (RUClobbers.test(*RUI)) {
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}

RUDefs.set(*RUI);
Contributor

Your code calls RUDefs.set even if it was already set. How about:

Suggested change
if (RUDefs.test(*RUI)) {
RUClobbers.set(*RUI);
RuledOut = true;
} else if (RUClobbers.test(*RUI)) {
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}
RUDefs.set(*RUI);
if (RUDefs.test(*RUI)) {
RUClobbers.set(*RUI);
RuledOut = true;
} else {
RUDefs.set(*RUI);
if (RUClobbers.test(*RUI)) {
// MI defined register is seen defined by another instruction in
// the loop, it cannot be a LICM candidate.
RuledOut = true;
}
}

Maybe it doesn't actually run any faster.

@@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
return false;
}

static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
Contributor

This seems conceptually wrong. A regmask tells you which registers are preserved by the call, so all the regunits in all those registers are preserved. That does not imply that all regunits used by all registers not in the mask will be clobbered.

E.g. (AMDGPU example) if only v[0:1] is preserved then v[1:2] would not be in the regmask, but that does not imply that v1 is clobbered.

Contributor Author

I tried to implement the behavior of setBitsNotInMask; is that not what this is doing?

Contributor

setBitsNotInMask is trivial because there is a 1-to-1 mapping between the two BitVectors. You are trying to implement this while also expanding from Registers (each bit in the regmask) to RegUnits (potentially multiple bits in RUs). What you have implemented is to clear a bit in RUs if any register that includes that regunit is not listed in regmask. I think instead you should clear the bit in RUs if all registers that include that regunit are not listed in regmask.
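A minimal sketch of the semantics described here, assuming a hypothetical helper name and none of the allocation-avoidance tricks from the patch: a regunit is marked clobbered only when every register containing it is clobbered by the mask.

#include <cstdint>

#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

using namespace llvm;

// Hypothetical corrected expansion of a regmask into clobbered regunits: a
// unit is clobbered only if every register containing it (i.e. every
// super-register, inclusive, of one of its root registers) is clobbered by
// the mask; if any containing register is preserved, the unit is preserved.
static void addClobberedRegUnits(const TargetRegisterInfo *TRI, BitVector &RUs,
                                 const uint32_t *Mask) {
  for (unsigned RU = 0, E = TRI->getNumRegUnits(); RU != E; ++RU) {
    bool AllClobbered = true;
    for (MCRegUnitRootIterator RRI(RU, TRI); RRI.isValid() && AllClobbered;
         ++RRI) {
      for (MCRegister Reg : TRI->superregs_inclusive(*RRI)) {
        if (!MachineOperand::clobbersPhysReg(Mask, Reg)) {
          AllClobbered = false; // A containing register is preserved.
          break;
        }
      }
    }
    if (AllClobbered)
      RUs.set(RU);
  }
}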

Contributor Author

Ah I see, and that probably explains the regression as well. I will have a look at it again this week.

Lukacma pushed a commit to Lukacma/llvm-project that referenced this pull request Jun 12, 2024
Those BitVectors get expensive on targets like AMDGPU with thousands of
registers, and RegAliasIterator is also expensive.

We can move all liveness calculations to use RegUnits instead to speed
it up for targets where RegAliasIterator is expensive, like AMDGPU.
On targets where RegAliasIterator is cheap, this alternative can be a little more expensive, but I believe the tradeoff is worth it.
@HerrCai0907 HerrCai0907 mentioned this pull request Jun 13, 2024