Skip to content

Commit

Permalink
Reapply "[AMDGPU]Optimize SGPR spills (llvm#93668)"
Browse files Browse the repository at this point in the history
This reverts commit c2fc7f7, because the
dependent patch that splits the VGPR regalloc pipeline has resolved the issue (llvm#96353).
  • Loading branch information
vg0204 committed Oct 3, 2024
1 parent 650c41a commit cc7e4c2
Show file tree
Hide file tree
Showing 7 changed files with 126 additions and 102 deletions.
5 changes: 5 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1533,6 +1533,11 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// since FastRegAlloc does the replacements itself.
addPass(createVirtRegRewriter(false));

// At this point, SGPR register allocation is done. Run stack slot coloring
// now so it can try to optimize the SGPR spill stack indices before
// attempting the custom SGPR spill lowering.
addPass(&StackSlotColoringID);

// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsLegacyID);

Expand Down
9 changes: 7 additions & 2 deletions llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1959,8 +1959,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,

if (SpillToVGPR) {

assert(SB.NumSubRegs == VGPRSpills.size() &&
"Num of VGPR lanes should be equal to num of SGPRs spilled");
// Since the stack slot coloring pass tries to optimize SGPR spills,
// VGPR lanes (mapped from a spill stack slot) may be shared by SGPR
// spills of different sizes. The number of VGPR lanes allotted is therefore
// equal to the size of the largest SGPR spilled into them.
assert(SB.NumSubRegs <= VGPRSpills.size() &&
"Num of SGPRs spilled should be less than or equal to num of "
"the VGPR lanes.");

for (unsigned i = 0, e = SB.NumSubRegs; i < e; ++i) {
Register SubReg =
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,12 @@
; GCN-O1-NEXT: Machine Optimization Remark Emitter
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: Virtual Register Rewriter
; GCN-O1-NEXT: Stack Slot Coloring
; GCN-O1-NEXT: SI lower SGPR spill instructions
; GCN-O1-NEXT: Virtual Register Map
; GCN-O1-NEXT: Live Register Matrix
; GCN-O1-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-NEXT: Live Stack Slot Analysis
; GCN-O1-NEXT: Greedy Register Allocator
; GCN-O1-NEXT: SI Lower WWM Copies
; GCN-O1-NEXT: Virtual Register Rewriter
Expand Down Expand Up @@ -674,10 +676,12 @@
; GCN-O1-OPTS-NEXT: Machine Optimization Remark Emitter
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
; GCN-O1-OPTS-NEXT: Stack Slot Coloring
; GCN-O1-OPTS-NEXT: SI lower SGPR spill instructions
; GCN-O1-OPTS-NEXT: Virtual Register Map
; GCN-O1-OPTS-NEXT: Live Register Matrix
; GCN-O1-OPTS-NEXT: SI Pre-allocate WWM Registers
; GCN-O1-OPTS-NEXT: Live Stack Slot Analysis
; GCN-O1-OPTS-NEXT: Greedy Register Allocator
; GCN-O1-OPTS-NEXT: SI Lower WWM Copies
; GCN-O1-OPTS-NEXT: Virtual Register Rewriter
Expand Down Expand Up @@ -988,10 +992,12 @@
; GCN-O2-NEXT: Machine Optimization Remark Emitter
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: Virtual Register Rewriter
; GCN-O2-NEXT: Stack Slot Coloring
; GCN-O2-NEXT: SI lower SGPR spill instructions
; GCN-O2-NEXT: Virtual Register Map
; GCN-O2-NEXT: Live Register Matrix
; GCN-O2-NEXT: SI Pre-allocate WWM Registers
; GCN-O2-NEXT: Live Stack Slot Analysis
; GCN-O2-NEXT: Greedy Register Allocator
; GCN-O2-NEXT: SI Lower WWM Copies
; GCN-O2-NEXT: Virtual Register Rewriter
Expand Down Expand Up @@ -1314,10 +1320,12 @@
; GCN-O3-NEXT: Machine Optimization Remark Emitter
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: Virtual Register Rewriter
; GCN-O3-NEXT: Stack Slot Coloring
; GCN-O3-NEXT: SI lower SGPR spill instructions
; GCN-O3-NEXT: Virtual Register Map
; GCN-O3-NEXT: Live Register Matrix
; GCN-O3-NEXT: SI Pre-allocate WWM Registers
; GCN-O3-NEXT: Live Stack Slot Analysis
; GCN-O3-NEXT: Greedy Register Allocator
; GCN-O3-NEXT: SI Lower WWM Copies
; GCN-O3-NEXT: Virtual Register Rewriter
Expand Down
72 changes: 36 additions & 36 deletions llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll
Original file line number Diff line number Diff line change
Expand Up @@ -196,39 +196,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s21
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s21, 24
; GFX906-NEXT: v_writelane_b32 v39, s21, 12
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s22
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s22, 25
; GFX906-NEXT: v_writelane_b32 v39, s22, 13
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s23
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s23, 26
; GFX906-NEXT: v_writelane_b32 v39, s23, 14
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s24
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s24, 27
; GFX906-NEXT: v_writelane_b32 v39, s24, 15
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s25
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s25, 28
; GFX906-NEXT: v_writelane_b32 v39, s25, 16
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s26
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s26, 29
; GFX906-NEXT: v_writelane_b32 v39, s26, 17
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s27
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s27, 30
; GFX906-NEXT: v_writelane_b32 v39, s27, 18
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s28
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s28, 31
; GFX906-NEXT: v_writelane_b32 v39, s28, 19
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; def s29
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_writelane_b32 v39, s29, 32
; GFX906-NEXT: v_writelane_b32 v39, s29, 20
; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX906-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
; GFX906-NEXT: s_mov_b64 exec, s[34:35]
Expand Down Expand Up @@ -267,39 +267,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX906-NEXT: v_readlane_b32 s15, v39, 2
; GFX906-NEXT: v_mov_b32_e32 v31, v40
; GFX906-NEXT: v_readlane_b32 s17, v39, 23
; GFX906-NEXT: v_readlane_b32 s21, v39, 24
; GFX906-NEXT: v_readlane_b32 s21, v39, 12
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s21
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s22, v39, 25
; GFX906-NEXT: v_readlane_b32 s22, v39, 13
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s22
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s23, v39, 26
; GFX906-NEXT: v_readlane_b32 s23, v39, 14
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s23
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s24, v39, 27
; GFX906-NEXT: v_readlane_b32 s24, v39, 15
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s24
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s25, v39, 28
; GFX906-NEXT: v_readlane_b32 s25, v39, 16
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s25
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s26, v39, 29
; GFX906-NEXT: v_readlane_b32 s26, v39, 17
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s26
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s27, v39, 30
; GFX906-NEXT: v_readlane_b32 s27, v39, 18
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s27
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s28, v39, 31
; GFX906-NEXT: v_readlane_b32 s28, v39, 19
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s28
; GFX906-NEXT: ;;#ASMEND
; GFX906-NEXT: v_readlane_b32 s29, v39, 32
; GFX906-NEXT: v_readlane_b32 s29, v39, 20
; GFX906-NEXT: ;;#ASMSTART
; GFX906-NEXT: ; use s29
; GFX906-NEXT: ;;#ASMEND
Expand Down Expand Up @@ -575,39 +575,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s21
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s21, 24
; GFX908-NEXT: v_writelane_b32 v39, s21, 12
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s22
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s22, 25
; GFX908-NEXT: v_writelane_b32 v39, s22, 13
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s23
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s23, 26
; GFX908-NEXT: v_writelane_b32 v39, s23, 14
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s24
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s24, 27
; GFX908-NEXT: v_writelane_b32 v39, s24, 15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s25
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s25, 28
; GFX908-NEXT: v_writelane_b32 v39, s25, 16
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s26
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s26, 29
; GFX908-NEXT: v_writelane_b32 v39, s26, 17
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s27
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s27, 30
; GFX908-NEXT: v_writelane_b32 v39, s27, 18
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s28
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s28, 31
; GFX908-NEXT: v_writelane_b32 v39, s28, 19
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; def s29
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_writelane_b32 v39, s29, 32
; GFX908-NEXT: v_writelane_b32 v39, s29, 20
; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1
; GFX908-NEXT: buffer_store_dword v39, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
; GFX908-NEXT: s_mov_b64 exec, s[34:35]
Expand Down Expand Up @@ -646,39 +646,39 @@ define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 {
; GFX908-NEXT: v_readlane_b32 s15, v39, 2
; GFX908-NEXT: v_mov_b32_e32 v31, v40
; GFX908-NEXT: v_readlane_b32 s17, v39, 23
; GFX908-NEXT: v_readlane_b32 s21, v39, 24
; GFX908-NEXT: v_readlane_b32 s21, v39, 12
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s21
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s22, v39, 25
; GFX908-NEXT: v_readlane_b32 s22, v39, 13
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s22
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s23, v39, 26
; GFX908-NEXT: v_readlane_b32 s23, v39, 14
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s23
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s24, v39, 27
; GFX908-NEXT: v_readlane_b32 s24, v39, 15
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s24
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s25, v39, 28
; GFX908-NEXT: v_readlane_b32 s25, v39, 16
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s25
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s26, v39, 29
; GFX908-NEXT: v_readlane_b32 s26, v39, 17
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s26
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s27, v39, 30
; GFX908-NEXT: v_readlane_b32 s27, v39, 18
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s27
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s28, v39, 31
; GFX908-NEXT: v_readlane_b32 s28, v39, 19
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s28
; GFX908-NEXT: ;;#ASMEND
; GFX908-NEXT: v_readlane_b32 s29, v39, 32
; GFX908-NEXT: v_readlane_b32 s29, v39, 20
; GFX908-NEXT: ;;#ASMSTART
; GFX908-NEXT: ; use s29
; GFX908-NEXT: ;;#ASMEND
Expand Down
8 changes: 8 additions & 0 deletions llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,12 @@

; DEFAULT: Greedy Register Allocator
; DEFAULT-NEXT: Virtual Register Rewriter
; DEFAULT-NEXT: Stack Slot Coloring
; DEFAULT-NEXT: SI lower SGPR spill instructions
; DEFAULT-NEXT: Virtual Register Map
; DEFAULT-NEXT: Live Register Matrix
; DEFAULT-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-NEXT: Live Stack Slot Analysis
; DEFAULT-NEXT: Greedy Register Allocator
; DEFAULT-NEXT: SI Lower WWM Copies
; DEFAULT-NEXT: Virtual Register Rewriter
Expand Down Expand Up @@ -57,10 +59,12 @@
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: Basic Register Allocator
; BASIC-DEFAULT-NEXT: Virtual Register Rewriter
; BASIC-DEFAULT-NEXT: Stack Slot Coloring
; BASIC-DEFAULT-NEXT: SI lower SGPR spill instructions
; BASIC-DEFAULT-NEXT: Virtual Register Map
; BASIC-DEFAULT-NEXT: Live Register Matrix
; BASIC-DEFAULT-NEXT: SI Pre-allocate WWM Registers
; BASIC-DEFAULT-NEXT: Live Stack Slot Analysis
; BASIC-DEFAULT-NEXT: Bundle Machine CFG Edges
; BASIC-DEFAULT-NEXT: Spill Code Placement Analysis
; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis
Expand All @@ -81,10 +85,12 @@

; DEFAULT-BASIC: Greedy Register Allocator
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
; DEFAULT-BASIC-NEXT: Stack Slot Coloring
; DEFAULT-BASIC-NEXT: SI lower SGPR spill instructions
; DEFAULT-BASIC-NEXT: Virtual Register Map
; DEFAULT-BASIC-NEXT: Live Register Matrix
; DEFAULT-BASIC-NEXT: SI Pre-allocate WWM Registers
; DEFAULT-BASIC-NEXT: Live Stack Slot Analysis
; DEFAULT-BASIC-NEXT: Basic Register Allocator
; DEFAULT-BASIC-NEXT: SI Lower WWM Copies
; DEFAULT-BASIC-NEXT: Virtual Register Rewriter
Expand All @@ -107,10 +113,12 @@
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: Virtual Register Rewriter
; BASIC-BASIC-NEXT: Stack Slot Coloring
; BASIC-BASIC-NEXT: SI lower SGPR spill instructions
; BASIC-BASIC-NEXT: Virtual Register Map
; BASIC-BASIC-NEXT: Live Register Matrix
; BASIC-BASIC-NEXT: SI Pre-allocate WWM Registers
; BASIC-BASIC-NEXT: Live Stack Slot Analysis
; BASIC-BASIC-NEXT: Basic Register Allocator
; BASIC-BASIC-NEXT: SI Lower WWM Copies
; BASIC-BASIC-NEXT: Virtual Register Rewriter
Expand Down
Loading

0 comments on commit cc7e4c2

Please sign in to comment.