From 2cd448cc27c463b6eca4983557246b06bd50e063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Thu, 3 Oct 2024 14:32:46 +0200 Subject: [PATCH 1/2] [AMDGPU][SIPreEmitPeephole] rename test and add branch-probabilities to insert-handle-flat-vmem-ds.mir --- .../AMDGPU/insert-handle-flat-vmem-ds.mir | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir diff --git a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir new file mode 100644 index 00000000000000..c4f3b5f6c1ae97 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir @@ -0,0 +1,95 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s + +--- + +name: skip_execz_flat +body: | + ; CHECK-LABEL: name: skip_execz_flat + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x70000000), %bb.2(0x00000001) + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr + + bb.2: + S_ENDPGM 0 +... 
+ +--- + +name: skip_execz_mubuf +body: | + ; CHECK-LABEL: name: skip_execz_mubuf + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x70000000), %bb.2(0x00000001) + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, implicit $exec + + bb.2: + S_ENDPGM 0 +... + +--- + +name: skip_execz_ds +body: | + ; CHECK-LABEL: name: skip_execz_ds + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_ENDPGM 0 + bb.0: + successors: %bb.1(0x70000000), %bb.2(0x00000001) + S_CBRANCH_EXECZ %bb.2, implicit $exec + + bb.1: + successors: %bb.2 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec + + bb.2: + S_ENDPGM 0 +... 
From 1b79db113f5f60179610f0c2a9f79340a1ed4438 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Manuel=20Martinez=20Caama=C3=B1o?= Date: Fri, 20 Sep 2024 14:24:37 +0200 Subject: [PATCH 2/2] [AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbability and TargetSchedModel Remove s_cbranch_execz branches if the transformation is profitable according to BranchProbability and TargetSchedModel. --- llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 89 +++++++++++++---- .../AMDGPU/amdgpu-demote-scc-branches.ll | 9 +- .../CodeGen/AMDGPU/branch-condition-and.ll | 3 +- llvm/test/CodeGen/AMDGPU/else.ll | 1 - llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 6 +- llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll | 3 +- .../AMDGPU/insert-handle-flat-vmem-ds.mir | 12 +-- .../AMDGPU/insert-skips-flat-vmem-ds.mir | 95 ------------------- .../CodeGen/AMDGPU/insert-skips-gfx10.mir | 2 +- .../CodeGen/AMDGPU/insert-skips-gfx12.mir | 2 +- llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir | 2 +- .../AMDGPU/insert-skips-ignored-insts.mir | 30 +++++- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 6 +- .../AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll | 7 +- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll | 18 ++-- .../CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll | 18 ++-- .../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 22 ++--- ...emove-short-exec-branches-gpr-idx-mode.mir | 3 +- ...ort-exec-branches-special-instructions.mir | 3 +- llvm/test/CodeGen/AMDGPU/ret_jump.ll | 1 - .../si-unify-exit-return-unreachable.ll | 13 ++- .../AMDGPU/skip-branch-taildup-ret.mir | 2 +- 22 files changed, 150 insertions(+), 197 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp index 1334029544f999..8cb98ed4ecf881 100644 --- a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp +++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp @@ -15,19 +15,13 @@ #include "GCNSubtarget.h" #include 
"MCTargetDesc/AMDGPUMCTargetDesc.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/Support/BranchProbability.h" using namespace llvm; #define DEBUG_TYPE "si-pre-emit-peephole" -static unsigned SkipThreshold; - -static cl::opt SkipThresholdFlag( - "amdgpu-skip-threshold", cl::Hidden, - cl::desc( - "Number of instructions before jumping over divergent control flow"), - cl::location(SkipThreshold), cl::init(12)); - namespace { class SIPreEmitPeephole : public MachineFunctionPass { @@ -41,7 +35,8 @@ class SIPreEmitPeephole : public MachineFunctionPass { MachineBasicBlock *&TrueMBB, MachineBasicBlock *&FalseMBB, SmallVectorImpl &Cond); - bool mustRetainExeczBranch(const MachineBasicBlock &From, + bool mustRetainExeczBranch(const MachineInstr &Branch, + const MachineBasicBlock &From, const MachineBasicBlock &To) const; bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB); @@ -304,11 +299,61 @@ bool SIPreEmitPeephole::getBlockDestinations( return true; } +namespace { +class BranchWeightCostModel { + const SIInstrInfo &TII; + const TargetSchedModel &SchedModel; + BranchProbability BranchProb; + static constexpr uint64_t BranchNotTakenCost = 1; + uint64_t BranchTakenCost; + uint64_t ThenCyclesCost = 0; + +public: + BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch, + const MachineBasicBlock &Succ) + : TII(TII), SchedModel(TII.getSchedModel()) { + const MachineBasicBlock &Head = *Branch.getParent(); + const auto *FromIt = find(Head.successors(), &Succ); + assert(FromIt != Head.succ_end()); + + BranchProb = Head.getSuccProbability(FromIt); + if (BranchProb.isUnknown()) + BranchProb = BranchProbability::getZero(); + BranchTakenCost = SchedModel.computeInstrLatency(&Branch); + } + + bool isProfitable(const MachineInstr &MI) { + if (TII.isWaitcnt(MI.getOpcode())) + return false; + + ThenCyclesCost += SchedModel.computeInstrLatency(&MI); + + // Consider `P = N/D` to be the probability 
of execz being false (executing + // the then-block). The transformation is profitable if always executing the + // 'then' block is cheaper than executing sometimes 'then' and always + // executing s_cbranch_execz: + // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost + // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost + // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D * + // BranchNotTakenCost + uint64_t Numerator = BranchProb.getNumerator(); + uint64_t Denominator = BranchProb.getDenominator(); + return (Denominator - Numerator) * ThenCyclesCost <= + ((Denominator - Numerator) * BranchTakenCost + + Numerator * BranchNotTakenCost); + } +}; + bool SIPreEmitPeephole::mustRetainExeczBranch( - const MachineBasicBlock &From, const MachineBasicBlock &To) const { - unsigned NumInstr = 0; - const MachineFunction *MF = From.getParent(); + const MachineInstr &Branch, const MachineBasicBlock &From, + const MachineBasicBlock &To) const { + + const MachineBasicBlock &Head = *Branch.getParent(); + assert(is_contained(Head.successors(), &From)); + + BranchWeightCostModel CostModel{*TII, Branch, From}; + const MachineFunction *MF = From.getParent(); for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end(); MBBI != End && MBBI != ToI; ++MBBI) { const MachineBasicBlock &MBB = *MBBI; @@ -326,23 +371,22 @@ bool SIPreEmitPeephole::mustRetainExeczBranch( if (TII->hasUnwantedEffectsWhenEXECEmpty(MI)) return true; - // These instructions are potentially expensive even if EXEC = 0. - if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) || - TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode())) - return true; - - ++NumInstr; - if (NumInstr >= SkipThreshold) + if (!CostModel.isProfitable(MI)) return true; } } return false; } +} // namespace // Returns true if the skip branch instruction is removed. 
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB) { + + if (!TII->getSchedModel().hasInstrSchedModel()) + return false; + MachineBasicBlock *TrueMBB = nullptr; MachineBasicBlock *FalseMBB = nullptr; SmallVector Cond; @@ -351,8 +395,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI, return false; // Consider only the forward branches. - if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) || - mustRetainExeczBranch(*FalseMBB, *TrueMBB)) + if (SrcMBB.getNumber() >= TrueMBB->getNumber()) + return false; + + // Consider only when it is legal and profitable + if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB)) return false; LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll index c293891140008d..aa38f43368694d 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll @@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX9-NEXT: s_cbranch_execz .LBB5_2 ; GFX9-NEXT: ; %bb.1: ; %if.then ; GFX9-NEXT: s_mov_b32 s11, s18 ; GFX9-NEXT: s_mov_b32 s10, s17 @@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX9-NEXT: .LBB5_2: ; %if.end +; GFX9-NEXT: ; %bb.2: ; %if.end ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1010-NEXT: 
v_cmp_lt_i32_e32 vcc_lo, 0, v0 ; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo -; GFX1010-NEXT: s_cbranch_execz .LBB5_2 ; GFX1010-NEXT: ; %bb.1: ; %if.then ; GFX1010-NEXT: v_mov_b32_e32 v0, s6 ; GFX1010-NEXT: v_mov_b32_e32 v1, s19 @@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX1010-NEXT: s_mov_b32 s9, s16 ; GFX1010-NEXT: s_mov_b32 s8, s7 ; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX1010-NEXT: .LBB5_2: ; %if.end +; GFX1010-NEXT: ; %bb.2: ; %if.end ; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_mov_b32 s4, exec_lo ; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0 -; GFX1030-NEXT: s_cbranch_execz .LBB5_2 ; GFX1030-NEXT: ; %bb.1: ; %if.then ; GFX1030-NEXT: v_mov_b32_e32 v0, s6 ; GFX1030-NEXT: v_mov_b32_e32 v1, s19 @@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8) ; GFX1030-NEXT: s_mov_b32 s9, s16 ; GFX1030-NEXT: s_mov_b32 s8, s7 ; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen -; GFX1030-NEXT: .LBB5_2: ; %if.end +; GFX1030-NEXT: ; %bb.2: ; %if.end ; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX1030-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 6efc9f2ae77b82..d23d7a7c8e0c80 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -17,13 +17,12 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 { ; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 ; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] -; GCN-NEXT: s_cbranch_execz 
.LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb4 ; GCN-NEXT: v_mov_b32_e32 v0, 4 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_write_b32 v0, v0 ; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock +; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock ; GCN-NEXT: s_endpgm bb: %tmp = fcmp ogt float %arg, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/else.ll b/llvm/test/CodeGen/AMDGPU/else.ll index 655c5cd184a1ed..d3d4b860f9ac7f 100644 --- a/llvm/test/CodeGen/AMDGPU/else.ll +++ b/llvm/test/CodeGen/AMDGPU/else.ll @@ -30,7 +30,6 @@ end: ; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]] ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]] ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]] -; CHECK-NEXT: s_cbranch_execz define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 { main_body: %cc = icmp sgt i32 %z, 5 diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 667a3f398c08a2..6e8e6c07217895 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -1578,6 +1578,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: .LBB6_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] +; SDAG-NEXT: s_cbranch_execz .LBB6_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1589,7 +1590,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 +; SDAG-NEXT: .LBB6_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB6_7: ; %Flow2 ; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] @@ -1929,6 +1930,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: .LBB7_4: ; %Flow ; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13] 
+; SDAG-NEXT: s_cbranch_execz .LBB7_6 ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] @@ -1940,7 +1942,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 ; SDAG-NEXT: v_mov_b32_e32 v3, v2 -; SDAG-NEXT: ; %bb.6: ; %Flow1 +; SDAG-NEXT: .LBB7_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB7_7: ; %Flow2 ; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll index f34f9f38feeb4a..2fd501014d467d 100644 --- a/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll @@ -36,11 +36,12 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 ; SI-NEXT: s_mov_b64 s[8:9], -1 ; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc +; SI-NEXT: s_cbranch_execz .LBB0_6 ; SI-NEXT: ; %bb.5: ; %end.loop ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_add_i32 s14, s14, 1 ; SI-NEXT: s_xor_b64 s[8:9], exec, -1 -; SI-NEXT: ; %bb.6: ; %Flow1 +; SI-NEXT: .LBB0_6: ; %Flow1 ; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; SI-NEXT: s_or_b64 exec, exec, s[12:13] ; SI-NEXT: s_branch .LBB0_2 diff --git a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir index c4f3b5f6c1ae97..d89f306c96a36a 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir @@ -7,9 +7,7 @@ name: skip_execz_flat body: | ; CHECK-LABEL: name: skip_execz_flat ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: successors: %bb.1(0x7fffffff) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; 
CHECK-NEXT: successors: %bb.2(0x80000000) @@ -38,9 +36,7 @@ name: skip_execz_mubuf body: | ; CHECK-LABEL: name: skip_execz_mubuf ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: successors: %bb.1(0x7fffffff) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) @@ -69,9 +65,7 @@ name: skip_execz_ds body: | ; CHECK-LABEL: name: skip_execz_ds ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: successors: %bb.1(0x7fffffff) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.1: ; CHECK-NEXT: successors: %bb.2(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir deleted file mode 100644 index da38929fab9907..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir +++ /dev/null @@ -1,95 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s - ---- - -name: skip_execz_flat -body: | - ; CHECK-LABEL: name: skip_execz_flat - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1, %bb.2 - S_CBRANCH_EXECZ %bb.2, implicit $exec - - bb.1: - successors: %bb.2 - $vgpr0 = 
V_MOV_B32_e32 0, implicit $exec - FLAT_STORE_DWORD undef $vgpr1_vgpr2, $vgpr0, 0, 0, implicit $exec, implicit $flat_scr - - bb.2: - S_ENDPGM 0 -... - ---- - -name: skip_execz_mubuf -body: | - ; CHECK-LABEL: name: skip_execz_mubuf - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1, %bb.2 - S_CBRANCH_EXECZ %bb.2, implicit $exec - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, implicit $exec - - bb.2: - S_ENDPGM 0 -... - ---- - -name: skip_execz_ds -body: | - ; CHECK-LABEL: name: skip_execz_ds - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: S_ENDPGM 0 - bb.0: - successors: %bb.1, %bb.2 - S_CBRANCH_EXECZ %bb.2, implicit $exec - - bb.1: - successors: %bb.2 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - DS_WRITE_B32 $vgpr0, $vgpr0, 0, 0, implicit $m0, implicit $exec - - bb.2: - S_ENDPGM 0 -... 
diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir index b4ed3cafbacb5f..0bf74d96e134e6 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s --- name: skip_waitcnt_vscnt diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir index 2d092974ac566f..d88dc204e1336f 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s --- name: skip_wait_loadcnt diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir index 85618539be5ff5..3281e4ccda7ab3 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are inserted to ensure 
GWS ops aren't run with exec = 0 --- diff --git a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir index 92719a5ee53aea..372f22019aefba 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir +++ b/llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir @@ -1,12 +1,34 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass si-pre-emit-peephole %s -o - | FileCheck %s --- -# CHECK-LABEL: name: no_count_dbg_value -# CHECK: $vgpr1 = V_MOV_B32_e32 7, implicit $exec -# CHECK-NOT: S_CBRANCH_EXECZ name: no_count_dbg_value body: | + ; CHECK-LABEL: name: no_count_dbg_value + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 7, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: DBG_VALUE + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: successors: %bb.3(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.3: + ; CHECK-NEXT: S_ENDPGM 0 bb.0: successors: %bb.1, %bb.2 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index c5198cdb421a50..38d928a006fb20 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -766,6 +766,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB2_8: ; %Flow2 ; 
GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_cbranch_execz .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] @@ -775,7 +776,7 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v10 ; GISEL-NEXT: v_mov_b32_e32 v2, v11 ; GISEL-NEXT: v_mov_b32_e32 v3, v12 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 @@ -1043,6 +1044,7 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB3_8: ; %Flow2 ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; GISEL-NEXT: s_cbranch_execz .LBB3_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb ; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1] ; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3] @@ -1052,7 +1054,7 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v1, v9 ; GISEL-NEXT: v_mov_b32_e32 v2, v10 ; GISEL-NEXT: v_mov_b32_e32 v3, v11 -; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog +; GISEL-NEXT: .LBB3_10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1 ; GISEL-NEXT: v_or_b32_e32 v0, v0, v4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll index 0ca01784d83383..1b1c89d9f5ad2f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.init.whole.wave-w64.ll @@ -21,6 +21,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: s_mov_b32 s5, s6 ; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_and_saveexec_b64 s[6:7], s[10:11] +; GISEL12-NEXT: s_cbranch_execz .LBB0_2 ; GISEL12-NEXT: ; %bb.1: ; %shader ; GISEL12-NEXT: s_or_saveexec_b64 s[10:11], -1 ; 
GISEL12-NEXT: s_wait_alu 0xfffe @@ -35,7 +36,8 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL12-NEXT: v_add_nc_u32_e32 v10, 42, v13 ; GISEL12-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GISEL12-NEXT: v_mov_b32_e32 v12, v1 -; GISEL12-NEXT: ; %bb.2: ; %tail +; GISEL12-NEXT: .LBB0_2: ; %tail +; GISEL12-NEXT: s_wait_alu 0xfffe ; GISEL12-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL12-NEXT: s_mov_b64 exec, s[4:5] ; GISEL12-NEXT: s_wait_alu 0xfffe @@ -81,6 +83,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL10-NEXT: s_mov_b32 s4, s5 ; GISEL10-NEXT: s_mov_b32 s5, s6 ; GISEL10-NEXT: s_and_saveexec_b64 s[6:7], s[10:11] +; GISEL10-NEXT: s_cbranch_execz .LBB0_2 ; GISEL10-NEXT: ; %bb.1: ; %shader ; GISEL10-NEXT: s_or_saveexec_b64 s[10:11], -1 ; GISEL10-NEXT: v_cndmask_b32_e64 v0, 0x47, v13, s[10:11] @@ -91,7 +94,7 @@ define amdgpu_cs_chain void @basic(<3 x i32> inreg %sgpr, ptr inreg %callee, i64 ; GISEL10-NEXT: v_mov_b32_e32 v11, v0 ; GISEL10-NEXT: v_add_nc_u32_e32 v10, 42, v13 ; GISEL10-NEXT: v_mov_b32_e32 v12, v1 -; GISEL10-NEXT: ; %bb.2: ; %tail +; GISEL10-NEXT: .LBB0_2: ; %tail ; GISEL10-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL10-NEXT: s_mov_b64 exec, s[4:5] ; GISEL10-NEXT: s_setpc_b64 s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll index 7371d498a70706..6cc2393d598e04 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll @@ -539,11 +539,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 -; 
GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -611,11 +610,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -681,11 +679,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -751,11 +748,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: ; 
%bb.2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -823,11 +819,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -903,11 +898,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll index 60af21524a04a1..f7f8536219db6d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umin.ll @@ -540,11 +540,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX8DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX8DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; 
GFX8DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX8DAGISEL-NEXT: ; %bb.1: ; %else ; GFX8DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX8DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX8DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX8DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX8DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX8DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX8DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -612,11 +611,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX9DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX9DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX9DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX9DAGISEL-NEXT: ; %bb.1: ; %else ; GFX9DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX9DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX9DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX9DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX9DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX9DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -682,11 +680,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1064DAGISEL-NEXT: ; implicit-def: $sgpr4 ; GFX1064DAGISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1064DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1064DAGISEL-NEXT: s_load_dword s4, s[2:3], 0x2c ; GFX1064DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1064DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1064DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1064DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1064DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -752,11 +749,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1032DAGISEL-NEXT: ; implicit-def: $sgpr1 ; GFX1032DAGISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1032DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032DAGISEL-NEXT: ; %bb.1: 
; %else ; GFX1032DAGISEL-NEXT: s_load_dword s1, s[2:3], 0x2c ; GFX1032DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1032DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1032DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1032DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1032DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032DAGISEL-NEXT: v_mov_b32_e32 v1, s1 @@ -824,11 +820,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1164DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1164DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1164DAGISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX1164DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1164DAGISEL-NEXT: s_load_b32 s4, s[2:3], 0x2c ; GFX1164DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1164DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1164DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1164DAGISEL-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GFX1164DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164DAGISEL-NEXT: v_mov_b32_e32 v1, s4 @@ -904,11 +899,10 @@ define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) { ; GFX1132DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132DAGISEL-NEXT: v_cmpx_lt_u32_e32 15, v0 ; GFX1132DAGISEL-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX1132DAGISEL-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132DAGISEL-NEXT: ; %bb.1: ; %else ; GFX1132DAGISEL-NEXT: s_load_b32 s1, s[2:3], 0x2c ; GFX1132DAGISEL-NEXT: ; implicit-def: $vgpr0 -; GFX1132DAGISEL-NEXT: .LBB4_2: ; %Flow +; GFX1132DAGISEL-NEXT: ; %bb.2: ; %Flow ; GFX1132DAGISEL-NEXT: s_or_saveexec_b32 s0, s0 ; GFX1132DAGISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132DAGISEL-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll index 0b3ef62856f540..08b089a32d1d75 100644 --- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll @@ -7957,11 +7957,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) 
%out, ptr addrs ; GFX12-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_xor_b32 s1, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execz .LBB29_8 ; GFX12-NEXT: ; %bb.7: ; GFX12-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_mov_b32 v2, s0 ; GFX12-NEXT: ds_add_rtn_f32 v1, v1, v2 -; GFX12-NEXT: .LBB29_8: +; GFX12-NEXT: ; %bb.8: ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 @@ -8041,11 +8040,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX940-NEXT: ; implicit-def: $vgpr2 ; GFX940-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX940-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX940-NEXT: s_cbranch_execz .LBB29_8 ; GFX940-NEXT: ; %bb.7: ; GFX940-NEXT: v_mov_b32_e32 v2, s4 ; GFX940-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX940-NEXT: .LBB29_8: +; GFX940-NEXT: ; %bb.8: ; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) @@ -8128,11 +8126,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: ; implicit-def: $vgpr2 ; GFX11-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX11-NEXT: s_cbranch_execz .LBB29_8 ; GFX11-NEXT: ; %bb.7: ; GFX11-NEXT: v_mov_b32_e32 v2, s4 ; GFX11-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX11-NEXT: .LBB29_8: +; GFX11-NEXT: ; %bb.8: ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -8207,11 +8204,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: ; implicit-def: $vgpr2 ; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0 -; GFX10-NEXT: s_cbranch_execz .LBB29_8 ; GFX10-NEXT: ; %bb.7: ; GFX10-NEXT: v_mov_b32_e32 v2, s4 ; GFX10-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX10-NEXT: .LBB29_8: +; GFX10-NEXT: ; %bb.8: ; GFX10-NEXT: 
s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 @@ -8220,7 +8216,6 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_f32_e32 v0, s2, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -8289,11 +8284,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: ; implicit-def: $vgpr2 ; GFX90A-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX90A-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_cbranch_execz .LBB29_8 ; GFX90A-NEXT: ; %bb.7: ; GFX90A-NEXT: v_mov_b32_e32 v2, s4 ; GFX90A-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX90A-NEXT: .LBB29_8: +; GFX90A-NEXT: ; %bb.8: ; GFX90A-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) @@ -8370,11 +8364,10 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX908-NEXT: ; implicit-def: $vgpr2 ; GFX908-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX908-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_cbranch_execz .LBB29_8 ; GFX908-NEXT: ; %bb.7: ; GFX908-NEXT: v_mov_b32_e32 v2, s4 ; GFX908-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX908-NEXT: .LBB29_8: +; GFX908-NEXT: ; %bb.8: ; GFX908-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX908-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX908-NEXT: s_waitcnt lgkmcnt(0) @@ -8452,12 +8445,11 @@ define amdgpu_kernel void @local_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: ; implicit-def: $vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] -; GFX8-NEXT: s_cbranch_execz .LBB29_8 ; GFX8-NEXT: ; %bb.7: ; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: ds_add_rtn_f32 v2, v2, v1 -; GFX8-NEXT: .LBB29_8: +; 
GFX8-NEXT: ; %bb.8: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir index ab6f38c83b94e2..ee5546befd12d3 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-gpr-idx-mode.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. -# FIXME: -amdgpu-skip-threshold seems to be backwards. --- diff --git a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir index d154fc04b0cd91..20de119471ba39 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir +++ b/llvm/test/CodeGen/AMDGPU/remove-short-exec-branches-special-instructions.mir @@ -1,7 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s # Make sure mandatory skips are not removed around mode defs. -# FIXME: -amdgpu-skip-threshold seems to be backwards. 
--- diff --git a/llvm/test/CodeGen/AMDGPU/ret_jump.ll b/llvm/test/CodeGen/AMDGPU/ret_jump.ll index ad38d78ddb2ff1..66a55d9eb128c6 100644 --- a/llvm/test/CodeGen/AMDGPU/ret_jump.ll +++ b/llvm/test/CodeGen/AMDGPU/ret_jump.ll @@ -65,7 +65,6 @@ ret.bb: ; preds = %else, %main_body ; GCN: .LBB{{[0-9]+_[0-9]+}}: ; %else ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc -; GCN-NEXT: s_cbranch_execz .LBB1_{{[0-9]+}} ; GCN-NEXT: ; %unreachable.bb ; GCN: ds_write_b32 diff --git a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll index 1eef7b967f6d99..f232275c998d23 100644 --- a/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll +++ b/llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll @@ -122,14 +122,16 @@ define void @my_func(i32 %0) { ; GCN-NEXT: s_cbranch_execz .LBB0_8 ; GCN-NEXT: .LBB0_2: ; %Flow11 ; GCN-NEXT: s_and_saveexec_b64 s[8:9], s[6:7] +; GCN-NEXT: s_cbranch_execz .LBB0_4 ; GCN-NEXT: .LBB0_3: ; %do.body ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], exec ; GCN-NEXT: .LBB0_4: ; %Flow17 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] ; GCN-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_6 ; GCN-NEXT: ; %bb.5: ; %UnifiedUnreachableBlock ; GCN-NEXT: ; divergent unreachable -; GCN-NEXT: ; %bb.6: ; %UnifiedReturnBlock +; GCN-NEXT: .LBB0_6: ; %UnifiedReturnBlock ; GCN-NEXT: s_or_b64 exec, exec, s[6:7] ; GCN-NEXT: s_setpc_b64 s[30:31] ; GCN-NEXT: .LBB0_7: ; %Flow @@ -149,12 +151,14 @@ define void @my_func(i32 %0) { ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GCN-NEXT: s_cbranch_execz .LBB0_12 ; GCN-NEXT: ; %bb.11: ; %LeafBlock5 ; GCN-NEXT: s_mov_b64 s[6:7], exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 ; GCN-NEXT: s_and_b64 s[8:9], vcc, exec -; GCN-NEXT: ; %bb.12: ; %Flow13 +; GCN-NEXT: .LBB0_12: ; %Flow13 ; GCN-NEXT: s_andn2_saveexec_b64 s[10:11], s[4:5] +; GCN-NEXT: s_cbranch_execz 
.LBB0_14 ; GCN-NEXT: ; %bb.13: ; %LeafBlock3 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 @@ -164,7 +168,7 @@ define void @my_func(i32 %0) { ; GCN-NEXT: s_and_b64 s[12:13], vcc, exec ; GCN-NEXT: s_or_b64 s[6:7], s[6:7], s[4:5] ; GCN-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GCN-NEXT: ; %bb.14: ; %Flow14 +; GCN-NEXT: .LBB0_14: ; %Flow14 ; GCN-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-NEXT: s_mov_b64 s[4:5], 0 ; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[8:9] @@ -173,10 +177,11 @@ define void @my_func(i32 %0) { ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 ; GCN-NEXT: s_mov_b64 s[8:9], -1 ; GCN-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GCN-NEXT: s_cbranch_execz .LBB0_17 ; GCN-NEXT: ; %bb.16: ; %do.body.i.i.i.i ; GCN-NEXT: s_mov_b64 s[4:5], exec ; GCN-NEXT: s_xor_b64 s[8:9], exec, -1 -; GCN-NEXT: ; %bb.17: ; %Flow16 +; GCN-NEXT: .LBB0_17: ; %Flow16 ; GCN-NEXT: s_or_b64 exec, exec, s[12:13] ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_andn2_b64 s[6:7], s[6:7], exec diff --git a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir index 558c9d633127c9..5bc13617bea954 100644 --- a/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir +++ b/llvm/test/CodeGen/AMDGPU/skip-branch-taildup-ret.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -amdgpu-skip-threshold=1000000 -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -run-pass=si-pre-emit-peephole -o - %s | FileCheck %s --- name: skip_branch_taildup_endpgm