Skip to content

Commit

Permalink
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbabil…
Browse files Browse the repository at this point in the history
…ity and TargetSchedModel

Remove s_cbranch_execnz branches if the transformation is
profitable according to BranchProbability and TargetSchedModel.
  • Loading branch information
jmmartinez committed Sep 25, 2024
1 parent 808c498 commit d02e468
Show file tree
Hide file tree
Showing 26 changed files with 349 additions and 482 deletions.
111 changes: 98 additions & 13 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

Expand All @@ -41,7 +43,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineBasicBlock &From,
bool mustRetainExeczBranch(const MachineBasicBlock &Head,
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

Expand Down Expand Up @@ -304,11 +307,95 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
namespace {
/// Abstract interface for deciding whether an instruction may still be
/// executed unconditionally once a skip branch has been removed.
class CostModelBase {
public:
  virtual ~CostModelBase() = default;

  /// Accounts for \p MI and reports whether the transformation is still
  /// considered profitable.
  virtual bool isProfitable(const MachineInstr &MI) = 0;

  /// Builds the cost model to use for the edge from \p Head to \p Succ.
  static std::unique_ptr<CostModelBase> Create(const MachineBasicBlock &Head,
                                               const MachineBasicBlock &Succ,
                                               const SIInstrInfo &TII);
};

/// Cost model that does not depend on branch probabilities: it limits the
/// number of instructions seen and rejects instructions that are potentially
/// expensive even when EXEC is zero.
///
/// Instances are only created through CostModelBase::Create (hence the
/// private constructor and the friend declaration).
class TrivialCostModel : public CostModelBase {
  friend CostModelBase;

  /// Number of instructions passed to isProfitable() so far.
  unsigned NumInstr = 0;
  const SIInstrInfo &TII;

  explicit TrivialCostModel(const SIInstrInfo &TII) : TII(TII) {}

public:
  /// Counts \p MI and returns false once SkipThreshold instructions have been
  /// seen, or when \p MI is a memory or wait-count instruction.
  bool isProfitable(const MachineInstr &MI) override {
    ++NumInstr;
    if (NumInstr >= SkipThreshold)
      return false;
    // These instructions are potentially expensive even if EXEC = 0.
    if (TII.isSMRD(MI) || TII.isVMEM(MI) || TII.isFLAT(MI) || TII.isDS(MI) ||
        TII.isWaitcnt(MI.getOpcode()))
      return false;
    return true;
  }
  ~TrivialCostModel() override = default;
};

class BranchWeightCostModel : public CostModelBase {
friend CostModelBase;

BranchProbability BranchProb;
const TargetSchedModel &SchedModel;
uint64_t BranchCost;
uint64_t ThenCyclesCost = 0;

BranchWeightCostModel(const MachineInstr &Branch, const BranchProbability &BP,
const TargetSchedModel &SchedModel)
: BranchProb(BP), SchedModel(SchedModel) {
assert(!BP.isUnknown());
BranchCost = SchedModel.computeInstrLatency(&Branch, false);
}

public:
bool isProfitable(const MachineInstr &MI) override {
ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);

// Consider `P = N/D` to be the probability of execnz being true
// The transformation is profitable if always executing the 'then' block
// is cheaper than executing sometimes 'then' and always
// executing s_cbranch_execnz:
// * ThenCost <= P*ThenCost + BranchCost
// * (1-P) * ThenCost <= BranchCost
// * (D-N)/D * ThenCost <= BranchCost
uint64_t Numerator = BranchProb.getNumerator();
uint64_t Denominator = BranchProb.getDenominator();
return (Denominator - Numerator) * ThenCyclesCost <=
Denominator * BranchCost;
}
~BranchWeightCostModel() override = default;
};

std::unique_ptr<CostModelBase>
CostModelBase::Create(const MachineBasicBlock &Head,
const MachineBasicBlock &Succ, const SIInstrInfo &TII) {
const auto *FromIt = find(Head.successors(), &Succ);
assert(FromIt != Head.succ_end());

BranchProbability ExecNZProb = Head.getSuccProbability(FromIt);
const auto &SchedModel = TII.getSchedModel();
if (!ExecNZProb.isUnknown()) {
return std::unique_ptr<CostModelBase>(new BranchWeightCostModel(
*Head.getFirstTerminator(), ExecNZProb, SchedModel));
}

return std::unique_ptr<CostModelBase>(new TrivialCostModel(TII));
}

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineBasicBlock &Head, const MachineBasicBlock &From,
const MachineBasicBlock &To) const {

auto CostModel = CostModelBase::Create(Head, From, *TII);

const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;
Expand All @@ -326,19 +413,14 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;

// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
return true;

++NumInstr;
if (NumInstr >= SkipThreshold)
if (!CostModel->isProfitable(MI))
return true;
}
}

return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
Expand All @@ -351,8 +433,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return false;

// Consider only the forward branches.
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
return false;

// Consider only when it is legal and profitable
if (mustRetainExeczBranch(SrcMBB, *FalseMBB, *TrueMBB))
return false;

LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
Expand Down
18 changes: 6 additions & 12 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1726,7 +1726,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB59_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1736,7 +1735,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB59_2:
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
Expand All @@ -1747,7 +1746,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB59_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1757,7 +1755,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
; GFX940-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: .LBB59_2:
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
Expand All @@ -1773,7 +1771,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB60_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1783,7 +1780,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB60_2:
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
Expand All @@ -1794,7 +1791,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB60_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1804,7 +1800,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
; GFX940-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: .LBB60_2:
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
Expand All @@ -1820,7 +1816,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX90A-NEXT: s_cbranch_execz .LBB61_2
; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1830,7 +1825,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX90A-NEXT: v_mov_b32_e32 v2, s2
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: .LBB61_2:
; GFX90A-NEXT: ; %bb.2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
Expand All @@ -1841,7 +1836,6 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX940-NEXT: s_cbranch_execz .LBB61_2
; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s2, s[2:3], 0x24
; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
Expand All @@ -1851,7 +1845,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: .LBB61_2:
; GFX940-NEXT: ; %bb.2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst, !amdgpu.no.fine.grained.memory !0
Expand Down
12 changes: 4 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -526,21 +526,19 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX10-NEXT: s_cbranch_execz .LBB10_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2]
; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX10-NEXT: .LBB10_2: ; %Flow
; GFX10-NEXT: ; %bb.2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0
; GFX10-NEXT: s_cbranch_execz .LBB10_4
; GFX10-NEXT: ; %bb.3: ; %if
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: .LBB10_4: ; %endif
; GFX10-NEXT: ; %bb.4: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
Expand All @@ -563,7 +561,6 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX11-NEXT: s_cbranch_execz .LBB10_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
Expand All @@ -572,14 +569,13 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX11-NEXT: v_mov_b32_e32 v1, v3
; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX11-NEXT: .LBB10_2: ; %Flow
; GFX11-NEXT: ; %bb.2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX11-NEXT: s_cbranch_execz .LBB10_4
; GFX11-NEXT: ; %bb.3: ; %if
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: .LBB10_4: ; %endif
; GFX11-NEXT: ; %bb.4: ; %endif
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %if.then
; GFX9-NEXT: s_mov_b32 s11, s18
; GFX9-NEXT: s_mov_b32 s10, s17
Expand All @@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s19
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX9-NEXT: .LBB5_2: ; %if.end
; GFX9-NEXT: ; %bb.2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
; GFX1010-NEXT: ; %bb.1: ; %if.then
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_mov_b32 s9, s16
; GFX1010-NEXT: s_mov_b32 s8, s7
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1010-NEXT: .LBB5_2: ; %if.end
; GFX1010-NEXT: ; %bb.2: ; %if.end
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand All @@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
; GFX1030-NEXT: ; %bb.1: ; %if.then
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_mov_b32 s9, s16
; GFX1030-NEXT: s_mov_b32 s8, s7
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1030-NEXT: .LBB5_2: ; %if.end
; GFX1030-NEXT: ; %bb.2: ; %if.end
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_setpc_b64 s[30:31]
Expand Down
Loading

0 comments on commit d02e468

Please sign in to comment.