Skip to content

Commit

Permalink
[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbabil…
Browse files Browse the repository at this point in the history
…ity and TargetSchedModel

Remove s_cbranch_execz branches if the transformation is
profitable according to BranchProbability and TargetSchedModel.
  • Loading branch information
jmmartinez committed Oct 10, 2024
1 parent e7ff08e commit 72e6e20
Show file tree
Hide file tree
Showing 22 changed files with 150 additions and 197 deletions.
89 changes: 68 additions & 21 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,13 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
"amdgpu-skip-threshold", cl::Hidden,
cl::desc(
"Number of instructions before jumping over divergent control flow"),
cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
Expand All @@ -41,7 +35,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineBasicBlock &From,
bool mustRetainExeczBranch(const MachineInstr &Branch,
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

Expand Down Expand Up @@ -304,11 +299,61 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}

namespace {
/// Incremental cost model used to decide whether a conditional skip branch
/// is worth keeping or whether always executing the skipped-over code is
/// cheaper.
///
/// The model is seeded with the branch instruction and one successor of the
/// branch's parent block. Instructions of the would-be-skipped region are
/// then fed one at a time to isProfitable(), which accumulates their latency
/// and re-evaluates the decision after each one.
class BranchWeightCostModel {
  const SIInstrInfo &TII;
  const TargetSchedModel &SchedModel;
  /// Probability of the edge from the branch's block to the successor passed
  /// to the constructor; an unknown probability is replaced by zero.
  BranchProbability BranchProb;
  /// Fixed cost charged when the branch is executed but not taken.
  static constexpr uint64_t BranchNotTakenCost = 1;
  /// Latency of the branch instruction as reported by the scheduling model.
  uint64_t BranchTakenCost;
  /// Running sum of the latencies of every instruction passed to
  /// isProfitable() so far.
  uint64_t ThenCyclesCost = 0;

public:
  /// \param TII    Instruction info; also supplies the scheduling model.
  /// \param Branch The branch instruction being considered for removal.
  /// \param Succ   A successor of Branch's parent block. The probability of
  ///               the edge to this block is the `P` used by isProfitable().
  BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
                        const MachineBasicBlock &Succ)
      : TII(TII), SchedModel(TII.getSchedModel()) {
    const MachineBasicBlock &Head = *Branch.getParent();
    const auto *FromIt = find(Head.successors(), &Succ);
    assert(FromIt != Head.succ_end() &&
           "Succ must be a successor of the branch's parent block");

    BranchProb = Head.getSuccProbability(FromIt);
    // Without probability information, evaluate the model with P = 0.
    if (BranchProb.isUnknown())
      BranchProb = BranchProbability::getZero();
    BranchTakenCost = SchedModel.computeInstrLatency(&Branch, false);
  }

  /// Accounts for \p MI and reports whether removing the branch is still
  /// considered profitable.
  ///
  /// NOTE: this call is stateful. Every invocation that gets past the waitcnt
  /// check adds the latency of \p MI to the accumulated cost, so each
  /// instruction should be passed exactly once.
  ///
  /// \returns false if \p MI is a waitcnt instruction or if the accumulated
  ///          cost exceeds the modelled cost of keeping the branch; true
  ///          otherwise.
  bool isProfitable(const MachineInstr &MI) {
    if (TII.isWaitcnt(MI.getOpcode()))
      return false;

    ThenCyclesCost += SchedModel.computeInstrLatency(&MI, false);

    // Let `P = N/D` be the probability of the Head -> Succ edge captured by
    // the constructor. The formula below treats it as the probability that
    // the 'then' block is entered (i.e. the branch is NOT taken).
    // The transformation is profitable if always executing the 'then' block
    // is cheaper than executing sometimes 'then' and always executing the
    // branch:
    // * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
    // * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost
    //                         + N/D * BranchNotTakenCost
    // Multiplying both sides by D keeps the comparison in integer arithmetic.
    const uint64_t Numerator = BranchProb.getNumerator();
    const uint64_t Denominator = BranchProb.getDenominator();
    const uint64_t SkipWeight = Denominator - Numerator; // D - N
    return SkipWeight * ThenCyclesCost <=
           SkipWeight * BranchTakenCost + Numerator * BranchNotTakenCost;
  }
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();
const MachineInstr &Branch, const MachineBasicBlock &From,
const MachineBasicBlock &To) const {

const MachineBasicBlock &Head = *Branch.getParent();
assert(is_contained(Head.successors(), &From));

BranchWeightCostModel CostModel{*TII, Branch, From};

const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;
Expand All @@ -326,23 +371,22 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;

// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
return true;

++NumInstr;
if (NumInstr >= SkipThreshold)
if (!CostModel.isProfitable(MI))
return true;
}
}

return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {

if (!TII->getSchedModel().hasInstrSchedModelOrItineraries())
return false;

MachineBasicBlock *TrueMBB = nullptr;
MachineBasicBlock *FalseMBB = nullptr;
SmallVector<MachineOperand, 1> Cond;
Expand All @@ -351,8 +395,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return false;

// Consider only the forward branches.
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
return false;

// Consider only when it is legal and profitable
if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
return false;

LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %if.then
; GFX9-NEXT: s_mov_b32 s11, s18
; GFX9-NEXT: s_mov_b32 s10, s17
Expand All @@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s19
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX9-NEXT: .LBB5_2: ; %if.end
; GFX9-NEXT: ; %bb.2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
; GFX1010-NEXT: ; %bb.1: ; %if.then
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_mov_b32 s9, s16
; GFX1010-NEXT: s_mov_b32 s8, s7
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1010-NEXT: .LBB5_2: ; %if.end
; GFX1010-NEXT: ; %bb.2: ; %if.end
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand All @@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
; GFX1030-NEXT: ; %bb.1: ; %if.then
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_mov_b32 s9, s16
; GFX1030-NEXT: s_mov_b32 s8, s7
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1030-NEXT: .LBB5_2: ; %if.end
; GFX1030-NEXT: ; %bb.2: ; %if.end
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_setpc_b64 s[30:31]
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v0, v0
; GCN-NEXT: ; divergent unreachable
; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock
; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/else.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ end:
; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: s_cbranch_execz
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
main_body:
%cc = icmp sgt i32 %z, 5
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: .LBB6_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB6_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
Expand All @@ -1589,7 +1590,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: ; %bb.6: ; %Flow1
; SDAG-NEXT: .LBB6_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB6_7: ; %Flow2
; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
Expand Down Expand Up @@ -1929,6 +1930,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: .LBB7_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB7_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
Expand All @@ -1940,7 +1942,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: ; %bb.6: ; %Flow1
; SDAG-NEXT: .LBB7_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB7_7: ; %Flow2
; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid)
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; SI-NEXT: s_mov_b64 s[8:9], -1
; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc
; SI-NEXT: s_cbranch_execz .LBB0_6
; SI-NEXT: ; %bb.5: ; %end.loop
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_add_i32 s14, s14, 1
; SI-NEXT: s_xor_b64 s[8:9], exec, -1
; SI-NEXT: ; %bb.6: ; %Flow1
; SI-NEXT: .LBB0_6: ; %Flow1
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
; SI-NEXT: s_branch .LBB0_2
Expand Down
12 changes: 3 additions & 9 deletions llvm/test/CodeGen/AMDGPU/insert-handle-flat-vmem-ds.mir
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,7 @@ name: skip_execz_flat
body: |
; CHECK-LABEL: name: skip_execz_flat
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down Expand Up @@ -38,9 +36,7 @@ name: skip_execz_mubuf
body: |
; CHECK-LABEL: name: skip_execz_mubuf
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down Expand Up @@ -69,9 +65,7 @@ name: skip_execz_ds
body: |
; CHECK-LABEL: name: skip_execz_ds
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x7fffffff), %bb.2(0x00000001)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand Down
95 changes: 0 additions & 95 deletions llvm/test/CodeGen/AMDGPU/insert-skips-flat-vmem-ds.mir

This file was deleted.

2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s

---
name: skip_waitcnt_vscnt
Expand Down
Loading

0 comments on commit 72e6e20

Please sign in to comment.