Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[AMDGPU][SIPreEmitPeephole] mustRetainExeczBranch: use BranchProbability and TargetSchedmodel #109818

Merged
merged 2 commits into from
Oct 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 68 additions & 21 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,13 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/BranchProbability.h"

using namespace llvm;

#define DEBUG_TYPE "si-pre-emit-peephole"

static unsigned SkipThreshold;

static cl::opt<unsigned, true> SkipThresholdFlag(
"amdgpu-skip-threshold", cl::Hidden,
cl::desc(
"Number of instructions before jumping over divergent control flow"),
cl::location(SkipThreshold), cl::init(12));

namespace {

class SIPreEmitPeephole : public MachineFunctionPass {
Expand All @@ -41,7 +35,8 @@ class SIPreEmitPeephole : public MachineFunctionPass {
MachineBasicBlock *&TrueMBB,
MachineBasicBlock *&FalseMBB,
SmallVectorImpl<MachineOperand> &Cond);
bool mustRetainExeczBranch(const MachineBasicBlock &From,
bool mustRetainExeczBranch(const MachineInstr &Branch,
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);

Expand Down Expand Up @@ -304,11 +299,61 @@ bool SIPreEmitPeephole::getBlockDestinations(
return true;
}

namespace {
class BranchWeightCostModel {
const SIInstrInfo &TII;
const TargetSchedModel &SchedModel;
BranchProbability BranchProb;
static constexpr uint64_t BranchNotTakenCost = 1;
uint64_t BranchTakenCost;
uint64_t ThenCyclesCost = 0;

public:
BranchWeightCostModel(const SIInstrInfo &TII, const MachineInstr &Branch,
const MachineBasicBlock &Succ)
: TII(TII), SchedModel(TII.getSchedModel()) {
const MachineBasicBlock &Head = *Branch.getParent();
const auto *FromIt = find(Head.successors(), &Succ);
assert(FromIt != Head.succ_end());

BranchProb = Head.getSuccProbability(FromIt);
if (BranchProb.isUnknown())
BranchProb = BranchProbability::getZero();
BranchTakenCost = SchedModel.computeInstrLatency(&Branch);
}

bool isProfitable(const MachineInstr &MI) {
if (TII.isWaitcnt(MI.getOpcode()))
return false;

ThenCyclesCost += SchedModel.computeInstrLatency(&MI);

// Consider `P = N/D` to be the probability of execz being false (skipping
// the then-block) The transformation is profitable if always executing the
// 'then' block is cheaper than executing sometimes 'then' and always
// executing s_cbranch_execz:
// * ThenCost <= P*ThenCost + (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (1-P) * ThenCost <= (1-P)*BranchTakenCost + P*BranchNotTakenCost
// * (D-N)/D * ThenCost <= (D-N)/D * BranchTakenCost + N/D *
// BranchNotTakenCost
uint64_t Numerator = BranchProb.getNumerator();
uint64_t Denominator = BranchProb.getDenominator();
return (Denominator - Numerator) * ThenCyclesCost <=
((Denominator - Numerator) * BranchTakenCost +
Numerator * BranchNotTakenCost);
}
};

bool SIPreEmitPeephole::mustRetainExeczBranch(
const MachineBasicBlock &From, const MachineBasicBlock &To) const {
unsigned NumInstr = 0;
const MachineFunction *MF = From.getParent();
const MachineInstr &Branch, const MachineBasicBlock &From,
const MachineBasicBlock &To) const {

const MachineBasicBlock &Head = *Branch.getParent();
assert(is_contained(Head.successors(), &From));

BranchWeightCostModel CostModel{*TII, Branch, From};

const MachineFunction *MF = From.getParent();
for (MachineFunction::const_iterator MBBI(&From), ToI(&To), End = MF->end();
MBBI != End && MBBI != ToI; ++MBBI) {
const MachineBasicBlock &MBB = *MBBI;
Expand All @@ -326,23 +371,22 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;

// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
return true;

++NumInstr;
if (NumInstr >= SkipThreshold)
if (!CostModel.isProfitable(MI))
return true;
}
}

return false;
}
} // namespace

// Returns true if the skip branch instruction is removed.
bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {

if (!TII->getSchedModel().hasInstrSchedModel())
return false;

MachineBasicBlock *TrueMBB = nullptr;
MachineBasicBlock *FalseMBB = nullptr;
SmallVector<MachineOperand, 1> Cond;
Expand All @@ -351,8 +395,11 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
return false;

// Consider only the forward branches.
if ((SrcMBB.getNumber() >= TrueMBB->getNumber()) ||
mustRetainExeczBranch(*FalseMBB, *TrueMBB))
if (SrcMBB.getNumber() >= TrueMBB->getNumber())
return false;

// Consider only when it is legal and profitable
if (mustRetainExeczBranch(MI, *FalseMBB, *TrueMBB))
return false;

LLVM_DEBUG(dbgs() << "Removing the execz branch: " << MI);
Expand Down
9 changes: 3 additions & 6 deletions llvm/test/CodeGen/AMDGPU/amdgpu-demote-scc-branches.ll
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0
; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
; GFX9-NEXT: s_cbranch_execz .LBB5_2
; GFX9-NEXT: ; %bb.1: ; %if.then
; GFX9-NEXT: s_mov_b32 s11, s18
; GFX9-NEXT: s_mov_b32 s10, s17
Expand All @@ -301,7 +300,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX9-NEXT: v_mov_b32_e32 v0, s6
; GFX9-NEXT: v_mov_b32_e32 v1, s19
; GFX9-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX9-NEXT: .LBB5_2: ; %if.end
; GFX9-NEXT: ; %bb.2: ; %if.end
; GFX9-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: s_setpc_b64 s[30:31]
Expand All @@ -311,7 +310,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v0
; GFX1010-NEXT: s_and_saveexec_b32 s4, vcc_lo
; GFX1010-NEXT: s_cbranch_execz .LBB5_2
; GFX1010-NEXT: ; %bb.1: ; %if.then
; GFX1010-NEXT: v_mov_b32_e32 v0, s6
; GFX1010-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -320,7 +318,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1010-NEXT: s_mov_b32 s9, s16
; GFX1010-NEXT: s_mov_b32 s8, s7
; GFX1010-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1010-NEXT: .LBB5_2: ; %if.end
; GFX1010-NEXT: ; %bb.2: ; %if.end
; GFX1010-NEXT: s_waitcnt_depctr 0xffe3
; GFX1010-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1010-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
Expand All @@ -331,7 +329,6 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_mov_b32 s4, exec_lo
; GFX1030-NEXT: v_cmpx_lt_i32_e32 0, v0
; GFX1030-NEXT: s_cbranch_execz .LBB5_2
; GFX1030-NEXT: ; %bb.1: ; %if.then
; GFX1030-NEXT: v_mov_b32_e32 v0, s6
; GFX1030-NEXT: v_mov_b32_e32 v1, s19
Expand All @@ -340,7 +337,7 @@ define void @divergent_br_profitable(i32 noundef inreg %value, ptr addrspace(8)
; GFX1030-NEXT: s_mov_b32 s9, s16
; GFX1030-NEXT: s_mov_b32 s8, s7
; GFX1030-NEXT: buffer_store_dword v0, v1, s[8:11], 0 offen
; GFX1030-NEXT: .LBB5_2: ; %if.end
; GFX1030-NEXT: ; %bb.2: ; %if.end
; GFX1030-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GFX1030-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX1030-NEXT: s_setpc_b64 s[30:31]
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/AMDGPU/branch-condition-and.ll
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,12 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1
; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1]
; GCN-NEXT: s_cbranch_execz .LBB0_2
; GCN-NEXT: ; %bb.1: ; %bb4
; GCN-NEXT: v_mov_b32_e32 v0, 4
; GCN-NEXT: s_mov_b32 m0, -1
; GCN-NEXT: ds_write_b32 v0, v0
; GCN-NEXT: ; divergent unreachable
; GCN-NEXT: .LBB0_2: ; %UnifiedReturnBlock
; GCN-NEXT: ; %bb.2: ; %UnifiedReturnBlock
; GCN-NEXT: s_endpgm
bb:
%tmp = fcmp ogt float %arg, 0.000000e+00
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/AMDGPU/else.ll
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ end:
; CHECK-NEXT: s_and_b64 exec, exec, [[INIT_EXEC]]
; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
; CHECK-NEXT: s_cbranch_execz
define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
main_body:
%cc = icmp sgt i32 %z, 5
Expand Down
6 changes: 4 additions & 2 deletions llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1578,6 +1578,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: .LBB6_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB6_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
Expand All @@ -1589,7 +1590,7 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: ; %bb.6: ; %Flow1
; SDAG-NEXT: .LBB6_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB6_7: ; %Flow2
; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
Expand Down Expand Up @@ -1929,6 +1930,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
; SDAG-NEXT: .LBB7_4: ; %Flow
; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
; SDAG-NEXT: s_cbranch_execz .LBB7_6
; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5
; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
Expand All @@ -1940,7 +1942,7 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1
; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8
; SDAG-NEXT: v_mov_b32_e32 v3, v2
; SDAG-NEXT: ; %bb.6: ; %Flow1
; SDAG-NEXT: .LBB7_6: ; %Flow1
; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
; SDAG-NEXT: .LBB7_7: ; %Flow2
; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
Expand Down
3 changes: 2 additions & 1 deletion llvm/test/CodeGen/AMDGPU/i1-copy-from-loop.ll
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,12 @@ define amdgpu_ps void @i1_copy_from_loop(ptr addrspace(8) inreg %rsrc, i32 %tid)
; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1
; SI-NEXT: s_mov_b64 s[8:9], -1
; SI-NEXT: s_and_saveexec_b64 s[12:13], vcc
; SI-NEXT: s_cbranch_execz .LBB0_6
; SI-NEXT: ; %bb.5: ; %end.loop
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_add_i32 s14, s14, 1
; SI-NEXT: s_xor_b64 s[8:9], exec, -1
; SI-NEXT: ; %bb.6: ; %Flow1
; SI-NEXT: .LBB0_6: ; %Flow1
; SI-NEXT: ; in Loop: Header=BB0_3 Depth=1
; SI-NEXT: s_or_b64 exec, exec, s[12:13]
; SI-NEXT: s_branch .LBB0_2
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=polaris10 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s

---

name: skip_execz_flat
body: |
; CHECK-LABEL: name: skip_execz_flat
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand All @@ -20,7 +18,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
successors: %bb.1(0x70000000), %bb.2(0x00000001)
S_CBRANCH_EXECZ %bb.2, implicit $exec

bb.1:
Expand All @@ -38,9 +36,7 @@ name: skip_execz_mubuf
body: |
; CHECK-LABEL: name: skip_execz_mubuf
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand All @@ -51,7 +47,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
successors: %bb.1(0x70000000), %bb.2(0x00000001)
S_CBRANCH_EXECZ %bb.2, implicit $exec

bb.1:
Expand All @@ -69,9 +65,7 @@ name: skip_execz_ds
body: |
; CHECK-LABEL: name: skip_execz_ds
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
; CHECK-NEXT: successors: %bb.1(0x7fffffff)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
Expand All @@ -82,7 +76,7 @@ body: |
; CHECK-NEXT: bb.2:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2
successors: %bb.1(0x70000000), %bb.2(0x00000001)
S_CBRANCH_EXECZ %bb.2, implicit $exec

bb.1:
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/insert-skips-gfx10.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s

---
name: skip_waitcnt_vscnt
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/insert-skips-gfx12.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s

---
name: skip_wait_loadcnt
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AMDGPU/insert-skips-gws.mir
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=1 -verify-machineinstrs %s -o - | FileCheck %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass si-pre-emit-peephole -verify-machineinstrs %s -o - | FileCheck %s
# Make sure mandatory skips are inserted to ensure GWS ops aren't run with exec = 0

---
Expand Down
30 changes: 26 additions & 4 deletions llvm/test/CodeGen/AMDGPU/insert-skips-ignored-insts.mir
Original file line number Diff line number Diff line change
@@ -1,12 +1,34 @@
# RUN: llc -mtriple=amdgcn-amd-amdhsa -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=3 %s -o - | FileCheck %s
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass si-pre-emit-peephole %s -o - | FileCheck %s

---

# CHECK-LABEL: name: no_count_dbg_value
# CHECK: $vgpr1 = V_MOV_B32_e32 7, implicit $exec
# CHECK-NOT: S_CBRANCH_EXECZ
name: no_count_dbg_value
body: |
; CHECK-LABEL: name: no_count_dbg_value
; CHECK: bb.0:
; CHECK-NEXT: successors: %bb.1(0x40000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 7, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
; CHECK-NEXT: successors: %bb.2(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: DBG_VALUE
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.2:
; CHECK-NEXT: successors: %bb.3(0x80000000)
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.3:
; CHECK-NEXT: S_ENDPGM 0
bb.0:
successors: %bb.1, %bb.2

Expand Down
Loading
Loading