-
Notifications
You must be signed in to change notification settings - Fork 12.6k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[AMDGPU] Add intrinsic for raw atomic buffer loads (#97707)
Upstream the intrinsics `llvm.amdgcn.raw.atomic.buffer.load` and `llvm.amdgcn.raw.atomic.ptr.buffer.load`. These additional intrinsics mark atomic buffer loads as atomic to LLVM by removing the `IntrReadMem` attribute. Otherwise, LLVM could treat these loads as loop-invariant and hoist them out of loops, which can cause issues such as infinite loops. Continuation of https://reviews.llvm.org/D138786 with the additional use in the fat buffer lowering, more test cases and the additional ptr versions of these intrinsics. --------- Co-authored-by: rtayl <> Co-authored-by: Jay Foad <jay.foad@amd.com> Co-authored-by: Mariusz Sikora <mariusz.sikora@amd.com>
- Loading branch information
1 parent
4010ddf
commit ec7f8e1
Showing
8 changed files
with
654 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
304 changes: 304 additions & 0 deletions
304
llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,304 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=0 | FileCheck %s -check-prefix=CHECK | ||
; RUN: llc < %s -march=amdgcn -mcpu=gfx1100 -global-isel=1 | FileCheck %s -check-prefix=CHECK | ||
|
||
; Baseline case: a loop that repeats while an i32 raw atomic buffer load | ||
; (both offset operands 0, aux = 1) equals the workitem id. | ||
; The checks expect the buffer_load_b32 (with glc) to sit inside the loop | ||
; body .LBB0_1, i.e. the load is re-issued on every iteration. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_i32: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB0_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB0_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 0, i32 0, i32 1) | ||
%cmp = icmp eq i32 %load, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; Offset case: same loop as the baseline i32 test, but the atomic load uses a | ||
; constant offset of 4 (soffset 0, aux = 1). | ||
; The checks expect the offset to appear as the immediate "offset:4" on a | ||
; buffer_load_b32 that stays inside the loop body .LBB1_1. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_i32_off: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB1_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB1_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%cmp = icmp eq i32 %load, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
; Soffset case: the atomic i32 load passes 4 for both offset operands (aux = 1). | ||
; The checks expect the instruction to carry 4 in the soffset position plus the | ||
; immediate "offset:4", with the load kept inside the loop body .LBB2_1. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_i32_soff: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB2_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 4 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB2_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 4, i32 1) | ||
%cmp = icmp eq i32 %load, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
; Aux-bits case: the atomic i32 load uses offset 4, soffset 0 and aux = 4. | ||
; The checks expect the instruction to be printed with "dlc" (instead of the | ||
; "glc" seen with aux = 1) and to remain inside the loop body .LBB3_1. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_i32_dlc: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB3_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 dlc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB3_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 4) | ||
%cmp = icmp eq i32 %load, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; Contrast case: identical loop shape, but using the plain (non-atomic) | ||
; llvm.amdgcn.raw.buffer.load intrinsic with offset 4, soffset 0, aux = 1. | ||
; Here the checks expect the buffer_load_b32 and the compare to be emitted | ||
; before the loop label .LBB4_1, leaving only exec-mask updates in the loop. | ||
define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_nonatomic_buffer_load_i32: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_mov_b32 s0, 0 | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: .LBB4_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_and_b32 s1, exec_lo, vcc_lo | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_or_b32 s0, s1, s0 | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB4_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%cmp = icmp eq i32 %load, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; 64-bit case: the atomic load returns i64 (offset 4, soffset 0, aux = 1) and is | ||
; compared against the workitem id zero-extended to i64. | ||
; The checks expect a buffer_load_b64 inside the loop body .LBB5_1 followed by | ||
; a 64-bit compare against v[0:1], whose high half is set to 0 before the loop. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_i64: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: v_mov_b32_e32 v1, 0 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB5_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b64 v[2:3], off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[2:3], v[0:1] | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB5_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
%id.zext = zext i32 %id to i64 | ||
br label %bb1 | ||
bb1: | ||
%load = call i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%cmp = icmp eq i64 %load, %id.zext | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; Packed 16-bit case: the atomic load returns <2 x i16> (both offset operands 0, | ||
; aux = 1); the result is bitcast to i32 and compared with the workitem id. | ||
; The checks expect a single buffer_load_b32 inside the loop body .LBB6_1 with | ||
; no extra repacking instructions before the compare. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_v2i16: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB6_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b32 v1, off, s[0:3], 0 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB6_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32> %addr, i32 0, i32 0, i32 1) | ||
%bitcast = bitcast <2 x i16> %load to i32 | ||
%cmp = icmp eq i32 %bitcast, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; <4 x i16> case: the atomic load (offset 4, soffset 0, aux = 1) returns four | ||
; 16-bit lanes; lanes 0 and 2 are shuffled into a <2 x i16>, bitcast to i32 and | ||
; compared with the workitem id. | ||
; The checks expect a buffer_load_b64 inside the loop body .LBB7_1, followed by | ||
; v_and_b32 / v_lshl_or_b32 combining the low 16 bits of v1 with v2 shifted | ||
; left by 16 before the compare. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_v4i16: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB7_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v1 | ||
; CHECK-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; CHECK-NEXT: v_lshl_or_b32 v1, v2, 16, v1 | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB7_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%shortened = shufflevector <4 x i16> %load, <4 x i16> poison, <2 x i32> <i32 0, i32 2> | ||
%bitcast = bitcast <2 x i16> %shortened to i32 | ||
%cmp = icmp eq i32 %bitcast, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; <4 x i32> case: the atomic load (offset 4, soffset 0, aux = 1) returns four | ||
; dwords; only element 3 is extracted and compared with the workitem id. | ||
; The checks expect a full buffer_load_b128 into v[1:4] inside the loop body | ||
; .LBB8_1, with the compare reading v4. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_v4i32: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB8_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b128 v[1:4], off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v4, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB8_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%extracted = extractelement <4 x i32> %load, i32 3 | ||
%cmp = icmp eq i32 %extracted, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; Pointer case: the atomic load (offset 4, soffset 0, aux = 1) returns a ptr, | ||
; which is then dereferenced as i32 and compared with the workitem id. | ||
; The checks expect a buffer_load_b64 for the pointer and a flat_load_b32 | ||
; through it, both inside the loop body .LBB9_1. | ||
define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) { | ||
; CHECK-LABEL: raw_atomic_buffer_load_ptr: | ||
; CHECK: ; %bb.0: ; %bb | ||
; CHECK-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; CHECK-NEXT: s_mov_b32 s4, 0 | ||
; CHECK-NEXT: .LBB9_1: ; %bb1 | ||
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; CHECK-NEXT: s_waitcnt lgkmcnt(0) | ||
; CHECK-NEXT: buffer_load_b64 v[1:2], off, s[0:3], 0 offset:4 glc | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) | ||
; CHECK-NEXT: flat_load_b32 v1, v[1:2] | ||
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
; CHECK-NEXT: v_cmp_ne_u32_e32 vcc_lo, v1, v0 | ||
; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 | ||
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; CHECK-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4 | ||
; CHECK-NEXT: s_cbranch_execnz .LBB9_1 | ||
; CHECK-NEXT: ; %bb.2: ; %bb2 | ||
; CHECK-NEXT: s_endpgm | ||
bb: | ||
%id = tail call i32 @llvm.amdgcn.workitem.id.x() | ||
br label %bb1 | ||
bb1: | ||
%load = call ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32> %addr, i32 4, i32 0, i32 1) | ||
%elem = load i32, ptr %load | ||
%cmp = icmp eq i32 %elem, %id | ||
br i1 %cmp, label %bb1, label %bb2 | ||
bb2: | ||
ret void | ||
} | ||
|
||
; Declarations of the intrinsics called by the kernels in this file: one | ||
; raw.atomic.buffer.load overload per tested result type, the plain | ||
; raw.buffer.load used by the non-atomic contrast test, and workitem.id.x. | ||
; The last operand of each buffer load is an immediate (immarg). | ||
; Per the commit description, the raw.atomic.buffer.load intrinsics are defined | ||
; without IntrReadMem, so they should not be annotated as readonly here. | ||
declare i32 @llvm.amdgcn.raw.atomic.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) | ||
declare i64 @llvm.amdgcn.raw.atomic.buffer.load.i64(<4 x i32>, i32, i32, i32 immarg) | ||
declare <2 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v2i16(<4 x i32>, i32, i32, i32 immarg) | ||
declare <4 x i16> @llvm.amdgcn.raw.atomic.buffer.load.v4i16(<4 x i32>, i32, i32, i32 immarg) | ||
declare <4 x i32> @llvm.amdgcn.raw.atomic.buffer.load.v4i32(<4 x i32>, i32, i32, i32 immarg) | ||
declare ptr @llvm.amdgcn.raw.atomic.buffer.load.ptr(<4 x i32>, i32, i32, i32 immarg) | ||
declare i32 @llvm.amdgcn.raw.buffer.load.i32(<4 x i32>, i32, i32, i32 immarg) | ||
declare i32 @llvm.amdgcn.workitem.id.x() |
Oops, something went wrong.