From 8eb6914b7ec55125897386f42dd5e21df703f575 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 8 Jun 2024 09:52:14 +0200 Subject: [PATCH] AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat Unlike the existing fadd cases, choose to ignore the requirement for amdgpu-unsafe-fp-atomics in case of fine-grained memory access. This is to minimize migration pain to the new atomic control metadata. This should not break any users, as the atomic intrinsics are still directly consumed, and clang does not yet produce vector FP atomicrmw. --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 + llvm/lib/Target/AMDGPU/FLATInstructions.td | 6 +- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +- .../AMDGPU/GlobalISel/fp-atomics-gfx940.ll | 62 +- .../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 908 +------- llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll | 1374 +----------- .../AMDGPU/expand-atomic-rmw-fadd.ll | 1928 +++++++++++++---- .../AMDGPU/expand-atomic-v2bf16-agent.ll | 1540 +++++++++++-- .../AMDGPU/expand-atomic-v2bf16-system.ll | 1540 +++++++++++-- .../AMDGPU/expand-atomic-v2f16-agent.ll | 1398 ++++++++++-- .../AMDGPU/expand-atomic-v2f16-system.ll | 1398 ++++++++++-- .../AMDGPU/expand-atomicrmw-fp-vector.ll | 102 +- 12 files changed, 6787 insertions(+), 3505 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 0234cd9088ae75..19f19eb7b7f44b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }); } + if (ST.hasAtomicBufferGlobalPkAddF16Insts()) + Atomic.legalFor({{V2F16, GlobalPtr}}); + if (ST.hasAtomicGlobalPkAddBF16Inst()) + Atomic.legalFor({{V2BF16, GlobalPtr}}); + if (ST.hasAtomicFlatPkAdd16Insts()) + Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}}); + // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 818cbde5924324..99e3a8084f067e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in { defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>; defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>; +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>; } let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in { @@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat } let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in { +// FIXME: These do not have signed offsets defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>; defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>; +defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>; } let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>; - +defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>; } // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6325e75aa8f961..f59c7a73b18af8 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15941,6 +15941,16 @@ static bool isHalf2OrBFloat2(Type *Ty) { return false; } +static bool isHalf2(Type *Ty) { + FixedVectorType *VT = dyn_cast(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy(); +} + +static bool isBFloat2(Type *Ty) { + FixedVectorType *VT = dyn_cast(Ty); + return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy(); +} + TargetLowering::AtomicExpansionKind SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { unsigned AS = RMW->getPointerAddressSpace(); @@ -16009,10 +16019,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { AS != AMDGPUAS::BUFFER_FAT_POINTER) return AtomicExpansionKind::CmpXChg; - // TODO: gfx940 supports v2f16 and v2bf16 if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy())) return AtomicExpansionKind::None; + if (AS == AMDGPUAS::FLAT_ADDRESS) { + // gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty)) + return AtomicExpansionKind::None; + } else { + // gfx90a, gfx940, gfx12 + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty)) + return AtomicExpansionKind::None; + + // gfx940, gfx12 + // FIXME: Need to skip buffer_fat_pointer? + // FIXME: Needs to account for no fine-grained memory + if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty)) + return AtomicExpansionKind::None; + } + if (unsafeFPAtomicsDisabled(RMW->getFunction())) return AtomicExpansionKind::CmpXChg; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll index 4e21ef83793429..fff341b07881b2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll @@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p ; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, ; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> ; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i32 256 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val ; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i32 256 %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll index 1e7cf0e702a03f..d644ff2f8efa40 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll @@ -123,24 +123,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB0_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16: @@ -200,26 +186,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB0_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB0_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -335,23 +305,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB1_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16: @@ -408,24 +365,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB1_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB1_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -595,43 +538,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB2_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16: @@ -730,46 +640,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %v ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB2_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB2_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result @@ -895,42 +769,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB3_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16: @@ -1026,44 +868,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB3_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB3_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void @@ -1186,24 +994,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) ; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB4_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16: @@ -1263,26 +1057,10 @@ define <2 x half> @flat_system_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB4_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB4_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst ret <2 x half> %result @@ -1400,23 +1178,10 @@ define void @flat_system_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v5, v[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB5_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16: @@ -1473,24 +1238,10 @@ define void @flat_system_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB5_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB5_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst ret void @@ -1662,43 +1413,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> % ; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB6_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16: @@ -1797,46 +1515,10 @@ define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB6_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -2003,42 +1685,10 @@ define void @flat_system_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { ; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: flat_load_dword v3, v[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB7_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16: @@ -2134,44 +1784,10 @@ define void @flat_system_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) { ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB7_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst ret void @@ -2825,30 +2441,13 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_offset(ptr %ptr, <2 x half> ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] offset:4092 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:4092 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB10_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset: @@ -2914,26 +2513,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_offset(ptr %ptr, <2 x half> ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:16380 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB10_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 4095 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3062,31 +2645,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_negoffset(ptr %ptr, <2 x hal ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v1, v0 -; GFX940-NEXT: v_pk_add_f16 v0, v1, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:63488 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB11_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset: @@ -3152,26 +2714,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_negoffset(ptr %ptr, <2 x hal ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB11_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3300,28 +2846,13 @@ define void @flat_agent_atomic_fadd_noret_v2f16_offset(ptr %ptr, <2 x half> %val ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] offset:4092 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:4092 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset: @@ -3384,24 +2915,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16_offset(ptr %ptr, <2 x half> %val ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:16380 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:16380 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 4095 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3530,29 +3047,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16_negoffset(ptr %ptr, <2 x half> % ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v5, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 +; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:63488 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset: @@ -3615,24 +3113,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16_negoffset(ptr %ptr, <2 x half> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v4, v[0:1] offset:-2048 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3810,49 +3294,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_offset(ptr %ptr, <2 x bfl ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v4 -; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] offset:4092 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:4092 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset: @@ -3957,46 +3405,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_offset(ptr %ptr, <2 x bfl ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:16380 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:16380 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 4095 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -4174,50 +3586,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_negoffset(ptr %ptr, <2 x ; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, v0 -; GFX940-NEXT: v_mov_b32_e32 v5, v1 -; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v4 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v5, vcc -; GFX940-NEXT: flat_load_dword v0, v[0:1] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v0 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v0, v0, v1 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v2 -; GFX940-NEXT: v_bfe_u32 v6, v0, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v0 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 -; GFX940-NEXT: v_add3_u32 v6, v6, v0, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v3, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v0, v0 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v3, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v0, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v3, v0, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:63488 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset: @@ -4322,46 +3694,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_negoffset(ptr %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN +; GFX12-NEXT: flat_atomic_pk_add_bf16 v0, v[0:1], v2 offset:-2048 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -4539,47 +3875,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16_offset(ptr %ptr, <2 x bfloat> % ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0x3000, v0 -; GFX940-NEXT: s_mov_b64 s[0:1], 0x3ffc -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] offset:4092 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 +; GFX940-NEXT: v_add_co_u32_e32 v0, vcc, 0x3000, v0 +; GFX940-NEXT: s_nop 1 +; GFX940-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:4092 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset: @@ -4681,44 +3983,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16_offset(ptr %ptr, <2 x bfloat> % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:16380 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16380 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:16380 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 4095 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -4896,48 +4164,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16_negoffset(ptr %ptr, <2 x bfloat ; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0 -; GFX940-NEXT: s_movk_i32 s0, 0xf800 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc -; GFX940-NEXT: flat_load_dword v3, v[4:5] -; GFX940-NEXT: s_mov_b32 s1, -1 -; GFX940-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1] -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 +; GFX940-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:63488 ; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset: @@ -5039,44 +4269,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16_negoffset(ptr %ptr, <2 x bfloat ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: flat_load_b32 v3, v[0:1] offset:-2048 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-2048 +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512 %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll index 88ce07c9779d0a..3a2adacf16903c 100644 --- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll @@ -2657,46 +2657,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB12_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB12_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16: @@ -2754,24 +2726,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB12_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB12_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret <2 x half> %result @@ -2825,46 +2783,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %p ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB13_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB13_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_offset: @@ -2922,24 +2852,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB13_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB13_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -2994,46 +2910,18 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) ; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB14_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB14_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset: @@ -3091,24 +2979,10 @@ define <2 x half> @global_agent_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB14_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3161,44 +3035,18 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB15_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB15_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16: @@ -3254,23 +3102,10 @@ define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x ha ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB15_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst ret void @@ -3322,44 +3157,18 @@ define void @global_agent_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB16_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB16_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_offset: @@ -3415,23 +3224,10 @@ define void @global_agent_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB16_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB16_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3484,44 +3280,18 @@ define void @global_agent_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %pt ; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB17_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB17_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset: @@ -3577,23 +3347,10 @@ define void @global_agent_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %pt ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB17_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB17_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst @@ -3648,48 +3405,20 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB18_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB18_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16: @@ -3747,24 +3476,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB18_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB18_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst ret <2 x half> %result @@ -3818,48 +3533,20 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) % ; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB19_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB19_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_offset: @@ -3917,24 +3604,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB19_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB19_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -3989,48 +3662,20 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1 ; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB20_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB20_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset: @@ -4088,24 +3733,10 @@ define <2 x half> @global_system_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB20_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:-1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB20_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -4158,46 +3789,20 @@ define void @global_system_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x h ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB21_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB21_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16: @@ -4253,23 +3858,10 @@ define void @global_system_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x h ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB21_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB21_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst ret void @@ -4321,46 +3913,20 @@ define void @global_system_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_offset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB22_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB22_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_offset: @@ -4416,23 +3982,10 @@ define void @global_system_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB22_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB22_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -4485,46 +4038,20 @@ define void @global_system_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %p ; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: global_load_dword v5, v[0:1], off offset:-1024 -; GFX90A-NEXT: s_mov_b64 s[4:5], 0 -; GFX90A-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX90A-NEXT: buffer_wbl2 -; GFX90A-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc +; GFX90A-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: buffer_invl2 ; GFX90A-NEXT: buffer_wbinvl1_vol -; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX90A-NEXT: v_mov_b32_e32 v5, v3 -; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5] -; GFX90A-NEXT: s_cbranch_execnz .LBB23_1 -; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[0:1], 0 -; GFX940-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_pk_add_f16 v4, v5, v2 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX940-NEXT: v_mov_b32_e32 v5, v3 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1] -; GFX940-NEXT: s_cbranch_execnz .LBB23_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset: @@ -4580,23 +4107,10 @@ define void @global_system_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v4, v[0:1], off offset:-1024 -; GFX12-NEXT: s_mov_b32 s0, 0 -; GFX12-NEXT: .LBB23_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v3, v4, v2 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:-1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4 -; GFX12-NEXT: v_mov_b32_e32 v4, v3 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0 -; GFX12-NEXT: s_cbranch_execnz .LBB23_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst @@ -4727,43 +4241,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB24_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16: @@ -4856,40 +4337,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB24_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB24_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret <2 x bfloat> %result @@ -5019,43 +4470,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB25_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset: @@ -5148,40 +4566,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB25_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB25_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -5312,43 +4700,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace ; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-1024 sc0 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB26_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset: @@ -5441,40 +4796,10 @@ define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB26_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB26_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -5602,42 +4927,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB27_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16: @@ -5728,39 +5021,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x b ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB27_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB27_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst ret void @@ -5887,42 +5151,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB28_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset: @@ -6013,39 +5245,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB28_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB28_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -6173,42 +5376,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %p ; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-1024 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB29_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset: @@ -6299,39 +5470,10 @@ define void @global_agent_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %p ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB29_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_DEV -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB29_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst @@ -6464,43 +5606,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB30_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16: @@ -6593,40 +5702,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB30_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB30_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst ret <2 x bfloat> %result @@ -6758,43 +5837,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1 ; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB31_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_offset: @@ -6887,40 +5933,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1 ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB31_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB31_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst @@ -7053,43 +6069,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_negoffset(ptr addrspac ; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v7, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v7 -; GFX940-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX940-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX940-NEXT: v_bfe_u32 v6, v3, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v5, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v3 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX940-NEXT: v_add3_u32 v6, v6, v3, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v5, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v5, v5 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v3, v3 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v5, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v3, v6, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v6, v5, v3, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-1024 sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v7 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB32_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v0, v3 ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset: @@ -7182,40 +6165,10 @@ define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_negoffset(ptr addrspac ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB32_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mov_b32_e32 v6, v3 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v6 -; GFX12-NEXT: v_add_f32_e32 v5, v5, v2 -; GFX12-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX12-NEXT: v_bfe_u32 v8, v5, 16, 1 -; GFX12-NEXT: v_add_f32_e32 v3, v3, v4 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v5 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v5, v5 -; GFX12-NEXT: v_add3_u32 v8, v8, v5, 0x7fff -; GFX12-NEXT: v_bfe_u32 v7, v3, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v3 -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v3, v3 -; GFX12-NEXT: v_cndmask_b32_e32 v5, v8, v10, vcc_lo -; GFX12-NEXT: v_add3_u32 v7, v7, v3, 0x7fff -; GFX12-NEXT: v_cndmask_b32_e64 v3, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v5, v5, v3, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN +; GFX12-NEXT: global_atomic_pk_add_bf16 v0, v[0:1], v2, off offset:-1024 th:TH_ATOMIC_RETURN ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB32_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: v_mov_b32_e32 v0, v3 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst @@ -7345,42 +6298,10 @@ define void @global_system_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x ; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB33_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16: @@ -7471,39 +6392,10 @@ define void @global_system_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB33_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB33_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst ret void @@ -7632,42 +6524,10 @@ define void @global_system_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr ; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:1024 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB34_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_offset: @@ -7758,39 +6618,10 @@ define void @global_system_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB34_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB34_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst @@ -7920,42 +6751,10 @@ define void @global_system_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) % ; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:-1024 -; GFX940-NEXT: s_mov_b64 s[2:3], 0 -; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX940-NEXT: s_movk_i32 s4, 0x7fff -; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX940-NEXT: s_mov_b32 s5, 0x7060302 -; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX940-NEXT: s_waitcnt vmcnt(0) -; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX940-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX940-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX940-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX940-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX940-NEXT: v_bfe_u32 v9, v6, 16, 1 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v2 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX940-NEXT: v_add3_u32 v7, v7, v2, s4 -; GFX940-NEXT: v_add3_u32 v9, v9, v6, s4 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v6, v6 -; GFX940-NEXT: v_cmp_u_f32_e64 s[0:1], v2, v2 -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_cndmask_b32_e32 v6, v9, v10, vcc -; GFX940-NEXT: v_cndmask_b32_e64 v2, v7, v8, s[0:1] -; GFX940-NEXT: v_perm_b32 v2, v6, v2, s5 ; GFX940-NEXT: buffer_wbl2 sc0 sc1 -; GFX940-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0 sc1 +; GFX940-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-1024 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: buffer_inv sc0 sc1 -; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3 -; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3] -; GFX940-NEXT: v_mov_b32_e32 v3, v2 -; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3] -; GFX940-NEXT: s_cbranch_execnz .LBB35_1 -; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX940-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX940-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset: @@ -8046,39 +6845,10 @@ define void @global_system_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) % ; GFX12-NEXT: s_wait_samplecnt 0x0 ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v3, v[0:1], off offset:-1024 -; GFX12-NEXT: v_lshlrev_b32_e32 v4, 16, v2 -; GFX12-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 -; GFX12-NEXT: s_mov_b32 s1, 0 -; GFX12-NEXT: .LBB35_1: ; %atomicrmw.start -; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX12-NEXT: v_and_b32_e32 v6, 0xffff0000, v3 -; GFX12-NEXT: v_add_f32_e32 v2, v2, v4 -; GFX12-NEXT: v_add_f32_e32 v6, v6, v5 -; GFX12-NEXT: v_bfe_u32 v7, v2, 16, 1 -; GFX12-NEXT: v_bfe_u32 v8, v6, 16, 1 -; GFX12-NEXT: v_or_b32_e32 v9, 0x400000, v2 -; GFX12-NEXT: v_or_b32_e32 v10, 0x400000, v6 -; GFX12-NEXT: v_cmp_u_f32_e32 vcc_lo, v6, v6 -; GFX12-NEXT: v_add3_u32 v7, v7, v2, 0x7fff -; GFX12-NEXT: v_add3_u32 v8, v8, v6, 0x7fff -; GFX12-NEXT: v_cmp_u_f32_e64 s0, v2, v2 -; GFX12-NEXT: v_cndmask_b32_e32 v6, v8, v10, vcc_lo -; GFX12-NEXT: v_cndmask_b32_e64 v2, v7, v9, s0 -; GFX12-NEXT: v_perm_b32 v2, v6, v2, 0x7060302 ; GFX12-NEXT: s_wait_storecnt 0x0 -; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN -; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: global_atomic_pk_add_bf16 v[0:1], v2, off offset:-1024 +; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: global_inv scope:SCOPE_SYS -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3 -; GFX12-NEXT: v_mov_b32_e32 v3, v2 -; GFX12-NEXT: s_or_b32 s1, vcc_lo, s1 -; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s1 -; GFX12-NEXT: s_cbranch_execnz .LBB35_1 -; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end -; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX12-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256 %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll index 3868f7bbc037a3..533f0f2691045b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll @@ -4418,99 +4418,15 @@ define double @test_atomicrmw_fadd_f64_region_ret(ptr addrspace(2) %ptr, double } define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent(ptr %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4518,15 +4434,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; CI: atomicrmw.end: ; CI-NEXT: ret <2 x half> [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4534,15 +4450,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret <2 x half> [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4550,15 +4466,15 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret <2 x half> [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4566,19 +4482,19 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret <2 x half> [[TMP5]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret <2 x half> [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4586,20 +4502,20 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret <2 x half> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst ret <2 x half> %res } -define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret(ptr %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: ; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4607,15 +4523,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; CI: atomicrmw.end: ; CI-NEXT: ret void ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: ; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4623,15 +4539,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret void ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: ; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4639,15 +4555,15 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4655,19 +4571,19 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret void ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> @@ -4675,225 +4591,1339 @@ define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, < ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void -; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst ret void } -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: -; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; CI-NEXT: ret <2 x half> [[TMP5]] ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: -; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX9-NEXT: ret <2 x half> [[TMP5]] ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX908: atomicrmw.end: -; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX908-NEXT: ret <2 x half> [[TMP5]] ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 -; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] -; GFX90A: atomicrmw.start: -; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; GFX90A: atomicrmw.end: -; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 -; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: -; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX11: atomicrmw.end: -; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX11-NEXT: ret <2 x half> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res } -define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) { -; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +define void @test_atomicrmw_fadd_v2f16_flat_global_noret(ptr addrspace(1) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; CI-NEXT: br label [[ATOMICRMW_START:%.*]] ; CI: atomicrmw.start: -; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; CI: atomicrmw.end: ; CI-NEXT: ret void ; -; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX9: atomicrmw.start: -; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> ; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] ; GFX9: atomicrmw.end: ; GFX9-NEXT: ret void ; -; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 ; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX908: atomicrmw.start: -; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_global_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_local_agent(ptr addrspace(3) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_local_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_local_noret(ptr addrspace(3) %ptr, <2 x half> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_local_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent(ptr %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret(ptr %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_global_noret(ptr addrspace(1) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_global_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_local_agent(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_local_agent( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, <2 x bfloat> %value) { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x half> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret <2 x half> %res +} + +define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void +; + %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst + ret void +} + +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + ret <2 x bfloat> %res +} + +define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4901,15 +5931,15 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX908: atomicrmw.end: ; GFX908-NEXT: ret void ; -; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 ; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX90A: atomicrmw.start: ; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4917,19 +5947,19 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX90A: atomicrmw.end: ; GFX90A-NEXT: ret void ; -; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(3) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 ; GFX940-NEXT: ret void ; -; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_local_noret( -; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(3) [[PTR:%.*]], align 4 +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 ; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] ; GFX11: atomicrmw.start: ; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] ; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] ; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 ; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 ; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 ; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 ; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> @@ -4937,173 +5967,183 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr, ; GFX11: atomicrmw.end: ; GFX11-NEXT: ret void ; - %res = atomicrmw fadd ptr addrspace(3) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret void } -define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] -; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x half> [[TMP5]] +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret <2 x half> %res -} - -define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] ; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret <2 x bfloat> %res -} - -define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; - %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst - ret void -} - -define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret <2 x bfloat> %res } define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 { -; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( -; ALL-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 -; ALL-NEXT: br label [[ATOMICRMW_START:%.*]] -; ALL: atomicrmw.start: -; ALL-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; ALL-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] -; ALL-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; ALL-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; ALL-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; ALL-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; ALL-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; ALL-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; ALL: atomicrmw.end: -; ALL-NEXT: ret void +; CI-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; CI-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; CI-NEXT: br label [[ATOMICRMW_START:%.*]] +; CI: atomicrmw.start: +; CI-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; CI-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; CI-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; CI-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; CI-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; CI-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; CI-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; CI-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; CI-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; CI: atomicrmw.end: +; CI-NEXT: ret void +; +; GFX9-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX9-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX9-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX9: atomicrmw.start: +; GFX9-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX9-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX9-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX9-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX9-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX9-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX9-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX9-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX9-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX9: atomicrmw.end: +; GFX9-NEXT: ret void +; +; GFX908-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret void +; +; GFX90A-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret void +; +; GFX940-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret void +; +; GFX11-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe( +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret void ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret void diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll index 4f053af7aed932..26b84f82524e26 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-agent.ll @@ -41,264 +41,1404 @@ ;--------------------------------------------------------------------- define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res @@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__amdgpu_ignore_deno define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent(ptr addrspace(1) %ptr, <2 x bfloat> %value) { ; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll index 0339e885ca7700..d01cfbe035f64c 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2bf16-system.ll @@ -41,264 +41,1404 @@ ;--------------------------------------------------------------------- define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4 +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2bf16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x bfloat> %value) #0 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x bfloat> %value) #1 { -; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX803-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX906-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX10-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX11-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX12-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x bfloat> %res @@ -310,7 +1450,7 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_system__amdgpu_ignore_den define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system(ptr addrspace(1) %ptr, <2 x bfloat> %value) { ; COMMON-LABEL: define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1988,8 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll index 0fc6f1134ff406..3b0b7c9209588b 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-agent.ll @@ -41,264 +41,1260 @@ ;--------------------------------------------------------------------- define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res @@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__amdgpu_ignore_denorma define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent(ptr addrspace(1) %ptr, <2 x half> %value) { ; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_agent( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll index 20f39b5ed0c860..3478f23e0b200d 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-v2f16-system.ll @@ -41,264 +41,1260 @@ ;--------------------------------------------------------------------- define <2 x half> @test_atomicrmw_fadd_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4 +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory___denormal_fp_mode_v2f16_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2:[0-9]+]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, <2 x half> %value) { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz(ptr addrspace(1) %ptr, <2 x half> %value) #0 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_daz( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR1]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res } define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic(ptr addrspace(1) %ptr, <2 x half> %value) #1 { -; COMMON-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { -; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] -; COMMON: atomicrmw.start: -; COMMON-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; COMMON-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; COMMON-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; COMMON-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 -; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; COMMON-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; COMMON-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; COMMON: atomicrmw.end: -; COMMON-NEXT: ret <2 x half> [[TMP5]] +; GFX803-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX803-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX803-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX803-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX803: atomicrmw.start: +; GFX803-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX803-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX803-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX803-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX803-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX803-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX803-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX803-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX803-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX803: atomicrmw.end: +; GFX803-NEXT: ret <2 x half> [[TMP5]] +; +; GFX906-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX906-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX906-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX906-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX906: atomicrmw.start: +; GFX906-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX906-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX906-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX906-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX906-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX906-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX906-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX906-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX906-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX906: atomicrmw.end: +; GFX906-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX908-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX908-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX908: atomicrmw.start: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX908: atomicrmw.end: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX10-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX10-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX10-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX10-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX10: atomicrmw.start: +; GFX10-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX10-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX10-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX10-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX10-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX10-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX10-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX10-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX10-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX10: atomicrmw.end: +; GFX10-NEXT: ret <2 x half> [[TMP5]] +; +; GFX11-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX11-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX11-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX11-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX11: atomicrmw.start: +; GFX11-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX11-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX11-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX11-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX11-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4 +; GFX11-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX11-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX11-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX11-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX11: atomicrmw.end: +; GFX11-NEXT: ret <2 x half> [[TMP5]] +; +; GFX12-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory__denormal_mode_dynamic( +; GFX12-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR2]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0, !amdgpu.ignore.denormal.mode !0 ret <2 x half> %res @@ -310,7 +1306,7 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_global_system__amdgpu_ignore_denorm define <2 x half> @test_atomicrmw_fsub_v2f16_global_system(ptr addrspace(1) %ptr, <2 x half> %value) { ; COMMON-LABEL: define <2 x half> @test_atomicrmw_fsub_v2f16_global_system( -; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] { ; COMMON-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 ; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]] ; COMMON: atomicrmw.start: @@ -848,12 +1844,10 @@ attributes #0 = { "denormal-fp-mode"="preserve-sign,preserve-sign" } attributes #1 = { "denormal-fp-mode"="dynamic,dynamic" } !0 = !{} -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} -; GFX11: {{.*}} -; GFX12: {{.*}} -; GFX803: {{.*}} -; GFX906: {{.*}} -; GFX908: {{.*}} -; GFX90A: {{.*}} -; GFX940: {{.*}} +;. +; GFX90A: [[META0]] = !{} +;. +; GFX940: [[META0]] = !{} +;. +; GFX12: [[META0]] = !{} +;. diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll index 1411890e01dc59..4556ad60ccf8ca 100644 --- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-fp-vector.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck %s -; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx900 %s | FileCheck -check-prefixes=CHECK,GFX900 %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx90a %s | FileCheck -check-prefixes=CHECK,GFX90A %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -S -passes=atomic-expand -mcpu=gfx940 %s | FileCheck -check-prefixes=CHECK,GFX940 %s ;--------------------------------------------------------------------- ; atomicrmw fadd @@ -70,44 +70,76 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align2(ptr addrspac } define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4(ptr addrspace(1) %ptr, <2 x half> %value) { -; CHECK-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: ret <2 x half> [[TMP5]] +; GFX900-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR]], align 4 +; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX900: atomicrmw.start: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX900: atomicrmw.end: +; GFX900-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX90A-NEXT: ret <2 x half> [[RES]] +; +; GFX940-LABEL: define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent_align4( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x half> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst, align 4 ret <2 x half> %res } define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4(ptr addrspace(1) %ptr, <2 x bfloat> %value) { -; CHECK-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( -; CHECK-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 -; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]] -; CHECK: atomicrmw.start: -; CHECK-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] -; CHECK-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] -; CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 -; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 -; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 -; CHECK-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 -; CHECK-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> -; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] -; CHECK: atomicrmw.end: -; CHECK-NEXT: ret <2 x bfloat> [[TMP5]] +; GFX900-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX900-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX900-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX900: atomicrmw.start: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX900-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX900: atomicrmw.end: +; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX90A-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR]], align 4 +; GFX90A-NEXT: br label [[ATOMICRMW_START:%.*]] +; GFX90A: atomicrmw.start: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]] +; GFX90A: atomicrmw.end: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent_align4( +; GFX940-SAME: ptr addrspace(1) [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4 +; GFX940-NEXT: ret <2 x bfloat> [[RES]] ; %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, align 4 ret <2 x bfloat> %res