Skip to content

Commit

Permalink
AMDGPU: Handle legal v2f16/v2bf16 atomicrmw fadd for global/flat
Browse files Browse the repository at this point in the history
Unlike the existing fadd cases, choose to ignore the requirement for
amdgpu-unsafe-fp-atomics in case of fine-grained memory access. This
is to minimize migration pain to the new atomic control metadata. This
should not break any users, as the atomic intrinsics are still
directly consumed, and clang does not yet produce vector FP atomicrmw.
  • Loading branch information
arsenm committed Jun 13, 2024
1 parent 86c3c5a commit 8eb6914
Show file tree
Hide file tree
Showing 12 changed files with 6,787 additions and 3,505 deletions.
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1659,6 +1659,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
});
}

if (ST.hasAtomicBufferGlobalPkAddF16Insts())
Atomic.legalFor({{V2F16, GlobalPtr}});
if (ST.hasAtomicGlobalPkAddBF16Inst())
Atomic.legalFor({{V2BF16, GlobalPtr}});
if (ST.hasAtomicFlatPkAdd16Insts())
Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});

// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
Expand Down
6 changes: 5 additions & 1 deletion llvm/lib/Target/AMDGPU/FLATInstructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -1645,6 +1645,7 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
}

let OtherPredicates = [HasBufferFlatGlobalAtomicsF64] in {
Expand All @@ -1669,13 +1670,16 @@ defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat
}

let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
// FIXME: These do not have signed offsets
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
}

let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;

defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10

let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
Expand Down
29 changes: 28 additions & 1 deletion llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15941,6 +15941,16 @@ static bool isHalf2OrBFloat2(Type *Ty) {
return false;
}

static bool isHalf2(Type *Ty) {
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
}

static bool isBFloat2(Type *Ty) {
FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty);
return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
}

TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
unsigned AS = RMW->getPointerAddressSpace();
Expand Down Expand Up @@ -16009,10 +16019,27 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
AS != AMDGPUAS::BUFFER_FAT_POINTER)
return AtomicExpansionKind::CmpXChg;

// TODO: gfx940 supports v2f16 and v2bf16
if (Subtarget->hasGFX940Insts() && (Ty->isFloatTy() || Ty->isDoubleTy()))
return AtomicExpansionKind::None;

if (AS == AMDGPUAS::FLAT_ADDRESS) {
// gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicFlatPkAdd16Insts() && isHalf2OrBFloat2(Ty))
return AtomicExpansionKind::None;
} else {
// gfx90a, gfx940, gfx12
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isHalf2(Ty))
return AtomicExpansionKind::None;

// gfx940, gfx12
// FIXME: Need to skip buffer_fat_pointer?
// FIXME: Needs to account for no fine-grained memory
if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isBFloat2(Ty))
return AtomicExpansionKind::None;
}

if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;

Expand Down
62 changes: 4 additions & 58 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
Original file line number Diff line number Diff line change
Expand Up @@ -237,24 +237,10 @@ define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %p
; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v3, v[0:1], off offset:1024
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB17_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
; GFX940-NEXT: global_atomic_pk_add_f16 v0, v[0:1], v2, off offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB17_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
Expand All @@ -265,23 +251,10 @@ define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr,
; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: global_load_dword v5, v[0:1], off offset:1024
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
; GFX940-NEXT: global_atomic_pk_add_f16 v[0:1], v2, off offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB18_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
%unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
Expand All @@ -292,24 +265,10 @@ define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half>
; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v3, v[0:1] offset:1024
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB19_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
; GFX940-NEXT: flat_atomic_pk_add_f16 v0, v[0:1], v2 offset:1024 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB19_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
Expand All @@ -320,23 +279,10 @@ define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val
; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: flat_load_dword v5, v[0:1] offset:1024
; GFX940-NEXT: s_mov_b64 s[0:1], 0
; GFX940-NEXT: .LBB20_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_pk_add_f16 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
; GFX940-NEXT: flat_atomic_pk_add_f16 v[0:1], v2 offset:1024
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX940-NEXT: v_mov_b32_e32 v5, v3
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB20_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr <2 x half>, ptr %ptr, i32 256
%unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
Expand Down
Loading

0 comments on commit 8eb6914

Please sign in to comment.