Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clang/AMDGPU: Emit atomicrmw from ds_fadd builtins #95395

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 71 additions & 42 deletions clang/lib/CodeGen/CGBuiltin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18140,9 +18140,35 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
break;
}

// Some of the atomic builtins take the scope as a string name.
StringRef scp;
llvm::getConstantStringInfo(Scope, scp);
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
if (llvm::getConstantStringInfo(Scope, scp)) {
SSID = getLLVMContext().getOrInsertSyncScopeID(scp);
return;
}

// Older builtins had an enum argument for the memory scope.
int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
switch (scope) {
case 0: // __MEMORY_SCOPE_SYSTEM
SSID = llvm::SyncScope::System;
break;
case 1: // __MEMORY_SCOPE_DEVICE
SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
break;
case 2: // __MEMORY_SCOPE_WRKGRP
SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
break;
case 3: // __MEMORY_SCOPE_WVFRNT
SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
break;
case 4: // __MEMORY_SCOPE_SINGLE
SSID = llvm::SyncScope::SingleThread;
break;
default:
SSID = llvm::SyncScope::System;
break;
}
}

llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
Expand Down Expand Up @@ -18558,14 +18584,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Function *F = CGM.getIntrinsic(Intrin, { Src0->getType() });
return Builder.CreateCall(F, { Src0, Builder.getFalse() });
}
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
case AMDGPU::BI__builtin_amdgcn_ds_fmaxf: {
Intrinsic::ID Intrin;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
Intrin = Intrinsic::amdgcn_ds_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_fminf:
Intrin = Intrinsic::amdgcn_ds_fmin;
break;
Expand Down Expand Up @@ -18656,35 +18678,6 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::Function *F = CGM.getIntrinsic(IID, {Addr->getType()});
return Builder.CreateCall(F, {Addr, Val});
}
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16: {
Intrinsic::ID IID;
llvm::Type *ArgTy;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
ArgTy = llvm::Type::getFloatTy(getLLVMContext());
IID = Intrinsic::amdgcn_ds_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
ArgTy = llvm::Type::getDoubleTy(getLLVMContext());
IID = Intrinsic::amdgcn_ds_fadd;
break;
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getHalfTy(getLLVMContext()), 2);
IID = Intrinsic::amdgcn_ds_fadd;
break;
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
llvm::Value *Val = EmitScalarExpr(E->getArg(1));
llvm::Constant *ZeroI32 = llvm::ConstantInt::getIntegerValue(
llvm::Type::getInt32Ty(getLLVMContext()), APInt(32, 0, true));
llvm::Constant *ZeroI1 = llvm::ConstantInt::getIntegerValue(
llvm::Type::getInt1Ty(getLLVMContext()), APInt(1, 0));
llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1});
}
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
Expand Down Expand Up @@ -19044,7 +19037,12 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
case AMDGPU::BI__builtin_amdgcn_atomic_inc64:
case AMDGPU::BI__builtin_amdgcn_atomic_dec32:
case AMDGPU::BI__builtin_amdgcn_atomic_dec64: {
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16: {
llvm::AtomicRMWInst::BinOp BinOp;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_atomic_inc32:
Expand All @@ -19055,23 +19053,54 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_atomic_dec64:
BinOp = llvm::AtomicRMWInst::UDecWrap;
break;
case AMDGPU::BI__builtin_amdgcn_ds_faddf:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f64:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2f16:
case AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16:
BinOp = llvm::AtomicRMWInst::FAdd;
break;
}

Address Ptr = CheckAtomicAlignment(*this, E);
Value *Val = EmitScalarExpr(E->getArg(1));
llvm::Type *OrigTy = Val->getType();
QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();

ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
bool Volatile;

QualType PtrTy = E->getArg(0)->IgnoreImpCasts()->getType();
bool Volatile =
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_faddf) {
// __builtin_amdgcn_ds_faddf has an explicit volatile argument
Volatile =
cast<ConstantInt>(EmitScalarExpr(E->getArg(4)))->getZExtValue();
} else {
// Infer volatile from the passed type.
Volatile =
PtrTy->castAs<PointerType>()->getPointeeType().isVolatileQualified();
}

if (E->getNumArgs() >= 4) {
// Some of the builtins have explicit ordering and scope arguments.
ProcessOrderScopeAMDGCN(EmitScalarExpr(E->getArg(2)),
EmitScalarExpr(E->getArg(3)), AO, SSID);
} else {
// The ds_fadd_* builtins do not have syncscope/order arguments.
SSID = llvm::SyncScope::System;
AO = AtomicOrdering::SequentiallyConsistent;

// The v2bf16 builtin uses i16 instead of a natural bfloat type.
if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
llvm::Type *V2BF16Ty = FixedVectorType::get(
llvm::Type::getBFloatTy(Builder.getContext()), 2);
Val = Builder.CreateBitCast(Val, V2BF16Ty);
}
}

llvm::AtomicRMWInst *RMW =
Builder.CreateAtomicRMW(BinOp, Ptr, Val, AO, SSID);
if (Volatile)
RMW->setVolatile(true);
return RMW;
return Builder.CreateBitCast(RMW, OrigTy);
}
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtn:
case AMDGPU::BI__builtin_amdgcn_s_sendmsg_rtnl: {
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/builtins-amdgcn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr
// CHECK-NEXT: store float [[SRC:%.*]], ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down
2 changes: 1 addition & 1 deletion clang/test/CodeGenCUDA/builtins-spirv-amdgcn.cu
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ __global__
// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr [[X]] to ptr addrspace(4)
// CHECK-NEXT: store float [[SRC:%.*]], ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP0:%.*]] = load float, ptr addrspace(4) [[SRC_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP1:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP1:%.*]] = atomicrmw fadd ptr addrspace(3) @_ZZ12test_ds_faddfE6shared, float [[TMP0]] monotonic, align 4
// CHECK-NEXT: store volatile float [[TMP1]], ptr addrspace(4) [[X_ASCAST]], align 4
// CHECK-NEXT: ret void
//
Expand Down
5 changes: 3 additions & 2 deletions clang/test/CodeGenCUDA/builtins-unsafe-atomics-gfx90a.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ typedef __attribute__((address_space(3))) float *LP;
// CHECK: store ptr %addr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
// CHECK: %[[ADDR_ADDR_ASCAST:.*]] = load ptr, ptr %[[ADDR_ADDR_ASCAST_PTR]], align 8
// CHECK: %[[AS_CAST:.*]] = addrspacecast ptr %[[ADDR_ADDR_ASCAST]] to ptr addrspace(3)
// CHECK: %3 = call contract float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %[[AS_CAST]]
// CHECK: [[TMP2:%.+]] = load float, ptr %val.addr.ascast, align 4
// CHECK: [[TMP3:%.+]] = atomicrmw fadd ptr addrspace(3) %[[AS_CAST]], float [[TMP2]] monotonic, align 4
// CHECK: %4 = load ptr, ptr %rtn.ascast, align 8
// CHECK: store float %3, ptr %4, align 4
// CHECK: store float [[TMP3]], ptr %4, align 4
__device__ void test_ds_atomic_add_f32(float *addr, float val) {
float *rtn;
*rtn = __builtin_amdgcn_ds_faddf((LP)addr, val, 0, 0, 0);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ typedef __attribute__((address_space(3))) float *LP;
// CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[ADDR_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(4) [[TMP0]] to ptr addrspace(3)
// CHECK-NEXT: [[TMP2:%.*]] = load float, ptr addrspace(4) [[VAL_ADDR_ASCAST]], align 4
// CHECK-NEXT: [[TMP3:%.*]] = call contract addrspace(4) float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) [[TMP1]], float [[TMP2]], i32 0, i32 0, i1 false)
// CHECK-NEXT: [[TMP3:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[TMP2]] monotonic, align 4
// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(4), ptr addrspace(4) [[RTN_ASCAST]], align 8
// CHECK-NEXT: store float [[TMP3]], ptr addrspace(4) [[TMP4]], align 4
// CHECK-NEXT: ret void
Expand Down
37 changes: 34 additions & 3 deletions clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,44 @@ void test_update_dpp(global int* out, int arg1, int arg2)
}

// CHECK-LABEL: @test_ds_fadd
// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %out, float %src, i32 0, i32 0, i1 false)
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
// CHECK: atomicrmw volatile fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}

// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acquire, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src release, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src acq_rel, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}}

// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}}
// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}}
#if !defined(__SPIRV__)
void test_ds_faddf(local float *out, float src) {
#else
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
void test_ds_faddf(__attribute__((address_space(3))) float *out, float src) {
#endif
*out = __builtin_amdgcn_ds_faddf(out, src, 0, 0, false);

*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM, true);

// Test all orders.
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_CONSUME, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELEASE, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_ACQ_REL, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_SEQ_CST, __MEMORY_SCOPE_SYSTEM, false); // invalid

// Test all syncscopes.
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false);
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid
}

// CHECK-LABEL: @test_ds_fmin
Expand Down
14 changes: 10 additions & 4 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,37 @@ typedef half __attribute__((ext_vector_type(2))) half2;
typedef short __attribute__((ext_vector_type(2))) short2;

// CHECK-LABEL: test_local_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_local_add_2bf16
// GFX12: ds_pk_add_rtn_bf16
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_local_add_2bf16_noret
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %
// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX12-LABEL: test_local_add_2bf16_noret
// GFX12: ds_pk_add_bf16
void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
__builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX12-LABEL: test_local_add_2f16
// GFX12: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16_noret
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX12-LABEL: test_local_add_2f16_noret
// GFX12: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
Expand Down
9 changes: 8 additions & 1 deletion clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,18 @@
// REQUIRES: amdgpu-registered-target

// CHECK-LABEL: test_fadd_local
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}}, i32 0, i32 0, i1 false)
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// GFX8-LABEL: test_fadd_local$local:
// GFX8: ds_add_rtn_f32 v2, v0, v1
// GFX8: s_endpgm
kernel void test_fadd_local(__local float *ptr, float val){
float *res;
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
}

// CHECK-LABEL: test_fadd_local_volatile
// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
volatile float *res;
*res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
}
4 changes: 2 additions & 2 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
}

// CHECK-LABEL: test_ds_add_local_f64
// CHECK: call double @llvm.amdgcn.ds.fadd.f64(ptr addrspace(3) %{{.*}}, double %{{.*}},
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
// GFX90A: test_ds_add_local_f64$local
// GFX90A: ds_add_rtn_f64
void test_ds_add_local_f64(__local double *addr, double x){
Expand All @@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
}

// CHECK-LABEL: test_ds_addf_local_f32
// CHECK: call float @llvm.amdgcn.ds.fadd.f32(ptr addrspace(3) %{{.*}}, float %{{.*}},
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
// GFX90A-LABEL: test_ds_addf_local_f32$local
// GFX90A: ds_add_rtn_f32
void test_ds_addf_local_f32(__local float *addr, float x){
Expand Down
10 changes: 7 additions & 3 deletions clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
Original file line number Diff line number Diff line change
Expand Up @@ -42,23 +42,27 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
}

// CHECK-LABEL: test_local_add_2bf16
// CHECK: call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %{{.*}}, <2 x i16> %

// CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] seq_cst, align 4
// CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>

// GFX940-LABEL: test_local_add_2bf16
// GFX940: ds_pk_add_rtn_bf16
short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX940-LABEL: test_local_add_2f16
// GFX940: ds_pk_add_rtn_f16
half2 test_local_add_2f16(__local half2 *addr, half2 x) {
return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
}

// CHECK-LABEL: test_local_add_2f16_noret
// CHECK: call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %{{.*}}, <2 x half> %
// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
// GFX940-LABEL: test_local_add_2f16_noret
// GFX940: ds_pk_add_f16
void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
Expand Down
3 changes: 1 addition & 2 deletions llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -2933,8 +2933,7 @@ def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn<llvm_v2i16_ty>;
def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic<
[llvm_v2i16_ty],
[LLVMQualPointerType<3>, llvm_v2i16_ty],
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>,
ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">;
[IntrArgMemOnly, NoCapture<ArgIndex<0>>]>;

defset list<Intrinsic> AMDGPUMFMAIntrinsics940 = {
def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic<llvm_v4i32_ty, llvm_i64_ty>;
Expand Down
Loading