[RFC] IR: Support atomicrmw FP ops with vector types (#86796)

Allow using atomicrmw fadd, fsub, fmin, and fmax with fixed-length vectors of floating-point type; scalable vector operands are rejected. AMDGPU supports atomic fadd for <2 x half> and <2 x bfloat> on some targets and address spaces. Note that this only covers the proper floating-point operations; xchg with a floating-point vector type is still not supported. cmpxchg still only supports integers, so this inserts bitcasts for the cmpxchg loop expansion. I have separately implemented support for xchg on FP vector types and on vectors of int/ptr, but I don't have an immediate need for those beyond feature consistency.
llvm · Apr 6, 2024 · 4cb110a · 4cb110a
1 parent bd589f5
commit 4cb110a
Show file tree

Hide file tree

Showing 11 changed files with 1,510 additions and 11 deletions.
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -11112,11 +11112,12 @@ For most of these operations, the type of '<value>' must be an integer
 type whose bit width is a power of two greater than or equal to eight
 and less than or equal to a target-specific size limit. For xchg, this
 may also be a floating point or a pointer type with the same size constraints
-as integers.  For fadd/fsub/fmax/fmin, this must be a floating point type.  The
-type of the '``<pointer>``' operand must be a pointer to that type. If
-the ``atomicrmw`` is marked as ``volatile``, then the optimizer is not
-allowed to modify the number or order of execution of this
-``atomicrmw`` with other :ref:`volatile operations <volatile>`.
+as integers.  For fadd/fsub/fmax/fmin, this must be a floating-point
+or fixed vector of floating-point type.  The type of the '``<pointer>``'
+operand must be a pointer to that type. If the ``atomicrmw`` is marked
+as ``volatile``, then the optimizer is not allowed to modify the
+number or order of execution of this ``atomicrmw`` with other
+:ref:`volatile operations <volatile>`.
 
 Note: if the alignment is not greater or equal to the size of the `<value>`
 type, the atomic operation is likely to require a lock and have poor

diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
@@ -8240,6 +8240,8 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
     return tokError("atomicrmw cannot be unordered");
   if (!Ptr->getType()->isPointerTy())
     return error(PtrLoc, "atomicrmw operand must be a pointer");
+  if (Val->getType()->isScalableTy())
+    return error(ValLoc, "atomicrmw operand may not be scalable");
 
   if (Operation == AtomicRMWInst::Xchg) {
     if (!Val->getType()->isIntegerTy() &&
@@ -8251,7 +8253,7 @@ int LLParser::parseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
               " operand must be an integer, floating point, or pointer type");
     }
   } else if (IsFP) {
-    if (!Val->getType()->isFloatingPointTy()) {
+    if (!Val->getType()->isFPOrFPVectorTy()) {
       return error(ValLoc, "atomicrmw " +
                                AtomicRMWInst::getOperationName(Operation) +
                                " operand must be a floating point type");

diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -562,9 +562,9 @@ static void createCmpXchgInstFun(IRBuilderBase &Builder, Value *Addr,
                                  Value *&Success, Value *&NewLoaded) {
   Type *OrigTy = NewVal->getType();
 
-  // This code can go away when cmpxchg supports FP types.
+  // This code can go away when cmpxchg supports FP and vector types.
   assert(!OrigTy->isPointerTy());
-  bool NeedBitcast = OrigTy->isFloatingPointTy();
+  bool NeedBitcast = OrigTy->isFloatingPointTy() || OrigTy->isVectorTy();
   if (NeedBitcast) {
     IntegerType *IntTy = Builder.getIntNTy(OrigTy->getPrimitiveSizeInBits());
     NewVal = Builder.CreateBitCast(NewVal, IntTy);
@@ -731,7 +731,7 @@ static PartwordMaskValues createMaskInstrs(IRBuilderBase &Builder,
   unsigned ValueSize = DL.getTypeStoreSize(ValueType);
 
   PMV.ValueType = PMV.IntValueType = ValueType;
-  if (PMV.ValueType->isFloatingPointTy())
+  if (PMV.ValueType->isFloatingPointTy() || PMV.ValueType->isVectorTy())
     PMV.IntValueType =
         Type::getIntNTy(Ctx, ValueType->getPrimitiveSizeInBits());
 

diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
@@ -4268,9 +4268,10 @@ void Verifier::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
               " operand must have integer or floating point type!",
           &RMWI, ElTy);
   } else if (AtomicRMWInst::isFPOperation(Op)) {
-    Check(ElTy->isFloatingPointTy(),
+    Check(ElTy->isFPOrFPVectorTy() && !isa<ScalableVectorType>(ElTy),
           "atomicrmw " + AtomicRMWInst::getOperationName(Op) +
-              " operand must have floating point type!",
+              " operand must have floating-point or fixed vector of floating-point "
+              "type!",
           &RMWI, ElTy);
   } else {
     Check(ElTy->isIntegerTy(),

diff --git a/llvm/test/Assembler/atomic.ll b/llvm/test/Assembler/atomic.ll
@@ -72,3 +72,19 @@ define void @fp_atomics(ptr %x) {
 
   ret void
 }
+
+define void @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+  ; CHECK: %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+  %atomic.fadd = atomicrmw fadd ptr %x, <2 x half> %val seq_cst
+
+  ; CHECK: %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+  %atomic.fsub = atomicrmw fsub ptr %x, <2 x half> %val seq_cst
+
+  ; CHECK: %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+  %atomic.fmax = atomicrmw fmax ptr %x, <2 x half> %val seq_cst
+
+  ; CHECK: %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+  %atomic.fmin = atomicrmw fmin ptr %x, <2 x half> %val seq_cst
+
+  ret void
+}
diff --git a/llvm/test/Assembler/invalid-atomicrmw-scalable.ll b/llvm/test/Assembler/invalid-atomicrmw-scalable.ll
@@ -0,0 +1,41 @@
+; RUN: split-file %s %t --leading-lines
+; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR0 %s
+; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR1 %s
+; RUN: not llvm-as < %t/scalable_ptr_vector_atomicrmw_xchg.ll 2>&1 | FileCheck -check-prefix=ERR2 %s
+; RUN: not llvm-as < %t/scalable_fp_vector_atomicrmw_fadd.ll 2>&1 | FileCheck -check-prefix=ERR3 %s
+; RUN: not llvm-as < %t/scalable_int_vector_atomicrmw_add.ll 2>&1 | FileCheck -check-prefix=ERR4 %s
+
+;--- scalable_fp_vector_atomicrmw_xchg.ll
+define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x half> %val) {
+; ERR0: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x half> %val seq_cst
+  ret <vscale x 2 x half> %atomic.xchg
+}
+
+;--- scalable_int_vector_atomicrmw_xchg.ll
+define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x i16> %val) {
+; ERR1: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x i16> %val seq_cst
+  ret <vscale x 2 x i16> %atomic.xchg
+}
+
+;--- scalable_ptr_vector_atomicrmw_xchg.ll
+define <vscale x 2 x ptr> @scalable_ptr_vector_atomicrmw_xchg(ptr %x, <vscale x 2 x ptr> %val) {
+; ERR2: :41: error: atomicrmw operand may not be scalable
+  %atomic.xchg = atomicrmw xchg ptr %x, <vscale x 2 x ptr> %val seq_cst
+  ret <vscale x 2 x ptr> %atomic.xchg
+}
+
+;--- scalable_fp_vector_atomicrmw_fadd.ll
+define <vscale x 2 x half> @scalable_fp_vector_atomicrmw_fadd(ptr %x, <vscale x 2 x half> %val) {
+; ERR3: :41: error: atomicrmw operand may not be scalable
+  %atomic.fadd = atomicrmw fadd ptr %x, <vscale x 2 x half> %val seq_cst
+  ret <vscale x 2 x half> %atomic.fadd
+}
+
+;--- scalable_int_vector_atomicrmw_add.ll
+define <vscale x 2 x i16> @scalable_int_vector_atomicrmw_add(ptr %x, <vscale x 2 x i16> %val) {
+; ERR4: :39: error: atomicrmw operand may not be scalable
+  %atomic.add = atomicrmw add ptr %x, <vscale x 2 x i16> %val seq_cst
+  ret <vscale x 2 x i16> %atomic.add
+}
diff --git a/llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll b/llvm/test/Assembler/invalid-atomicrmw-xchg-fp-vector.ll
@@ -0,0 +1,7 @@
+; RUN: not llvm-as -disable-output %s 2>&1 | FileCheck %s
+
+; CHECK: error: atomicrmw xchg operand must be an integer, floating point, or pointer type
+define <2 x half> @fp_vector_atomicrmw(ptr %x, <2 x half> %val) {
+  %atomic.xchg = atomicrmw xchg ptr %x, <2 x half> %val seq_cst
+  ret <2 x half> %atomic.xchg
+}
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-- -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,NOLSE %s
+; RUN: llc -mtriple=aarch64-- -mattr=+lse -O1 -fast-isel=0 -global-isel=false %s -o - | FileCheck -check-prefixes=CHECK,LSE %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    fcvtl v1.4s, v0.4h
+; NOLSE-NEXT:    ldr s0, [x0]
+; NOLSE-NEXT:    b .LBB0_2
+; NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
+; NOLSE-NEXT:    fmov s0, w10
+; NOLSE-NEXT:    cmp w10, w9
+; NOLSE-NEXT:    b.eq .LBB0_5
+; NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT:    fcvtl v2.4s, v0.4h
+; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    fadd v2.4s, v2.4s, v1.4s
+; NOLSE-NEXT:    fcvtn v2.4h, v2.4s
+; NOLSE-NEXT:    fmov w8, s2
+; NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr w10, [x0]
+; NOLSE-NEXT:    cmp w10, w9
+; NOLSE-NEXT:    b.ne .LBB0_1
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
+; NOLSE-NEXT:    stlxr wzr, w8, [x0]
+; NOLSE-NEXT:    cbnz wzr, .LBB0_3
+; NOLSE-NEXT:    b .LBB0_1
+; NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; LSE:       // %bb.0:
+; LSE-NEXT:    fcvtl v1.4s, v0.4h
+; LSE-NEXT:    ldr s0, [x0]
+; LSE-NEXT:  .LBB0_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    fcvtl v2.4s, v0.4h
+; LSE-NEXT:    fmov w8, s0
+; LSE-NEXT:    mov w10, w8
+; LSE-NEXT:    fadd v2.4s, v2.4s, v1.4s
+; LSE-NEXT:    fcvtn v2.4h, v2.4s
+; LSE-NEXT:    fmov w9, s2
+; LSE-NEXT:    casal w10, w9, [x0]
+; LSE-NEXT:    fmov s0, w10
+; LSE-NEXT:    cmp w10, w8
+; LSE-NEXT:    b.ne .LBB0_1
+; LSE-NEXT:  // %bb.2: // %atomicrmw.end
+; LSE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; LSE-NEXT:    ret
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+  ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; NOLSE:       // %bb.0:
+; NOLSE-NEXT:    ldr d1, [x0]
+; NOLSE-NEXT:    b .LBB1_2
+; NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
+; NOLSE-NEXT:    fmov d1, x10
+; NOLSE-NEXT:    cmp x10, x9
+; NOLSE-NEXT:    b.eq .LBB1_5
+; NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT:    fadd v2.2s, v1.2s, v0.2s
+; NOLSE-NEXT:    fmov x9, d1
+; NOLSE-NEXT:    fmov x8, d2
+; NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxr x10, [x0]
+; NOLSE-NEXT:    cmp x10, x9
+; NOLSE-NEXT:    b.ne .LBB1_1
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
+; NOLSE-NEXT:    stlxr wzr, x8, [x0]
+; NOLSE-NEXT:    cbnz wzr, .LBB1_3
+; NOLSE-NEXT:    b .LBB1_1
+; NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT:    fmov d0, d1
+; NOLSE-NEXT:    ret
+;
+; LSE-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; LSE:       // %bb.0:
+; LSE-NEXT:    ldr d1, [x0]
+; LSE-NEXT:  .LBB1_1: // %atomicrmw.start
+; LSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; LSE-NEXT:    fadd v2.2s, v1.2s, v0.2s
+; LSE-NEXT:    fmov x8, d1
+; LSE-NEXT:    mov x10, x8
+; LSE-NEXT:    fmov x9, d2
+; LSE-NEXT:    casal x10, x9, [x0]
+; LSE-NEXT:    fmov d1, x10
+; LSE-NEXT:    cmp x10, x8
+; LSE-NEXT:    b.ne .LBB1_1
+; LSE-NEXT:  // %bb.2: // %atomicrmw.end
+; LSE-NEXT:    fmov d0, d1
+; LSE-NEXT:    ret
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+  ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/X86/atomicrmw-fadd-fp-vector.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck %s
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x half> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f16_align4:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $88, %rsp
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    psrld $16, %xmm0
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    pinsrw $0, 2(%rdi), %xmm1
+; CHECK-NEXT:    pinsrw $0, (%rdi), %xmm0
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    callq __truncsfhf2@PLT
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %ebp
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    callq __extendhfsf2@PLT
+; CHECK-NEXT:    addss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
+; CHECK-NEXT:    callq __truncsfhf2@PLT
+; CHECK-NEXT:    pextrw $0, %xmm0, %ecx
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %ebp, %ecx
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    pextrw $0, %xmm0, %edx
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    pextrw $0, %xmm0, %eax
+; CHECK-NEXT:    movzwl %ax, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    lock cmpxchgl %ecx, (%rbx)
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm0
+; CHECK-NEXT:    shrl $16, %eax
+; CHECK-NEXT:    pinsrw $0, %eax, %xmm1
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-NEXT:    addq $88, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value seq_cst, align 4
+  ret <2 x half> %res
+}
+
+define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x float> %value) #0 {
+; CHECK-LABEL: test_atomicrmw_fadd_v2f32_align8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB1_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %xmm1, %rax
+; CHECK-NEXT:    addps %xmm0, %xmm1
+; CHECK-NEXT:    movq %xmm1, %rcx
+; CHECK-NEXT:    lock cmpxchgq %rcx, (%rdi)
+; CHECK-NEXT:    movq %rax, %xmm1
+; CHECK-NEXT:    jne .LBB1_1
+; CHECK-NEXT:  # %bb.2: # %atomicrmw.end
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x float> %value seq_cst, align 8
+  ret <2 x float> %res
+}
+
+attributes #0 = { nounwind }