Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AArch64: Use consistent atomicrmw expansion for FP operations #103702

Merged
merged 4 commits into from
Aug 29, 2024

Conversation

arsenm
Copy link
Contributor

@arsenm arsenm commented Aug 14, 2024

Use LLSC or cmpxchg in the same cases as for the unsupported
integer operations. This required some fixups to the LLSC
implementation to deal with the fp128 case.

The comment about floating-point exceptions was wrong: floating-point
"exceptions" are not exceptions in the architectural sense, so they cannot
cancel an exclusive reservation.

Copy link
Contributor Author

arsenm commented Aug 14, 2024

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @arsenm and the rest of your teammates on Graphite Graphite

@llvmbot
Copy link
Collaborator

llvmbot commented Aug 14, 2024

@llvm/pr-subscribers-backend-aarch64

Author: Matt Arsenault (arsenm)

Changes

Use LLSC or cmpxchg in the same cases as for the unsupported
integer operations. This required some fixups to the LLSC
implementation to deal with the fp128 case.


Patch is 88.31 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/103702.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+18-12)
  • (modified) llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll (+94-298)
  • (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll (+107-311)
  • (modified) llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll (+107-311)
  • (modified) llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll (+94-298)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 40d9fa4f2b494a..26f698898e487b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27061,9 +27061,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   unsigned Size = AI->getType()->getPrimitiveSizeInBits();
   assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
 
-  if (AI->isFloatingPointOperation())
-    return AtomicExpansionKind::CmpXChg;
-
   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
                       (AI->getOperation() == AtomicRMWInst::Xchg ||
                        AI->getOperation() == AtomicRMWInst::Or ||
@@ -27073,7 +27070,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
 
   // Nand is not supported in LSE.
   // Leave 128 bits to LLSC or CmpXChg.
-  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) {
+  if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 &&
+      !AI->isFloatingPointOperation()) {
     if (Subtarget->hasLSE())
       return AtomicExpansionKind::None;
     if (Subtarget->outlineAtomics()) {
@@ -27146,10 +27144,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
 
     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
-    Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
-    Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
-    return Builder.CreateOr(
-        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64");
+
+    auto *Int128Ty = Type::getInt128Ty(Builder.getContext());
+    Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64");
+    Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64");
+
+    Value *Or = Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64");
+    return Builder.CreateBitCast(Or, ValueTy);
   }
 
   Type *Tys[] = { Addr->getType() };
@@ -27160,8 +27162,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
   const DataLayout &DL = M->getDataLayout();
   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
   CallInst *CI = Builder.CreateCall(Ldxr, Addr);
-  CI->addParamAttr(
-      0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy));
+  CI->addParamAttr(0, Attribute::get(Builder.getContext(),
+                                     Attribute::ElementType, IntEltTy));
   Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
 
   return Builder.CreateBitCast(Trunc, ValueTy);
@@ -27187,9 +27189,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder,
         IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
     Function *Stxr = Intrinsic::getDeclaration(M, Int);
     Type *Int64Ty = Type::getInt64Ty(M->getContext());
+    Type *Int128Ty = Type::getInt128Ty(M->getContext());
+
+    Value *CastVal = Builder.CreateBitCast(Val, Int128Ty);
 
-    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
-    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo");
+    Value *Hi =
+        Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi");
     return Builder.CreateCall(Stxr, {Lo, Hi, Addr});
   }
 
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index f95caf325b197c..2c6461097f7d9b 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -6,33 +6,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    fcvt s1, h0
-; NOLSE-NEXT:    ldr h0, [x0]
-; NOLSE-NEXT:    b .LBB0_2
 ; NOLSE-NEXT:  .LBB0_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB0_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w10
-; NOLSE-NEXT:    cmp w10, w9, uxth
-; NOLSE-NEXT:    b.eq .LBB0_5
-; NOLSE-NEXT:  .LBB0_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB0_3 Depth 2
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxrh w8, [x0]
+; NOLSE-NEXT:    fmov s0, w8
 ; NOLSE-NEXT:    fcvt s2, h0
-; NOLSE-NEXT:    fmov w9, s0
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fcvt h2, s2
 ; NOLSE-NEXT:    fmov w8, s2
-; NOLSE-NEXT:  .LBB0_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB0_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxrh w10, [x0]
-; NOLSE-NEXT:    cmp w10, w9, uxth
-; NOLSE-NEXT:    b.ne .LBB0_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB0_3 Depth=2
-; NOLSE-NEXT:    stlxrh wzr, w8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB0_3
-; NOLSE-NEXT:    b .LBB0_1
-; NOLSE-NEXT:  .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxrh w9, w8, [x0]
+; NOLSE-NEXT:    cbnz w9, .LBB0_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; NOLSE-NEXT:    ret
 ;
@@ -63,33 +47,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    fcvt s1, h0
-; NOLSE-NEXT:    ldr h0, [x0]
-; NOLSE-NEXT:    b .LBB1_2
 ; NOLSE-NEXT:  .LBB1_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB1_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w10
-; NOLSE-NEXT:    cmp w10, w9, uxth
-; NOLSE-NEXT:    b.eq .LBB1_5
-; NOLSE-NEXT:  .LBB1_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB1_3 Depth 2
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxrh w8, [x0]
+; NOLSE-NEXT:    fmov s0, w8
 ; NOLSE-NEXT:    fcvt s2, h0
-; NOLSE-NEXT:    fmov w9, s0
 ; NOLSE-NEXT:    fadd s2, s2, s1
 ; NOLSE-NEXT:    fcvt h2, s2
 ; NOLSE-NEXT:    fmov w8, s2
-; NOLSE-NEXT:  .LBB1_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB1_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxrh w10, [x0]
-; NOLSE-NEXT:    cmp w10, w9, uxth
-; NOLSE-NEXT:    b.ne .LBB1_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB1_3 Depth=2
-; NOLSE-NEXT:    stlxrh wzr, w8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB1_3
-; NOLSE-NEXT:    b .LBB1_1
-; NOLSE-NEXT:  .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxrh w9, w8, [x0]
+; NOLSE-NEXT:    cbnz w9, .LBB1_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; NOLSE-NEXT:    ret
 ;
@@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
 ; NOLSE-NEXT:    fmov w9, s0
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    ldr h0, [x0]
 ; NOLSE-NEXT:    lsl w9, w9, #16
 ; NOLSE-NEXT:    fmov s1, w9
-; NOLSE-NEXT:    b .LBB2_2
 ; NOLSE-NEXT:  .LBB2_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB2_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w11
-; NOLSE-NEXT:    cmp w11, w9, uxth
-; NOLSE-NEXT:    b.eq .LBB2_5
-; NOLSE-NEXT:  .LBB2_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB2_3 Depth 2
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxrh w9, [x0]
+; NOLSE-NEXT:    fmov s0, w9
 ; NOLSE-NEXT:    lsl w9, w9, #16
 ; NOLSE-NEXT:    fmov s2, w9
 ; NOLSE-NEXT:    fadd s2, s2, s1
@@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    add w9, w9, w8
 ; NOLSE-NEXT:    add w9, w10, w9
 ; NOLSE-NEXT:    lsr w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    fmov w9, s0
-; NOLSE-NEXT:    fmov w10, s2
-; NOLSE-NEXT:  .LBB2_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB2_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxrh w11, [x0]
-; NOLSE-NEXT:    cmp w11, w9, uxth
-; NOLSE-NEXT:    b.ne .LBB2_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB2_3 Depth=2
-; NOLSE-NEXT:    stlxrh wzr, w10, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB2_3
-; NOLSE-NEXT:    b .LBB2_1
-; NOLSE-NEXT:  .LBB2_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxrh w10, w9, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB2_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; NOLSE-NEXT:    ret
 ;
@@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 def $s0
 ; NOLSE-NEXT:    fmov w9, s0
 ; NOLSE-NEXT:    mov w8, #32767 // =0x7fff
-; NOLSE-NEXT:    ldr h0, [x0]
 ; NOLSE-NEXT:    lsl w9, w9, #16
 ; NOLSE-NEXT:    fmov s1, w9
-; NOLSE-NEXT:    b .LBB3_2
 ; NOLSE-NEXT:  .LBB3_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB3_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w11
-; NOLSE-NEXT:    cmp w11, w9, uxth
-; NOLSE-NEXT:    b.eq .LBB3_5
-; NOLSE-NEXT:  .LBB3_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB3_3 Depth 2
-; NOLSE-NEXT:    fmov w9, s0
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxrh w9, [x0]
+; NOLSE-NEXT:    fmov s0, w9
 ; NOLSE-NEXT:    lsl w9, w9, #16
 ; NOLSE-NEXT:    fmov s2, w9
 ; NOLSE-NEXT:    fadd s2, s2, s1
@@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 ; NOLSE-NEXT:    add w9, w9, w8
 ; NOLSE-NEXT:    add w9, w10, w9
 ; NOLSE-NEXT:    lsr w9, w9, #16
-; NOLSE-NEXT:    fmov s2, w9
-; NOLSE-NEXT:    fmov w9, s0
-; NOLSE-NEXT:    fmov w10, s2
-; NOLSE-NEXT:  .LBB3_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB3_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxrh w11, [x0]
-; NOLSE-NEXT:    cmp w11, w9, uxth
-; NOLSE-NEXT:    b.ne .LBB3_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB3_3 Depth=2
-; NOLSE-NEXT:    stlxrh wzr, w10, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB3_3
-; NOLSE-NEXT:    b .LBB3_1
-; NOLSE-NEXT:  .LBB3_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxrh w10, w9, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB3_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    // kill: def $h0 killed $h0 killed $s0
 ; NOLSE-NEXT:    ret
 ;
@@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value)
 define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    ldr s1, [x0]
-; NOLSE-NEXT:    b .LBB4_2
 ; NOLSE-NEXT:  .LBB4_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB4_2 Depth=1
-; NOLSE-NEXT:    fmov s1, w10
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.eq .LBB4_5
-; NOLSE-NEXT:  .LBB4_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB4_3 Depth 2
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr w8, [x0]
+; NOLSE-NEXT:    fmov s1, w8
 ; NOLSE-NEXT:    fadd s2, s1, s0
-; NOLSE-NEXT:    fmov w9, s1
 ; NOLSE-NEXT:    fmov w8, s2
-; NOLSE-NEXT:  .LBB4_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB4_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxr w10, [x0]
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.ne .LBB4_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB4_3 Depth=2
-; NOLSE-NEXT:    stlxr wzr, w8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB4_3
-; NOLSE-NEXT:    b .LBB4_1
-; NOLSE-NEXT:  .LBB4_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxr w9, w8, [x0]
+; NOLSE-NEXT:    cbnz w9, .LBB4_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    fmov s0, s1
 ; NOLSE-NEXT:    ret
 ;
@@ -324,31 +238,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0
 define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    ldr d1, [x0]
-; NOLSE-NEXT:    b .LBB5_2
 ; NOLSE-NEXT:  .LBB5_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB5_2 Depth=1
-; NOLSE-NEXT:    fmov d1, x10
-; NOLSE-NEXT:    cmp x10, x9
-; NOLSE-NEXT:    b.eq .LBB5_5
-; NOLSE-NEXT:  .LBB5_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB5_3 Depth 2
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr x8, [x0]
+; NOLSE-NEXT:    fmov d1, x8
 ; NOLSE-NEXT:    fadd d2, d1, d0
-; NOLSE-NEXT:    fmov x9, d1
 ; NOLSE-NEXT:    fmov x8, d2
-; NOLSE-NEXT:  .LBB5_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB5_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxr x10, [x0]
-; NOLSE-NEXT:    cmp x10, x9
-; NOLSE-NEXT:    b.ne .LBB5_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB5_3 Depth=2
-; NOLSE-NEXT:    stlxr wzr, x8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB5_3
-; NOLSE-NEXT:    b .LBB5_1
-; NOLSE-NEXT:  .LBB5_5: // %atomicrmw.end
+; NOLSE-NEXT:    stlxr w9, x8, [x0]
+; NOLSE-NEXT:    cbnz w9, .LBB5_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
 ; NOLSE-NEXT:    fmov d0, d1
 ; NOLSE-NEXT:    ret
 ;
@@ -375,54 +273,26 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #
 define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    sub sp, sp, #96
-; NOLSE-NEXT:    ldr q1, [x0]
-; NOLSE-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
+; NOLSE-NEXT:    sub sp, sp, #80
+; NOLSE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
 ; NOLSE-NEXT:    mov x19, x0
-; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
-; NOLSE-NEXT:    b .LBB6_2
+; NOLSE-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
 ; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
-; NOLSE-NEXT:    stp x12, x13, [sp, #32]
-; NOLSE-NEXT:    cmp x13, x10
-; NOLSE-NEXT:    ldr q1, [sp, #32]
-; NOLSE-NEXT:    ccmp x12, x11, #0, eq
-; NOLSE-NEXT:    b.eq .LBB6_6
-; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
-; NOLSE-NEXT:    mov v0.16b, v1.16b
-; NOLSE-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxp x8, x9, [x19]
+; NOLSE-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT:    stp x8, x9, [sp, #48]
+; NOLSE-NEXT:    ldr q0, [sp, #48]
+; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
 ; NOLSE-NEXT:    bl __addtf3
-; NOLSE-NEXT:    str q0, [sp, #48]
-; NOLSE-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT:    ldp x9, x8, [sp, #48]
-; NOLSE-NEXT:    str q0, [sp, #64]
-; NOLSE-NEXT:    ldp x11, x10, [sp, #64]
-; NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxp x12, x13, [x19]
-; NOLSE-NEXT:    cmp x12, x11
-; NOLSE-NEXT:    cset w14, ne
-; NOLSE-NEXT:    cmp x13, x10
-; NOLSE-NEXT:    cinc w14, w14, ne
-; NOLSE-NEXT:    cbz w14, .LBB6_5
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT:    stlxp w14, x12, x13, [x19]
-; NOLSE-NEXT:    cbnz w14, .LBB6_3
-; NOLSE-NEXT:    b .LBB6_1
-; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
-; NOLSE-NEXT:    stlxp w14, x9, x8, [x19]
-; NOLSE-NEXT:    cbnz w14, .LBB6_3
-; NOLSE-NEXT:    b .LBB6_1
-; NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
-; NOLSE-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
-; NOLSE-NEXT:    mov v0.16b, v1.16b
-; NOLSE-NEXT:    add sp, sp, #96
+; NOLSE-NEXT:    str q0, [sp, #32]
+; NOLSE-NEXT:    ldp x9, x8, [sp, #32]
+; NOLSE-NEXT:    stlxp w10, x9, x8, [x19]
+; NOLSE-NEXT:    cbnz w10, .LBB6_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; NOLSE-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
+; NOLSE-NEXT:    add sp, sp, #80
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16:
@@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0
 define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    fcvtl v1.4s, v0.4h
-; NOLSE-NEXT:    ldr s0, [x0]
-; NOLSE-NEXT:    b .LBB7_2
+; NOLSE-NEXT:    fcvtl v0.4s, v0.4h
 ; NOLSE-NEXT:  .LBB7_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB7_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w10
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.eq .LBB7_5
-; NOLSE-NEXT:  .LBB7_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB7_3 Depth 2
-; NOLSE-NEXT:    fcvtl v2.4s, v0.4h
-; NOLSE-NEXT:    fmov w9, s0
-; NOLSE-NEXT:    fadd v2.4s, v2.4s, v1.4s
-; NOLSE-NEXT:    fcvtn v2.4h, v2.4s
-; NOLSE-NEXT:    fmov w8, s2
-; NOLSE-NEXT:  .LBB7_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB7_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxr w10, [x0]
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.ne .LBB7_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB7_3 Depth=2
-; NOLSE-NEXT:    stlxr wzr, w8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB7_3
-; NOLSE-NEXT:    b .LBB7_1
-; NOLSE-NEXT:  .LBB7_5: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr w8, [x0]
+; NOLSE-NEXT:    fmov s1, w8
+; NOLSE-NEXT:    fcvtl v1.4s, v1.4h
+; NOLSE-NEXT:    fadd v1.4s, v1.4s, v0.4s
+; NOLSE-NEXT:    fcvtn v1.4h, v1.4s
+; NOLSE-NEXT:    fmov w9, s1
+; NOLSE-NEXT:    stlxr w10, w9, [x0]
+; NOLSE-NEXT:    cbnz w10, .LBB7_1
+; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
+; NOLSE-NEXT:    fmov d0, x8
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4:
@@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf
 ; NOLSE:       // %bb.0:
 ; NOLSE-NEXT:    movi v1.4s, #1
 ; NOLSE-NEXT:    movi v2.4s, #127, msl #8
-; NOLSE-NEXT:    shll v3.4s, v0.4h, #16
-; NOLSE-NEXT:    ldr s0, [x0]
-; NOLSE-NEXT:    b .LBB8_2
+; NOLSE-NEXT:    shll v0.4s, v0.4h, #16
 ; NOLSE-NEXT:  .LBB8_1: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB8_2 Depth=1
-; NOLSE-NEXT:    fmov s0, w10
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.eq .LBB8_5
-; NOLSE-NEXT:  .LBB8_2: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Loop Header: Depth=1
-; NOLSE-NEXT:    // Child Loop BB8_3 Depth 2
-; NOLSE-NEXT:    shll v4.4s, v0.4h, #16
-; NOLSE-NEXT:    fmov w9, s0
-; NOLSE-NEXT:    fadd v4.4s, v4.4s, v3.4s
-; NOLSE-NEXT:    ushr v5.4s, v4.4s, #16
-; NOLSE-NEXT:    and v5.16b, v5.16b, v1.16b
-; NOLSE-NEXT:    add v4.4s, v5.4s, v4.4s
-; NOLSE-NEXT:    addhn v4.4h, v4.4s, v2.4s
-; NOLSE-NEXT:    fmov w8, s4
-; NOLSE-NEXT:  .LBB8_3: // %atomicrmw.start
-; NOLSE-NEXT:    // Parent Loop BB8_2 Depth=1
-; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT:    ldaxr w10, [x0]
-; NOLSE-NEXT:    cmp w10, w9
-; NOLSE-NEXT:    b.ne .LBB8_1
-; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
-; NOLSE-NEXT:    // in Loop: Header=BB8_3 Depth=2
-; NOLSE-NEXT:    stlxr wzr, w8, [x0]
-; NOLSE-NEXT:    cbnz wzr, .LBB8_3
-; NOLSE-NEXT:    b .LBB8_1
-; NOLSE-NEXT:  .LBB8_5: // %atomicrmw.end
-; NOLSE-NEXT:    // kill: def $d0 killed $d0 killed $q0
+; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
+; NOLSE-NEXT:    ldaxr w8, [x0]
+; NOLSE-NEXT:    fmov s3, w8
+; NOLSE-NEXT:    shll ...
[truncated]

@efriedma-quic
Copy link
Collaborator

Please also fix the comment before AArch64TargetLowering::shouldExpandAtomicRMWInIR . (The comment is wrong; a floating-point "exception" isn't an exception in the sense that it would cancel an exclusive reservation.)

@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 6b636d0 to da57a21 Compare August 15, 2024 14:36
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from ce17dc9 to d576f3e Compare August 15, 2024 14:36
@efriedma-quic
Copy link
Collaborator

Just thought of this, but... we can't do this in the case where we do a libcall. Any load or store between the load exclusive and the store exclusive could break the reservation. (It normally won't, but it can in weird cases where the atomic variable is on the stack.) So we have to use cmpxchg lowering in those cases (and then expand the cmpxchg to ll/sc).

The cases we can do inline should be fine, though.

@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from da57a21 to 38ce457 Compare August 16, 2024 10:49
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from d576f3e to 5c3c318 Compare August 16, 2024 10:49
@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 38ce457 to 07be231 Compare August 19, 2024 20:14
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 5c3c318 to 5e13bdd Compare August 19, 2024 20:14
@arsenm
Copy link
Contributor Author

arsenm commented Aug 22, 2024

ping

case Type::DoubleTyID:
case Type::HalfTyID:
case Type::BFloatTyID:
return false;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We need to check if we're in softfp mode (Subtarget->hasFPARMv8()).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems to not do anything?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

aarch64 has fp enabled by default; you need something like -mattr=-fp-armv8 to test.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(I'm confident Subtarget->hasFPARMv8() is the right check; it's the same check used to enable the registers.)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That is in the test (in the parent #103701)

@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 07be231 to f988002 Compare August 22, 2024 20:06
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 5e13bdd to 5bed50c Compare August 22, 2024 20:06
Copy link
Collaborator

@efriedma-quic efriedma-quic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from f988002 to 3fff86f Compare August 28, 2024 18:19
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 5bed50c to 5af8653 Compare August 28, 2024 18:19
@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 3fff86f to 07ad2a3 Compare August 29, 2024 03:46
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 5af8653 to bc9f9cd Compare August 29, 2024 03:46
@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 07ad2a3 to eb21c2b Compare August 29, 2024 05:20
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from bc9f9cd to 58082aa Compare August 29, 2024 05:20
@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from eb21c2b to 9c924de Compare August 29, 2024 09:50
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 58082aa to ad70ac1 Compare August 29, 2024 09:51
Copy link
Contributor Author

arsenm commented Aug 29, 2024

Merge activity

  • Aug 29, 11:32 AM EDT: @arsenm started a stack merge that includes this pull request via Graphite.
  • Aug 29, 11:43 AM EDT: Graphite rebased this pull request as part of a merge.
  • Aug 29, 11:46 AM EDT: Graphite rebased this pull request as part of a merge.
  • Aug 29, 11:48 AM EDT: @arsenm merged this pull request with Graphite.

@arsenm arsenm force-pushed the users/arsenm/aarch64-add-atomicrmw-fp-tests branch from 9c924de to 03949db Compare August 29, 2024 15:34
Base automatically changed from users/arsenm/aarch64-add-atomicrmw-fp-tests to main August 29, 2024 15:36
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from ad70ac1 to 2573ff3 Compare August 29, 2024 15:42
Use LLSC or cmpxchg in the same cases as for the unsupported
integer operations. This required some fixups to the LLSC
implementation to deal with the fp128 case.
@arsenm arsenm force-pushed the users/arsenm/aarch64-fp-atomic-expansion branch from 2573ff3 to bf31e46 Compare August 29, 2024 15:46
@arsenm arsenm merged commit 26c3a84 into main Aug 29, 2024
4 of 5 checks passed
@arsenm arsenm deleted the users/arsenm/aarch64-fp-atomic-expansion branch August 29, 2024 15:48
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants