From 5e13bdd36179f5fe9511e013e8ffb9099711d379 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Fri, 16 Aug 2024 14:26:55 +0400
Subject: [PATCH] AArch64: Check if atomicrmw will lower to a libcall before
 choosing an LL/SC expansion

An LL/SC expansion is a poor fit for an atomicrmw whose floating-point
operation lowers to a library call (e.g. __addtf3 for fp128 fadd): the
call would be emitted between the exclusive load and store, where it can
clear the exclusive monitor, so the store-exclusive may never succeed.
Lower such operations to a cmpxchg loop instead, which keeps the libcall
outside the ldaxp/stlxp pair.
---
 .../Target/AArch64/AArch64ISelLowering.cpp  | 24 ++++++-
 llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll | 62 ++++++++++++++-----
 llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 62 ++++++++++++++-----
 llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 62 ++++++++++++++-----
 llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll | 62 ++++++++++++++-----
 5 files changed, 202 insertions(+), 70 deletions(-)
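As a reference for the change, here is a minimal input that now takes the
CmpXChg path (a hand-reduced sketch in the style of the updated tests, with
an illustrative function name, not copied from the test files): the fp128
fadd is a floating-point RMW whose arithmetic lowers to the __addtf3
libcall, so rmwOpMayLowerToLibcall() reports true for it.

  ; Sketch only; @example is illustrative.
  define fp128 @example(ptr %ptr, fp128 %value) {
    %r = atomicrmw fadd ptr %ptr, fp128 %value seq_cst, align 16
    ret fp128 %r
  }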
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 56844210511ace..8accf9ebcb8190 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -27056,13 +27056,33 @@ AArch64TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
              : AtomicExpansionKind::LLSC;
 }
 
+// Return true if the atomic operation expansion will lower to use a library
+// call, and is thus ineligible to use an LLSC expansion.
+static bool rmwOpMayLowerToLibcall(const AtomicRMWInst *RMW) {
+  if (!RMW->isFloatingPointOperation())
+    return false;
+  switch (RMW->getType()->getScalarType()->getTypeID()) {
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+  case Type::HalfTyID:
+  case Type::BFloatTyID:
+    return false;
+  default:
+    // fp128 will emit library calls.
+    return true;
+  }
+
+  llvm_unreachable("covered type switch");
+}
+
 // The "default" for integer RMW operations is to expand to an LL/SC loop.
 // However, with the LSE instructions (or outline-atomics mode, which provides
 // library routines in place of the LSE-instructions), we can directly emit many
 // operations instead.
 TargetLowering::AtomicExpansionKind
 AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
-  unsigned Size = AI->getType()->getPrimitiveSizeInBits();
+  Type *Ty = AI->getType();
+  unsigned Size = Ty->getPrimitiveSizeInBits();
   assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes.");
 
   bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 &&
@@ -27101,7 +27121,7 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
   // succeed. So at -O0 lower this operation to a CAS loop. Also worthwhile if
   // we have a single CAS instruction that can replace the loop.
   if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None ||
-      Subtarget->hasLSE())
+      Subtarget->hasLSE() || rmwOpMayLowerToLibcall(AI))
     return AtomicExpansionKind::CmpXChg;
 
   return AtomicExpansionKind::LLSC;
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
index 20a8a862506f70..50aba3a7719e79 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll
@@ -273,26 +273,54 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #
 define fp128 @test_atomicrmw_fadd_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    sub sp, sp, #80
-; NOLSE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NOLSE-NEXT:    sub sp, sp, #96
+; NOLSE-NEXT:    ldr q1, [x0]
+; NOLSE-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; NOLSE-NEXT:    mov x19, x0
-; NOLSE-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT:    ldaxp x8, x9, [x19]
-; NOLSE-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT:    stp x8, x9, [sp, #48]
-; NOLSE-NEXT:    ldr q0, [sp, #48]
 ; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NOLSE-NEXT:    b .LBB6_2
+; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
+; NOLSE-NEXT:    stp x12, x13, [sp, #32]
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    ldr q1, [sp, #32]
+; NOLSE-NEXT:    ccmp x12, x11, #0, eq
+; NOLSE-NEXT:    b.eq .LBB6_6
+; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NOLSE-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; NOLSE-NEXT:    bl __addtf3
-; NOLSE-NEXT:    str q0, [sp, #32]
-; NOLSE-NEXT:    ldp x9, x8, [sp, #32]
-; NOLSE-NEXT:    stlxp w10, x9, x8, [x19]
-; NOLSE-NEXT:    cbnz w10, .LBB6_1
-; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NOLSE-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; NOLSE-NEXT:    add sp, sp, #80
+; NOLSE-NEXT:    str q0, [sp, #48]
+; NOLSE-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT:    ldp x9, x8, [sp, #48]
+; NOLSE-NEXT:    str q0, [sp, #64]
+; NOLSE-NEXT:    ldp x11, x10, [sp, #64]
+; NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x12, x13, [x19]
+; NOLSE-NEXT:    cmp x12, x11
+; NOLSE-NEXT:    cset w14, ne
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    cinc w14, w14, ne
+; NOLSE-NEXT:    cbz w14, .LBB6_5
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x12, x13, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x9, x8, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; NOLSE-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    add sp, sp, #96
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fadd_fp128_seq_cst_align16:
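The updated NOLSE checks above have the shape of a cmpxchg expansion: the
outer loop (.LBB6_2) performs the fadd via __addtf3, and the inner
ldaxp/stlxp loop (.LBB6_3) only implements the 16-byte compare-exchange,
so no call is made while the exclusive monitor is held. In IR terms the
expansion looks roughly like the hand-written sketch below (simplified;
not the literal AtomicExpandPass output, and the function name is
illustrative):

  define fp128 @fadd_cas_sketch(ptr %p, fp128 %v) {
  entry:
    %init = load fp128, ptr %p, align 16
    br label %loop

  loop:
    ; The libcall-lowered fadd happens here, outside the atomic primitive.
    %loaded = phi fp128 [ %init, %entry ], [ %next, %loop ]
    %sum = fadd fp128 %loaded, %v
    %expected = bitcast fp128 %loaded to i128
    %desired = bitcast fp128 %sum to i128
    ; Without LSE128, this 16-byte cmpxchg is itself emitted as a small
    ; ldaxp/stlxp loop, but one that contains no function call.
    %res = cmpxchg ptr %p, i128 %expected, i128 %desired seq_cst seq_cst, align 16
    %old = extractvalue { i128, i1 } %res, 0
    %ok = extractvalue { i128, i1 } %res, 1
    %next = bitcast i128 %old to fp128
    br i1 %ok, label %end, label %loop

  end:
    ret fp128 %loaded
  }

The same structure applies to the fmax, fmin, and fsub tests that follow.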
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
index 7a65e5cb02f577..5cbc06796026f8 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll
@@ -273,26 +273,54 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #
 define fp128 @test_atomicrmw_fmax_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    sub sp, sp, #80
-; NOLSE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NOLSE-NEXT:    sub sp, sp, #96
+; NOLSE-NEXT:    ldr q1, [x0]
+; NOLSE-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; NOLSE-NEXT:    mov x19, x0
-; NOLSE-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT:    ldaxp x8, x9, [x19]
-; NOLSE-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT:    stp x8, x9, [sp, #48]
-; NOLSE-NEXT:    ldr q0, [sp, #48]
 ; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NOLSE-NEXT:    b .LBB6_2
+; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
+; NOLSE-NEXT:    stp x12, x13, [sp, #32]
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    ldr q1, [sp, #32]
+; NOLSE-NEXT:    ccmp x12, x11, #0, eq
+; NOLSE-NEXT:    b.eq .LBB6_6
+; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NOLSE-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; NOLSE-NEXT:    bl fmaxl
-; NOLSE-NEXT:    str q0, [sp, #32]
-; NOLSE-NEXT:    ldp x9, x8, [sp, #32]
-; NOLSE-NEXT:    stlxp w10, x9, x8, [x19]
-; NOLSE-NEXT:    cbnz w10, .LBB6_1
-; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NOLSE-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; NOLSE-NEXT:    add sp, sp, #80
+; NOLSE-NEXT:    str q0, [sp, #48]
+; NOLSE-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT:    ldp x9, x8, [sp, #48]
+; NOLSE-NEXT:    str q0, [sp, #64]
+; NOLSE-NEXT:    ldp x11, x10, [sp, #64]
+; NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x12, x13, [x19]
+; NOLSE-NEXT:    cmp x12, x11
+; NOLSE-NEXT:    cset w14, ne
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    cinc w14, w14, ne
+; NOLSE-NEXT:    cbz w14, .LBB6_5
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x12, x13, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x9, x8, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; NOLSE-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    add sp, sp, #96
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmax_fp128_seq_cst_align16:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
index 17e618849501c1..e7c950d1c89ea4 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll
@@ -273,26 +273,54 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #
 define fp128 @test_atomicrmw_fmin_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    sub sp, sp, #80
-; NOLSE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NOLSE-NEXT:    sub sp, sp, #96
+; NOLSE-NEXT:    ldr q1, [x0]
+; NOLSE-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; NOLSE-NEXT:    mov x19, x0
-; NOLSE-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT:    ldaxp x8, x9, [x19]
-; NOLSE-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT:    stp x8, x9, [sp, #48]
-; NOLSE-NEXT:    ldr q0, [sp, #48]
 ; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NOLSE-NEXT:    b .LBB6_2
+; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
+; NOLSE-NEXT:    stp x12, x13, [sp, #32]
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    ldr q1, [sp, #32]
+; NOLSE-NEXT:    ccmp x12, x11, #0, eq
+; NOLSE-NEXT:    b.eq .LBB6_6
+; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NOLSE-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; NOLSE-NEXT:    bl fminl
-; NOLSE-NEXT:    str q0, [sp, #32]
-; NOLSE-NEXT:    ldp x9, x8, [sp, #32]
-; NOLSE-NEXT:    stlxp w10, x9, x8, [x19]
-; NOLSE-NEXT:    cbnz w10, .LBB6_1
-; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NOLSE-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; NOLSE-NEXT:    add sp, sp, #80
+; NOLSE-NEXT:    str q0, [sp, #48]
+; NOLSE-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT:    ldp x9, x8, [sp, #48]
+; NOLSE-NEXT:    str q0, [sp, #64]
+; NOLSE-NEXT:    ldp x11, x10, [sp, #64]
+; NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x12, x13, [x19]
+; NOLSE-NEXT:    cmp x12, x11
+; NOLSE-NEXT:    cset w14, ne
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    cinc w14, w14, ne
+; NOLSE-NEXT:    cbz w14, .LBB6_5
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x12, x13, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x9, x8, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; NOLSE-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    add sp, sp, #96
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fmin_fp128_seq_cst_align16:
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
index 0ccf91249a0b82..31858506266d61 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll
@@ -273,26 +273,54 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #
 define fp128 @test_atomicrmw_fsub_fp128_seq_cst_align16(ptr %ptr, fp128 %value) #0 {
 ; NOLSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16:
 ; NOLSE:       // %bb.0:
-; NOLSE-NEXT:    sub sp, sp, #80
-; NOLSE-NEXT:    stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; NOLSE-NEXT:    sub sp, sp, #96
+; NOLSE-NEXT:    ldr q1, [x0]
+; NOLSE-NEXT:    stp x30, x19, [sp, #80] // 16-byte Folded Spill
 ; NOLSE-NEXT:    mov x19, x0
-; NOLSE-NEXT:    str q0, [sp, #16] // 16-byte Folded Spill
-; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
-; NOLSE-NEXT:    // =>This Inner Loop Header: Depth=1
-; NOLSE-NEXT:    ldaxp x8, x9, [x19]
-; NOLSE-NEXT:    ldr q1, [sp, #16] // 16-byte Folded Reload
-; NOLSE-NEXT:    stp x8, x9, [sp, #48]
-; NOLSE-NEXT:    ldr q0, [sp, #48]
 ; NOLSE-NEXT:    str q0, [sp] // 16-byte Folded Spill
+; NOLSE-NEXT:    b .LBB6_2
+; NOLSE-NEXT:  .LBB6_1: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_2 Depth=1
+; NOLSE-NEXT:    stp x12, x13, [sp, #32]
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    ldr q1, [sp, #32]
+; NOLSE-NEXT:    ccmp x12, x11, #0, eq
+; NOLSE-NEXT:    b.eq .LBB6_6
+; NOLSE-NEXT:  .LBB6_2: // %atomicrmw.start
+; NOLSE-NEXT:    // =>This Loop Header: Depth=1
+; NOLSE-NEXT:    // Child Loop BB6_3 Depth 2
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
+; NOLSE-NEXT:    ldr q1, [sp] // 16-byte Folded Reload
 ; NOLSE-NEXT:    bl __subtf3
-; NOLSE-NEXT:    str q0, [sp, #32]
-; NOLSE-NEXT:    ldp x9, x8, [sp, #32]
-; NOLSE-NEXT:    stlxp w10, x9, x8, [x19]
-; NOLSE-NEXT:    cbnz w10, .LBB6_1
-; NOLSE-NEXT:  // %bb.2: // %atomicrmw.end
-; NOLSE-NEXT:    ldp x30, x19, [sp, #64] // 16-byte Folded Reload
-; NOLSE-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
-; NOLSE-NEXT:    add sp, sp, #80
+; NOLSE-NEXT:    str q0, [sp, #48]
+; NOLSE-NEXT:    ldr q0, [sp, #16] // 16-byte Folded Reload
+; NOLSE-NEXT:    ldp x9, x8, [sp, #48]
+; NOLSE-NEXT:    str q0, [sp, #64]
+; NOLSE-NEXT:    ldp x11, x10, [sp, #64]
+; NOLSE-NEXT:  .LBB6_3: // %atomicrmw.start
+; NOLSE-NEXT:    // Parent Loop BB6_2 Depth=1
+; NOLSE-NEXT:    // => This Inner Loop Header: Depth=2
+; NOLSE-NEXT:    ldaxp x12, x13, [x19]
+; NOLSE-NEXT:    cmp x12, x11
+; NOLSE-NEXT:    cset w14, ne
+; NOLSE-NEXT:    cmp x13, x10
+; NOLSE-NEXT:    cinc w14, w14, ne
+; NOLSE-NEXT:    cbz w14, .LBB6_5
+; NOLSE-NEXT:  // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x12, x13, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_5: // %atomicrmw.start
+; NOLSE-NEXT:    // in Loop: Header=BB6_3 Depth=2
+; NOLSE-NEXT:    stlxp w14, x9, x8, [x19]
+; NOLSE-NEXT:    cbnz w14, .LBB6_3
+; NOLSE-NEXT:    b .LBB6_1
+; NOLSE-NEXT:  .LBB6_6: // %atomicrmw.end
+; NOLSE-NEXT:    ldp x30, x19, [sp, #80] // 16-byte Folded Reload
+; NOLSE-NEXT:    mov v0.16b, v1.16b
+; NOLSE-NEXT:    add sp, sp, #96
 ; NOLSE-NEXT:    ret
 ;
 ; LSE-LABEL: test_atomicrmw_fsub_fp128_seq_cst_align16: