diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 40d9fa4f2b494a..26f698898e487b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27061,9 +27061,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { unsigned Size = AI->getType()->getPrimitiveSizeInBits(); assert(Size <= 128 && "AtomicExpandPass should've handled larger sizes."); - if (AI->isFloatingPointOperation()) - return AtomicExpansionKind::CmpXChg; - bool CanUseLSE128 = Subtarget->hasLSE128() && Size == 128 && (AI->getOperation() == AtomicRMWInst::Xchg || AI->getOperation() == AtomicRMWInst::Or || @@ -27073,7 +27070,8 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { // Nand is not supported in LSE. // Leave 128 bits to LLSC or CmpXChg. - if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128) { + if (AI->getOperation() != AtomicRMWInst::Nand && Size < 128 && + !AI->isFloatingPointOperation()) { if (Subtarget->hasLSE()) return AtomicExpansionKind::None; if (Subtarget->outlineAtomics()) { @@ -27146,10 +27144,14 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo"); Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi"); - Lo = Builder.CreateZExt(Lo, ValueTy, "lo64"); - Hi = Builder.CreateZExt(Hi, ValueTy, "hi64"); - return Builder.CreateOr( - Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 64)), "val64"); + + auto *Int128Ty = Type::getInt128Ty(Builder.getContext()); + Lo = Builder.CreateZExt(Lo, Int128Ty, "lo64"); + Hi = Builder.CreateZExt(Hi, Int128Ty, "hi64"); + + Value *Or = Builder.CreateOr( + Lo, Builder.CreateShl(Hi, ConstantInt::get(Int128Ty, 64)), "val64"); + return Builder.CreateBitCast(Or, ValueTy); } Type *Tys[] = { Addr->getType() }; @@ -27160,8 +27162,8 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, const DataLayout &DL = M->getDataLayout(); IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy)); CallInst *CI = Builder.CreateCall(Ldxr, Addr); - CI->addParamAttr( - 0, Attribute::get(Builder.getContext(), Attribute::ElementType, ValueTy)); + CI->addParamAttr(0, Attribute::get(Builder.getContext(), + Attribute::ElementType, IntEltTy)); Value *Trunc = Builder.CreateTrunc(CI, IntEltTy); return Builder.CreateBitCast(Trunc, ValueTy); @@ -27187,9 +27189,13 @@ Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp; Function *Stxr = Intrinsic::getDeclaration(M, Int); Type *Int64Ty = Type::getInt64Ty(M->getContext()); + Type *Int128Ty = Type::getInt128Ty(M->getContext()); + + Value *CastVal = Builder.CreateBitCast(Val, Int128Ty); - Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo"); - Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi"); + Value *Lo = Builder.CreateTrunc(CastVal, Int64Ty, "lo"); + Value *Hi = + Builder.CreateTrunc(Builder.CreateLShr(CastVal, 64), Int64Ty, "hi"); return Builder.CreateCall(Stxr, {Lo, Hi, Addr}); } diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index f95caf325b197c..2c6461097f7d9b 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -6,33 +6,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -63,33 +47,17 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fadd s2, s2, s1 @@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fadd s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -324,31 +238,15 @@ define float @test_atomicrmw_fadd_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -375,54 +273,26 @@ define double @test_atomicrmw_fadd_f32_seq_cst_align8(ptr %ptr, double %value) # define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16: ; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: sub sp, sp, #80 +; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x19] +; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: stp x8, x9, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #48] +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill ; NOLSE-NEXT: bl __addtf3 -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: str q0, [sp, #32] +; NOLSE-NEXT: ldp x9, x8, [sp, #32] +; NOLSE-NEXT: stlxp w10, x9, x8, [x19] +; NOLSE-NEXT: cbnz w10, .LBB6_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #80 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_f32_seq_cst_align16: @@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fadd_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fadd v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_v2f16_seq_cst_align4: @@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fadd v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fadd v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_v2bf16_seq_cst_align4: @@ -587,31 +425,15 @@ define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -638,43 +460,17 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fadd_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fadd v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index fe6ec534860e30..84bab18ca59d02 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -6,33 +6,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -63,33 +47,17 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fmaxnm s2, s2, s1 @@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fmaxnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -324,31 +238,15 @@ define float @test_atomicrmw_fmax_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -375,54 +273,26 @@ define double @test_atomicrmw_fmax_f32_seq_cst_align8(ptr %ptr, double %value) # define fp128 @test_atomicrmw_fmax_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align16: ; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: sub sp, sp, #80 +; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x19] +; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: stp x8, x9, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #48] +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill ; NOLSE-NEXT: bl fmaxl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: str q0, [sp, #32] +; NOLSE-NEXT: ldp x9, x8, [sp, #32] +; NOLSE-NEXT: stlxp w10, x9, x8, [x19] +; NOLSE-NEXT: cbnz w10, .LBB6_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #80 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_f32_seq_cst_align16: @@ -465,41 +335,25 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt s3, h3 -; NOLSE-NEXT: fmaxnm s4, s4, s2 +; NOLSE-NEXT: fmaxnm s2, s2, s0 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2f16_seq_cst_align4: @@ -540,58 +394,42 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmaxnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmaxnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: @@ -647,31 +485,15 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fmaxnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -698,43 +520,17 @@ define <2 x float> @test_atomicrmw_fmax_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmax_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fmaxnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 45566bd60fec2d..c815713f7d7acc 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -6,33 +6,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -63,33 +47,17 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fminnm s2, s2, s1 @@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fminnm s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -324,31 +238,15 @@ define float @test_atomicrmw_fmin_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -375,54 +273,26 @@ define double @test_atomicrmw_fmin_f32_seq_cst_align8(ptr %ptr, double %value) # define fp128 @test_atomicrmw_fmin_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align16: ; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: sub sp, sp, #80 +; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x19] +; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: stp x8, x9, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #48] +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill ; NOLSE-NEXT: bl fminl -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: str q0, [sp, #32] +; NOLSE-NEXT: ldp x9, x8, [sp, #32] +; NOLSE-NEXT: stlxp w10, x9, x8, [x19] +; NOLSE-NEXT: cbnz w10, .LBB6_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #80 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_f32_seq_cst_align16: @@ -465,41 +335,25 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 ; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: ldr s0, [x0] +; NOLSE-NEXT: fcvt s0, h0 ; NOLSE-NEXT: fcvt s1, h1 -; NOLSE-NEXT: b .LBB7_2 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fcvt s4, h0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s2, w8 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fcvt s2, h2 ; NOLSE-NEXT: fcvt s3, h3 -; NOLSE-NEXT: fminnm s4, s4, s2 +; NOLSE-NEXT: fminnm s2, s2, s0 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fcvt h4, s4 +; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fcvt h3, s3 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: mov v2.h[1], v3.h[0] +; NOLSE-NEXT: fmov w9, s2 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2f16_seq_cst_align4: @@ -540,58 +394,42 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: fmov w10, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr s0, [x0] ; NOLSE-NEXT: lsl w10, w10, #16 ; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fmov s1, w10 ; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: mov h3, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w9, [x0] +; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: mov h3, v2.h[1] +; NOLSE-NEXT: fmov w11, s2 +; NOLSE-NEXT: lsl w11, w11, #16 +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s3 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fminnm s4, s4, s2 -; NOLSE-NEXT: fmov s3, w9 ; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fmov w10, s4 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: fminnm s2, s2, s0 +; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: ubfx w13, w11, #16, #1 +; NOLSE-NEXT: add w11, w11, w8 +; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: add w11, w13, w11 +; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 ; NOLSE-NEXT: add w10, w10, w8 -; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: fmov s3, w11 ; NOLSE-NEXT: add w10, w12, w10 ; NOLSE-NEXT: lsr w10, w10, #16 -; NOLSE-NEXT: ubfx w11, w9, #16, #1 -; NOLSE-NEXT: add w9, w9, w8 -; NOLSE-NEXT: fmov s4, w10 -; NOLSE-NEXT: add w9, w11, w9 -; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s3, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: mov v4.h[1], v3.h[0] -; NOLSE-NEXT: fmov w10, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w11, [x0] -; NOLSE-NEXT: cmp w11, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: mov v3.h[1], v2.h[0] +; NOLSE-NEXT: fmov w10, s3 +; NOLSE-NEXT: stlxr w11, w10, [x0] +; NOLSE-NEXT: cbnz w11, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x9 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: @@ -647,31 +485,15 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fminnm v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -698,43 +520,17 @@ define <2 x float> @test_atomicrmw_fmin_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fmin_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fminnm v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 692d1ba9e091fc..a416c468a67362 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -6,33 +6,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align2: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB0_2 ; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB0_5 -; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB0_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB0_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB0_3 -; NOLSE-NEXT: b .LBB0_1 -; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB0_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -63,33 +47,17 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: fcvt s1, h0 -; NOLSE-NEXT: ldr h0, [x0] -; NOLSE-NEXT: b .LBB1_2 ; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.eq .LBB1_5 -; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB1_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w8, [x0] +; NOLSE-NEXT: fmov s0, w8 ; NOLSE-NEXT: fcvt s2, h0 -; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fcvt h2, s2 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w10, [x0] -; NOLSE-NEXT: cmp w10, w9, uxth -; NOLSE-NEXT: b.ne .LBB1_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB1_3 -; NOLSE-NEXT: b .LBB1_1 -; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB1_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -122,19 +90,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB2_2 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB2_5 -; NOLSE-NEXT: .LBB2_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB2_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -143,21 +104,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB2_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB2_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB2_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB2_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB2_3 -; NOLSE-NEXT: b .LBB2_1 -; NOLSE-NEXT: .LBB2_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB2_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -199,19 +148,12 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 ; NOLSE-NEXT: fmov w9, s0 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: ldr h0, [x0] ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s1, w9 -; NOLSE-NEXT: b .LBB3_2 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_2 Depth=1 -; NOLSE-NEXT: fmov s0, w11 -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.eq .LBB3_5 -; NOLSE-NEXT: .LBB3_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB3_3 Depth 2 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxrh w9, [x0] +; NOLSE-NEXT: fmov s0, w9 ; NOLSE-NEXT: lsl w9, w9, #16 ; NOLSE-NEXT: fmov s2, w9 ; NOLSE-NEXT: fsub s2, s2, s1 @@ -220,21 +162,9 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: add w9, w9, w8 ; NOLSE-NEXT: add w9, w10, w9 ; NOLSE-NEXT: lsr w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fmov w10, s2 -; NOLSE-NEXT: .LBB3_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB3_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxrh w11, [x0] -; NOLSE-NEXT: cmp w11, w9, uxth -; NOLSE-NEXT: b.ne .LBB3_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB3_3 Depth=2 -; NOLSE-NEXT: stlxrh wzr, w10, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB3_3 -; NOLSE-NEXT: b .LBB3_1 -; NOLSE-NEXT: .LBB3_5: // %atomicrmw.end +; NOLSE-NEXT: stlxrh w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB3_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 ; NOLSE-NEXT: ret ; @@ -273,31 +203,15 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr s1, [x0] -; NOLSE-NEXT: b .LBB4_2 ; NOLSE-NEXT: .LBB4_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_2 Depth=1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB4_5 -; NOLSE-NEXT: .LBB4_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB4_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 ; NOLSE-NEXT: fsub s2, s1, s0 -; NOLSE-NEXT: fmov w9, s1 ; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB4_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB4_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB4_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB4_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB4_3 -; NOLSE-NEXT: b .LBB4_1 -; NOLSE-NEXT: .LBB4_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, w8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB4_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov s0, s1 ; NOLSE-NEXT: ret ; @@ -324,31 +238,15 @@ define float @test_atomicrmw_fsub_f32_seq_cst_align4(ptr %ptr, float %value) #0 define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB5_2 ; NOLSE-NEXT: .LBB5_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB5_5 -; NOLSE-NEXT: .LBB5_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB5_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub d2, d1, d0 -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB5_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB5_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB5_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB5_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB5_3 -; NOLSE-NEXT: b .LBB5_1 -; NOLSE-NEXT: .LBB5_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB5_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -375,54 +273,26 @@ define double @test_atomicrmw_fsub_f32_seq_cst_align8(ptr %ptr, double %value) # define fp128 @test_atomicrmw_fsub_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align16: ; NOLSE: // %bb.0: -; NOLSE-NEXT: sub sp, sp, #96 -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: stp x30, x19, [sp, #80] // 16-byte Folded Spill +; NOLSE-NEXT: sub sp, sp, #80 +; NOLSE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill ; NOLSE-NEXT: mov x19, x0 -; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill -; NOLSE-NEXT: b .LBB6_2 +; NOLSE-NEXT: str q0, [sp, #16] // 16-byte Folded Spill ; NOLSE-NEXT: .LBB6_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_2 Depth=1 -; NOLSE-NEXT: stp x12, x13, [sp, #32] -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: ldr q1, [sp, #32] -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: b.eq .LBB6_6 -; NOLSE-NEXT: .LBB6_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB6_3 Depth 2 -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: str q1, [sp, #16] // 16-byte Folded Spill -; NOLSE-NEXT: ldr q1, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x19] +; NOLSE-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; NOLSE-NEXT: stp x8, x9, [sp, #48] +; NOLSE-NEXT: ldr q0, [sp, #48] +; NOLSE-NEXT: str q0, [sp] // 16-byte Folded Spill ; NOLSE-NEXT: bl __subtf3 -; NOLSE-NEXT: str q0, [sp, #48] -; NOLSE-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload -; NOLSE-NEXT: ldp x9, x8, [sp, #48] -; NOLSE-NEXT: str q0, [sp, #64] -; NOLSE-NEXT: ldp x11, x10, [sp, #64] -; NOLSE-NEXT: .LBB6_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB6_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x19] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x10 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB6_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB6_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x9, x8, [x19] -; NOLSE-NEXT: cbnz w14, .LBB6_3 -; NOLSE-NEXT: b .LBB6_1 -; NOLSE-NEXT: .LBB6_6: // %atomicrmw.end -; NOLSE-NEXT: ldp x30, x19, [sp, #80] // 16-byte Folded Reload -; NOLSE-NEXT: mov v0.16b, v1.16b -; NOLSE-NEXT: add sp, sp, #96 +; NOLSE-NEXT: str q0, [sp, #32] +; NOLSE-NEXT: ldp x9, x8, [sp, #32] +; NOLSE-NEXT: stlxp w10, x9, x8, [x19] +; NOLSE-NEXT: cbnz w10, .LBB6_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; NOLSE-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; NOLSE-NEXT: add sp, sp, #80 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_f32_seq_cst_align16: @@ -463,35 +333,19 @@ define fp128 @test_atomicrmw_fsub_f32_seq_cst_align16(ptr %ptr, fp128 %value) #0 define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: fcvtl v1.4s, v0.4h -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB7_2 +; NOLSE-NEXT: fcvtl v0.4s, v0.4h ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB7_5 -; NOLSE-NEXT: .LBB7_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB7_3 Depth 2 -; NOLSE-NEXT: fcvtl v2.4s, v0.4h -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v2.4s, v2.4s, v1.4s -; NOLSE-NEXT: fcvtn v2.4h, v2.4s -; NOLSE-NEXT: fmov w8, s2 -; NOLSE-NEXT: .LBB7_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB7_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB7_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB7_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB7_3 -; NOLSE-NEXT: b .LBB7_1 -; NOLSE-NEXT: .LBB7_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s1, w8 +; NOLSE-NEXT: fcvtl v1.4s, v1.4h +; NOLSE-NEXT: fsub v1.4s, v1.4s, v0.4s +; NOLSE-NEXT: fcvtn v1.4h, v1.4s +; NOLSE-NEXT: fmov w9, s1 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB7_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2f16_seq_cst_align4: @@ -522,38 +376,22 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE: // %bb.0: ; NOLSE-NEXT: movi v1.4s, #1 ; NOLSE-NEXT: movi v2.4s, #127, msl #8 -; NOLSE-NEXT: shll v3.4s, v0.4h, #16 -; NOLSE-NEXT: ldr s0, [x0] -; NOLSE-NEXT: b .LBB8_2 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: .LBB8_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_2 Depth=1 -; NOLSE-NEXT: fmov s0, w10 -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.eq .LBB8_5 -; NOLSE-NEXT: .LBB8_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB8_3 Depth 2 -; NOLSE-NEXT: shll v4.4s, v0.4h, #16 -; NOLSE-NEXT: fmov w9, s0 -; NOLSE-NEXT: fsub v4.4s, v4.4s, v3.4s -; NOLSE-NEXT: ushr v5.4s, v4.4s, #16 -; NOLSE-NEXT: and v5.16b, v5.16b, v1.16b -; NOLSE-NEXT: add v4.4s, v5.4s, v4.4s -; NOLSE-NEXT: addhn v4.4h, v4.4s, v2.4s -; NOLSE-NEXT: fmov w8, s4 -; NOLSE-NEXT: .LBB8_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB8_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr w10, [x0] -; NOLSE-NEXT: cmp w10, w9 -; NOLSE-NEXT: b.ne .LBB8_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB8_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, w8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB8_3 -; NOLSE-NEXT: b .LBB8_1 -; NOLSE-NEXT: .LBB8_5: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr w8, [x0] +; NOLSE-NEXT: fmov s3, w8 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fsub v3.4s, v3.4s, v0.4s +; NOLSE-NEXT: ushr v4.4s, v3.4s, #16 +; NOLSE-NEXT: and v4.16b, v4.16b, v1.16b +; NOLSE-NEXT: add v3.4s, v4.4s, v3.4s +; NOLSE-NEXT: addhn v3.4h, v3.4s, v2.4s +; NOLSE-NEXT: fmov w9, s3 +; NOLSE-NEXT: stlxr w10, w9, [x0] +; NOLSE-NEXT: cbnz w10, .LBB8_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end +; NOLSE-NEXT: fmov d0, x8 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_v2bf16_seq_cst_align4: @@ -587,31 +425,15 @@ define <2 x bfloat> @test_atomicrmw_fsub_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x float> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f32_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr d1, [x0] -; NOLSE-NEXT: b .LBB9_2 ; NOLSE-NEXT: .LBB9_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_2 Depth=1 -; NOLSE-NEXT: fmov d1, x10 -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.eq .LBB9_5 -; NOLSE-NEXT: .LBB9_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB9_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxr x8, [x0] +; NOLSE-NEXT: fmov d1, x8 ; NOLSE-NEXT: fsub v2.2s, v1.2s, v0.2s -; NOLSE-NEXT: fmov x9, d1 ; NOLSE-NEXT: fmov x8, d2 -; NOLSE-NEXT: .LBB9_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB9_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxr x10, [x0] -; NOLSE-NEXT: cmp x10, x9 -; NOLSE-NEXT: b.ne .LBB9_1 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB9_3 Depth=2 -; NOLSE-NEXT: stlxr wzr, x8, [x0] -; NOLSE-NEXT: cbnz wzr, .LBB9_3 -; NOLSE-NEXT: b .LBB9_1 -; NOLSE-NEXT: .LBB9_5: // %atomicrmw.end +; NOLSE-NEXT: stlxr w9, x8, [x0] +; NOLSE-NEXT: cbnz w9, .LBB9_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: fmov d0, d1 ; NOLSE-NEXT: ret ; @@ -638,43 +460,17 @@ define <2 x float> @test_atomicrmw_fsub_v2f32_seq_cst_align8(ptr %ptr, <2 x floa define <2 x double> @test_atomicrmw_fsub_v2f64_seq_cst_align8(ptr %ptr, <2 x double> %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_v2f64_seq_cst_align8: ; NOLSE: // %bb.0: -; NOLSE-NEXT: ldr q1, [x0] -; NOLSE-NEXT: b .LBB10_2 ; NOLSE-NEXT: .LBB10_1: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_2 Depth=1 -; NOLSE-NEXT: fmov d1, x12 -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: ccmp x12, x11, #0, eq -; NOLSE-NEXT: mov v1.d[1], x13 -; NOLSE-NEXT: b.eq .LBB10_6 -; NOLSE-NEXT: .LBB10_2: // %atomicrmw.start -; NOLSE-NEXT: // =>This Loop Header: Depth=1 -; NOLSE-NEXT: // Child Loop BB10_3 Depth 2 +; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 +; NOLSE-NEXT: ldaxp x8, x9, [x0] +; NOLSE-NEXT: fmov d1, x8 +; NOLSE-NEXT: mov v1.d[1], x9 ; NOLSE-NEXT: fsub v2.2d, v1.2d, v0.2d -; NOLSE-NEXT: mov x9, v1.d[1] -; NOLSE-NEXT: fmov x11, d1 ; NOLSE-NEXT: mov x8, v2.d[1] -; NOLSE-NEXT: fmov x10, d2 -; NOLSE-NEXT: .LBB10_3: // %atomicrmw.start -; NOLSE-NEXT: // Parent Loop BB10_2 Depth=1 -; NOLSE-NEXT: // => This Inner Loop Header: Depth=2 -; NOLSE-NEXT: ldaxp x12, x13, [x0] -; NOLSE-NEXT: cmp x12, x11 -; NOLSE-NEXT: cset w14, ne -; NOLSE-NEXT: cmp x13, x9 -; NOLSE-NEXT: cinc w14, w14, ne -; NOLSE-NEXT: cbz w14, .LBB10_5 -; NOLSE-NEXT: // %bb.4: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x12, x13, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_5: // %atomicrmw.start -; NOLSE-NEXT: // in Loop: Header=BB10_3 Depth=2 -; NOLSE-NEXT: stlxp w14, x10, x8, [x0] -; NOLSE-NEXT: cbnz w14, .LBB10_3 -; NOLSE-NEXT: b .LBB10_1 -; NOLSE-NEXT: .LBB10_6: // %atomicrmw.end +; NOLSE-NEXT: fmov x9, d2 +; NOLSE-NEXT: stlxp w10, x9, x8, [x0] +; NOLSE-NEXT: cbnz w10, .LBB10_1 +; NOLSE-NEXT: // %bb.2: // %atomicrmw.end ; NOLSE-NEXT: mov v0.16b, v1.16b ; NOLSE-NEXT: ret ;