From 6c84709eff20460a75fb58d2face54432c133967 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 14 Feb 2024 13:55:51 +0000 Subject: [PATCH] [AArch64] Materialize constants via fneg. (#80641) This is something that is already done as a special case for copysign; this patch extends it to apply more generally. If we are trying to materialize a negative constant (notably -0.0, 0x80000000), then there may be no movi encoding that creates the immediate, but a fneg(movi) might. Some of the existing patterns for RADDHN needed to be adjusted to keep them in line with the new immediates. --- .../Target/AArch64/AArch64ISelLowering.cpp | 101 ++++++++++++------ llvm/lib/Target/AArch64/AArch64InstrInfo.td | 16 +-- .../CodeGen/AArch64/arm64-build-vector.ll | 4 +- llvm/test/CodeGen/AArch64/fabs-combine.ll | 4 +- llvm/test/CodeGen/AArch64/fcvt_combine.ll | 8 +- llvm/test/CodeGen/AArch64/neon-mov.ll | 50 ++++++--- .../AArch64/srem-seteq-vec-nonsplat.ll | 60 +++++------ .../CodeGen/AArch64/urem-seteq-vec-nonzero.ll | 5 +- llvm/test/CodeGen/AArch64/urem-vector-lkk.ll | 14 +-- 9 files changed, 156 insertions(+), 106 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a3b7e3128ac1a..8c5a4cdae1163 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -12935,42 +12935,74 @@ static SDValue NormalizeBuildVector(SDValue Op, return DAG.getBuildVector(VT, dl, Ops); } -static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) { +static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG, + const AArch64Subtarget *ST) { EVT VT = Op.getValueType(); + assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) && + "Expected a legal NEON vector"); APInt DefBits(VT.getSizeInBits(), 0); APInt UndefBits(VT.getSizeInBits(), 0); BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode()); if (resolveBuildVector(BVN, DefBits, UndefBits)) { - SDValue NewOp; - if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) - return NewOp; - - DefBits = ~DefBits; - if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) - return NewOp; - - DefBits = UndefBits; - if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) - return NewOp; + auto TryMOVIWithBits = [&](APInt DefBits) { + SDValue NewOp; + if ((NewOp = + tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) || + (NewOp = + tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || + (NewOp = + tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) || + (NewOp =
tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) || + (NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) || + (NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits))) + return NewOp; + + APInt NotDefBits = ~DefBits; + if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, + NotDefBits)) || + (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, + NotDefBits)) || + (NewOp = + tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits))) + return NewOp; + return SDValue(); + }; + if (SDValue R = TryMOVIWithBits(DefBits)) + return R; + if (SDValue R = TryMOVIWithBits(UndefBits)) + return R; - DefBits = ~UndefBits; - if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) || - (NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits))) - return NewOp; + // See if a fneg of the constant can be materialized with a MOVI, etc + auto TryWithFNeg = [&](APInt DefBits, MVT FVT) { + // FNegate each sub-element of the constant + assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0); + APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1) + .zext(VT.getSizeInBits()); + APInt NegBits(VT.getSizeInBits(), 0); + unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits(); + for (unsigned i = 0; i < NumElts; i++) + NegBits |= Neg << (FVT.getScalarSizeInBits() * i); + NegBits = DefBits ^ NegBits; + + // Try to create the new constants with MOVI, and if so generate a fneg + // for it. + if (SDValue NewOp = TryMOVIWithBits(NegBits)) { + SDLoc DL(Op); + MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts); + return DAG.getNode( + AArch64ISD::NVCAST, DL, VT, + DAG.getNode(ISD::FNEG, DL, VFVT, + DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp))); + } + return SDValue(); + }; + SDValue R; + if ((R = TryWithFNeg(DefBits, MVT::f32)) || + (R = TryWithFNeg(DefBits, MVT::f64)) || + (ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16)))) + return R; } return SDValue(); @@ -13019,7 +13051,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, return Op; } - if (SDValue V = ConstantBuildVector(Op, DAG)) + if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget)) return V; // Scan through the operands to find some interesting properties we can @@ -13244,7 +13276,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize); if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) && !ConstantValueAPInt.isAllOnes()) { - Val = ConstantBuildVector(Val, DAG); + Val = ConstantBuildVector(Val, DAG, Subtarget); if (!Val) // Otherwise, materialize the constant and splat it. Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue); @@ -23145,9 +23177,12 @@ static SDValue performDUPCombine(SDNode *N, } /// Get rid of unnecessary NVCASTs (that don't change the type). 
-static SDValue performNVCASTCombine(SDNode *N) { +static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) { if (N->getValueType(0) == N->getOperand(0).getValueType()) return N->getOperand(0); + if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST) + return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0), + N->getOperand(0).getOperand(0)); return SDValue(); } @@ -24141,7 +24176,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case AArch64ISD::DUPLANE128: return performDupLane128Combine(N, DAG); case AArch64ISD::NVCAST: - return performNVCASTCombine(N); + return performNVCASTCombine(N, DAG); case AArch64ISD::SPLICE: return performSpliceCombine(N, DAG); case AArch64ISD::UUNPKLO: diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9c3a6927d043b..8c2a852850320 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -7595,13 +7595,17 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>; defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra", TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >; +def VImm0080: PatLeaf<(AArch64movi_shift (i32 128), (i32 0))>; +def VImm00008000: PatLeaf<(AArch64movi_shift (i32 128), (i32 8))>; +def VImm0000000080000000: PatLeaf<(AArch64NvCast (v2f64 (fneg (AArch64NvCast (v4i32 (AArch64movi_shift (i32 128), (i32 24)))))))>; + // RADDHN patterns for when RSHRN shifts by half the size of the vector element -def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))), +def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))), (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; -def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))), +def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))), (RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>; let AddedComplexity = 5 in -def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))), +def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))), (RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>; def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))), (RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; @@ -7613,20 +7617,20 @@ def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))), // RADDHN2 patterns for when RSHRN shifts by half the size of the vector element def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Vd), - (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))))), + (v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))))), (RADDHNv8i16_v16i8 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>; def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Vd), - (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))))), + (v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))))), (RADDHNv4i32_v8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>; let AddedComplexity = 5 in def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Vd), - (v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))))), + (v2i32 (trunc 
(AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))))), (RADDHNv2i64_v4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>; diff --git a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll index e4fbf0765dcd2..82802c79c7085 100644 --- a/llvm/test/CodeGen/AArch64/arm64-build-vector.ll +++ b/llvm/test/CodeGen/AArch64/arm64-build-vector.ll @@ -120,8 +120,8 @@ define <2 x double> @poszero_v2f64(<2 x double> %a) { define <2 x double> @negzero_v2f64(<2 x double> %a) { ; CHECK-LABEL: negzero_v2f64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: fneg v1.2d, v1.2d ; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d ; CHECK-NEXT: ret %b = fmul <2 x double> %a, diff --git a/llvm/test/CodeGen/AArch64/fabs-combine.ll b/llvm/test/CodeGen/AArch64/fabs-combine.ll index 23bf7a699195f..d083f2006575b 100644 --- a/llvm/test/CodeGen/AArch64/fabs-combine.ll +++ b/llvm/test/CodeGen/AArch64/fabs-combine.ll @@ -71,8 +71,8 @@ define <4 x float> @nabsv4f32(<4 x float> %a) { define <2 x double> @nabsv2d64(<2 x double> %a) { ; CHECK-LABEL: nabsv2d64: ; CHECK: // %bb.0: -; CHECK-NEXT: mov x8, #-9223372036854775808 -; CHECK-NEXT: dup v1.2d, x8 +; CHECK-NEXT: movi v1.2d, #0000000000000000 +; CHECK-NEXT: fneg v1.2d, v1.2d ; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-NEXT: ret %conv = bitcast <2 x double> %a to <2 x i64> diff --git a/llvm/test/CodeGen/AArch64/fcvt_combine.ll b/llvm/test/CodeGen/AArch64/fcvt_combine.ll index b5b9055fbc02f..37133cf0aa1df 100644 --- a/llvm/test/CodeGen/AArch64/fcvt_combine.ll +++ b/llvm/test/CodeGen/AArch64/fcvt_combine.ll @@ -110,8 +110,8 @@ define <2 x i32> @test9(<2 x float> %f) { define <2 x i32> @test10(<2 x float> %f) { ; CHECK-LABEL: test10: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000 -; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: mvni v0.2s, #63, msl #16 +; CHECK-NEXT: fneg v0.2s, v0.2s ; CHECK-NEXT: fcvtzu v0.2s, v0.2s ; CHECK-NEXT: ret %mul.i = fmul <2 x float> %f, @@ -426,8 +426,8 @@ define <2 x i32> @test9_sat(<2 x float> %f) { define <2 x i32> @test10_sat(<2 x float> %f) { ; CHECK-LABEL: test10_sat: ; CHECK: // %bb.0: -; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000 -; CHECK-NEXT: dup v0.2s, w8 +; CHECK-NEXT: mvni v0.2s, #63, msl #16 +; CHECK-NEXT: fneg v0.2s, v0.2s ; CHECK-NEXT: fcvtzu v0.2s, v0.2s ; CHECK-NEXT: ret %mul.i = fmul <2 x float> %f, diff --git a/llvm/test/CodeGen/AArch64/neon-mov.ll b/llvm/test/CodeGen/AArch64/neon-mov.ll index 219c8b53243e6..7effdc97993c1 100644 --- a/llvm/test/CodeGen/AArch64/neon-mov.ll +++ b/llvm/test/CodeGen/AArch64/neon-mov.ll @@ -111,16 +111,14 @@ define <4 x i32> @movi4s_lsl16() { define <4 x i32> @movi4s_fneg() { ; CHECK-NOFP16-SD-LABEL: movi4s_fneg: ; CHECK-NOFP16-SD: // %bb.0: -; CHECK-NOFP16-SD-NEXT: mov w8, #61440 // =0xf000 -; CHECK-NOFP16-SD-NEXT: movk w8, #32768, lsl #16 -; CHECK-NOFP16-SD-NEXT: dup v0.4s, w8 +; CHECK-NOFP16-SD-NEXT: movi v0.4s, #240, lsl #8 +; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: movi4s_fneg: ; CHECK-FP16-SD: // %bb.0: -; CHECK-FP16-SD-NEXT: mov w8, #61440 // =0xf000 -; CHECK-FP16-SD-NEXT: movk w8, #32768, lsl #16 -; CHECK-FP16-SD-NEXT: dup v0.4s, w8 +; CHECK-FP16-SD-NEXT: movi v0.4s, #240, lsl #8 +; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: movi4s_fneg: @@ -178,11 
+176,29 @@ define <8 x i16> @movi8h_lsl8() { } define <8 x i16> @movi8h_fneg() { -; CHECK-LABEL: movi8h_fneg: -; CHECK: // %bb.0: -; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: ret +; CHECK-NOFP16-SD-LABEL: movi8h_fneg: +; CHECK-NOFP16-SD: // %bb.0: +; CHECK-NOFP16-SD-NEXT: movi v0.8h, #127, lsl #8 +; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s +; CHECK-NOFP16-SD-NEXT: ret +; +; CHECK-FP16-SD-LABEL: movi8h_fneg: +; CHECK-FP16-SD: // %bb.0: +; CHECK-FP16-SD-NEXT: movi v0.8h, #127, lsl #8 +; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s +; CHECK-FP16-SD-NEXT: ret +; +; CHECK-NOFP16-GI-LABEL: movi8h_fneg: +; CHECK-NOFP16-GI: // %bb.0: +; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI19_0 +; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-NOFP16-GI-NEXT: ret +; +; CHECK-FP16-GI-LABEL: movi8h_fneg: +; CHECK-FP16-GI: // %bb.0: +; CHECK-FP16-GI-NEXT: adrp x8, .LCPI19_0 +; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; CHECK-FP16-GI-NEXT: ret ret <8 x i16> } @@ -294,8 +310,8 @@ define <8 x i16> @mvni8h_neg() { ; ; CHECK-FP16-SD-LABEL: mvni8h_neg: ; CHECK-FP16-SD: // %bb.0: -; CHECK-FP16-SD-NEXT: mov w8, #33008 // =0x80f0 -; CHECK-FP16-SD-NEXT: dup v0.8h, w8 +; CHECK-FP16-SD-NEXT: movi v0.8h, #240 +; CHECK-FP16-SD-NEXT: fneg v0.8h, v0.8h ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: mvni8h_neg: @@ -480,14 +496,14 @@ define <2 x double> @fmov2d() { define <2 x double> @fmov2d_neg0() { ; CHECK-NOFP16-SD-LABEL: fmov2d_neg0: ; CHECK-NOFP16-SD: // %bb.0: -; CHECK-NOFP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-NOFP16-SD-NEXT: dup v0.2d, x8 +; CHECK-NOFP16-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-NOFP16-SD-NEXT: fneg v0.2d, v0.2d ; CHECK-NOFP16-SD-NEXT: ret ; ; CHECK-FP16-SD-LABEL: fmov2d_neg0: ; CHECK-FP16-SD: // %bb.0: -; CHECK-FP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000 -; CHECK-FP16-SD-NEXT: dup v0.2d, x8 +; CHECK-FP16-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-FP16-SD-NEXT: fneg v0.2d, v0.2d ; CHECK-FP16-SD-NEXT: ret ; ; CHECK-NOFP16-GI-LABEL: fmov2d_neg0: diff --git a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll index f8c6f4193959d..1ebfe308e9af9 100644 --- a/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/AArch64/srem-seteq-vec-nonsplat.ll @@ -35,18 +35,17 @@ define <4 x i32> @test_srem_odd_even(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_eq: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.16b, #153 ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #39321 // =0x9999 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI1_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI1_0] -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -56,18 +55,17 @@ define <4 x i32> @test_srem_odd_allones_eq(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_odd_allones_ne(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_ne: ; 
CHECK: // %bb.0: +; CHECK-NEXT: movi v1.16b, #153 ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #39321 // =0x9999 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI2_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI2_0] -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: cmhi v0.4s, v2.4s, v0.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp ne <4 x i32> %srem, @@ -269,18 +267,17 @@ define <4 x i32> @test_srem_odd_even_poweroftwo(<4 x i32> %X) nounwind { define <4 x i32> @test_srem_odd_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_one: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.16b, #153 ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #39321 // =0x9999 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI10_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI10_0] -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, @@ -522,18 +519,17 @@ define <4 x i32> @test_srem_odd_even_allones_and_poweroftwo(<4 x i32> %X) nounwi define <4 x i32> @test_srem_odd_allones_and_one(<4 x i32> %X) nounwind { ; CHECK-LABEL: test_srem_odd_allones_and_one: ; CHECK: // %bb.0: +; CHECK-NEXT: movi v1.16b, #153 ; CHECK-NEXT: mov w8, #52429 // =0xcccd -; CHECK-NEXT: mov w9, #39321 // =0x9999 ; CHECK-NEXT: movk w8, #52428, lsl #16 -; CHECK-NEXT: movk w9, #6553, lsl #16 -; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: dup v2.4s, w9 +; CHECK-NEXT: dup v2.4s, w8 ; CHECK-NEXT: adrp x8, .LCPI19_0 -; CHECK-NEXT: mla v2.4s, v0.4s, v1.4s +; CHECK-NEXT: fneg v1.4s, v1.4s +; CHECK-NEXT: mla v1.4s, v0.4s, v2.4s ; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: cmhs v0.4s, v0.4s, v2.4s -; CHECK-NEXT: and v0.16b, v0.16b, v1.16b +; CHECK-NEXT: movi v2.4s, #1 +; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b ; CHECK-NEXT: ret %srem = srem <4 x i32> %X, %cmp = icmp eq <4 x i32> %srem, diff --git a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll index 478a34cf2a2b9..b31ce94cdaaea 100644 --- a/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll +++ b/llvm/test/CodeGen/AArch64/urem-seteq-vec-nonzero.ll @@ -51,12 +51,11 @@ define <4 x i1> @t32_6_part0(<4 x i32> %X) nounwind { ; CHECK-NEXT: movk w8, #43690, lsl #16 ; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s ; CHECK-NEXT: dup v1.4s, w8 -; CHECK-NEXT: mov w8, #43690 // =0xaaaa -; CHECK-NEXT: movk w8, #10922, lsl #16 ; CHECK-NEXT: mul v0.4s, v0.4s, v1.4s ; CHECK-NEXT: shl v1.4s, v0.4s, #31 ; CHECK-NEXT: usra v1.4s, v0.4s, #1 -; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: movi v0.16b, #170 +; CHECK-NEXT: fneg v0.4s, v0.4s ; CHECK-NEXT: cmhs v0.4s, v0.4s, v1.4s ; 
CHECK-NEXT: xtn v0.4h, v0.4s ; CHECK-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll index dc021bc3bfcc7..468a33ce5bfcf 100644 --- a/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll +++ b/llvm/test/CodeGen/AArch64/urem-vector-lkk.ll @@ -10,18 +10,18 @@ define <4 x i16> @fold_urem_vec_1(<4 x i16> %x) { ; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_1] ; CHECK-NEXT: adrp x8, .LCPI0_2 ; CHECK-NEXT: ushl v1.4h, v0.4h, v1.4h -; CHECK-NEXT: ldr d3, [x8, :lo12:.LCPI0_2] -; CHECK-NEXT: adrp x8, .LCPI0_3 ; CHECK-NEXT: umull v1.4s, v1.4h, v2.4h +; CHECK-NEXT: movi d2, #0000000000000000 ; CHECK-NEXT: shrn v1.4h, v1.4s, #16 -; CHECK-NEXT: sub v2.4h, v0.4h, v1.4h -; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h +; CHECK-NEXT: fneg d2, d2 +; CHECK-NEXT: sub v3.4h, v0.4h, v1.4h +; CHECK-NEXT: umull v2.4s, v3.4h, v2.4h ; CHECK-NEXT: shrn v2.4h, v2.4s, #16 ; CHECK-NEXT: add v1.4h, v2.4h, v1.4h -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3] -; CHECK-NEXT: adrp x8, .LCPI0_4 +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_2] +; CHECK-NEXT: adrp x8, .LCPI0_3 ; CHECK-NEXT: ushl v1.4h, v1.4h, v2.4h -; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_4] +; CHECK-NEXT: ldr d2, [x8, :lo12:.LCPI0_3] ; CHECK-NEXT: mls v0.4h, v1.4h, v2.4h ; CHECK-NEXT: ret %1 = urem <4 x i16> %x,