Skip to content

Commit

Permalink
[AArch64] Materialize constants via fneg. (#80641)
Browse files Browse the repository at this point in the history
This is something that is already done as a special case for copysign,
this patch extends it to be more generally applied. If we are trying to
materialize a negative constant (notably -0.0, 0x80000000), then there
may be no movi encoding that creates the immediate, but a fneg(movi)
might.

Some of the existing patterns for RADDHN needed to be adjusted to keep
them in line with the new immediates.
  • Loading branch information
davemgreen authored Feb 14, 2024
1 parent f723260 commit 6c84709
Show file tree
Hide file tree
Showing 9 changed files with 156 additions and 106 deletions.
101 changes: 68 additions & 33 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12935,42 +12935,74 @@ static SDValue NormalizeBuildVector(SDValue Op,
return DAG.getBuildVector(VT, dl, Ops);
}

static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG) {
static SDValue ConstantBuildVector(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget *ST) {
EVT VT = Op.getValueType();
assert((VT.getSizeInBits() == 64 || VT.getSizeInBits() == 128) &&
"Expected a legal NEON vector");

APInt DefBits(VT.getSizeInBits(), 0);
APInt UndefBits(VT.getSizeInBits(), 0);
BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
if (resolveBuildVector(BVN, DefBits, UndefBits)) {
SDValue NewOp;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;

DefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;

DefBits = UndefBits;
if ((NewOp = tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;
auto TryMOVIWithBits = [&](APInt DefBits) {
SDValue NewOp;
if ((NewOp =
tryAdvSIMDModImm64(AArch64ISD::MOVIedit, Op, DAG, DefBits)) ||
(NewOp =
tryAdvSIMDModImm32(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp =
tryAdvSIMDModImm321s(AArch64ISD::MOVImsl, Op, DAG, DefBits)) ||
(NewOp =
tryAdvSIMDModImm16(AArch64ISD::MOVIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm8(AArch64ISD::MOVI, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImmFP(AArch64ISD::FMOV, Op, DAG, DefBits)))
return NewOp;

APInt NotDefBits = ~DefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG,
NotDefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG,
NotDefBits)) ||
(NewOp =
tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, NotDefBits)))
return NewOp;
return SDValue();
};
if (SDValue R = TryMOVIWithBits(DefBits))
return R;
if (SDValue R = TryMOVIWithBits(UndefBits))
return R;

DefBits = ~UndefBits;
if ((NewOp = tryAdvSIMDModImm32(AArch64ISD::MVNIshift, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm321s(AArch64ISD::MVNImsl, Op, DAG, DefBits)) ||
(NewOp = tryAdvSIMDModImm16(AArch64ISD::MVNIshift, Op, DAG, DefBits)))
return NewOp;
// See if a fneg of the constant can be materialized with a MOVI, etc
auto TryWithFNeg = [&](APInt DefBits, MVT FVT) {
// FNegate each sub-element of the constant
assert(VT.getSizeInBits() % FVT.getScalarSizeInBits() == 0);
APInt Neg = APInt::getHighBitsSet(FVT.getSizeInBits(), 1)
.zext(VT.getSizeInBits());
APInt NegBits(VT.getSizeInBits(), 0);
unsigned NumElts = VT.getSizeInBits() / FVT.getScalarSizeInBits();
for (unsigned i = 0; i < NumElts; i++)
NegBits |= Neg << (FVT.getScalarSizeInBits() * i);
NegBits = DefBits ^ NegBits;

// Try to create the new constants with MOVI, and if so generate a fneg
// for it.
if (SDValue NewOp = TryMOVIWithBits(NegBits)) {
SDLoc DL(Op);
MVT VFVT = NumElts == 1 ? FVT : MVT::getVectorVT(FVT, NumElts);
return DAG.getNode(
AArch64ISD::NVCAST, DL, VT,
DAG.getNode(ISD::FNEG, DL, VFVT,
DAG.getNode(AArch64ISD::NVCAST, DL, VFVT, NewOp)));
}
return SDValue();
};
SDValue R;
if ((R = TryWithFNeg(DefBits, MVT::f32)) ||
(R = TryWithFNeg(DefBits, MVT::f64)) ||
(ST->hasFullFP16() && (R = TryWithFNeg(DefBits, MVT::f16))))
return R;
}

return SDValue();
Expand Down Expand Up @@ -13019,7 +13051,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
return Op;
}

if (SDValue V = ConstantBuildVector(Op, DAG))
if (SDValue V = ConstantBuildVector(Op, DAG, Subtarget))
return V;

// Scan through the operands to find some interesting properties we can
Expand Down Expand Up @@ -13244,7 +13276,7 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
ConstantValueAPInt = C->getAPIntValue().zextOrTrunc(BitSize);
if (!isNullConstant(ConstantValue) && !isNullFPConstant(ConstantValue) &&
!ConstantValueAPInt.isAllOnes()) {
Val = ConstantBuildVector(Val, DAG);
Val = ConstantBuildVector(Val, DAG, Subtarget);
if (!Val)
// Otherwise, materialize the constant and splat it.
Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
Expand Down Expand Up @@ -23145,9 +23177,12 @@ static SDValue performDUPCombine(SDNode *N,
}

/// Get rid of unnecessary NVCASTs (that don't change the type).
static SDValue performNVCASTCombine(SDNode *N) {
static SDValue performNVCASTCombine(SDNode *N, SelectionDAG &DAG) {
if (N->getValueType(0) == N->getOperand(0).getValueType())
return N->getOperand(0);
if (N->getOperand(0).getOpcode() == AArch64ISD::NVCAST)
return DAG.getNode(AArch64ISD::NVCAST, SDLoc(N), N->getValueType(0),
N->getOperand(0).getOperand(0));

return SDValue();
}
Expand Down Expand Up @@ -24141,7 +24176,7 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case AArch64ISD::DUPLANE128:
return performDupLane128Combine(N, DAG);
case AArch64ISD::NVCAST:
return performNVCASTCombine(N);
return performNVCASTCombine(N, DAG);
case AArch64ISD::SPLICE:
return performSpliceCombine(N, DAG);
case AArch64ISD::UUNPKLO:
Expand Down
16 changes: 10 additions & 6 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -7595,13 +7595,17 @@ defm USHR : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
defm USRA : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
TriOpFrag<(add_and_or_is_add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;

def VImm0080: PatLeaf<(AArch64movi_shift (i32 128), (i32 0))>;
def VImm00008000: PatLeaf<(AArch64movi_shift (i32 128), (i32 8))>;
def VImm0000000080000000: PatLeaf<(AArch64NvCast (v2f64 (fneg (AArch64NvCast (v4i32 (AArch64movi_shift (i32 128), (i32 24)))))))>;

// RADDHN patterns for when RSHRN shifts by half the size of the vector element
def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))),
def : Pat<(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))),
(RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))),
def : Pat<(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))),
(RADDHNv4i32_v4i16 V128:$Vn, (v4i32 (MOVIv2d_ns (i32 0))))>;
let AddedComplexity = 5 in
def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))),
def : Pat<(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))),
(RADDHNv2i64_v2i32 V128:$Vn, (v2i64 (MOVIv2d_ns (i32 0))))>;
def : Pat<(v8i8 (int_aarch64_neon_rshrn (v8i16 V128:$Vn), (i32 8))),
(RADDHNv8i16_v8i8 V128:$Vn, (v8i16 (MOVIv2d_ns (i32 0))))>;
Expand All @@ -7613,20 +7617,20 @@ def : Pat<(v2i32 (int_aarch64_neon_rshrn (v2i64 V128:$Vn), (i32 32))),
// RADDHN2 patterns for when RSHRN shifts by half the size of the vector element
def : Pat<(v16i8 (concat_vectors
(v8i8 V64:$Vd),
(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), (AArch64movi_shift (i32 128), (i32 0))), (i32 8)))))),
(v8i8 (trunc (AArch64vlshr (add (v8i16 V128:$Vn), VImm0080), (i32 8)))))),
(RADDHNv8i16_v16i8
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v8i16 (MOVIv2d_ns (i32 0))))>;
def : Pat<(v8i16 (concat_vectors
(v4i16 V64:$Vd),
(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), (AArch64movi_shift (i32 128), (i32 8))), (i32 16)))))),
(v4i16 (trunc (AArch64vlshr (add (v4i32 V128:$Vn), VImm00008000), (i32 16)))))),
(RADDHNv4i32_v8i16
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v4i32 (MOVIv2d_ns (i32 0))))>;
let AddedComplexity = 5 in
def : Pat<(v4i32 (concat_vectors
(v2i32 V64:$Vd),
(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), (AArch64dup (i64 2147483648))), (i32 32)))))),
(v2i32 (trunc (AArch64vlshr (add (v2i64 V128:$Vn), VImm0000000080000000), (i32 32)))))),
(RADDHNv2i64_v4i32
(INSERT_SUBREG (IMPLICIT_DEF), V64:$Vd, dsub), V128:$Vn,
(v2i64 (MOVIv2d_ns (i32 0))))>;
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/arm64-build-vector.ll
Original file line number Diff line number Diff line change
Expand Up @@ -120,8 +120,8 @@ define <2 x double> @poszero_v2f64(<2 x double> %a) {
define <2 x double> @negzero_v2f64(<2 x double> %a) {
; CHECK-LABEL: negzero_v2f64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: fneg v1.2d, v1.2d
; CHECK-NEXT: fmul v0.2d, v0.2d, v1.2d
; CHECK-NEXT: ret
%b = fmul <2 x double> %a, <double -0.0, double -0.0>
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/fabs-combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,8 @@ define <4 x float> @nabsv4f32(<4 x float> %a) {
define <2 x double> @nabsv2d64(<2 x double> %a) {
; CHECK-LABEL: nabsv2d64:
; CHECK: // %bb.0:
; CHECK-NEXT: mov x8, #-9223372036854775808
; CHECK-NEXT: dup v1.2d, x8
; CHECK-NEXT: movi v1.2d, #0000000000000000
; CHECK-NEXT: fneg v1.2d, v1.2d
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
; CHECK-NEXT: ret
%conv = bitcast <2 x double> %a to <2 x i64>
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AArch64/fcvt_combine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ define <2 x i32> @test9(<2 x float> %f) {
define <2 x i32> @test10(<2 x float> %f) {
; CHECK-LABEL: test10:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000
; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: mvni v0.2s, #63, msl #16
; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: fcvtzu v0.2s, v0.2s
; CHECK-NEXT: ret
%mul.i = fmul <2 x float> %f, <float undef, float undef>
Expand Down Expand Up @@ -426,8 +426,8 @@ define <2 x i32> @test9_sat(<2 x float> %f) {
define <2 x i32> @test10_sat(<2 x float> %f) {
; CHECK-LABEL: test10_sat:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #2143289344 // =0x7fc00000
; CHECK-NEXT: dup v0.2s, w8
; CHECK-NEXT: mvni v0.2s, #63, msl #16
; CHECK-NEXT: fneg v0.2s, v0.2s
; CHECK-NEXT: fcvtzu v0.2s, v0.2s
; CHECK-NEXT: ret
%mul.i = fmul <2 x float> %f, <float undef, float undef>
Expand Down
50 changes: 33 additions & 17 deletions llvm/test/CodeGen/AArch64/neon-mov.ll
Original file line number Diff line number Diff line change
Expand Up @@ -111,16 +111,14 @@ define <4 x i32> @movi4s_lsl16() {
define <4 x i32> @movi4s_fneg() {
; CHECK-NOFP16-SD-LABEL: movi4s_fneg:
; CHECK-NOFP16-SD: // %bb.0:
; CHECK-NOFP16-SD-NEXT: mov w8, #61440 // =0xf000
; CHECK-NOFP16-SD-NEXT: movk w8, #32768, lsl #16
; CHECK-NOFP16-SD-NEXT: dup v0.4s, w8
; CHECK-NOFP16-SD-NEXT: movi v0.4s, #240, lsl #8
; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-NOFP16-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: movi4s_fneg:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: mov w8, #61440 // =0xf000
; CHECK-FP16-SD-NEXT: movk w8, #32768, lsl #16
; CHECK-FP16-SD-NEXT: dup v0.4s, w8
; CHECK-FP16-SD-NEXT: movi v0.4s, #240, lsl #8
; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: movi4s_fneg:
Expand Down Expand Up @@ -178,11 +176,29 @@ define <8 x i16> @movi8h_lsl8() {
}

define <8 x i16> @movi8h_fneg() {
; CHECK-LABEL: movi8h_fneg:
; CHECK: // %bb.0:
; CHECK-NEXT: adrp x8, .LCPI19_0
; CHECK-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
; CHECK-NEXT: ret
; CHECK-NOFP16-SD-LABEL: movi8h_fneg:
; CHECK-NOFP16-SD: // %bb.0:
; CHECK-NOFP16-SD-NEXT: movi v0.8h, #127, lsl #8
; CHECK-NOFP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-NOFP16-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: movi8h_fneg:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: movi v0.8h, #127, lsl #8
; CHECK-FP16-SD-NEXT: fneg v0.4s, v0.4s
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: movi8h_fneg:
; CHECK-NOFP16-GI: // %bb.0:
; CHECK-NOFP16-GI-NEXT: adrp x8, .LCPI19_0
; CHECK-NOFP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
; CHECK-NOFP16-GI-NEXT: ret
;
; CHECK-FP16-GI-LABEL: movi8h_fneg:
; CHECK-FP16-GI: // %bb.0:
; CHECK-FP16-GI-NEXT: adrp x8, .LCPI19_0
; CHECK-FP16-GI-NEXT: ldr q0, [x8, :lo12:.LCPI19_0]
; CHECK-FP16-GI-NEXT: ret
ret <8 x i16> <i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280, i16 32512, i16 65280>
}

Expand Down Expand Up @@ -294,8 +310,8 @@ define <8 x i16> @mvni8h_neg() {
;
; CHECK-FP16-SD-LABEL: mvni8h_neg:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: mov w8, #33008 // =0x80f0
; CHECK-FP16-SD-NEXT: dup v0.8h, w8
; CHECK-FP16-SD-NEXT: movi v0.8h, #240
; CHECK-FP16-SD-NEXT: fneg v0.8h, v0.8h
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: mvni8h_neg:
Expand Down Expand Up @@ -480,14 +496,14 @@ define <2 x double> @fmov2d() {
define <2 x double> @fmov2d_neg0() {
; CHECK-NOFP16-SD-LABEL: fmov2d_neg0:
; CHECK-NOFP16-SD: // %bb.0:
; CHECK-NOFP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
; CHECK-NOFP16-SD-NEXT: dup v0.2d, x8
; CHECK-NOFP16-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-NOFP16-SD-NEXT: fneg v0.2d, v0.2d
; CHECK-NOFP16-SD-NEXT: ret
;
; CHECK-FP16-SD-LABEL: fmov2d_neg0:
; CHECK-FP16-SD: // %bb.0:
; CHECK-FP16-SD-NEXT: mov x8, #-9223372036854775808 // =0x8000000000000000
; CHECK-FP16-SD-NEXT: dup v0.2d, x8
; CHECK-FP16-SD-NEXT: movi v0.2d, #0000000000000000
; CHECK-FP16-SD-NEXT: fneg v0.2d, v0.2d
; CHECK-FP16-SD-NEXT: ret
;
; CHECK-NOFP16-GI-LABEL: fmov2d_neg0:
Expand Down
Loading

0 comments on commit 6c84709

Please sign in to comment.