Skip to content

Commit

Permalink
[DAG] Fold fdiv X, c2 -> fmul X, 1/c2 without AllowReciprocal if exact (#93882)
Browse files Browse the repository at this point in the history

This moves the combine of an fdiv by a constant into an fmul out of the
'if (Options.UnsafeFPMath || Flags.hasAllowReciprocal())' block,
so that it also triggers whenever the reciprocal is exact. An extra check for
Recip.isDenormal() is added, as multiple places note that denormal
constants are unsafe or slow on certain platforms.
  • Loading branch information
davemgreen committed Jun 9, 2024
1 parent 53fecef commit a284bdb
Show file tree
Hide file tree
Showing 10 changed files with 200 additions and 239 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/ADT/APFloat.h
Original file line number Diff line number Diff line change
Expand Up @@ -964,6 +964,13 @@ class APFloat : public APFloatBase {
return Val;
}

/// Factory for Positive and Negative One.
///
/// The value is built as +1.0 from an unsigned integer and the sign is
/// flipped afterwards when requested. A negative integer must not be handed
/// to the constructor: its integer parameter is an unsigned word, so -1
/// would wrap around to a huge positive magnitude rather than yield -1.0.
///
/// \param Sem The floating-point semantics of the returned value.
/// \param Negative True iff the number should be negative.
static APFloat getOne(const fltSemantics &Sem, bool Negative = false) {
  APFloat Val(Sem, 1U);
  if (Negative)
    Val.changeSign();
  return Val;
}

/// Factory for Positive and Negative Infinity.
///
/// \param Negative True iff the number should be negative.
Expand Down
41 changes: 22 additions & 19 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17262,26 +17262,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
if (SDValue V = combineRepeatedFPDivisors(N))
return V;

if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip(N1APF.getSemantics(), 1); // 1.0
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT));
}
// fold (fdiv X, c2) -> (fmul X, 1/c2) if there is no loss in precision, or
// the loss is acceptable with AllowReciprocal.
if (auto *N1CFP = isConstOrConstSplatFP(N1, true)) {
// Compute the reciprocal 1.0 / c2.
const APFloat &N1APF = N1CFP->getValueAPF();
APFloat Recip = APFloat::getOne(N1APF.getSemantics());
APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
// Only do the transform if the reciprocal is a legal fp immediate that
// isn't too nasty (eg NaN, denormal, ...).
if (((st == APFloat::opOK && !Recip.isDenormal()) ||
(st == APFloat::opInexact &&
(Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
(!LegalOperations ||
// FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
// backend)... we should handle this gracefully after Legalize.
// TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
TLI.isOperationLegal(ISD::ConstantFP, VT) ||
TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
return DAG.getNode(ISD::FMUL, DL, VT, N0,
DAG.getConstantFP(Recip, DL, VT));
}

if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
// If this FDIV is part of a reciprocal square root, it may be folded
// into a target-specific square root estimate instruction.
if (N1.getOpcode() == ISD::FSQRT) {
Expand Down
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/AArch64/fcvt-fixed.ll
Original file line number Diff line number Diff line change
Expand Up @@ -412,10 +412,10 @@ define half @scvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -432,10 +432,10 @@ define half @scvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -452,10 +452,10 @@ define half @scvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -472,10 +472,10 @@ define half @scvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: scvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand Down Expand Up @@ -574,10 +574,10 @@ define half @ucvtf_f16_i32_7(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -594,10 +594,10 @@ define half @ucvtf_f16_i32_15(i32 %int) {
; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, w0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -614,10 +614,10 @@ define half @ucvtf_f16_i64_7(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #67, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #60, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand All @@ -634,10 +634,10 @@ define half @ucvtf_f16_i64_15(i64 %long) {
; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-NO16: // %bb.0:
; CHECK-NO16-NEXT: ucvtf s1, x0
; CHECK-NO16-NEXT: movi v0.2s, #71, lsl #24
; CHECK-NO16-NEXT: movi v0.2s, #56, lsl #24
; CHECK-NO16-NEXT: fcvt h1, s1
; CHECK-NO16-NEXT: fcvt s1, h1
; CHECK-NO16-NEXT: fdiv s0, s1, s0
; CHECK-NO16-NEXT: fmul s0, s1, s0
; CHECK-NO16-NEXT: fcvt h0, s0
; CHECK-NO16-NEXT: ret
;
Expand Down
23 changes: 11 additions & 12 deletions llvm/test/CodeGen/AArch64/fdiv-const.ll
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
define float @divf32_2(float %a) nounwind {
; CHECK-LABEL: divf32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov s1, #2.00000000
; CHECK-NEXT: fdiv s0, s0, s1
; CHECK-NEXT: fmov s1, #0.50000000
; CHECK-NEXT: fmul s0, s0, s1
; CHECK-NEXT: ret
%r = fdiv float %a, 2.0
ret float %r
Expand Down Expand Up @@ -46,8 +46,8 @@ define float @divf32_p75_arcp(float %a) nounwind {
define half @divf16_2(half %a) nounwind {
; CHECK-LABEL: divf16_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov h1, #2.00000000
; CHECK-NEXT: fdiv h0, h0, h1
; CHECK-NEXT: fmov h1, #0.50000000
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv half %a, 2.0
ret half %r
Expand All @@ -67,9 +67,9 @@ define half @divf16_32768(half %a) nounwind {
define half @divf16_32768_arcp(half %a) nounwind {
; CHECK-LABEL: divf16_32768_arcp:
; CHECK: // %bb.0:
; CHECK-NEXT: mov w8, #512 // =0x200
; CHECK-NEXT: mov w8, #30720 // =0x7800
; CHECK-NEXT: fmov h1, w8
; CHECK-NEXT: fmul h0, h0, h1
; CHECK-NEXT: fdiv h0, h0, h1
; CHECK-NEXT: ret
%r = fdiv arcp half %a, 32768.0
ret half %r
Expand All @@ -78,8 +78,8 @@ define half @divf16_32768_arcp(half %a) nounwind {
define double @divf64_2(double %a) nounwind {
; CHECK-LABEL: divf64_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov d1, #2.00000000
; CHECK-NEXT: fdiv d0, d0, d1
; CHECK-NEXT: fmov d1, #0.50000000
; CHECK-NEXT: fmul d0, d0, d1
; CHECK-NEXT: ret
%r = fdiv double %a, 2.0
ret double %r
Expand All @@ -88,8 +88,8 @@ define double @divf64_2(double %a) nounwind {
define <4 x float> @divv4f32_2(<4 x float> %a) nounwind {
; CHECK-LABEL: divv4f32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: movi v1.4s, #64, lsl #24
; CHECK-NEXT: fdiv v0.4s, v0.4s, v1.4s
; CHECK-NEXT: movi v1.4s, #63, lsl #24
; CHECK-NEXT: fmul v0.4s, v0.4s, v1.4s
; CHECK-NEXT: ret
%r = fdiv <4 x float> %a, <float 2.0, float 2.0, float 2.0, float 2.0>
ret <4 x float> %r
Expand Down Expand Up @@ -141,9 +141,8 @@ define <4 x float> @divv4f32_24816(<4 x float> %a) nounwind {
define <vscale x 4 x float> @divnxv4f32_2(<vscale x 4 x float> %a) nounwind {
; CHECK-LABEL: divnxv4f32_2:
; CHECK: // %bb.0:
; CHECK-NEXT: fmov z1.s, #2.00000000
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fdiv z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, #0.5
; CHECK-NEXT: ret
%r = fdiv <vscale x 4 x float> %a, splat (float 2.0)
ret <vscale x 4 x float> %r
Expand Down
Loading

0 comments on commit a284bdb

Please sign in to comment.