[DAG] Fold fdiv X, c2 -> fmul X, 1/c2 without AllowReciprocal if exact (#93882)

This moves the combine of fdiv by constant to fmul out of an 'if (Options.UnsafeFPMath || Flags.hasAllowReciprocal())' block, so that it also triggers, without those flags, when the reciprocal 1/c2 is exactly representable. An extra check for Recip.isDenormal() is added to the exact case, as multiple places make reference to denormal constants being unsafe or slow on certain platforms.
llvm · Jun 9, 2024 · a284bdb · a284bdb
1 parent 53fecef
commit a284bdb
Show file tree

Hide file tree

Showing 10 changed files with 200 additions and 239 deletions.
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
@@ -964,6 +964,13 @@ class APFloat : public APFloatBase {
     return Val;
   }
 
+  /// Factory for Positive and Negative One.
+  ///
+  /// \param Sem The floating-point semantics of the value to create.
+  /// \param Negative True iff the number should be negative.
+  /// \returns +1.0, or -1.0 when \p Negative is set, in semantics \p Sem.
+  static APFloat getOne(const fltSemantics &Sem, bool Negative = false) {
+    // Build the magnitude from an unsigned 1 and flip the sign afterwards.
+    // The integer constructor takes an unsigned integerPart, so passing -1
+    // would wrap around to a huge positive value instead of yielding -1.0.
+    APFloat Val(Sem, 1U);
+    if (Negative)
+      Val.changeSign();
+    return Val;
+  }
+
   /// Factory for Positive and Negative Infinity.
   ///
   /// \param Negative True iff the number should be negative.

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17262,26 +17262,29 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   if (SDValue V = combineRepeatedFPDivisors(N))
     return V;
 
-  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
-    // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
-    if (auto *N1CFP = dyn_cast<ConstantFPSDNode>(N1)) {
-      // Compute the reciprocal 1.0 / c2.
-      const APFloat &N1APF = N1CFP->getValueAPF();
-      APFloat Recip(N1APF.getSemantics(), 1); // 1.0
-      APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
-      // Only do the transform if the reciprocal is a legal fp immediate that
-      // isn't too nasty (eg NaN, denormal, ...).
-      if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
-          (!LegalOperations ||
-           // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
-           // backend)... we should handle this gracefully after Legalize.
-           // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
-           TLI.isOperationLegal(ISD::ConstantFP, VT) ||
-           TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
-        return DAG.getNode(ISD::FMUL, DL, VT, N0,
-                           DAG.getConstantFP(Recip, DL, VT));
-    }
+  // fold (fdiv X, c2) -> (fmul X, 1/c2) if there is no loss in precision, or
+  // the loss is acceptable with AllowReciprocal.
+  if (auto *N1CFP = isConstOrConstSplatFP(N1, true)) {
+    // Compute the reciprocal 1.0 / c2.
+    const APFloat &N1APF = N1CFP->getValueAPF();
+    APFloat Recip = APFloat::getOne(N1APF.getSemantics());
+    APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+    // Only do the transform if the reciprocal is a legal fp immediate that
+    // isn't too nasty (eg NaN, denormal, ...).
+    if (((st == APFloat::opOK && !Recip.isDenormal()) ||
+         (st == APFloat::opInexact &&
+          (Options.UnsafeFPMath || Flags.hasAllowReciprocal()))) &&
+        (!LegalOperations ||
+         // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+         // backend)... we should handle this gracefully after Legalize.
+         // TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT) ||
+         TLI.isOperationLegal(ISD::ConstantFP, VT) ||
+         TLI.isFPImmLegal(Recip, VT, ForCodeSize)))
+      return DAG.getNode(ISD::FMUL, DL, VT, N0,
+                         DAG.getConstantFP(Recip, DL, VT));
+  }
 
+  if (Options.UnsafeFPMath || Flags.hasAllowReciprocal()) {
     // If this FDIV is part of a reciprocal square root, it may be folded
     // into a target-specific square root estimate instruction.
     if (N1.getOpcode() == ISD::FSQRT) {

diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -412,10 +412,10 @@ define half @scvtf_f16_i32_7(i32 %int) {
 ; CHECK-NO16-LABEL: scvtf_f16_i32_7:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    scvtf s1, w0
-; CHECK-NO16-NEXT:    movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #60, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -432,10 +432,10 @@ define half @scvtf_f16_i32_15(i32 %int) {
 ; CHECK-NO16-LABEL: scvtf_f16_i32_15:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    scvtf s1, w0
-; CHECK-NO16-NEXT:    movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #56, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -452,10 +452,10 @@ define half @scvtf_f16_i64_7(i64 %long) {
 ; CHECK-NO16-LABEL: scvtf_f16_i64_7:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    scvtf s1, x0
-; CHECK-NO16-NEXT:    movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #60, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -472,10 +472,10 @@ define half @scvtf_f16_i64_15(i64 %long) {
 ; CHECK-NO16-LABEL: scvtf_f16_i64_15:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    scvtf s1, x0
-; CHECK-NO16-NEXT:    movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #56, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -574,10 +574,10 @@ define half @ucvtf_f16_i32_7(i32 %int) {
 ; CHECK-NO16-LABEL: ucvtf_f16_i32_7:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    ucvtf s1, w0
-; CHECK-NO16-NEXT:    movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #60, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -594,10 +594,10 @@ define half @ucvtf_f16_i32_15(i32 %int) {
 ; CHECK-NO16-LABEL: ucvtf_f16_i32_15:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    ucvtf s1, w0
-; CHECK-NO16-NEXT:    movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #56, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -614,10 +614,10 @@ define half @ucvtf_f16_i64_7(i64 %long) {
 ; CHECK-NO16-LABEL: ucvtf_f16_i64_7:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    ucvtf s1, x0
-; CHECK-NO16-NEXT:    movi v0.2s, #67, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #60, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;
@@ -634,10 +634,10 @@ define half @ucvtf_f16_i64_15(i64 %long) {
 ; CHECK-NO16-LABEL: ucvtf_f16_i64_15:
 ; CHECK-NO16:       // %bb.0:
 ; CHECK-NO16-NEXT:    ucvtf s1, x0
-; CHECK-NO16-NEXT:    movi v0.2s, #71, lsl #24
+; CHECK-NO16-NEXT:    movi v0.2s, #56, lsl #24
 ; CHECK-NO16-NEXT:    fcvt h1, s1
 ; CHECK-NO16-NEXT:    fcvt s1, h1
-; CHECK-NO16-NEXT:    fdiv s0, s1, s0
+; CHECK-NO16-NEXT:    fmul s0, s1, s0
 ; CHECK-NO16-NEXT:    fcvt h0, s0
 ; CHECK-NO16-NEXT:    ret
 ;

diff --git a/llvm/test/CodeGen/AArch64/fdiv-const.ll b/llvm/test/CodeGen/AArch64/fdiv-const.ll
@@ -4,8 +4,8 @@
 define float @divf32_2(float %a) nounwind {
 ; CHECK-LABEL: divf32_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov s1, #2.00000000
-; CHECK-NEXT:    fdiv s0, s0, s1
+; CHECK-NEXT:    fmov s1, #0.50000000
+; CHECK-NEXT:    fmul s0, s0, s1
 ; CHECK-NEXT:    ret
   %r = fdiv float %a, 2.0
   ret float %r
@@ -46,8 +46,8 @@ define float @divf32_p75_arcp(float %a) nounwind {
 define half @divf16_2(half %a) nounwind {
 ; CHECK-LABEL: divf16_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov h1, #2.00000000
-; CHECK-NEXT:    fdiv h0, h0, h1
+; CHECK-NEXT:    fmov h1, #0.50000000
+; CHECK-NEXT:    fmul h0, h0, h1
 ; CHECK-NEXT:    ret
   %r = fdiv half %a, 2.0
   ret half %r
@@ -67,9 +67,9 @@ define half @divf16_32768(half %a) nounwind {
 define half @divf16_32768_arcp(half %a) nounwind {
 ; CHECK-LABEL: divf16_32768_arcp:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mov w8, #512 // =0x200
+; CHECK-NEXT:    mov w8, #30720 // =0x7800
 ; CHECK-NEXT:    fmov h1, w8
-; CHECK-NEXT:    fmul h0, h0, h1
+; CHECK-NEXT:    fdiv h0, h0, h1
 ; CHECK-NEXT:    ret
   %r = fdiv arcp half %a, 32768.0
   ret half %r
@@ -78,8 +78,8 @@ define half @divf16_32768_arcp(half %a) nounwind {
 define double @divf64_2(double %a) nounwind {
 ; CHECK-LABEL: divf64_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov d1, #2.00000000
-; CHECK-NEXT:    fdiv d0, d0, d1
+; CHECK-NEXT:    fmov d1, #0.50000000
+; CHECK-NEXT:    fmul d0, d0, d1
 ; CHECK-NEXT:    ret
   %r = fdiv double %a, 2.0
   ret double %r
@@ -88,8 +88,8 @@ define double @divf64_2(double %a) nounwind {
 define <4 x float> @divv4f32_2(<4 x float> %a) nounwind {
 ; CHECK-LABEL: divv4f32_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    movi v1.4s, #64, lsl #24
-; CHECK-NEXT:    fdiv v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    movi v1.4s, #63, lsl #24
+; CHECK-NEXT:    fmul v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ret
   %r = fdiv <4 x float> %a, <float 2.0, float 2.0, float 2.0, float 2.0>
   ret <4 x float> %r
@@ -141,9 +141,8 @@ define <4 x float> @divv4f32_24816(<4 x float> %a) nounwind {
 define <vscale x 4 x float> @divnxv4f32_2(<vscale x 4 x float> %a) nounwind {
 ; CHECK-LABEL: divnxv4f32_2:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov z1.s, #2.00000000
 ; CHECK-NEXT:    ptrue p0.s
-; CHECK-NEXT:    fdiv z0.s, p0/m, z0.s, z1.s
+; CHECK-NEXT:    fmul z0.s, p0/m, z0.s, #0.5
 ; CHECK-NEXT:    ret
   %r = fdiv <vscale x 4 x float> %a, splat (float 2.0)
   ret <vscale x 4 x float> %r