Skip to content

Commit

Permalink
Base: make constructing a float from a rational exact
Browse files Browse the repository at this point in the history
Two example bugs that are now fixed:
```julia-repl
julia> Float16(9//70000)       # incorrect because `Float16(70000)` overflows
Float16(0.0)

julia> Float16(big(9//70000))  # correct because of promotion to `BigFloat`
Float16(0.0001286)

julia> Float32(16777216//16777217) < 1  # `16777217` doesn't fit in a `Float32` mantissa
false
```

Another way to fix this would have been to convert the numerator and
denominator into `BigFloat` exactly and then divide one with the other.
However, the requirement for exactness means that the `BigFloat`
precision would need to be manipulated, something that seems to be
problematic in Julia. So implement the division using integer
arithmetic.

As a bonus, constructing a `BigFloat` from a `Rational` is now
thread-safe when the rounding mode and precision are provided to the
constructor.

Updates #45213
  • Loading branch information
nsajko committed May 17, 2023
1 parent 10dc33e commit dd48fdd
Show file tree
Hide file tree
Showing 4 changed files with 409 additions and 10 deletions.
5 changes: 5 additions & 0 deletions base/float.jl
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,11 @@ for T in (Float16, Float32, Float64)
@eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)))
end

# The minimum exponent for a normal number. Assuming unknown types
# don't have subnormals.
min_normal_number_exponent(::Type{<:AbstractFloat}) = nothing
min_normal_number_exponent(::Type{F}) where {F<:IEEEFloat} = true - exponent_bias(F)

"""
exponent_max(T)
Expand Down
73 changes: 67 additions & 6 deletions base/mpfr.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ import
cbrt, typemax, typemin, unsafe_trunc, floatmin, floatmax, rounding,
setrounding, maxintfloat, widen, significand, frexp, tryparse, iszero,
isone, big, _string_n, decompose, minmax,
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand
sinpi, cospi, sincospi, tanpi, sind, cosd, tand, asind, acosd, atand,
rational_to_float_components, unsafe_rational, bit_width

import ..Rounding: rounding_raw, setrounding_raw

Expand Down Expand Up @@ -280,14 +281,74 @@ BigFloat(x::Union{UInt8,UInt16,UInt32}, r::MPFRRoundingMode=ROUNDING_MODE[]; pre
BigFloat(x::Union{Float16,Float32}, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[]) =
BigFloat(Float64(x), r; precision=precision)

function BigFloat(x::Rational, r::MPFRRoundingMode=ROUNDING_MODE[]; precision::Integer=DEFAULT_PRECISION[])
setprecision(BigFloat, precision) do
setrounding_raw(BigFloat, r) do
BigFloat(numerator(x))::BigFloat / BigFloat(denominator(x))::BigFloat
end
function set_2exp!(z::BigFloat, n::BigInt, exp::Int, rm::MPFRRoundingMode)
ccall(
(:mpfr_set_z_2exp, libmpfr),
Int32,
(Ref{BigFloat}, Ref{BigInt}, Int, MPFRRoundingMode),
z, n, exp, rm,
)
nothing
end

function rounding_mode_translated_for_abs(rm::MPFRRoundingMode, sign::Real)
s = signbit(sign)
if rm == MPFRRoundDown
# Negative numbers round away from zero, positives round to zero.
s ? MPFRRoundFromZero : MPFRRoundToZero
elseif rm == MPFRRoundUp
s ? MPFRRoundToZero : MPFRRoundFromZero
else
rm
end
end

function rational_to_bigfloat(x::Rational, romo::MPFRRoundingMode, prec::Integer)
s = Int8(sign(numerator(x)))
a = abs(x)

num = numerator(a)
den = denominator(a)

# Handle special cases
num_is_zero = iszero(num)
den_is_zero = iszero(den)
if den_is_zero
num_is_zero && return BigFloat(precision = prec) # 0/0
# n/0 = Inf * sign(n)
#
# The rounding mode doesn't matter for infinity.
return BigFloat(s * Inf, MPFRRoundToZero, precision = prec)
end
# 0/1 = 0
#
# The rounding mode doesn't matter for zero.
num_is_zero && return BigFloat(false, MPFRRoundToZero, precision = prec)

rm = convert(RoundingMode, rounding_mode_translated_for_abs(romo, s))
c = rational_to_float_components(num, den, prec, BigInt, rm)
bw = bit_width(c.mantissa)
ret = BigFloat(precision = prec)

# The rounding mode doesn't matter because there shouldn't be any
# rounding, as MPFR doesn't have subnormals.
set_2exp!(ret, s * c.mantissa, Int(c.exponent - bw + true), MPFRRoundToZero)

ret
end

function rational_to_bigfloat(x::Rational{Bool}, ::MPFRRoundingMode, prec::Integer)
# the rounding mode doesn't matter for conversion from boolean fractions
BigFloat(numerator(x)/denominator(x), MPFRRoundToZero, precision = prec)
end

function BigFloat(x::Rational,
r::MPFRRoundingMode = ROUNDING_MODE[];
precision::Integer = DEFAULT_PRECISION[])
y = unsafe_rational(numerator(x), denominator(x))
rational_to_bigfloat(y, r, precision)::BigFloat
end

function tryparse(::Type{BigFloat}, s::AbstractString; base::Integer=0, precision::Integer=DEFAULT_PRECISION[], rounding::MPFRRoundingMode=ROUNDING_MODE[])
!isempty(s) && isspace(s[end]) && return tryparse(BigFloat, rstrip(s), base = base)
z = BigFloat(precision=precision)
Expand Down
178 changes: 174 additions & 4 deletions base/rational.jl
Original file line number Diff line number Diff line change
Expand Up @@ -129,12 +129,182 @@ Bool(x::Rational) = x==0 ? false : x==1 ? true :
(::Type{T})(x::Rational) where {T<:Integer} = (isinteger(x) ? convert(T, x.num)::T :
throw(InexactError(nameof(T), T, x)))

AbstractFloat(x::Rational) = (float(x.num)/float(x.den))::AbstractFloat
function (::Type{T})(x::Rational{S}) where T<:AbstractFloat where S
P = promote_type(T,S)
convert(T, convert(P,x.num)/convert(P,x.den))::T
bit_width(n) = ndigits(n, base = UInt8(2), pad = false)

const RoundingModeIndependentFromSign = Union{
RoundingMode{:ToZero}, RoundingMode{:FromZero},
RoundingMode{:Nearest}, RoundingMode{:NearestTiesAway}}

clamped_to_zero(x) = max(zero(x), x)

struct FloatComponentsResult{S<:Integer,T<:Integer}
mantissa::S
exponent::T
inexact::Bool
underflowed::Bool

function FloatComponentsResult(m::S, e::T;
inexact::Bool,
underflowed::Bool) where {S<:Integer,T<:Integer}
new{S,T}(m, e, inexact, underflowed)
end
end

# `num`, `den` are positive integers. `requested_precision` is the
# requested floating-point precision. `T` is the integer type that
# we'll be working with mostly, it needs to be wide enough.
# `exp_lower_bound` is the minimum allowed normalized exponent for
# normal numbers.
function rational_to_float_components_impl(num::Integer,
den::Integer,
requested_precision::Integer,
::Type{T},
romo::RoundingModeIndependentFromSign,
exp_lower_bound::Union{Nothing,Integer}) where {T<:Integer}
num_bit_width = bit_width(num)
den_bit_width = bit_width(den)

# `T` must be wide enough to avoid overflow.
numT = T(num)
denT = T(den)

# Creates a mantissa in `quo_` that's at least
# `requested_precision` bits wide.
bit_shift_num_ = clamped_to_zero(den_bit_width - num_bit_width + requested_precision)
quo_ = div(numT << bit_shift_num_, den, RoundToZero)

# Normalized exponent
nexp = bit_width(quo_) - bit_shift_num_ - true

# Take possible underflow into account: if there's underflow, the
# precision needs to be less than requested.
adjusted_precision = requested_precision
if !isnothing(exp_lower_bound)
underflow = clamped_to_zero(exp_lower_bound - nexp)
adjusted_precision -= underflow
end

# Nonnegative. Experiments indicate that, when iterating over all
# possible numerators and denominators below some bit width, with
# some fixed value for `adjusted_precision`, the maximal attained
# value will be
# `max(1, max(num_bit_width, den_bit_width) - adjusted_precision)`.
# So in the worst case we have `adjusted_precision == 1` and
# `excess_precision == max(1, max(num_bit_width, den_bit_width) - 1)`
excess_precision = bit_width(quo_) - adjusted_precision

(adjusted_precision < 0) && return FloatComponentsResult(
zero(quo_), zero(nexp), inexact = true, underflowed = true)

# Creates a mantissa in `quo` that's exactly `adjusted_precision`
# bits wide, except if rounding up happens, in which case the bit
# width may be `adjusted_precision + 1`.
bit_shift_num = clamped_to_zero(bit_shift_num_ - excess_precision)
bit_shift_den = excess_precision - (bit_shift_num_ - bit_shift_num) # nonnegative
(quo, rem) = divrem(numT << bit_shift_num, denT << bit_shift_den, romo)

result_is_exact = iszero(rem)

nexp_final = nexp + (adjusted_precision < bit_width(quo))
is_subnormal = !isnothing(exp_lower_bound) && (nexp_final < exp_lower_bound)

iszero(quo) || (quo >>= trailing_zeros(quo))

# The bit width of `quo` is now less than or equal to
# `adjusted_precision`, except if `adjusted_precision` is zero, in
# which case `bit_width(quo)` may be one, in case rounding up
# happened.

FloatComponentsResult(
quo, nexp_final, inexact = !result_is_exact, underflowed = is_subnormal)
end

# `num`, `den` are positive integers. `bit_width` is the requested
# floating-point precision and must be positive. `T` is the integer
# type that we'll be working with mostly, it needs to be wide enough.
# `exp_lower_bound` is the minimum allowed normalized exponent for
# normal numbers.
function rational_to_float_components(num::Integer,
den::Integer,
bit_width::Integer,
::Type{T},
romo::RoundingModeIndependentFromSign,
exp_lower_bound::Union{Nothing,Integer} = nothing) where {T<:Integer}
# Factor out powers of two
tz_num = trailing_zeros(num)
tz_den = trailing_zeros(den)
dexp = tz_num - tz_den
exp_lb = isnothing(exp_lower_bound) ? nothing : exp_lower_bound - dexp
c = rational_to_float_components_impl(
num >> tz_num, den >> tz_den, bit_width, T, romo, exp_lb)
FloatComponentsResult(
c.mantissa, c.exponent + dexp, inexact = c.inexact, underflowed = c.underflowed)
end

# Assuming the wanted rounding mode is round to nearest with ties to
# even.
#
# `precision` must be positive.
function rational_to_float_impl(to_float::C,
::Type{T},
x::Rational,
precision::Integer) where {C<:Union{Type,Function},T<:Integer}
s = Int8(sign(numerator(x)))
a = abs(x)

num = numerator(a)
den = denominator(a)

# Handle special cases
num_is_zero = iszero(num)
den_is_zero = iszero(den)
if den_is_zero
num_is_zero && return to_float(NaN) # 0/0
return to_float(s * Inf) # n/0 = sign(n) * Inf
end
num_is_zero && return to_float(false) # 0/n = 0

F = typeof(to_float(false))
c = rational_to_float_components(
num, den, precision, T, RoundNearest, min_normal_number_exponent(F))
bw = bit_width(c.mantissa)

# TODO: `ldexp` could be replaced with a mere bit of bit twiddling
# in the case of `Float16`, `Float32`, `Float64`
ldexp(s * to_float(c.mantissa), c.exponent - bw + true)::F
end

function rational_to_float_promote_type(::Type{F},
::Type{S}) where {F<:AbstractFloat,S<:Integer}
BigInt
end

function rational_to_float_promote_type(::Type{F},
::Type{S}) where {F<:AbstractFloat,S<:Unsigned}
rational_to_float_promote_type(F, widen(signed(S)))
end

# As an optimization, use a narrower type when possible.
rational_to_float_promote_type(::Type{Float16}, ::Type{<:Union{Int8,Int16}}) = Int32
rational_to_float_promote_type(::Type{Float32}, ::Type{<:Union{Int8,Int16}}) = Int64
rational_to_float_promote_type(::Type{Float64}, ::Type{<:Union{Int8,Int16}}) = Int128
rational_to_float_promote_type(::Type{<:Union{Float16,Float32}}, ::Type{Int32}) = Int64
rational_to_float_promote_type(::Type{Float64}, ::Type{Int32}) = Int128
rational_to_float_promote_type(::Type{<:Union{Float16,Float32,Float64}}, ::Type{Int64}) = Int128

rational_to_float(::Type{F}, x::Rational{Bool}) where {F<:AbstractFloat} = F(numerator(x)/denominator(x))

function rational_to_float(::Type{F}, x::Rational{T}) where {F<:AbstractFloat,T<:Integer}
rational_to_float_impl(F, rational_to_float_promote_type(F, T), x, precision(F))::F
end

function (::Type{F})(x::Rational) where {F<:AbstractFloat}
y = unsafe_rational(numerator(x), denominator(x))
rational_to_float(F, y)::F
end

AbstractFloat(x::Q) where {Q<:Rational} = float(Q)(x)::AbstractFloat

function Rational{T}(x::AbstractFloat) where T<:Integer
r = rationalize(T, x, tol=0)
x == convert(typeof(x), r) || throw(InexactError(:Rational, Rational{T}, x))
Expand Down
Loading

0 comments on commit dd48fdd

Please sign in to comment.