-
Notifications
You must be signed in to change notification settings - Fork 215
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Replace division implementations with code from the specialized-div-r…
…em crate
- Loading branch information
1 parent
6de4f8f
commit 6573765
Showing
8 changed files
with
1,160 additions
and
311 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,101 +1,84 @@ | ||
use int::Int; | ||
use super::specialized_div_rem::*; | ||
|
||
/// Signed division for the integer types implemented below. The operands are reduced to | ||
/// their magnitudes, divided as unsigned values, and the sign is then reapplied. | ||
trait Div: Int { | ||
/// Returns `self / other` | ||
fn div(self, other: Self) -> Self { | ||
// Sign masks: shifting right by `BITS - 1` leaves only copies of the top bit | ||
// (presumably all ones for a negative value and zero otherwise; this relies on `>>` | ||
// being an arithmetic shift for the signed implementors). | ||
let s_a = self >> (Self::BITS - 1); | ||
let s_b = other >> (Self::BITS - 1); | ||
// NOTE it's OK to overflow here because of the `.unsigned()` below. | ||
// This whole operation is computing the absolute value of the inputs, | ||
// so some overflow will happen when dealing with e.g. `i64::MIN` | ||
// where the absolute value is `(-i64::MIN) as u64` | ||
let a = (self ^ s_a).wrapping_sub(s_a); | ||
let b = (other ^ s_b).wrapping_sub(s_b); | ||
// Combined mask: non-zero only when the two sign masks differ. | ||
let s = s_a ^ s_b; | ||
|
||
// NOTE(review): `aborting_div` is defined outside this block; presumably it aborts on a | ||
// zero divisor -- confirm. | ||
let r = a.unsigned().aborting_div(b.unsigned()); | ||
// `(x ^ s) - s` negates `x` when `s` is all ones and leaves it unchanged when `s` is zero. | ||
(Self::from_unsigned(r) ^ s) - s | ||
} | ||
} | ||
|
||
impl Div for i32 {} | ||
impl Div for i64 {} | ||
impl Div for i128 {} | ||
|
||
/// Signed remainder for the integer types implemented below, computed on the magnitudes | ||
/// of the operands with the sign of the dividend (`self`) reapplied to the result. | ||
trait Mod: Int { | ||
/// Returns `self % other` | ||
fn mod_(self, other: Self) -> Self { | ||
// Sign mask of the divisor, used only to take its absolute value. | ||
let s = other >> (Self::BITS - 1); | ||
// NOTE(wrapping_sub) see comment in the `div` | ||
let b = (other ^ s).wrapping_sub(s); | ||
// `s` is rebound to the sign mask of the dividend; this is the mask applied to the | ||
// result below, so the sign of `other` does not affect the sign of the remainder. | ||
let s = self >> (Self::BITS - 1); | ||
let a = (self ^ s).wrapping_sub(s); | ||
|
||
// NOTE(review): `aborting_rem` is defined outside this block; presumably it aborts on a | ||
// zero divisor -- confirm. | ||
let r = a.unsigned().aborting_rem(b.unsigned()); | ||
// `(x ^ s) - s` negates `x` when `s` is all ones and leaves it unchanged when `s` is zero. | ||
(Self::from_unsigned(r) ^ s) - s | ||
} | ||
} | ||
|
||
impl Mod for i32 {} | ||
impl Mod for i64 {} | ||
impl Mod for i128 {} | ||
|
||
/// Combined quotient/remainder helper: the quotient comes from a caller-supplied division | ||
/// function and the remainder is derived from it with one multiplication and subtraction. | ||
trait Divmod: Int { | ||
/// Returns `div(self, other)` and sets `*rem = self - quotient * other` | ||
/// | ||
/// `div` is the function used to compute the quotient; `rem` receives the remainder. | ||
fn divmod<F>(self, other: Self, rem: &mut Self, div: F) -> Self | ||
where | ||
F: Fn(Self, Self) -> Self, | ||
{ | ||
let r = div(self, other); | ||
// NOTE won't overflow because it's using the result from the | ||
// previous division | ||
*rem = self - r.wrapping_mul(other); | ||
r | ||
} | ||
} | ||
|
||
impl Divmod for i32 {} | ||
impl Divmod for i64 {} | ||
// NOTE: the specialized_div_rem functions panic if a division by zero is encountered. | ||
// These panics should be unreachable and optimized away, unless a use of | ||
// `core::intrinsics::unchecked_div` or `core::intrinsics::unchecked_rem` does not have | ||
// a zero check in front of it. | ||
|
||
intrinsics! { | ||
// The 32 bit entry points below go through `i32_div_rem_binary_long`, which returns a | ||
// tuple whose `.0` is used as the quotient and `.1` as the remainder. | ||
#[maybe_use_optimized_c_shim] | ||
#[arm_aeabi_alias = __aeabi_idiv] | ||
/// Returns `a / b` | ||
pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { | ||
a.div(b) | ||
i32_div_rem_binary_long(a, b).0 | ||
} | ||
|
||
#[maybe_use_optimized_c_shim] | ||
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { | ||
a.div(b) | ||
/// Returns `a % b` | ||
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { | ||
i32_div_rem_binary_long(a, b).1 | ||
} | ||
|
||
#[win64_128bit_abi_hack] | ||
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { | ||
a.div(b) | ||
#[maybe_use_optimized_c_shim] | ||
/// Returns `a / b` and sets `*rem = a % b` | ||
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { | ||
// One call yields both results, so the quotient and remainder stay consistent. | ||
let quo_rem = i32_div_rem_binary_long(a, b); | ||
*rem = quo_rem.1; | ||
quo_rem.0 | ||
} | ||
|
||
// `_delegate` is most efficient in the 64 bit range | ||
|
||
#[maybe_use_optimized_c_shim] | ||
pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { | ||
a.mod_(b) | ||
/// Returns `a / b` | ||
pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { | ||
i64_div_rem_delegate(a, b).0 | ||
} | ||
|
||
#[maybe_use_optimized_c_shim] | ||
/// Returns `a % b` | ||
pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { | ||
a.mod_(b) | ||
i64_div_rem_delegate(a, b).1 | ||
} | ||
|
||
#[aapcs_on_arm] | ||
/// Returns `a / b` and sets `*rem = a % b` | ||
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { | ||
// One call yields both results, so the quotient and remainder stay consistent. | ||
let quo_rem = i64_div_rem_delegate(a, b); | ||
*rem = quo_rem.1; | ||
quo_rem.0 | ||
} | ||
} | ||
|
||
// `_trifecta` is efficient for large divisions, even when division | ||
// hardware is not available at all. | ||
|
||
// 128 bit entry points for every architecture other than x86_64. | ||
// `target_arch` is the cfg key that names the CPU architecture; gating on `target = "..."` | ||
// would never match, so this block would be selected on every target. | ||
#[cfg(not(target_arch = "x86_64"))] | ||
intrinsics! { | ||
#[win64_128bit_abi_hack] | ||
/// Returns `a / b` | ||
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { | ||
i128_div_rem_trifecta(a, b).0 | ||
} | ||
|
||
#[win64_128bit_abi_hack] | ||
/// Returns `a % b` | ||
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { | ||
a.mod_(b) | ||
i128_div_rem_trifecta(a, b).1 | ||
} | ||
} | ||
|
||
#[maybe_use_optimized_c_shim] | ||
pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { | ||
a.divmod(b, rem, |a, b| __divsi3(a, b)) | ||
// 128 bit entry points for x86_64, using the `_asymmetric` routine. It is called through | ||
// the same glob import as the other division helpers in this file. | ||
// NOTE(review): assumes `i128_div_rem_asymmetric` is exported by `specialized_div_rem` on | ||
// x86_64 -- confirm against that module. | ||
#[cfg(target_arch = "x86_64")] | ||
intrinsics! { | ||
#[win64_128bit_abi_hack] | ||
/// Returns `a / b` | ||
pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { | ||
i128_div_rem_asymmetric(a, b).0 | ||
} | ||
|
||
#[aapcs_on_arm] | ||
pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { | ||
a.divmod(b, rem, |a, b| __divdi3(a, b)) | ||
#[win64_128bit_abi_hack] | ||
/// Returns `a % b` | ||
pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { | ||
i128_div_rem_asymmetric(a, b).1 | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
// Generates an unsigned and a signed division-with-remainder function for a double-width | ||
// integer type, built from a same-width division (`$half_division`) and an asymmetric | ||
// double-by-single division (`$asymmetric_division`) supplied by the macro caller. | ||
macro_rules! impl_asymmetric { | ||
( | ||
$unsigned_name:ident, // name of the unsigned function | ||
$signed_name:ident, // name of the signed function | ||
$half_division:ident, // function for division of a $uX by a $uX | ||
$asymmetric_division:ident, // function for division of a $uD by a $uX | ||
$n_h:expr, // the number of bits in $uH (half the bit width of $uX) | ||
$uH:ident, // unsigned integer with half the bit width of $uX | ||
$uX:ident, // unsigned integer with half the bit width of $uD | ||
$uD:ident, // unsigned integer with double the bit width of $uX | ||
$iD:ident, // signed version of $uD | ||
$($unsigned_attr:meta),*; // attributes for the unsigned function | ||
$($signed_attr:meta),* // attributes for the signed function | ||
) => { | ||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a | ||
/// tuple. | ||
/// | ||
/// This is optimized for dividing integers with the same bitwidth as the largest operand in | ||
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction | ||
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. | ||
/// | ||
/// # Panics | ||
/// | ||
/// When attempting to divide by zero, this function will panic. | ||
$( | ||
#[$unsigned_attr] | ||
)* | ||
pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { | ||
// Full-width product of two $uX values, returned as `(low half, high half)`. | ||
#[inline(always)] | ||
fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { | ||
let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); | ||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX) | ||
} | ||
// Same as `carrying_mul`, with `add` added to the product before it is split. | ||
#[inline(always)] | ||
fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { | ||
let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); | ||
(tmp as $uX, (tmp >> ($n_h * 2)) as $uX) | ||
} | ||
|
||
// `n` is the bit width of $uX, i.e. half the bit width of $uD. | ||
let n: u32 = $n_h * 2; | ||
|
||
// Many of these subalgorithms are taken from trifecta.rs; see that file for better | ||
// documentation. | ||
|
||
let duo_lo = duo as $uX; | ||
let duo_hi = (duo >> n) as $uX; | ||
let div_lo = div as $uX; | ||
let div_hi = (div >> n) as $uX; | ||
// Case 1: the divisor fits in a single $uX. | ||
if div_hi == 0 { | ||
if div_lo == 0 { | ||
panic!("division by zero"); | ||
} | ||
if duo_hi < div_lo { | ||
// plain $uD by $uX division that will fit into $uX | ||
// NOTE(review): `$asymmetric_division` is supplied by the macro caller; presumably | ||
// its safety requirement is that the quotient fits in $uX, which `duo_hi < div_lo` | ||
// ensures here -- confirm against its definition. | ||
let tmp = unsafe { $asymmetric_division(duo, div_lo) }; | ||
return (tmp.0 as $uD, tmp.1 as $uD) | ||
} else if (div_lo >> $n_h) == 0 { | ||
// Short division of $uD by a $uH. | ||
// Three $uX by $uX divisions, from the most significant part down; each remainder | ||
// is carried into the upper half of the next partial dividend. | ||
let div_0 = div_lo as $uH as $uX; | ||
let (quo_hi, rem_3) = $half_division(duo_hi, div_0); | ||
|
||
let duo_mid = | ||
((duo >> $n_h) as $uH as $uX) | ||
| (rem_3 << $n_h); | ||
let (quo_1, rem_2) = $half_division(duo_mid, div_0); | ||
|
||
let duo_lo = | ||
(duo as $uH as $uX) | ||
| (rem_2 << $n_h); | ||
let (quo_0, rem_1) = $half_division(duo_lo, div_0); | ||
|
||
return ( | ||
(quo_0 as $uD) | ||
| ((quo_1 as $uD) << $n_h) | ||
| ((quo_hi as $uD) << n), | ||
rem_1 as $uD | ||
) | ||
} else { | ||
// Short division using the $uD by $uX division | ||
let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); | ||
// NOTE(review): `rem_hi` is a remainder of division by `div_lo`, so the upper half | ||
// of the dividend passed below is smaller than the divisor; presumably that is what | ||
// makes this `unsafe` call sound -- confirm. | ||
let tmp = unsafe { | ||
$asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) | ||
}; | ||
return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD) | ||
} | ||
} | ||
|
||
// Case 2: the divisor occupies both halves (`div_hi != 0`). | ||
let duo_lz = duo_hi.leading_zeros(); | ||
let div_lz = div_hi.leading_zeros(); | ||
let rel_leading_sb = div_lz.wrapping_sub(duo_lz); | ||
if rel_leading_sb < $n_h { | ||
// Some x86_64 CPUs have bad `divq` implementations that make putting | ||
// a `mul` or `mul - 1` algorithm here beneficial | ||
// The quotient estimate `mul` is computed from the most significant bits of `duo` | ||
// and `div` only. | ||
let shift = n.wrapping_sub(duo_lz); | ||
let duo_sig_n = (duo >> shift) as $uX; | ||
let div_sig_n = (div >> shift) as $uX; | ||
let mul = $half_division(duo_sig_n, div_sig_n).0; | ||
let div_lo = div as $uX; | ||
let div_hi = (div >> n) as $uX; | ||
// `tmp` is `mul * div` assembled from two partial products; `overflow` holds the | ||
// part of the product that does not fit in $uD. | ||
let (tmp_lo, carry) = carrying_mul(mul,div_lo); | ||
let (tmp_hi, overflow) = carrying_mul_add(mul,div_hi,carry); | ||
let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); | ||
// If the product exceeds `duo`, the result returned is `mul - 1` and `div` is added | ||
// back into the remainder. | ||
if ((overflow & 1) != 0) || (duo < tmp) { | ||
return ( | ||
mul.wrapping_sub(1) as $uD, | ||
duo.wrapping_add(div.wrapping_sub(tmp)) | ||
) | ||
} else { | ||
return ( | ||
mul as $uD, | ||
duo.wrapping_sub(tmp) | ||
) | ||
} | ||
} else { | ||
// This has been adapted from | ||
// https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn | ||
// adapted from www.hackersdelight.org | ||
|
||
// This is similar to the `mul` or `mul - 1` algorithm in that it uses only more | ||
// significant parts of `duo` and `div` to divide a large integer with a smaller | ||
// division instruction. | ||
// NOTE(review): the divisor passed here is `div` shifted left by `div_lz` and then | ||
// truncated to its upper $uX, while the dividend is `duo >> 1`; presumably this keeps | ||
// the quotient within $uX as the `unsafe` call requires -- confirm. | ||
let tmp = unsafe { | ||
$asymmetric_division(duo >> 1, ((div << div_lz) >> n) as $uX) | ||
}; | ||
let mut quo = tmp.0 >> ((n - 1) - div_lz); | ||
if quo != 0 { | ||
quo -= 1; | ||
} | ||
// Note that this is a large $uD multiplication being used here | ||
let mut rem = duo - ((quo as $uD) * div); | ||
|
||
// Single correction step: bump the quotient if a whole `div` still fits in `rem`. | ||
if rem >= div { | ||
quo += 1; | ||
rem -= div; | ||
} | ||
return (quo as $uD, rem) | ||
} | ||
} | ||
|
||
/// Computes the quotient and remainder of `duo` divided by `div` and returns them as a | ||
/// tuple. | ||
/// | ||
/// This is the signed counterpart of the unsigned function generated above: it divides | ||
/// the wrapping-negated negative operands with that function and then fixes up the signs. | ||
/// The quotient is negated when exactly one operand is negative, and the remainder is | ||
/// negated when `duo` is negative. | ||
/// | ||
/// This is optimized for dividing integers with the same bitwidth as the largest operand in | ||
/// an asymmetrically sized division. For example, the x86-64 `divq` assembly instruction | ||
/// can divide a 128 bit integer by a 64 bit integer if the quotient fits in 64 bits. | ||
/// | ||
/// # Panics | ||
/// | ||
/// When attempting to divide by zero, this function will panic. | ||
$( | ||
#[$signed_attr] | ||
)* | ||
pub fn $signed_name(duo: $iD, div: $iD) -> ($iD,$iD) { | ||
// `wrapping_neg` followed by the cast to $uD is used to obtain the magnitude of a | ||
// negative operand. | ||
match (duo < 0, div < 0) { | ||
(false,false) => { | ||
let t = $unsigned_name(duo as $uD,div as $uD); | ||
(t.0 as $iD,t.1 as $iD) | ||
}, | ||
(true,false) => { | ||
let t = $unsigned_name(duo.wrapping_neg() as $uD,div as $uD); | ||
((t.0 as $iD).wrapping_neg(),(t.1 as $iD).wrapping_neg()) | ||
}, | ||
(false,true) => { | ||
let t = $unsigned_name(duo as $uD,div.wrapping_neg() as $uD); | ||
((t.0 as $iD).wrapping_neg(),t.1 as $iD) | ||
}, | ||
(true,true) => { | ||
let t = $unsigned_name(duo.wrapping_neg() as $uD,div.wrapping_neg() as $uD); | ||
(t.0 as $iD,(t.1 as $iD).wrapping_neg()) | ||
}, | ||
} | ||
} | ||
} | ||
} |
Oops, something went wrong.