From d242475b5c27a1f87bbdf4a954d654da77eaa28f Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sun, 3 May 2020 22:38:04 -0500 Subject: [PATCH 1/8] regularize the location and documentation of division functions --- src/int/sdiv.rs | 44 +++++++++++++++++++++++++++----------------- src/int/udiv.rs | 14 ++++++-------- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index c9e252cc..854ea00b 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -60,42 +60,52 @@ impl Divmod for i64 {} intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] + /// Returns `n / d` pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { a.div(b) } #[maybe_use_optimized_c_shim] - pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - a.div(b) + /// Returns `n % d` + pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { + a.mod_(b) } - - #[win64_128bit_abi_hack] - pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - a.div(b) + + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { + a.divmod(b, rem, |a, b| __divsi3(a, b)) } #[maybe_use_optimized_c_shim] - pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - a.mod_(b) + /// Returns `n / d` + pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { + a.div(b) } #[maybe_use_optimized_c_shim] + /// Returns `n % d` pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { a.mod_(b) } - #[win64_128bit_abi_hack] - pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - a.mod_(b) + #[aapcs_on_arm] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { + a.divmod(b, rem, |a, b| __divdi3(a, b)) } - #[maybe_use_optimized_c_shim] - pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - a.divmod(b, rem, |a, b| __divsi3(a, b)) + #[win64_128bit_abi_hack] + /// Returns `n / d` + pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { + a.div(b) } - #[aapcs_on_arm] - pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - a.divmod(b, rem, |a, b| __divdi3(a, b)) + #[win64_128bit_abi_hack] + /// Returns `n % d` + pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { + a.mod_(b) } + + // LLVM does not currently have a `__divmodti4` function } diff --git a/src/int/udiv.rs b/src/int/udiv.rs index b393ac6d..1ee670c7 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -241,6 +241,11 @@ intrinsics! { rem } + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { + udivmod_inner!(n, d, rem, u64) + } + #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { @@ -255,16 +260,9 @@ intrinsics! 
{ rem } - /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { - udivmod_inner!(n, d, rem, u64) - } - #[win64_128bit_abi_hack] /// Returns `n / d` and sets `*rem = n % d` - pub extern "C" fn __udivmodti4(n: u128, - d: u128, - rem: Option<&mut u128>) -> u128 { + pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { udivmod_inner!(n, d, rem, u128) } } From 83425b17ebfbb0c64536e0985ee8e5fbf8a54538 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sun, 3 May 2020 22:40:54 -0500 Subject: [PATCH 2/8] replace old soft division code with new functions --- src/int/sdiv.rs | 23 +++++++------ src/int/udiv.rs | 86 +++++++++++-------------------------------------- 2 files changed, 33 insertions(+), 76 deletions(-) diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index 854ea00b..983c34cf 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -57,54 +57,59 @@ trait Divmod: Int { impl Divmod for i32 {} impl Divmod for i64 {} + intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] /// Returns `n / d` pub extern "C" fn __divsi3(a: i32, b: i32) -> i32 { - a.div(b) + i32_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __modsi3(a: i32, b: i32) -> i32 { - a.mod_(b) + i32_div_rem(a, b).1 } - + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __divmodsi4(a: i32, b: i32, rem: &mut i32) -> i32 { - a.divmod(b, rem, |a, b| __divsi3(a, b)) + let quo_rem = i32_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } #[maybe_use_optimized_c_shim] /// Returns `n / d` pub extern "C" fn __divdi3(a: i64, b: i64) -> i64 { - a.div(b) + i64_div_rem(a, b).0 } #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __moddi3(a: i64, b: i64) -> i64 { - a.mod_(b) + i64_div_rem(a, b).1 } #[aapcs_on_arm] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { - a.divmod(b, rem, |a, b| __divdi3(a, b)) + let quo_rem = i64_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 } #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __divti3(a: i128, b: i128) -> i128 { - a.div(b) + i128_div_rem(a, b).0 } #[win64_128bit_abi_hack] /// Returns `n % d` pub extern "C" fn __modti3(a: i128, b: i128) -> i128 { - a.mod_(b) + i128_div_rem(a, b).1 } // LLVM does not currently have a `__divmodti4` function diff --git a/src/int/udiv.rs b/src/int/udiv.rs index 1ee670c7..b312ca48 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -156,113 +156,65 @@ intrinsics! 
{ #[arm_aeabi_alias = __aeabi_uidiv] /// Returns `n / d` pub extern "C" fn __udivsi3(n: u32, d: u32) -> u32 { - // Special cases - if d == 0 { - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n == 0 { - return 0; - } - - let mut sr = d.leading_zeros().wrapping_sub(n.leading_zeros()); - - // d > n - if sr > u32::BITS - 1 { - return 0; - } - - // d == 1 - if sr == u32::BITS - 1 { - return n; - } - - sr += 1; - - // 1 <= sr <= u32::BITS - 1 - let mut q = n << (u32::BITS - sr); - let mut r = n >> sr; - - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (u32::BITS - 1)); - q = (q << 1) | carry; - - // carry = 0; - // if r > d { - // r -= d; - // carry = 1; - // } - - let s = (d.wrapping_sub(r).wrapping_sub(1)) as i32 >> (u32::BITS - 1); - carry = (s & 1) as u32; - r -= d & s as u32; - } - - (q << 1) | carry + u32_div_rem(n, d).0 } #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umodsi3(n: u32, d: u32) -> u32 { - let q = __udivsi3(n, d); - n - q * d + u32_div_rem(n, d).1 } #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmodsi4(n: u32, d: u32, rem: Option<&mut u32>) -> u32 { - let q = __udivsi3(n, d); + let quo_rem = u32_div_rem(n, d); if let Some(rem) = rem { - *rem = n - (q * d); + *rem = quo_rem.1; } - q + quo_rem.0 } #[maybe_use_optimized_c_shim] /// Returns `n / d` pub extern "C" fn __udivdi3(n: u64, d: u64) -> u64 { - __udivmoddi4(n, d, None) + u64_div_rem(n, d).0 } #[maybe_use_optimized_c_shim] /// Returns `n % d` pub extern "C" fn __umoddi3(n: u64, d: u64) -> u64 { - let mut rem = 0; - __udivmoddi4(n, d, Some(&mut rem)); - rem + u64_div_rem(n, d).1 } /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { - udivmod_inner!(n, d, rem, u64) + let quo_rem = u64_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } #[win64_128bit_abi_hack] /// Returns `n / d` pub extern "C" fn __udivti3(n: u128, d: u128) -> u128 { - __udivmodti4(n, d, None) + u128_div_rem(n, d).0 } #[win64_128bit_abi_hack] /// Returns `n % d` pub extern "C" fn __umodti3(n: u128, d: u128) -> u128 { - let mut rem = 0; - __udivmodti4(n, d, Some(&mut rem)); - rem + u128_div_rem(n, d).1 } #[win64_128bit_abi_hack] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmodti4(n: u128, d: u128, rem: Option<&mut u128>) -> u128 { - udivmod_inner!(n, d, rem, u128) + let quo_rem = u128_div_rem(n, d); + if let Some(rem) = rem { + *rem = quo_rem.1; + } + quo_rem.0 } } From 752ab52a7a3d26ed5a976e9611d7cdcba253705b Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sun, 19 Jul 2020 17:15:50 -0500 Subject: [PATCH 3/8] Remove erroneous `aapcs_on_arm` and add `maybe_use_optimized_c_shim` --- src/int/sdiv.rs | 2 +- src/int/udiv.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index 983c34cf..682ebeda 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -92,7 +92,7 @@ intrinsics! 
{ i64_div_rem(a, b).1 } - #[aapcs_on_arm] + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __divmoddi4(a: i64, b: i64, rem: &mut i64) -> i64 { let quo_rem = i64_div_rem(a, b); diff --git a/src/int/udiv.rs b/src/int/udiv.rs index b312ca48..e517ca86 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -187,6 +187,7 @@ intrinsics! { u64_div_rem(n, d).1 } + #[maybe_use_optimized_c_shim] /// Returns `n / d` and sets `*rem = n % d` pub extern "C" fn __udivmoddi4(n: u64, d: u64, rem: Option<&mut u64>) -> u64 { let quo_rem = u64_div_rem(n, d); From 6aef025a369849dba4ed461bf2c80bda540dce1d Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Fri, 17 Jul 2020 15:17:40 -0500 Subject: [PATCH 4/8] Remove unused code --- src/int/mod.rs | 12 ---- src/int/sdiv.rs | 58 ------------------- src/int/udiv.rs | 151 ------------------------------------------------ 3 files changed, 221 deletions(-) diff --git a/src/int/mod.rs b/src/int/mod.rs index 8a469d90..7bafb82f 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -1,17 +1,5 @@ use core::ops; -macro_rules! hty { - ($ty:ty) => { - <$ty as LargeInt>::HighHalf - }; -} - -macro_rules! os_ty { - ($ty:ty) => { - <$ty as Int>::OtherSign - }; -} - pub mod addsub; pub mod leading_zeros; pub mod mul; diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index 682ebeda..d8bb9c09 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,63 +1,5 @@ use int::Int; -trait Div: Int { - /// Returns `a / b` - fn div(self, other: Self) -> Self { - let s_a = self >> (Self::BITS - 1); - let s_b = other >> (Self::BITS - 1); - // NOTE it's OK to overflow here because of the `.unsigned()` below. - // This whole operation is computing the absolute value of the inputs - // So some overflow will happen when dealing with e.g. `i64::MIN` - // where the absolute value is `(-i64::MIN) as u64` - let a = (self ^ s_a).wrapping_sub(s_a); - let b = (other ^ s_b).wrapping_sub(s_b); - let s = s_a ^ s_b; - - let r = a.unsigned().aborting_div(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Div for i32 {} -impl Div for i64 {} -impl Div for i128 {} - -trait Mod: Int { - /// Returns `a % b` - fn mod_(self, other: Self) -> Self { - let s = other >> (Self::BITS - 1); - // NOTE(wrapping_sub) see comment in the `div` - let b = (other ^ s).wrapping_sub(s); - let s = self >> (Self::BITS - 1); - let a = (self ^ s).wrapping_sub(s); - - let r = a.unsigned().aborting_rem(b.unsigned()); - (Self::from_unsigned(r) ^ s) - s - } -} - -impl Mod for i32 {} -impl Mod for i64 {} -impl Mod for i128 {} - -trait Divmod: Int { - /// Returns `a / b` and sets `*rem = n % d` - fn divmod(self, other: Self, rem: &mut Self, div: F) -> Self - where - F: Fn(Self, Self) -> Self, - { - let r = div(self, other); - // NOTE won't overflow because it's using the result from the - // previous division - *rem = self - r.wrapping_mul(other); - r - } -} - -impl Divmod for i32 {} -impl Divmod for i64 {} - - intrinsics! { #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_idiv] diff --git a/src/int/udiv.rs b/src/int/udiv.rs index e517ca86..49151594 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -1,156 +1,5 @@ use int::{Int, LargeInt}; -macro_rules! 
udivmod_inner { - ($n:expr, $d:expr, $rem:expr, $ty:ty) => {{ - let (n, d, rem) = ($n, $d, $rem); - // NOTE X is unknown, K != 0 - if n.high() == 0 { - if d.high() == 0 { - // 0 X - // --- - // 0 X - - if let Some(rem) = rem { - *rem = <$ty>::from(n.low().aborting_rem(d.low())); - } - return <$ty>::from(n.low().aborting_div(d.low())) - } else { - // 0 X - // --- - // K X - if let Some(rem) = rem { - *rem = n; - } - return 0; - }; - } - - let mut sr; - let mut q; - let mut r; - - if d.low() == 0 { - if d.high() == 0 { - // K X - // --- - // 0 0 - // NOTE This should be unreachable in safe Rust because the program will panic before - // this intrinsic is called - ::abort(); - } - - if n.low() == 0 { - // K 0 - // --- - // K 0 - if let Some(rem) = rem { - *rem = <$ty>::from_parts(0, n.high().aborting_rem(d.high())); - } - return <$ty>::from(n.high().aborting_div(d.high())) - } - - // K K - // --- - // K 0 - - if d.high().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from_parts(n.low(), n.high() & (d.high() - 1)); - } - return <$ty>::from(n.high() >> d.high().trailing_zeros()); - } - - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 2 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else if d.high() == 0 { - // K X - // --- - // 0 K - if d.low().is_power_of_two() { - if let Some(rem) = rem { - *rem = <$ty>::from(n.low() & (d.low() - 1)); - } - - if d.low() == 1 { - return n; - } else { - let sr = d.low().trailing_zeros(); - return n >> sr; - }; - } - - sr = 1 + ::BITS + d.low().leading_zeros() - n.high().leading_zeros(); - - // 2 <= sr <= u64::BITS - 1 - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } else { - // K X - // --- - // K K - sr = d.high().leading_zeros().wrapping_sub(n.high().leading_zeros()); - - // D > N - if sr > ::BITS - 1 { - if let Some(rem) = rem { - *rem = n; - } - return 0; - } - - sr += 1; - - // 1 <= sr <= ::BITS - q = n << (<$ty>::BITS - sr); - r = n >> sr; - } - - // Not a special case - // q and r are initialized with - // q = n << (u64::BITS - sr) - // r = n >> sr - // 1 <= sr <= u64::BITS - 1 - let mut carry = 0; - - // Don't use a range because they may generate references to memcpy in unoptimized code - let mut i = 0; - while i < sr { - i += 1; - - // r:q = ((r:q) << 1) | carry - r = (r << 1) | (q >> (<$ty>::BITS - 1)); - q = (q << 1) | carry as $ty; - - // carry = 0 - // if r >= d { - // r -= d; - // carry = 1; - // } - let s = (d.wrapping_sub(r).wrapping_sub(1)) as os_ty!($ty) >> (<$ty>::BITS - 1); - carry = (s & 1) as hty!($ty); - r -= d & s as $ty; - } - - if let Some(rem) = rem { - *rem = r; - } - (q << 1) | carry as $ty - }} -} - intrinsics! 
{ #[maybe_use_optimized_c_shim] #[arm_aeabi_alias = __aeabi_uidiv] From 1621c6dbf9eb0c7a4acf1c47d52254b481bb58c2 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sat, 11 Jul 2020 12:15:10 -0500 Subject: [PATCH 5/8] Use `specialized-div-rem` 1.0.0 for division algorithms --- Cargo.toml | 4 + src/int/mod.rs | 2 + src/int/sdiv.rs | 2 +- src/int/specialized_div_rem/asymmetric.rs | 169 ++++++ src/int/specialized_div_rem/binary_long.rs | 596 +++++++++++++++++++++ src/int/specialized_div_rem/delegate.rs | 226 ++++++++ src/int/specialized_div_rem/mod.rs | 295 ++++++++++ src/int/specialized_div_rem/norm_shift.rs | 106 ++++ src/int/specialized_div_rem/trifecta.rs | 441 +++++++++++++++ src/int/udiv.rs | 2 +- src/lib.rs | 1 + testcrate/Cargo.toml | 3 +- testcrate/tests/div_rem.rs | 143 +++++ 13 files changed, 1987 insertions(+), 3 deletions(-) create mode 100644 src/int/specialized_div_rem/asymmetric.rs create mode 100644 src/int/specialized_div_rem/binary_long.rs create mode 100644 src/int/specialized_div_rem/delegate.rs create mode 100644 src/int/specialized_div_rem/mod.rs create mode 100644 src/int/specialized_div_rem/norm_shift.rs create mode 100644 src/int/specialized_div_rem/trifecta.rs create mode 100644 testcrate/tests/div_rem.rs diff --git a/Cargo.toml b/Cargo.toml index d4d1ad67..353d9a73 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,10 @@ panic-handler = { path = 'crates/panic-handler' } [features] default = ["compiler-builtins"] +# Some algorithms benefit from inline assembly, but some compiler backends do +# not support it, so inline assembly is only enabled when this flag is set. +asm = [] + # Enable compilation of C code in compiler-rt, filling in some more optimized # implementations and also filling in unimplemented intrinsics c = ["cc"] diff --git a/src/int/mod.rs b/src/int/mod.rs index 7bafb82f..5cfad1d6 100644 --- a/src/int/mod.rs +++ b/src/int/mod.rs @@ -1,5 +1,7 @@ use core::ops; +mod specialized_div_rem; + pub mod addsub; pub mod leading_zeros; pub mod mul; diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index d8bb9c09..57ef03cd 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -1,4 +1,4 @@ -use int::Int; +use int::specialized_div_rem::*; intrinsics! { #[maybe_use_optimized_c_shim] diff --git a/src/int/specialized_div_rem/asymmetric.rs b/src/int/specialized_div_rem/asymmetric.rs new file mode 100644 index 00000000..861e9174 --- /dev/null +++ b/src/int/specialized_div_rem/asymmetric.rs @@ -0,0 +1,169 @@ +/// Creates unsigned and signed division functions optimized for dividing integers with the same +/// bitwidth as the largest operand in an asymmetrically sized division. For example, x86-64 has an +/// assembly instruction that can divide a 128 bit integer by a 64 bit integer if the quotient fits +/// in 64 bits. The 128 bit version of this algorithm would use that fast hardware division to +/// construct a full 128 bit by 128 bit division. +#[macro_export] +macro_rules! 
impl_asymmetric { + ( + $unsigned_name:ident, // name of the unsigned division function + $signed_name:ident, // name of the signed division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_division:ident, // function for division of a $uX by a $uX + $asymmetric_division:ident, // function for division of a $uD by a $uX + $n_h:expr, // the number of bits in a $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD,$uD) { + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + let n: u32 = $n_h * 2; + + // Many of these subalgorithms are taken from trifecta.rs, see that for better + // documentation. + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + if div_hi == 0 { + if div_lo == 0 { + $zero_div_fn() + } + if duo_hi < div_lo { + // `$uD` by `$uX` division with a quotient that will fit into a `$uX` + let (quo, rem) = unsafe { $asymmetric_division(duo, div_lo) }; + return (quo as $uD, rem as $uD) + } else if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH. + + // Some x86_64 CPUs have bad division implementations that make specializing + // this case faster. + let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } else { + // Short division using the $uD by $uX division + let (quo_hi, rem_hi) = $half_division(duo_hi, div_lo); + let tmp = unsafe { + $asymmetric_division((duo_lo as $uD) | ((rem_hi as $uD) << n), div_lo) + }; + return ((tmp.0 as $uD) | ((quo_hi as $uD) << n), tmp.1 as $uD) + } + } + + let duo_lz = duo_hi.leading_zeros(); + let div_lz = div_hi.leading_zeros(); + let rel_leading_sb = div_lz.wrapping_sub(duo_lz); + if rel_leading_sb < $n_h { + // Some x86_64 CPUs have bad hardware division implementations that make putting + // a two possibility algorithm here beneficial. We also avoid a full `$uD` + // multiplication. 
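+                // In this case only the two possibilities `quo` and `quo - 1` can occur: the
+                // estimate below divides just the most significant halves of `duo` and `div`,
+                // which can overestimate the true quotient by at most one, so multiplying the
+                // estimate back out and comparing against `duo` is enough to correct it.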
+ let shift = n - duo_lz; + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let quo = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(quo, div_lo); + let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + if (overflow != 0) || (duo < tmp) { + return ( + (quo - 1) as $uD, + duo.wrapping_add(div).wrapping_sub(tmp) + ) + } else { + return ( + quo as $uD, + duo - tmp + ) + } + } else { + // This has been adapted from + // https://www.codeproject.com/tips/785014/uint-division-modulus which was in turn + // adapted from Hacker's Delight. This is similar to the two possibility algorithm + // in that it uses only more significant parts of `duo` and `div` to divide a large + // integer with a smaller division instruction. + + let div_extra = n - div_lz; + let div_sig_n = (div >> div_extra) as $uX; + let tmp = unsafe { + $asymmetric_division(duo >> 1, div_sig_n) + }; + + let mut quo = tmp.0 >> ((n - 1) - div_lz); + if quo != 0 { + quo -= 1; + } + + // Note that this is a full `$uD` multiplication being used here + let mut rem = duo - (quo as $uD).wrapping_mul(div); + if div <= rem { + quo += 1; + rem -= div; + } + return (quo as $uD, rem) + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { + match (duo < 0, div < 0) { + (false, false) => { + let t = $unsigned_name(duo as $uD, div as $uD); + (t.0 as $iD, t.1 as $iD) + }, + (true, false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); + ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) + }, + (false, true) => { + let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(), t.1 as $iD) + }, + (true, true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); + (t.0 as $iD, (t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/binary_long.rs b/src/int/specialized_div_rem/binary_long.rs new file mode 100644 index 00000000..4c63396a --- /dev/null +++ b/src/int/specialized_div_rem/binary_long.rs @@ -0,0 +1,596 @@ +/// Creates unsigned and signed division functions that use binary long division, designed for +/// computer architectures without division instructions. These functions have good performance for +/// microarchitectures with large branch miss penalties and architectures without the ability to +/// predicate instructions. For architectures with predicated instructions, one of the algorithms +/// described in the documentation of these functions probably has higher performance, and a custom +/// assembly routine should be used instead. +#[macro_export] +macro_rules! 
impl_binary_long { + ( + $unsigned_name:ident, // name of the unsigned division function + $signed_name:ident, // name of the signed division function + $zero_div_fn:ident, // function called when division by zero is attempted + $normalization_shift:ident, // function for finding the normalization shift + $n:tt, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iX:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uX, div: $uX) -> ($uX, $uX) { + let mut duo = duo; + // handle edge cases before calling `$normalization_shift` + if div == 0 { + $zero_div_fn() + } + if duo < div { + return (0, duo) + } + + // There are many variations of binary division algorithm that could be used. This + // documentation gives a tour of different methods so that future readers wanting to + // optimize further do not have to painstakingly derive them. The SWAR variation is + // especially hard to understand without reading the less convoluted methods first. + + // You may notice that a `duo < div_original` check is included in many these + // algorithms. A critical optimization that many algorithms miss is handling of + // quotients that will turn out to have many trailing zeros or many leading zeros. This + // happens in cases of exact or close-to-exact divisions, divisions by power of two, and + // in cases where the quotient is small. The `duo < div_original` check handles these + // cases of early returns and ends up replacing other kinds of mundane checks that + // normally terminate a binary division algorithm. + // + // Something you may see in other algorithms that is not special-cased here is checks + // for division by powers of two. The `duo < div_original` check handles this case and + // more, however it can be checked up front before the bisection using the + // `((div > 0) && ((div & (div - 1)) == 0))` trick. This is not special-cased because + // compilers should handle most cases where divisions by power of two occur, and we do + // not want to add on a few cycles for every division operation just to save a few + // cycles rarely. + + // The following example is the most straightforward translation from the way binary + // long division is typically visualized: + // Dividing 178u8 (0b10110010) by 6u8 (0b110). `div` is shifted left by 5, according to + // the result from `$normalization_shift(duo, div, false)`. + // + // Step 0: `sub` is negative, so there is not full normalization, so no `quo` bit is set + // and `duo` is kept unchanged. + // duo:10110010, div_shifted:11000000, sub:11110010, quo:00000000, shl:5 + // + // Step 1: `sub` is positive, set a `quo` bit and update `duo` for next step. + // duo:10110010, div_shifted:01100000, sub:01010010, quo:00010000, shl:4 + // + // Step 2: Continue based on `sub`. The `quo` bits start accumulating. 
+ // duo:01010010, div_shifted:00110000, sub:00100010, quo:00011000, shl:3 + // duo:00100010, div_shifted:00011000, sub:00001010, quo:00011100, shl:2 + // duo:00001010, div_shifted:00001100, sub:11111110, quo:00011100, shl:1 + // duo:00001010, div_shifted:00000110, sub:00000100, quo:00011100, shl:0 + // The `duo < div_original` check terminates the algorithm with the correct quotient of + // 29u8 and remainder of 4u8 + /* + let div_original = div; + let mut shl = $normalization_shift(duo, div, false); + let mut quo = 0; + loop { + let div_shifted = div << shl; + let sub = duo.wrapping_sub(div_shifted); + // it is recommended to use `println!`s like this if functionality is unclear + /* + println!("duo:{:08b}, div_shifted:{:08b}, sub:{:08b}, quo:{:08b}, shl:{}", + duo, + div_shifted, + sub, + quo, + shl + ); + */ + if 0 <= (sub as $iX) { + duo = sub; + quo += 1 << shl; + if duo < div_original { + // this branch is optional + return (quo, duo) + } + } + if shl == 0 { + return (quo, duo) + } + shl -= 1; + } + */ + + // This restoring binary long division algorithm reduces the number of operations + // overall via: + // - `pow` can be shifted right instead of recalculating from `shl` + // - starting `div` shifted left and shifting it right for each step instead of + // recalculating from `shl` + // - The `duo < div_original` branch is used to terminate the algorithm instead of the + // `shl == 0` branch. This check is strong enough to prevent set bits of `pow` and + // `div` from being shifted off the end. This check also only occurs on half of steps + // on average, since it is behind the `(sub as $iX) >= 0` branch. + // - `shl` is now not needed by any aspect of of the loop and thus only 3 variables are + // being updated between steps + // + // There are many variations of this algorithm, but this encompases the largest number + // of architectures and does not rely on carry flags, add-with-carry, or SWAR + // complications to be decently fast. + /* + let div_original = div; + let shl = $normalization_shift(duo, div, false); + let mut div: $uX = div << shl; + let mut pow: $uX = 1 << shl; + let mut quo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iX) { + duo = sub; + quo |= pow; + if duo < div_original { + return (quo, duo) + } + } + div >>= 1; + pow >>= 1; + } + */ + + // If the architecture has flags and predicated arithmetic instructions, it is possible + // to do binary long division without branching and in only 3 or 4 instructions. This is + // a variation of a 3 instruction central loop from + // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt. + // + // What allows doing division in only 3 instructions is realizing that instead of + // keeping `duo` in place and shifting `div` right to align bits, `div` can be kept in + // place and `duo` can be shifted left. This means `div` does not have to be updated, + // but causes edge case problems and makes `duo < div_original` tests harder. Some + // architectures have an option to shift an argument in an arithmetic operation, which + // means `duo` can be shifted left and subtracted from in one instruction. The other two + // instructions are updating `quo` and undoing the subtraction if it turns out things + // were not normalized. + + /* + // Perform one binary long division step on the already normalized arguments, because + // the main. Note that this does a full normalization since the central loop needs + // `duo.leading_zeros()` to be at least 1 more than `div.leading_zeros()`. 
The original + // variation only did normalization to the nearest 4 steps, but this makes handling edge + // cases much harder. We do a full normalization and perform a binary long division + // step. In the edge case where the msbs of `duo` and `div` are set, it clears the msb + // of `duo`, then the edge case handler shifts `div` right and does another long + // division step to always insure `duo.leading_zeros() + 1 >= div.leading_zeros()`. + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + let mut quo: $uX = 1; + duo = duo.wrapping_sub(div); + if duo < div_original { + return (1 << shl, duo); + } + let div_neg: $uX; + if (div as $iX) < 0 { + // A very ugly edge case where the most significant bit of `div` is set (after + // shifting to match `duo` when its most significant bit is at the sign bit), which + // leads to the sign bit of `div_neg` being cut off and carries not happening when + // they should. This branch performs a long division step that keeps `duo` in place + // and shifts `div` down. + div >>= 1; + div_neg = div.wrapping_neg(); + let (sub, carry) = duo.overflowing_add(div_neg); + duo = sub; + quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); + if !carry { + duo = duo.wrapping_add(div); + } + shl -= 1; + } else { + div_neg = div.wrapping_neg(); + } + // The add-with-carry that updates `quo` needs to have the carry set when a normalized + // subtract happens. Using `duo.wrapping_shl(1).overflowing_sub(div)` to do the + // subtraction generates a carry when an unnormalized subtract happens, which is the + // opposite of what we want. Instead, we use + // `duo.wrapping_shl(1).overflowing_add(div_neg)`, where `div_neg` is negative `div`. + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + // `ADDS duo, div, duo, LSL #1` + // (add `div` to `duo << 1` and set flags) + let (sub, carry) = duo.wrapping_shl(1).overflowing_add(div_neg); + duo = sub; + // `ADC quo, quo, quo` + // (add with carry). Effectively shifts `quo` left by 1 and sets the least + // significant bit to the carry. + quo = quo.wrapping_add(quo).wrapping_add(carry as $uX); + // `ADDCC duo, duo, div` + // (add if carry clear). Undoes the subtraction if no carry was generated. + if !carry { + duo = duo.wrapping_add(div); + } + } + return (quo, duo >> shl); + */ + + // This is the SWAR (SIMD within in a register) restoring division algorithm. + // This combines several ideas of the above algorithms: + // - If `duo` is shifted left instead of shifting `div` right like in the 3 instruction + // restoring division algorithm, some architectures can do the shifting and + // subtraction step in one instruction. + // - `quo` can be constructed by adding powers-of-two to it or shifting it left by one + // and adding one. + // - Every time `duo` is shifted left, there is another unused 0 bit shifted into the + // LSB, so what if we use those bits to store `quo`? + // Through a complex setup, it is possible to manage `duo` and `quo` in the same + // register, and perform one step with 2 or 3 instructions. The only major downsides are + // that there is significant setup (it is only saves instructions if `shl` is + // approximately more than 4), `duo < div_original` checks are impractical once SWAR is + // initiated, and the number of division steps taken has to be exact (we cannot do more + // division steps than `shl`, because it introduces edge cases where quotient bits in + // `duo` start to collide with the real part of `div`. 
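+            // The block comment below walks through this SWAR scheme step by step, including a
+            // worked 89 / 3 example; the variant that is actually compiled appears further down,
+            // after the discussion of code generation.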
+ /* + // first step. The quotient bit is stored in `quo` for now + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + + let mask: $uX; + if (div as $iX) < 0 { + // deal with same edge case as the 3 instruction restoring division algorithm, but + // the quotient bit from this step also has to be stored in `quo` + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + mask = tmp - 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + // restore + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + } else { + mask = quo - 1; + } + // There is now room for quotient bits in `duo`. + + // Note that `div` is already shifted left and has `shl` unset bits. We subtract 1 from + // `div` and end up with the subset of `shl` bits being all being set. This subset acts + // just like a two's complement negative one. The subset of `div` containing the divisor + // had 1 subtracted from it, but a carry will always be generated from the `shl` subset + // as long as the quotient stays positive. + // + // When the modified `div` is subtracted from `duo.wrapping_shl(1)`, the `shl` subset + // adds a quotient bit to the least significant bit. + // For example, 89 (0b01011001) divided by 3 (0b11): + // + // shl:4, div:0b00110000 + // first step: + // duo:0b01011001 + // + div_neg:0b11010000 + // ____________________ + // 0b00101001 + // quo is set to 0b00010000 and mask is set to 0b00001111 for later + // + // 1 is subtracted from `div`. I will differentiate the `shl` part of `div` and the + // quotient part of `duo` with `^`s. + // chars. + // div:0b00110000 + // ^^^^ + // + 0b11111111 + // ________________ + // 0b00101111 + // ^^^^ + // div_neg:0b11010001 + // + // first SWAR step: + // duo_shl1:0b01010010 + // ^ + // + div_neg:0b11010001 + // ____________________ + // 0b00100011 + // ^ + // second: + // duo_shl1:0b01000110 + // ^^ + // + div_neg:0b11010001 + // ____________________ + // 0b00010111 + // ^^ + // third: + // duo_shl1:0b00101110 + // ^^^ + // + div_neg:0b11010001 + // ____________________ + // 0b11111111 + // ^^^ + // 3 steps resulted in the quotient with 3 set bits as expected, but currently the real + // part of `duo` is negative and the third step was an unnormalized step. The restore + // branch then restores `duo`. Note that the restore branch does not shift `duo` left. + // + // duo:0b11111111 + // ^^^ + // + div:0b00101111 + // ^^^^ + // ________________ + // 0b00101110 + // ^^^ + // `duo` is now back in the `duo_shl1` state it was at in the the third step, with an + // unset quotient bit. + // + // final step (`shl` was 4, so exactly 4 steps must be taken) + // duo_shl1:0b01011100 + // ^^^^ + // + div_neg:0b11010001 + // ____________________ + // 0b00101101 + // ^^^^ + // The quotient includes the `^` bits added with the `quo` bits from the beginning that + // contained the first step and potential edge case step, + // `quo:0b00010000 + (duo:0b00101101 & mask:0b00001111) == 0b00011101 == 29u8`. + // The remainder is the bits remaining in `duo` that are not part of the quotient bits, + // `duo:0b00101101 >> shl == 0b0010 == 2u8`. 
+ let div: $uX = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + duo = duo.wrapping_shl(1).wrapping_sub(div); + if (duo as $iX) < 0 { + // restore + duo = duo.wrapping_add(div); + } + } + // unpack the results of SWAR + return ((duo & mask) | quo, duo >> shl); + */ + + // The problem with the conditional restoring SWAR algorithm above is that, in practice, + // it requires assembly code to bring out its full unrolled potential (It seems that + // LLVM can't use unrolled conditionals optimally and ends up erasing all the benefit + // that my algorithm intends. On architectures without predicated instructions, the code + // gen is especially bad. We need a default software division algorithm that is + // guaranteed to get decent code gen for the central loop. + + // For non-SWAR algorithms, there is a way to do binary long division without + // predication or even branching. This involves creating a mask from the sign bit and + // performing different kinds of steps using that. + /* + let shl = $normalization_shift(duo, div, true); + let mut div: $uX = div << shl; + let mut pow: $uX = 1 << shl; + let mut quo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + let sign_mask = !((sub as $iX).wrapping_shr($n - 1) as $uX); + duo -= div & sign_mask; + quo |= pow & sign_mask; + div >>= 1; + pow >>= 1; + if pow == 0 { + break; + } + } + return (quo, duo); + */ + // However, it requires about 4 extra operations (smearing the sign bit, negating the + // mask, and applying the mask twice) on top of the operations done by the actual + // algorithm. With SWAR however, just 2 extra operations are needed, making it + // practical and even the most optimal algorithm for some architectures. + + // What we do is use custom assembly for predicated architectures that need software + // division, and for the default algorithm use a mask based restoring SWAR algorithm + // without conditionals or branches. On almost all architectures, this Rust code is + // guaranteed to compile down to 5 assembly instructions or less for each step, and LLVM + // will unroll it in a decent way. + + // standard opening for SWAR algorithm with first step and edge case handling + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + let mask: $uX; + if (div as $iX) < 0 { + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + mask = tmp - 1; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + } else { + mask = quo - 1; + } + + // central loop + div = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break + } + i -= 1; + // shift left 1 and subtract + duo = duo.wrapping_shl(1).wrapping_sub(div); + // create mask + let mask = (duo as $iX).wrapping_shr($n - 1) as $uX; + // restore + duo = duo.wrapping_add(div & mask); + } + // unpack + return ((duo & mask) | quo, duo >> shl); + + // miscellanious binary long division algorithms that might be better for specific + // architectures + + // Another kind of long division uses an interesting fact that `div` and `pow` can be + // negated when `duo` is negative to perform a "negated" division step that works in + // place of any normalization mechanism. 
This is a non-restoring division algorithm that + // is very similar to the non-restoring division algorithms that can be found on the + // internet, except there is only one test for `duo < 0`. The subtraction from `quo` can + // be viewed as shifting the least significant set bit right (e.x. if we enter a series + // of negated binary long division steps starting with `quo == 0b1011_0000` and + // `pow == 0b0000_1000`, `quo` will progress like this: 0b1010_1000, 0b1010_0100, + // 0b1010_0010, 0b1010_0001). + /* + let div_original = div; + let shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + let mut pow: $uX = 1 << shl; + let mut quo: $uX = pow; + duo = duo.wrapping_sub(div); + if duo < div_original { + return (quo, duo); + } + div >>= 1; + pow >>= 1; + loop { + if (duo as $iX) < 0 { + // Negated binary long division step. + duo = duo.wrapping_add(div); + quo = quo.wrapping_sub(pow); + } else { + // Normal long division step. + if duo < div_original { + return (quo, duo) + } + duo = duo.wrapping_sub(div); + quo = quo.wrapping_add(pow); + } + pow >>= 1; + div >>= 1; + } + */ + + // This is the Nonrestoring SWAR algorithm, combining the nonrestoring algorithm with + // SWAR techniques that makes the only difference between steps be negation of `div`. + // If there was an architecture with an instruction that negated inputs to an adder + // based on conditionals, and in place shifting (or a three input addition operation + // that can have `duo` as two of the inputs to effectively shift it left by 1), then a + // single instruction central loop is possible. Microarchitectures often have inputs to + // their ALU that can invert the arguments and carry in of adders, but the architectures + // unfortunately do not have an instruction to dynamically invert this input based on + // conditionals. + /* + // SWAR opening + let div_original = div; + let mut shl = $normalization_shift(duo, div, true); + let mut div: $uX = (div << shl); + duo = duo.wrapping_sub(div); + let mut quo: $uX = 1 << shl; + if duo < div_original { + return (quo, duo); + } + let mask: $uX; + if (div as $iX) < 0 { + div >>= 1; + shl -= 1; + let tmp = 1 << shl; + let sub = duo.wrapping_sub(div); + if (sub as $iX) >= 0 { + // restore + duo = sub; + quo |= tmp; + } + if duo < div_original { + return (quo, duo); + } + mask = tmp - 1; + } else { + mask = quo - 1; + } + + // central loop + let div: $uX = div.wrapping_sub(1); + let mut i = shl; + loop { + if i == 0 { + break; + } + i -= 1; + // note: the `wrapping_shl(1)` can be factored out, but would require another + // restoring division step to prevent `(duo as $iX)` from overflowing + if (duo as $iX) < 0 { + // Negated binary long division step. + duo = duo.wrapping_shl(1).wrapping_add(div); + } else { + // Normal long division step. + duo = duo.wrapping_shl(1).wrapping_sub(div); + } + } + if (duo as $iX) < 0 { + // Restore. This was not needed in the original nonrestoring algorithm because of + // the `duo < div_original` checks. + duo = duo.wrapping_add(div); + } + // unpack + return ((duo & mask) | quo, duo >> shl); + */ + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iX, div: $iX) -> ($iX, $iX) { + // There is a way of doing this without any branches, but requires too many extra + // operations to be faster. 
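+            // The `match` that follows instead takes the absolute values up front and patches
+            // the signs afterwards: the quotient is negated exactly when the operand signs
+            // differ, and the remainder always takes the sign of `duo`.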
+ /* + let duo_s = duo >> ($n - 1); + let div_s = div >> ($n - 1); + let duo = (duo ^ duo_s).wrapping_sub(duo_s); + let div = (div ^ div_s).wrapping_sub(div_s); + let quo_s = duo_s ^ div_s; + let rem_s = duo_s; + let tmp = $unsigned_name(duo as $uX, div as $uX); + ( + ((tmp.0 as $iX) ^ quo_s).wrapping_sub(quo_s), + ((tmp.1 as $iX) ^ rem_s).wrapping_sub(rem_s), + ) + */ + + match (duo < 0, div < 0) { + (false, false) => { + let t = $unsigned_name(duo as $uX, div as $uX); + (t.0 as $iX, t.1 as $iX) + }, + (true, false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX, div as $uX); + ((t.0 as $iX).wrapping_neg(), (t.1 as $iX).wrapping_neg()) + }, + (false, true) => { + let t = $unsigned_name(duo as $uX, div.wrapping_neg() as $uX); + ((t.0 as $iX).wrapping_neg(), t.1 as $iX) + }, + (true, true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uX, div.wrapping_neg() as $uX); + (t.0 as $iX, (t.1 as $iX).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/delegate.rs b/src/int/specialized_div_rem/delegate.rs new file mode 100644 index 00000000..1ba72431 --- /dev/null +++ b/src/int/specialized_div_rem/delegate.rs @@ -0,0 +1,226 @@ +/// Creates unsigned and signed division functions that use a combination of hardware division and +/// binary long division to divide integers larger than what hardware division by itself can do. This +/// function is intended for microarchitectures that have division hardware, but not fast enough +/// multiplication hardware for `impl_trifecta` to be faster. +#[macro_export] +macro_rules! impl_delegate { + ( + $unsigned_name:ident, // name of the unsigned division function + $signed_name:ident, // name of the signed division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_normalization_shift:ident, // function for finding the normalization shift of $uX + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD. + $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) { + // The two possibility algorithm, undersubtracting long division algorithm, or any kind + // of reciprocal based algorithm will not be fastest, because they involve large + // multiplications that we assume to not be fast enough relative to the divisions to + // outweigh setup times. + + // the number of bits in a $uX + let n = $n_h * 2; + + let duo_lo = duo as $uX; + let duo_hi = (duo >> n) as $uX; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + match (div_lo == 0, div_hi == 0, duo_hi == 0) { + (true, true, _) => { + $zero_div_fn() + } + (_, false, true) => { + // `duo` < `div` + return (0, duo) + } + (false, true, true) => { + // delegate to smaller division + let tmp = $half_division(duo_lo, div_lo); + return (tmp.0 as $uD, tmp.1 as $uD) + } + (false, true, false) => { + if duo_hi < div_lo { + // `quo_hi` will always be 0. 
This performs a binary long division algorithm + // to zero `duo_hi` followed by a half division. + + // We can calculate the normalization shift using only `$uX` size functions. + // If we calculated the normalization shift using + // `$half_normalization_shift(duo_hi, div_lo false)`, it would break the + // assumption the function has that the first argument is more than the + // second argument. If the arguments are switched, the assumption holds true + // since `duo_hi < div_lo`. + let norm_shift = $half_normalization_shift(div_lo, duo_hi, false); + let shl = if norm_shift == 0 { + // Consider what happens if the msbs of `duo_hi` and `div_lo` align with + // no shifting. The normalization shift will always return + // `norm_shift == 0` regardless of whether it is fully normalized, + // because `duo_hi < div_lo`. In that edge case, `n - norm_shift` would + // result in shift overflow down the line. For the edge case, because + // both `duo_hi < div_lo` and we are comparing all the significant bits + // of `duo_hi` and `div`, we can make `shl = n - 1`. + n - 1 + } else { + // We also cannot just use `shl = n - norm_shift - 1` in the general + // case, because when we are not in the edge case comparing all the + // significant bits, then the full `duo < div` may not be true and thus + // breaks the division algorithm. + n - norm_shift + }; + + // The 3 variable restoring division algorithm (see binary_long.rs) is ideal + // for this task, since `pow` and `quo` can be `$uX` and the delegation + // check is simple. + let mut div: $uD = div << shl; + let mut pow_lo: $uX = 1 << shl; + let mut quo_lo: $uX = 0; + let mut duo = duo; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 { + // Delegate to get the rest of the quotient. Note that the + // `div_lo` here is the original unshifted `div`. + let tmp = $half_division(duo as $uX, div_lo); + return ((quo_lo | tmp.0) as $uD, tmp.1 as $uD) + } + } + div >>= 1; + pow_lo >>= 1; + } + } else if duo_hi == div_lo { + // `quo_hi == 1`. This branch is cheap and helps with edge cases. + let tmp = $half_division(duo as $uX, div as $uX); + return ((1 << n) | (tmp.0 as $uD), tmp.1 as $uD) + } else { + // `div_lo < duo_hi` + // `rem_hi == 0` + if (div_lo >> $n_h) == 0 { + // Short division of $uD by a $uH, using $uX by $uX division + let div_0 = div_lo as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } + + // This is basically a short division composed of a half division for the hi + // part, specialized 3 variable binary long division in the middle, and + // another half division for the lo part. + let duo_lo = duo as $uX; + let tmp = $half_division(duo_hi, div_lo); + let quo_hi = tmp.0; + let mut duo = (duo_lo as $uD) | ((tmp.1 as $uD) << n); + // This check is required to avoid breaking the long division below. + if duo < div { + return ((quo_hi as $uD) << n, duo); + } + + // The half division handled all shift alignments down to `n`, so this + // division can continue with a shift of `n - 1`. 
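+                    // Continue with the same 3 variable restoring division loop as above,
+                    // delegating to the half division (with the original unshifted `div_lo`)
+                    // as soon as the upper half of `duo` is cleared.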
+ let mut div: $uD = div << (n - 1); + let mut pow_lo: $uX = 1 << (n - 1); + let mut quo_lo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + let duo_hi = (duo >> n) as $uX; + if duo_hi == 0 { + // Delegate to get the rest of the quotient. Note that the + // `div_lo` here is the original unshifted `div`. + let tmp = $half_division(duo as $uX, div_lo); + return ( + (tmp.0) as $uD | (quo_lo as $uD) | ((quo_hi as $uD) << n), + tmp.1 as $uD + ); + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } + (_, false, false) => { + // Full $uD by $uD binary long division. `quo_hi` will always be 0. + if duo < div { + return (0, duo); + } + let div_original = div; + let shl = $half_normalization_shift(duo_hi, div_hi, false); + let mut duo = duo; + let mut div: $uD = div << shl; + let mut pow_lo: $uX = 1 << shl; + let mut quo_lo: $uX = 0; + loop { + let sub = duo.wrapping_sub(div); + if 0 <= (sub as $iD) { + duo = sub; + quo_lo |= pow_lo; + if duo < div_original { + return (quo_lo as $uD, duo) + } + } + div >>= 1; + pow_lo >>= 1; + } + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { + match (duo < 0, div < 0) { + (false, false) => { + let t = $unsigned_name(duo as $uD, div as $uD); + (t.0 as $iD, t.1 as $iD) + }, + (true, false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); + ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) + }, + (false, true) => { + let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(), t.1 as $iD) + }, + (true, true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); + (t.0 as $iD, (t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs new file mode 100644 index 00000000..5a034dcf --- /dev/null +++ b/src/int/specialized_div_rem/mod.rs @@ -0,0 +1,295 @@ +// TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this +#![allow(unused_unsafe)] + +//! This `specialized_div_rem` module is originally from version 1.0.0 of the +//! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this +//! module, since unoptimized compilation may generate references to `memcpy`. +//! +//! The purpose of these macros is to easily change the both the division algorithm used +//! for a given integer size and the half division used by that algorithm. The way +//! functions call each other is also constructed such that linkers will find the chain of +//! software and hardware divisions needed for every size of signed and unsigned division. +//! For example, most target compilations do the following: +//! +//! - Many 128 bit division functions like `u128::wrapping_div` use +//! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there +//! is not a 128 bit by 128 bit hardware division function in most architectures. +//! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because +//! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just +//! one function to calculate both the quotient and remainder. If configuration flags +//! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm, +//! which requires the half sized division `u64_by_u64_div_rem`. If the architecture +//! 
supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be +//! reduced to those instructions. Note that we do not specify the half size division +//! directly to be `__udivdi3`, because hardware division would never be introduced. +//! - If the architecture does not supply a 64 bit hardware division instruction, u64 +//! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem` +//! which is defined by `impl_delegate!`. The half division for this algorithm is +//! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more +//! software division algorithms. +//! - If the architecture does not supply a 32 bit hardware instruction, linkers will +//! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half +//! division, so the chain of calls ends here. +//! +//! On some architectures like x86_64, an asymmetrically sized division is supplied, in +//! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to +//! extend the 128 by 64 bit division to a full 128 by 128 bit division. + +// `allow(dead_code)` is used in various places, because the configuration code would otherwise be +// ridiculously complex + +#[macro_use] +mod norm_shift; + +#[macro_use] +mod binary_long; + +#[macro_use] +mod delegate; + +#[macro_use] +mod trifecta; + +#[macro_use] +mod asymmetric; + +/// The behavior of all divisions by zero is controlled by this function. This function should be +/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or +/// `core/std::unchecked_div/rem` are directly used without a zero check in front. +fn zero_div_fn() -> ! { + // TODO: change this once the algorithms are verified + //unsafe {core::hint::unreachable_unchecked()} + ::abort() +} + +// The `B` extension on RISC-V determines if a CLZ assembly instruction exists +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +const USE_LZ: bool = cfg!(target_feature = "b"); + +#[cfg(target_arch = "arm")] +const USE_LZ: bool = if cfg!(target_feature = "thumb-mode") { + // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is supported. This + // is needed to successfully differentiate between targets like `thumbv8.base` and + // `thumbv8.main`. + cfg!(target_feature = "v6t2") +} else { + // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is supported. + // Technically, ARMv5T was the first to have CLZ, but the "v5t" target feature does not seem to + // work. + cfg!(target_feature = "v5te") +}; + +// All other targets Rust supports have CLZ instructions +#[cfg(not(any(target_arch = "arm", target_arch = "riscv32", target_arch = "riscv64")))] +const USE_LZ: bool = true; + +impl_normalization_shift!( + u32_normalization_shift, + USE_LZ, + 32, + u32, + i32, + allow(dead_code) +); +impl_normalization_shift!( + u64_normalization_shift, + USE_LZ, + 64, + u64, + i64, + allow(dead_code) +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// `checked_div` and `checked_rem` are used to avoid bringing in panic function +/// dependencies. +#[inline] +fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { + if let Some(quo) = duo.checked_div(div) { + if let Some(rem) = duo.checked_rem(div) { + return (quo, rem); + } + } + zero_div_fn() +} + +// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a +// microarchitecture can multiply and divide. 
We decide to be optimistic and assume `trifecta` is +// faster if the target pointer width is at least 64. +#[cfg(all( + not(all(feature = "asm", target_arch = "x86_64")), + not(any(target_pointer_width = "16", target_pointer_width = "32")) +))] +impl_trifecta!( + u128_div_rem, + i128_div_rem, + zero_div_fn, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128,; +); + +// If the pointer width less than 64, then the target architecture almost certainly does not have +// the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster. +#[cfg(all( + not(all(feature = "asm", target_arch = "x86_64")), + any(target_pointer_width = "16", target_pointer_width = "32") +))] +impl_delegate!( + u128_div_rem, + i128_div_rem, + zero_div_fn, + u64_normalization_shift, + u64_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128,; +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// +/// # Safety +/// +/// If the quotient does not fit in a `u64`, a floating point exception occurs. +/// If `div == 0`, then a division by zero exception occurs. +#[cfg(all(feature = "asm", target_arch = "x86_64"))] +#[inline] +unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) { + let duo_lo = duo as u64; + let duo_hi = (duo >> 64) as u64; + let quo: u64; + let rem: u64; + unsafe { + // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this) + // by `div`. The quotient is stored in rax and the remainder in rdx. + asm!( + "div {0}", + in(reg) div, + inlateout("rax") duo_lo => quo, + inlateout("rdx") duo_hi => rem, + options(pure, nomem, nostack) + ); + } + (quo, rem) +} + +// use `asymmetric` instead of `trifecta` on x86_64 +#[cfg(all(feature = "asm", target_arch = "x86_64"))] +impl_asymmetric!( + u128_div_rem, + i128_div_rem, + zero_div_fn, + u64_by_u64_div_rem, + u128_by_u64_div_rem, + 32, + u32, + u64, + u128, + i128,; +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// `checked_div` and `checked_rem` are used to avoid bringing in panic function +/// dependencies. +#[inline] +#[allow(dead_code)] +fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) { + if let Some(quo) = duo.checked_div(div) { + if let Some(rem) = duo.checked_rem(div) { + return (quo, rem); + } + } + zero_div_fn() +} + +// When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger +// than register size. +#[cfg(all( + not(all(feature = "asm", target_arch = "x86")), + not(target_pointer_width = "64") +))] +impl_delegate!( + u64_div_rem, + i64_div_rem, + zero_div_fn, + u32_normalization_shift, + u32_by_u32_div_rem, + 16, + u16, + u32, + u64, + i64,; +); + +// When not on x86 and the pointer width is 64, use `binary_long`. +#[cfg(all( + not(all(feature = "asm", target_arch = "x86")), + target_pointer_width = "64" +))] +impl_binary_long!( + u64_div_rem, + i64_div_rem, + zero_div_fn, + u64_normalization_shift, + 64, + u64, + i64,; +); + +/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. +/// +/// # Safety +/// +/// If the quotient does not fit in a `u32`, a floating point exception occurs. +/// If `div == 0`, then a division by zero exception occurs. 
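The safety notes above mirror the `x86_64` variant earlier in this module: the hardware `div` instruction faults unless the divisor is nonzero and the quotient fits in the narrow register. A minimal sketch of that precondition from the caller's side (the wrapper name is hypothetical and not part of this patch):

fn checked_u64_by_u32_div_rem(duo: u64, div: u32) -> Option<(u32, u32)> {
    // `duo >> 32 < div` implies `duo < div * 2^32`, so the quotient fits in a
    // `u32` and a narrowing hardware division cannot fault.
    if div != 0 && (duo >> 32) < div as u64 {
        Some(((duo / div as u64) as u32, (duo % div as u64) as u32))
    } else {
        None
    }
}

The `impl_asymmetric!` expansions are expected to establish bounds like this before the unsafe assembly functions are reached.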
+#[cfg(all(feature = "asm", target_arch = "x86"))] +#[inline] +unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) { + let duo_lo = duo as u32; + let duo_hi = (duo >> 32) as u32; + let quo: u32; + let rem: u32; + unsafe { + // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this) + // by `div`. The quotient is stored in rax and the remainder in rdx. + asm!( + "div {0}", + in(reg) div, + inlateout("rax") duo_lo => quo, + inlateout("rdx") duo_hi => rem, + options(pure, nomem, nostack) + ); + } + (quo, rem) +} + +// use `asymmetric` instead of `delegate` on x86 +#[cfg(all(feature = "asm", target_arch = "x86"))] +impl_asymmetric!( + u64_div_rem, + i64_div_rem, + zero_div_fn, + u32_by_u32_div_rem, + u64_by_u32_div_rem, + 16, + u16, + u32, + u64, + i64,; +); + +// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division +impl_binary_long!( + u32_div_rem, + i32_div_rem, + zero_div_fn, + u32_normalization_shift, + 32, + u32, + i32,; +); diff --git a/src/int/specialized_div_rem/norm_shift.rs b/src/int/specialized_div_rem/norm_shift.rs new file mode 100644 index 00000000..33348b37 --- /dev/null +++ b/src/int/specialized_div_rem/norm_shift.rs @@ -0,0 +1,106 @@ +/// Creates a function used by some division algorithms to compute the "normalization shift". +#[macro_export] +macro_rules! impl_normalization_shift { + ( + $name:ident, // name of the normalization shift function + // boolean for if `$uX::leading_zeros` should be used (if an architecture does not have a + // hardware instruction for `usize::leading_zeros`, then this should be `true`) + $use_lz:ident, + $n:tt, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer type for the inputs of `$name` + $iX:ident, // signed integer type for the inputs of `$name` + $($unsigned_attr:meta),* // attributes for the function + ) => { + /// Finds the shift left that the divisor `div` would need to be normalized for a binary + /// long division step with the dividend `duo`. NOTE: This function assumes that these edge + /// cases have been handled before reaching it: + /// ` + /// if div == 0 { + /// panic!("attempt to divide by zero") + /// } + /// if duo < div { + /// return (0, duo) + /// } + /// ` + /// + /// Normalization is defined as (where `shl` is the output of this function): + /// ` + /// if duo.leading_zeros() != (div << shl).leading_zeros() { + /// // If the most significant bits of `duo` and `div << shl` are not in the same place, + /// // then `div << shl` has one more leading zero than `duo`. + /// assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros()); + /// // Also, `2*(div << shl)` is not more than `duo` (otherwise the first division step + /// // would not be able to clear the msb of `duo`) + /// assert!(duo < (div << (shl + 1))); + /// } + /// if full_normalization { + /// // Some algorithms do not need "full" normalization, which means that `duo` is + /// // larger than `div << shl` when the most significant bits are aligned. + /// assert!((div << shl) <= duo); + /// } + /// ` + /// + /// Note: If the software bisection algorithm is being used in this function, it happens + /// that full normalization always occurs, so be careful that new algorithms are not + /// invisibly depending on this invariant when `full_normalization` is set to `false`. 
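As a concrete illustration of the shift described above, here is a standalone sketch of the `leading_zeros` branch only (the function name is hypothetical; the macro below generates the real thing, including the bisection fallback):

fn normalization_shift_u32(duo: u32, div: u32, full_normalization: bool) -> usize {
    // assumes the documented preconditions: `div != 0` and `duo >= div`
    let mut shl = (div.leading_zeros() - duo.leading_zeros()) as usize;
    if full_normalization && duo < (div << shl) {
        // aligning the most significant bits can overshoot by one
        shl -= 1;
    }
    shl
}

#[test]
fn normalization_shift_examples() {
    // msb alignment of 0b1101_0000 and 0b101 gives shl = 5, and
    // 0b101 << 5 = 0b1010_0000 <= 0b1101_0000, so no adjustment is needed
    assert_eq!(normalization_shift_u32(0b1101_0000, 0b101, true), 5);
    // for 0b1001_0000, 0b101 << 5 = 0b1010_0000 is too large, so full
    // normalization backs the shift off to 4
    assert_eq!(normalization_shift_u32(0b1001_0000, 0b101, true), 4);
}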
+ $( + #[$unsigned_attr] + )* + fn $name(duo: $uX, div: $uX, full_normalization: bool) -> usize { + // We have to find the leading zeros of `div` to know where its msb (most significant + // set bit) is to even begin binary long division. It is also good to know where the msb + // of `duo` is so that useful work can be started instead of shifting `div` for all + // possible quotients (many division steps are wasted if `duo.leading_zeros()` is large + // and `div` starts out being shifted all the way to the msb). Aligning the msbs of + // `div` and `duo` could be done by shifting `div` left by + // `div.leading_zeros() - duo.leading_zeros()`, but some CPUs without division hardware + // also do not have single instructions for calculating `leading_zeros`. Instead of + // software doing two bisections to find the two `leading_zeros`, we do one bisection to + // find `div.leading_zeros() - duo.leading_zeros()` without actually knowing either of + // the leading zeros values. + + let mut shl: usize; + if $use_lz { + shl = (div.leading_zeros() - duo.leading_zeros()) as usize; + if full_normalization { + if duo < (div << shl) { + // when the msb of `duo` and `div` are aligned, the resulting `div` may be + // larger than `duo`, so we decrease the shift by 1. + shl -= 1; + } + } + } else { + let mut test = duo; + shl = 0usize; + let mut lvl = $n >> 1; + loop { + let tmp = test >> lvl; + // It happens that a final `duo < (div << shl)` check is not needed, because the + // `div <= tmp` check insures that the msb of `test` never passes the msb of + // `div`, and any set bits shifted off the end of `test` would still keep + // `div <= tmp` true. + if div <= tmp { + test = tmp; + shl += lvl; + } + // narrow down bisection + lvl >>= 1; + if lvl == 0 { + break + } + } + } + // tests the invariants that should hold before beginning binary long division + /* + if full_normalization { + assert!((div << shl) <= duo); + } + if duo.leading_zeros() != (div << shl).leading_zeros() { + assert_eq!(duo.leading_zeros() + 1, (div << shl).leading_zeros()); + assert!(duo < (div << (shl + 1))); + } + */ + shl + } + } +} diff --git a/src/int/specialized_div_rem/trifecta.rs b/src/int/specialized_div_rem/trifecta.rs new file mode 100644 index 00000000..e76516f3 --- /dev/null +++ b/src/int/specialized_div_rem/trifecta.rs @@ -0,0 +1,441 @@ +/// Creates unsigned and signed division functions optimized for division of integers with bitwidths +/// larger than the largest hardware integer division supported. These functions use large radix +/// division algorithms that require both fast division and very fast widening multiplication on the +/// target microarchitecture. Otherwise, `impl_delegate` should be used instead. +#[macro_export] +macro_rules! 
impl_trifecta { + ( + $unsigned_name:ident, // name of the unsigned division function + $signed_name:ident, // name of the signed division function + $zero_div_fn:ident, // function called when division by zero is attempted + $half_division:ident, // function for division of a $uX by a $uX + $n_h:expr, // the number of bits in $iH or $uH + $uH:ident, // unsigned integer with half the bit width of $uX + $uX:ident, // unsigned integer with half the bit width of $uD + $uD:ident, // unsigned integer type for the inputs and outputs of `$unsigned_name` + $iD:ident, // signed integer type for the inputs and outputs of `$signed_name` + $($unsigned_attr:meta),*; // attributes for the unsigned function + $($signed_attr:meta),* // attributes for the signed function + ) => { + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. + $( + #[$unsigned_attr] + )* + pub fn $unsigned_name(duo: $uD, div: $uD) -> ($uD, $uD) { + // This is called the trifecta algorithm because it uses three main algorithms: short + // division for small divisors, the two possibility algorithm for large divisors, and an + // undersubtracting long division algorithm for intermediate cases. + + // This replicates `carrying_mul` (rust-lang rfc #2417). LLVM correctly optimizes this + // to use a widening multiply to 128 bits on the relevant architectures. + fn carrying_mul(lhs: $uX, rhs: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(rhs as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + fn carrying_mul_add(lhs: $uX, mul: $uX, add: $uX) -> ($uX, $uX) { + let tmp = (lhs as $uD).wrapping_mul(mul as $uD).wrapping_add(add as $uD); + (tmp as $uX, (tmp >> ($n_h * 2)) as $uX) + } + + // the number of bits in a $uX + let n = $n_h * 2; + + if div == 0 { + $zero_div_fn() + } + + // Trying to use a normalization shift function will cause inelegancies in the code and + // inefficiencies for architectures with a native count leading zeros instruction. The + // undersubtracting algorithm needs both values (keeping the original `div_lz` but + // updating `duo_lz` multiple times), so we assume hardware support for fast + // `leading_zeros` calculation. + let div_lz = div.leading_zeros(); + let mut duo_lz = duo.leading_zeros(); + + // the possible ranges of `duo` and `div` at this point: + // `0 <= duo < 2^n_d` + // `1 <= div < 2^n_d` + + // quotient is 0 or 1 branch + if div_lz <= duo_lz { + // The quotient cannot be more than 1. The highest set bit of `duo` needs to be at + // least one place higher than `div` for the quotient to be more than 1. + if duo >= div { + return (1, duo - div) + } else { + return (0, duo) + } + } + + // `_sb` is the number of significant bits (from the ones place to the highest set bit) + // `{2, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // smaller division branch + if duo_lz >= n { + // `duo < 2^n` so it will fit in a $uX. `div` will also fit in a $uX (because of the + // `div_lz <= duo_lz` branch) so no numerical error. + let (quo, rem) = $half_division(duo as $uX, div as $uX); + return ( + quo as $uD, + rem as $uD + ) + } + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `1 <= div < {2^duo_sb, 2^(n_d - 1)}` + // short division branch + if div_lz >= (n + $n_h) { + // `1 <= div < {2^duo_sb, 2^n_h}` + + // It is barely possible to improve the performance of this by calculating the + // reciprocal and removing one `$half_division`, but only if the CPU can do fast + // multiplications in parallel. 
Other reciprocal based methods can remove two + // `$half_division`s, but have multiplications that cannot be done in parallel and + // reduce performance. I have decided to use this trivial short division method and + // rely on the CPU having quick divisions. + + let duo_hi = (duo >> n) as $uX; + let div_0 = div as $uH as $uX; + let (quo_hi, rem_3) = $half_division(duo_hi, div_0); + + let duo_mid = + ((duo >> $n_h) as $uH as $uX) + | (rem_3 << $n_h); + let (quo_1, rem_2) = $half_division(duo_mid, div_0); + + let duo_lo = + (duo as $uH as $uX) + | (rem_2 << $n_h); + let (quo_0, rem_1) = $half_division(duo_lo, div_0); + + return ( + (quo_0 as $uD) + | ((quo_1 as $uD) << $n_h) + | ((quo_hi as $uD) << n), + rem_1 as $uD + ) + } + + // relative leading significant bits, cannot overflow because of above branches + let lz_diff = div_lz - duo_lz; + + // `{2^n, 2^div_sb} <= duo < 2^n_d` + // `2^n_h <= div < {2^duo_sb, 2^(n_d - 1)}` + // `mul` or `mul - 1` branch + if lz_diff < $n_h { + // Two possibility division algorithm + + // The most significant bits of `duo` and `div` are within `$n_h` bits of each + // other. If we take the `n` most significant bits of `duo` and divide them by the + // corresponding bits in `div`, it produces a quotient value `quo`. It happens that + // `quo` or `quo - 1` will always be the correct quotient for the whole number. In + // other words, the bits less significant than the `n` most significant bits of + // `duo` and `div` can only influence the quotient to be one of two values. + // Because there are only two possibilities, there only needs to be one `$uH` sized + // division, a `$uH` by `$uD` multiplication, and only one branch with a few simple + // operations. + // + // Proof that the true quotient can only be `quo` or `quo - 1`. + // All `/` operators here are floored divisions. + // + // `shift` is the number of bits not in the higher `n` significant bits of `duo`. + // (definitions) + // 0. shift = n - duo_lz + // 1. duo_sig_n == duo / 2^shift + // 2. div_sig_n == div / 2^shift + // 3. quo == duo_sig_n / div_sig_n + // + // + // We are trying to find the true quotient, `true_quo`. + // 4. true_quo = duo / div. (definition) + // + // This is true because of the bits that are cut off during the bit shift. + // 5. duo_sig_n * 2^shift <= duo < (duo_sig_n + 1) * 2^shift. + // 6. div_sig_n * 2^shift <= div < (div_sig_n + 1) * 2^shift. + // + // Dividing each bound of (5) by each bound of (6) gives 4 possibilities for what + // `true_quo == duo / div` is bounded by: + // (duo_sig_n * 2^shift) / (div_sig_n * 2^shift) + // (duo_sig_n * 2^shift) / ((div_sig_n + 1) * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / (div_sig_n * 2^shift) + // ((duo_sig_n + 1) * 2^shift) / ((div_sig_n + 1) * 2^shift) + // + // Simplifying each of these four: + // duo_sig_n / div_sig_n + // duo_sig_n / (div_sig_n + 1) + // (duo_sig_n + 1) / div_sig_n + // (duo_sig_n + 1) / (div_sig_n + 1) + // + // Taking the smallest and the largest of these as the low and high bounds + // and replacing `duo / div` with `true_quo`: + // 7. duo_sig_n / (div_sig_n + 1) <= true_quo < (duo_sig_n + 1) / div_sig_n + // + // The `lz_diff < n_h` conditional on this branch makes sure that `div_sig_n` is at + // least `2^n_h`, and the `div_lz <= duo_lz` branch makes sure that the highest bit + // of `div_sig_n` is not the `2^(n - 1)` bit. + // 8. `2^(n - 1) <= duo_sig_n < 2^n` + // 9. 
`2^n_h <= div_sig_n < 2^(n - 1)` + // + // We want to prove that either + // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1)` or that + // `(duo_sig_n + 1) / div_sig_n == duo_sig_n / (div_sig_n + 1) + 1`. + // + // We also want to prove that `quo` is one of these: + // `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or + // `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n`. + // + // When 1 is added to the numerator of `duo_sig_n / div_sig_n` to produce + // `(duo_sig_n + 1) / div_sig_n`, it is not possible that the value increases by + // more than 1 with floored integer arithmetic and `div_sig_n != 0`. Consider + // `x/y + 1 < (x + 1)/y` <=> `x/y + 1 < x/y + 1/y` <=> `1 < 1/y` <=> `y < 1`. + // `div_sig_n` is a nonzero integer. Thus, + // 10. `duo_sig_n / div_sig_n == (duo_sig_n + 1) / div_sig_n` or + // `(duo_sig_n / div_sig_n) + 1 == (duo_sig_n + 1) / div_sig_n. + // + // When 1 is added to the denominator of `duo_sig_n / div_sig_n` to produce + // `duo_sig_n / (div_sig_n + 1)`, it is not possible that the value decreases by + // more than 1 with the bounds (8) and (9). Consider `x/y - 1 <= x/(y + 1)` <=> + // `(x - y)/y < x/(y + 1)` <=> `(y + 1)*(x - y) < x*y` <=> `x*y - y*y + x - y < x*y` + // <=> `x < y*y + y`. The smallest value of `div_sig_n` is `2^n_h` and the largest + // value of `duo_sig_n` is `2^n - 1`. Substituting reveals `2^n - 1 < 2^n + 2^n_h`. + // Thus, + // 11. `duo_sig_n / div_sig_n == duo_sig_n / (div_sig_n + 1)` or + // `(duo_sig_n / div_sig_n) - 1` == duo_sig_n / (div_sig_n + 1)` + // + // Combining both (10) and (11), we know that + // `quo - 1 <= duo_sig_n / (div_sig_n + 1) <= true_quo + // < (duo_sig_n + 1) / div_sig_n <= quo + 1` and therefore: + // 12. quo - 1 <= true_quo < quo + 1 + // + // In a lot of division algorithms using smaller divisions to construct a larger + // division, we often encounter a situation where the approximate `quo` value + // calculated from a smaller division is multiple increments away from the true + // `quo` value. In those algorithms, multiple correction steps have to be applied. + // Those correction steps may need more multiplications to test `duo - (quo*div)` + // again. Because of the fact that our `quo` can only be one of two values, we can + // see if `duo - (quo*div)` overflows. If it did overflow, then we know that we have + // the larger of the two values (since the true quotient is unique, and any larger + // quotient will cause `duo - (quo*div)` to be negative). Also because there is only + // one correction needed, we can calculate the remainder `duo - (true_quo*div) == + // duo - ((quo - 1)*div) == duo - (quo*div - div) == duo + div - quo*div`. + // If `duo - (quo*div)` did not overflow, then we have the correct answer. + let shift = n - duo_lz; + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let quo = $half_division(duo_sig_n, div_sig_n).0; + + // The larger `quo` value can overflow `$uD` in the right circumstances. This is a + // manual `carrying_mul_add` with overflow checking. + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + let (tmp_lo, carry) = carrying_mul(quo, div_lo); + let (tmp_hi, overflow) = carrying_mul_add(quo, div_hi, carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + if (overflow != 0) || (duo < tmp) { + return ( + (quo - 1) as $uD, + // Both the addition and subtraction can overflow, but when combined end up + // as a correct positive number. 
+ duo.wrapping_add(div).wrapping_sub(tmp) + ) + } else { + return ( + quo as $uD, + duo - tmp + ) + } + } + + // Undersubtracting long division algorithm. + // Instead of clearing a minimum of 1 bit from `duo` per iteration via binary long + // division, `n_h - 1` bits are cleared per iteration with this algorithm. It is a more + // complicated version of regular long division. Most integer division algorithms tend + // to guess a part of the quotient, and may have a larger quotient than the true + // quotient (which when multiplied by `div` will "oversubtract" the original dividend). + // They then check if the quotient was in fact too large and then have to correct it. + // This long division algorithm has been carefully constructed to always underguess the + // quotient by slim margins. This allows different subalgorithms to be blindly jumped to + // without needing an extra correction step. + // + // The only problem is that this subalgorithm will not work for many ranges of `duo` and + // `div`. Fortunately, the short division, two possibility algorithm, and other simple + // cases happen to exactly fill these gaps. + // + // For an example, consider the division of 76543210 by 213 and assume that `n_h` is + // equal to two decimal digits (note: we are working with base 10 here for readability). + // The first `sig_n_h` part of the divisor (21) is taken and is incremented by 1 to + // prevent oversubtraction. We also record the number of extra places not a part of + // the `sig_n` or `sig_n_h` parts. + // + // sig_n_h == 2 digits, sig_n == 4 digits + // + // vvvv <- `duo_sig_n` + // 76543210 + // ^^^^ <- extra places in duo, `duo_extra == 4` + // + // vv <- `div_sig_n_h` + // 213 + // ^ <- extra places in div, `div_extra == 1` + // + // The difference in extra places, `duo_extra - div_extra == extra_shl == 3`, is used + // for shifting partial sums in the long division. + // + // In the first step, the first `sig_n` part of duo (7654) is divided by + // `div_sig_n_h_add_1` (22), which results in a partial quotient of 347. This is + // multiplied by the whole divisor to make 73911, which is shifted left by `extra_shl` + // and subtracted from duo. The partial quotient is also shifted left by `extra_shl` to + // be added to `quo`. + // + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // + // Variables dependent on duo have to be updated: + // + // vvvv <- `duo_sig_n == 2632` + // 2632210 + // ^^^ <- `duo_extra == 3` + // + // `extra_shl == 2` + // + // Two more steps are taken after this and then duo fits into `n` bits, and then a final + // normal long division step is made. The partial quotients are all progressively added + // to each other in the actual algorithm, but here I have left them all in a tower that + // can be added together to produce the quotient, 359357. 
+ // + // 14 + // 443 + // 119 + // 347 + // ________ + // |76543210 + // -73911 + // 2632210 + // -25347 + // 97510 + // -94359 + // 3151 + // -2982 + // 169 <- the remainder + + let mut duo = duo; + let mut quo: $uD = 0; + + // The number of lesser significant bits not a part of `div_sig_n_h` + let div_extra = (n + $n_h) - div_lz; + + // The most significant `n_h` bits of div + let div_sig_n_h = (div >> div_extra) as $uH; + + // This needs to be a `$uX` in case of overflow from the increment + let div_sig_n_h_add1 = (div_sig_n_h as $uX) + 1; + + // `{2^n, 2^(div_sb + n_h)} <= duo < 2^n_d` + // `2^n_h <= div < {2^(duo_sb - n_h), 2^n}` + loop { + // The number of lesser significant bits not a part of `duo_sig_n` + let duo_extra = n - duo_lz; + + // The most significant `n` bits of `duo` + let duo_sig_n = (duo >> duo_extra) as $uX; + + // the two possibility algorithm requires that the difference between msbs is less + // than `n_h`, so the comparison is `<=` here. + if div_extra <= duo_extra { + // Undersubtracting long division step + let quo_part = $half_division(duo_sig_n, div_sig_n_h_add1).0 as $uD; + let extra_shl = duo_extra - div_extra; + + // Addition to the quotient. + quo += (quo_part << extra_shl); + + // Subtraction from `duo`. At least `n_h - 1` bits are cleared from `duo` here. + duo -= (div.wrapping_mul(quo_part) << extra_shl); + } else { + // Two possibility algorithm + let shift = n - duo_lz; + let duo_sig_n = (duo >> shift) as $uX; + let div_sig_n = (div >> shift) as $uX; + let quo_part = $half_division(duo_sig_n, div_sig_n).0; + let div_lo = div as $uX; + let div_hi = (div >> n) as $uX; + + let (tmp_lo, carry) = carrying_mul(quo_part, div_lo); + // The undersubtracting long division algorithm has already run once, so + // overflow beyond `$uD` bits is not possible here + let (tmp_hi, _) = carrying_mul_add(quo_part, div_hi, carry); + let tmp = (tmp_lo as $uD) | ((tmp_hi as $uD) << n); + + if duo < tmp { + return ( + quo + ((quo_part - 1) as $uD), + duo.wrapping_add(div).wrapping_sub(tmp) + ) + } else { + return ( + quo + (quo_part as $uD), + duo - tmp + ) + } + } + + duo_lz = duo.leading_zeros(); + + if div_lz <= duo_lz { + // quotient can have 0 or 1 added to it + if div <= duo { + return ( + quo + 1, + duo - div + ) + } else { + return ( + quo, + duo + ) + } + } + + // This can only happen if `div_sd < n` (because of previous "quo = 0 or 1" + // branches), but it is not worth it to unroll further. + if n <= duo_lz { + // simple division and addition + let tmp = $half_division(duo as $uX, div as $uX); + return ( + quo + (tmp.0 as $uD), + tmp.1 as $uD + ) + } + } + } + + /// Computes the quotient and remainder of `duo` divided by `div` and returns them as a + /// tuple. 
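The signed wrapper that follows maps the operands onto the unsigned function and then fixes the signs so the result matches Rust's truncating division: the quotient rounds toward zero and the remainder takes the sign of the dividend. For reference, a standalone check of those conventions (not part of the patch):

#[test]
fn truncating_division_signs() {
    assert_eq!((7i32 / 3, 7i32 % 3), (2, 1));
    assert_eq!((-7i32 / 3, -7i32 % 3), (-2, -1));
    assert_eq!((7i32 / -3, 7i32 % -3), (-2, 1));
    assert_eq!((-7i32 / -3, -7i32 % -3), (2, -1));
}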
+ $( + #[$signed_attr] + )* + pub fn $signed_name(duo: $iD, div: $iD) -> ($iD, $iD) { + match (duo < 0, div < 0) { + (false, false) => { + let t = $unsigned_name(duo as $uD, div as $uD); + (t.0 as $iD, t.1 as $iD) + }, + (true, false) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div as $uD); + ((t.0 as $iD).wrapping_neg(), (t.1 as $iD).wrapping_neg()) + }, + (false, true) => { + let t = $unsigned_name(duo as $uD, div.wrapping_neg() as $uD); + ((t.0 as $iD).wrapping_neg(), t.1 as $iD) + }, + (true, true) => { + let t = $unsigned_name(duo.wrapping_neg() as $uD, div.wrapping_neg() as $uD); + (t.0 as $iD, (t.1 as $iD).wrapping_neg()) + }, + } + } + } +} diff --git a/src/int/udiv.rs b/src/int/udiv.rs index 49151594..3cd9be93 100644 --- a/src/int/udiv.rs +++ b/src/int/udiv.rs @@ -1,4 +1,4 @@ -use int::{Int, LargeInt}; +use int::specialized_div_rem::*; intrinsics! { #[maybe_use_optimized_c_shim] diff --git a/src/lib.rs b/src/lib.rs index 34397e0d..cbd23850 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,5 @@ #![cfg_attr(feature = "compiler-builtins", compiler_builtins)] +#![cfg_attr(feature = "asm", feature(asm))] #![feature(abi_unadjusted)] #![feature(llvm_asm)] #![feature(global_asm)] diff --git a/testcrate/Cargo.toml b/testcrate/Cargo.toml index 61282af0..5c3df796 100644 --- a/testcrate/Cargo.toml +++ b/testcrate/Cargo.toml @@ -28,7 +28,8 @@ utest-cortex-m-qemu = { default-features = false, git = "https://github.com/japa utest-macros = { git = "https://github.com/japaric/utest" } [features] +default = ["asm", "mangled-names"] +asm = ["compiler_builtins/asm"] c = ["compiler_builtins/c"] mem = ["compiler_builtins/mem"] mangled-names = ["compiler_builtins/mangled-names"] -default = ["mangled-names"] diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs new file mode 100644 index 00000000..2a154f5e --- /dev/null +++ b/testcrate/tests/div_rem.rs @@ -0,0 +1,143 @@ +use rand_xoshiro::rand_core::{RngCore, SeedableRng}; +use rand_xoshiro::Xoshiro128StarStar; + +use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divti3, __modti3}; +use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4}; + +// because `__divmodti4` does not exist, we synthesize it +fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 { + *rem = __modti3(a, b); + __divti3(a, b) +} + +/// Creates intensive test functions for division functions of a certain size +macro_rules! test { + ( + $n:expr, // the number of bits in a $iX or $uX + $uX:ident, // unsigned integer that will be shifted + $iX:ident, // signed version of $uX + $test_name:ident, // name of the test function + $unsigned_name:ident, // unsigned division function + $signed_name:ident // signed division function + ) => { + #[test] + fn $test_name() { + fn assert_invariants(lhs: $uX, rhs: $uX) { + let rem: &mut $uX = &mut 0; + let quo: $uX = $unsigned_name(lhs, rhs, Some(rem)); + let rem = *rem; + if rhs <= rem || (lhs != rhs.wrapping_mul(quo).wrapping_add(rem)) { + panic!( + "unsigned division function failed with lhs:{} rhs:{} \ + expected:({}, {}) found:({}, {})", + lhs, + rhs, + lhs.wrapping_div(rhs), + lhs.wrapping_rem(rhs), + quo, + rem + ); + } + + // test the signed division function also + let lhs = lhs as $iX; + let rhs = rhs as $iX; + let mut rem: $iX = 0; + let quo: $iX = $signed_name(lhs, rhs, &mut rem); + // We cannot just test that + // `lhs == rhs.wrapping_mul(quo).wrapping_add(rem)`, but also + // need to make sure the remainder isn't larger than the divisor + // and has the correct sign. 
+ let incorrect_rem = if rem == 0 { + false + } else if rhs == $iX::MIN { + // `rhs.wrapping_abs()` would overflow, so handle this case + // separately. + (lhs.is_negative() != rem.is_negative()) || (rem == $iX::MIN) + } else { + (lhs.is_negative() != rem.is_negative()) + || (rhs.wrapping_abs() <= rem.wrapping_abs()) + }; + if incorrect_rem || lhs != rhs.wrapping_mul(quo).wrapping_add(rem) { + panic!( + "signed division function failed with lhs:{} rhs:{} \ + expected:({}, {}) found:({}, {})", + lhs, + rhs, + lhs.wrapping_div(rhs), + lhs.wrapping_rem(rhs), + quo, + rem + ); + } + } + + // Specially designed random fuzzer + let mut rng = Xoshiro128StarStar::seed_from_u64(0); + let mut lhs: $uX = 0; + let mut rhs: $uX = 0; + // all ones constant + let ones: $uX = !0; + // Alternating ones and zeros (e.x. 0b1010101010101010). This catches second-order + // problems that might occur for algorithms with two modes of operation (potentially + // there is some invariant that can be broken for large `duo` and maintained via + // alternating between modes, breaking the algorithm when it reaches the end). + let mut alt_ones: $uX = 1; + for _ in 0..($n / 2) { + alt_ones <<= 2; + alt_ones |= 1; + } + // creates a mask for indexing the bits of the type + let bit_indexing_mask = $n - 1; + for _ in 0..1_000_000 { + // Randomly OR, AND, and XOR randomly sized and shifted continuous strings of + // ones with `lhs` and `rhs`. This results in excellent fuzzing entropy such as: + // lhs:10101010111101000000000100101010 rhs: 1010101010000000000000001000001 + // lhs:10101010111101000000000101001010 rhs: 1010101010101010101010100010100 + // lhs:10101010111101000000000101001010 rhs:11101010110101010101010100001110 + // lhs:10101010000000000000000001001010 rhs:10100010100000000000000000001010 + // lhs:10101010000000000000000001001010 rhs: 10101010101010101000 + // lhs:10101010000000000000000001100000 rhs:11111111111101010101010101001111 + // lhs:10101010000000101010101011000000 rhs:11111111111101010101010100000111 + // lhs:10101010101010101010101011101010 rhs: 1010100000000000000 + // lhs:11111111110101101010101011010111 rhs: 1010100000000000000 + // The msb is set half of the time by the fuzzer, but `assert_invariants` tests + // both the signed and unsigned functions. 
+ let r0: u32 = bit_indexing_mask & rng.next_u32(); + let r1: u32 = bit_indexing_mask & rng.next_u32(); + let mask = ones.wrapping_shr(r0).rotate_left(r1); + match rng.next_u32() % 8 { + 0 => lhs |= mask, + 1 => lhs &= mask, + // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise + // the entropy gets destroyed too often + 2 | 3 => lhs ^= mask, + 4 => rhs |= mask, + 5 => rhs &= mask, + _ => rhs ^= mask, + } + // do the same for alternating ones and zeros + let r0: u32 = bit_indexing_mask & rng.next_u32(); + let r1: u32 = bit_indexing_mask & rng.next_u32(); + let mask = alt_ones.wrapping_shr(r0).rotate_left(r1); + match rng.next_u32() % 8 { + 0 => lhs |= mask, + 1 => lhs &= mask, + // both 2 and 3 to make XORs as common as ORs and ANDs combined, otherwise + // the entropy gets destroyed too often + 2 | 3 => lhs ^= mask, + 4 => rhs |= mask, + 5 => rhs &= mask, + _ => rhs ^= mask, + } + if rhs != 0 { + assert_invariants(lhs, rhs); + } + } + } + }; +} + +test!(32, u32, i32, div_rem_si4, __udivmodsi4, __divmodsi4); +test!(64, u64, i64, div_rem_di4, __udivmoddi4, __divmoddi4); +test!(128, u128, i128, div_rem_ti4, __udivmodti4, __divmodti4); From 0e6d75d746b2a0e8a948f662eec6347cc627be6d Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Tue, 21 Jul 2020 23:16:52 -0500 Subject: [PATCH 6/8] Change inlining to favor three underlying division functions --- src/int/specialized_div_rem/mod.rs | 35 ++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs index 5a034dcf..0304aa2e 100644 --- a/src/int/specialized_div_rem/mod.rs +++ b/src/int/specialized_div_rem/mod.rs @@ -113,6 +113,13 @@ fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) { zero_div_fn() } +// `inline(never)` is placed on unsigned division functions so that there are just three division +// functions (`u32_div_rem`, `u64_div_rem`, and `u128_div_rem`) backing all `compiler-builtins` +// division functions. The signed functions like `i32_div_rem` will get inlined into the +// `compiler-builtins` signed division functions, so that they directly call the three division +// functions. Otherwise, LLVM may try to inline the unsigned division functions 4 times into the +// signed division functions, which results in an explosion in code size. + // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is // faster if the target pointer width is at least 64. @@ -129,7 +136,9 @@ impl_trifecta!( u32, u64, u128, - i128,; + i128, + inline(never); + inline ); // If the pointer width less than 64, then the target architecture almost certainly does not have @@ -148,7 +157,9 @@ impl_delegate!( u32, u64, u128, - i128,; + i128, + inline(never); + inline ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. @@ -190,7 +201,9 @@ impl_asymmetric!( u32, u64, u128, - i128,; + i128, + inline(never); + inline ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. @@ -223,7 +236,9 @@ impl_delegate!( u16, u32, u64, - i64,; + i64, + inline(never); + inline ); // When not on x86 and the pointer width is 64, use `binary_long`. @@ -238,7 +253,9 @@ impl_binary_long!( u64_normalization_shift, 64, u64, - i64,; + i64, + inline(never); + inline ); /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder. 
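To make the new trailing macro arguments concrete: per the macro signatures, the attribute list before the `;` is applied to the generated unsigned function and the list after it to the signed one, so the expansion is shaped roughly like the sketch below (hypothetical names, placeholder bodies that panic on a zero divisor instead of routing through `zero_div_fn`):

#[inline(never)] // one real division symbol per bit width
pub fn u32_div_rem_sketch(duo: u32, div: u32) -> (u32, u32) {
    (duo / div, duo % div) // placeholder for the real algorithm
}

#[inline] // thin sign-fixup wrapper, free to inline into the intrinsics
pub fn i32_div_rem_sketch(duo: i32, div: i32) -> (i32, i32) {
    let t = u32_div_rem_sketch(duo.unsigned_abs(), div.unsigned_abs());
    let quo = if (duo < 0) != (div < 0) {
        (t.0 as i32).wrapping_neg()
    } else {
        t.0 as i32
    };
    let rem = if duo < 0 {
        (t.1 as i32).wrapping_neg()
    } else {
        t.1 as i32
    };
    (quo, rem)
}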
@@ -280,7 +297,9 @@ impl_asymmetric!( u16, u32, u64, - i64,; + i64, + inline(never); + inline ); // 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division @@ -291,5 +310,7 @@ impl_binary_long!( u32_normalization_shift, 32, u32, - i32,; + i32, + inline(never); + inline ); From bc0646543eb23d48caf7d8fdcf5318704b6020e6 Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Fri, 14 Aug 2020 15:28:48 -0500 Subject: [PATCH 7/8] Use unreachable_unchecked --- src/int/specialized_div_rem/mod.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/int/specialized_div_rem/mod.rs b/src/int/specialized_div_rem/mod.rs index 0304aa2e..f7dc044f 100644 --- a/src/int/specialized_div_rem/mod.rs +++ b/src/int/specialized_div_rem/mod.rs @@ -57,9 +57,7 @@ mod asymmetric; /// impossible to reach by Rust users, unless `compiler-builtins` public division functions or /// `core/std::unchecked_div/rem` are directly used without a zero check in front. fn zero_div_fn() -> ! { - // TODO: change this once the algorithms are verified - //unsafe {core::hint::unreachable_unchecked()} - ::abort() + unsafe { core::hint::unreachable_unchecked() } } // The `B` extension on RISC-V determines if a CLZ assembly instruction exists From 26fe6ff9361f2855a7b1448ecba167316f09650f Mon Sep 17 00:00:00 2001 From: Aaron Kutch Date: Sat, 29 Aug 2020 18:02:57 -0500 Subject: [PATCH 8/8] Add `__divmodti4` --- src/int/sdiv.rs | 9 ++++++++- testcrate/build.rs | 13 +++++++++++++ testcrate/tests/div_rem.rs | 8 +------- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/int/sdiv.rs b/src/int/sdiv.rs index 57ef03cd..3d0c3afc 100644 --- a/src/int/sdiv.rs +++ b/src/int/sdiv.rs @@ -54,5 +54,12 @@ intrinsics! { i128_div_rem(a, b).1 } - // LLVM does not currently have a `__divmodti4` function + // LLVM does not currently have a `__divmodti4` function, but GCC does + #[maybe_use_optimized_c_shim] + /// Returns `n / d` and sets `*rem = n % d` + pub extern "C" fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 { + let quo_rem = i128_div_rem(a, b); + *rem = quo_rem.1; + quo_rem.0 + } } diff --git a/testcrate/build.rs b/testcrate/build.rs index e1d4cf9e..656fd2d2 100644 --- a/testcrate/build.rs +++ b/testcrate/build.rs @@ -805,6 +805,19 @@ fn main() { (builtins::int::sdiv::__divmodsi4(a, b, &mut r), r) }", ); + gen( + |(a, b): (MyI128, MyI128)| { + if b.0 == 0 { + None + } else { + Some((a.0 / b.0, a.0 % b.0)) + } + }, + "{ + let mut r = 0; + (builtins::int::sdiv::__divmodti4(a, b, &mut r), r) + }", + ); gen( |(a, b): (MyI32, MyI32)| { if b.0 == 0 { diff --git a/testcrate/tests/div_rem.rs b/testcrate/tests/div_rem.rs index 2a154f5e..199fa9db 100644 --- a/testcrate/tests/div_rem.rs +++ b/testcrate/tests/div_rem.rs @@ -1,15 +1,9 @@ use rand_xoshiro::rand_core::{RngCore, SeedableRng}; use rand_xoshiro::Xoshiro128StarStar; -use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divti3, __modti3}; +use compiler_builtins::int::sdiv::{__divmoddi4, __divmodsi4, __divmodti4}; use compiler_builtins::int::udiv::{__udivmoddi4, __udivmodsi4, __udivmodti4}; -// because `__divmodti4` does not exist, we synthesize it -fn __divmodti4(a: i128, b: i128, rem: &mut i128) -> i128 { - *rem = __modti3(a, b); - __divti3(a, b) -} - /// Creates intensive test functions for division functions of a certain size macro_rules! test { (
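For reference, a minimal standalone use of the new `__divmodti4` entry point through the path the testcrate imports, following the documented contract (returns `n / d`, writes `n % d` through `rem`); the concrete operands are only an example:

use compiler_builtins::int::sdiv::__divmodti4;

#[test]
fn divmodti4_example() {
    let mut rem: i128 = 0;
    let quo = __divmodti4(-7, 3, &mut rem);
    // truncating division: the quotient rounds toward zero and the remainder
    // keeps the sign of the dividend
    assert_eq!((quo, rem), (-2, -1));
}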