diff --git a/coresimd/ppsv/api/arithmetic_reductions.rs b/coresimd/ppsv/api/arithmetic_reductions.rs
index d1aa2433f5..3d51d113fe 100644
--- a/coresimd/ppsv/api/arithmetic_reductions.rs
+++ b/coresimd/ppsv/api/arithmetic_reductions.rs
@@ -6,9 +6,20 @@ macro_rules! impl_arithmetic_reductions {
         impl $id {
             /// Lane-wise addition of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn sum(self) -> $elem_ty {
@@ -19,9 +30,20 @@ macro_rules! impl_arithmetic_reductions {
             }
             /// Lane-wise addition of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(target_arch = "aarch64")]
             #[inline]
             pub fn sum(self) -> $elem_ty {
@@ -36,9 +58,20 @@ macro_rules! impl_arithmetic_reductions {
 
             /// Lane-wise multiplication of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn product(self) -> $elem_ty {
@@ -49,9 +82,20 @@ macro_rules! impl_arithmetic_reductions {
             }
             /// Lane-wise multiplication of the vector elements.
             ///
-            /// FIXME: document guarantees with respect to:
-            /// * integers: overflow behavior
-            /// * floats: order and NaNs
+            /// The intrinsic performs a tree-reduction of the vector elements.
+            /// That is, for an 8-element vector:
+            ///
+            /// > ((x0 * x1) * (x2 * x3)) * ((x4 * x5) * (x6 * x7))
+            ///
+            /// # Integer vectors
+            ///
+            /// If an operation overflows, it returns the mathematical result
+            /// modulo `2^n`, where `n` is the number of bits of the element type.
+            ///
+            /// # Floating-point vectors
+            ///
+            /// If one of the vector elements is `NaN`, the reduction returns
+            /// `NaN`.
             #[cfg(target_arch = "aarch64")]
             #[inline]
             pub fn product(self) -> $elem_ty {
diff --git a/coresimd/ppsv/api/minmax_reductions.rs b/coresimd/ppsv/api/minmax_reductions.rs
index 159c59a99b..9d4eabdf52 100644
--- a/coresimd/ppsv/api/minmax_reductions.rs
+++ b/coresimd/ppsv/api/minmax_reductions.rs
@@ -6,7 +6,13 @@ macro_rules! impl_minmax_reductions {
         impl $id {
             /// Largest vector value.
             ///
-            /// FIXME: document behavior for float vectors with NaNs.
+            /// # Floating-point behavior
+            ///
+            /// If the vector contains only `NaN` values,
+            /// the result is a `NaN`.
+            ///
+            /// Otherwise, if the vector contains `NaN` values, either the
+            /// largest non-`NaN` element of the vector or a `NaN` is returned.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn max(self) -> $elem_ty {
@@ -15,9 +21,16 @@ macro_rules! impl_minmax_reductions {
                     simd_reduce_max(self)
                 }
             }
+
             /// Largest vector value.
             ///
-            /// FIXME: document behavior for float vectors with NaNs.
+            /// # Floating-point behavior
+            ///
+            /// If the vector contains only `NaN` values,
+            /// the result is a `NaN`.
+            ///
+            /// Otherwise, if the vector contains `NaN` values, either the
+            /// largest non-`NaN` element of the vector or a `NaN` is returned.
             #[cfg(target_arch = "aarch64")]
             #[allow(unused_imports)]
             #[inline]
@@ -35,7 +48,13 @@
 
             /// Smallest vector value.
             ///
-            /// FIXME: document behavior for float vectors with NaNs.
+            /// # Floating-point behavior
+            ///
+            /// If the vector contains only `NaN` values,
+            /// the result is a `NaN`.
+            ///
+            /// Otherwise, if the vector contains `NaN` values, either the
+            /// smallest non-`NaN` element of the vector or a `NaN` is returned.
             #[cfg(not(target_arch = "aarch64"))]
             #[inline]
             pub fn min(self) -> $elem_ty {
@@ -44,9 +63,16 @@ macro_rules! impl_minmax_reductions {
                     simd_reduce_min(self)
                 }
             }
+
             /// Smallest vector value.
             ///
-            /// FIXME: document behavior for float vectors with NaNs.
+            /// # Floating-point behavior
+            ///
+            /// If the vector contains only `NaN` values,
+            /// the result is a `NaN`.
+            ///
+            /// Otherwise, if the vector contains `NaN` values, either the
+            /// smallest non-`NaN` element of the vector or a `NaN` is returned.
             #[cfg(target_arch = "aarch64")]
             #[allow(unused_imports)]
             #[inline]
diff --git a/crates/coresimd/tests/reductions.rs b/crates/coresimd/tests/reductions.rs
new file mode 100644
index 0000000000..bf7c91bdf7
--- /dev/null
+++ b/crates/coresimd/tests/reductions.rs
@@ -0,0 +1,418 @@
+#![feature(cfg_target_feature, stdsimd, target_feature)]
+
+#[macro_use]
+extern crate stdsimd;
+
+use stdsimd::simd::*;
+
+macro_rules! invoke_arch {
+    ($macro:ident, $feature_macro:ident, $id:ident, $elem_ty:ident,
+     [$($feature:tt),*]) => {
+        $($macro!($feature, $feature_macro, $id, $elem_ty);)*
+    }
+}
+
+macro_rules! invoke_vectors {
invoke_vectors { + ($macro:ident, [$(($id:ident, $elem_ty:ident)),*]) => { + $( + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + invoke_arch!($macro, is_x86_feature_detected, $id, $elem_ty, + ["sse", "sse2", "sse3", "ssse3", "sse4.1", + "sse4.2", "sse4a", "avx2", "avx2", "avx512f"]); + #[cfg(target_arch = "aarch64")] + invoke_arch!($macro, is_aarch64_feature_detected, $id, $elem_ty, + ["neon"]); + #[cfg(all(target_arch = "arm", target_feature = "v7", target_feature = "neon"))] + invoke_arch!($macro, is_arm_feature_detected, $id, $elem_ty, + ["neon"]); + #[cfg(target_arch = "powerpc")] + invoke_arch!($macro, is_powerpc_feature_detected, $id, $elem_ty, ["altivec"]); + #[cfg(target_arch = "powerpc64")] + invoke_arch!($macro, is_powerpc64_feature_detected, $id, $elem_ty, ["altivec"]); + )* + } +} + +macro_rules! finvoke { + ($macro:ident) => { + invoke_vectors!( + $macro, + [(f32x2, f32), (f32x4, f32), (f32x8, f32), (f32x16, f32), + (f64x2, f64), (f64x4, f64), (f64x8, f64)] + ); + } +} + +macro_rules! iinvoke { + ($macro:ident) => { + invoke_vectors!( + $macro, + [(i8x2, i8), (i8x4, i8), (i8x8, i8), (i8x16, i8), (i8x32, i8), (i8x64, i8), + (i16x2, i16), (i16x4, i16), (i16x8, i16), (i16x16, i16), (i16x32, i16), + (i32x2, i32), (i32x4, i32), (i32x8, i32), (i32x16, i32), + (i64x2, i64), (i64x4, i64), (i64x8, i64)] + ); + } +} + +macro_rules! min_nan_test { + ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => { + if $feature_macro!($feature) { + #[target_feature(enable = $feature)] + unsafe fn test_fn() { + let n0 = ::std::$elem_ty::NAN; + + assert_eq!(n0.min(-3.0), -3.0); + assert_eq!((-3.0 as $elem_ty).min(n0), -3.0); + + let v0 = $id::splat(-3.0); + + for i in 0..$id::lanes() { + let v = v0.replace(i, n0); + if i != $id::lanes() - 1 { + assert_eq!(v.min(), -3.0); + let mut v = v; + for j in 0..i { + v = v.replace(j, n0); + assert_eq!(v.min(), -3.0); + } + } else { + // not necessarily n0: + assert!(v.min().is_nan()); + let mut v = v; + for j in 0..i { + v = v.replace(j, n0); + assert!(v.min().is_nan()); + } + } + } + + let vn = $id::splat(n0); + assert!(vn.min().is_nan()); + } + unsafe { test_fn() }; + } + } +} + +#[test] +fn min_nan() { + finvoke!(min_nan_test); +} + +macro_rules! max_nan_test { + ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => { + if $feature_macro!($feature) { + #[target_feature(enable = $feature)] + unsafe fn test_fn() { + let n0 = ::std::$elem_ty::NAN; + + assert_eq!(n0.max(-3.0), -3.0); + assert_eq!((-3.0 as $elem_ty).max(n0), -3.0); + + let v0 = $id::splat(-3.0); + + for i in 0..$id::lanes() { + let v = v0.replace(i, n0); + if i != $id::lanes() - 1 { + assert_eq!(v.max(), -3.0); + let mut v = v; + for j in 0..i { + v = v.replace(j, n0); + assert_eq!(v.max(), -3.0); + } + } else { + // not necessarily n0: + assert!(v.max().is_nan()); + let mut v = v; + for j in 0..i { + v = v.replace(j, n0); + assert!(v.max().is_nan()); + } + } + } + + let vn = $id::splat(n0); + assert!(vn.max().is_nan()); + } + unsafe { test_fn() }; + } + } +} + +#[test] +fn max_nan() { + finvoke!(max_nan_test); +} + +macro_rules! 
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let n0 = ::std::$elem_ty::NAN;
+
+                let v0 = $id::splat(-3.0);
+
+                for i in 0..$id::lanes() {
+                    let v = v0.replace(i, n0);
+                    assert!(v.sum().is_nan());
+                }
+                let v = $id::splat(n0);
+                assert!(v.sum().is_nan());
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn sum_nan() {
+    finvoke!(sum_nan_test);
+}
+
+macro_rules! product_nan_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let n0 = ::std::$elem_ty::NAN;
+
+                let v0 = $id::splat(-3.0);
+
+                for i in 0..$id::lanes() {
+                    let v = v0.replace(i, n0);
+                    assert!(v.product().is_nan());
+                }
+                let v = $id::splat(n0);
+                assert!(v.product().is_nan());
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn product_nan() {
+    finvoke!(product_nan_test);
+}
+
+trait AsInt {
+    type Int;
+    fn as_int(self) -> Self::Int;
+    fn from_int(x: Self::Int) -> Self;
+}
+
+macro_rules! as_int {
+    ($float:ident, $int:ident) => {
+        impl AsInt for $float {
+            type Int = $int;
+            fn as_int(self) -> $int {
+                unsafe { ::std::mem::transmute(self) }
+            }
+            fn from_int(x: $int) -> $float {
+                unsafe { ::std::mem::transmute(x) }
+            }
+        }
+    }
+}
+
+as_int!(f32, u32);
+as_int!(f64, u64);
+
+trait TreeReduceSum {
+    type R;
+    fn tree_reduce_sum(self) -> Self::R;
+}
+
+macro_rules! tree_reduce_sum_f {
+    ($elem_ty:ident) => {
+        impl<'a> TreeReduceSum for &'a [$elem_ty] {
+            type R = $elem_ty;
+            fn tree_reduce_sum(self) -> $elem_ty {
+                if self.len() == 2 {
+                    self[0] + self[1]
+                } else {
+                    let mid = self.len() / 2;
+                    let (left, right) = self.split_at(mid);
+                    Self::tree_reduce_sum(left) + Self::tree_reduce_sum(right)
+
+                }
+            }
+        }
+    }
+}
+tree_reduce_sum_f!(f32);
+tree_reduce_sum_f!(f64);
+
+macro_rules! sum_roundoff_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let mut start = std::$elem_ty::EPSILON;
+                let mut sum = 0. as $elem_ty;
+
+                let mut v = $id::splat(0. as $elem_ty);
+                for i in 0..$id::lanes() {
+                    let c = if i % 2 == 0 { 1e3 } else { -1. };
+                    start *= 3.14 * c;
+                    sum += start;
+                    // println!("{} | start: {}", stringify!($id), start);
+                    v = v.replace(i, start);
+                }
+                let vsum = v.sum();
+                println!("{} | lsum: {}", stringify!($id), sum);
+                println!("{} | vsum: {}", stringify!($id), vsum);
+                let r = vsum.as_int() == sum.as_int();
+                // This is false in general; the intrinsic performs a
+                // tree-reduce:
+                println!("{} | equal: {}", stringify!($id), r);
+
+                let mut a = [0. as $elem_ty; $id::lanes()];
+                v.store_unaligned(&mut a);
+
+                let tsum = a.tree_reduce_sum();
+                println!("{} | tsum: {}", stringify!($id), tsum);
+
+                // tolerate 1 ULP difference:
+                if vsum.as_int() > tsum.as_int() {
+                    assert!(vsum.as_int() - tsum.as_int() < 2);
+                } else {
+                    assert!(tsum.as_int() - vsum.as_int() < 2);
+                }
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn sum_roundoff_test() {
+    finvoke!(sum_roundoff_test);
+}
+
+trait TreeReduceMul {
+    type R;
+    fn tree_reduce_mul(self) -> Self::R;
+}
+
+macro_rules! tree_reduce_mul_f {
+    ($elem_ty:ident) => {
+        impl<'a> TreeReduceMul for &'a [$elem_ty] {
+            type R = $elem_ty;
+            fn tree_reduce_mul(self) -> $elem_ty {
+                if self.len() == 2 {
+                    self[0] * self[1]
+                } else {
+                    let mid = self.len() / 2;
+                    let (left, right) = self.split_at(mid);
+                    Self::tree_reduce_mul(left) * Self::tree_reduce_mul(right)
+
+                }
+            }
+        }
+    }
+}
+
+tree_reduce_mul_f!(f32);
+tree_reduce_mul_f!(f64);
+
+macro_rules! mul_roundoff_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let mut start = std::$elem_ty::EPSILON;
+                let mut mul = 1. as $elem_ty;
+
+                let mut v = $id::splat(1. as $elem_ty);
+                for i in 0..$id::lanes() {
+                    let c = if i % 2 == 0 { 1e3 } else { -1. };
+                    start *= 3.14 * c;
+                    mul *= start;
+                    println!("{} | start: {}", stringify!($id), start);
+                    v = v.replace(i, start);
+                }
+                let vmul = v.product();
+                println!("{} | lmul: {}", stringify!($id), mul);
+                println!("{} | vmul: {}", stringify!($id), vmul);
+                let r = vmul.as_int() == mul.as_int();
+                // This is false in general; the intrinsic performs a
+                // tree-reduce:
+                println!("{} | equal: {}", stringify!($id), r);
+
+                let mut a = [0. as $elem_ty; $id::lanes()];
+                v.store_unaligned(&mut a);
+
+                let tmul = a.tree_reduce_mul();
+                println!("{} | tmul: {}", stringify!($id), tmul);
+
+                // tolerate 1 ULP difference:
+                if vmul.as_int() > tmul.as_int() {
+                    assert!(vmul.as_int() - tmul.as_int() < 2);
+                } else {
+                    assert!(tmul.as_int() - vmul.as_int() < 2);
+                }
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn mul_roundoff_test() {
+    finvoke!(mul_roundoff_test);
+}
+
+macro_rules! sum_overflow_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2);
+
+                let v = $id::splat(start as $elem_ty);
+                let vsum = v.sum();
+
+                let mut sum = start;
+                for _ in 1..$id::lanes() {
+                    sum = sum.wrapping_add(start);
+                }
+                assert_eq!(sum, vsum);
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn sum_overflow_test() {
+    iinvoke!(sum_overflow_test);
+}
+
+macro_rules! mul_overflow_test {
+    ($feature:tt, $feature_macro:ident, $id:ident, $elem_ty:ident) => {
+        if $feature_macro!($feature) {
+            #[target_feature(enable = $feature)]
+            unsafe fn test_fn() {
+                let start = $elem_ty::max_value() - ($id::lanes() as $elem_ty / 2);
+
+                let v = $id::splat(start as $elem_ty);
+                let vmul = v.product();
+
+                let mut mul = start;
+                for _ in 1..$id::lanes() {
+                    mul = mul.wrapping_mul(start);
+                }
+                assert_eq!(mul, vmul);
+            }
+            unsafe { test_fn() };
+        }
+    }
+}
+
+#[test]
+fn mul_overflow_test() {
+    iinvoke!(mul_overflow_test);
+}
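
Note (review aid, not part of the patch): the scalar model below restates the semantics the new docs promise, namely that integer reductions wrap on overflow and floating-point reductions are pairwise (tree) ordered and propagate `NaN`. The names `tree_sum_wrapping` and `tree_sum_f32` and the sample values are illustrative only.

```rust
// Scalar sketch of the documented reduction semantics; assumes non-empty,
// power-of-two-length slices, matching the portable vector types.
fn tree_sum_wrapping(xs: &[u32]) -> u32 {
    // Integer reductions are documented to wrap: the result is the
    // mathematical sum modulo 2^32 for u32 elements.
    match xs.len() {
        1 => xs[0],
        n => {
            let (l, r) = xs.split_at(n / 2);
            tree_sum_wrapping(l).wrapping_add(tree_sum_wrapping(r))
        }
    }
}

fn tree_sum_f32(xs: &[f32]) -> f32 {
    // Floating-point reductions use the same pairwise order; any NaN lane
    // propagates to the result.
    match xs.len() {
        1 => xs[0],
        n => {
            let (l, r) = xs.split_at(n / 2);
            tree_sum_f32(l) + tree_sum_f32(r)
        }
    }
}

fn main() {
    // ((x0 + x1) + (x2 + x3)) + ((x4 + x5) + (x6 + x7)), as in the docs.
    let ints = [u32::max_value(), 1, 2, 3, 4, 5, 6, 7];
    assert_eq!(tree_sum_wrapping(&ints), 27); // (2^32 - 1) + 28 wraps to 27

    let floats = [1.0_f32, 2.0, 3.0, std::f32::NAN];
    assert!(tree_sum_f32(&floats).is_nan());
}
```

The same model with `wrapping_mul` and `*` corresponds to the `product()` documentation; the `tree_reduce_sum`/`tree_reduce_mul` helpers in the new test file play this role against the vector intrinsics.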