From e8fad325fe8630c0a6561d3e0c9c5fc51423aac0 Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Thu, 4 May 2017 20:28:34 -0700
Subject: [PATCH 1/3] Make [u8]::reverse() 5x faster

Since LLVM doesn't vectorize the loop for us, do unaligned reads of a
larger type and use LLVM's bswap intrinsic to do the reversing of the
actual bytes. cfg!-restricted to x86 and x86_64, as I assume it
wouldn't help on things like ARMv5.

Also makes [u16]::reverse() a more modest 1.5x faster by loading/storing
u32 and swapping the u16s with ROT16.

Thank you ptr::*_unaligned for making this easy :)
---
 src/libcollections/benches/slice.rs | 21 ++++++++++++++++
 src/libcollections/tests/slice.rs   | 10 ++++++++
 src/libcore/slice/mod.rs            | 38 +++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/src/libcollections/benches/slice.rs b/src/libcollections/benches/slice.rs
index 7195a9f9bf2c6..db523681bc7c5 100644
--- a/src/libcollections/benches/slice.rs
+++ b/src/libcollections/benches/slice.rs
@@ -290,3 +290,24 @@ sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
 sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
 sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
+
+macro_rules! reverse {
+    ($name:ident, $ty:ident) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            // odd length and offset by 1 to be as unaligned as possible
+            let n = 0xFFFFF;
+            let mut v: Vec<_> =
+                (0..1+(n / mem::size_of::<$ty>() as u64))
+                .map(|x| x as $ty)
+                .collect();
+            b.iter(|| black_box(&mut v[1..]).reverse());
+            b.bytes = n;
+        }
+    }
+}
+
+reverse!(reverse_u8, u8);
+reverse!(reverse_u16, u16);
+reverse!(reverse_u32, u32);
+reverse!(reverse_u64, u64);
diff --git a/src/libcollections/tests/slice.rs b/src/libcollections/tests/slice.rs
index c3e5304fb2b35..1708f98b7ee47 100644
--- a/src/libcollections/tests/slice.rs
+++ b/src/libcollections/tests/slice.rs
@@ -379,6 +379,16 @@ fn test_reverse() {
     let mut v3 = Vec::<i32>::new();
     v3.reverse();
     assert!(v3.is_empty());
+
+    // check the 1-byte-types path
+    let mut v = (-50..51i8).collect::<Vec<i8>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i8).rev().collect::<Vec<i8>>());
+
+    // check the 2-byte-types path
+    let mut v = (-50..51i16).collect::<Vec<i16>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i16).rev().collect::<Vec<i16>>());
 }
 
 #[test]
diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs
index 9e3bd9115468a..bf637af0639d9 100644
--- a/src/libcore/slice/mod.rs
+++ b/src/libcore/slice/mod.rs
@@ -539,6 +539,44 @@ impl<T> SliceExt for [T] {
     fn reverse(&mut self) {
         let mut i: usize = 0;
         let ln = self.len();
+
+        let fast_unaligned =
+            cfg!(any(target_arch = "x86", target_arch = "x86_64"));
+
+        if fast_unaligned && mem::size_of::<T>() == 1 {
+            // Single-byte read & write are comparatively slow. Instead,
+            // work in usize chunks and get bswap to do the hard work.
+            let chunk = mem::size_of::<usize>();
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut usize);
+                    let vb = ptr::read_unaligned(pb as *mut usize);
+                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
+                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
+                }
+                i += chunk;
+            }
+        }
+
+        if fast_unaligned && mem::size_of::<T>() == 2 {
+            // Not quite as good as the above, but still helpful.
+            // Same general idea, read bigger and do the swap in a register.
+            let chunk = mem::size_of::<u32>() / 2;
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut u32);
+                    let vb = ptr::read_unaligned(pb as *mut u32);
+                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
+                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
+                }
+                i += chunk;
+            }
+        }
+
         while i < ln / 2 {
             // Unsafe swap to avoid the bounds check in safe swap.
             unsafe {
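
The trick in the size-1 branch above is easier to see outside the SliceExt
plumbing. The following standalone sketch mirrors that branch (the function
name reverse_bytes_chunked is illustrative, not part of the patch): swap
usize-wide chunks taken from the two ends, byte-reverse each chunk with
swap_bytes (which LLVM lowers to bswap), and let a plain swap loop finish
whatever is left in the middle.

    use std::mem;
    use std::ptr;

    /// Reverse a byte slice by swapping usize-sized chunks from both ends.
    /// Each chunk is byte-reversed with swap_bytes; a plain swap loop
    /// finishes the leftover middle (and handles short slices entirely).
    fn reverse_bytes_chunked(v: &mut [u8]) {
        let ln = v.len();
        let chunk = mem::size_of::<usize>();
        let mut i = 0;
        while i + chunk - 1 < ln / 2 {
            unsafe {
                let pa = v.as_mut_ptr().add(i);
                let pb = v.as_mut_ptr().add(ln - i - chunk);
                let va = ptr::read_unaligned(pa as *mut usize);
                let vb = ptr::read_unaligned(pb as *mut usize);
                ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
                ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
            }
            i += chunk;
        }
        // Scalar cleanup for the middle, and for slices shorter than two chunks.
        while i < ln / 2 {
            v.swap(i, ln - i - 1);
            i += 1;
        }
    }

    fn main() {
        // Odd length so both the chunked loop and the scalar cleanup run.
        let mut v: Vec<u8> = (0..251).collect();
        reverse_bytes_chunked(&mut v);
        assert_eq!(v, (0..251).rev().collect::<Vec<u8>>());
        println!("ok");
    }

The loop condition `i + chunk - 1 < ln / 2` keeps the front chunk inside the
first half and the back chunk inside the second half, so the two unaligned
accesses never overlap.
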
From 1f891d11f5ff64e1f2e9cba79f1069f7a8d13c7f Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Fri, 5 May 2017 18:54:47 -0700
Subject: [PATCH 2/3] Improve implementation approach comments in [T]::reverse()

---
 src/libcore/slice/mod.rs | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs
index bf637af0639d9..e15eb8f244409 100644
--- a/src/libcore/slice/mod.rs
+++ b/src/libcore/slice/mod.rs
@@ -540,12 +540,24 @@ impl<T> SliceExt for [T] {
         let mut i: usize = 0;
         let ln = self.len();
 
+        // For very small types, all the individual reads in the normal
+        // path perform poorly. We can do better, given efficient unaligned
+        // load/store, by loading a larger chunk and reversing a register.
+
+        // Ideally LLVM would do this for us, as it knows better than we do
+        // whether unaligned reads are efficient (since that changes between
+        // different ARM versions, for example) and what the best chunk size
+        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
+        // the loop, so we need to do this ourselves. (Hypothesis: reverse
+        // is troublesome because the sides can be aligned differently --
+        // will be, when the length is odd -- so there's no way of emitting
+        // pre- and postludes to use fully-aligned SIMD in the middle.)
+
         let fast_unaligned =
             cfg!(any(target_arch = "x86", target_arch = "x86_64"));
 
         if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Single-byte read & write are comparatively slow. Instead,
-            // work in usize chunks and get bswap to do the hard work.
+            // Use the llvm.bswap intrinsic to reverse u8s in a usize
             let chunk = mem::size_of::<usize>();
             while i + chunk - 1 < ln / 2 {
                 unsafe {
@@ -561,8 +573,7 @@ impl<T> SliceExt for [T] {
         }
 
         if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Not quite as good as the above, but still helpful.
-            // Same general idea, read bigger and do the swap in a register.
+            // Use rotate-by-16 to reverse u16s in a u32
             let chunk = mem::size_of::<u32>() / 2;
             while i + chunk - 1 < ln / 2 {
                 unsafe {
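
The rotate-by-16 comment is easy to sanity-check in isolation. This small
standalone program (illustrative, not part of the patch) shows that rotating
a u32 by 16 bits exchanges its two 16-bit halves whichever byte order they
are stored in, which is why the u16 path needs no endianness handling, and
exercises the same unaligned load/store plus rotate_left(16) combination the
patch uses:

    use std::ptr;

    fn main() {
        // Rotating by 16 bits swaps the two u16 halves of a u32.
        let x: u32 = 0xAAAA_BBBB;
        assert_eq!(x.rotate_left(16), 0xBBBB_AAAA);

        // The same swap through unaligned load/store: a [u16; 2] is only
        // 2-aligned, hence read_unaligned/write_unaligned.
        let mut pair: [u16; 2] = [1, 2];
        unsafe {
            let p = pair.as_mut_ptr() as *mut u32;
            let v = ptr::read_unaligned(p);
            ptr::write_unaligned(p, v.rotate_left(16));
        }
        assert_eq!(pair, [2, 1]);
        println!("ok");
    }
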
From da91361d2a8ea86a42cbe2a23a7ff816cc5500af Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Fri, 5 May 2017 20:50:48 -0700
Subject: [PATCH 3/3] Add reverse benchmarks for u128, [u8;3], and Simd<[f64;4]>

None of these are affected by e8fad325fe.
---
 src/libcollections/benches/lib.rs   |  2 ++
 src/libcollections/benches/slice.rs | 16 ++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/libcollections/benches/lib.rs b/src/libcollections/benches/lib.rs
index 42064e9ca5750..9f356e4b57912 100644
--- a/src/libcollections/benches/lib.rs
+++ b/src/libcollections/benches/lib.rs
@@ -10,7 +10,9 @@
 
 #![deny(warnings)]
 
+#![feature(i128_type)]
 #![feature(rand)]
+#![feature(repr_simd)]
 #![feature(sort_unstable)]
 #![feature(test)]
 
diff --git a/src/libcollections/benches/slice.rs b/src/libcollections/benches/slice.rs
index db523681bc7c5..0079f2d01036c 100644
--- a/src/libcollections/benches/slice.rs
+++ b/src/libcollections/benches/slice.rs
@@ -292,14 +292,14 @@ sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
 
 macro_rules! reverse {
-    ($name:ident, $ty:ident) => {
+    ($name:ident, $ty:ty, $f:expr) => {
         #[bench]
         fn $name(b: &mut Bencher) {
             // odd length and offset by 1 to be as unaligned as possible
             let n = 0xFFFFF;
             let mut v: Vec<_> =
                 (0..1+(n / mem::size_of::<$ty>() as u64))
-                .map(|x| x as $ty)
+                .map($f)
                 .collect();
             b.iter(|| black_box(&mut v[1..]).reverse());
             b.bytes = n;
@@ -307,7 +307,11 @@ macro_rules! reverse {
     }
 }
 
-reverse!(reverse_u8, u8);
-reverse!(reverse_u16, u16);
-reverse!(reverse_u32, u32);
-reverse!(reverse_u64, u64);
+reverse!(reverse_u8, u8, |x| x as u8);
+reverse!(reverse_u16, u16, |x| x as u16);
+reverse!(reverse_u8x3, [u8;3], |x| [x as u8, (x>>8) as u8, (x>>16) as u8]);
+reverse!(reverse_u32, u32, |x| x as u32);
+reverse!(reverse_u64, u64, |x| x as u64);
+reverse!(reverse_u128, u128, |x| x as u128);
+#[repr(simd)] struct F64x4(f64, f64, f64, f64);
+reverse!(reverse_simd_f64x4, F64x4, |x| { let x = x as f64; F64x4(x,x,x,x) });
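
The benchmarks above run under the in-tree harness (the unstable `test`
crate, with `b.bytes` set so throughput is reported). For a rough
out-of-tree sanity check of the claimed speedups, a sketch like the one
below can be compiled with optimizations and run; it uses today's stable
std::hint::black_box, which did not exist in the 2017 tree, the helper
name time_reverse is illustrative, and absolute numbers will vary by
machine and build flags.

    use std::hint::black_box;
    use std::time::Instant;

    // Rough standalone throughput check: reverse odd-length, deliberately
    // misaligned slices (skip the first element) and report MB/s.
    fn time_reverse<T>(v: &mut [T], iters: u32) -> f64 {
        let bytes = v.len() * std::mem::size_of::<T>();
        let start = Instant::now();
        for _ in 0..iters {
            black_box(&mut *v).reverse();
        }
        (bytes as f64 * iters as f64) / start.elapsed().as_secs_f64() / 1e6
    }

    fn main() {
        let n: usize = 0xFFFFF; // ~1 MB, odd length, matching the benchmarks
        let mut bytes: Vec<u8> = (0..n).map(|x| x as u8).collect();
        let mut words: Vec<u64> = (0..(n / 8) as u64).collect();
        println!("u8  reverse: {:.0} MB/s", time_reverse(&mut bytes[1..], 100));
        println!("u64 reverse: {:.0} MB/s", time_reverse(&mut words[1..], 100));
    }
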