From e8fad325fe8630c0a6561d3e0c9c5fc51423aac0 Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Thu, 4 May 2017 20:28:34 -0700
Subject: [PATCH 1/3] Make [u8]::reverse() 5x faster

Since LLVM doesn't vectorize the loop for us, do unaligned reads of a
larger type and use LLVM's bswap intrinsic to do the reversing of the
actual bytes. cfg!-restricted to x86 and x86_64, as I assume it
wouldn't help on things like ARMv5.

Also makes [u16]::reverse() a more modest 1.5x faster by loading/storing
u32 and swapping the u16s with ROT16.

Thank you ptr::*_unaligned for making this easy :)
---
 src/libcollections/benches/slice.rs | 21 ++++++++++++++++
 src/libcollections/tests/slice.rs   | 10 ++++++++
 src/libcore/slice/mod.rs            | 38 +++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

diff --git a/src/libcollections/benches/slice.rs b/src/libcollections/benches/slice.rs
index 7195a9f9bf2c6..db523681bc7c5 100644
--- a/src/libcollections/benches/slice.rs
+++ b/src/libcollections/benches/slice.rs
@@ -290,3 +290,24 @@ sort!(sort_unstable, sort_unstable_large_random, gen_random, 10000);
 sort!(sort_unstable, sort_unstable_large_big_random, gen_big_random, 10000);
 sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
+
+macro_rules! reverse {
+    ($name:ident, $ty:ident) => {
+        #[bench]
+        fn $name(b: &mut Bencher) {
+            // odd length and offset by 1 to be as unaligned as possible
+            let n = 0xFFFFF;
+            let mut v: Vec<_> =
+                (0..1+(n / mem::size_of::<$ty>() as u64))
+                .map(|x| x as $ty)
+                .collect();
+            b.iter(|| black_box(&mut v[1..]).reverse());
+            b.bytes = n;
+        }
+    }
+}
+
+reverse!(reverse_u8, u8);
+reverse!(reverse_u16, u16);
+reverse!(reverse_u32, u32);
+reverse!(reverse_u64, u64);
diff --git a/src/libcollections/tests/slice.rs b/src/libcollections/tests/slice.rs
index c3e5304fb2b35..1708f98b7ee47 100644
--- a/src/libcollections/tests/slice.rs
+++ b/src/libcollections/tests/slice.rs
@@ -379,6 +379,16 @@ fn test_reverse() {
     let mut v3 = Vec::<i32>::new();
     v3.reverse();
     assert!(v3.is_empty());
+
+    // check the 1-byte-types path
+    let mut v = (-50..51i8).collect::<Vec<i8>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i8).rev().collect::<Vec<i8>>());
+
+    // check the 2-byte-types path
+    let mut v = (-50..51i16).collect::<Vec<i16>>();
+    v.reverse();
+    assert_eq!(v, (-50..51i16).rev().collect::<Vec<i16>>());
 }
 
 #[test]
diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs
index 9e3bd9115468a..bf637af0639d9 100644
--- a/src/libcore/slice/mod.rs
+++ b/src/libcore/slice/mod.rs
@@ -539,6 +539,44 @@ impl<T> SliceExt for [T] {
     fn reverse(&mut self) {
         let mut i: usize = 0;
         let ln = self.len();
+
+        let fast_unaligned =
+            cfg!(any(target_arch = "x86", target_arch = "x86_64"));
+
+        if fast_unaligned && mem::size_of::<T>() == 1 {
+            // Single-byte read & write are comparatively slow. Instead,
+            // work in usize chunks and get bswap to do the hard work.
+            let chunk = mem::size_of::<usize>();
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut usize);
+                    let vb = ptr::read_unaligned(pb as *mut usize);
+                    ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
+                    ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
+                }
+                i += chunk;
+            }
+        }
+
+        if fast_unaligned && mem::size_of::<T>() == 2 {
+            // Not quite as good as the above, but still helpful.
+            // Same general idea, read bigger and do the swap in a register.
+            let chunk = mem::size_of::<u32>() / 2;
+            while i + chunk - 1 < ln / 2 {
+                unsafe {
+                    let pa: *mut T = self.get_unchecked_mut(i);
+                    let pb: *mut T = self.get_unchecked_mut(ln - i - chunk);
+                    let va = ptr::read_unaligned(pa as *mut u32);
+                    let vb = ptr::read_unaligned(pb as *mut u32);
+                    ptr::write_unaligned(pa as *mut u32, vb.rotate_left(16));
+                    ptr::write_unaligned(pb as *mut u32, va.rotate_left(16));
+                }
+                i += chunk;
+            }
+        }
+
         while i < ln / 2 {
             // Unsafe swap to avoid the bounds check in safe swap.
             unsafe {
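
The trick in the size-1 branch above is easier to see outside the SliceExt
plumbing. The following standalone sketch mirrors that branch (the function
name reverse_bytes_chunked is illustrative, not part of the patch): swap
usize-wide chunks taken from the two ends, byte-reverse each chunk with
swap_bytes (which LLVM lowers to bswap), and let a plain swap loop finish
whatever is left in the middle.

    use std::mem;
    use std::ptr;

    /// Reverse a byte slice by swapping usize-sized chunks from both ends.
    /// Each chunk is byte-reversed with swap_bytes; a plain swap loop
    /// finishes the leftover middle (and handles short slices entirely).
    fn reverse_bytes_chunked(v: &mut [u8]) {
        let ln = v.len();
        let chunk = mem::size_of::<usize>();
        let mut i = 0;
        while i + chunk - 1 < ln / 2 {
            unsafe {
                let pa = v.as_mut_ptr().add(i);
                let pb = v.as_mut_ptr().add(ln - i - chunk);
                let va = ptr::read_unaligned(pa as *mut usize);
                let vb = ptr::read_unaligned(pb as *mut usize);
                ptr::write_unaligned(pa as *mut usize, vb.swap_bytes());
                ptr::write_unaligned(pb as *mut usize, va.swap_bytes());
            }
            i += chunk;
        }
        // Scalar cleanup for the middle, and for slices shorter than two chunks.
        while i < ln / 2 {
            v.swap(i, ln - i - 1);
            i += 1;
        }
    }

    fn main() {
        // Odd length so both the chunked loop and the scalar cleanup run.
        let mut v: Vec<u8> = (0..251).collect();
        reverse_bytes_chunked(&mut v);
        assert_eq!(v, (0..251).rev().collect::<Vec<u8>>());
        println!("ok");
    }

The loop condition `i + chunk - 1 < ln / 2` keeps the front chunk inside the
first half and the back chunk inside the second half, so the two unaligned
accesses never overlap.
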
From 1f891d11f5ff64e1f2e9cba79f1069f7a8d13c7f Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Fri, 5 May 2017 18:54:47 -0700
Subject: [PATCH 2/3] Improve implementation approach comments in [T]::reverse()

---
 src/libcore/slice/mod.rs | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/src/libcore/slice/mod.rs b/src/libcore/slice/mod.rs
index bf637af0639d9..e15eb8f244409 100644
--- a/src/libcore/slice/mod.rs
+++ b/src/libcore/slice/mod.rs
@@ -540,12 +540,24 @@ impl<T> SliceExt for [T] {
         let mut i: usize = 0;
         let ln = self.len();
 
+        // For very small types, all the individual reads in the normal
+        // path perform poorly. We can do better, given efficient unaligned
+        // load/store, by loading a larger chunk and reversing a register.
+
+        // Ideally LLVM would do this for us, as it knows better than we do
+        // whether unaligned reads are efficient (since that changes between
+        // different ARM versions, for example) and what the best chunk size
+        // would be. Unfortunately, as of LLVM 4.0 (2017-05) it only unrolls
+        // the loop, so we need to do this ourselves. (Hypothesis: reverse
+        // is troublesome because the sides can be aligned differently --
+        // will be, when the length is odd -- so there's no way of emitting
+        // pre- and postludes to use fully-aligned SIMD in the middle.)
+
         let fast_unaligned =
             cfg!(any(target_arch = "x86", target_arch = "x86_64"));
 
         if fast_unaligned && mem::size_of::<T>() == 1 {
-            // Single-byte read & write are comparatively slow. Instead,
-            // work in usize chunks and get bswap to do the hard work.
+            // Use the llvm.bswap intrinsic to reverse u8s in a usize
             let chunk = mem::size_of::<usize>();
             while i + chunk - 1 < ln / 2 {
                 unsafe {
@@ -561,8 +573,7 @@ impl<T> SliceExt for [T] {
         }
 
         if fast_unaligned && mem::size_of::<T>() == 2 {
-            // Not quite as good as the above, but still helpful.
-            // Same general idea, read bigger and do the swap in a register.
+            // Use rotate-by-16 to reverse u16s in a u32
             let chunk = mem::size_of::<u32>() / 2;
             while i + chunk - 1 < ln / 2 {
                 unsafe {
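
The rotate-by-16 comment is easy to sanity-check in isolation. This small
standalone program (illustrative, not part of the patch) shows that rotating
a u32 by 16 bits exchanges its two 16-bit halves whichever byte order they
are stored in, which is why the u16 path needs no endianness handling, and
exercises the same unaligned load/store plus rotate_left(16) combination the
patch uses:

    use std::ptr;

    fn main() {
        // Rotating by 16 bits swaps the two u16 halves of a u32.
        let x: u32 = 0xAAAA_BBBB;
        assert_eq!(x.rotate_left(16), 0xBBBB_AAAA);

        // The same swap through unaligned load/store: a [u16; 2] is only
        // 2-aligned, hence read_unaligned/write_unaligned.
        let mut pair: [u16; 2] = [1, 2];
        unsafe {
            let p = pair.as_mut_ptr() as *mut u32;
            let v = ptr::read_unaligned(p);
            ptr::write_unaligned(p, v.rotate_left(16));
        }
        assert_eq!(pair, [2, 1]);
        println!("ok");
    }
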
From da91361d2a8ea86a42cbe2a23a7ff816cc5500af Mon Sep 17 00:00:00 2001
From: Scott McMurray
Date: Fri, 5 May 2017 20:50:48 -0700
Subject: [PATCH 3/3] Add reverse benchmarks for u128, [u8;3], and Simd<[f64;4]>

None of these are affected by e8fad325fe.
---
 src/libcollections/benches/lib.rs   |  2 ++
 src/libcollections/benches/slice.rs | 16 ++++++++++------
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/libcollections/benches/lib.rs b/src/libcollections/benches/lib.rs
index 42064e9ca5750..9f356e4b57912 100644
--- a/src/libcollections/benches/lib.rs
+++ b/src/libcollections/benches/lib.rs
@@ -10,7 +10,9 @@
 
 #![deny(warnings)]
 
+#![feature(i128_type)]
 #![feature(rand)]
+#![feature(repr_simd)]
 #![feature(sort_unstable)]
 #![feature(test)]
 
diff --git a/src/libcollections/benches/slice.rs b/src/libcollections/benches/slice.rs
index db523681bc7c5..0079f2d01036c 100644
--- a/src/libcollections/benches/slice.rs
+++ b/src/libcollections/benches/slice.rs
@@ -292,14 +292,14 @@ sort!(sort_unstable, sort_unstable_large_strings, gen_strings, 10000);
 sort_expensive!(sort_unstable_by, sort_unstable_large_random_expensive, gen_random, 10000);
 
 macro_rules! reverse {
-    ($name:ident, $ty:ident) => {
+    ($name:ident, $ty:ty, $f:expr) => {
         #[bench]
         fn $name(b: &mut Bencher) {
             // odd length and offset by 1 to be as unaligned as possible
             let n = 0xFFFFF;
             let mut v: Vec<_> =
                 (0..1+(n / mem::size_of::<$ty>() as u64))
-                .map(|x| x as $ty)
+                .map($f)
                 .collect();
             b.iter(|| black_box(&mut v[1..]).reverse());
             b.bytes = n;
@@ -307,7 +307,11 @@ macro_rules! reverse {
     }
 }
 
-reverse!(reverse_u8, u8);
-reverse!(reverse_u16, u16);
-reverse!(reverse_u32, u32);
-reverse!(reverse_u64, u64);
+reverse!(reverse_u8, u8, |x| x as u8);
+reverse!(reverse_u16, u16, |x| x as u16);
+reverse!(reverse_u8x3, [u8;3], |x| [x as u8, (x>>8) as u8, (x>>16) as u8]);
+reverse!(reverse_u32, u32, |x| x as u32);
+reverse!(reverse_u64, u64, |x| x as u64);
+reverse!(reverse_u128, u128, |x| x as u128);
+#[repr(simd)] struct F64x4(f64, f64, f64, f64);
+reverse!(reverse_simd_f64x4, F64x4, |x| { let x = x as f64; F64x4(x,x,x,x) });
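
The benchmarks above run under the in-tree harness (the unstable `test`
crate, with `b.bytes` set so throughput is reported). For a rough
out-of-tree sanity check of the claimed speedups, a sketch like the one
below can be compiled with optimizations and run; it uses today's stable
std::hint::black_box, which did not exist in the 2017 tree, the helper
name time_reverse is illustrative, and absolute numbers will vary by
machine and build flags.

    use std::hint::black_box;
    use std::time::Instant;

    // Rough standalone throughput check: reverse odd-length, deliberately
    // misaligned slices (skip the first element) and report MB/s.
    fn time_reverse<T>(v: &mut [T], iters: u32) -> f64 {
        let bytes = v.len() * std::mem::size_of::<T>();
        let start = Instant::now();
        for _ in 0..iters {
            black_box(&mut *v).reverse();
        }
        (bytes as f64 * iters as f64) / start.elapsed().as_secs_f64() / 1e6
    }

    fn main() {
        let n: usize = 0xFFFFF; // ~1 MB, odd length, matching the benchmarks
        let mut bytes: Vec<u8> = (0..n).map(|x| x as u8).collect();
        let mut words: Vec<u64> = (0..(n / 8) as u64).collect();
        println!("u8  reverse: {:.0} MB/s", time_reverse(&mut bytes[1..], 100));
        println!("u64 reverse: {:.0} MB/s", time_reverse(&mut words[1..], 100));
    }
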