Rollup merge of rust-lang#39538 - stjepang:slightly-optimize-sort, r=alexcrichton

Slightly optimize slice::sort

First, get rid of some bounds checks.
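
As a rough, stand-alone sketch of what this looks like (the helper name and the `u64` element type are made up for illustration, not taken from the patch): indexing with `v[i]` re-checks the bounds on every access, while `get_unchecked` inside an `unsafe` block relies on the loop condition to keep the indices in range.

```
// Hypothetical helper, not the patched libcollections code.
// Returns the length of the strictly descending prefix of `v`.
fn descending_prefix_len(v: &[u64]) -> usize {
    if v.is_empty() {
        return 0;
    }
    let mut end = 1;
    while end < v.len() {
        // SAFETY: the loop condition guarantees `end < v.len()` and `end` starts at 1,
        // so both `end` and `end - 1` are in bounds; no bounds check is emitted here.
        let descending = unsafe { v.get_unchecked(end) < v.get_unchecked(end - 1) };
        if !descending {
            break;
        }
        end += 1;
    }
    end
}

fn main() {
    assert_eq!(descending_prefix_len(&[5, 4, 3, 7, 1]), 3);
    assert_eq!(descending_prefix_len(&[1, 2, 3]), 1);
}
```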

Second, instead of comparing by ternary `compare` function, use a binary function testing whether an element is less than some other element. This apparently makes it easier for the compiler to reason about the code. I've noticed the same effect with [pdqsort](https://github.com/stjepang/pdqsort) crate.
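
A minimal sketch of the idea (the `sort_core` and `sort_by_adapter` names are hypothetical, not the actual libcollections code): the sorting core only ever asks "is `a` less than `b`?", and a three-way comparator is reduced to that binary predicate once, at the API boundary.

```
use std::cmp::Ordering::{self, Less};

// The core works purely with a binary "is less" predicate.
fn sort_core<T, F>(v: &mut [T], mut is_less: F)
    where F: FnMut(&T, &T) -> bool
{
    // A simple insertion sort is enough to show the shape of the inner loop:
    // every decision is a single boolean test instead of a match on `Ordering`.
    for i in 1..v.len() {
        let mut j = i;
        while j > 0 && is_less(&v[j], &v[j - 1]) {
            v.swap(j - 1, j);
            j -= 1;
        }
    }
}

// The public-facing adapter reduces a ternary comparator to "strictly less" once,
// so the hot loops never see an `Ordering`.
fn sort_by_adapter<T, F>(v: &mut [T], mut compare: F)
    where F: FnMut(&T, &T) -> Ordering
{
    sort_core(v, |a, b| compare(a, b) == Less);
}

fn main() {
    let mut data = vec![3, 1, 4, 1, 5, 9, 2, 6];
    sort_by_adapter(&mut data, |a, b| a.cmp(b));
    assert_eq!(data, vec![1, 1, 2, 3, 4, 5, 6, 9]);
}
```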

Benchmark:

```
name                                        before ns/iter        after ns/iter         diff ns/iter   diff %
slice::bench::sort_large_ascending          8,969 (8919 MB/s)     7,410 (10796 MB/s)          -1,559  -17.38%
slice::bench::sort_large_big_ascending      355,640 (3599 MB/s)   359,137 (3564 MB/s)          3,497    0.98%
slice::bench::sort_large_big_descending     427,112 (2996 MB/s)   424,721 (3013 MB/s)         -2,391   -0.56%
slice::bench::sort_large_big_random         2,207,799 (579 MB/s)  2,138,804 (598 MB/s)       -68,995   -3.13%
slice::bench::sort_large_descending         13,694 (5841 MB/s)    13,514 (5919 MB/s)            -180   -1.31%
slice::bench::sort_large_mostly_ascending   239,697 (333 MB/s)    203,542 (393 MB/s)         -36,155  -15.08%
slice::bench::sort_large_mostly_descending  270,102 (296 MB/s)    234,263 (341 MB/s)         -35,839  -13.27%
slice::bench::sort_large_random             513,406 (155 MB/s)    470,084 (170 MB/s)         -43,322   -8.44%
slice::bench::sort_large_random_expensive   23,650,321 (3 MB/s)   23,675,098 (3 MB/s)         24,777    0.10%
slice::bench::sort_medium_ascending         143 (5594 MB/s)       132 (6060 MB/s)                -11   -7.69%
slice::bench::sort_medium_descending        197 (4060 MB/s)       188 (4255 MB/s)                 -9   -4.57%
slice::bench::sort_medium_random            3,358 (238 MB/s)      3,271 (244 MB/s)               -87   -2.59%
slice::bench::sort_small_ascending          32 (2500 MB/s)        32 (2500 MB/s)                   0    0.00%
slice::bench::sort_small_big_ascending      97 (13195 MB/s)       97 (13195 MB/s)                  0    0.00%
slice::bench::sort_small_big_descending     247 (5182 MB/s)       249 (5140 MB/s)                  2    0.81%
slice::bench::sort_small_big_random         502 (2549 MB/s)       498 (2570 MB/s)                 -4   -0.80%
slice::bench::sort_small_descending         55 (1454 MB/s)        61 (1311 MB/s)                   6   10.91%
slice::bench::sort_small_random             358 (223 MB/s)        356 (224 MB/s)                  -2   -0.56%
```
frewsxcv committed Feb 5, 2017
2 parents ec47f9d + fa457bf commit 78bf59b
Showing 2 changed files with 39 additions and 38 deletions.
68 changes: 36 additions & 32 deletions src/libcollections/slice.rs
@@ -98,7 +98,7 @@
 #![cfg_attr(test, allow(unused_imports, dead_code))]
 
 use alloc::boxed::Box;
-use core::cmp::Ordering::{self, Greater};
+use core::cmp::Ordering::{self, Less};
 use core::mem::size_of;
 use core::mem;
 use core::ptr;
@@ -1089,7 +1089,7 @@ impl<T> [T] {
     pub fn sort(&mut self)
         where T: Ord
     {
-        self.sort_by(|a, b| a.cmp(b))
+        merge_sort(self, |a, b| a.lt(b));
     }
 
     /// Sorts the slice using `f` to extract a key to compare elements by.
@@ -1119,7 +1119,7 @@ impl<T> [T] {
     pub fn sort_by_key<B, F>(&mut self, mut f: F)
         where F: FnMut(&T) -> B, B: Ord
     {
-        self.sort_by(|a, b| f(a).cmp(&f(b)))
+        merge_sort(self, |a, b| f(a).lt(&f(b)));
     }
 
     /// Sorts the slice using `compare` to compare elements.
@@ -1149,10 +1149,10 @@ impl<T> [T] {
     /// ```
     #[stable(feature = "rust1", since = "1.0.0")]
     #[inline]
-    pub fn sort_by<F>(&mut self, compare: F)
+    pub fn sort_by<F>(&mut self, mut compare: F)
         where F: FnMut(&T, &T) -> Ordering
     {
-        merge_sort(self, compare)
+        merge_sort(self, |a, b| compare(a, b) == Less);
     }
 
     /// Copies the elements from `src` into `self`.
@@ -1355,10 +1355,10 @@ impl<T: Clone> ToOwned for [T] {
 /// Inserts `v[0]` into pre-sorted sequence `v[1..]` so that whole `v[..]` becomes sorted.
 ///
 /// This is the integral subroutine of insertion sort.
-fn insert_head<T, F>(v: &mut [T], compare: &mut F)
-    where F: FnMut(&T, &T) -> Ordering
+fn insert_head<T, F>(v: &mut [T], is_less: &mut F)
+    where F: FnMut(&T, &T) -> bool
 {
-    if v.len() >= 2 && compare(&v[0], &v[1]) == Greater {
+    if v.len() >= 2 && is_less(&v[1], &v[0]) {
         unsafe {
             // There are three ways to implement insertion here:
             //
@@ -1381,12 +1381,12 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
 
             // Intermediate state of the insertion process is always tracked by `hole`, which
             // serves two purposes:
-            // 1. Protects integrity of `v` from panics in `compare`.
+            // 1. Protects integrity of `v` from panics in `is_less`.
             // 2. Fills the remaining hole in `v` in the end.
             //
             // Panic safety:
             //
-            // If `compare` panics at any point during the process, `hole` will get dropped and
+            // If `is_less` panics at any point during the process, `hole` will get dropped and
             // fill the hole in `v` with `tmp`, thus ensuring that `v` still holds every object it
             // initially held exactly once.
             let mut hole = InsertionHole {
@@ -1396,7 +1396,7 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
             ptr::copy_nonoverlapping(&v[1], &mut v[0], 1);
 
             for i in 2..v.len() {
-                if compare(&tmp.value, &v[i]) != Greater {
+                if !is_less(&v[i], &tmp.value) {
                     break;
                 }
                 ptr::copy_nonoverlapping(&v[i], &mut v[i - 1], 1);
@@ -1432,8 +1432,8 @@ fn insert_head<T, F>(v: &mut [T], compare: &mut F)
 ///
 /// The two slices must be non-empty and `mid` must be in bounds. Buffer `buf` must be long enough
 /// to hold a copy of the shorter slice. Also, `T` must not be a zero-sized type.
-unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
-    where F: FnMut(&T, &T) -> Ordering
+unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, is_less: &mut F)
+    where F: FnMut(&T, &T) -> bool
 {
     let len = v.len();
     let v = v.as_mut_ptr();
@@ -1449,12 +1449,12 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
     // hole in `v`.
     //
     // Intermediate state of the process is always tracked by `hole`, which serves two purposes:
-    // 1. Protects integrity of `v` from panics in `compare`.
+    // 1. Protects integrity of `v` from panics in `is_less`.
     // 2. Fills the remaining hole in `v` if the longer run gets consumed first.
     //
     // Panic safety:
     //
-    // If `compare` panics at any point during the process, `hole` will get dropped and fill the
+    // If `is_less` panics at any point during the process, `hole` will get dropped and fill the
     // hole in `v` with the unconsumed range in `buf`, thus ensuring that `v` still holds every
     // object it initially held exactly once.
     let mut hole;
@@ -1476,7 +1476,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
         while *left < hole.end && right < v_end {
             // Consume the lesser side.
             // If equal, prefer the left run to maintain stability.
-            let to_copy = if compare(&**left, &*right) == Greater {
+            let to_copy = if is_less(&*right, &**left) {
                 get_and_increment(&mut right)
             } else {
                 get_and_increment(left)
@@ -1500,7 +1500,7 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
         while v < *left && buf < *right {
             // Consume the greater side.
             // If equal, prefer the right run to maintain stability.
-            let to_copy = if compare(&*left.offset(-1), &*right.offset(-1)) == Greater {
+            let to_copy = if is_less(&*right.offset(-1), &*left.offset(-1)) {
                 decrement_and_get(left)
             } else {
                 decrement_and_get(right)
@@ -1550,8 +1550,8 @@ unsafe fn merge<T, F>(v: &mut [T], mid: usize, buf: *mut T, compare: &mut F)
 /// 2. for every `i` in `2..runs.len()`: `runs[i - 2].len > runs[i - 1].len + runs[i].len`
 ///
 /// The invariants ensure that the total running time is `O(n log n)` worst-case.
-fn merge_sort<T, F>(v: &mut [T], mut compare: F)
-    where F: FnMut(&T, &T) -> Ordering
+fn merge_sort<T, F>(v: &mut [T], mut is_less: F)
+    where F: FnMut(&T, &T) -> bool
 {
     // Sorting has no meaningful behavior on zero-sized types.
     if size_of::<T>() == 0 {
@@ -1565,7 +1565,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
     //
     // Short runs are extended using insertion sort to span at least `min_run` elements, in order
     // to improve performance.
-    let (max_insertion, min_run) = if size_of::<T>() <= 16 {
+    let (max_insertion, min_run) = if size_of::<T>() <= 2 * mem::size_of::<usize>() {
         (64, 32)
     } else {
         (32, 16)
@@ -1577,15 +1577,15 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
     if len <= max_insertion {
         if len >= 2 {
             for i in (0..len-1).rev() {
-                insert_head(&mut v[i..], &mut compare);
+                insert_head(&mut v[i..], &mut is_less);
             }
         }
         return;
     }
 
     // Allocate a buffer to use as scratch memory. We keep the length 0 so we can keep in it
     // shallow copies of the contents of `v` without risking the dtors running on copies if
-    // `compare` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
+    // `is_less` panics. When merging two sorted runs, this buffer holds a copy of the shorter run,
     // which will always have length at most `len / 2`.
     let mut buf = Vec::with_capacity(len / 2);
 
@@ -1600,14 +1600,18 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
         let mut start = end - 1;
         if start > 0 {
             start -= 1;
-            if compare(&v[start], &v[start + 1]) == Greater {
-                while start > 0 && compare(&v[start - 1], &v[start]) == Greater {
-                    start -= 1;
-                }
-                v[start..end].reverse();
-            } else {
-                while start > 0 && compare(&v[start - 1], &v[start]) != Greater {
-                    start -= 1;
+            unsafe {
+                if is_less(v.get_unchecked(start + 1), v.get_unchecked(start)) {
+                    while start > 0 && is_less(v.get_unchecked(start),
+                                               v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
+                    v[start..end].reverse();
+                } else {
+                    while start > 0 && !is_less(v.get_unchecked(start),
+                                                v.get_unchecked(start - 1)) {
+                        start -= 1;
+                    }
                 }
             }
         }
@@ -1616,7 +1620,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
         // merge sort on short sequences, so this significantly improves performance.
         while start > 0 && end - start < min_run {
             start -= 1;
-            insert_head(&mut v[start..end], &mut compare);
+            insert_head(&mut v[start..end], &mut is_less);
         }
 
         // Push this run onto the stack.
@@ -1632,7 +1636,7 @@ fn merge_sort<T, F>(v: &mut [T], mut compare: F)
             let right = runs[r];
             unsafe {
                 merge(&mut v[left.start .. right.start + right.len], left.len, buf.as_mut_ptr(),
-                      &mut compare);
+                      &mut is_less);
             }
             runs[r] = Run {
                 start: left.start,
9 changes: 3 additions & 6 deletions src/libcollectionstest/slice.rs
@@ -1429,18 +1429,15 @@ mod bench {
     fn sort_large_random_expensive(b: &mut Bencher) {
         let len = 10000;
         b.iter(|| {
+            let mut v = gen_random(len);
             let mut count = 0;
-            let cmp = move |a: &u64, b: &u64| {
+            v.sort_by(|a: &u64, b: &u64| {
                 count += 1;
                 if count % 1_000_000_000 == 0 {
                     panic!("should not happen");
                 }
                 (*a as f64).cos().partial_cmp(&(*b as f64).cos()).unwrap()
-            };
-
-            let mut v = gen_random(len);
-            v.sort_by(cmp);
-
+            });
             black_box(count);
         });
         b.bytes = len as u64 * mem::size_of::<u64>() as u64;
