Skip to content

Commit b954d4d

Browse files
committed
Rollup merge of rust-lang#52051 - scottmcm:swap-directly, r=alexcrichton
mem::swap the obvious way for types smaller than the SIMD optimization's block size

LLVM isn't able to remove the alloca for the unaligned block in the post-SIMD tail in some cases, so doing this helps SRoA work in cases where it currently doesn't. Found in the `replace_with` RFC discussion.

Examples of the improvements:

<details>
<summary>swapping `[u16; 3]` takes 1/3 fewer instructions and no stackalloc</summary>

```rust
type Demo = [u16; 3];
pub fn swap_demo(x: &mut Demo, y: &mut Demo) {
    std::mem::swap(x, y);
}
```

nightly:

```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
    .seh_proc _ZN4blah9swap_demo17ha1732a9b71393a7eE
    sub rsp, 32
    .seh_stackalloc 32
    .seh_endprologue
    movzx eax, word ptr [rcx + 4]
    mov word ptr [rsp + 4], ax
    mov eax, dword ptr [rcx]
    mov dword ptr [rsp], eax
    movzx eax, word ptr [rdx + 4]
    mov word ptr [rcx + 4], ax
    mov eax, dword ptr [rdx]
    mov dword ptr [rcx], eax
    movzx eax, word ptr [rsp + 4]
    mov word ptr [rdx + 4], ax
    mov eax, dword ptr [rsp]
    mov dword ptr [rdx], eax
    add rsp, 32
    ret
    .seh_handlerdata
    .section .text,"xr",one_only,_ZN4blah9swap_demo17ha1732a9b71393a7eE
    .seh_endproc
```

this PR:

```asm
_ZN4blah9swap_demo17ha1732a9b71393a7eE:
    mov r8d, dword ptr [rcx]
    movzx r9d, word ptr [rcx + 4]
    movzx eax, word ptr [rdx + 4]
    mov word ptr [rcx + 4], ax
    mov eax, dword ptr [rdx]
    mov dword ptr [rcx], eax
    mov word ptr [rdx + 4], r9w
    mov dword ptr [rdx], r8d
    ret
```

</details>

<details>
<summary>`replace_with` optimizes down much better</summary>

Inspired by rust-lang/rfcs#2490,

```rust
fn replace_with<T, F>(x: &mut Option<T>, f: F)
    where F: FnOnce(Option<T>) -> Option<T>
{
    *x = f(x.take());
}

pub fn inc_opt(mut x: &mut Option<i32>) {
    replace_with(&mut x, |i| i.map(|j| j + 1));
}
```

Rust 1.26.0:

```asm
_ZN4blah7inc_opt17heb0acb64c51777cfE:
    mov rax, qword ptr [rcx]
    movabs r8, 4294967296
    add r8, rax
    shl rax, 32
    movabs rdx, -4294967296
    and rdx, r8
    xor r8d, r8d
    test rax, rax
    cmove rdx, rax
    setne r8b
    or rdx, r8
    mov qword ptr [rcx], rdx
    ret
```

Nightly (better thanks to ScalarPair, maybe?):

```asm
_ZN4blah7inc_opt17h66df690be0b5899dE:
    mov r8, qword ptr [rcx]
    mov rdx, r8
    shr rdx, 32
    xor eax, eax
    test r8d, r8d
    setne al
    add edx, 1
    mov dword ptr [rcx], eax
    mov dword ptr [rcx + 4], edx
    ret
```

This PR:

```asm
_ZN4blah7inc_opt17h1426dc215ecbdb19E:
    xor eax, eax
    cmp dword ptr [rcx], 0
    setne al
    mov dword ptr [rcx], eax
    add dword ptr [rcx + 4], 1
    ret
```

Where that add is beautiful -- using an addressing mode to not even need to explicitly go through a register -- and the remaining imperfection is well-known (rust-lang#49420 (comment)).

</details>
2 parents 7019cce + c9482f7 commit b954d4d

File tree

3 files changed

+41
-1
lines changed

3 files changed

+41
-1
lines changed

src/libcore/mem.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -638,7 +638,7 @@ pub unsafe fn uninitialized<T>() -> T {
638638
#[stable(feature = "rust1", since = "1.0.0")]
639639
pub fn swap<T>(x: &mut T, y: &mut T) {
640640
unsafe {
641-
ptr::swap_nonoverlapping(x, y, 1);
641+
ptr::swap_nonoverlapping_one(x, y);
642642
}
643643
}
644644

src/libcore/ptr.rs

+13
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,19 @@ pub unsafe fn swap_nonoverlapping<T>(x: *mut T, y: *mut T, count: usize) {
187187
swap_nonoverlapping_bytes(x, y, len)
188188
}
189189

190+
#[inline]
191+
pub(crate) unsafe fn swap_nonoverlapping_one<T>(x: *mut T, y: *mut T) {
192+
// For types smaller than the block optimization below,
193+
// just swap directly to avoid pessimizing codegen.
194+
if mem::size_of::<T>() < 32 {
195+
let z = read(x);
196+
copy_nonoverlapping(y, x, 1);
197+
write(y, z);
198+
} else {
199+
swap_nonoverlapping(x, y, 1);
200+
}
201+
}
202+
190203
#[inline]
191204
unsafe fn swap_nonoverlapping_bytes(x: *mut u8, y: *mut u8, len: usize) {
192205
// The approach here is to utilize simd to swap x & y efficiently. Testing reveals

src/test/codegen/swap-small-types.rs

+27
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
// Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2+
// file at the top-level directory of this distribution and at
3+
// http://rust-lang.org/COPYRIGHT.
4+
//
5+
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6+
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7+
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8+
// option. This file may not be copied, modified, or distributed
9+
// except according to those terms.
10+
11+
// compile-flags: -O
12+
// only-x86_64
13+
14+
#![crate_type = "lib"]
15+
16+
use std::mem::swap;
17+
18+
type RGB48 = [u16; 3];
19+
20+
// CHECK-LABEL: @swap_rgb48
21+
#[no_mangle]
22+
pub fn swap_rgb48(x: &mut RGB48, y: &mut RGB48) {
23+
// CHECK-NOT: alloca
24+
// CHECK: load i48
25+
// CHECK: store i48
26+
swap(x, y)
27+
}

0 commit comments

Comments
 (0)