From d7d67ad14b7c69325a3377bea83b9919398f9588 Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Mon, 23 Sep 2024 00:10:18 -0400 Subject: [PATCH 1/2] Add new implementation benchmark Add LONG benchmarks for more comparison between the methods --- library/core/benches/ascii/is_ascii.rs | 45 ++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/library/core/benches/ascii/is_ascii.rs b/library/core/benches/ascii/is_ascii.rs index 4b2920c5eb45f..417d3e0fcbfe7 100644 --- a/library/core/benches/ascii/is_ascii.rs +++ b/library/core/benches/ascii/is_ascii.rs @@ -10,9 +10,12 @@ macro_rules! benches { // Ensure we benchmark cases where the functions are called with strings // that are not perfectly aligned or have a length which is not a // multiple of size_of::<usize>() (or both) - benches!(mod unaligned_head MEDIUM[1..] $($name $arg $body)+); - benches!(mod unaligned_tail MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+); - benches!(mod unaligned_both MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+); + benches!(mod unaligned_head_medium MEDIUM[1..] $($name $arg $body)+); + benches!(mod unaligned_tail_medium MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+); + benches!(mod unaligned_both_medium MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+); + benches!(mod unaligned_head_long LONG[1..] $($name $arg $body)+); + benches!(mod unaligned_tail_long LONG[..(LONG.len() - 1)] $($name $arg $body)+); + benches!(mod unaligned_both_long LONG[1..(LONG.len() - 1)] $($name $arg $body)+); }; (mod $mod_name: ident $input: ident [$range: expr] $($name: ident $arg: ident $body: block)+) => { @@ -49,6 +52,42 @@ benches! 
{ fn case03_align_to_unrolled(bytes: &[u8]) { is_ascii_align_to_unrolled(bytes) } + + fn case04_while_loop(bytes: &[u8]) { + // Constant chosen to enable `pmovmskb` instruction on x86-64 + const N: usize = 32; + + let mut i = 0; + + while i + N <= bytes.len() { + let chunk_end = i + N; + + // Get LLVM to produce a `pmovmskb` instruction on x86-64 which + // creates a mask from the most significant bit of each byte. + // ASCII bytes are less than 128 (0x80), so their most significant + // bit is unset. Thus, detecting non-ASCII bytes can be done in one + // instruction. + let mut count = 0; + while i < chunk_end { + count += (bytes[i] <= 127) as u8; + i += 1; + } + + // All bytes should be <= 127 so count is equal to chunk size. + if count != N as u8 { + return false; + } + } + + // Process the remaining `bytes.len() % N` bytes. + let mut is_ascii = true; + while i < bytes.len() { + is_ascii &= bytes[i] <= 127; + i += 1; + } + + is_ascii + } } // These are separate since it's easier to debug errors if they don't go through From 1b5c02b7578879ebfcd54fdc6a4f86c49d2d9ecd Mon Sep 17 00:00:00 2001 From: okaneco <47607823+okaneco@users.noreply.github.com> Date: Thu, 26 Sep 2024 19:39:14 -0400 Subject: [PATCH 2/2] Add `is_ascii` function optimized for x86-64 for [u8] The new `is_ascii` function is optimized to use the `pmovmskb` vector instruction which tests the high bit in a lane. This corresponds to the same check of whether a byte is ASCII so ASCII validity checking can be vectorized. This instruction does not exist on other platforms, so the new implementation is likely to regress performance there; it is therefore gated to all(target_arch = "x86_64", target_feature = "sse2"). 
Add codegen test Remove crate::mem import for functions included in the prelude --- library/core/benches/ascii/is_ascii.rs | 20 ++++---- library/core/src/slice/ascii.rs | 70 +++++++++++++++++++++----- tests/codegen/slice-is-ascii.rs | 16 ++++++ 3 files changed, 85 insertions(+), 21 deletions(-) create mode 100644 tests/codegen/slice-is-ascii.rs diff --git a/library/core/benches/ascii/is_ascii.rs b/library/core/benches/ascii/is_ascii.rs index 417d3e0fcbfe7..ced7084fb0e48 100644 --- a/library/core/benches/ascii/is_ascii.rs +++ b/library/core/benches/ascii/is_ascii.rs @@ -54,27 +54,29 @@ benches! { } fn case04_while_loop(bytes: &[u8]) { - // Constant chosen to enable `pmovmskb` instruction on x86-64 - const N: usize = 32; + // Process chunks of 32 bytes at a time in the fast path to enable + // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers + // can be OR'd together and then the resulting vector can be tested for + // non-ASCII bytes. + const CHUNK_SIZE: usize = 32; let mut i = 0; - while i + N <= bytes.len() { - let chunk_end = i + N; + while i + CHUNK_SIZE <= bytes.len() { + let chunk_end = i + CHUNK_SIZE; // Get LLVM to produce a `pmovmskb` instruction on x86-64 which // creates a mask from the most significant bit of each byte. // ASCII bytes are less than 128 (0x80), so their most significant - // bit is unset. Thus, detecting non-ASCII bytes can be done in one - // instruction. + // bit is unset. let mut count = 0; while i < chunk_end { - count += (bytes[i] <= 127) as u8; + count += bytes[i].is_ascii() as u8; i += 1; } // All bytes should be <= 127 so count is equal to chunk size. - if count != N as u8 { + if count != CHUNK_SIZE as u8 { return false; } } @@ -82,7 +84,7 @@ benches! { // Process the remaining `bytes.len() % N` bytes. 
let mut is_ascii = true; while i < bytes.len() { - is_ascii &= bytes[i] <= 127; + is_ascii &= bytes[i].is_ascii(); i += 1; } diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs index 58ba3a1573a81..a50e010955dee 100644 --- a/library/core/src/slice/ascii.rs +++ b/library/core/src/slice/ascii.rs @@ -3,8 +3,9 @@ use core::ascii::EscapeDefault; use crate::fmt::{self, Write}; +#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))] use crate::intrinsics::const_eval_select; -use crate::{ascii, iter, mem, ops}; +use crate::{ascii, iter, ops}; #[cfg(not(test))] impl [u8] { @@ -308,14 +309,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> { } } -/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed -/// from `../str/mod.rs`, which does something similar for utf8 validation. -#[inline] -const fn contains_nonascii(v: usize) -> bool { - const NONASCII_MASK: usize = usize::repeat_u8(0x80); - (NONASCII_MASK & v) != 0 -} - /// ASCII test *without* the chunk-at-a-time optimizations. /// /// This is carefully structured to produce nice small code -- it's smaller in @@ -346,6 +339,7 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool { /// /// If any of these loads produces something for which `contains_nonascii` /// (above) returns true, then we know the answer is false. +#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))] #[inline] #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior const fn is_ascii(s: &[u8]) -> bool { @@ -356,7 +350,14 @@ const fn is_ascii(s: &[u8]) -> bool { if const { is_ascii_simple(s) } else { - const USIZE_SIZE: usize = mem::size_of::<usize>(); + /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed + /// from `../str/mod.rs`, which does something similar for utf8 validation. 
+ const fn contains_nonascii(v: usize) -> bool { + const NONASCII_MASK: usize = usize::repeat_u8(0x80); + (NONASCII_MASK & v) != 0 + } + + const USIZE_SIZE: usize = size_of::<usize>(); let len = s.len(); let align_offset = s.as_ptr().align_offset(USIZE_SIZE); @@ -366,7 +367,7 @@ const fn is_ascii(s: &[u8]) -> bool { // // We also do this for architectures where `size_of::<usize>()` isn't // sufficient alignment for `usize`, because it's a weird edge case. - if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() { + if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() { return is_ascii_simple(s); } @@ -400,7 +401,7 @@ const fn is_ascii(s: &[u8]) -> bool { // have alignment information it should have given a `usize::MAX` for // `align_offset` earlier, sending things through the scalar path instead of // this one, so this check should pass if it's reachable. - debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>())); + debug_assert!(word_ptr.is_aligned_to(align_of::<usize>())); // Read subsequent words until the last aligned word, excluding the last // aligned word by itself to be done in tail check later, to ensure that @@ -435,3 +436,48 @@ const fn is_ascii(s: &[u8]) -> bool { } ) } + +/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64` +/// platforms. +/// +/// Other platforms are not likely to benefit from this code structure, so they +/// use SWAR techniques to test for ASCII in `usize`-sized chunks. +#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))] +#[inline] +const fn is_ascii(bytes: &[u8]) -> bool { + // Process chunks of 32 bytes at a time in the fast path to enable + // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers + // can be OR'd together and then the resulting vector can be tested for + // non-ASCII bytes. 
+ const CHUNK_SIZE: usize = 32; + + let mut i = 0; + + while i + CHUNK_SIZE <= bytes.len() { + let chunk_end = i + CHUNK_SIZE; + + // Get LLVM to produce a `pmovmskb` instruction on x86-64 which + // creates a mask from the most significant bit of each byte. + // ASCII bytes are less than 128 (0x80), so their most significant + // bit is unset. + let mut count = 0; + while i < chunk_end { + count += bytes[i].is_ascii() as u8; + i += 1; + } + + // All bytes should be <= 127 so count is equal to chunk size. + if count != CHUNK_SIZE as u8 { + return false; + } + } + + // Process the remaining `bytes.len() % CHUNK_SIZE` bytes. + let mut is_ascii = true; + while i < bytes.len() { + is_ascii &= bytes[i].is_ascii(); + i += 1; + } + + is_ascii +} diff --git a/tests/codegen/slice-is-ascii.rs b/tests/codegen/slice-is-ascii.rs new file mode 100644 index 0000000000000..b1e97154609b2 --- /dev/null +++ b/tests/codegen/slice-is-ascii.rs @@ -0,0 +1,16 @@ +//@ only-x86_64 +//@ compile-flags: -C opt-level=3 +#![crate_type = "lib"] + +/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction. +/// Platforms lacking an equivalent instruction use other techniques for +/// optimizing `is_ascii`. +// CHECK-LABEL: @is_ascii_autovectorized +#[no_mangle] +pub fn is_ascii_autovectorized(s: &[u8]) -> bool { + // CHECK: load <32 x i8> + // CHECK-NEXT: icmp slt <32 x i8> + // CHECK-NEXT: bitcast <32 x i1> + // CHECK-NEXT: icmp eq i32 + s.is_ascii() +}