From d7d67ad14b7c69325a3377bea83b9919398f9588 Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Mon, 23 Sep 2024 00:10:18 -0400
Subject: [PATCH 1/2] Add new implementation benchmark

Add LONG benchmarks for more comparison between the methods
---
 library/core/benches/ascii/is_ascii.rs | 45 ++++++++++++++++++++++++--
 1 file changed, 42 insertions(+), 3 deletions(-)
diff --git a/library/core/benches/ascii/is_ascii.rs b/library/core/benches/ascii/is_ascii.rs
index 4b2920c5eb45f..417d3e0fcbfe7 100644
--- a/library/core/benches/ascii/is_ascii.rs
+++ b/library/core/benches/ascii/is_ascii.rs
@@ -10,9 +10,12 @@ macro_rules! benches {
         // Ensure we benchmark cases where the functions are called with strings
         // that are not perfectly aligned or have a length which is not a
         // multiple of size_of::<usize>() (or both)
-        benches!(mod unaligned_head MEDIUM[1..] $($name $arg $body)+);
-        benches!(mod unaligned_tail MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+);
-        benches!(mod unaligned_both MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+);
+        benches!(mod unaligned_head_medium MEDIUM[1..] $($name $arg $body)+);
+        benches!(mod unaligned_tail_medium MEDIUM[..(MEDIUM.len() - 1)] $($name $arg $body)+);
+        benches!(mod unaligned_both_medium MEDIUM[1..(MEDIUM.len() - 1)] $($name $arg $body)+);
+        benches!(mod unaligned_head_long LONG[1..] $($name $arg $body)+);
+        benches!(mod unaligned_tail_long LONG[..(LONG.len() - 1)] $($name $arg $body)+);
+        benches!(mod unaligned_both_long LONG[1..(LONG.len() - 1)] $($name $arg $body)+);
     };
 
     (mod $mod_name: ident $input: ident [$range: expr] $($name: ident $arg: ident $body: block)+) => {
@@ -49,6 +52,42 @@ benches! {
     fn case03_align_to_unrolled(bytes: &[u8]) {
         is_ascii_align_to_unrolled(bytes)
     }
+
+    fn case04_while_loop(bytes: &[u8]) {
+        // Constant chosen to enable `pmovmskb` instruction on x86-64
+        const N: usize = 32;
+
+        let mut i = 0;
+
+        while i + N <= bytes.len() {
+            let chunk_end = i + N;
+
+            // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+            // creates a mask from the most significant bit of each byte.
+            // ASCII bytes are less than 128 (0x80), so their most significant
+            // bit is unset. Thus, detecting non-ASCII bytes can be done in one
+            // instruction.
+            let mut count = 0;
+            while i < chunk_end {
+                count += (bytes[i] <= 127) as u8;
+                i += 1;
+            }
+
+            // All bytes should be <= 127 so count is equal to chunk size.
+            if count != N as u8 {
+                return false;
+            }
+        }
+
+        // Process the remaining `bytes.len() % N` bytes.
+        let mut is_ascii = true;
+        while i < bytes.len() {
+            is_ascii &= bytes[i] <= 127;
+            i += 1;
+        }
+
+        is_ascii
+    }
 }
 
 // These are separate since it's easier to debug errors if they don't go through

From 1b5c02b7578879ebfcd54fdc6a4f86c49d2d9ecd Mon Sep 17 00:00:00 2001
From: okaneco <47607823+okaneco@users.noreply.github.com>
Date: Thu, 26 Sep 2024 19:39:14 -0400
Subject: [PATCH 2/2] Add `is_ascii` function optimized for x86-64 for [u8]

The new `is_ascii` function is structured so that it can be auto-vectorized
to use the `pmovmskb` vector instruction, which builds a mask from the high
bit of each byte lane. A byte is ASCII exactly when its high bit is unset,
so ASCII validity checking can be vectorized. This instruction
does not exist on other platforms, where the new code is likely to regress
performance, so it is gated to all(target_arch = "x86_64", target_feature = "sse2").

Add codegen test
Remove crate::mem import for functions included in the prelude
---
 library/core/benches/ascii/is_ascii.rs | 20 ++++----
 library/core/src/slice/ascii.rs        | 70 +++++++++++++++++++++-----
 tests/codegen/slice-is-ascii.rs        | 16 ++++++
 3 files changed, 85 insertions(+), 21 deletions(-)
 create mode 100644 tests/codegen/slice-is-ascii.rs

diff --git a/library/core/benches/ascii/is_ascii.rs b/library/core/benches/ascii/is_ascii.rs
index 417d3e0fcbfe7..ced7084fb0e48 100644
--- a/library/core/benches/ascii/is_ascii.rs
+++ b/library/core/benches/ascii/is_ascii.rs
@@ -54,27 +54,29 @@ benches! {
     }
 
     fn case04_while_loop(bytes: &[u8]) {
-        // Constant chosen to enable `pmovmskb` instruction on x86-64
-        const N: usize = 32;
+        // Process chunks of 32 bytes at a time in the fast path to enable
+        // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+        // can be OR'd together and then the resulting vector can be tested for
+        // non-ASCII bytes.
+        const CHUNK_SIZE: usize = 32;
 
         let mut i = 0;
 
-        while i + N <= bytes.len() {
-            let chunk_end = i + N;
+        while i + CHUNK_SIZE <= bytes.len() {
+            let chunk_end = i + CHUNK_SIZE;
 
             // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
             // creates a mask from the most significant bit of each byte.
             // ASCII bytes are less than 128 (0x80), so their most significant
-            // bit is unset. Thus, detecting non-ASCII bytes can be done in one
-            // instruction.
+            // bit is unset.
             let mut count = 0;
             while i < chunk_end {
-                count += (bytes[i] <= 127) as u8;
+                count += bytes[i].is_ascii() as u8;
                 i += 1;
             }
 
             // All bytes should be <= 127 so count is equal to chunk size.
-            if count != N as u8 {
+            if count != CHUNK_SIZE as u8 {
                 return false;
             }
         }
@@ -82,7 +84,7 @@ benches! {
         // Process the remaining `bytes.len() % N` bytes.
         let mut is_ascii = true;
         while i < bytes.len() {
-            is_ascii &= bytes[i] <= 127;
+            is_ascii &= bytes[i].is_ascii();
             i += 1;
         }
 
diff --git a/library/core/src/slice/ascii.rs b/library/core/src/slice/ascii.rs
index 58ba3a1573a81..a50e010955dee 100644
--- a/library/core/src/slice/ascii.rs
+++ b/library/core/src/slice/ascii.rs
@@ -3,8 +3,9 @@
 use core::ascii::EscapeDefault;
 
 use crate::fmt::{self, Write};
+#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 use crate::intrinsics::const_eval_select;
-use crate::{ascii, iter, mem, ops};
+use crate::{ascii, iter, ops};
 
 #[cfg(not(test))]
 impl [u8] {
@@ -308,14 +309,6 @@ impl<'a> fmt::Debug for EscapeAscii<'a> {
     }
 }
 
-/// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
-/// from `../str/mod.rs`, which does something similar for utf8 validation.
-#[inline]
-const fn contains_nonascii(v: usize) -> bool {
-    const NONASCII_MASK: usize = usize::repeat_u8(0x80);
-    (NONASCII_MASK & v) != 0
-}
-
 /// ASCII test *without* the chunk-at-a-time optimizations.
 ///
 /// This is carefully structured to produce nice small code -- it's smaller in
@@ -346,6 +339,7 @@ pub const fn is_ascii_simple(mut bytes: &[u8]) -> bool {
 ///
 /// If any of these loads produces something for which `contains_nonascii`
 /// (above) returns true, then we know the answer is false.
+#[cfg(not(all(target_arch = "x86_64", target_feature = "sse2")))]
 #[inline]
 #[rustc_allow_const_fn_unstable(const_eval_select)] // fallback impl has same behavior
 const fn is_ascii(s: &[u8]) -> bool {
@@ -356,7 +350,14 @@ const fn is_ascii(s: &[u8]) -> bool {
         if const {
             is_ascii_simple(s)
         } else {
-            const USIZE_SIZE: usize = mem::size_of::<usize>();
+            /// Returns `true` if any byte in the word `v` is nonascii (>= 128). Snarfed
+            /// from `../str/mod.rs`, which does something similar for utf8 validation.
+            const fn contains_nonascii(v: usize) -> bool {
+                const NONASCII_MASK: usize = usize::repeat_u8(0x80);
+                (NONASCII_MASK & v) != 0
+            }
+
+            const USIZE_SIZE: usize = size_of::<usize>();
 
             let len = s.len();
             let align_offset = s.as_ptr().align_offset(USIZE_SIZE);
@@ -366,7 +367,7 @@ const fn is_ascii(s: &[u8]) -> bool {
             //
             // We also do this for architectures where `size_of::<usize>()` isn't
             // sufficient alignment for `usize`, because it's a weird edge case.
-            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < mem::align_of::<usize>() {
+            if len < USIZE_SIZE || len < align_offset || USIZE_SIZE < align_of::<usize>() {
                 return is_ascii_simple(s);
             }
 
@@ -400,7 +401,7 @@ const fn is_ascii(s: &[u8]) -> bool {
             // have alignment information it should have given a `usize::MAX` for
             // `align_offset` earlier, sending things through the scalar path instead of
             // this one, so this check should pass if it's reachable.
-            debug_assert!(word_ptr.is_aligned_to(mem::align_of::<usize>()));
+            debug_assert!(word_ptr.is_aligned_to(align_of::<usize>()));
 
             // Read subsequent words until the last aligned word, excluding the last
             // aligned word by itself to be done in tail check later, to ensure that
@@ -435,3 +436,48 @@ const fn is_ascii(s: &[u8]) -> bool {
         }
     )
 }
+
+/// ASCII test optimized to use the `pmovmskb` instruction available on `x86-64`
+/// platforms.
+///
+/// Other platforms are not likely to benefit from this code structure, so they
+/// use SWAR techniques to test for ASCII in `usize`-sized chunks.
+#[cfg(all(target_arch = "x86_64", target_feature = "sse2"))]
+#[inline]
+const fn is_ascii(bytes: &[u8]) -> bool {
+    // Process chunks of 32 bytes at a time in the fast path to enable
+    // auto-vectorization and use of `pmovmskb`. Two 128-bit vector registers
+    // can be OR'd together and then the resulting vector can be tested for
+    // non-ASCII bytes.
+    const CHUNK_SIZE: usize = 32;
+
+    let mut i = 0;
+
+    while i + CHUNK_SIZE <= bytes.len() {
+        let chunk_end = i + CHUNK_SIZE;
+
+        // Get LLVM to produce a `pmovmskb` instruction on x86-64 which
+        // creates a mask from the most significant bit of each byte.
+        // ASCII bytes are less than 128 (0x80), so their most significant
+        // bit is unset.
+        let mut count = 0;
+        while i < chunk_end {
+            count += bytes[i].is_ascii() as u8;
+            i += 1;
+        }
+
+        // All bytes should be <= 127 so count is equal to chunk size.
+        if count != CHUNK_SIZE as u8 {
+            return false;
+        }
+    }
+
+    // Process the remaining `bytes.len() % CHUNK_SIZE` bytes.
+    let mut is_ascii = true;
+    while i < bytes.len() {
+        is_ascii &= bytes[i].is_ascii();
+        i += 1;
+    }
+
+    is_ascii
+}
diff --git a/tests/codegen/slice-is-ascii.rs b/tests/codegen/slice-is-ascii.rs
new file mode 100644
index 0000000000000..b1e97154609b2
--- /dev/null
+++ b/tests/codegen/slice-is-ascii.rs
@@ -0,0 +1,16 @@
+//@ only-x86_64
+//@ compile-flags: -C opt-level=3
+#![crate_type = "lib"]
+
+/// Check that the fast-path of `is_ascii` uses a `pmovmskb` instruction.
+/// Platforms lacking an equivalent instruction use other techniques for
+/// optimizing `is_ascii`.
+// CHECK-LABEL: @is_ascii_autovectorized
+#[no_mangle]
+pub fn is_ascii_autovectorized(s: &[u8]) -> bool {
+    // CHECK: load <32 x i8>
+    // CHECK-NEXT: icmp slt <32 x i8>
+    // CHECK-NEXT: bitcast <32 x i1>
+    // CHECK-NEXT: icmp eq i32
+    s.is_ascii()
+}