✨ v2.3.0

JumperBot · Jul 15, 2024 · bc4e6b4 · bc4e6b4
1 parent 4a323c2
commit bc4e6b4
Show file tree

Hide file tree

Showing 5 changed files with 104 additions and 96 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "whitespace-sifter"
-version = "2.2.0"
+version = "2.3.0"
 edition = "2021"
 authors = ["JumperBot_"]
 description = "Sift duplicate whitespaces away!"

diff --git a/README.md b/README.md
@@ -32,15 +32,13 @@ println!(
 ## ✨ Sift Duplicate Whitespaces In One Function Call
 
 This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.  
-Other than that, it naturally removes the whitespaces at the start and end of the `string`.
+It naturally removes the whitespaces at the start and end of the `string`.
 
 ---
 
 ## ⚡️Benchmarks
 
-Performance is one of the priorities of this crate.  
-One of the advises is to not listen to repository authors/maintainers when it comes to benchmarks.  
-You are free to run `cargo bench` on your machine after cloning this repository instead.  
+Performance is a priority; most updates are performance improvements.  
 The benchmark uses a transcript of the [Bee Movie](https://movies.fandom.com/wiki/Bee_Movie/Transcript).
 
 Execute these commands to benchmark:
@@ -54,13 +52,20 @@ $ cargo bench
 You should only look for results that look like the following:
 
 ```bash
-Sift/Sift               time:   [176.65 µs 177.11 µs 177.73 µs]
+Sift/Sift               time:   [159.31 µs 159.60 µs 159.95 µs]
 Sift Preserved/Sift Preserved
-                        time:   [242.64 µs 243.04 µs 243.79 µs]
+                        time:   [198.11 µs 198.21 µs 198.32 µs]
 ```
 
 In just 0.0001 seconds; Pretty impressive, no?  
-Go try it on a better machine, I guess.
+<details>
+<summary>Go try it on a better machine, I guess.</summary>
+Benchmark specifications:  
+<ul>
+<li>Processor: Intel(R) Core(TM) i5-8350U CPU @ 1.70GHz 1.90 GHz</li>
+<li>Memory: RAM 16.0 GB (15.8 GB usable)</li>
+<li>System: GNU/Linux 5.15.153.1-microsoft-standard-WSL2 x86_64</li>
+</ul>
+</details>
 
 ---
 

diff --git a/src/lib.rs b/src/lib.rs
@@ -1,6 +1,6 @@
 //! Sift duplicate whitespaces away in just one function call.
 //! This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.  
-//! Other than that, it naturally removes the whitespaces at the start and end of the `string`.
+//! It naturally removes the whitespaces at the start and end of the `string`.
 //!
 //! # Examples
 //!
@@ -20,6 +20,10 @@
 //! );
 //! ```
 
+mod unsafe_vec;
+
+use unsafe_vec::UnsafeVec;
+
 /// A trait containing all `string` whitespace-sifting functions.
 pub trait WhitespaceSifter: AsRef<str> {
     /// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
@@ -46,16 +50,16 @@ pub trait WhitespaceSifter: AsRef<str> {
         while ind < bytes.len() {
             crate::sift_preallocated_until_newline(bytes, &mut ind, &mut out);
         }
-        let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
-        if out_mut.len() > 1 {
-            if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(2)) } == CARRIAGE_RETURN
-            {
-                out_mut.pop();
-                out_mut.pop();
+        if out.len() > 1 {
+            let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
+            let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(2) };
+            if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == CARRIAGE_RETURN {
+                unsafe { out_mut.set_len(new_out_mut_len) };
                 return out;
             }
-            if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(1)) } == LINE_FEED {
-                out_mut.pop();
+            let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
+            if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == LINE_FEED {
+                unsafe { out_mut.set_len(new_out_mut_len) };
             }
         }
         out
@@ -67,25 +71,7 @@ impl<T: AsRef<str>> WhitespaceSifter for T {}
 /// A utility for `sift`.
 fn sift_preallocated(bytes: &[u8], out: &mut String) {
     let mut ind: usize = 0;
-    // Implementation of str::trim_start()
-    while ind < bytes.len() {
-        match get_char_metadata(*unsafe { bytes.get_unchecked(ind) }) {
-            Character::SingleByte { data } => {
-                ind = unsafe { ind.unchecked_add(1) };
-                if !is_ascii_whitespace(data) {
-                    unsafe { out.as_mut_vec() }.push(data);
-                    break;
-                }
-            }
-            Character::MultiByte { len } => {
-                let new_ind: usize = unsafe { ind.unchecked_add(len) };
-                unsafe { out.as_mut_vec() }
-                    .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
-                ind = new_ind;
-                break;
-            }
-        }
-    }
+    sift_trim_start(bytes, &mut ind, out);
     // Actual sifting
     let mut is_last_whitespace: bool = false;
     let mut is_last_carriage_return: bool = false;
@@ -96,8 +82,9 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
                 ind = unsafe { ind.unchecked_add(1) };
                 if is_ascii_whitespace(data) {
                     if data == LINE_FEED && is_last_carriage_return {
-                        #[allow(clippy::cast_possible_truncation)]
-                        unsafe { out.as_mut_vec() }.push(LINE_FEED);
+                        unsafe {
+                            out.as_mut_vec().unsafe_push(LINE_FEED);
+                        }
                         is_last_carriage_return = false;
                         is_last_carriage_return_line_feed = true;
                         continue;
@@ -109,56 +96,29 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
                 } else {
                     is_last_whitespace = false;
                 }
-                unsafe { out.as_mut_vec() }.push(data);
+                unsafe { out.as_mut_vec().unsafe_push(data) };
                 is_last_carriage_return = data == CARRIAGE_RETURN;
                 is_last_carriage_return_line_feed = false;
                 continue;
             }
-            Character::MultiByte { len } => {
-                let new_ind: usize = unsafe { ind.unchecked_add(len) };
-                unsafe { out.as_mut_vec() }
-                    .extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
-                ind = new_ind;
-            }
+            Character::MultiByte { len } => extend_from_bytes_with_len(bytes, &mut ind, out, len),
         }
         is_last_carriage_return = false;
         is_last_whitespace = false;
         is_last_carriage_return_line_feed = false;
     }
     // Implementation of str::trim_end()
     if is_last_carriage_return_line_feed {
-        let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
-        out_mut.pop();
-        out_mut.pop();
+        let new_out_len: usize = unsafe { out.len().unchecked_sub(2) };
+        unsafe { out.as_mut_vec().set_len(new_out_len) };
         return;
     }
-    if is_last_whitespace {
-        let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
-        out_mut.pop();
-    }
+    sift_trim_end(out, is_last_whitespace);
 }
 
 /// A utility for `sift_preserve_newlines`.
 fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
-    // Implementation of str::trim_start()
-    while *ind < bytes.len() {
-        match get_char_metadata(*unsafe { bytes.get_unchecked(*ind) }) {
-            Character::SingleByte { data } => {
-                *ind = unsafe { ind.unchecked_add(1) };
-                if !is_ascii_whitespace(data) {
-                    unsafe { out.as_mut_vec() }.push(data);
-                    break;
-                }
-            }
-            Character::MultiByte { len } => {
-                let new_ind: usize = unsafe { ind.unchecked_add(len) };
-                unsafe { out.as_mut_vec() }
-                    .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
-                *ind = new_ind;
-                break;
-            }
-        }
-    }
+    sift_trim_start(bytes, ind, out);
     // Actual sifting
     let mut is_last_whitespace: bool = false;
     let mut is_last_carriage_return: bool = false;
@@ -171,13 +131,14 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
                         // Implementation of str::trim_end()
                         let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
                         if is_last_whitespace {
-                            out_mut.pop();
+                            let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
+                            unsafe { out_mut.set_len(new_out_mut_len) };
                         }
                         // Append newline
                         if is_last_carriage_return {
-                            out_mut.push(CARRIAGE_RETURN);
+                            unsafe { out_mut.unsafe_push(CARRIAGE_RETURN) };
                         }
-                        out_mut.push(LINE_FEED);
+                        unsafe { out_mut.unsafe_push(LINE_FEED) };
                         return;
                     }
                     is_last_carriage_return = data == CARRIAGE_RETURN;
@@ -188,23 +149,42 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
                 } else {
                     is_last_whitespace = false;
                 }
-                unsafe { out.as_mut_vec() }.push(data);
+                unsafe { out.as_mut_vec().unsafe_push(data) };
                 is_last_carriage_return = data == CARRIAGE_RETURN;
                 continue;
             }
-            Character::MultiByte { len } => {
-                let new_ind: usize = unsafe { ind.unchecked_add(len) };
-                unsafe { out.as_mut_vec() }
-                    .extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
-                *ind = new_ind;
-            }
+            Character::MultiByte { len } => extend_from_bytes_with_len(bytes, ind, out, len),
         }
         is_last_carriage_return = false;
         is_last_whitespace = false;
     }
-    // Implementation of str::trim_end()
+    sift_trim_end(out, is_last_whitespace);
+}
+
+/// A custom implementation of `str::trim_start`.
+///
+/// Advances `*ind` past leading ASCII whitespace in `bytes`, then copies the first
+/// non-whitespace character (single- or multi-byte) into `out` and stops. Multi-byte
+/// characters are never treated as whitespace here. If `bytes` holds nothing but ASCII
+/// whitespace, `*ind` ends at `bytes.len()` and `out` is left untouched.
+fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
+    loop {
+        if *ind >= bytes.len() {
+            return;
+        }
+        // SAFETY: `*ind < bytes.len()` was checked just above.
+        let lead: u8 = *unsafe { bytes.get_unchecked(*ind) };
+        let data: u8 = match get_char_metadata(lead) {
+            // The first multi-byte character ends the trim; copy it whole and stop.
+            Character::MultiByte { len } => return extend_from_bytes_with_len(bytes, ind, out, len),
+            Character::SingleByte { data } => data,
+        };
+        // SAFETY: `*ind` is a valid index into a slice, so adding one cannot overflow `usize`.
+        *ind = unsafe { ind.unchecked_add(1) };
+        if !is_ascii_whitespace(data) {
+            // SAFETY (assumed, confirm at call sites): `out` was preallocated with room for
+            // every byte of `bytes`, and a single-byte character keeps `out` valid UTF-8.
+            unsafe { out.as_mut_vec().unsafe_push(data) };
+            return;
+        }
+    }
+}
+
+/// A custom implementation for `str::trim_end`.
+///
+/// Shortens `out` by exactly one byte when `is_last_whitespace` is `true`; does nothing
+/// otherwise.
+fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
     if is_last_whitespace {
-        unsafe { out.as_mut_vec() }.pop();
+        // NOTE(review): `unchecked_sub` is undefined behavior when `out` is empty, so callers
+        // must only pass `true` after at least one byte has been pushed. TODO confirm.
+        let new_out_len: usize = unsafe { out.len().unchecked_sub(1) };
+        // Drop the trailing byte by shrinking the length directly instead of calling `pop`.
+        unsafe { out.as_mut_vec().set_len(new_out_len) };
     }
 }
 
@@ -244,5 +224,15 @@ const fn is_ascii_whitespace(codepoint: u8) -> bool {
     )
 }
 
+/// Appends the `len` bytes of `bytes` that start at `*ind` to `out`, then moves `*ind`
+/// past them.
+///
+/// Mostly used to copy a whole `Character::MultiByte` sequence in one step.
+fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
+    let start: usize = *ind;
+    // SAFETY (assumed, confirm against callers): `len` is the real length of the sequence
+    // starting at `start`, so `start + len` neither overflows nor exceeds `bytes.len()`.
+    let end: usize = unsafe { start.unchecked_add(len) };
+    // SAFETY: same assumption as above; `start..end` lies within `bytes`.
+    let chunk: &[u8] = unsafe { bytes.get_unchecked(start..end) };
+    // SAFETY (assumed, confirm at call sites): `out` has spare capacity for `chunk`, and
+    // `chunk` is a complete UTF-8 sequence so `out` stays valid UTF-8.
+    unsafe { out.as_mut_vec().unsafe_extend(chunk) };
+    *ind = end;
+}
+
 #[cfg(test)]
 mod tests;
diff --git a/src/tests.rs b/src/tests.rs
@@ -32,19 +32,12 @@ fn test_sift_preserved() {
         &input.sift_preserve_newlines(),
         "This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
     );
-    let input: String = format!(
-        "{}\n\n{}\n\n{}\n\n\n{}\r\n\n\r\n{}\r\n\r\n{}\r\n\r\n\r\n",
-        "This. \n\nis. \n\na. \n\nsentence... \n\n",
-        "With. \n\nsome. \n\nduplicate... \n\n",
-        "Whitespaces. \n\n",
-        "This. \r\n\r\nis. \r\n\r\na. \r\n\r\nsentence... \r\n\r\n",
-        "With. \r\n\r\nsome. \r\n\r\nduplicate... \r\n\r\n",
-        "Whitespaces."
-    );
-    assert_eq!(
-        &input.sift_preserve_newlines(),
-        "This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
-    );
+}
+
+/// Empty input must come back as an empty string from both sifting entry points.
+#[test]
+fn test_blank_string_sifting() {
+    assert_eq!(&"".sift(), "");
+    assert_eq!(&"".sift_preserve_newlines(), "");
 }
 
 #[test]

diff --git a/src/unsafe_vec.rs b/src/unsafe_vec.rs
@@ -0,0 +1,20 @@
+/// A trait containing all unsafe `Vec` functions used by this crate.
+pub(crate) trait UnsafeVec<T> {
+    /// Push to a `Vec` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure the `Vec` already has spare capacity for one more element;
+    /// no capacity check is performed.
+    unsafe fn unsafe_push(&mut self, item: T);
+
+    /// Extend to a `Vec` without checking the capacity.
+    ///
+    /// # Safety
+    ///
+    /// The caller must ensure the `Vec` already has spare capacity for every element of
+    /// `item`; no capacity check is performed.
+    unsafe fn unsafe_extend(&mut self, item: &[T]);
+}
+
+impl<T> UnsafeVec<T> for Vec<T> {
+    /// Writes `item` into the slot just past the current length and bumps the length by one.
+    ///
+    /// # Safety
+    ///
+    /// The caller must guarantee `self.len() < self.capacity()`; the buffer is never grown.
+    unsafe fn unsafe_push(&mut self, item: T) {
+        // The slot at index `len` is allocated (caller contract) but uninitialized, so it is
+        // filled with `ptr::write`, which does not drop a previous value.
+        std::ptr::write(self.as_mut_ptr().add(self.len()), item);
+        self.set_len(self.len().unchecked_add(1));
+    }
+
+    /// Appends a bitwise copy of every element of `item` after the current contents.
+    ///
+    /// # Safety
+    ///
+    /// The caller must guarantee `self.len() + item.len() <= self.capacity()`; the buffer is
+    /// never grown. Elements are copied bitwise rather than cloned, so this is only sound
+    /// for element types where a bitwise duplicate is valid (the call sites visible in this
+    /// crate use `u8`).
+    unsafe fn unsafe_extend(&mut self, item: &[T]) {
+        // The destination must start at the first free slot (`len`), not at the start of the
+        // buffer; otherwise the existing elements are overwritten and the tail that
+        // `set_len` exposes stays uninitialized.
+        let dst: *mut T = self.as_mut_ptr().add(self.len());
+        std::ptr::copy_nonoverlapping(item.as_ptr(), dst, item.len());
+        self.set_len(self.len().unchecked_add(item.len()));
+    }
+}