Skip to content

Commit

Permalink
✨ v2.3.0
Browse files Browse the repository at this point in the history
  • Loading branch information
JumperBot committed Jul 15, 2024
1 parent 4a323c2 commit bc4e6b4
Show file tree
Hide file tree
Showing 5 changed files with 104 additions and 96 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "whitespace-sifter"
version = "2.2.0"
version = "2.3.0"
edition = "2021"
authors = ["JumperBot_"]
description = "Sift duplicate whitespaces away!"
Expand Down
19 changes: 12 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,15 +32,13 @@ println!(
## ✨ Sift Duplicate Whitespaces In One Function Call

This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.
Other than that, it naturally removes the whitespaces at the start and end of the `string`.
It naturally removes the whitespaces at the start and end of the `string`.

---

## ⚡️Benchmarks

Performance is one of the priorities of this crate.
One of the advises is to not listen to repository authors/maintainers when it comes to benchmarks.
You are free to run `cargo bench` on your machine after cloning this repository instead.
Performance is a priority; most updates are performance improvements.
The benchmark uses a transcript of the [Bee Movie](https://movies.fandom.com/wiki/Bee_Movie/Transcript).

Execute these commands to benchmark:
Expand All @@ -54,13 +52,20 @@ $ cargo bench
You should only look for results that look like the following:

```bash
Sift/Sift time: [176.65 µs 177.11 µs 177.73 µs]
Sift/Sift time: [159.31 µs 159.60 µs 159.95 µs]
Sift Preserved/Sift Preserved
time: [242.64 µs 243.04 µs 243.79 µs]
time: [198.11 µs 198.21 µs 198.32 µs]
```

In just 0.0001 seconds; pretty impressive, no?
Go try it on a better machine, I guess.
<details>
<summary>Go try it on a better machine, I guess.</summary>
Benchmark specifications:
<ul>
<li>Processor: Intel(R) Core(TM) i5-8350U CPU @ 1.70GHz 1.90 GHz</li>
<li>Memory: RAM 16.0 GB (15.8 GB usable)</li>
<li>System: GNU/Linux 5.15.153.1-microsoft-standard-WSL2 x86_64</li>
</ul>
</details>

---

Expand Down
140 changes: 65 additions & 75 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
//! Sift duplicate whitespaces away in just one function call.
//! This crate **helps you** remove duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) within a `string`.
//! Other than that, it naturally removes the whitespaces at the start and end of the `string`.
//! It naturally removes the whitespaces at the start and end of the `string`.
//!
//! # Examples
//!
Expand All @@ -20,6 +20,10 @@
//! );
//! ```
mod unsafe_vec;

use unsafe_vec::UnsafeVec;

/// A trait containing all `string` whitespace-sifting functions.
pub trait WhitespaceSifter: AsRef<str> {
/// This removes duplicate [whitespaces](https://doc.rust-lang.org/reference/whitespace.html) from a `string` implementing `AsRef<str>`.
Expand All @@ -46,16 +50,16 @@ pub trait WhitespaceSifter: AsRef<str> {
while ind < bytes.len() {
crate::sift_preallocated_until_newline(bytes, &mut ind, &mut out);
}
let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
if out_mut.len() > 1 {
if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(2)) } == CARRIAGE_RETURN
{
out_mut.pop();
out_mut.pop();
if out.len() > 1 {
let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(2) };
if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == CARRIAGE_RETURN {
unsafe { out_mut.set_len(new_out_mut_len) };
return out;
}
if *unsafe { out_mut.get_unchecked(out_mut.len().unchecked_sub(1)) } == LINE_FEED {
out_mut.pop();
let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
if *unsafe { out_mut.get_unchecked(new_out_mut_len) } == LINE_FEED {
unsafe { out_mut.set_len(new_out_mut_len) };
}
}
out
Expand All @@ -67,25 +71,7 @@ impl<T: AsRef<str>> WhitespaceSifter for T {}
/// A utility for `sift`.
fn sift_preallocated(bytes: &[u8], out: &mut String) {
let mut ind: usize = 0;
// Implementation of str::trim_start()
while ind < bytes.len() {
match get_char_metadata(*unsafe { bytes.get_unchecked(ind) }) {
Character::SingleByte { data } => {
ind = unsafe { ind.unchecked_add(1) };
if !is_ascii_whitespace(data) {
unsafe { out.as_mut_vec() }.push(data);
break;
}
}
Character::MultiByte { len } => {
let new_ind: usize = unsafe { ind.unchecked_add(len) };
unsafe { out.as_mut_vec() }
.extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
ind = new_ind;
break;
}
}
}
sift_trim_start(bytes, &mut ind, out);
// Actual sifting
let mut is_last_whitespace: bool = false;
let mut is_last_carriage_return: bool = false;
Expand All @@ -96,8 +82,9 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
ind = unsafe { ind.unchecked_add(1) };
if is_ascii_whitespace(data) {
if data == LINE_FEED && is_last_carriage_return {
#[allow(clippy::cast_possible_truncation)]
unsafe { out.as_mut_vec() }.push(LINE_FEED);
unsafe {
out.as_mut_vec().unsafe_push(LINE_FEED);
}
is_last_carriage_return = false;
is_last_carriage_return_line_feed = true;
continue;
Expand All @@ -109,56 +96,29 @@ fn sift_preallocated(bytes: &[u8], out: &mut String) {
} else {
is_last_whitespace = false;
}
unsafe { out.as_mut_vec() }.push(data);
unsafe { out.as_mut_vec().unsafe_push(data) };
is_last_carriage_return = data == CARRIAGE_RETURN;
is_last_carriage_return_line_feed = false;
continue;
}
Character::MultiByte { len } => {
let new_ind: usize = unsafe { ind.unchecked_add(len) };
unsafe { out.as_mut_vec() }
.extend_from_slice(unsafe { bytes.get_unchecked(ind..new_ind) });
ind = new_ind;
}
Character::MultiByte { len } => extend_from_bytes_with_len(bytes, &mut ind, out, len),
}
is_last_carriage_return = false;
is_last_whitespace = false;
is_last_carriage_return_line_feed = false;
}
// Implementation of str::trim_end()
if is_last_carriage_return_line_feed {
let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
out_mut.pop();
out_mut.pop();
let new_out_len: usize = unsafe { out.len().unchecked_sub(2) };
unsafe { out.as_mut_vec().set_len(new_out_len) };
return;
}
if is_last_whitespace {
let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
out_mut.pop();
}
sift_trim_end(out, is_last_whitespace);
}

/// A utility for `sift_preserve_newlines`.
fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut String) {
// Implementation of str::trim_start()
while *ind < bytes.len() {
match get_char_metadata(*unsafe { bytes.get_unchecked(*ind) }) {
Character::SingleByte { data } => {
*ind = unsafe { ind.unchecked_add(1) };
if !is_ascii_whitespace(data) {
unsafe { out.as_mut_vec() }.push(data);
break;
}
}
Character::MultiByte { len } => {
let new_ind: usize = unsafe { ind.unchecked_add(len) };
unsafe { out.as_mut_vec() }
.extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
*ind = new_ind;
break;
}
}
}
sift_trim_start(bytes, ind, out);
// Actual sifting
let mut is_last_whitespace: bool = false;
let mut is_last_carriage_return: bool = false;
Expand All @@ -171,13 +131,14 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
// Implementation of str::trim_end()
let out_mut: &mut Vec<u8> = unsafe { out.as_mut_vec() };
if is_last_whitespace {
out_mut.pop();
let new_out_mut_len: usize = unsafe { out_mut.len().unchecked_sub(1) };
unsafe { out_mut.set_len(new_out_mut_len) };
}
// Append newline
if is_last_carriage_return {
out_mut.push(CARRIAGE_RETURN);
unsafe { out_mut.unsafe_push(CARRIAGE_RETURN) };
}
out_mut.push(LINE_FEED);
unsafe { out_mut.unsafe_push(LINE_FEED) };
return;
}
is_last_carriage_return = data == CARRIAGE_RETURN;
Expand All @@ -188,23 +149,42 @@ fn sift_preallocated_until_newline(bytes: &[u8], ind: &mut usize, out: &mut Stri
} else {
is_last_whitespace = false;
}
unsafe { out.as_mut_vec() }.push(data);
unsafe { out.as_mut_vec().unsafe_push(data) };
is_last_carriage_return = data == CARRIAGE_RETURN;
continue;
}
Character::MultiByte { len } => {
let new_ind: usize = unsafe { ind.unchecked_add(len) };
unsafe { out.as_mut_vec() }
.extend_from_slice(unsafe { bytes.get_unchecked(*ind..new_ind) });
*ind = new_ind;
}
Character::MultiByte { len } => extend_from_bytes_with_len(bytes, ind, out, len),
}
is_last_carriage_return = false;
is_last_whitespace = false;
}
// Implementation of str::trim_end()
sift_trim_end(out, is_last_whitespace);
}

/// A custom implementation of `str::trim_start`.
///
/// Advances `*ind` past every leading single-byte character for which
/// `is_ascii_whitespace` returns `true`, then copies the first remaining
/// character (single-byte or multi-byte) into `out` and stops.
/// If `bytes` is exhausted first, nothing is written to `out`.
///
/// The writes go through the unchecked `UnsafeVec` helpers, so `out` is
/// presumably expected to have enough spare capacity reserved by the caller
/// (NOTE(review): confirm against the call sites).
fn sift_trim_start(bytes: &[u8], ind: &mut usize, out: &mut String) {
    while *ind < bytes.len() {
        // SAFETY: the loop condition guarantees `*ind` is in bounds.
        let lead: u8 = *unsafe { bytes.get_unchecked(*ind) };
        let data: u8 = match get_char_metadata(lead) {
            Character::MultiByte { len } => {
                // A multi-byte character ends the trimming: copy it whole and stop.
                extend_from_bytes_with_len(bytes, ind, out, len);
                return;
            }
            Character::SingleByte { data } => data,
        };
        // The single byte is consumed whether or not it is kept.
        *ind = unsafe { ind.unchecked_add(1) };
        if is_ascii_whitespace(data) {
            continue;
        }
        // First non-whitespace byte: keep it and stop trimming.
        unsafe { out.as_mut_vec().unsafe_push(data) };
        return;
    }
}

/// A custom implementation for `str::trim_end`.
fn sift_trim_end(out: &mut String, is_last_whitespace: bool) {
if is_last_whitespace {
unsafe { out.as_mut_vec() }.pop();
let new_out_len: usize = unsafe { out.len().unchecked_sub(1) };
unsafe { out.as_mut_vec().set_len(new_out_len) };
}
}

Expand Down Expand Up @@ -244,5 +224,15 @@ const fn is_ascii_whitespace(codepoint: u8) -> bool {
)
}

/// Copies the `len` bytes of `bytes` starting at `*ind` onto `out`, then
/// advances `*ind` past them. Mostly used for `Character::MultiByte` copying.
///
/// Neither the source range nor the capacity of `out` is checked here:
/// the slice is taken with `get_unchecked` and appended with `unsafe_extend`.
fn extend_from_bytes_with_len(bytes: &[u8], ind: &mut usize, out: &mut String, len: usize) {
    let start: usize = *ind;
    let end: usize = unsafe { start.unchecked_add(len) };
    let chunk: &[u8] = unsafe { bytes.get_unchecked(start..end) };
    unsafe { out.as_mut_vec().unsafe_extend(chunk) };
    *ind = end;
}

#[cfg(test)]
mod tests;
19 changes: 6 additions & 13 deletions src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,12 @@ fn test_sift_preserved() {
&input.sift_preserve_newlines(),
"This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
);
let input: String = format!(
"{}\n\n{}\n\n{}\n\n\n{}\r\n\n\r\n{}\r\n\r\n{}\r\n\r\n\r\n",
"This. \n\nis. \n\na. \n\nsentence... \n\n",
"With. \n\nsome. \n\nduplicate... \n\n",
"Whitespaces. \n\n",
"This. \r\n\r\nis. \r\n\r\na. \r\n\r\nsentence... \r\n\r\n",
"With. \r\n\r\nsome. \r\n\r\nduplicate... \r\n\r\n",
"Whitespaces."
);
assert_eq!(
&input.sift_preserve_newlines(),
"This.\nis.\na.\nsentence...\nWith.\nsome.\nduplicate...\nWhitespaces.\nThis.\r\nis.\r\na.\r\nsentence...\r\nWith.\r\nsome.\r\nduplicate...\r\nWhitespaces."
);
}

/// Sifting an empty string must yield an empty string for both sifters.
#[test]
fn test_blank_string_sifting() {
    let blank: &str = "";
    let expected: &str = "";
    assert_eq!(&blank.sift(), expected);
    assert_eq!(&blank.sift_preserve_newlines(), expected);
}

#[test]
Expand Down
20 changes: 20 additions & 0 deletions src/unsafe_vec.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/// A trait containing all unsafe `Vec` functions used by this crate.
pub(crate) trait UnsafeVec<T> {
    /// Appends `item` to the end of the `Vec` without checking the capacity.
    ///
    /// # Safety
    ///
    /// The caller must guarantee that `len() < capacity()`, i.e. that there is
    /// spare capacity for at least one more element. No reallocation happens.
    unsafe fn unsafe_push(&mut self, item: T);

    /// Appends a bitwise copy of every element of `item` to the end of the
    /// `Vec` without checking the capacity.
    ///
    /// # Safety
    ///
    /// - The caller must guarantee that `len() + item.len() <= capacity()`.
    ///   No reallocation happens.
    /// - Elements are duplicated bit-for-bit, so `T` must be safe to duplicate
    ///   that way (as `u8` is); otherwise values may be dropped twice.
    unsafe fn unsafe_extend(&mut self, item: &[T]);
}

impl<T> UnsafeVec<T> for Vec<T> {
    unsafe fn unsafe_push(&mut self, item: T) {
        let len: usize = self.len();
        // SAFETY: the caller guarantees `len < capacity`, so the slot at `len`
        // is allocated, and `len + 1` cannot overflow.
        std::ptr::write(self.as_mut_ptr().add(len), item);
        self.set_len(len.unchecked_add(1));
    }

    unsafe fn unsafe_extend(&mut self, item: &[T]) {
        let len: usize = self.len();
        // The destination is the first *unused* slot, not the start of the
        // buffer: copying to `as_mut_ptr()` would overwrite existing elements
        // and leave the newly exposed tail uninitialised.
        // SAFETY: the caller guarantees `len + item.len() <= capacity`, so the
        // destination range is allocated; `item` is a shared borrow and cannot
        // overlap the exclusively borrowed `self`.
        std::ptr::copy_nonoverlapping(item.as_ptr(), self.as_mut_ptr().add(len), item.len());
        self.set_len(len.unchecked_add(item.len()));
    }
}

0 comments on commit bc4e6b4

Please sign in to comment.