From edd318c313763d8c4cf3e8cc339f433832d6454a Mon Sep 17 00:00:00 2001 From: ltdk Date: Sun, 20 Jun 2021 16:24:10 -0400 Subject: [PATCH] Add {floor,ceil}_char_boundary methods to str --- library/alloc/tests/lib.rs | 1 + library/alloc/tests/str.rs | 92 +++++++++++++++++++++++++++++ library/core/src/num/mod.rs | 5 ++ library/core/src/str/mod.rs | 88 +++++++++++++++++++++++---- library/core/src/str/validations.rs | 13 ---- 5 files changed, 176 insertions(+), 23 deletions(-) diff --git a/library/alloc/tests/lib.rs b/library/alloc/tests/lib.rs index dcf51e3142a61..cbb86265233b0 100644 --- a/library/alloc/tests/lib.rs +++ b/library/alloc/tests/lib.rs @@ -29,6 +29,7 @@ #![feature(binary_heap_as_slice)] #![feature(inplace_iteration)] #![feature(iter_advance_by)] +#![feature(round_char_boundary)] #![feature(slice_group_by)] #![feature(slice_partition_dedup)] #![feature(string_remove_matches)] diff --git a/library/alloc/tests/str.rs b/library/alloc/tests/str.rs index 7b07821ab1d31..6b8be2506b64e 100644 --- a/library/alloc/tests/str.rs +++ b/library/alloc/tests/str.rs @@ -2272,3 +2272,95 @@ fn utf8_char_counts() { } } } + +#[test] +fn floor_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator, ret: usize) { + for idx in arg { + assert_eq!( + s.floor_char_boundary(idx), + ret, + "{:?}.floor_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0, 1, isize::MAX as usize, usize::MAX], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1, isize::MAX as usize, usize::MAX], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", 2..4, 2); + + // 2-byte chars + check_many("ĵƥ", 0..2, 0); + check_many("ĵƥ", 2..4, 2); + check_many("ĵƥ", 4..6, 4); + + // 3-byte chars + check_many("日本", 0..3, 0); + check_many("日本", 3..6, 3); + check_many("日本", 6..8, 6); + + // 4-byte chars + check_many("🇯🇵", 0..4, 0); + check_many("🇯🇵", 4..8, 4); + check_many("🇯🇵", 8..10, 8); +} + +#[test] +fn ceil_char_boundary() { + fn check_many(s: &str, arg: impl IntoIterator, ret: usize) { + for idx in arg { + assert_eq!( + s.ceil_char_boundary(idx), + ret, + "{:?}.ceil_char_boundary({:?}) != {:?}", + s, + idx, + ret + ); + } + } + + // edge case + check_many("", [0], 0); + + // basic check + check_many("x", [0], 0); + check_many("x", [1], 1); + + // 1-byte chars + check_many("jp", [0], 0); + check_many("jp", [1], 1); + check_many("jp", [2], 2); + + // 2-byte chars + check_many("ĵƥ", 0..=0, 0); + check_many("ĵƥ", 1..=2, 2); + check_many("ĵƥ", 3..=4, 4); + + // 3-byte chars + check_many("日本", 0..=0, 0); + check_many("日本", 1..=3, 3); + check_many("日本", 4..=6, 6); + + // 4-byte chars + check_many("🇯🇵", 0..=0, 0); + check_many("🇯🇵", 1..=4, 4); + check_many("🇯🇵", 5..=8, 8); +} + +#[test] +#[should_panic] +fn ceil_char_boundary_above_len_panic() { + let _ = "x".ceil_char_boundary(2); +} diff --git a/library/core/src/num/mod.rs b/library/core/src/num/mod.rs index 721c030b410ae..864a253299f6e 100644 --- a/library/core/src/num/mod.rs +++ b/library/core/src/num/mod.rs @@ -809,6 +809,11 @@ impl u8 { pub fn escape_ascii(&self) -> ascii::EscapeDefault { ascii::escape_default(*self) } + + pub(crate) fn is_utf8_char_boundary(self) -> bool { + // This is bit magic equivalent to: b < 128 || b >= 192 + (self as i8) >= -0x40 + } } #[lang = "u16"] diff --git a/library/core/src/str/mod.rs b/library/core/src/str/mod.rs index fceea2366da54..09709dc3cf6df 100644 --- a/library/core/src/str/mod.rs +++ b/library/core/src/str/mod.rs @@ -76,15 +76,14 @@ use iter::MatchIndicesInternal; use iter::SplitInternal; use iter::{MatchesInternal, SplitNInternal}; -use validations::truncate_to_char_boundary; - #[inline(never)] #[cold] #[track_caller] fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { const MAX_DISPLAY_LENGTH: usize = 256; - let (truncated, s_trunc) = truncate_to_char_boundary(s, MAX_DISPLAY_LENGTH); - let ellipsis = if truncated { "[...]" } else { "" }; + let trunc_len = s.floor_char_boundary(MAX_DISPLAY_LENGTH); + let s_trunc = &s[..trunc_len]; + let ellipsis = if trunc_len < s.len() { "[...]" } else { "" }; // 1. out of bounds if begin > s.len() || end > s.len() { @@ -105,10 +104,7 @@ fn slice_error_fail(s: &str, begin: usize, end: usize) -> ! { // 3. character boundary let index = if !s.is_char_boundary(begin) { begin } else { end }; // find the character - let mut char_start = index; - while !s.is_char_boundary(char_start) { - char_start -= 1; - } + let char_start = s.floor_char_boundary(index); // `char_start` must be less than len and a char boundary let ch = s[char_start..].chars().next().unwrap(); let char_range = char_start..char_start + ch.len_utf8(); @@ -215,8 +211,80 @@ impl str { // code on higher opt-levels. See PR #84751 for more details. None => index == self.len(), - // This is bit magic equivalent to: b < 128 || b >= 192 - Some(&b) => (b as i8) >= -0x40, + Some(&b) => b.is_utf8_char_boundary(), + } + } + + /// Finds the closest `x` not exceeding `index` where `is_char_boundary(x)` is `true`. + /// + /// This method can help you truncate a string so that it's still valid UTF-8, but doesn't + /// exceed a given number of bytes. Note that this is done purely at the character level + /// and can still visually split graphemes, even though the underlying characters aren't + /// split. For example, the emoji 🧑‍🔬 (scientist) could be split so that the string only + /// includes 🧑 (person) instead. + /// + /// # Examples + /// + /// ``` + /// #![feature(round_char_boundary)] + /// let s = "❤️🧡💛💚💙💜"; + /// assert_eq!(s.len(), 26); + /// assert!(!s.is_char_boundary(13)); + /// + /// let closest = s.floor_char_boundary(13); + /// assert_eq!(closest, 10); + /// assert_eq!(&s[..closest], "❤️🧡"); + /// ``` + #[unstable(feature = "round_char_boundary", issue = "93743")] + #[inline] + pub fn floor_char_boundary(&self, index: usize) -> usize { + if index >= self.len() { + self.len() + } else { + let lower_bound = index.saturating_sub(3); + let new_index = self.as_bytes()[lower_bound..=index] + .iter() + .rposition(|b| b.is_utf8_char_boundary()); + + // SAFETY: we know that the character boundary will be within four bytes + unsafe { lower_bound + new_index.unwrap_unchecked() } + } + } + + /// Finds the closest `x` not below `index` where `is_char_boundary(x)` is `true`. + /// + /// This method is the natural complement to [`floor_char_boundary`]. See that method + /// for more details. + /// + /// [`floor_char_boundary`]: str::floor_char_boundary + /// + /// # Panics + /// + /// Panics if `index > self.len()`. + /// + /// # Examples + /// + /// ``` + /// #![feature(round_char_boundary)] + /// let s = "❤️🧡💛💚💙💜"; + /// assert_eq!(s.len(), 26); + /// assert!(!s.is_char_boundary(13)); + /// + /// let closest = s.ceil_char_boundary(13); + /// assert_eq!(closest, 14); + /// assert_eq!(&s[..closest], "❤️🧡💛"); + /// ``` + #[unstable(feature = "round_char_boundary", issue = "93743")] + #[inline] + pub fn ceil_char_boundary(&self, index: usize) -> usize { + if index > self.len() { + slice_error_fail(self, index, index) + } else { + let upper_bound = Ord::min(index + 4, self.len()); + self.as_bytes()[index..upper_bound] + .iter() + .position(|b| b.is_utf8_char_boundary()) + .map_or(upper_bound, |pos| pos + index) } } diff --git a/library/core/src/str/validations.rs b/library/core/src/str/validations.rs index b2ea86d699aa6..0d3dc856be577 100644 --- a/library/core/src/str/validations.rs +++ b/library/core/src/str/validations.rs @@ -273,16 +273,3 @@ pub const fn utf8_char_width(b: u8) -> usize { /// Mask of the value bits of a continuation byte. const CONT_MASK: u8 = 0b0011_1111; - -// truncate `&str` to length at most equal to `max` -// return `true` if it were truncated, and the new str. -pub(super) fn truncate_to_char_boundary(s: &str, mut max: usize) -> (bool, &str) { - if max >= s.len() { - (false, s) - } else { - while !s.is_char_boundary(max) { - max -= 1; - } - (true, &s[..max]) - } -}