From 47ca9c1225906e592b85cb76b6b8af8fb26ee275 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 19:32:16 -0700 Subject: [PATCH 01/21] Add debug_unwrap helper macro --- experimental/zerotrie/src/helpers.rs | 21 +++++++++++++++++++++ experimental/zerotrie/src/lib.rs | 1 + experimental/zerotrie/src/varint.rs | 10 ++-------- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/experimental/zerotrie/src/helpers.rs b/experimental/zerotrie/src/helpers.rs index e7983b490e1..43baf7fff81 100644 --- a/experimental/zerotrie/src/helpers.rs +++ b/experimental/zerotrie/src/helpers.rs @@ -54,3 +54,24 @@ pub(crate) fn debug_get_range(slice: &[u8], range: Range) -> Option<&[u8] } } } + +macro_rules! debug_unwrap { + ($expr:expr, return $retval:expr, $($arg:tt)+) => { + match $expr { + Some(x) => x, + None => { + debug_assert!(false, $($arg)*); + return $retval; + } + } + }; + ($expr:expr, return $retval:expr) => { + debug_unwrap!($expr, return $retval, "invalid trie") + }; + ($expr:expr, $($arg:tt)+) => { + debug_unwrap!($expr, return (), $($arg)*) + }; + ($expr:expr) => { + debug_unwrap!($expr, return ()) + }; +} diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 3fa8ed6ee27..817093d2f36 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -57,6 +57,7 @@ extern crate alloc; mod builder; mod byte_phf; mod error; +#[macro_use] mod helpers; mod reader; #[cfg(feature = "serde")] diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index 420bc4d9c13..5364aaa8ba4 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -53,10 +53,7 @@ pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[ if (start & 0b00100000) != 0 { loop { let next; - (next, remainder) = match remainder.split_first() { - Some(t) => t, - None => return None, - }; + (next, remainder) = debug_unwrap!(remainder.split_first(), return None); // Note: value << 7 could drop high bits. The first addition can't overflow. // The second addition could overflow; in such a case we just inform the // developer via the debug assertion. @@ -78,10 +75,7 @@ pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[ if (start & 0b00010000) != 0 { loop { let next; - (next, remainder) = match remainder.split_first() { - Some(t) => t, - None => return None, - }; + (next, remainder) = debug_unwrap!(remainder.split_first(), return None); // Note: value << 7 could drop high bits. The first addition can't overflow. // The second addition could overflow; in such a case we just inform the // developer via the debug assertion. From 92a420763ba77940ccc96577f4b8a5af667dd5e9 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 19:50:39 -0700 Subject: [PATCH 02/21] Clean up varint functions to not double assert --- experimental/zerotrie/src/helpers.rs | 12 +++++++ experimental/zerotrie/src/reader.rs | 16 ++++----- experimental/zerotrie/src/varint.rs | 53 +++++++++++++++------------- 3 files changed, 48 insertions(+), 33 deletions(-) diff --git a/experimental/zerotrie/src/helpers.rs b/experimental/zerotrie/src/helpers.rs index 43baf7fff81..73393b18a41 100644 --- a/experimental/zerotrie/src/helpers.rs +++ b/experimental/zerotrie/src/helpers.rs @@ -68,6 +68,18 @@ macro_rules! debug_unwrap { ($expr:expr, return $retval:expr) => { debug_unwrap!($expr, return $retval, "invalid trie") }; + ($expr:expr, break, $($arg:tt)+) => { + match $expr { + Some(x) => x, + None => { + debug_assert!(false, $($arg)*); + break; + } + } + }; + ($expr:expr, break) => { + debug_unwrap!($expr, break, "invalid trie") + }; ($expr:expr, $($arg:tt)+) => { debug_unwrap!($expr, return (), $($arg)*) }; diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index ba0345538cf..ad48f0fa480 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -311,8 +311,8 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, - NodeType::Branch => read_varint_meta2(*b, trie)?, + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), + NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { @@ -375,8 +375,8 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, - NodeType::Branch => read_varint_meta2(*b, trie)?, + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), + NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { @@ -445,8 +445,8 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { let byte_type = byte_type(*b); (x, trie) = match byte_type { NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, - NodeType::Branch => read_varint_meta2(*b, trie)?, + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), + NodeType::Branch => read_varint_meta2(*b, trie), }; if let Some((c, temp)) = ascii.split_first() { if matches!(byte_type, NodeType::Ascii) { @@ -554,8 +554,8 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { } (x, trie) = match byte_type { NodeType::Ascii => (0, trie), - NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie)?, - NodeType::Branch => read_varint_meta2(*b, trie)?, + NodeType::Span | NodeType::Value => read_varint_meta3(*b, trie), + NodeType::Branch => read_varint_meta2(*b, trie), }; if matches!(byte_type, NodeType::Span) { (span, trie) = debug_split_at(trie, x)?; diff --git a/experimental/zerotrie/src/varint.rs b/experimental/zerotrie/src/varint.rs index 5364aaa8ba4..5af0ea664be 100644 --- a/experimental/zerotrie/src/varint.rs +++ b/experimental/zerotrie/src/varint.rs @@ -47,13 +47,16 @@ use crate::builder::nonconst::TrieBuilderStore; /// Reads a varint with 2 bits of metadata in the lead byte. /// /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. -pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { +/// +/// If the varint spills off the end of the slice, a debug assertion will fail, +/// and the function will return the value up to that point. +pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> (usize, &[u8]) { let mut value = (start & 0b00011111) as usize; let mut remainder = remainder; if (start & 0b00100000) != 0 { loop { let next; - (next, remainder) = debug_unwrap!(remainder.split_first(), return None); + (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); // Note: value << 7 could drop high bits. The first addition can't overflow. // The second addition could overflow; in such a case we just inform the // developer via the debug assertion. @@ -63,19 +66,22 @@ pub const fn read_varint_meta2(start: u8, remainder: &[u8]) -> Option<(usize, &[ } } } - Some((value, remainder)) + (value, remainder) } /// Reads a varint with 3 bits of metadata in the lead byte. /// /// Returns the varint value and a subslice of `remainder` with the varint bytes removed. -pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[u8])> { +/// +/// If the varint spills off the end of the slice, a debug assertion will fail, +/// and the function will return the value up to that point. +pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> (usize, &[u8]) { let mut value = (start & 0b00001111) as usize; let mut remainder = remainder; if (start & 0b00010000) != 0 { loop { let next; - (next, remainder) = debug_unwrap!(remainder.split_first(), return None); + (next, remainder) = debug_unwrap!(remainder.split_first(), break, "invalid varint"); // Note: value << 7 could drop high bits. The first addition can't overflow. // The second addition could overflow; in such a case we just inform the // developer via the debug assertion. @@ -85,7 +91,7 @@ pub const fn read_varint_meta3(start: u8, remainder: &[u8]) -> Option<(usize, &[ } } } - Some((value, remainder)) + (value, remainder) } /// Reads and removes a varint with 3 bits of metadata from a [`TrieBuilderStore`]. @@ -356,7 +362,7 @@ mod tests { #[test] fn test_read() { for cas in CASES { - let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]).unwrap(); + let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); } } @@ -377,7 +383,7 @@ mod tests { "{:?}", cas ); - let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]).unwrap(); + let recovered = read_varint_meta2(cas.bytes[0], &cas.bytes[1..]); assert_eq!(recovered, (cas.value, cas.remainder), "{:?}", cas); let write_bytes = write_varint_meta2(cas.value); assert_eq!( @@ -395,8 +401,7 @@ mod tests { while i < MAX_VARINT { let bytes = write_varint_meta2(i); let recovered = read_varint_meta2(bytes.as_slice()[0], &bytes.as_slice()[1..]); - assert!(recovered.is_some(), "{:?}", i); - assert_eq!(i, recovered.unwrap().0, "{:?}", bytes.as_slice()); + assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); i <<= 1; i += 1; } @@ -408,8 +413,7 @@ mod tests { while i < MAX_VARINT { let bytes = write_varint_meta3(i); let recovered = read_varint_meta3(bytes.as_slice()[0], &bytes.as_slice()[1..]); - assert!(recovered.is_some(), "{:?}", i); - assert_eq!(i, recovered.unwrap().0, "{:?}", bytes.as_slice()); + assert_eq!(i, recovered.0, "{:?}", bytes.as_slice()); i <<= 1; i += 1; } @@ -427,8 +431,7 @@ mod tests { let (recovered_value, remainder) = read_varint_meta2( *write_bytes.as_const_slice().first().unwrap(), subarray.as_slice(), - ) - .unwrap(); + ); assert!(remainder.is_empty()); assert_eq!(recovered_value, MAX_VARINT); assert_eq!( @@ -453,7 +456,7 @@ mod tests { let write_bytes = write_varint_meta3(MAX_VARINT); assert_eq!(write_bytes.len(), MAX_VARINT_LENGTH); let (lead, trailing) = write_bytes.as_slice().split_first().unwrap(); - let (recovered_value, remainder) = read_varint_meta3(*lead, trailing).unwrap(); + let (recovered_value, remainder) = read_varint_meta3(*lead, trailing); assert!(remainder.is_empty()); assert_eq!(recovered_value, MAX_VARINT); assert_eq!( @@ -477,18 +480,18 @@ mod tests { fn test_latent_values() { // Same values documented in the module docs: M=2 let m2 = read_varint_meta2; - assert_eq!(m2(0, &[]).unwrap().0, 0); - assert_eq!(m2(0x20, &[0x00]).unwrap().0, 32); - assert_eq!(m2(0x20, &[0x80, 0x00]).unwrap().0, 4128); - assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).unwrap().0, 528416); - assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).unwrap().0, 67637280); + assert_eq!(m2(0, &[]).0, 0); + assert_eq!(m2(0x20, &[0x00]).0, 32); + assert_eq!(m2(0x20, &[0x80, 0x00]).0, 4128); + assert_eq!(m2(0x20, &[0x80, 0x80, 0x00]).0, 528416); + assert_eq!(m2(0x20, &[0x80, 0x80, 0x80, 0x00]).0, 67637280); // Same values documented in the module docs: M=3 let m3 = read_varint_meta3; - assert_eq!(m3(0, &[]).unwrap().0, 0); - assert_eq!(m3(0x10, &[0x00]).unwrap().0, 16); - assert_eq!(m3(0x10, &[0x80, 0x00]).unwrap().0, 2064); - assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).unwrap().0, 264208); - assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).unwrap().0, 33818640); + assert_eq!(m3(0, &[]).0, 0); + assert_eq!(m3(0x10, &[0x00]).0, 16); + assert_eq!(m3(0x10, &[0x80, 0x00]).0, 2064); + assert_eq!(m3(0x10, &[0x80, 0x80, 0x00]).0, 264208); + assert_eq!(m3(0x10, &[0x80, 0x80, 0x80, 0x00]).0, 33818640); } } From 512ebc646af90906cc1d340f10d578b61f427196 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 20:09:05 -0700 Subject: [PATCH 03/21] Change debug_split_at to return a value in the GIGO case --- experimental/zerotrie/src/byte_phf/mod.rs | 8 +++---- experimental/zerotrie/src/helpers.rs | 11 +++++----- experimental/zerotrie/src/reader.rs | 26 +++++++++++------------ 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index cbbb01a175d..bdfad27b977 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -213,7 +213,7 @@ where if n == 0 { return None; } - let (qq, eks) = debug_split_at(buffer, n)?; + let (qq, eks) = debug_split_at(buffer, n); debug_assert_eq!(qq.len(), eks.len()); let q = debug_get(qq, f1(key, *p, n))?; let l2 = f2(key, q, n); @@ -232,9 +232,7 @@ where /// Get an iterator over the keys in the order in which they are stored in the map. pub fn keys(&self) -> &[u8] { let n = self.num_items(); - debug_split_at(self.0.as_ref(), 1 + n) - .map(|s| s.1) - .unwrap_or(&[]) + debug_split_at(self.0.as_ref(), 1 + n).1 } /// Diagnostic function that returns `p` and the maximum value of `q` #[cfg(test)] @@ -244,7 +242,7 @@ where if n == 0 { return None; } - let (qq, _) = debug_split_at(buffer, n)?; + let (qq, _) = debug_split_at(buffer, n); Some((*p, *qq.iter().max().unwrap())) } /// Returns the map as bytes. The map can be recovered with [`Self::from_store`] diff --git a/experimental/zerotrie/src/helpers.rs b/experimental/zerotrie/src/helpers.rs index 73393b18a41..b4bbb08815f 100644 --- a/experimental/zerotrie/src/helpers.rs +++ b/experimental/zerotrie/src/helpers.rs @@ -4,18 +4,17 @@ use core::ops::Range; -/// Like slice::split_at but returns an Option instead of panicking. -/// -/// Debug-panics if `mid` is out of range. +/// Like slice::split_at but debug-panics and returns an empty second slice +/// if the index is out of range. #[inline] -pub(crate) fn debug_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { +pub(crate) fn debug_split_at(slice: &[u8], mid: usize) -> (&[u8], &[u8]) { if mid > slice.len() { debug_assert!(false, "debug_split_at: index expected to be in range"); - None + (slice, &[]) } else { // Note: We're trusting the compiler to inline this and remove the assertion // hiding on the top of slice::split_at: `assert(mid <= self.len())` - Some(slice.split_at(mid)) + slice.split_at(mid) } } diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index ad48f0fa480..56ec575ab50 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -224,7 +224,7 @@ fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> Option<&[u8] let mut q = 0usize; loop { let indices; - (indices, trie) = debug_split_at(trie, n - 1)?; + (indices, trie) = debug_split_at(trie, n - 1); p = (p << 8) + if i == 0 { 0 @@ -247,7 +247,7 @@ fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> Option<&[u8] #[inline] fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> Option<&[u8]> { let indices; - (indices, trie) = debug_split_at(trie, n - 1)?; + (indices, trie) = debug_split_at(trie, n - 1); let p = if i == 0 { 0 } else { @@ -331,7 +331,7 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x)?; + (trie_span, trie) = debug_split_at(trie, x); (ascii_span, ascii) = maybe_split_at(ascii, x)?; if trie_span == ascii_span { // Matched a byte span @@ -348,7 +348,7 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { let w = w & 0x3; let x = if x == 0 { 256 } else { x }; // Always use binary search - (search, trie) = debug_split_at(trie, x)?; + (search, trie) = debug_split_at(trie, x); i = search.binary_search(c).ok()?; trie = if w == 0 { get_branch_w0(trie, i, x) @@ -395,7 +395,7 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x)?; + (trie_span, trie) = debug_split_at(trie, x); (ascii_span, ascii) = maybe_split_at(ascii, x)?; if trie_span == ascii_span { // Matched a byte span @@ -413,11 +413,11 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { let x = if x == 0 { 256 } else { x }; if x < 16 { // binary search - (search, trie) = debug_split_at(trie, x)?; + (search, trie) = debug_split_at(trie, x); i = search.binary_search(c).ok()?; } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1)?; + (search, trie) = debug_split_at(trie, x * 2 + 1); i = PerfectByteHashMap::from_store(search).get(*c)?; } trie = if w == 0 { @@ -465,7 +465,7 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x)?; + (trie_span, trie) = debug_split_at(trie, x); (ascii_span, ascii) = maybe_split_at(ascii, x)?; if trie_span == ascii_span { // Matched a byte span @@ -480,11 +480,11 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { let x = if x == 0 { 256 } else { x }; if x < 16 { // binary search - (search, trie) = debug_split_at(trie, x)?; + (search, trie) = debug_split_at(trie, x); i = search.binary_search(c).ok()?; } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1)?; + (search, trie) = debug_split_at(trie, x * 2 + 1); i = PerfectByteHashMap::from_store(search).get(*c)?; } trie = if w == 0 { @@ -558,7 +558,7 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { NodeType::Branch => read_varint_meta2(*b, trie), }; if matches!(byte_type, NodeType::Span) { - (span, trie) = debug_split_at(trie, x)?; + (span, trie) = debug_split_at(trie, x); string.extend(span); continue; } @@ -578,11 +578,11 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { } let byte = if x < 16 || !self.use_phf { // binary search - (search, trie) = debug_split_at(trie, x)?; + (search, trie) = debug_split_at(trie, x); debug_get(search, branch_idx)? } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1)?; + (search, trie) = debug_split_at(trie, x * 2 + 1); debug_get(search, branch_idx + x + 1)? }; string.push(byte); From ed19daa9456ee1cdbeaf3a5d3284a9400f1d77f7 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 23:06:13 -0600 Subject: [PATCH 04/21] Refactor helpers to be trait-added functions --- experimental/zerotrie/src/byte_phf/mod.rs | 14 ++-- experimental/zerotrie/src/helpers.rs | 83 ++++++++++++----------- experimental/zerotrie/src/reader.rs | 58 ++++++++-------- 3 files changed, 78 insertions(+), 77 deletions(-) diff --git a/experimental/zerotrie/src/byte_phf/mod.rs b/experimental/zerotrie/src/byte_phf/mod.rs index bdfad27b977..e1623641012 100644 --- a/experimental/zerotrie/src/byte_phf/mod.rs +++ b/experimental/zerotrie/src/byte_phf/mod.rs @@ -213,12 +213,12 @@ where if n == 0 { return None; } - let (qq, eks) = debug_split_at(buffer, n); + let (qq, eks) = buffer.debug_split_at(n); debug_assert_eq!(qq.len(), eks.len()); - let q = debug_get(qq, f1(key, *p, n))?; - let l2 = f2(key, q, n); - let ek = debug_get(eks, l2)?; - if ek == key { + let q = debug_unwrap!(qq.get(f1(key, *p, n)), return None); + let l2 = f2(key, *q, n); + let ek = debug_unwrap!(eks.get(l2), return None); + if *ek == key { Some(l2) } else { None @@ -232,7 +232,7 @@ where /// Get an iterator over the keys in the order in which they are stored in the map. pub fn keys(&self) -> &[u8] { let n = self.num_items(); - debug_split_at(self.0.as_ref(), 1 + n).1 + self.0.as_ref().debug_split_at(1 + n).1 } /// Diagnostic function that returns `p` and the maximum value of `q` #[cfg(test)] @@ -242,7 +242,7 @@ where if n == 0 { return None; } - let (qq, _) = debug_split_at(buffer, n); + let (qq, _) = buffer.debug_split_at(n); Some((*p, *qq.iter().max().unwrap())) } /// Returns the map as bytes. The map can be recovered with [`Self::from_store`] diff --git a/experimental/zerotrie/src/helpers.rs b/experimental/zerotrie/src/helpers.rs index b4bbb08815f..4bb3abe2ca1 100644 --- a/experimental/zerotrie/src/helpers.rs +++ b/experimental/zerotrie/src/helpers.rs @@ -2,54 +2,53 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use core::ops::Range; - -/// Like slice::split_at but debug-panics and returns an empty second slice -/// if the index is out of range. -#[inline] -pub(crate) fn debug_split_at(slice: &[u8], mid: usize) -> (&[u8], &[u8]) { - if mid > slice.len() { - debug_assert!(false, "debug_split_at: index expected to be in range"); - (slice, &[]) - } else { - // Note: We're trusting the compiler to inline this and remove the assertion - // hiding on the top of slice::split_at: `assert(mid <= self.len())` - slice.split_at(mid) - } +pub(crate) trait MaybeSplitAt { + /// Like slice::split_at but returns an Option instead of panicking + /// if the index is out of range. + fn maybe_split_at(&self, mid: usize) -> Option<(&Self, &Self)>; + /// Like slice::split_at but debug-panics and returns an empty second slice + /// if the index is out of range. + fn debug_split_at(&self, mid: usize) -> (&Self, &Self); } -/// Like slice::split_at but returns an Option instead of panicking. -#[inline] -pub(crate) fn maybe_split_at(slice: &[u8], mid: usize) -> Option<(&[u8], &[u8])> { - if mid > slice.len() { - None - } else { - // Note: We're trusting the compiler to inline this and remove the assertion - // hiding on the top of slice::split_at: `assert(mid <= self.len())` - Some(slice.split_at(mid)) - } -} - -/// Gets the item at the specified index, panicking in debug mode if it is not there. -#[inline] -pub(crate) fn debug_get(slice: &[u8], index: usize) -> Option { - match slice.get(index) { - Some(x) => Some(*x), - None => { - debug_assert!(false, "debug_get: index expected to be in range"); +impl MaybeSplitAt for [T] { + #[inline] + fn maybe_split_at(&self, mid: usize) -> Option<(&Self, &Self)> { + if mid > self.len() { None + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + Some(self.split_at(mid)) + } + } + #[inline] + fn debug_split_at(&self, mid: usize) -> (&Self, &Self) { + if mid > self.len() { + debug_assert!(false, "debug_split_at: index expected to be in range"); + (self, &[]) + } else { + // Note: We're trusting the compiler to inline this and remove the assertion + // hiding on the top of slice::split_at: `assert(mid <= self.len())` + self.split_at(mid) } } } -/// Gets the range between the specified indices, panicking in debug mode if not in bounds. -#[inline] -pub(crate) fn debug_get_range(slice: &[u8], range: Range) -> Option<&[u8]> { - match slice.get(range) { - Some(x) => Some(x), - None => { - debug_assert!(false, "debug_get_range: indices expected to be in range"); - None +pub(crate) trait DebugUnwrapOr { + /// Unwraps the option or panics in debug mode, returning the `gigo_value` + fn debug_unwrap_or(self, gigo_value: T) -> T; +} + +impl DebugUnwrapOr for Option { + #[inline] + fn debug_unwrap_or(self, gigo_value: T) -> T { + match self { + Some(x) => x, + None => { + debug_assert!(false, "debug_unwrap_or called on a None value"); + gigo_value + } } } } @@ -86,3 +85,5 @@ macro_rules! debug_unwrap { debug_unwrap!($expr, return ()) }; } + +pub(crate) use debug_unwrap; diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 56ec575ab50..afd447c5b40 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -219,17 +219,17 @@ use alloc::string::String; /// - `n` = the number of items in the offset table /// - `w` = the width of the offset table items minus one #[inline] -fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> Option<&[u8]> { +fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> &[u8] { let mut p = 0usize; let mut q = 0usize; loop { let indices; - (indices, trie) = debug_split_at(trie, n - 1); + (indices, trie) = trie.debug_split_at(n - 1); p = (p << 8) + if i == 0 { 0 } else { - debug_get(indices, i - 1)? as usize + *indices.get(i - 1).debug_unwrap_or(&0) as usize }; q = match indices.get(i) { Some(x) => (q << 8) + *x as usize, @@ -240,24 +240,24 @@ fn get_branch(mut trie: &[u8], i: usize, n: usize, mut w: usize) -> Option<&[u8] } w -= 1; } - debug_get_range(trie, p..q) + trie.get(p..q).debug_unwrap_or(&[]) } /// Version of [`get_branch()`] specialized for the case `w == 0` for performance #[inline] -fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> Option<&[u8]> { +fn get_branch_w0(mut trie: &[u8], i: usize, n: usize) -> &[u8] { let indices; - (indices, trie) = debug_split_at(trie, n - 1); + (indices, trie) = trie.debug_split_at(n - 1); let p = if i == 0 { 0 } else { - debug_get(indices, i - 1)? as usize + *indices.get(i - 1).debug_unwrap_or(&0) as usize }; let q = match indices.get(i) { Some(x) => *x as usize, None => trie.len(), }; - debug_get_range(trie, p..q) + trie.get(p..q).debug_unwrap_or(&[]) } /// The node type. See the module-level docs for more explanation of the four node types. @@ -331,8 +331,8 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x); - (ascii_span, ascii) = maybe_split_at(ascii, x)?; + (trie_span, trie) = trie.debug_split_at(x); + (ascii_span, ascii) = ascii.maybe_split_at(x)?; if trie_span == ascii_span { // Matched a byte span continue; @@ -348,13 +348,13 @@ pub fn get_bsearch_only(mut trie: &[u8], mut ascii: &[u8]) -> Option { let w = w & 0x3; let x = if x == 0 { 256 } else { x }; // Always use binary search - (search, trie) = debug_split_at(trie, x); + (search, trie) = trie.debug_split_at(x); i = search.binary_search(c).ok()?; trie = if w == 0 { get_branch_w0(trie, i, x) } else { get_branch(trie, i, x, w) - }?; + }; ascii = temp; continue; } else { @@ -395,8 +395,8 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x); - (ascii_span, ascii) = maybe_split_at(ascii, x)?; + (trie_span, trie) = trie.debug_split_at(x); + (ascii_span, ascii) = ascii.maybe_split_at(x)?; if trie_span == ascii_span { // Matched a byte span continue; @@ -413,18 +413,18 @@ pub fn get_phf_limited(mut trie: &[u8], mut ascii: &[u8]) -> Option { let x = if x == 0 { 256 } else { x }; if x < 16 { // binary search - (search, trie) = debug_split_at(trie, x); + (search, trie) = trie.debug_split_at(x); i = search.binary_search(c).ok()?; } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1); + (search, trie) = trie.debug_split_at(x * 2 + 1); i = PerfectByteHashMap::from_store(search).get(*c)?; } trie = if w == 0 { get_branch_w0(trie, i, x) } else { get_branch(trie, i, x, w) - }?; + }; ascii = temp; continue; } else { @@ -465,8 +465,8 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } if matches!(byte_type, NodeType::Span) { let (trie_span, ascii_span); - (trie_span, trie) = debug_split_at(trie, x); - (ascii_span, ascii) = maybe_split_at(ascii, x)?; + (trie_span, trie) = trie.debug_split_at(x); + (ascii_span, ascii) = ascii.maybe_split_at(x)?; if trie_span == ascii_span { // Matched a byte span continue; @@ -480,18 +480,18 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { let x = if x == 0 { 256 } else { x }; if x < 16 { // binary search - (search, trie) = debug_split_at(trie, x); + (search, trie) = trie.debug_split_at(x); i = search.binary_search(c).ok()?; } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1); + (search, trie) = trie.debug_split_at(x * 2 + 1); i = PerfectByteHashMap::from_store(search).get(*c)?; } trie = if w == 0 { get_branch_w0(trie, i, x) } else { get_branch(trie, i, x, w) - }?; + }; ascii = temp; continue; } else { @@ -558,7 +558,7 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { NodeType::Branch => read_varint_meta2(*b, trie), }; if matches!(byte_type, NodeType::Span) { - (span, trie) = debug_split_at(trie, x); + (span, trie) = trie.debug_split_at(x); string.extend(span); continue; } @@ -578,19 +578,19 @@ impl<'a> Iterator for ZeroTrieIterator<'a> { } let byte = if x < 16 || !self.use_phf { // binary search - (search, trie) = debug_split_at(trie, x); - debug_get(search, branch_idx)? + (search, trie) = trie.debug_split_at(x); + debug_unwrap!(search.get(branch_idx), return None) } else { // phf - (search, trie) = debug_split_at(trie, x * 2 + 1); - debug_get(search, branch_idx + x + 1)? + (search, trie) = trie.debug_split_at(x * 2 + 1); + debug_unwrap!(search.get(branch_idx + x + 1), return None) }; - string.push(byte); + string.push(*byte); trie = if w == 0 { get_branch_w0(trie, branch_idx, x) } else { get_branch(trie, branch_idx, x, w) - }?; + }; branch_idx = 0; } } From 3743661150a9cb80ab37431f0c82b83b076b9774 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 19:34:35 -0700 Subject: [PATCH 05/21] Add as_borrowed_slice and AsRef impl. Need to bikeshed the name. --- experimental/zerotrie/src/zerotrie.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index f319ed9324a..deedcb81443 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -231,6 +231,20 @@ macro_rules! impl_zerotrie_subtype { pub fn as_borrowed(&self) -> &$name<[u8]> { $name::from_bytes(self.store.as_ref()) } + /// Returns a trie with a store borrowing from this trie. + #[inline] + pub fn as_borrowed_slice(&self) -> $name<&[u8]> { + $name::from_store(self.store.as_ref()) + } + } + impl AsRef<$name<[u8]>> for $name + where + Store: AsRef<[u8]> + ?Sized, + { + #[inline] + fn as_ref(&self) -> &$name<[u8]> { + self.as_borrowed() + } } #[cfg(feature = "alloc")] impl $name From 46316a09e499f7c06ffbf8121c6c34b492ba00f6 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 23:14:42 -0600 Subject: [PATCH 06/21] Initial implementation of ZeroTrie step function --- experimental/zerotrie/src/reader.rs | 71 ++++++++++++++++++++++++ experimental/zerotrie/src/zerotrie.rs | 78 +++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index afd447c5b40..94bf1b2c190 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -504,6 +504,77 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } } +pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { + let (mut b, x, search); + loop { + (b, *trie) = match trie.split_first() { + Some(v) => v, + None => { + // Empty trie or only a value node + return + } + }; + match byte_type(*b) { + NodeType::Ascii if *b == c => { + // Matched a byte + return; + } + NodeType::Ascii => { + // Byte that doesn't match + *trie = &[]; + return; + } + NodeType::Branch => { + // Proceed to the branch node logic below + (x, *trie) = read_varint_meta2(*b, trie); + break; + } + NodeType::Span => { + // Question: Should we put the trie back into a valid state? + // Currently this code is unreachable so let's not worry about it. + debug_assert!(false, "span nodes not supported in stepping"); + return; + } + NodeType::Value => { + // Skip the value node and go to the next node + (_, *trie) = read_varint_meta3(*b, trie); + continue; + } + }; + } + // Branch node + let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; + // See comment above regarding this assertion + debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); + let w = w & 0x3; + let x = if x == 0 { 256 } else { x }; + // Always use binary search + (search, *trie) = trie.debug_split_at(x); + match search.binary_search(&c) { + Ok(i) => { + // Matched a byte + *trie = if w == 0 { + get_branch_w0(trie, i, x) + } else { + get_branch(trie, i, x, w) + }; + }, + Err(_) => { + // Byte that doesn't match + *trie = &[] + } + }; +} + +pub(crate) fn peek_value(mut trie: &[u8]) -> Option { + let b; + (b, trie) = trie.split_first()?; + match byte_type(*b) { + NodeType::Ascii | NodeType::Span | NodeType::Branch => None, + NodeType::Value => Some(read_varint_meta3(*b, trie).0), + } +} + #[cfg(feature = "alloc")] use alloc::vec::Vec; diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index deedcb81443..90cf22c2c87 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -204,6 +204,24 @@ macro_rules! impl_zerotrie_subtype { pub fn is_empty(&self) -> bool { self.store.as_ref().is_empty() } + /// Gets the value at the head of the trie. This is equivalent to + /// calling `get` with the empty string. + /// + /// # Examples + /// + /// ``` + #[doc = concat!("use zerotrie::", stringify!($name), ";")] + /// + /// // A trie with two values: "" and "abc" + #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"\\x80abc\\x81\");")] + /// + /// assert_eq!(Some(0), trie.head_value()); + /// assert_eq!(Some(0), trie.get("")); + /// ``` + #[inline] + pub fn head_value(&self) -> Option { + peek_value(self.store.as_ref()) + } /// Returns the size of the trie in number of bytes. /// /// To get the number of keys in the trie, use `.iter().count()`: @@ -565,6 +583,66 @@ impl_zerotrie_subtype!( Vec::into_boxed_slice ); +impl ZeroTrieSimpleAscii<&[u8]> { + /// Steps one node into the trie, mutating self. + /// + /// Useful to query a trie with data that is not a slice. Use + /// [`Self::head_value()`] to check for the presence of a string + /// in the trie. + /// + /// This is only supported on `ZeroTrieSimpleAscii` because other trie + /// types may contain span nodes, which cannot be split. + /// + /// # Examples + /// + /// Get a value out of a trie by manually iterating over the bytes: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Get out the value for "abc" + /// let mut it = trie.as_borrowed_slice(); + /// for c in b"abc".iter() { + /// it.step(*c); + /// } + /// assert_eq!(it.head_value(), Some(0)); + /// ``` + /// + /// Unrolled loop checking for string presence at every step: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Search the trie for the string "abcdxy" + /// let mut it = trie.as_borrowed_slice(); + /// assert_eq!(it.head_value(), None); // "" + /// it.step(b'a'); + /// assert_eq!(it.head_value(), None); // "a" + /// it.step(b'b'); + /// assert_eq!(it.head_value(), None); // "ab" + /// it.step(b'c'); + /// assert_eq!(it.head_value(), Some(0)); // "abc" + /// it.step(b'd'); + /// assert_eq!(it.head_value(), None); // "abcd" + /// assert!(!it.is_empty()); + /// it.step(b'x'); // no strings have the prefix "abcdx" + /// assert!(it.is_empty()); + /// assert_eq!(it.head_value(), None); // "abcdx" + /// it.step(b'y'); + /// assert_eq!(it.head_value(), None); // "abcdxy" + /// ``` + #[inline] + pub fn step(&mut self, byte: u8) { + step_bsearch_only(&mut self.store, byte) + } +} + macro_rules! impl_dispatch { ($self:ident, $inner_fn:ident()) => { match $self.0 { From 2d8d8f9cced0f04b68431ad0df4a4347be2ba2cd Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Wed, 29 Nov 2023 23:49:39 -0600 Subject: [PATCH 07/21] Use ZeroTrie stepping in BlobSchemaV2 to avoid allocations --- provider/blob/src/blob_schema.rs | 37 ++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 4475b2e3236..0bee8c6c3c3 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -3,6 +3,7 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use alloc::boxed::Box; +use core::fmt; use icu_provider::prelude::*; use serde::Deserialize; use writeable::Writeable; @@ -149,6 +150,28 @@ impl Default for BlobSchemaV2<'_> { } } +/// A struct that steps through a ZeroTrie when fed data from fmt::Write +struct ZeroTrieStepWrite<'a> { + trie: ZeroTrieSimpleAscii<&'a [u8]>, +} + +impl<'a> fmt::Write for ZeroTrieStepWrite<'a> { + fn write_str(&mut self, s: &str) -> fmt::Result { + for b in s.bytes() { + self.trie.step(b); + } + Ok(()) + } + fn write_char(&mut self, c: char) -> fmt::Result { + debug_assert!(c.is_ascii()); + self.trie.step(c as u8); + Ok(()) + } + fn write_fmt(&mut self, _: fmt::Arguments<'_>) -> fmt::Result { + unreachable!() + } +} + impl<'data> BlobSchemaV2<'data> { pub fn load(&self, key: DataKey, req: DataRequest) -> Result<&'data [u8], DataError> { let key_index = self @@ -163,10 +186,16 @@ impl<'data> BlobSchemaV2<'data> { .locales .get(key_index) .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; - // TODO(#4249): Add a lookup function to zerotrie so we don't need to stringify - let locale_str = req.locale.write_to_string(); - let blob_index = ZeroTrieSimpleAscii::from_store(zerotrie) - .get(locale_str.as_bytes()) + let mut trie_write = ZeroTrieStepWrite { + trie: ZeroTrieSimpleAscii::from_store(zerotrie), + }; + #[allow(clippy::unwrap_used)] // infallible impl + req.locale + .write_to(&mut trie_write) + .unwrap(); + let blob_index = trie_write + .trie + .head_value() .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; let buffer = self .buffers From f9a614e7079ac3db2707ea471b9bab92c06995ef Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 00:56:07 -0600 Subject: [PATCH 08/21] Add APIs for ZeroTrieSimpleAsciiCursor and use them in BlobSchemaV2 --- experimental/zerotrie/src/lib.rs | 1 + experimental/zerotrie/src/zerotrie.rs | 93 +++++++++++++++++++++++---- provider/blob/src/blob_schema.rs | 14 ++-- 3 files changed, 87 insertions(+), 21 deletions(-) diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 817093d2f36..61fca0555b9 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -69,6 +69,7 @@ pub use crate::zerotrie::ZeroTrie; pub use crate::zerotrie::ZeroTrieExtendedCapacity; pub use crate::zerotrie::ZeroTriePerfectHash; pub use crate::zerotrie::ZeroTrieSimpleAscii; +pub use crate::zerotrie::ZeroTrieSimpleAsciiCursor; pub use error::Error as ZeroTrieError; #[doc(hidden)] diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 90cf22c2c87..9e37a3dbfd7 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -583,15 +583,17 @@ impl_zerotrie_subtype!( Vec::into_boxed_slice ); -impl ZeroTrieSimpleAscii<&[u8]> { +impl ZeroTrieSimpleAscii +where + Store: AsRef<[u8]> + ?Sized, +{ /// Steps one node into the trie, mutating self. /// /// Useful to query a trie with data that is not a slice. Use /// [`Self::head_value()`] to check for the presence of a string /// in the trie. /// - /// This is only supported on `ZeroTrieSimpleAscii` because other trie - /// types may contain span nodes, which cannot be split. + /// This is currently supported only on `ZeroTrieSimpleAscii`. /// /// # Examples /// @@ -604,11 +606,16 @@ impl ZeroTrieSimpleAscii<&[u8]> { /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); /// /// // Get out the value for "abc" - /// let mut it = trie.as_borrowed_slice(); + /// let mut it = trie.cursor(); /// for c in b"abc".iter() { + /// // Checking is_empty() is not required, but it is + /// // good for efficiency + /// if it.is_empty() { + /// break; + /// } /// it.step(*c); /// } - /// assert_eq!(it.head_value(), Some(0)); + /// assert_eq!(it.value(), Some(0)); /// ``` /// /// Unrolled loop checking for string presence at every step: @@ -620,26 +627,84 @@ impl ZeroTrieSimpleAscii<&[u8]> { /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); /// /// // Search the trie for the string "abcdxy" - /// let mut it = trie.as_borrowed_slice(); - /// assert_eq!(it.head_value(), None); // "" + /// let mut it = trie.cursor(); + /// assert_eq!(it.value(), None); // "" /// it.step(b'a'); - /// assert_eq!(it.head_value(), None); // "a" + /// assert_eq!(it.value(), None); // "a" /// it.step(b'b'); - /// assert_eq!(it.head_value(), None); // "ab" + /// assert_eq!(it.value(), None); // "ab" /// it.step(b'c'); - /// assert_eq!(it.head_value(), Some(0)); // "abc" + /// assert_eq!(it.value(), Some(0)); // "abc" /// it.step(b'd'); - /// assert_eq!(it.head_value(), None); // "abcd" + /// assert_eq!(it.value(), None); // "abcd" /// assert!(!it.is_empty()); /// it.step(b'x'); // no strings have the prefix "abcdx" /// assert!(it.is_empty()); - /// assert_eq!(it.head_value(), None); // "abcdx" + /// assert_eq!(it.value(), None); // "abcdx" /// it.step(b'y'); - /// assert_eq!(it.head_value(), None); // "abcdxy" + /// assert_eq!(it.value(), None); // "abcdxy" /// ``` #[inline] + pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { + ZeroTrieSimpleAsciiCursor { + trie: self.as_borrowed_slice(), + } + } +} + +impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { + /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid + /// having to doubly anchor the trie to the stack. + #[inline] + pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { + ZeroTrieSimpleAsciiCursor { + trie: self + } + } +} + +/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. +/// +/// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. +#[derive(Debug)] +pub struct ZeroTrieSimpleAsciiCursor<'a> { + trie: ZeroTrieSimpleAscii<&'a [u8]>, +} + +impl<'a> ZeroTrieSimpleAsciiCursor<'a> { + /// Steps the cursor one byte into the trie. + #[inline] pub fn step(&mut self, byte: u8) { - step_bsearch_only(&mut self.store, byte) + step_bsearch_only(&mut self.trie.store, byte) + } + + /// Gets the value at the current position in the trie. + /// + /// Calling this function on a new cursor is equivalent to calling `.get()` + /// with the empty string. + /// + /// # Examples + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "" and "abc" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); + /// + /// assert_eq!(Some(0), trie.get("")); + /// assert_eq!(Some(0), trie.cursor().value()); + /// ``` + #[inline] + pub fn value(&self) -> Option { + peek_value(self.trie.store.as_ref()) + } + + /// Checks whether the cursor points to an empty trie. + /// + /// Use this to determine when to stop iterating. + #[inline] + pub fn is_empty(&self) -> bool { + self.trie.is_empty() } } diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 0bee8c6c3c3..431e1a2b7de 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -7,7 +7,7 @@ use core::fmt; use icu_provider::prelude::*; use serde::Deserialize; use writeable::Writeable; -use zerotrie::ZeroTrieSimpleAscii; +use zerotrie::{ZeroTrieSimpleAscii, ZeroTrieSimpleAsciiCursor}; use zerovec::maps::{ZeroMap2dBorrowed, ZeroMapKV}; use zerovec::vecs::{Index32, VarZeroSlice, VarZeroVec, ZeroSlice}; @@ -152,19 +152,19 @@ impl Default for BlobSchemaV2<'_> { /// A struct that steps through a ZeroTrie when fed data from fmt::Write struct ZeroTrieStepWrite<'a> { - trie: ZeroTrieSimpleAscii<&'a [u8]>, + cursor: ZeroTrieSimpleAsciiCursor<'a>, } impl<'a> fmt::Write for ZeroTrieStepWrite<'a> { fn write_str(&mut self, s: &str) -> fmt::Result { for b in s.bytes() { - self.trie.step(b); + self.cursor.step(b); } Ok(()) } fn write_char(&mut self, c: char) -> fmt::Result { debug_assert!(c.is_ascii()); - self.trie.step(c as u8); + self.cursor.step(c as u8); Ok(()) } fn write_fmt(&mut self, _: fmt::Arguments<'_>) -> fmt::Result { @@ -187,15 +187,15 @@ impl<'data> BlobSchemaV2<'data> { .get(key_index) .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; let mut trie_write = ZeroTrieStepWrite { - trie: ZeroTrieSimpleAscii::from_store(zerotrie), + cursor: ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor(), }; #[allow(clippy::unwrap_used)] // infallible impl req.locale .write_to(&mut trie_write) .unwrap(); let blob_index = trie_write - .trie - .head_value() + .cursor + .value() .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; let buffer = self .buffers From 929f81fcf0cc4d20024a852cd86d83bcb8525fad Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 00:57:10 -0600 Subject: [PATCH 09/21] Move around examples and cargo fmt --- experimental/zerotrie/src/reader.rs | 8 +-- experimental/zerotrie/src/zerotrie.rs | 80 ++++++++++++++++++--------- 2 files changed, 57 insertions(+), 31 deletions(-) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index 94bf1b2c190..c27592f7816 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -504,14 +504,14 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } } -pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { +pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { let (mut b, x, search); loop { (b, *trie) = match trie.split_first() { Some(v) => v, None => { // Empty trie or only a value node - return + return; } }; match byte_type(*b) { @@ -558,7 +558,7 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { } else { get_branch(trie, i, x, w) }; - }, + } Err(_) => { // Byte that doesn't match *trie = &[] @@ -566,7 +566,7 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { }; } -pub(crate) fn peek_value(mut trie: &[u8]) -> Option { +pub(crate) fn peek_value(mut trie: &[u8]) -> Option { let b; (b, trie) = trie.split_first()?; match byte_type(*b) { diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 9e37a3dbfd7..00cd71de1e8 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -587,11 +587,9 @@ impl ZeroTrieSimpleAscii where Store: AsRef<[u8]> + ?Sized, { - /// Steps one node into the trie, mutating self. + /// Gets a cursor into the current trie. /// - /// Useful to query a trie with data that is not a slice. Use - /// [`Self::head_value()`] to check for the presence of a string - /// in the trie. + /// Useful to query a trie with data that is not a slice. /// /// This is currently supported only on `ZeroTrieSimpleAscii`. /// @@ -606,19 +604,19 @@ where /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); /// /// // Get out the value for "abc" - /// let mut it = trie.cursor(); - /// for c in b"abc".iter() { + /// let mut cursor = trie.cursor(); + /// for b in b"abc".iter() { /// // Checking is_empty() is not required, but it is /// // good for efficiency - /// if it.is_empty() { + /// if cursor.is_empty() { /// break; /// } - /// it.step(*c); + /// cursor.step(*b); /// } - /// assert_eq!(it.value(), Some(0)); + /// assert_eq!(cursor.value(), Some(0)); /// ``` /// - /// Unrolled loop checking for string presence at every step: + /// Find the longest prefix match: /// /// ``` /// use zerotrie::ZeroTrieSimpleAscii; @@ -626,23 +624,22 @@ where /// // A trie with two values: "abc" and "abcdef" /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); /// - /// // Search the trie for the string "abcdxy" - /// let mut it = trie.cursor(); - /// assert_eq!(it.value(), None); // "" - /// it.step(b'a'); - /// assert_eq!(it.value(), None); // "a" - /// it.step(b'b'); - /// assert_eq!(it.value(), None); // "ab" - /// it.step(b'c'); - /// assert_eq!(it.value(), Some(0)); // "abc" - /// it.step(b'd'); - /// assert_eq!(it.value(), None); // "abcd" - /// assert!(!it.is_empty()); - /// it.step(b'x'); // no strings have the prefix "abcdx" - /// assert!(it.is_empty()); - /// assert_eq!(it.value(), None); // "abcdx" - /// it.step(b'y'); - /// assert_eq!(it.value(), None); // "abcdxy" + /// // Find the longest prefix of the string "abcdxy": + /// let query = b"abcdxy"; + /// let mut longest_prefix = 0; + /// let mut cursor = trie.cursor(); + /// for (i, b) in query.iter().enumerate() { + /// if cursor.is_empty() { + /// break; + /// } + /// if cursor.value().is_some() { + /// longest_prefix = i; + /// } + /// cursor.step(*b); + /// } + /// + /// // The longest prefix is "abc" which is length 3: + /// assert_eq!(longest_prefix, 3); /// ``` #[inline] pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { @@ -673,6 +670,35 @@ pub struct ZeroTrieSimpleAsciiCursor<'a> { impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// Steps the cursor one byte into the trie. + /// + /// # Examples + /// + /// Unrolled loop checking for string presence at every step: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Search the trie for the string "abcdxy" + /// let mut cursor = trie.cursor(); + /// assert_eq!(cursor.value(), None); // "" + /// cursor.step(b'a'); + /// assert_eq!(cursor.value(), None); // "a" + /// cursor.step(b'b'); + /// assert_eq!(cursor.value(), None); // "ab" + /// cursor.step(b'c'); + /// assert_eq!(cursor.value(), Some(0)); // "abc" + /// cursor.step(b'd'); + /// assert_eq!(cursor.value(), None); // "abcd" + /// assert!(!cursor.is_empty()); + /// cursor.step(b'x'); // no strings have the prefix "abcdx" + /// assert!(cursor.is_empty()); + /// assert_eq!(cursor.value(), None); // "abcdx" + /// cursor.step(b'y'); + /// assert_eq!(cursor.value(), None); // "abcdxy" + /// ``` #[inline] pub fn step(&mut self, byte: u8) { step_bsearch_only(&mut self.trie.store, byte) From 79fc8907fd5c132c10279e04ce364d27c2bdc52a Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:02:24 -0600 Subject: [PATCH 10/21] Return core::fmt::Error instead of asserting ascii --- provider/blob/src/blob_schema.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 431e1a2b7de..66b1dda3ac7 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -158,12 +158,17 @@ struct ZeroTrieStepWrite<'a> { impl<'a> fmt::Write for ZeroTrieStepWrite<'a> { fn write_str(&mut self, s: &str) -> fmt::Result { for b in s.bytes() { + if !b.is_ascii() { + return Err(fmt::Error); + } self.cursor.step(b); } Ok(()) } fn write_char(&mut self, c: char) -> fmt::Result { - debug_assert!(c.is_ascii()); + if !c.is_ascii() { + return Err(fmt::Error); + } self.cursor.step(c as u8); Ok(()) } From 3a8efa16cce85e0fd30bd27aeae8589f1181f065 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:05:54 -0600 Subject: [PATCH 11/21] Move the core::fmt::Write impl into the zerotrie crate --- experimental/zerotrie/src/zerotrie.rs | 23 ++++++++++++++++ provider/blob/src/blob_schema.rs | 39 +++------------------------ 2 files changed, 27 insertions(+), 35 deletions(-) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 00cd71de1e8..8416c142f3e 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -4,6 +4,7 @@ use crate::reader::*; +use core::fmt; use core::borrow::Borrow; #[cfg(feature = "alloc")] @@ -734,6 +735,28 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { } } +impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { + fn write_str(&mut self, s: &str) -> fmt::Result { + for b in s.bytes() { + if !b.is_ascii() { + return Err(fmt::Error); + } + self.step(b); + } + Ok(()) + } + fn write_char(&mut self, c: char) -> fmt::Result { + if !c.is_ascii() { + return Err(fmt::Error); + } + self.step(c as u8); + Ok(()) + } + fn write_fmt(&mut self, _: fmt::Arguments<'_>) -> fmt::Result { + unreachable!() + } +} + macro_rules! impl_dispatch { ($self:ident, $inner_fn:ident()) => { match $self.0 { diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 66b1dda3ac7..7c385679337 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -3,11 +3,10 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). use alloc::boxed::Box; -use core::fmt; use icu_provider::prelude::*; use serde::Deserialize; use writeable::Writeable; -use zerotrie::{ZeroTrieSimpleAscii, ZeroTrieSimpleAsciiCursor}; +use zerotrie::ZeroTrieSimpleAscii; use zerovec::maps::{ZeroMap2dBorrowed, ZeroMapKV}; use zerovec::vecs::{Index32, VarZeroSlice, VarZeroVec, ZeroSlice}; @@ -150,33 +149,6 @@ impl Default for BlobSchemaV2<'_> { } } -/// A struct that steps through a ZeroTrie when fed data from fmt::Write -struct ZeroTrieStepWrite<'a> { - cursor: ZeroTrieSimpleAsciiCursor<'a>, -} - -impl<'a> fmt::Write for ZeroTrieStepWrite<'a> { - fn write_str(&mut self, s: &str) -> fmt::Result { - for b in s.bytes() { - if !b.is_ascii() { - return Err(fmt::Error); - } - self.cursor.step(b); - } - Ok(()) - } - fn write_char(&mut self, c: char) -> fmt::Result { - if !c.is_ascii() { - return Err(fmt::Error); - } - self.cursor.step(c as u8); - Ok(()) - } - fn write_fmt(&mut self, _: fmt::Arguments<'_>) -> fmt::Result { - unreachable!() - } -} - impl<'data> BlobSchemaV2<'data> { pub fn load(&self, key: DataKey, req: DataRequest) -> Result<&'data [u8], DataError> { let key_index = self @@ -191,15 +163,12 @@ impl<'data> BlobSchemaV2<'data> { .locales .get(key_index) .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; - let mut trie_write = ZeroTrieStepWrite { - cursor: ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor(), - }; + let mut cursor = ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor(); #[allow(clippy::unwrap_used)] // infallible impl req.locale - .write_to(&mut trie_write) + .write_to(&mut cursor) .unwrap(); - let blob_index = trie_write - .cursor + let blob_index = cursor .value() .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; let buffer = self From 838ad05f6b9d8a4d25782f9226ab893f6d5a6f94 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:06:16 -0600 Subject: [PATCH 12/21] Don't assert unreachable anymore --- experimental/zerotrie/src/zerotrie.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 8416c142f3e..0a6d2486476 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -752,9 +752,6 @@ impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { self.step(c as u8); Ok(()) } - fn write_fmt(&mut self, _: fmt::Arguments<'_>) -> fmt::Result { - unreachable!() - } } macro_rules! impl_dispatch { From 4e706186392868bd0901a5f1e3398d7db8139474 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:18:02 -0600 Subject: [PATCH 13/21] Docs for the new core::fmt::Write impl --- experimental/zerotrie/src/zerotrie.rs | 48 ++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 9 deletions(-) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 0a6d2486476..5608927c8c1 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -596,9 +596,10 @@ where /// /// # Examples /// - /// Get a value out of a trie by manually iterating over the bytes: + /// Get a value out of a trie by [writing](fmt::Write) it to the cursor: /// /// ``` + /// use core::fmt::Write; /// use zerotrie::ZeroTrieSimpleAscii; /// /// // A trie with two values: "abc" and "abcdef" @@ -606,14 +607,7 @@ where /// /// // Get out the value for "abc" /// let mut cursor = trie.cursor(); - /// for b in b"abc".iter() { - /// // Checking is_empty() is not required, but it is - /// // good for efficiency - /// if cursor.is_empty() { - /// break; - /// } - /// cursor.step(*b); - /// } + /// write!(&mut cursor, "abc"); /// assert_eq!(cursor.value(), Some(0)); /// ``` /// @@ -630,6 +624,8 @@ where /// let mut longest_prefix = 0; /// let mut cursor = trie.cursor(); /// for (i, b) in query.iter().enumerate() { + /// // Checking is_empty() is not required, but it is + /// // good for efficiency /// if cursor.is_empty() { /// break; /// } @@ -736,6 +732,23 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { } impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { + /// Steps the cursor through each ASCII byte of the string. + /// + /// If the string contains non-ASCII chars, an error is returned. + /// + /// # Examples + /// + /// ``` + /// use core::fmt::Write; + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// let mut cursor = trie.cursor(); + /// cursor.write_str("abcdxy").expect("all ASCII"); + /// cursor.write_str("🚂").expect_err("non-ASCII"); + /// ``` fn write_str(&mut self, s: &str) -> fmt::Result { for b in s.bytes() { if !b.is_ascii() { @@ -745,6 +758,23 @@ impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { } Ok(()) } + /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns + /// an error if the char is non-ASCII. + /// + /// # Examples + /// + /// ``` + /// use core::fmt::Write; + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// let mut cursor = trie.cursor(); + /// cursor.write_char('a').expect("ASCII"); + /// cursor.write_char('x').expect("ASCII"); + /// cursor.write_char('🚂').expect_err("non-ASCII"); + /// ``` fn write_char(&mut self, c: char) -> fmt::Result { if !c.is_ascii() { return Err(fmt::Error); From 8f754cc48e4b40e8c7b10e22a44bae3876d5f439 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:20:15 -0600 Subject: [PATCH 14/21] Move cursor impls to their own file --- experimental/zerotrie/src/cursor.rs | 208 ++++++++++++++++++++++++++ experimental/zerotrie/src/lib.rs | 3 +- experimental/zerotrie/src/zerotrie.rs | 201 ------------------------- 3 files changed, 210 insertions(+), 202 deletions(-) create mode 100644 experimental/zerotrie/src/cursor.rs diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs new file mode 100644 index 00000000000..e03360abb45 --- /dev/null +++ b/experimental/zerotrie/src/cursor.rs @@ -0,0 +1,208 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::ZeroTrieSimpleAscii; +use crate::reader::*; + +use core::fmt; + +impl ZeroTrieSimpleAscii +where + Store: AsRef<[u8]> + ?Sized, +{ + /// Gets a cursor into the current trie. + /// + /// Useful to query a trie with data that is not a slice. + /// + /// This is currently supported only on `ZeroTrieSimpleAscii`. + /// + /// # Examples + /// + /// Get a value out of a trie by [writing](fmt::Write) it to the cursor: + /// + /// ``` + /// use core::fmt::Write; + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Get out the value for "abc" + /// let mut cursor = trie.cursor(); + /// write!(&mut cursor, "abc"); + /// assert_eq!(cursor.value(), Some(0)); + /// ``` + /// + /// Find the longest prefix match: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Find the longest prefix of the string "abcdxy": + /// let query = b"abcdxy"; + /// let mut longest_prefix = 0; + /// let mut cursor = trie.cursor(); + /// for (i, b) in query.iter().enumerate() { + /// // Checking is_empty() is not required, but it is + /// // good for efficiency + /// if cursor.is_empty() { + /// break; + /// } + /// if cursor.value().is_some() { + /// longest_prefix = i; + /// } + /// cursor.step(*b); + /// } + /// + /// // The longest prefix is "abc" which is length 3: + /// assert_eq!(longest_prefix, 3); + /// ``` + #[inline] + pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { + ZeroTrieSimpleAsciiCursor { + trie: self.as_borrowed_slice(), + } + } +} + +impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { + /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid + /// having to doubly anchor the trie to the stack. + #[inline] + pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { + ZeroTrieSimpleAsciiCursor { + trie: self + } + } +} + +/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. +/// +/// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. +#[derive(Debug)] +pub struct ZeroTrieSimpleAsciiCursor<'a> { + trie: ZeroTrieSimpleAscii<&'a [u8]>, +} + +impl<'a> ZeroTrieSimpleAsciiCursor<'a> { + /// Steps the cursor one byte into the trie. + /// + /// # Examples + /// + /// Unrolled loop checking for string presence at every step: + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// // Search the trie for the string "abcdxy" + /// let mut cursor = trie.cursor(); + /// assert_eq!(cursor.value(), None); // "" + /// cursor.step(b'a'); + /// assert_eq!(cursor.value(), None); // "a" + /// cursor.step(b'b'); + /// assert_eq!(cursor.value(), None); // "ab" + /// cursor.step(b'c'); + /// assert_eq!(cursor.value(), Some(0)); // "abc" + /// cursor.step(b'd'); + /// assert_eq!(cursor.value(), None); // "abcd" + /// assert!(!cursor.is_empty()); + /// cursor.step(b'x'); // no strings have the prefix "abcdx" + /// assert!(cursor.is_empty()); + /// assert_eq!(cursor.value(), None); // "abcdx" + /// cursor.step(b'y'); + /// assert_eq!(cursor.value(), None); // "abcdxy" + /// ``` + #[inline] + pub fn step(&mut self, byte: u8) { + step_bsearch_only(&mut self.trie.store, byte) + } + + /// Gets the value at the current position in the trie. + /// + /// Calling this function on a new cursor is equivalent to calling `.get()` + /// with the empty string. + /// + /// # Examples + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "" and "abc" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); + /// + /// assert_eq!(Some(0), trie.get("")); + /// assert_eq!(Some(0), trie.cursor().value()); + /// ``` + #[inline] + pub fn value(&self) -> Option { + peek_value(self.trie.store.as_ref()) + } + + /// Checks whether the cursor points to an empty trie. + /// + /// Use this to determine when to stop iterating. + #[inline] + pub fn is_empty(&self) -> bool { + self.trie.is_empty() + } +} + +impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { + /// Steps the cursor through each ASCII byte of the string. + /// + /// If the string contains non-ASCII chars, an error is returned. + /// + /// # Examples + /// + /// ``` + /// use core::fmt::Write; + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// let mut cursor = trie.cursor(); + /// cursor.write_str("abcdxy").expect("all ASCII"); + /// cursor.write_str("🚂").expect_err("non-ASCII"); + /// ``` + fn write_str(&mut self, s: &str) -> fmt::Result { + for b in s.bytes() { + if !b.is_ascii() { + return Err(fmt::Error); + } + self.step(b); + } + Ok(()) + } + /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns + /// an error if the char is non-ASCII. + /// + /// # Examples + /// + /// ``` + /// use core::fmt::Write; + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "abc" and "abcdef" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); + /// + /// let mut cursor = trie.cursor(); + /// cursor.write_char('a').expect("ASCII"); + /// cursor.write_char('x').expect("ASCII"); + /// cursor.write_char('🚂').expect_err("non-ASCII"); + /// ``` + fn write_char(&mut self, c: char) -> fmt::Result { + if !c.is_ascii() { + return Err(fmt::Error); + } + self.step(c as u8); + Ok(()) + } +} diff --git a/experimental/zerotrie/src/lib.rs b/experimental/zerotrie/src/lib.rs index 61fca0555b9..f03c56122e8 100644 --- a/experimental/zerotrie/src/lib.rs +++ b/experimental/zerotrie/src/lib.rs @@ -56,6 +56,7 @@ extern crate alloc; mod builder; mod byte_phf; +mod cursor; mod error; #[macro_use] mod helpers; @@ -65,11 +66,11 @@ mod serde; mod varint; mod zerotrie; +pub use crate::cursor::ZeroTrieSimpleAsciiCursor; pub use crate::zerotrie::ZeroTrie; pub use crate::zerotrie::ZeroTrieExtendedCapacity; pub use crate::zerotrie::ZeroTriePerfectHash; pub use crate::zerotrie::ZeroTrieSimpleAscii; -pub use crate::zerotrie::ZeroTrieSimpleAsciiCursor; pub use error::Error as ZeroTrieError; #[doc(hidden)] diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 5608927c8c1..34921efd5ad 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -4,7 +4,6 @@ use crate::reader::*; -use core::fmt; use core::borrow::Borrow; #[cfg(feature = "alloc")] @@ -584,206 +583,6 @@ impl_zerotrie_subtype!( Vec::into_boxed_slice ); -impl ZeroTrieSimpleAscii -where - Store: AsRef<[u8]> + ?Sized, -{ - /// Gets a cursor into the current trie. - /// - /// Useful to query a trie with data that is not a slice. - /// - /// This is currently supported only on `ZeroTrieSimpleAscii`. - /// - /// # Examples - /// - /// Get a value out of a trie by [writing](fmt::Write) it to the cursor: - /// - /// ``` - /// use core::fmt::Write; - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "abc" and "abcdef" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); - /// - /// // Get out the value for "abc" - /// let mut cursor = trie.cursor(); - /// write!(&mut cursor, "abc"); - /// assert_eq!(cursor.value(), Some(0)); - /// ``` - /// - /// Find the longest prefix match: - /// - /// ``` - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "abc" and "abcdef" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); - /// - /// // Find the longest prefix of the string "abcdxy": - /// let query = b"abcdxy"; - /// let mut longest_prefix = 0; - /// let mut cursor = trie.cursor(); - /// for (i, b) in query.iter().enumerate() { - /// // Checking is_empty() is not required, but it is - /// // good for efficiency - /// if cursor.is_empty() { - /// break; - /// } - /// if cursor.value().is_some() { - /// longest_prefix = i; - /// } - /// cursor.step(*b); - /// } - /// - /// // The longest prefix is "abc" which is length 3: - /// assert_eq!(longest_prefix, 3); - /// ``` - #[inline] - pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { - ZeroTrieSimpleAsciiCursor { - trie: self.as_borrowed_slice(), - } - } -} - -impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { - /// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid - /// having to doubly anchor the trie to the stack. - #[inline] - pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { - ZeroTrieSimpleAsciiCursor { - trie: self - } - } -} - -/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. -/// -/// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. -#[derive(Debug)] -pub struct ZeroTrieSimpleAsciiCursor<'a> { - trie: ZeroTrieSimpleAscii<&'a [u8]>, -} - -impl<'a> ZeroTrieSimpleAsciiCursor<'a> { - /// Steps the cursor one byte into the trie. - /// - /// # Examples - /// - /// Unrolled loop checking for string presence at every step: - /// - /// ``` - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "abc" and "abcdef" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); - /// - /// // Search the trie for the string "abcdxy" - /// let mut cursor = trie.cursor(); - /// assert_eq!(cursor.value(), None); // "" - /// cursor.step(b'a'); - /// assert_eq!(cursor.value(), None); // "a" - /// cursor.step(b'b'); - /// assert_eq!(cursor.value(), None); // "ab" - /// cursor.step(b'c'); - /// assert_eq!(cursor.value(), Some(0)); // "abc" - /// cursor.step(b'd'); - /// assert_eq!(cursor.value(), None); // "abcd" - /// assert!(!cursor.is_empty()); - /// cursor.step(b'x'); // no strings have the prefix "abcdx" - /// assert!(cursor.is_empty()); - /// assert_eq!(cursor.value(), None); // "abcdx" - /// cursor.step(b'y'); - /// assert_eq!(cursor.value(), None); // "abcdxy" - /// ``` - #[inline] - pub fn step(&mut self, byte: u8) { - step_bsearch_only(&mut self.trie.store, byte) - } - - /// Gets the value at the current position in the trie. - /// - /// Calling this function on a new cursor is equivalent to calling `.get()` - /// with the empty string. - /// - /// # Examples - /// - /// ``` - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "" and "abc" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); - /// - /// assert_eq!(Some(0), trie.get("")); - /// assert_eq!(Some(0), trie.cursor().value()); - /// ``` - #[inline] - pub fn value(&self) -> Option { - peek_value(self.trie.store.as_ref()) - } - - /// Checks whether the cursor points to an empty trie. - /// - /// Use this to determine when to stop iterating. - #[inline] - pub fn is_empty(&self) -> bool { - self.trie.is_empty() - } -} - -impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { - /// Steps the cursor through each ASCII byte of the string. - /// - /// If the string contains non-ASCII chars, an error is returned. - /// - /// # Examples - /// - /// ``` - /// use core::fmt::Write; - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "abc" and "abcdef" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); - /// - /// let mut cursor = trie.cursor(); - /// cursor.write_str("abcdxy").expect("all ASCII"); - /// cursor.write_str("🚂").expect_err("non-ASCII"); - /// ``` - fn write_str(&mut self, s: &str) -> fmt::Result { - for b in s.bytes() { - if !b.is_ascii() { - return Err(fmt::Error); - } - self.step(b); - } - Ok(()) - } - /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns - /// an error if the char is non-ASCII. - /// - /// # Examples - /// - /// ``` - /// use core::fmt::Write; - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "abc" and "abcdef" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); - /// - /// let mut cursor = trie.cursor(); - /// cursor.write_char('a').expect("ASCII"); - /// cursor.write_char('x').expect("ASCII"); - /// cursor.write_char('🚂').expect_err("non-ASCII"); - /// ``` - fn write_char(&mut self, c: char) -> fmt::Result { - if !c.is_ascii() { - return Err(fmt::Error); - } - self.step(c as u8); - Ok(()) - } -} - macro_rules! impl_dispatch { ($self:ident, $inner_fn:ident()) => { match $self.0 { From bb4d1e700b5a09b89a684e899da6074cef75d9b2 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:23:33 -0600 Subject: [PATCH 15/21] fmt, clippy --- experimental/zerotrie/src/cursor.rs | 8 +++----- provider/blob/src/blob_schema.rs | 4 +--- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs index e03360abb45..4f63fda2b1f 100644 --- a/experimental/zerotrie/src/cursor.rs +++ b/experimental/zerotrie/src/cursor.rs @@ -2,8 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::ZeroTrieSimpleAscii; use crate::reader::*; +use crate::ZeroTrieSimpleAscii; use core::fmt; @@ -74,9 +74,7 @@ impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { /// having to doubly anchor the trie to the stack. #[inline] pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { - ZeroTrieSimpleAsciiCursor { - trie: self - } + ZeroTrieSimpleAsciiCursor { trie: self } } } @@ -142,7 +140,7 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// ``` #[inline] pub fn value(&self) -> Option { - peek_value(self.trie.store.as_ref()) + peek_value(self.trie.store) } /// Checks whether the cursor points to an empty trie. diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index 7c385679337..cf99364e196 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -165,9 +165,7 @@ impl<'data> BlobSchemaV2<'data> { .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; let mut cursor = ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor(); #[allow(clippy::unwrap_used)] // infallible impl - req.locale - .write_to(&mut cursor) - .unwrap(); + req.locale.write_to(&mut cursor).unwrap(); let blob_index = cursor .value() .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; From babaf98bf29e23b5681dfc7256e06b73c208db82 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:24:12 -0600 Subject: [PATCH 16/21] Delete the `.head_value()` function in favor of `.cursor().value()` --- experimental/zerotrie/src/zerotrie.rs | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/experimental/zerotrie/src/zerotrie.rs b/experimental/zerotrie/src/zerotrie.rs index 34921efd5ad..deedcb81443 100644 --- a/experimental/zerotrie/src/zerotrie.rs +++ b/experimental/zerotrie/src/zerotrie.rs @@ -204,24 +204,6 @@ macro_rules! impl_zerotrie_subtype { pub fn is_empty(&self) -> bool { self.store.as_ref().is_empty() } - /// Gets the value at the head of the trie. This is equivalent to - /// calling `get` with the empty string. - /// - /// # Examples - /// - /// ``` - #[doc = concat!("use zerotrie::", stringify!($name), ";")] - /// - /// // A trie with two values: "" and "abc" - #[doc = concat!("let trie: &", stringify!($name), "<[u8]> = ", stringify!($name), "::from_bytes(b\"\\x80abc\\x81\");")] - /// - /// assert_eq!(Some(0), trie.head_value()); - /// assert_eq!(Some(0), trie.get("")); - /// ``` - #[inline] - pub fn head_value(&self) -> Option { - peek_value(self.store.as_ref()) - } /// Returns the size of the trie in number of bytes. /// /// To get the number of keys in the trie, use `.iter().count()`: From d5e9e2e86fcd6f11a355e73caafcc0a992be1551 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 01:40:01 -0600 Subject: [PATCH 17/21] Change .value() to a mutating function and add .peek_value() --- experimental/zerotrie/src/cursor.rs | 38 ++++++++++++++++++++++++++--- experimental/zerotrie/src/reader.rs | 11 ++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs index 4f63fda2b1f..ea5323f054e 100644 --- a/experimental/zerotrie/src/cursor.rs +++ b/experimental/zerotrie/src/cursor.rs @@ -122,7 +122,33 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { step_bsearch_only(&mut self.trie.store, byte) } - /// Gets the value at the current position in the trie. + /// Takes the value at the current position and moves the cursor. + /// + /// Calling this function on a new cursor is equivalent to calling `.get()` + /// with the empty string. + /// + /// This is slightly more efficient than [`Self::peek_value()`] if you + /// check the value at each step. + /// + /// # Examples + /// + /// ``` + /// use zerotrie::ZeroTrieSimpleAscii; + /// + /// // A trie with two values: "" and "abc" + /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); + /// + /// assert_eq!(Some(0), trie.get("")); + /// let mut cursor = trie.cursor(); + /// assert_eq!(Some(0), cursor.value()); + /// assert_eq!(None, cursor.value()); + /// ``` + #[inline] + pub fn value(&mut self) -> Option { + take_value(&mut self.trie.store) + } + + /// Gets the value at the current position without moving the cursor. /// /// Calling this function on a new cursor is equivalent to calling `.get()` /// with the empty string. @@ -136,11 +162,14 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); /// /// assert_eq!(Some(0), trie.get("")); - /// assert_eq!(Some(0), trie.cursor().value()); + /// let cursor = trie.cursor(); + /// assert_eq!(Some(0), cursor.peek_value()); + /// assert_eq!(Some(0), cursor.peek_value()); /// ``` #[inline] - pub fn value(&self) -> Option { - peek_value(self.trie.store) + pub fn peek_value(&self) -> Option { + let mut temp = self.trie.store; + take_value(&mut temp) } /// Checks whether the cursor points to an empty trie. @@ -179,6 +208,7 @@ impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { } Ok(()) } + /// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns /// an error if the char is non-ASCII. /// diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index c27592f7816..d49bab5212b 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -566,12 +566,15 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { }; } -pub(crate) fn peek_value(mut trie: &[u8]) -> Option { - let b; - (b, trie) = trie.split_first()?; +pub(crate) fn take_value(trie: &mut &[u8]) -> Option { + let (b, new_trie) = trie.split_first()?; match byte_type(*b) { NodeType::Ascii | NodeType::Span | NodeType::Branch => None, - NodeType::Value => Some(read_varint_meta3(*b, trie).0), + NodeType::Value => { + let x; + (x, *trie) = read_varint_meta3(*b, new_trie); + Some(x) + } } } From fb05eb002e7f5c5cdcccba3de8cd0d2d2ae0bd76 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 13:24:56 -0600 Subject: [PATCH 18/21] Update provider/blob/src/blob_schema.rs Co-authored-by: Robert Bastian <4706271+robertbastian@users.noreply.github.com> --- provider/blob/src/blob_schema.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index cf99364e196..b1651204b9e 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -164,7 +164,7 @@ impl<'data> BlobSchemaV2<'data> { .get(key_index) .ok_or_else(|| DataError::custom("Invalid blob bytes").with_req(key, req))?; let mut cursor = ZeroTrieSimpleAscii::from_store(zerotrie).into_cursor(); - #[allow(clippy::unwrap_used)] // infallible impl + #[allow(clippy::unwrap_used)] // DataLocale::write_to produces ASCII only req.locale.write_to(&mut cursor).unwrap(); let blob_index = cursor .value() From 5ce1b16c0b1e654b00b5937465fa79216f2062e1 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 13:33:00 -0600 Subject: [PATCH 19/21] Remove peek_value and impl Clone instead --- experimental/zerotrie/src/cursor.rs | 34 ++++------------------------- 1 file changed, 4 insertions(+), 30 deletions(-) diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs index ea5323f054e..dbb91f9e3dd 100644 --- a/experimental/zerotrie/src/cursor.rs +++ b/experimental/zerotrie/src/cursor.rs @@ -81,7 +81,8 @@ impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { /// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. /// /// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. -#[derive(Debug)] +// Clone but not Copy: +#[derive(Debug, Clone)] pub struct ZeroTrieSimpleAsciiCursor<'a> { trie: ZeroTrieSimpleAscii<&'a [u8]>, } @@ -122,13 +123,10 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { step_bsearch_only(&mut self.trie.store, byte) } - /// Takes the value at the current position and moves the cursor. + /// Takes the value at the current position. /// /// Calling this function on a new cursor is equivalent to calling `.get()` - /// with the empty string. - /// - /// This is slightly more efficient than [`Self::peek_value()`] if you - /// check the value at each step. + /// with the empty string (except that it can only be called once). /// /// # Examples /// @@ -148,30 +146,6 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { take_value(&mut self.trie.store) } - /// Gets the value at the current position without moving the cursor. - /// - /// Calling this function on a new cursor is equivalent to calling `.get()` - /// with the empty string. - /// - /// # Examples - /// - /// ``` - /// use zerotrie::ZeroTrieSimpleAscii; - /// - /// // A trie with two values: "" and "abc" - /// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); - /// - /// assert_eq!(Some(0), trie.get("")); - /// let cursor = trie.cursor(); - /// assert_eq!(Some(0), cursor.peek_value()); - /// assert_eq!(Some(0), cursor.peek_value()); - /// ``` - #[inline] - pub fn peek_value(&self) -> Option { - let mut temp = self.trie.store; - take_value(&mut temp) - } - /// Checks whether the cursor points to an empty trie. /// /// Use this to determine when to stop iterating. From 7017868df8866f6a9c0f57326eb24d878673fed3 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Fri, 1 Dec 2023 16:38:08 -0600 Subject: [PATCH 20/21] value -> take_value --- experimental/zerotrie/src/cursor.rs | 24 ++++++++++++------------ provider/blob/src/blob_schema.rs | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/experimental/zerotrie/src/cursor.rs b/experimental/zerotrie/src/cursor.rs index dbb91f9e3dd..a7a88645415 100644 --- a/experimental/zerotrie/src/cursor.rs +++ b/experimental/zerotrie/src/cursor.rs @@ -31,7 +31,7 @@ where /// // Get out the value for "abc" /// let mut cursor = trie.cursor(); /// write!(&mut cursor, "abc"); - /// assert_eq!(cursor.value(), Some(0)); + /// assert_eq!(cursor.take_value(), Some(0)); /// ``` /// /// Find the longest prefix match: @@ -52,7 +52,7 @@ where /// if cursor.is_empty() { /// break; /// } - /// if cursor.value().is_some() { + /// if cursor.take_value().is_some() { /// longest_prefix = i; /// } /// cursor.step(*b); @@ -102,21 +102,21 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// /// // Search the trie for the string "abcdxy" /// let mut cursor = trie.cursor(); - /// assert_eq!(cursor.value(), None); // "" + /// assert_eq!(cursor.take_value(), None); // "" /// cursor.step(b'a'); - /// assert_eq!(cursor.value(), None); // "a" + /// assert_eq!(cursor.take_value(), None); // "a" /// cursor.step(b'b'); - /// assert_eq!(cursor.value(), None); // "ab" + /// assert_eq!(cursor.take_value(), None); // "ab" /// cursor.step(b'c'); - /// assert_eq!(cursor.value(), Some(0)); // "abc" + /// assert_eq!(cursor.take_value(), Some(0)); // "abc" /// cursor.step(b'd'); - /// assert_eq!(cursor.value(), None); // "abcd" + /// assert_eq!(cursor.take_value(), None); // "abcd" /// assert!(!cursor.is_empty()); /// cursor.step(b'x'); // no strings have the prefix "abcdx" /// assert!(cursor.is_empty()); - /// assert_eq!(cursor.value(), None); // "abcdx" + /// assert_eq!(cursor.take_value(), None); // "abcdx" /// cursor.step(b'y'); - /// assert_eq!(cursor.value(), None); // "abcdxy" + /// assert_eq!(cursor.take_value(), None); // "abcdxy" /// ``` #[inline] pub fn step(&mut self, byte: u8) { @@ -138,11 +138,11 @@ impl<'a> ZeroTrieSimpleAsciiCursor<'a> { /// /// assert_eq!(Some(0), trie.get("")); /// let mut cursor = trie.cursor(); - /// assert_eq!(Some(0), cursor.value()); - /// assert_eq!(None, cursor.value()); + /// assert_eq!(Some(0), cursor.take_value()); + /// assert_eq!(None, cursor.take_value()); /// ``` #[inline] - pub fn value(&mut self) -> Option { + pub fn take_value(&mut self) -> Option { take_value(&mut self.trie.store) } diff --git a/provider/blob/src/blob_schema.rs b/provider/blob/src/blob_schema.rs index b1651204b9e..37cd2b3bfa5 100644 --- a/provider/blob/src/blob_schema.rs +++ b/provider/blob/src/blob_schema.rs @@ -167,7 +167,7 @@ impl<'data> BlobSchemaV2<'data> { #[allow(clippy::unwrap_used)] // DataLocale::write_to produces ASCII only req.locale.write_to(&mut cursor).unwrap(); let blob_index = cursor - .value() + .take_value() .ok_or_else(|| DataErrorKind::MissingLocale.with_req(key, req))?; let buffer = self .buffers From 09b56c03b07db0f89bb7cff02bf9d413bf778a66 Mon Sep 17 00:00:00 2001 From: "Shane F. Carr" Date: Mon, 4 Dec 2023 13:59:13 -0800 Subject: [PATCH 21/21] Docs for internal functions --- experimental/zerotrie/src/reader.rs | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/experimental/zerotrie/src/reader.rs b/experimental/zerotrie/src/reader.rs index d49bab5212b..bc5df09808e 100644 --- a/experimental/zerotrie/src/reader.rs +++ b/experimental/zerotrie/src/reader.rs @@ -504,6 +504,11 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option { } } +/// Steps one node into the trie assuming all branch nodes are binary search and that +/// there are no span nodes. +/// +/// The input-output argument `trie` starts at the original trie and ends pointing to +/// the sub-trie reachable by `c`. pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { let (mut b, x, search); loop { @@ -566,6 +571,11 @@ pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { }; } +/// Steps one node into the trie if the head node is a value node, returning the value. +/// If the head node is not a value node, no change is made. +/// +/// The input-output argument `trie` starts at the original trie and ends pointing to +/// the sub-trie with the value node removed. pub(crate) fn take_value(trie: &mut &[u8]) -> Option { let (b, new_trie) = trie.split_first()?; match byte_type(*b) {