-
Notifications
You must be signed in to change notification settings - Fork 184
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ZeroTrie: Add cursor type for manual iteration and use it in BlobSchemaV2 #4383
Changes from 14 commits
47ca9c1
92a4207
512ebc6
ed19daa
3743661
46316a0
2d8d8f9
f272f0e
f9a614e
929f81f
79fc890
3a8efa1
838ad05
4e70618
8f754cc
bb4d1e7
babaf98
d5e9e2e
fb05eb0
5ce1b16
eeb3cbe
7017868
09b56c0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,236 @@ | ||
// This file is part of ICU4X. For terms of use, please see the file | ||
// called LICENSE at the top level of the ICU4X source tree | ||
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). | ||
|
||
use crate::reader::*; | ||
use crate::ZeroTrieSimpleAscii; | ||
|
||
use core::fmt; | ||
|
||
impl<Store> ZeroTrieSimpleAscii<Store> | ||
where | ||
Store: AsRef<[u8]> + ?Sized, | ||
{ | ||
/// Gets a cursor into the current trie. | ||
/// | ||
/// Useful to query a trie with data that is not a slice. | ||
/// | ||
/// This is currently supported only on `ZeroTrieSimpleAscii`. | ||
/// | ||
/// # Examples | ||
/// | ||
/// Get a value out of a trie by [writing](fmt::Write) it to the cursor: | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Get out the value for "abc" | ||
/// let mut cursor = trie.cursor(); | ||
/// write!(&mut cursor, "abc"); | ||
/// assert_eq!(cursor.value(), Some(0)); | ||
/// ``` | ||
/// | ||
/// Find the longest prefix match: | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Find the longest prefix of the string "abcdxy": | ||
/// let query = b"abcdxy"; | ||
/// let mut longest_prefix = 0; | ||
/// let mut cursor = trie.cursor(); | ||
/// for (i, b) in query.iter().enumerate() { | ||
/// // Checking is_empty() is not required, but it is | ||
/// // good for efficiency | ||
/// if cursor.is_empty() { | ||
/// break; | ||
/// } | ||
/// if cursor.value().is_some() { | ||
/// longest_prefix = i; | ||
/// } | ||
/// cursor.step(*b); | ||
/// } | ||
/// | ||
/// // The longest prefix is "abc" which is length 3: | ||
/// assert_eq!(longest_prefix, 3); | ||
/// ``` | ||
#[inline] | ||
pub fn cursor(&self) -> ZeroTrieSimpleAsciiCursor { | ||
ZeroTrieSimpleAsciiCursor { | ||
trie: self.as_borrowed_slice(), | ||
} | ||
} | ||
} | ||
|
||
impl<'a> ZeroTrieSimpleAscii<&'a [u8]> { | ||
/// Same as [`ZeroTrieSimpleAscii::cursor()`] but moves self to avoid | ||
/// having to doubly anchor the trie to the stack. | ||
#[inline] | ||
pub fn into_cursor(self) -> ZeroTrieSimpleAsciiCursor<'a> { | ||
ZeroTrieSimpleAsciiCursor { trie: self } | ||
} | ||
} | ||
|
||
/// A cursor into a [`ZeroTrieSimpleAscii`], useful for stepwise lookup. | ||
/// | ||
/// For examples, see [`ZeroTrieSimpleAscii::cursor()`]. | ||
#[derive(Debug)] | ||
pub struct ZeroTrieSimpleAsciiCursor<'a> { | ||
trie: ZeroTrieSimpleAscii<&'a [u8]>, | ||
} | ||
|
||
impl<'a> ZeroTrieSimpleAsciiCursor<'a> { | ||
/// Steps the cursor one byte into the trie. | ||
/// | ||
/// # Examples | ||
/// | ||
/// Unrolled loop checking for string presence at every step: | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// // Search the trie for the string "abcdxy" | ||
/// let mut cursor = trie.cursor(); | ||
/// assert_eq!(cursor.value(), None); // "" | ||
/// cursor.step(b'a'); | ||
/// assert_eq!(cursor.value(), None); // "a" | ||
/// cursor.step(b'b'); | ||
/// assert_eq!(cursor.value(), None); // "ab" | ||
/// cursor.step(b'c'); | ||
/// assert_eq!(cursor.value(), Some(0)); // "abc" | ||
/// cursor.step(b'd'); | ||
/// assert_eq!(cursor.value(), None); // "abcd" | ||
/// assert!(!cursor.is_empty()); | ||
/// cursor.step(b'x'); // no strings have the prefix "abcdx" | ||
/// assert!(cursor.is_empty()); | ||
/// assert_eq!(cursor.value(), None); // "abcdx" | ||
/// cursor.step(b'y'); | ||
/// assert_eq!(cursor.value(), None); // "abcdxy" | ||
/// ``` | ||
#[inline] | ||
pub fn step(&mut self, byte: u8) { | ||
step_bsearch_only(&mut self.trie.store, byte) | ||
} | ||
|
||
/// Takes the value at the current position and moves the cursor. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it's unclear to me where it moves the cursor, I don't see a default place where it could be moved There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I deleted There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ok, but why does this "take" and use a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Values are stored in trie nodes. When we read a value, we can step over that trie node so that we don't need to do that again next time we call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could make a moving There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That signature doesn't allow checking the presence of a value and then continuing, as we need to do in the longest-prefix example. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Actually my There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I changed the function name to |
||
/// | ||
/// Calling this function on a new cursor is equivalent to calling `.get()` | ||
/// with the empty string. | ||
/// | ||
/// This is slightly more efficient than [`Self::peek_value()`] if you | ||
/// check the value at each step. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "" and "abc" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); | ||
/// | ||
/// assert_eq!(Some(0), trie.get("")); | ||
/// let mut cursor = trie.cursor(); | ||
/// assert_eq!(Some(0), cursor.value()); | ||
/// assert_eq!(None, cursor.value()); | ||
/// ``` | ||
#[inline] | ||
pub fn value(&mut self) -> Option<usize> { | ||
take_value(&mut self.trie.store) | ||
} | ||
|
||
/// Gets the value at the current position without moving the cursor. | ||
/// | ||
/// Calling this function on a new cursor is equivalent to calling `.get()` | ||
/// with the empty string. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "" and "abc" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"\x80abc\x81"); | ||
/// | ||
/// assert_eq!(Some(0), trie.get("")); | ||
/// let cursor = trie.cursor(); | ||
/// assert_eq!(Some(0), cursor.peek_value()); | ||
/// assert_eq!(Some(0), cursor.peek_value()); | ||
/// ``` | ||
#[inline] | ||
pub fn peek_value(&self) -> Option<usize> { | ||
let mut temp = self.trie.store; | ||
take_value(&mut temp) | ||
} | ||
|
||
/// Checks whether the cursor points to an empty trie. | ||
/// | ||
/// Use this to determine when to stop iterating. | ||
#[inline] | ||
pub fn is_empty(&self) -> bool { | ||
self.trie.is_empty() | ||
} | ||
} | ||
|
||
impl<'a> fmt::Write for ZeroTrieSimpleAsciiCursor<'a> { | ||
/// Steps the cursor through each ASCII byte of the string. | ||
/// | ||
/// If the string contains non-ASCII chars, an error is returned. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// let mut cursor = trie.cursor(); | ||
/// cursor.write_str("abcdxy").expect("all ASCII"); | ||
/// cursor.write_str("🚂").expect_err("non-ASCII"); | ||
/// ``` | ||
fn write_str(&mut self, s: &str) -> fmt::Result { | ||
for b in s.bytes() { | ||
if !b.is_ascii() { | ||
return Err(fmt::Error); | ||
} | ||
self.step(b); | ||
} | ||
Ok(()) | ||
} | ||
|
||
/// Equivalent to [`ZeroTrieSimpleAsciiCursor::step()`], except returns | ||
/// an error if the char is non-ASCII. | ||
/// | ||
/// # Examples | ||
/// | ||
/// ``` | ||
/// use core::fmt::Write; | ||
/// use zerotrie::ZeroTrieSimpleAscii; | ||
/// | ||
/// // A trie with two values: "abc" and "abcdef" | ||
/// let trie = ZeroTrieSimpleAscii::from_bytes(b"abc\x80def\x81"); | ||
/// | ||
/// let mut cursor = trie.cursor(); | ||
/// cursor.write_char('a').expect("ASCII"); | ||
/// cursor.write_char('x').expect("ASCII"); | ||
/// cursor.write_char('🚂').expect_err("non-ASCII"); | ||
/// ``` | ||
fn write_char(&mut self, c: char) -> fmt::Result { | ||
if !c.is_ascii() { | ||
return Err(fmt::Error); | ||
} | ||
self.step(c as u8); | ||
Ok(()) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -504,6 +504,80 @@ pub fn get_phf_extended(mut trie: &[u8], mut ascii: &[u8]) -> Option<usize> { | |
} | ||
} | ||
|
||
pub(crate) fn step_bsearch_only(trie: &mut &[u8], c: u8) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: docs |
||
let (mut b, x, search); | ||
loop { | ||
(b, *trie) = match trie.split_first() { | ||
Some(v) => v, | ||
None => { | ||
// Empty trie or only a value node | ||
return; | ||
} | ||
}; | ||
match byte_type(*b) { | ||
NodeType::Ascii if *b == c => { | ||
// Matched a byte | ||
return; | ||
} | ||
NodeType::Ascii => { | ||
// Byte that doesn't match | ||
*trie = &[]; | ||
return; | ||
} | ||
NodeType::Branch => { | ||
// Proceed to the branch node logic below | ||
(x, *trie) = read_varint_meta2(*b, trie); | ||
break; | ||
} | ||
NodeType::Span => { | ||
// Question: Should we put the trie back into a valid state? | ||
// Currently this code is unreachable so let's not worry about it. | ||
debug_assert!(false, "span nodes not supported in stepping"); | ||
return; | ||
} | ||
NodeType::Value => { | ||
// Skip the value node and go to the next node | ||
(_, *trie) = read_varint_meta3(*b, trie); | ||
continue; | ||
} | ||
}; | ||
} | ||
// Branch node | ||
let (x, w) = if x >= 256 { (x & 0xff, x >> 8) } else { (x, 0) }; | ||
// See comment above regarding this assertion | ||
debug_assert!(w <= 3, "get: w > 3 but we assume w <= 3"); | ||
let w = w & 0x3; | ||
let x = if x == 0 { 256 } else { x }; | ||
// Always use binary search | ||
(search, *trie) = trie.debug_split_at(x); | ||
match search.binary_search(&c) { | ||
Ok(i) => { | ||
// Matched a byte | ||
*trie = if w == 0 { | ||
get_branch_w0(trie, i, x) | ||
} else { | ||
get_branch(trie, i, x, w) | ||
}; | ||
} | ||
Err(_) => { | ||
// Byte that doesn't match | ||
*trie = &[] | ||
} | ||
}; | ||
} | ||
|
||
pub(crate) fn take_value(trie: &mut &[u8]) -> Option<usize> { | ||
let (b, new_trie) = trie.split_first()?; | ||
match byte_type(*b) { | ||
NodeType::Ascii | NodeType::Span | NodeType::Branch => None, | ||
NodeType::Value => { | ||
let x; | ||
(x, *trie) = read_varint_meta3(*b, new_trie); | ||
Some(x) | ||
} | ||
} | ||
} | ||
|
||
#[cfg(feature = "alloc")] | ||
use alloc::vec::Vec; | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
how does this handle non-ASCII?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#4395