diff --git a/Cargo.lock b/Cargo.lock index 3a54db84478..1d9eb11344c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1055,6 +1055,7 @@ dependencies = [ name = "icu_codepointtrie" version = "0.2.0" dependencies = [ + "postcard", "serde", "thiserror", "toml", diff --git a/Cargo.toml b/Cargo.toml index d08142e90b2..bed46f213a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,7 +16,6 @@ members = [ "components/plurals", "components/uniset", "experimental/bies", - "experimental/codepointtrie", "experimental/segmenter", "experimental/segmenter_lstm", "ffi/diplomat", @@ -32,6 +31,7 @@ members = [ "tools/benchmark/memory", "tools/benchmark/binsize", "tools/datagen", + "utils/codepointtrie", "utils/fixed_decimal", "utils/litemap", "utils/pattern", diff --git a/experimental/codepointtrie/README.md b/experimental/codepointtrie/README.md deleted file mode 100644 index 93dde47f0e7..00000000000 --- a/experimental/codepointtrie/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# icu_codepointtrie [![crates.io](http://meritbadge.herokuapp.com/icu_codepointtrie)](https://crates.io/crates/icu_codepointtrie) - - - -## More Information - -For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/experimental/codepointtrie/src/codepointtrie.rs b/experimental/codepointtrie/src/codepointtrie.rs deleted file mode 100644 index 45f0063cf85..00000000000 --- a/experimental/codepointtrie/src/codepointtrie.rs +++ /dev/null @@ -1,340 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -// TODO: add module-level Rust-doc with examples - -use crate::error::Error; -use crate::impl_const::*; -use std::marker::PhantomData; -use zerovec::ZeroVec; - -// Enums - -/// The width of the elements in the data array of a [`CodePointTrie`]. 
-/// See UCPTrieValueWidth in ICU4C. -#[derive(Clone, Copy, PartialEq)] -pub enum ValueWidthEnum { - Bits16 = 0, - Bits32 = 1, - Bits8 = 2, -} - -/// The type of trie represents whether the trie has an optimization that -/// would make it small or fast. -/// See UCPTrieType in ICU4C. -#[derive(Clone, Copy, PartialEq)] -pub enum TrieTypeEnum { - Fast = 0, - Small = 1, -} - -// ValueWidth trait - -// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec" - -/// A trait representing the width of the values stored in the data array of a -/// [`CodePointTrie`]. This trait is used as a type parameter in constructing -/// a `CodePointTrie`. -pub trait ValueWidth: Copy + zerovec::ule::AsULE { - /// This enum variant represents the specific instance of `ValueWidth` such - /// that the enum discriminant values matches ICU4C's enum integer value. - const ENUM_VALUE: ValueWidthEnum; - /// This value is used to indicate an error in the Rust code in accessing - /// a position in the trie's `data` array. In normal cases, the position in - /// the `data` array will return either the correct value, or in case of a - /// logical error in the trie's computation, the trie's own error value - /// which is stored that in the `data` array. - const DATA_GET_ERROR_VALUE: Self; - fn cast_to_widest(self) -> u32; -} - -impl ValueWidth for u8 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits8; - const DATA_GET_ERROR_VALUE: u8 = u8::MAX; - - fn cast_to_widest(self) -> u32 { - self as u32 - } -} - -impl ValueWidth for u16 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits16; - const DATA_GET_ERROR_VALUE: u16 = u16::MAX; - - fn cast_to_widest(self) -> u32 { - self as u32 - } -} - -impl ValueWidth for u32 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits32; - const DATA_GET_ERROR_VALUE: u32 = u32::MAX; - - fn cast_to_widest(self) -> u32 { - self - } -} - -// TrieType trait - -/// A trait representing the "trie type" of a [`CodePointTrie`]. 
-/// -/// Currently, the options are "fast" and "small", which differ in the "fast max" -/// limit. -pub trait TrieType { - /// All code points up to the fast max limit are represented - /// individually in the `index` array to hold their `data` array position, and - /// thus only need 2 lookups for a [`crate::codepointtrie::CodePointTrie::get`]. - /// Code points above the "fast max" limit require 4 lookups. - const FAST_MAX: u32; - /// This enum variant represents the specific instance of `TrieType` such - /// that the enum discriminant values matches ICU4C's enum integer value. - const ENUM_VALUE: TrieTypeEnum; -} - -/// An empty struct to represent "fast" type code point tries for the -/// [`TrieType`] trait. The "fast max" limit is set to 0xffff. -pub struct Fast; - -impl TrieType for Fast { - const FAST_MAX: u32 = FAST_TYPE_FAST_INDEXING_MAX; - const ENUM_VALUE: TrieTypeEnum = TrieTypeEnum::Fast; -} - -/// An empty struct to represent "small" type code point tries for the -/// [`TrieType`] trait. The "fast max" limit is set to 0x0fff. -pub struct Small; - -impl TrieType for Small { - const FAST_MAX: u32 = SMALL_TYPE_FAST_INDEXING_MAX; - const ENUM_VALUE: TrieTypeEnum = TrieTypeEnum::Small; -} - -/// This struct represents a de-serialized CodePointTrie that was exported from -/// ICU binary data. -/// -/// For more information: -/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie) -/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup) -pub struct CodePointTrie<'trie, W: ValueWidth, T: TrieType> { - header: CodePointTrieHeader, - index: ZeroVec<'trie, u16>, - data: ZeroVec<'trie, W>, - _marker_ty: PhantomData, -} - -/// This struct contains the fixed-length header fields of a [`CodePointTrie`]. 
-pub struct CodePointTrieHeader { - /// Length of the trie's `index` array - pub index_length: u32, - /// Length of the trie's `data` array - pub data_length: u32, - /// The code point of the start of the last range of the trie, where a - /// range is defined as a partition of the code point space such that the - /// value in this trie associated with all code points of the same range is - /// the same. For the property value data for many Unicode properties, - /// often times, `high_start` is U+10000 or lower. In such cases, not - /// reserving space in the `index` array for duplicate values is a large - /// savings. The "highValue" associated with the `high_start` range is - /// stored at the second-to-last position of the `data` array. - /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.) - pub high_start: u32, - /// A version of the `high_start` value that is right-shifted 12 spaces, - /// but is rounded up to a multiple 0x1000 for easy testing from UTF-8 - /// lead bytes. - pub shifted12_high_start: u16, - /// Offset for the null block in the "index-3" table of the `index` array. - /// Set to an impossibly high value (e.g., 0xffff) if there is no - /// dedicated index-3 null block. - pub index3_null_offset: u16, - /// Internal data null block offset, not shifted. - /// Set to an impossibly high value (e.g., 0xfffff) if there is no - /// dedicated data null block. - pub data_null_offset: u32, - /// The value stored in the trie that represents a null value being - /// associated to a code point. - pub null_value: u32, -} - -impl<'trie, W: ValueWidth, T: TrieType> CodePointTrie<'trie, W, T> { - /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index` - /// array and `data` array, whose data values have width `W`, for a trie - /// type `T`. 
- pub fn try_new( - header: CodePointTrieHeader, - index: ZeroVec<'trie, u16>, - data: ZeroVec<'trie, W>, - ) -> Result, Error> { - if header.data_length < ERROR_VALUE_NEG_DATA_OFFSET { - return Err(Error::FromDeserialized { - reason: "Data array must be large enough to contain error value", - }); - } - - if header.data_length < HIGH_VALUE_NEG_DATA_OFFSET { - return Err(Error::FromDeserialized { - reason: - "Data array must be large enough to contain value for range highStart..U+10FFFF", - }); - } - - if index.len() as u32 != header.index_length { - return Err(Error::FromDeserialized { - reason: "Length of index array does not match corresponding header value", - }); - } - - if data.len() as u32 != header.data_length { - return Err(Error::FromDeserialized { - reason: "Length of data array does not match corresponding header value", - }); - } - - let trie: CodePointTrie<'trie, W, T> = CodePointTrie { - header, - index, - data, - _marker_ty: PhantomData, - }; - Ok(trie) - } - - /// Returns the position in the data array containing the trie's stored - /// error value. 
- fn trie_error_val_index(&self) -> u32 { - self.header.data_length - ERROR_VALUE_NEG_DATA_OFFSET - } - - fn internal_small_index(&self, code_point: u32) -> u32 { - let mut index1_pos: u32 = code_point >> SHIFT_1; - if T::ENUM_VALUE == TrieTypeEnum::Fast { - debug_assert!( - FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start - ); - index1_pos = index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; - } else { - assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT); - index1_pos += SMALL_INDEX_LENGTH; - } - let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) { - index1_val - } else { - return self.trie_error_val_index(); - }; - let index3_block_idx: u32 = (index1_val as u32) + ((code_point >> SHIFT_2) & INDEX_2_MASK); - let mut index3_block: u32 = - if let Some(index3_block) = self.index.get(index3_block_idx as usize) { - index3_block as u32 - } else { - return self.trie_error_val_index(); - }; - let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK; - let mut data_block: u32; - if index3_block & 0x8000 == 0 { - // 16-bit indexes - data_block = - if let Some(data_block) = self.index.get((index3_block + index3_pos) as usize) { - data_block as u32 - } else { - return self.trie_error_val_index(); - }; - } else { - // 18-bit indexes stored in groups of 9 entries per 8 indexes. 
- index3_block = (index3_block & 0x7fff) + (index3_pos & !7) + (index3_pos >> 3); - index3_pos &= 7; - data_block = if let Some(data_block) = self.index.get(index3_block as usize) { - data_block as u32 - } else { - return self.trie_error_val_index(); - }; - data_block = ((data_block << (2 + (2 * index3_pos))) as u32 & 0x30000) as u32; - index3_block += 1; - data_block = - if let Some(index3_val) = self.index.get((index3_block + index3_pos) as usize) { - data_block | (index3_val as u32) - } else { - return self.trie_error_val_index(); - }; - } - // Returns data_pos == data_block (offset) + - // portion of code_point bit field for last (4th) lookup - data_block + (code_point & SMALL_DATA_MASK) - } - - /// Returns the position in the `data` array for the given code point, - /// where this code point is at or above the fast limit associated for the - /// trie type, `T`. - /// - /// A lookup of the value in the code point trie for a code point in the - /// code point space range [`T::FAST_MAX`, `high_start`) will be a 4-step - /// lookup: 3 lookups in the `index` array and one lookup in the `data` - /// array. Lookups for code points in the range [`high_start`, - /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see - /// [CodePointTrieHeader::high_start]. - fn small_index(&self, code_point: u32) -> u32 { - if code_point >= self.header.high_start { - self.header.data_length - HIGH_VALUE_NEG_DATA_OFFSET - } else { - self.internal_small_index(code_point) // helper fn - } - } - - /// Returns the position in the `data` array for the given code point, - /// where this code point is below the fast limit associated for the - /// trie type, `T`. - /// - /// A lookup of the value in the code point trie for a code point in the - /// code point space range [0, `T::FAST_MAX`) will be a 2-step lookup: 1 - /// lookup in the `index` array and one lookup in the `data` array. 
By - /// design, for trie type `T`, there is an element allocated in the `index` - /// array for each block of code points in [0, `T::FAST_MAX`), which in - /// turn guarantees that those code points are represented and only need 1 - /// lookup. - fn fast_index(&self, code_point: u32) -> u32 { - let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT; - let index_array_val: u16 = - if let Some(index_array_val) = self.index.get(index_array_pos as usize) { - index_array_val - } else { - return self.trie_error_val_index(); - }; - let fast_index_val: u32 = index_array_val as u32 + (code_point & FAST_TYPE_DATA_MASK); - fast_index_val - } - - /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]. - /// - /// # Examples - /// - /// ``` - /// use icu_codepointtrie::planes; - /// let trie = planes::get_planes_trie(); - /// assert_eq!(0, trie.get(0x41)); // 'A' as u32 - /// assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32 - /// assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32 - /// ``` - pub fn get(&self, code_point: u32) -> W { - let data_pos: u32 = if code_point <= T::FAST_MAX { - Self::fast_index(self, code_point) - } else if code_point <= CODE_POINT_MAX { - Self::small_index(self, code_point) - } else { - self.trie_error_val_index() - }; - // Returns the trie value (or trie's error value). - // If we cannot read from the data array, then return the associated constant - // DATA_GET_ERROR_VALUE for the instance type for W: ValueWidth. - self.data - .get(data_pos as usize) - .unwrap_or(W::DATA_GET_ERROR_VALUE) - } - - /// Returns the value that is associated with `code_point` for this [`CodePointTrie`] - /// as a `u32`. - // Note: This API method maintains consistency with the corresponding - // original ICU APIs. 
- pub fn get_u32(&self, code_point: u32) -> u32 { - self.get(code_point).cast_to_widest() - } -} diff --git a/experimental/codepointtrie/src/lib.rs b/experimental/codepointtrie/src/lib.rs deleted file mode 100644 index db6f1c51016..00000000000 --- a/experimental/codepointtrie/src/lib.rs +++ /dev/null @@ -1,8 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -pub mod codepointtrie; -pub mod error; -mod impl_const; -pub mod planes; diff --git a/experimental/codepointtrie/tests/invalid_trie_test.rs b/experimental/codepointtrie/tests/invalid_trie_test.rs deleted file mode 100644 index 03030cc3644..00000000000 --- a/experimental/codepointtrie/tests/invalid_trie_test.rs +++ /dev/null @@ -1,94 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use icu_codepointtrie::codepointtrie::{CodePointTrie, CodePointTrieHeader, Fast}; -use icu_codepointtrie::error::Error; -use zerovec::ZeroVec; - -const INDEX: [u16; 1024] = [ - 0, 0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0x80, - 0xc0, 0xc0, 0xc0, 0xc0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, -]; - -const DATA_8: [u8; 260] = [ - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 1, 1, 1, 0xad, -]; - -#[test] -fn try_new_error_test() { - let index_length: u32 = 4321; - let data_length: u32 = 260; - let high_start: u32 = 0xa00; - let shifted12_high_start: u16 = 0x1; - let index3_null_offset: u16 = 0x7fff; - let data_null_offset: u32 = 0x0; - let null_value: u32 = 0x1; - - let header = CodePointTrieHeader { - index_length, - data_length, - high_start, - shifted12_high_start, - index3_null_offset, - data_null_offset, - null_value, - }; - - let index = ZeroVec::from_slice(&INDEX); - let data = ZeroVec::from_slice(&DATA_8); - let trie_new_result: Result, Error> = - CodePointTrie::try_new(header, index, data); - match trie_new_result { - Ok(_) => { - panic!("This test expects Result::Err from trie constructor using invalid deserialized values") - } - Err(e) => { - assert_eq!( - e, - Error::FromDeserialized { - reason: "Length of index array does not match corresponding header value" - } - ); - } - } -} diff --git a/experimental/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml similarity index 69% rename from experimental/codepointtrie/Cargo.toml rename to utils/codepointtrie/Cargo.toml index 45b60b0aecb..6a3cb19b8f1 100644 --- a/experimental/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -23,21 +23,25 @@ include = [ "README.md" ] -[lib] -bench = false # This option is required for Benchmark CI -path = "src/lib.rs" +[package.metadata.cargo-all-features] +skip_optional_dependencies = true +# Bench feature gets tested separately and is only relevant for CI +denylist = ["bench"] + +[package.metadata.docs.rs] +all-features = true [dependencies] -serde = { version = "1.0", features = ["derive"], optional = true } +serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } thiserror = "1.0" -zerovec = { version = "0.3", path = 
"../../utils/zerovec" } - -[features] -default = ["provider_serde"] -bench = [] -provider_serde = ["serde", "zerovec/serde"] +zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } [dev-dependencies] +postcard = { version = "0.7", features = ["alloc"] } toml = "0.5" serde = { version = "1.0", features = ["derive"] } zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } + +[lib] +bench = false # This option is required for Benchmark CI +path = "src/lib.rs" diff --git a/experimental/codepointtrie/LICENSE b/utils/codepointtrie/LICENSE similarity index 100% rename from experimental/codepointtrie/LICENSE rename to utils/codepointtrie/LICENSE diff --git a/utils/codepointtrie/README.md b/utils/codepointtrie/README.md new file mode 100644 index 00000000000..c5665ec2b8a --- /dev/null +++ b/utils/codepointtrie/README.md @@ -0,0 +1,34 @@ +# icu_codepointtrie [![crates.io](http://meritbadge.herokuapp.com/icu_codepointtrie)](https://crates.io/crates/icu_codepointtrie) + +`icu_codepointtrie` is a utility crate of the [`ICU4X`] project. + +This component provides a data structure for a time-efficient lookup of values +associated to code points. + +It is an implementation of the existing [ICU4C UCPTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ucptrie_8h.html) +/ [ICU4J CodePointTrie](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/) API. + +## Architecture + +ICU4X [`CodePointTrie`](crate::codepointtrie::CodePointTrie) is designed to provide a read-only view of CodePointTrie data that is exported +from ICU4C. Detailed information about the design of the data structure can be found in the documentation +for the [`CodePointTrie`](crate::codepointtrie::CodePointTrie) struct. 
+ +## Examples + +### Querying a `CodePointTrie` + +```rust +use icu_codepointtrie::planes; +let trie = planes::get_planes_trie(); + +assert_eq!(0, trie.get(0x41)); // 'A' as u32 +assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32 +assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32 +``` + +[`ICU4X`]: ../icu/index.html + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs new file mode 100644 index 00000000000..8c405bfe1c3 --- /dev/null +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -0,0 +1,481 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::error::Error; +use crate::impl_const::*; + +use core::convert::TryFrom; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; +use zerovec::ZeroVec; + +// Enums + +/// The width of the elements in the data array of a [`CodePointTrie`]. +/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C. +#[derive(Clone, Copy, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum ValueWidthEnum { + Bits16 = 0, + Bits32 = 1, + Bits8 = 2, +} + +/// The type of trie represents whether the trie has an optimization that +/// would make it small or fast. +/// See [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C. +#[derive(Clone, Copy, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum TrieTypeEnum { + /// Represents the "fast" type code point tries for the + /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`. 
+ Fast = 0, + /// Represents the "small" type code point tries for the + /// [`TrieType`] trait. The "fast max" limit is set to `0x0fff`. + Small = 1, +} + +// ValueWidth trait + +// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec" + +/// A trait representing the width of the values stored in the data array of a +/// [`CodePointTrie`]. This trait is used as a type parameter in constructing +/// a `CodePointTrie`. +pub trait ValueWidth: Copy + zerovec::ule::AsULE + 'static { + /// This enum variant represents the specific instance of `ValueWidth` such + /// that the enum discriminant value matches ICU4C's enum integer value. + const ENUM_VALUE: ValueWidthEnum; + /// This value is used to indicate an error in the Rust code in accessing + /// a position in the trie's `data` array. In normal cases, the position in + /// the `data` array will return either the correct value, or in case of a + /// logical error in the trie's computation, the trie's own error value + /// which is stored in the `data` array. + const DATA_GET_ERROR_VALUE: Self; + fn cast_to_widest(self) -> u32; +} + +impl ValueWidth for u8 { + const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits8; + const DATA_GET_ERROR_VALUE: u8 = u8::MAX; + + fn cast_to_widest(self) -> u32 { + self as u32 + } +} + +impl ValueWidth for u16 { + const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits16; + const DATA_GET_ERROR_VALUE: u16 = u16::MAX; + + fn cast_to_widest(self) -> u32 { + self as u32 + } +} + +impl ValueWidth for u32 { + const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits32; + const DATA_GET_ERROR_VALUE: u32 = u32::MAX; + + fn cast_to_widest(self) -> u32 { + self + } +} + +/// This struct represents a de-serialized CodePointTrie that was exported from +/// ICU binary data. 
+/// +/// For more information: +/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie) +/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup) +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct CodePointTrie<'trie, W: ValueWidth> { + header: CodePointTrieHeader, + #[cfg_attr(feature = "serde", serde(borrow))] + index: ZeroVec<'trie, u16>, + #[cfg_attr(feature = "serde", serde(borrow))] + data: ZeroVec<'trie, W>, +} + +/// This struct contains the fixed-length header fields of a [`CodePointTrie`]. +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub struct CodePointTrieHeader { + /// The code point of the start of the last range of the trie. A + /// range is defined as a partition of the code point space such that the + /// value in this trie associated with all code points of the same range is + /// the same. + /// + /// For the property value data for many Unicode properties, + /// often times, `high_start` is `U+10000` or lower. In such cases, not + /// reserving space in the `index` array for duplicate values is a large + /// savings. The "highValue" associated with the `high_start` range is + /// stored at the second-to-last position of the `data` array. + /// (See `impl_const::HIGH_VALUE_NEG_DATA_OFFSET`.) + pub high_start: u32, + /// A version of the `high_start` value that is right-shifted 12 spaces, + /// but is rounded up to a multiple of `0x1000` for easy testing from UTF-8 + /// lead bytes. + pub shifted12_high_start: u16, + /// Offset for the null block in the "index-3" table of the `index` array. + /// Set to an impossibly high value (e.g., `0xffff`) if there is no + /// dedicated index-3 null block. + pub index3_null_offset: u16, + /// Internal data null block offset, not shifted. + /// Set to an impossibly high value (e.g., `0xfffff`) if there is no + /// dedicated data null block. 
+ pub data_null_offset: u32, + /// The value stored in the trie that represents a null value being + /// associated to a code point. + pub null_value: u32, + /// The enum value representing the type of trie, where trie type is as it + /// is defined in ICU (ex: Fast, Small). + pub trie_type: TrieTypeEnum, +} + +impl TryFrom<u8> for TrieTypeEnum { + type Error = crate::error::Error; + + fn try_from(trie_type_int: u8) -> Result<TrieTypeEnum, crate::error::Error> { + match trie_type_int { + 0 => Ok(TrieTypeEnum::Fast), + 1 => Ok(TrieTypeEnum::Small), + _ => Err(crate::error::Error::FromDeserialized { + reason: "Cannot parse value for trie_type", + }), + } + } +} + +impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { + /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index` + /// array and `data` array, whose data values have width `W`. + pub fn try_new( + header: CodePointTrieHeader, + index: ZeroVec<'trie, u16>, + data: ZeroVec<'trie, W>, + ) -> Result<CodePointTrie<'trie, W>, Error> { + // Validation invariants are not needed here when constructing a new + // `CodePointTrie` because: + // + // - Rust includes the size of a slice (or Vec or similar), which allows it + // to prevent lookups at out-of-bounds indices, whereas in C++, it is the + // programmer's responsibility to keep track of length info. + // - For lookups into collections, Rust guarantees that a fallback value will + // be returned in the case of `.get()` encountering a lookup error, via + // the `Option` type. + // - The `ZeroVec` serializer stores the length of the array along with the + // ZeroVec data, meaning that a deserializer would also see that length info. + + let trie: CodePointTrie<'trie, W> = CodePointTrie { + header, + index, + data, + }; + Ok(trie) + } + + /// Returns the position in the data array containing the trie's stored + /// error value. 
+ fn trie_error_val_index(&self) -> u32 { + self.data.len() as u32 - ERROR_VALUE_NEG_DATA_OFFSET + } + + fn internal_small_index(&self, code_point: u32) -> u32 { + let mut index1_pos: u32 = code_point >> SHIFT_1; + if self.header.trie_type == TrieTypeEnum::Fast { + debug_assert!( + FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start + ); + index1_pos = index1_pos + BMP_INDEX_LENGTH - OMITTED_BMP_INDEX_1_LENGTH; + } else { + assert!(code_point < self.header.high_start && self.header.high_start > SMALL_LIMIT); + index1_pos += SMALL_INDEX_LENGTH; + } + let index1_val = if let Some(index1_val) = self.index.get(index1_pos as usize) { + index1_val + } else { + return self.trie_error_val_index(); + }; + let index3_block_idx: u32 = (index1_val as u32) + ((code_point >> SHIFT_2) & INDEX_2_MASK); + let mut index3_block: u32 = + if let Some(index3_block) = self.index.get(index3_block_idx as usize) { + index3_block as u32 + } else { + return self.trie_error_val_index(); + }; + let mut index3_pos: u32 = (code_point >> SHIFT_3) & INDEX_3_MASK; + let mut data_block: u32; + if index3_block & 0x8000 == 0 { + // 16-bit indexes + data_block = + if let Some(data_block) = self.index.get((index3_block + index3_pos) as usize) { + data_block as u32 + } else { + return self.trie_error_val_index(); + }; + } else { + // 18-bit indexes stored in groups of 9 entries per 8 indexes. 
+ index3_block = (index3_block & 0x7fff) + (index3_pos & !7) + (index3_pos >> 3); + index3_pos &= 7; + data_block = if let Some(data_block) = self.index.get(index3_block as usize) { + data_block as u32 + } else { + return self.trie_error_val_index(); + }; + data_block = ((data_block << (2 + (2 * index3_pos))) as u32 & 0x30000) as u32; + index3_block += 1; + data_block = + if let Some(index3_val) = self.index.get((index3_block + index3_pos) as usize) { + data_block | (index3_val as u32) + } else { + return self.trie_error_val_index(); + }; + } + // Returns data_pos == data_block (offset) + + // portion of code_point bit field for last (4th) lookup + data_block + (code_point & SMALL_DATA_MASK) + } + + /// Returns the position in the `data` array for the given code point, + /// where this code point is at or above the fast limit associated for the + /// `trie_type`. We will refer to that limit as "`fastMax`" here. + /// + /// A lookup of the value in the code point trie for a code point in the + /// code point space range [`fastMax`, `high_start`) will be a 4-step + /// lookup: 3 lookups in the `index` array and one lookup in the `data` + /// array. Lookups for code points in the range [`high_start`, + /// `CODE_POINT_MAX`] are short-circuited to be a single lookup, see + /// [CodePointTrieHeader::high_start]. + fn small_index(&self, code_point: u32) -> u32 { + if code_point >= self.header.high_start { + self.data.len() as u32 - HIGH_VALUE_NEG_DATA_OFFSET + } else { + self.internal_small_index(code_point) // helper fn + } + } + + /// Returns the position in the `data` array for the given code point, + /// where this code point is below the fast limit associated for the + /// `trie type`. We will refer to that limit as "`fastMax`" here. + /// + /// A lookup of the value in the code point trie for a code point in the + /// code point space range [0, `fastMax`) will be a 2-step lookup: 1 + /// lookup in the `index` array and one lookup in the `data` array. 
By + /// design, for trie type `T`, there is an element allocated in the `index` + /// array for each block of code points in [0, `fastMax`), which in + /// turn guarantees that those code points are represented and only need 1 + /// lookup. + fn fast_index(&self, code_point: u32) -> u32 { + let index_array_pos: u32 = code_point >> FAST_TYPE_SHIFT; + let index_array_val: u16 = + if let Some(index_array_val) = self.index.get(index_array_pos as usize) { + index_array_val + } else { + return self.trie_error_val_index(); + }; + let fast_index_val: u32 = index_array_val as u32 + (code_point & FAST_TYPE_DATA_MASK); + fast_index_val + } + + /// Returns the value that is associated with `code_point` in this [`CodePointTrie`]. + /// + /// # Examples + /// + /// ``` + /// use icu_codepointtrie::planes; + /// let trie = planes::get_planes_trie(); + /// + /// assert_eq!(0, trie.get(0x41)); // 'A' as u32 + /// assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32 + /// assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32 + /// ``` + pub fn get(&self, code_point: u32) -> W { + // All code points up to the fast max limit are represented + // individually in the `index` array to hold their `data` array position, and + // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`). + // Code points above the "fast max" limit require 4 lookups. + let fast_max = match self.header.trie_type { + TrieTypeEnum::Fast => FAST_TYPE_FAST_INDEXING_MAX, + TrieTypeEnum::Small => SMALL_TYPE_FAST_INDEXING_MAX, + }; + let data_pos: u32 = if code_point <= fast_max { + Self::fast_index(self, code_point) + } else if code_point <= CODE_POINT_MAX { + Self::small_index(self, code_point) + } else { + self.trie_error_val_index() + }; + // Returns the trie value (or trie's error value). + // If we cannot read from the data array, then return the associated constant + // DATA_GET_ERROR_VALUE for the instance type for W: ValueWidth. 
+ self.data + .get(data_pos as usize) + .unwrap_or(W::DATA_GET_ERROR_VALUE) + } + + /// Returns the value that is associated with `code_point` for this [`CodePointTrie`] + /// as a `u32`. + /// + /// # Examples + /// + /// ``` + /// use icu_codepointtrie::planes; + /// let trie = planes::get_planes_trie(); + /// + /// let cp = '𑖎' as u32; + /// assert_eq!(cp, 0x1158E); + /// let trie = planes::get_planes_trie(); + /// let plane_num: u8 = trie.get(cp); + /// assert_eq!(trie.get_u32(cp), plane_num as u32); + /// ``` + /// + // Note: This API method maintains consistency with the corresponding + // original ICU APIs. + pub fn get_u32(&self, code_point: u32) -> u32 { + self.get(code_point).cast_to_widest() + } +} + +#[cfg(test)] +mod tests { + #[cfg(feature = "serde")] + use super::CodePointTrie; + #[cfg(feature = "serde")] + use zerovec::ZeroVec; + + #[test] + #[cfg(feature = "serde")] + fn test_serde_with_postcard_roundtrip() -> Result<(), postcard::Error> { + let trie = crate::planes::get_planes_trie(); + let trie_serialized: Vec = postcard::to_allocvec(&trie).unwrap(); + + // Assert an expected (golden data) version of the serialized trie. 
+ const EXP_TRIE_SERIALIZED: &[u8] = &[ + 0, 0, 16, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 160, 18, 0, 0, 64, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 136, 2, 144, 2, 144, 2, 144, 2, 176, 2, 176, 2, 176, 2, 176, 2, 208, 2, + 208, 2, 208, 2, 208, 2, 240, 2, 240, 2, 240, 2, 240, 2, 16, 3, 16, 3, 16, 3, 16, 3, 48, + 3, 48, 3, 48, 3, 48, 3, 80, 3, 80, 3, 80, 3, 80, 3, 112, 3, 112, 3, 112, 3, 112, 3, + 144, 3, 144, 3, 144, 3, 144, 3, 176, 3, 176, 3, 176, 3, 176, 3, 208, 3, 208, 3, 208, 3, + 208, 3, 240, 3, 240, 3, 240, 3, 240, 3, 16, 4, 16, 4, 16, 4, 16, 4, 48, 4, 48, 4, 48, + 4, 48, 4, 80, 4, 80, 4, 80, 4, 80, 4, 112, 4, 112, 4, 112, 4, 112, 4, 0, 0, 16, 0, 32, + 0, 48, 0, 64, 0, 80, 0, 96, 0, 112, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, + 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, + 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 0, 0, 16, 0, 32, 0, 48, 0, 128, + 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, + 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, + 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 144, 0, 144, + 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, + 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, + 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 144, 0, 160, 0, 160, 0, 160, + 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, + 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 
+ 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 160, 0, 176, 0, 176, 0, 176, 0, 176, + 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, + 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, + 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 176, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, + 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, + 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, + 0, 192, 0, 192, 0, 192, 0, 192, 0, 192, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, + 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, + 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, 0, 208, + 0, 208, 0, 208, 0, 208, 0, 208, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, + 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, + 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, 0, 224, + 0, 224, 0, 224, 0, 224, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, + 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, + 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, 0, 240, + 0, 240, 0, 240, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, + 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, + 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, + 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, 16, 1, + 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, + 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, 32, 1, + 32, 1, 32, 1, 32, 1, 
32, 1, 32, 1, 32, 1, 32, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, + 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, + 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, 1, 48, + 1, 48, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, + 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, + 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 64, 1, 80, 1, 80, 1, 80, 1, 80, 1, + 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, + 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, 80, 1, + 80, 1, 80, 1, 80, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, + 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, + 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 96, 1, 128, 0, 136, 0, + 136, 0, 136, 0, 136, 0, 136, 0, 136, 0, 136, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, + 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, + 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 168, 0, 168, 0, 168, + 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, + 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, + 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 168, 0, 200, 0, 200, 0, 200, 0, 200, + 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, + 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, + 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 200, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, + 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, + 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, + 0, 232, 0, 232, 0, 232, 0, 232, 0, 232, 0, 8, 1, 
8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, + 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, + 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 8, 1, 40, 1, 40, 1, 40, 1, 40, 1, + 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, + 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, 40, 1, + 40, 1, 40, 1, 40, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, + 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, + 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 72, 1, 104, 1, 104, 1, + 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, + 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, + 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 104, 1, 136, 1, 136, 1, 136, 1, + 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, + 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, + 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 136, 1, 168, 1, 168, 1, 168, 1, 168, 1, + 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, + 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, + 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 168, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, + 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, + 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, + 200, 1, 200, 1, 200, 1, 200, 1, 200, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, + 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, + 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, 232, 1, + 232, 1, 232, 1, 232, 1, 232, 1, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 
8, 2, 8, 2, + 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, + 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 8, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, + 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, + 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, 40, 2, + 40, 2, 40, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, + 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, + 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 72, 2, 104, 2, 104, 2, 104, 2, + 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, + 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, + 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 104, 2, 244, 2, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 14, 14, 14, 14, 14, 
14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 16, 16, 16, 0, + ]; + assert_eq!(trie_serialized, EXP_TRIE_SERIALIZED); + + let trie_deserialized = postcard::from_bytes::>(&trie_serialized)?; + + assert_eq!(&trie.index, &trie_deserialized.index); + assert_eq!(&trie.data, &trie_deserialized.data); + + assert!(matches!(trie_deserialized.index, ZeroVec::Borrowed(_))); + assert!(matches!(trie_deserialized.data, ZeroVec::Borrowed(_))); + + Ok(()) + } +} diff --git a/experimental/codepointtrie/src/error.rs b/utils/codepointtrie/src/error.rs similarity index 100% rename from experimental/codepointtrie/src/error.rs rename to utils/codepointtrie/src/error.rs diff --git a/experimental/codepointtrie/src/impl_const.rs b/utils/codepointtrie/src/impl_const.rs similarity index 96% rename from experimental/codepointtrie/src/impl_const.rs rename to utils/codepointtrie/src/impl_const.rs index 98122c897ff..f3cd9c96a1e 100644 --- a/experimental/codepointtrie/src/impl_const.rs +++ b/utils/codepointtrie/src/impl_const.rs @@ -10,10 +10,10 @@ pub const FAST_TYPE_DATA_BLOCK_LENGTH: u32 = 1 << FAST_TYPE_SHIFT; /// Mask for getting the lower bits for the in-fast-data-block offset. pub const FAST_TYPE_DATA_MASK: u32 = FAST_TYPE_DATA_BLOCK_LENGTH - 1; -// Fast indexing limit for "fast"-type trie +/// Fast indexing limit for "fast"-type trie pub const FAST_TYPE_FAST_INDEXING_MAX: u32 = 0xffff; -// Fast indexing limit for "small"-type trie +/// Fast indexing limit for "small"-type trie pub const SMALL_TYPE_FAST_INDEXING_MAX: u32 = 0xfff; /// Offset from dataLength (to be subtracted) for fetching the diff --git a/utils/codepointtrie/src/lib.rs b/utils/codepointtrie/src/lib.rs new file mode 100644 index 00000000000..18c104ff904 --- /dev/null +++ b/utils/codepointtrie/src/lib.rs @@ -0,0 +1,41 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! `icu_codepointtrie` is a utility crate of the [`ICU4X`] project. +//! +//! This component provides a data structure for a time-efficient lookup of values +//! associated with code points. +//! +//! It is an implementation of the existing [ICU4C UCPTrie](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/ucptrie_8h.html) +//! / [ICU4J CodePointTrie](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4j/) API. +//! +//! # Architecture +//! +//! ICU4X [`CodePointTrie`](crate::codepointtrie::CodePointTrie) is designed to provide a read-only view of CodePointTrie data that is exported +//! from ICU4C. Detailed information about the design of the data structure can be found in the documentation +//! for the [`CodePointTrie`](crate::codepointtrie::CodePointTrie) struct. +//! +//! # Examples +//! +//! ## Querying a `CodePointTrie` +//! +//! ``` +//! use icu_codepointtrie::planes; +//! let trie = planes::get_planes_trie(); +//! +//! assert_eq!(0, trie.get(0x41)); // 'A' as u32 +//! assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32 +//! assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32 +//! ``` +//! +//! [`ICU4X`]: ../icu/index.html + +// Workaround for https://github.com/rust-lang/rust/issues/87932 +#[cfg(feature = "serde")] +extern crate serde; + +pub mod codepointtrie; +pub mod error; +mod impl_const; +pub mod planes; diff --git a/experimental/codepointtrie/src/planes.rs b/utils/codepointtrie/src/planes.rs similarity index 98% rename from experimental/codepointtrie/src/planes.rs rename to utils/codepointtrie/src/planes.rs index 063727c8474..06e8e816d3a 100644 --- a/experimental/codepointtrie/src/planes.rs +++ b/utils/codepointtrie/src/planes.rs @@ -149,8 +149,8 @@ const INDEX_ARRAY_AS_BYTES: &[u8] = &[ /// integer from 0-16 inclusive, for each code point.
This `CodePointTrie` /// does not actually represent any Unicode property, but it is provided in /// case it is useful to users of `CodePointTrie` for testing or other -/// purposes. See https://www.unicode.org/glossary/#plane -pub fn get_planes_trie() -> CodePointTrie<'static, u8, Small> { +/// purposes. See <https://www.unicode.org/glossary/#plane>. +pub fn get_planes_trie() -> CodePointTrie<'static, u8> { let index_array_as_bytes: &[u8] = INDEX_ARRAY_AS_BYTES; let data_8_array: &[u8] = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -171,25 +171,23 @@ pub fn get_planes_trie() -> CodePointTrie<'static, u8, Small> { ]; let index: ZeroVec = ZeroVec::parse_byte_slice(index_array_as_bytes).expect("infallible"); let data: ZeroVec = ZeroVec::parse_byte_slice(data_8_array).expect("infallible"); - let index_length = 1168; - let data_length = 372; let high_start = 0x100000; let shifted12_high_start = 0x100; let index3_null_offset = 0x2; let data_null_offset = 0x0; let null_value = 0x0; + let trie_type = TrieTypeEnum::Small; let trie_header = CodePointTrieHeader { - index_length, - data_length, high_start, shifted12_high_start, index3_null_offset, data_null_offset, null_value, + trie_type, }; - let trie_result: Result, Error> = + let trie_result: Result, Error> = CodePointTrie::try_new(trie_header, index, data); assert!( trie_result.is_ok(), diff --git a/experimental/codepointtrie/tests/planes_test.rs b/utils/codepointtrie/tests/planes_test.rs similarity index 84% rename from experimental/codepointtrie/tests/planes_test.rs rename to utils/codepointtrie/tests/planes_test.rs index e3c9009f9ca..25b8158a24d 100644 --- a/experimental/codepointtrie/tests/planes_test.rs +++ b/utils/codepointtrie/tests/planes_test.rs @@ -7,10 +7,12 @@ mod test_util; use icu_codepointtrie::codepointtrie::*; use icu_codepointtrie::error::Error; use icu_codepointtrie::planes::get_planes_trie; +use test_util::UnicodeEnumeratedProperty; + +use core::convert::TryFrom; use std::fs::File; use
std::io::Read; use std::path::Path; -use test_util::UnicodeEnumeratedProperty; use zerovec::ZeroVec; #[test] @@ -39,19 +41,28 @@ fn planes_trie_deserialize_check_test() { let code_point_trie_struct = planes_enum_prop.code_point_trie.trie_struct; + let trie_type_enum = match TrieTypeEnum::try_from(code_point_trie_struct.trie_type_enum_val) { + Ok(enum_val) => enum_val, + _ => { + panic!( + "Could not parse trie_type serialized enum value in test data file: {}", + code_point_trie_struct.name + ); + } + }; + let trie_header = CodePointTrieHeader { - index_length: code_point_trie_struct.index_length, - data_length: code_point_trie_struct.data_length, high_start: code_point_trie_struct.high_start, shifted12_high_start: code_point_trie_struct.shifted12_high_start, index3_null_offset: code_point_trie_struct.index3_null_offset, data_null_offset: code_point_trie_struct.data_null_offset, null_value: code_point_trie_struct.null_value, + trie_type: trie_type_enum, }; let data = ZeroVec::from_slice(code_point_trie_struct.data_8.as_ref().unwrap()); let index = ZeroVec::from_slice(&code_point_trie_struct.index); - let trie_result: Result, Error> = + let trie_result: Result, Error> = CodePointTrie::try_new(trie_header, index, data); let act_planes_trie = trie_result.unwrap(); diff --git a/experimental/codepointtrie/tests/test_util.rs b/utils/codepointtrie/tests/test_util.rs similarity index 70% rename from experimental/codepointtrie/tests/test_util.rs rename to utils/codepointtrie/tests/test_util.rs index b9e46827012..8b4db2ab8df 100644 --- a/experimental/codepointtrie/tests/test_util.rs +++ b/utils/codepointtrie/tests/test_util.rs @@ -4,12 +4,14 @@ use icu_codepointtrie::codepointtrie::*; use icu_codepointtrie::error::Error; + +use core::convert::TryFrom; use std::fs::File; use std::io::Read; use std::path::Path; use zerovec::ZeroVec; -pub fn check_trie(trie: &CodePointTrie, check_ranges: &[u32]) { +pub fn check_trie(trie: &CodePointTrie, check_ranges: &[u32]) { assert_eq!( 0, 
check_ranges.len() % 2, @@ -30,15 +32,6 @@ pub fn check_trie(trie: &CodePointTrie, check_ } } -/// Converts the serialized `u8` value for the trie type into a [`TrieTypeEnum`]. -pub fn get_code_point_trie_type_enum(trie_type_int: u8) -> Option { - match trie_type_int { - 0 => Some(TrieTypeEnum::Fast), - 1 => Some(TrieTypeEnum::Small), - _ => None, - } -} - // The following structs might be useful later for de-/serialization of the // main `CodePointTrie` struct in the corresponding data provider. @@ -82,9 +75,9 @@ pub struct EnumPropSerializedCPTStruct { pub data_8: Option>, pub data_16: Option>, pub data_32: Option>, - #[cfg_attr(any(feature = "serde", test), serde(rename = "indexLength"))] + #[cfg_attr(any(feature = "serde", test), serde(skip))] pub index_length: u32, - #[cfg_attr(any(feature = "serde", test), serde(rename = "dataLength"))] + #[cfg_attr(any(feature = "serde", test), serde(skip))] pub data_length: u32, #[cfg_attr(any(feature = "serde", test), serde(rename = "highStart"))] pub high_start: u32, @@ -159,74 +152,31 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) { test_struct.name ); + let trie_type_enum = match TrieTypeEnum::try_from(test_struct.trie_type_enum_val) { + Ok(enum_val) => enum_val, + _ => { + panic!( + "Could not parse trie_type serialized enum value in test data file: {}", + test_struct.name + ); + } + }; + let trie_header = CodePointTrieHeader { - index_length: test_struct.index_length, - data_length: test_struct.data_length, high_start: test_struct.high_start, shifted12_high_start: test_struct.shifted12_high_start, index3_null_offset: test_struct.index3_null_offset, data_null_offset: test_struct.data_null_offset, null_value: test_struct.null_value, + trie_type: trie_type_enum, }; let index = ZeroVec::from_slice(&test_struct.index); - let trie_type_enum = get_code_point_trie_type_enum(test_struct.trie_type_enum_val); - - match ( - test_struct.data_8, - test_struct.data_16, - test_struct.data_32, - trie_type_enum, - 
) { - (Some(data_8), _, _, Some(TrieTypeEnum::Fast)) => { - let data = ZeroVec::from_slice(&data_8); - let trie_result: Result, Error> = - CodePointTrie::try_new(trie_header, index, data); - assert!(trie_result.is_ok(), "Could not construct trie"); - assert_eq!( - test_struct.value_width_enum_val, - ValueWidthEnum::Bits8 as u8 - ); - check_trie( - &trie_result.unwrap(), - &test_file.code_point_trie.test_data.check_ranges, - ); - } - - (_, Some(data_16), _, Some(TrieTypeEnum::Fast)) => { - let data = ZeroVec::from_slice(&data_16); - let trie_result: Result, Error> = - CodePointTrie::try_new(trie_header, index, data); - assert!(trie_result.is_ok(), "Could not construct trie"); - assert_eq!( - test_struct.value_width_enum_val, - ValueWidthEnum::Bits16 as u8 - ); - check_trie( - &trie_result.unwrap(), - &test_file.code_point_trie.test_data.check_ranges, - ); - } - - (_, _, Some(data_32), Some(TrieTypeEnum::Fast)) => { - let data = ZeroVec::from_slice(&data_32); - let trie_result: Result, Error> = - CodePointTrie::try_new(trie_header, index, data); - assert!(trie_result.is_ok(), "Could not construct trie"); - assert_eq!( - test_struct.value_width_enum_val, - ValueWidthEnum::Bits32 as u8 - ); - check_trie( - &trie_result.unwrap(), - &test_file.code_point_trie.test_data.check_ranges, - ); - } - - (Some(data_8), _, _, Some(TrieTypeEnum::Small)) => { + match (test_struct.data_8, test_struct.data_16, test_struct.data_32) { + (Some(data_8), _, _) => { let data = ZeroVec::from_slice(&data_8); - let trie_result: Result, Error> = + let trie_result: Result, Error> = CodePointTrie::try_new(trie_header, index, data); assert!(trie_result.is_ok(), "Could not construct trie"); assert_eq!( @@ -239,9 +189,9 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) { ); } - (_, Some(data_16), _, Some(TrieTypeEnum::Small)) => { + (_, Some(data_16), _) => { let data = ZeroVec::from_slice(&data_16); - let trie_result: Result, Error> = + let trie_result: Result, Error> = 
CodePointTrie::try_new(trie_header, index, data); assert!(trie_result.is_ok(), "Could not construct trie"); assert_eq!( @@ -254,9 +204,9 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) { ); } - (_, _, Some(data_32), Some(TrieTypeEnum::Small)) => { + (_, _, Some(data_32)) => { let data = ZeroVec::from_slice(&data_32); - let trie_result: Result, Error> = + let trie_result: Result, Error> = CodePointTrie::try_new(trie_header, index, data); assert!(trie_result.is_ok(), "Could not construct trie"); assert_eq!( @@ -269,7 +219,7 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) { ); } - (_, _, _, _) => { + (_, _, _) => { panic!("Could not match test trie data to a known value width or trie type"); } }; diff --git a/experimental/codepointtrie/tests/testdata/free-blocks.16.toml b/utils/codepointtrie/tests/testdata/free-blocks.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/free-blocks.16.toml rename to utils/codepointtrie/tests/testdata/free-blocks.16.toml diff --git a/experimental/codepointtrie/tests/testdata/free-blocks.32.toml b/utils/codepointtrie/tests/testdata/free-blocks.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/free-blocks.32.toml rename to utils/codepointtrie/tests/testdata/free-blocks.32.toml diff --git a/experimental/codepointtrie/tests/testdata/free-blocks.8.toml b/utils/codepointtrie/tests/testdata/free-blocks.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/free-blocks.8.toml rename to utils/codepointtrie/tests/testdata/free-blocks.8.toml diff --git a/experimental/codepointtrie/tests/testdata/free-blocks.small16.toml b/utils/codepointtrie/tests/testdata/free-blocks.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/free-blocks.small16.toml rename to utils/codepointtrie/tests/testdata/free-blocks.small16.toml diff --git 
a/experimental/codepointtrie/tests/testdata/grow-data.16.toml b/utils/codepointtrie/tests/testdata/grow-data.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/grow-data.16.toml rename to utils/codepointtrie/tests/testdata/grow-data.16.toml diff --git a/experimental/codepointtrie/tests/testdata/grow-data.32.toml b/utils/codepointtrie/tests/testdata/grow-data.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/grow-data.32.toml rename to utils/codepointtrie/tests/testdata/grow-data.32.toml diff --git a/experimental/codepointtrie/tests/testdata/grow-data.8.toml b/utils/codepointtrie/tests/testdata/grow-data.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/grow-data.8.toml rename to utils/codepointtrie/tests/testdata/grow-data.8.toml diff --git a/experimental/codepointtrie/tests/testdata/grow-data.small16.toml b/utils/codepointtrie/tests/testdata/grow-data.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/grow-data.small16.toml rename to utils/codepointtrie/tests/testdata/grow-data.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/planes.toml b/utils/codepointtrie/tests/testdata/planes.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/planes.toml rename to utils/codepointtrie/tests/testdata/planes.toml diff --git a/experimental/codepointtrie/tests/testdata/set-empty.16.toml b/utils/codepointtrie/tests/testdata/set-empty.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-empty.16.toml rename to utils/codepointtrie/tests/testdata/set-empty.16.toml diff --git a/experimental/codepointtrie/tests/testdata/set-empty.32.toml b/utils/codepointtrie/tests/testdata/set-empty.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-empty.32.toml rename to utils/codepointtrie/tests/testdata/set-empty.32.toml diff --git 
a/experimental/codepointtrie/tests/testdata/set-empty.8.toml b/utils/codepointtrie/tests/testdata/set-empty.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-empty.8.toml rename to utils/codepointtrie/tests/testdata/set-empty.8.toml diff --git a/experimental/codepointtrie/tests/testdata/set-empty.small16.toml b/utils/codepointtrie/tests/testdata/set-empty.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-empty.small16.toml rename to utils/codepointtrie/tests/testdata/set-empty.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/set-single-value.16.toml b/utils/codepointtrie/tests/testdata/set-single-value.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-single-value.16.toml rename to utils/codepointtrie/tests/testdata/set-single-value.16.toml diff --git a/experimental/codepointtrie/tests/testdata/set-single-value.32.toml b/utils/codepointtrie/tests/testdata/set-single-value.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-single-value.32.toml rename to utils/codepointtrie/tests/testdata/set-single-value.32.toml diff --git a/experimental/codepointtrie/tests/testdata/set-single-value.8.toml b/utils/codepointtrie/tests/testdata/set-single-value.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-single-value.8.toml rename to utils/codepointtrie/tests/testdata/set-single-value.8.toml diff --git a/experimental/codepointtrie/tests/testdata/set-single-value.small16.toml b/utils/codepointtrie/tests/testdata/set-single-value.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set-single-value.small16.toml rename to utils/codepointtrie/tests/testdata/set-single-value.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/set1.16.toml b/utils/codepointtrie/tests/testdata/set1.16.toml similarity index 100% rename from 
experimental/codepointtrie/tests/testdata/set1.16.toml rename to utils/codepointtrie/tests/testdata/set1.16.toml diff --git a/experimental/codepointtrie/tests/testdata/set1.32.toml b/utils/codepointtrie/tests/testdata/set1.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set1.32.toml rename to utils/codepointtrie/tests/testdata/set1.32.toml diff --git a/experimental/codepointtrie/tests/testdata/set1.8.toml b/utils/codepointtrie/tests/testdata/set1.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set1.8.toml rename to utils/codepointtrie/tests/testdata/set1.8.toml diff --git a/experimental/codepointtrie/tests/testdata/set1.small16.toml b/utils/codepointtrie/tests/testdata/set1.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set1.small16.toml rename to utils/codepointtrie/tests/testdata/set1.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/set2-overlap.16.toml b/utils/codepointtrie/tests/testdata/set2-overlap.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set2-overlap.16.toml rename to utils/codepointtrie/tests/testdata/set2-overlap.16.toml diff --git a/experimental/codepointtrie/tests/testdata/set2-overlap.32.toml b/utils/codepointtrie/tests/testdata/set2-overlap.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set2-overlap.32.toml rename to utils/codepointtrie/tests/testdata/set2-overlap.32.toml diff --git a/experimental/codepointtrie/tests/testdata/set2-overlap.small16.toml b/utils/codepointtrie/tests/testdata/set2-overlap.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set2-overlap.small16.toml rename to utils/codepointtrie/tests/testdata/set2-overlap.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/set3-initial-9.16.toml b/utils/codepointtrie/tests/testdata/set3-initial-9.16.toml similarity index 100% 
rename from experimental/codepointtrie/tests/testdata/set3-initial-9.16.toml rename to utils/codepointtrie/tests/testdata/set3-initial-9.16.toml diff --git a/experimental/codepointtrie/tests/testdata/set3-initial-9.32.toml b/utils/codepointtrie/tests/testdata/set3-initial-9.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set3-initial-9.32.toml rename to utils/codepointtrie/tests/testdata/set3-initial-9.32.toml diff --git a/experimental/codepointtrie/tests/testdata/set3-initial-9.8.toml b/utils/codepointtrie/tests/testdata/set3-initial-9.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set3-initial-9.8.toml rename to utils/codepointtrie/tests/testdata/set3-initial-9.8.toml diff --git a/experimental/codepointtrie/tests/testdata/set3-initial-9.small16.toml b/utils/codepointtrie/tests/testdata/set3-initial-9.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/set3-initial-9.small16.toml rename to utils/codepointtrie/tests/testdata/set3-initial-9.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/short-all-same.16.toml b/utils/codepointtrie/tests/testdata/short-all-same.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/short-all-same.16.toml rename to utils/codepointtrie/tests/testdata/short-all-same.16.toml diff --git a/experimental/codepointtrie/tests/testdata/short-all-same.8.toml b/utils/codepointtrie/tests/testdata/short-all-same.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/short-all-same.8.toml rename to utils/codepointtrie/tests/testdata/short-all-same.8.toml diff --git a/experimental/codepointtrie/tests/testdata/short-all-same.small16.toml b/utils/codepointtrie/tests/testdata/short-all-same.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/short-all-same.small16.toml rename to 
utils/codepointtrie/tests/testdata/short-all-same.small16.toml diff --git a/experimental/codepointtrie/tests/testdata/small0-in-fast.16.toml b/utils/codepointtrie/tests/testdata/small0-in-fast.16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/small0-in-fast.16.toml rename to utils/codepointtrie/tests/testdata/small0-in-fast.16.toml diff --git a/experimental/codepointtrie/tests/testdata/small0-in-fast.32.toml b/utils/codepointtrie/tests/testdata/small0-in-fast.32.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/small0-in-fast.32.toml rename to utils/codepointtrie/tests/testdata/small0-in-fast.32.toml diff --git a/experimental/codepointtrie/tests/testdata/small0-in-fast.8.toml b/utils/codepointtrie/tests/testdata/small0-in-fast.8.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/small0-in-fast.8.toml rename to utils/codepointtrie/tests/testdata/small0-in-fast.8.toml diff --git a/experimental/codepointtrie/tests/testdata/small0-in-fast.small16.toml b/utils/codepointtrie/tests/testdata/small0-in-fast.small16.toml similarity index 100% rename from experimental/codepointtrie/tests/testdata/small0-in-fast.small16.toml rename to utils/codepointtrie/tests/testdata/small0-in-fast.small16.toml diff --git a/experimental/codepointtrie/tests/trie_test_data_test.rs b/utils/codepointtrie/tests/trie_test_data_test.rs similarity index 100% rename from experimental/codepointtrie/tests/trie_test_data_test.rs rename to utils/codepointtrie/tests/trie_test_data_test.rs