From d1ca58c8cf869918fab790bb42b7f342aec0a2f6 Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Wed, 6 Oct 2021 17:03:53 -0700 Subject: [PATCH 01/28] Rename TrieTypeEnum to TrieType TrieType no longer exists, so we don't need an awkward name for TrieTypeEnum. --- utils/codepointtrie/src/codepointtrie.rs | 18 +++++++++--------- utils/codepointtrie/src/planes.rs | 2 +- utils/codepointtrie/tests/planes_test.rs | 2 +- utils/codepointtrie/tests/test_util.rs | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index 8c405bfe1c3..e6c15acc9b5 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -27,7 +27,7 @@ pub enum ValueWidthEnum { /// See [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C. #[derive(Clone, Copy, PartialEq)] #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum TrieTypeEnum { +pub enum TrieType { /// Represents the "fast" type code point tries for the /// [`TrieType`] trait. The "fast max" limit is set to `0xffff`. Fast = 0, @@ -130,16 +130,16 @@ pub struct CodePointTrieHeader { pub null_value: u32, /// The enum value representing the type of trie, where trie type is as it /// is defined in ICU (ex: Fast, Small). 
- pub trie_type: TrieTypeEnum, + pub trie_type: TrieType, } -impl TryFrom for TrieTypeEnum { +impl TryFrom for TrieType { type Error = crate::error::Error; - fn try_from(trie_type_int: u8) -> Result { + fn try_from(trie_type_int: u8) -> Result { match trie_type_int { - 0 => Ok(TrieTypeEnum::Fast), - 1 => Ok(TrieTypeEnum::Small), + 0 => Ok(TrieType::Fast), + 1 => Ok(TrieType::Small), _ => Err(crate::error::Error::FromDeserialized { reason: "Cannot parse value for trie_type", }), @@ -183,7 +183,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { fn internal_small_index(&self, code_point: u32) -> u32 { let mut index1_pos: u32 = code_point >> SHIFT_1; - if self.header.trie_type == TrieTypeEnum::Fast { + if self.header.trie_type == TrieType::Fast { debug_assert!( FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start ); @@ -296,8 +296,8 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`). // Code points above the "fast max" limit require 4 lookups. 
let fast_max = match self.header.trie_type { - TrieTypeEnum::Fast => FAST_TYPE_FAST_INDEXING_MAX, - TrieTypeEnum::Small => SMALL_TYPE_FAST_INDEXING_MAX, + TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX, + TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX, }; let data_pos: u32 = if code_point <= fast_max { Self::fast_index(self, code_point) diff --git a/utils/codepointtrie/src/planes.rs b/utils/codepointtrie/src/planes.rs index 06e8e816d3a..c4bd4706953 100644 --- a/utils/codepointtrie/src/planes.rs +++ b/utils/codepointtrie/src/planes.rs @@ -176,7 +176,7 @@ pub fn get_planes_trie() -> CodePointTrie<'static, u8> { let index3_null_offset = 0x2; let data_null_offset = 0x0; let null_value = 0x0; - let trie_type = TrieTypeEnum::Small; + let trie_type = TrieType::Small; let trie_header = CodePointTrieHeader { high_start, diff --git a/utils/codepointtrie/tests/planes_test.rs b/utils/codepointtrie/tests/planes_test.rs index 25b8158a24d..d396d73fe8b 100644 --- a/utils/codepointtrie/tests/planes_test.rs +++ b/utils/codepointtrie/tests/planes_test.rs @@ -41,7 +41,7 @@ fn planes_trie_deserialize_check_test() { let code_point_trie_struct = planes_enum_prop.code_point_trie.trie_struct; - let trie_type_enum = match TrieTypeEnum::try_from(code_point_trie_struct.trie_type_enum_val) { + let trie_type_enum = match TrieType::try_from(code_point_trie_struct.trie_type_enum_val) { Ok(enum_val) => enum_val, _ => { panic!( diff --git a/utils/codepointtrie/tests/test_util.rs b/utils/codepointtrie/tests/test_util.rs index 8b4db2ab8df..eb1205ff1b6 100644 --- a/utils/codepointtrie/tests/test_util.rs +++ b/utils/codepointtrie/tests/test_util.rs @@ -152,7 +152,7 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) { test_struct.name ); - let trie_type_enum = match TrieTypeEnum::try_from(test_struct.trie_type_enum_val) { + let trie_type_enum = match TrieType::try_from(test_struct.trie_type_enum_val) { Ok(enum_val) => enum_val, _ => { panic!( From 
27312996eec49a8cff4f122c17072bc69e91583e Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Thu, 7 Oct 2021 15:10:22 -0700 Subject: [PATCH 02/28] Implement Yokeable/ZeroCopyFrom for CodePointTrie and data struct --- Cargo.lock | 1 + utils/codepointtrie/Cargo.toml | 3 +- utils/codepointtrie/src/codepointtrie.rs | 86 +++++++++--------------- utils/codepointtrie/src/lib.rs | 1 + utils/codepointtrie/src/provider.rs | 76 +++++++++++++++++++++ utils/codepointtrie/tests/test_util.rs | 12 +++- 6 files changed, 123 insertions(+), 56 deletions(-) create mode 100644 utils/codepointtrie/src/provider.rs diff --git a/Cargo.lock b/Cargo.lock index 8695271a843..3e6acd0b6be 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1072,6 +1072,7 @@ dependencies = [ name = "icu_codepointtrie" version = "0.2.0" dependencies = [ + "icu_provider", "postcard", "serde", "thiserror", diff --git a/utils/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml index 6a3cb19b8f1..be462b8fc18 100644 --- a/utils/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -32,9 +32,10 @@ denylist = ["bench"] all-features = true [dependencies] +icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } thiserror = "1.0" -zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } +zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", "yoke"] } [dev-dependencies] postcard = { version = "0.7", features = ["alloc"] } diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index e6c15acc9b5..b2ff54f687d 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -9,18 +9,7 @@ use core::convert::TryFrom; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use zerovec::ZeroVec; - -// Enums - -/// The width of the elements in the data 
array of a [`CodePointTrie`]. -/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C. -#[derive(Clone, Copy, PartialEq)] -#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub enum ValueWidthEnum { - Bits16 = 0, - Bits32 = 1, - Bits8 = 2, -} +use icu_provider::yoke::ZeroCopyFrom; /// The type of trie represents whether the trie has an optimization that /// would make it small or fast. @@ -36,51 +25,26 @@ pub enum TrieType { Small = 1, } -// ValueWidth trait +// TrieValue trait // AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec" -/// A trait representing the width of the values stored in the data array of a -/// [`CodePointTrie`]. This trait is used as a type parameter in constructing -/// a `CodePointTrie`. -pub trait ValueWidth: Copy + zerovec::ule::AsULE + 'static { - /// This enum variant represents the specific instance of `ValueWidth` such - /// that the enum discriminant values matches ICU4C's enum integer value. - const ENUM_VALUE: ValueWidthEnum; - /// This value is used to indicate an error in the Rust code in accessing - /// a position in the trie's `data` array. In normal cases, the position in - /// the `data` array will return either the correct value, or in case of a - /// logical error in the trie's computation, the trie's own error value - /// which is stored that in the `data` array. +/// A trait representing the values stored in the data array of a [`CodePointTrie`]. +/// This trait is used as a type parameter in constructing a `CodePointTrie`. 
+pub trait TrieValue: Copy + zerovec::ule::AsULE + 'static { const DATA_GET_ERROR_VALUE: Self; - fn cast_to_widest(self) -> u32; } -impl ValueWidth for u8 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits8; +impl TrieValue for u8 { const DATA_GET_ERROR_VALUE: u8 = u8::MAX; - - fn cast_to_widest(self) -> u32 { - self as u32 - } } -impl ValueWidth for u16 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits16; +impl TrieValue for u16 { const DATA_GET_ERROR_VALUE: u16 = u16::MAX; - - fn cast_to_widest(self) -> u32 { - self as u32 - } } -impl ValueWidth for u32 { - const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits32; +impl TrieValue for u32 { const DATA_GET_ERROR_VALUE: u32 = u32::MAX; - - fn cast_to_widest(self) -> u32 { - self - } } /// This struct represents a de-serialized CodePointTrie that was exported from @@ -90,16 +54,17 @@ impl ValueWidth for u32 { /// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie) /// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup) #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -pub struct CodePointTrie<'trie, W: ValueWidth> { +pub struct CodePointTrie<'trie, T: TrieValue> { header: CodePointTrieHeader, #[cfg_attr(feature = "serde", serde(borrow))] index: ZeroVec<'trie, u16>, #[cfg_attr(feature = "serde", serde(borrow))] - data: ZeroVec<'trie, W>, + data: ZeroVec<'trie, T>, } /// This struct contains the fixed-length header fields of a [`CodePointTrie`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[derive(Copy,Clone)] pub struct CodePointTrieHeader { /// The code point of the start of the last range of the trie. 
A /// range is defined as a partition of the code point space such that the @@ -147,14 +112,14 @@ impl TryFrom for TrieType { } } -impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { +impl<'trie, T: TrieValue> CodePointTrie<'trie, T> { /// Returns a new [`CodePointTrie`] backed by borrowed data for the `index` /// array and `data` array, whose data values have width `W`. pub fn try_new( header: CodePointTrieHeader, index: ZeroVec<'trie, u16>, - data: ZeroVec<'trie, W>, - ) -> Result, Error> { + data: ZeroVec<'trie, T>, + ) -> Result, Error> { // Validation invariants are not needed here when constructing a new // `CodePointTrie` because: // @@ -167,7 +132,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { // - The `ZeroVec` serializer stores the length of the array along with the // ZeroVec data, meaning that a deserializer would also see that length info. - let trie: CodePointTrie<'trie, W> = CodePointTrie { + let trie: CodePointTrie<'trie, T> = CodePointTrie { header, index, data, @@ -290,7 +255,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { /// assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32 /// assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32 /// ``` - pub fn get(&self, code_point: u32) -> W { + pub fn get(&self, code_point: u32) -> T { // All code points up to the fast max limit are represented // individually in the `index` array to hold their `data` array position, and // thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`). @@ -308,12 +273,14 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { }; // Returns the trie value (or trie's error value). // If we cannot read from the data array, then return the associated constant - // DATA_GET_ERROR_VALUE for the instance type for W: ValueWidth. + // DATA_GET_ERROR_VALUE for the instance type for T: TrieValue. 
self.data .get(data_pos as usize) - .unwrap_or(W::DATA_GET_ERROR_VALUE) + .unwrap_or(T::DATA_GET_ERROR_VALUE) } +} +impl<'trie, T: TrieValue + Into> CodePointTrie<'trie, T> { /// Returns the value that is associated with `code_point` for this [`CodePointTrie`] /// as a `u32`. /// @@ -333,7 +300,18 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> { // Note: This API method maintains consistency with the corresponding // original ICU APIs. pub fn get_u32(&self, code_point: u32) -> u32 { - self.get(code_point).cast_to_widest() + self.get(code_point).into() + } +} + +impl<'a, T: TrieValue> ZeroCopyFrom> for CodePointTrie<'static, T> +{ + fn zero_copy_from<'b>(cart: &'b CodePointTrie<'a, T>) -> CodePointTrie<'b, T> { + CodePointTrie { + header: cart.header, + index: ZeroVec::<'static, u16>::zero_copy_from(&cart.index), + data: ZeroVec::<'static, T>::zero_copy_from(&cart.data) + } } } diff --git a/utils/codepointtrie/src/lib.rs b/utils/codepointtrie/src/lib.rs index 18c104ff904..357d35fe833 100644 --- a/utils/codepointtrie/src/lib.rs +++ b/utils/codepointtrie/src/lib.rs @@ -39,3 +39,4 @@ pub mod codepointtrie; pub mod error; mod impl_const; pub mod planes; +pub mod provider; diff --git a/utils/codepointtrie/src/provider.rs b/utils/codepointtrie/src/provider.rs new file mode 100644 index 00000000000..df32888e818 --- /dev/null +++ b/utils/codepointtrie/src/provider.rs @@ -0,0 +1,76 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use core::{mem,ptr}; + +use crate::codepointtrie::{CodePointTrie, TrieValue}; +use icu_provider::yoke::*; + +// Note: +// T: TrieValue is T: Copy + zerovec::ule::AsULE + 'static + +unsafe impl<'a, T: TrieValue> Yokeable<'a> for CodePointTrie<'static, T> { + type Output = CodePointTrie<'a, T>; + fn transform(&'a self) -> &'a Self::Output { + self + } + fn transform_owned(self) -> Self::Output { + self + } + unsafe fn make(from: Self::Output) -> Self { + debug_assert!(mem::size_of::() == mem::size_of::()); + let ptr: *const Self = (&from as *const Self::Output).cast(); + mem::forget(from); + ptr::read(ptr) + } + fn transform_mut(&'a mut self, f: F) + where + F: 'static + for<'b> FnOnce(&'b mut Self::Output), + { + unsafe { f(mem::transmute::<&mut Self, &mut Self::Output>(self)) } + } +} + +pub struct UnicodePropertyMapV1<'data, T: TrieValue> { + pub codepoint_trie: CodePointTrie<'data, T>, +} + +unsafe impl<'a, T: TrieValue> Yokeable<'a> for UnicodePropertyMapV1<'static, T> { + type Output = UnicodePropertyMapV1<'a, T>; + fn transform(&'a self) -> &'a Self::Output { + self + } + fn transform_owned(self) -> Self::Output { + self + } + unsafe fn make(from: Self::Output) -> Self { + debug_assert!(mem::size_of::() == mem::size_of::()); + let ptr: *const Self = (&from as *const Self::Output).cast(); + mem::forget(from); + ptr::read(ptr) + } + fn transform_mut(&'a mut self, f: F) + where + F: 'static + for<'b> FnOnce(&'b mut Self::Output), + { + unsafe { f(mem::transmute::<&mut Self, &mut Self::Output>(self)) } + } +} + +impl<'a, T: TrieValue> ZeroCopyFrom> for UnicodePropertyMapV1<'static,T> { + fn zero_copy_from<'b>(cart: &'b UnicodePropertyMapV1<'a, T>) -> UnicodePropertyMapV1<'b,T> { + UnicodePropertyMapV1 { + codepoint_trie: CodePointTrie::<'static, T>::zero_copy_from(&cart.codepoint_trie) + } + } +} + +pub struct UnicodePropertyMapV1Marker { + _phantom: core::marker::PhantomData +} + +impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for 
UnicodePropertyMapV1Marker { + type Yokeable = UnicodePropertyMapV1<'static, T>; + type Cart = UnicodePropertyMapV1<'data, T>; +} diff --git a/utils/codepointtrie/tests/test_util.rs b/utils/codepointtrie/tests/test_util.rs index eb1205ff1b6..1c7ec5fbe15 100644 --- a/utils/codepointtrie/tests/test_util.rs +++ b/utils/codepointtrie/tests/test_util.rs @@ -11,7 +11,17 @@ use std::io::Read; use std::path::Path; use zerovec::ZeroVec; -pub fn check_trie(trie: &CodePointTrie, check_ranges: &[u32]) { +/// The width of the elements in the data array of a [`CodePointTrie`]. +/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C. +#[derive(Clone, Copy, PartialEq)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +pub enum ValueWidthEnum { + Bits16 = 0, + Bits32 = 1, + Bits8 = 2, +} + +pub fn check_trie>(trie: &CodePointTrie, check_ranges: &[u32]) { assert_eq!( 0, check_ranges.len() % 2, From 3519819028cb56bd363959ce307c5bf469e12841 Mon Sep 17 00:00:00 2001 From: Iain Ireland Date: Thu, 7 Oct 2021 15:41:03 -0700 Subject: [PATCH 03/28] Cargo fmt + minor fixes --- utils/codepointtrie/src/codepointtrie.rs | 9 ++++----- utils/codepointtrie/src/provider.rs | 12 +++++++----- utils/codepointtrie/tests/test_util.rs | 2 ++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index b2ff54f687d..10144daf2d8 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -6,10 +6,10 @@ use crate::error::Error; use crate::impl_const::*; use core::convert::TryFrom; +use icu_provider::yoke::ZeroCopyFrom; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; use zerovec::ZeroVec; -use icu_provider::yoke::ZeroCopyFrom; /// The type of trie represents whether the trie has an optimization that /// would make it small or fast. 
@@ -64,7 +64,7 @@ pub struct CodePointTrie<'trie, T: TrieValue> { /// This struct contains the fixed-length header fields of a [`CodePointTrie`]. #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] -#[derive(Copy,Clone)] +#[derive(Copy, Clone)] pub struct CodePointTrieHeader { /// The code point of the start of the last range of the trie. A /// range is defined as a partition of the code point space such that the @@ -304,13 +304,12 @@ impl<'trie, T: TrieValue + Into> CodePointTrie<'trie, T> { } } -impl<'a, T: TrieValue> ZeroCopyFrom> for CodePointTrie<'static, T> -{ +impl<'a, T: TrieValue> ZeroCopyFrom> for CodePointTrie<'static, T> { fn zero_copy_from<'b>(cart: &'b CodePointTrie<'a, T>) -> CodePointTrie<'b, T> { CodePointTrie { header: cart.header, index: ZeroVec::<'static, u16>::zero_copy_from(&cart.index), - data: ZeroVec::<'static, T>::zero_copy_from(&cart.data) + data: ZeroVec::<'static, T>::zero_copy_from(&cart.data), } } } diff --git a/utils/codepointtrie/src/provider.rs b/utils/codepointtrie/src/provider.rs index df32888e818..79337cb435c 100644 --- a/utils/codepointtrie/src/provider.rs +++ b/utils/codepointtrie/src/provider.rs @@ -2,7 +2,7 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-use core::{mem,ptr}; +use core::{mem, ptr}; use crate::codepointtrie::{CodePointTrie, TrieValue}; use icu_provider::yoke::*; @@ -58,16 +58,18 @@ unsafe impl<'a, T: TrieValue> Yokeable<'a> for UnicodePropertyMapV1<'static, T> } } -impl<'a, T: TrieValue> ZeroCopyFrom> for UnicodePropertyMapV1<'static,T> { - fn zero_copy_from<'b>(cart: &'b UnicodePropertyMapV1<'a, T>) -> UnicodePropertyMapV1<'b,T> { +impl<'a, T: TrieValue> ZeroCopyFrom> + for UnicodePropertyMapV1<'static, T> +{ + fn zero_copy_from<'b>(cart: &'b UnicodePropertyMapV1<'a, T>) -> UnicodePropertyMapV1<'b, T> { UnicodePropertyMapV1 { - codepoint_trie: CodePointTrie::<'static, T>::zero_copy_from(&cart.codepoint_trie) + codepoint_trie: CodePointTrie::<'static, T>::zero_copy_from(&cart.codepoint_trie), } } } pub struct UnicodePropertyMapV1Marker { - _phantom: core::marker::PhantomData + _phantom: core::marker::PhantomData, } impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker { diff --git a/utils/codepointtrie/tests/test_util.rs b/utils/codepointtrie/tests/test_util.rs index 1c7ec5fbe15..8b04276fa0f 100644 --- a/utils/codepointtrie/tests/test_util.rs +++ b/utils/codepointtrie/tests/test_util.rs @@ -6,6 +6,8 @@ use icu_codepointtrie::codepointtrie::*; use icu_codepointtrie::error::Error; use core::convert::TryFrom; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; use std::fs::File; use std::io::Read; use std::path::Path; From 91427625a880a756fb2854e8f69462ee1728cd61 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Thu, 7 Oct 2021 13:49:59 -0700 Subject: [PATCH 04/28] Add CPT struct to icu_provider_uprops data source struct --- provider/uprops/src/uprops_serde.rs | 33 ++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/provider/uprops/src/uprops_serde.rs b/provider/uprops/src/uprops_serde.rs index 5a752c46520..25f8ef557e2 100644 --- a/provider/uprops/src/uprops_serde.rs +++ b/provider/uprops/src/uprops_serde.rs @@ 
-30,11 +30,43 @@ pub mod enumerated { pub name: String, } + #[allow(clippy::upper_case_acronyms)] + #[derive(serde::Deserialize)] + pub struct EnumeratedPropertyCodePointTrie { + #[serde(skip)] + pub long_name: String, + #[serde(skip)] + pub name: String, + pub index: Vec, + pub data_8: Option>, + pub data_16: Option>, + pub data_32: Option>, + #[serde(skip)] + pub index_length: u32, + #[serde(skip)] + pub data_length: u32, + #[serde(rename = "highStart")] + pub high_start: u32, + #[serde(rename = "shifted12HighStart")] + pub shifted12_high_start: u16, + #[serde(rename = "type")] + pub trie_type_enum_val: u8, + #[serde(rename = "valueWidth")] + pub value_width_enum_val: u8, + #[serde(rename = "index3NullOffset")] + pub index3_null_offset: u16, + #[serde(rename = "dataNullOffset")] + pub data_null_offset: u32, + #[serde(rename = "nullValue")] + pub null_value: u32, + } + #[derive(serde::Deserialize)] pub struct EnumeratedPropertyMap { pub long_name: String, pub short_name: String, pub ranges: Vec, + pub code_point_trie: EnumeratedPropertyCodePointTrie, } #[derive(serde::Deserialize)] @@ -45,6 +77,5 @@ pub mod enumerated { #[derive(serde::Deserialize)] pub struct Main { pub enum_property: Level1, - // omitted: enum_property.code_point_trie } } From 7ce722eea7a7993b0ab8ce32ed5891720420fa01 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Fri, 8 Oct 2021 13:55:03 -0700 Subject: [PATCH 05/28] Renames data providers for UnicodeSet data ahead of introducing one for CodePointTrie data --- docs/tutorials/writing_a_new_data_struct.md | 2 +- .../uprops/src/{binary.rs => bin_uniset.rs} | 14 ++--- .../src/{enumerated.rs => enum_uniset.rs} | 22 ++++---- provider/uprops/src/lib.rs | 8 +-- provider/uprops/src/provider.rs | 51 ------------------- 5 files changed, 24 insertions(+), 73 deletions(-) rename provider/uprops/src/{binary.rs => bin_uniset.rs} (88%) rename provider/uprops/src/{enumerated.rs => enum_uniset.rs} (93%) delete mode 100644 provider/uprops/src/provider.rs diff 
--git a/docs/tutorials/writing_a_new_data_struct.md b/docs/tutorials/writing_a_new_data_struct.md index 438a1646ee0..dd09362a584 100644 --- a/docs/tutorials/writing_a_new_data_struct.md +++ b/docs/tutorials/writing_a_new_data_struct.md @@ -60,7 +60,7 @@ Examples of source data providers include: - [`PluralsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.PluralsProvider.html) - [`DateSymbolsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.DateSymbolsProvider.html) - [… more examples](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/index.html) -- `BinaryPropertiesDataProvider` +- `BinaryPropertyUnicodeSetDataProvider` - [`HelloWorldProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider/hello_world/struct.HelloWorldProvider.html) Source data providers must implement the following traits: diff --git a/provider/uprops/src/binary.rs b/provider/uprops/src/bin_uniset.rs similarity index 88% rename from provider/uprops/src/binary.rs rename to provider/uprops/src/bin_uniset.rs index 9a71598e126..74262428a34 100644 --- a/provider/uprops/src/binary.rs +++ b/provider/uprops/src/bin_uniset.rs @@ -11,14 +11,14 @@ use icu_uniset::UnicodeSetBuilder; use std::fs; use std::path::PathBuf; -pub struct BinaryPropertiesDataProvider { +pub struct BinaryPropertyUnicodeSetDataProvider { root_dir: PathBuf, } /// A data provider reading from .toml files produced by the ICU4C icuwriteuprops tool. 
-impl BinaryPropertiesDataProvider { +impl BinaryPropertyUnicodeSetDataProvider { pub fn new(root_dir: PathBuf) -> Self { - BinaryPropertiesDataProvider { root_dir } + BinaryPropertyUnicodeSetDataProvider { root_dir } } fn get_toml_data(&self, name: &str) -> Result { let mut path: PathBuf = self.root_dir.clone().join(name); @@ -28,7 +28,7 @@ impl BinaryPropertiesDataProvider { } } -impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDataProvider { +impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertyUnicodeSetDataProvider { fn load_payload( &self, req: &DataRequest, @@ -54,11 +54,11 @@ impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDat } } -icu_provider::impl_dyn_provider!(BinaryPropertiesDataProvider, { +icu_provider::impl_dyn_provider!(BinaryPropertyUnicodeSetDataProvider, { _ => UnicodePropertyV1Marker, }, SERDE_SE, 'data); -impl IterableDataProviderCore for BinaryPropertiesDataProvider { +impl IterableDataProviderCore for BinaryPropertyUnicodeSetDataProvider { fn supported_options_for_key( &self, _resc_key: &ResourceKey, @@ -74,7 +74,7 @@ fn test_basic() { use std::convert::TryInto; let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = BinaryPropertiesDataProvider::new(root_dir); + let provider = BinaryPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { diff --git a/provider/uprops/src/enumerated.rs b/provider/uprops/src/enum_uniset.rs similarity index 93% rename from provider/uprops/src/enumerated.rs rename to provider/uprops/src/enum_uniset.rs index 3ecc9c9ca48..b410c81e375 100644 --- a/provider/uprops/src/enumerated.rs +++ b/provider/uprops/src/enum_uniset.rs @@ -11,14 +11,14 @@ use icu_uniset::UnicodeSetBuilder; use std::fs; use std::path::PathBuf; -pub struct EnumeratedPropertiesDataProvider { +pub struct EnumeratedPropertyUnicodeSetDataProvider { root_dir: PathBuf, 
} /// A data provider reading from .toml files produced by the ICU4C icuwriteuprops tool. -impl EnumeratedPropertiesDataProvider { +impl EnumeratedPropertyUnicodeSetDataProvider { pub fn new(root_dir: PathBuf) -> Self { - EnumeratedPropertiesDataProvider { root_dir } + EnumeratedPropertyUnicodeSetDataProvider { root_dir } } fn get_toml_data(&self, name: &str) -> Result { let mut path: PathBuf = self.root_dir.clone().join(name); @@ -61,7 +61,9 @@ fn expand_groupings<'a>(prop_name: &str, prop_val: &'a str) -> Vec<&'a str> { } } -impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for EnumeratedPropertiesDataProvider { +impl<'data> DataProvider<'data, UnicodePropertyV1Marker> + for EnumeratedPropertyUnicodeSetDataProvider +{ fn load_payload( &self, req: &DataRequest, @@ -104,11 +106,11 @@ impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for EnumeratedPropertie } } -icu_provider::impl_dyn_provider!(EnumeratedPropertiesDataProvider, { +icu_provider::impl_dyn_provider!(EnumeratedPropertyUnicodeSetDataProvider, { _ => UnicodePropertyV1Marker, }, SERDE_SE, 'data); -impl IterableDataProviderCore for EnumeratedPropertiesDataProvider { +impl IterableDataProviderCore for EnumeratedPropertyUnicodeSetDataProvider { fn supported_options_for_key( &self, _resc_key: &ResourceKey, @@ -124,7 +126,7 @@ fn test_general_category() { use std::convert::TryInto; let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertiesDataProvider::new(root_dir); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { @@ -152,7 +154,7 @@ fn test_script() { use std::convert::TryInto; let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertiesDataProvider::new(root_dir); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = 
provider .load_payload(&DataRequest { @@ -181,7 +183,7 @@ fn test_gc_groupings() { fn get_uniset_payload<'data>(key: ResourceKey) -> DataPayload<'data, UnicodePropertyV1Marker> { let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertiesDataProvider::new(root_dir); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { resource_path: ResourcePath { @@ -293,7 +295,7 @@ fn test_gc_surrogate() { use std::convert::TryInto; let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertiesDataProvider::new(root_dir); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { diff --git a/provider/uprops/src/lib.rs b/provider/uprops/src/lib.rs index d7d62e2a892..5664fe71faf 100644 --- a/provider/uprops/src/lib.rs +++ b/provider/uprops/src/lib.rs @@ -18,10 +18,10 @@ //! [`StaticDataProvider`]: ../icu_provider_blob/struct.StaticDataProvider.html //! [`PropertiesDataProvider`]: binary::PropertiesDataProvider -mod binary; -mod enumerated; +mod bin_uniset; +mod enum_uniset; mod error; -mod provider; mod uprops_serde; -pub use provider::PropertiesDataProvider; +pub use bin_uniset::BinaryPropertyUnicodeSetDataProvider; +pub use enum_uniset::EnumeratedPropertyUnicodeSetDataProvider; diff --git a/provider/uprops/src/provider.rs b/provider/uprops/src/provider.rs deleted file mode 100644 index 42a77878455..00000000000 --- a/provider/uprops/src/provider.rs +++ /dev/null @@ -1,51 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
- -use crate::binary::BinaryPropertiesDataProvider; -use crate::enumerated::EnumeratedPropertiesDataProvider; -use icu_provider::iter::IterableDataProviderCore; -use icu_provider::prelude::*; -use icu_uniset::provider::UnicodePropertyV1Marker; - -use std::path::PathBuf; - -pub struct PropertiesDataProvider { - binary: BinaryPropertiesDataProvider, - enumerated: EnumeratedPropertiesDataProvider, -} - -impl PropertiesDataProvider { - pub fn new(root_dir: PathBuf) -> Self { - let binary = BinaryPropertiesDataProvider::new(root_dir.clone()); - let enumerated = EnumeratedPropertiesDataProvider::new(root_dir); - Self { binary, enumerated } - } -} - -impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for PropertiesDataProvider { - fn load_payload( - &self, - req: &DataRequest, - ) -> Result, DataError> { - if req.resource_path.key.sub_category.contains('=') { - self.enumerated.load_payload(req) - } else { - self.binary.load_payload(req) - } - } -} - -icu_provider::impl_dyn_provider!(PropertiesDataProvider, { - _ => UnicodePropertyV1Marker, -}, SERDE_SE, 'data); - -impl IterableDataProviderCore for PropertiesDataProvider { - fn supported_options_for_key( - &self, - _resc_key: &ResourceKey, - ) -> Result>, DataError> { - let list: Vec = vec![ResourceOptions::default()]; - Ok(Box::new(list.into_iter())) - } -} From 7784c1db31024eb4ba9b6a2726fd4c27192aeee4 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Mon, 11 Oct 2021 10:49:21 -0700 Subject: [PATCH 06/28] Matches CPT version to project/sub-crates, adds CPT as dep to provider_uprops --- Cargo.lock | 3 ++- provider/uprops/Cargo.toml | 1 + utils/codepointtrie/Cargo.toml | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3e6acd0b6be..a36ac1f4763 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1070,7 +1070,7 @@ dependencies = [ [[package]] name = "icu_codepointtrie" -version = "0.2.0" +version = "0.3.0" dependencies = [ "icu_provider", "postcard", @@ -1311,6 +1311,7 @@ name = 
"icu_provider_uprops" version = "0.3.0" dependencies = [ "displaydoc", + "icu_codepointtrie", "icu_provider", "icu_testdata", "icu_uniset", diff --git a/provider/uprops/Cargo.toml b/provider/uprops/Cargo.toml index 390d6a0daf7..b7b562d54a8 100644 --- a/provider/uprops/Cargo.toml +++ b/provider/uprops/Cargo.toml @@ -28,6 +28,7 @@ all-features = true [dependencies] displaydoc = { version = "0.2.3", default-features = false } +icu_codepointtrie = { version = "0.3", path = "../../utils/codepointtrie", features = ["serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["provider_serde"] } icu_uniset = { version = "0.3", path = "../../components/uniset", features = ["provider_serde"] } serde = { version = "1.0", features = ["derive"] } diff --git a/utils/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml index be462b8fc18..18e1c553651 100644 --- a/utils/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -5,7 +5,7 @@ [package] name = "icu_codepointtrie" description = "API for an efficient trie of data for Unicode code points" -version = "0.2.0" +version = "0.3.0" authors = ["The ICU4X Project Developers"] edition = "2018" readme = "README.md" From 1f6d7e7226bf9fb8ff7c4340b01e98ab3bdd07bd Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Mon, 11 Oct 2021 15:38:59 -0700 Subject: [PATCH 07/28] Add WIP code for data provider for CodePointTrie data --- components/uniset/src/enum_props.rs | 18 ++++++- provider/uprops/src/enum_codepointtrie.rs | 64 +++++++++++++++++++++++ provider/uprops/src/lib.rs | 1 + 3 files changed, 81 insertions(+), 2 deletions(-) create mode 100644 provider/uprops/src/enum_codepointtrie.rs diff --git a/components/uniset/src/enum_props.rs b/components/uniset/src/enum_props.rs index 07511890ee7..60d68bf3fbb 100644 --- a/components/uniset/src/enum_props.rs +++ b/components/uniset/src/enum_props.rs @@ -5,17 +5,31 @@ //! A collection of enums for enumerated properties. 
use num_enum::{TryFromPrimitive, UnsafeFromPrimitive}; +use tinystr::TinyStr16; /// Selection constants for Unicode properties. /// These constants are used to select one of the Unicode properties. /// See UProperty in ICU4C. -#[derive(Clone, PartialEq, Debug)] +#[derive(Clone, PartialEq, Debug, TryFromPrimitive)] #[allow(missing_docs)] // TODO(#1030) - Add missing docs. #[non_exhaustive] +#[repr(i32)] pub enum EnumeratedProperty { GeneralCategory = 0x1005, Script = 0x100A, - ScriptExtensions = 0x7000, + ScriptExtensions = 0x7000, // TODO(#1160) - this is a Miscellaneous property, not Enumerated + InvalidCode = -1, // TODO(#1160) - taken from ICU4C UProperty::UCHAR_INVALID_CODE +} + +impl From<&TinyStr16> for EnumeratedProperty { + fn from(prop_short_alias: &TinyStr16) -> Self { + match prop_short_alias.as_str() { + "gc" => EnumeratedProperty::GeneralCategory, + "sc" => EnumeratedProperty::Script, + "scx" => EnumeratedProperty::ScriptExtensions, + _ => EnumeratedProperty::InvalidCode, + } + } } /// Enumerated Unicode general category types. diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs new file mode 100644 index 00000000000..585b0ccfc04 --- /dev/null +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -0,0 +1,64 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::error::Error; +use crate::uprops_serde; +use crate::uprops_serde::enumerated::EnumeratedPropertyCodePointTrie; + +use icu_codepointtrie::codepointtrie::{CodePointTrie, TrieValue}; +use icu_codepointtrie::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; +use icu_provider::prelude::*; +use icu_uniset::enum_props::EnumeratedProperty; // TODO(#1160) - Refactor property definitions out of UnicodeSet + +use std::fs; +use std::path::PathBuf; + +pub struct EnumeratedPropertyCodePointTrieProvider { + root_dir: PathBuf, +} + +impl EnumeratedPropertyCodePointTrieProvider { + pub fn new(root_dir: PathBuf) -> Self { + EnumeratedPropertyCodePointTrieProvider { root_dir } + } + + fn get_toml_data(&self, name: &str) -> Result { + let mut path: PathBuf = self.root_dir.clone().join(name); + path.set_extension("toml"); + let toml_str = fs::read_to_string(&path).map_err(|e| Error::Io(e, path.clone()))?; + toml::from_str(&toml_str).map_err(|e| Error::Toml(e, path)) + } +} + +impl From for UnicodePropertyMapV1<'static, T> { + fn from(cpt_data: EnumeratedPropertyCodePointTrie) -> UnicodePropertyMapV1<'static, T> { + let trie = CodePointTrie::::try_new( + // TODO + ); + } +} + +impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> + for EnumeratedPropertyCodePointTrieProvider +{ + fn load_payload( + &self, + req: &DataRequest, + ) -> Result>, DataError> { + // For data resource keys that represent the CodePointTrie data for an enumerated + // property, the ResourceKey sub-category string will just be the short alias + // for the property. 
+ let prop_name = &req.resource_path.key.sub_category; + + let toml_data: uprops_serde::enumerated::Main = self + .get_toml_data(prop_name) + .map_err(DataError::new_resc_error)?; + + let prop_enum: EnumeratedProperty = EnumeratedProperty::from(prop_name); + + let source_cpt_data: uprops_serde::enumerated::EnumeratedPropertyCodePointTrie = + toml_data.enum_property.data.code_point_trie; + + } +} \ No newline at end of file diff --git a/provider/uprops/src/lib.rs b/provider/uprops/src/lib.rs index 5664fe71faf..65a86b11e59 100644 --- a/provider/uprops/src/lib.rs +++ b/provider/uprops/src/lib.rs @@ -19,6 +19,7 @@ //! [`PropertiesDataProvider`]: binary::PropertiesDataProvider mod bin_uniset; +mod enum_codepointtrie; mod enum_uniset; mod error; mod uprops_serde; From 3a890f505cc037f4e1a00c3c3c862b6dbe0f5c4f Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Mon, 11 Oct 2021 18:44:51 -0700 Subject: [PATCH 08/28] More WIP code for CodePointTrie data provider implementation --- Cargo.lock | 2 + components/uniset/src/enum_props.rs | 4 +- provider/uprops/Cargo.toml | 1 + provider/uprops/src/enum_codepointtrie.rs | 66 ++++++++++++++++++----- utils/codepointtrie/Cargo.toml | 3 +- utils/codepointtrie/src/error.rs | 6 +-- 6 files changed, 64 insertions(+), 18 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a36ac1f4763..8c50a0e007a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1072,6 +1072,7 @@ dependencies = [ name = "icu_codepointtrie" version = "0.3.0" dependencies = [ + "displaydoc", "icu_provider", "postcard", "serde", @@ -1317,6 +1318,7 @@ dependencies = [ "icu_uniset", "serde", "toml", + "zerovec", ] [[package]] diff --git a/components/uniset/src/enum_props.rs b/components/uniset/src/enum_props.rs index 60d68bf3fbb..aadfa3a65b8 100644 --- a/components/uniset/src/enum_props.rs +++ b/components/uniset/src/enum_props.rs @@ -17,8 +17,8 @@ use tinystr::TinyStr16; pub enum EnumeratedProperty { GeneralCategory = 0x1005, Script = 0x100A, - ScriptExtensions = 0x7000, // 
TODO(#1160) - this is a Miscellaneous property, not Enumerated - InvalidCode = -1, // TODO(#1160) - taken from ICU4C UProperty::UCHAR_INVALID_CODE + ScriptExtensions = 0x7000, // TODO(#1160) - this is a Miscellaneous property, not Enumerated + InvalidCode = -1, // TODO(#1160) - taken from ICU4C UProperty::UCHAR_INVALID_CODE } impl From<&TinyStr16> for EnumeratedProperty { diff --git a/provider/uprops/Cargo.toml b/provider/uprops/Cargo.toml index b7b562d54a8..daec0ecdff0 100644 --- a/provider/uprops/Cargo.toml +++ b/provider/uprops/Cargo.toml @@ -33,6 +33,7 @@ icu_provider = { version = "0.3", path = "../../provider/core", features = ["pro icu_uniset = { version = "0.3", path = "../../components/uniset", features = ["provider_serde"] } serde = { version = "1.0", features = ["derive"] } toml = { version = "0.5" } +zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", "yoke"] } [dev-dependencies] icu_testdata = { version = "0.3", path = "../../provider/testdata" } diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 585b0ccfc04..44e82d30fcd 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -6,10 +6,13 @@ use crate::error::Error; use crate::uprops_serde; use crate::uprops_serde::enumerated::EnumeratedPropertyCodePointTrie; -use icu_codepointtrie::codepointtrie::{CodePointTrie, TrieValue}; +use icu_codepointtrie::codepointtrie::{CodePointTrie, CodePointTrieHeader, TrieType, TrieValue}; use icu_codepointtrie::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; use icu_provider::prelude::*; -use icu_uniset::enum_props::EnumeratedProperty; // TODO(#1160) - Refactor property definitions out of UnicodeSet +use icu_uniset::enum_props::EnumeratedProperty; // TODO(#1160) - Refactor property definitions out of UnicodeSet +use zerovec::ZeroVec; + +use core::convert::TryFrom; use std::fs; use std::path::PathBuf; @@ -31,15 +34,46 @@ impl 
EnumeratedPropertyCodePointTrieProvider { } } -impl From for UnicodePropertyMapV1<'static, T> { - fn from(cpt_data: EnumeratedPropertyCodePointTrie) -> UnicodePropertyMapV1<'static, T> { - let trie = CodePointTrie::::try_new( - // TODO - ); +impl TryFrom + for UnicodePropertyMapV1<'static, T> +{ + type Error = DataError; + + fn try_from( + cpt_data: EnumeratedPropertyCodePointTrie, + ) -> Result, DataError> { + let trie_type_enum: TrieType = + TrieType::try_from(cpt_data.trie_type_enum_val).map_err(DataError::new_resc_error)?; + let header = CodePointTrieHeader { + high_start: cpt_data.high_start, + shifted12_high_start: cpt_data.shifted12_high_start, + index3_null_offset: cpt_data.index3_null_offset, + data_null_offset: cpt_data.data_null_offset, + null_value: cpt_data.null_value, + trie_type: trie_type_enum, + }; + let index: ZeroVec = ZeroVec::from_slice(&cpt_data.index); + // TODO: make data have type ZeroVec + let data = if let Some(data_8) = cpt_data.data_8 { + ZeroVec::from_slice(data_8.as_slice()) + } else if let Some(data_16) = cpt_data.data_16 { + ZeroVec::from_slice(data_16.as_slice()) + } else if let Some(data_32) = cpt_data.data_32 { + ZeroVec::from_slice(data_32.as_slice()) + } else { + return Err(DataError::new_resc_error( + icu_codepointtrie::error::Error::FromDeserialized { + reason: "Cannot deserialize data array for CodePointTrie in TOML", + }, + )); + }; + let trie = + CodePointTrie::::try_new(header, index, data).map_err(DataError::new_resc_error); + trie.map(|t| UnicodePropertyMapV1 { codepoint_trie: t }) } } -impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> +impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> for EnumeratedPropertyCodePointTrieProvider { fn load_payload( @@ -52,13 +86,21 @@ impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> let prop_name = &req.resource_path.key.sub_category; let toml_data: uprops_serde::enumerated::Main = self - .get_toml_data(prop_name) - 
.map_err(DataError::new_resc_error)?; + .get_toml_data(prop_name) + .map_err(DataError::new_resc_error)?; let prop_enum: EnumeratedProperty = EnumeratedProperty::from(prop_name); - let source_cpt_data: uprops_serde::enumerated::EnumeratedPropertyCodePointTrie = + let source_cpt_data: uprops_serde::enumerated::EnumeratedPropertyCodePointTrie = toml_data.enum_property.data.code_point_trie; + let data_struct = UnicodePropertyMapV1::::try_from(source_cpt_data)?; + + Ok(DataResponse { + metadata: DataResponseMetadata { + data_langid: req.resource_path.options.langid.clone(), + }, + payload: Some(DataPayload::from_owned(data_struct)), + }) } -} \ No newline at end of file +} diff --git a/utils/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml index 18e1c553651..a49307a1457 100644 --- a/utils/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -32,6 +32,7 @@ denylist = ["bench"] all-features = true [dependencies] +displaydoc = { version = "0.2.3", default-features = false } icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } thiserror = "1.0" @@ -41,7 +42,7 @@ zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", postcard = { version = "0.7", features = ["alloc"] } toml = "0.5" serde = { version = "1.0", features = ["derive"] } -zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } +zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } # TODO: Remove? 
[lib] bench = false # This option is required for Benchmark CI diff --git a/utils/codepointtrie/src/error.rs b/utils/codepointtrie/src/error.rs index 55dddf0ca5b..5e8894c4549 100644 --- a/utils/codepointtrie/src/error.rs +++ b/utils/codepointtrie/src/error.rs @@ -2,10 +2,10 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use thiserror::Error; +use displaydoc::Display; -#[derive(Error, Debug, PartialEq)] +#[derive(Display, Debug, PartialEq)] pub enum Error { - #[error("Could not construct CodePointTrie from deserialized values: {reason}")] + #[displaydoc("Could not construct CodePointTrie from deserialized values: {reason}")] FromDeserialized { reason: &'static str }, } From 888edc56d34493a68c89ef6443881104690bb7b6 Mon Sep 17 00:00:00 2001 From: Manish Goregaokar Date: Tue, 12 Oct 2021 11:41:15 -0700 Subject: [PATCH 09/28] Fix error --- provider/uprops/src/enum_codepointtrie.rs | 29 +++++++++++++++++------ utils/codepointtrie/src/codepointtrie.rs | 12 +++++++++- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 44e82d30fcd..7b582665fd5 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -52,14 +52,27 @@ impl TryFrom = ZeroVec::from_slice(&cpt_data.index); + let index: ZeroVec = ZeroVec::clone_from_slice(&cpt_data.index); // TODO: make data have type ZeroVec - let data = if let Some(data_8) = cpt_data.data_8 { - ZeroVec::from_slice(data_8.as_slice()) + // + let data: Result, String> = if let Some(data_8) = cpt_data.data_8 { + data_8 + .iter() + .map(|i| *i as u32) + .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) + .collect() } else if let Some(data_16) = cpt_data.data_16 { - ZeroVec::from_slice(data_16.as_slice()) + data_16 + .iter() + .map(|i| *i as u32) + .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) 
+ .collect() } else if let Some(data_32) = cpt_data.data_32 { - ZeroVec::from_slice(data_32.as_slice()) + data_32 + .iter() + .map(|i| *i as u32) + .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) + .collect() } else { return Err(DataError::new_resc_error( icu_codepointtrie::error::Error::FromDeserialized { @@ -67,8 +80,10 @@ impl TryFrom::try_new(header, index, data).map_err(DataError::new_resc_error); + + let data = ZeroVec::Owned(data.map_err(DataError::new_resc_error)?); + let trie = CodePointTrie::::try_new(header, index, data) + .map_err(DataError::new_resc_error); trie.map(|t| UnicodePropertyMapV1 { codepoint_trie: t }) } } diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index 10144daf2d8..db586ebca19 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -5,7 +5,7 @@ use crate::error::Error; use crate::impl_const::*; -use core::convert::TryFrom; +use core::convert::{TryFrom, TryInto}; use icu_provider::yoke::ZeroCopyFrom; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; @@ -33,18 +33,28 @@ pub enum TrieType { /// This trait is used as a type parameter in constructing a `CodePointTrie`. 
pub trait TrieValue: Copy + zerovec::ule::AsULE + 'static { const DATA_GET_ERROR_VALUE: Self; + fn parse_from_u32(i: u32) -> Result; } impl TrieValue for u8 { const DATA_GET_ERROR_VALUE: u8 = u8::MAX; + fn parse_from_u32(i: u32) -> Result { + Self::try_from(i).map_err(|e| e.to_string()) + } } impl TrieValue for u16 { const DATA_GET_ERROR_VALUE: u16 = u16::MAX; + fn parse_from_u32(i: u32) -> Result { + Self::try_from(i).map_err(|e| e.to_string()) + } } impl TrieValue for u32 { const DATA_GET_ERROR_VALUE: u32 = u32::MAX; + fn parse_from_u32(i: u32) -> Result { + Self::try_from(i).map_err(|e| e.to_string()) + } } /// This struct represents a de-serialized CodePointTrie that was exported from From 1186512408dfaa5dc64de7c68e88f0bf93d01102 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 12 Oct 2021 14:36:31 -0700 Subject: [PATCH 10/28] Simplify constructing ZeroVec using ZV's new FromIterator impl --- provider/uprops/src/enum_codepointtrie.rs | 24 ++++++++--------------- utils/codepointtrie/src/codepointtrie.rs | 4 ++-- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 7b582665fd5..cf2b84542d0 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -9,7 +9,6 @@ use crate::uprops_serde::enumerated::EnumeratedPropertyCodePointTrie; use icu_codepointtrie::codepointtrie::{CodePointTrie, CodePointTrieHeader, TrieType, TrieValue}; use icu_codepointtrie::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; use icu_provider::prelude::*; -use icu_uniset::enum_props::EnumeratedProperty; // TODO(#1160) - Refactor property definitions out of UnicodeSet use zerovec::ZeroVec; use core::convert::TryFrom; @@ -22,7 +21,7 @@ pub struct EnumeratedPropertyCodePointTrieProvider { } impl EnumeratedPropertyCodePointTrieProvider { - pub fn new(root_dir: PathBuf) -> Self { + pub fn _new(root_dir: PathBuf) -> Self { 
EnumeratedPropertyCodePointTrieProvider { root_dir } } @@ -53,25 +52,20 @@ impl TryFrom = ZeroVec::clone_from_slice(&cpt_data.index); - // TODO: make data have type ZeroVec - // - let data: Result, String> = if let Some(data_8) = cpt_data.data_8 { + let data: Result, String> = if let Some(data_8) = cpt_data.data_8 { data_8 .iter() - .map(|i| *i as u32) - .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) + .map(|i| T::parse_from_u32(*i as u32)) .collect() } else if let Some(data_16) = cpt_data.data_16 { data_16 .iter() - .map(|i| *i as u32) - .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) + .map(|i| T::parse_from_u32(*i as u32)) .collect() } else if let Some(data_32) = cpt_data.data_32 { data_32 .iter() - .map(|i| *i as u32) - .map(|i| T::parse_from_u32(i).map(|i| i.as_unaligned())) + .map(|i| T::parse_from_u32(*i as u32)) .collect() } else { return Err(DataError::new_resc_error( @@ -81,9 +75,9 @@ impl TryFrom::try_new(header, index, data) - .map_err(DataError::new_resc_error); + let data = data.map_err(DataError::new_resc_error)?; + let trie = + CodePointTrie::::try_new(header, index, data).map_err(DataError::new_resc_error); trie.map(|t| UnicodePropertyMapV1 { codepoint_trie: t }) } } @@ -104,8 +98,6 @@ impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> .get_toml_data(prop_name) .map_err(DataError::new_resc_error)?; - let prop_enum: EnumeratedProperty = EnumeratedProperty::from(prop_name); - let source_cpt_data: uprops_serde::enumerated::EnumeratedPropertyCodePointTrie = toml_data.enum_property.data.code_point_trie; diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index db586ebca19..2b369f70c01 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -5,7 +5,7 @@ use crate::error::Error; use crate::impl_const::*; -use core::convert::{TryFrom, TryInto}; +use core::convert::TryFrom; use icu_provider::yoke::ZeroCopyFrom; #[cfg(feature 
= "serde")] use serde::{Deserialize, Serialize}; @@ -53,7 +53,7 @@ impl TrieValue for u16 { impl TrieValue for u32 { const DATA_GET_ERROR_VALUE: u32 = u32::MAX; fn parse_from_u32(i: u32) -> Result { - Self::try_from(i).map_err(|e| e.to_string()) + Ok(i) } } From 4abb8a444a6be009acf9437e56da3a18fa5d9c43 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 19 Oct 2021 15:19:02 -0700 Subject: [PATCH 11/28] Merge current snapshot of PR #1153 (refactor properties to separate crate) --- Cargo.lock | 17 +- Cargo.toml | 3 +- components/icu/Cargo.toml | 9 +- components/icu/examples/tui.rs | 2 +- components/icu/src/lib.rs | 54 +-- components/properties/Cargo.toml | 52 +++ components/{uniset => properties}/LICENSE | 0 components/properties/README.md | 18 + components/properties/src/lib.rs | 26 ++ .../enum_props.rs => properties/src/props.rs} | 10 +- .../{uniset => properties}/src/provider.rs | 4 +- .../src/props.rs => properties/src/sets.rs} | 271 +++++++------- components/{uniset => properties}/src/ule.rs | 3 +- provider/uprops/Cargo.toml | 3 +- provider/uprops/README.md | 1 - provider/uprops/src/bin_uniset.rs | 4 +- provider/uprops/src/enum_uniset.rs | 343 +++++++++--------- provider/uprops/src/lib.rs | 1 - provider/uprops/src/provider.rs | 51 +++ {components => utils}/uniset/Cargo.toml | 5 +- utils/uniset/LICENSE | 331 +++++++++++++++++ {components => utils}/uniset/README.md | 4 +- .../uniset/benches/inv_list.rs | 0 .../examples/unicode_bmp_blocks_selector.rs | 0 {components => utils}/uniset/src/builder.rs | 32 +- .../uniset/src/conversions.rs | 0 {components => utils}/uniset/src/lib.rs | 8 +- {components => utils}/uniset/src/uniset.rs | 38 +- {components => utils}/uniset/src/utils.rs | 0 29 files changed, 878 insertions(+), 412 deletions(-) create mode 100644 components/properties/Cargo.toml rename components/{uniset => properties}/LICENSE (100%) create mode 100644 components/properties/README.md create mode 100644 components/properties/src/lib.rs rename 
components/{uniset/src/enum_props.rs => properties/src/props.rs} (98%) rename components/{uniset => properties}/src/provider.rs (99%) rename components/{uniset/src/props.rs => properties/src/sets.rs} (76%) rename components/{uniset => properties}/src/ule.rs (97%) create mode 100644 provider/uprops/src/provider.rs rename {components => utils}/uniset/Cargo.toml (91%) create mode 100644 utils/uniset/LICENSE rename {components => utils}/uniset/README.md (94%) rename {components => utils}/uniset/benches/inv_list.rs (100%) rename {components => utils}/uniset/examples/unicode_bmp_blocks_selector.rs (100%) rename {components => utils}/uniset/src/builder.rs (97%) rename {components => utils}/uniset/src/conversions.rs (100%) rename {components => utils}/uniset/src/lib.rs (94%) rename {components => utils}/uniset/src/uniset.rs (97%) rename {components => utils}/uniset/src/utils.rs (100%) diff --git a/Cargo.lock b/Cargo.lock index 055723a4757..298ddab4996 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -986,6 +986,7 @@ dependencies = [ "icu_locid", "icu_locid_macros", "icu_plurals", + "icu_properties", "icu_provider", "icu_testdata", "icu_uniset", @@ -1214,6 +1215,19 @@ dependencies = [ "serde_json", ] +[[package]] +name = "icu_properties" +version = "0.3.0" +dependencies = [ + "icu", + "icu_provider", + "icu_uniset", + "num_enum", + "serde", + "tinystr", + "zerovec", +] + [[package]] name = "icu_provider" version = "0.3.0" @@ -1316,6 +1330,7 @@ version = "0.3.0" dependencies = [ "displaydoc", "icu_codepointtrie", + "icu_properties", "icu_provider", "icu_testdata", "icu_uniset", @@ -1369,11 +1384,9 @@ version = "0.3.0" dependencies = [ "criterion", "displaydoc", - "icu", "icu_benchmark_macros", "icu_provider", "litemap", - "num_enum", "postcard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 79d489d0a70..90dabec8c88 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,7 +14,7 @@ members = [ "components/locid", "components/locid/macros", "components/plurals", - 
"components/uniset", + "components/properties", "experimental/bies", "experimental/formatted_string_builder", "experimental/list_formatter", @@ -37,6 +37,7 @@ members = [ "utils/fixed_decimal", "utils/litemap", "utils/pattern", + "utils/uniset", "utils/writeable", "utils/yoke", "utils/yoke/derive", diff --git a/components/icu/Cargo.toml b/components/icu/Cargo.toml index 7874d4302fc..a23daf24fa4 100644 --- a/components/icu/Cargo.toml +++ b/components/icu/Cargo.toml @@ -66,9 +66,9 @@ version = "0.3" path = "../plurals" default-features = false -[dependencies.icu_uniset] +[dependencies.icu_properties] version = "0.3" -path = "../uniset" +path = "../../components/properties" default-features = false [dependencies.fixed_decimal] @@ -79,10 +79,11 @@ default-features = false [dev-dependencies] icu_provider = { version = "0.3", path = "../../provider/core" } icu_testdata = { version = "0.3", path = "../../provider/testdata" } +icu_uniset = { version = "0.3", path = "../../utils/uniset" } writeable = { version = "0.2", path = "../../utils/writeable" } [features] -std = ["icu_datetime/std", "icu_locid/std", "icu_plurals/std", "icu_uniset/std", "fixed_decimal/std"] +std = ["icu_datetime/std", "icu_locid/std", "icu_plurals/std", "icu_properties/std", "fixed_decimal/std"] default = ["provider_serde"] serde = [ "icu_locid/serde" @@ -92,5 +93,5 @@ provider_serde = [ "icu_decimal/provider_serde", "icu_locale_canonicalizer/provider_serde", "icu_plurals/provider_serde", - "icu_uniset/provider_serde", + "icu_properties/provider_serde", ] diff --git a/components/icu/examples/tui.rs b/components/icu/examples/tui.rs index 1ecdc8922f7..025b2ed3c80 100644 --- a/components/icu/examples/tui.rs +++ b/components/icu/examples/tui.rs @@ -10,8 +10,8 @@ use icu::datetime::DateTimeFormatOptions; use icu::locid::{macros::langid, Locale}; use icu::plurals::{PluralCategory, PluralRuleType, PluralRules}; -use icu::uniset::UnicodeSetBuilder; use icu_datetime::{mock::zoned_datetime::MockZonedDateTime, 
ZonedDateTimeFormat}; +use icu_uniset::UnicodeSetBuilder; use std::env; fn print>(_input: T) { diff --git a/components/icu/src/lib.rs b/components/icu/src/lib.rs index 14ff8c102d7..495166de96a 100644 --- a/components/icu/src/lib.rs +++ b/components/icu/src/lib.rs @@ -367,50 +367,18 @@ pub mod plurals { pub use icu_plurals::*; } -pub mod uniset { - //! Unicode Set operations +pub mod properties { + //! `icu_properties` is a utility crate of the [`ICU4X`] project. //! - //! This API provides necessary functionality for highly efficient querying of sets of Unicode characters. + //! This component provides definitions of [Unicode Properties] and APIs for + //! retrieving property data in an appropriate data structure. //! - //! It is an implementation of the existing [ICU4C UnicodeSet API](https://unicode-org.github.io/icu-docs/apidoc/released/icu4c/classicu_1_1UnicodeSet.html). + //! Currently, only binary property APIs are supported, with APIs that return + //! a [`UnicodeSet`]. See the [`sets`] module for more details. //! - //! # Architecture - //! ICU4X `UnicodeSet` is split up into independent levels, with [`UnicodeSet`] representing the membership/query API, - //! and [`UnicodeSetBuilder`] representing the builder API. A [Properties API](http://userguide.icu-project.org/strings/properties) - //! is in future works. - //! - //! # Examples: - //! - //! ## Creating a `UnicodeSet` - //! - //! UnicodeSets are created from either serialized UnicodeSets, - //! represented by [inversion lists](http://userguide.icu-project.org/strings/properties), - //! the [`UnicodeSetBuilder`], or from the TBA Properties API. - //! - //! ``` - //! use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; - //! - //! let mut builder = UnicodeSetBuilder::new(); - //! builder.add_range(&('A'..'Z')); - //! let set: UnicodeSet = builder.build(); - //! - //! assert!(set.contains('A')); - //! ``` - //! - //! ## Querying a `UnicodeSet` - //! - //! 
Currently, you can check if a character/range of characters exists in the UnicodeSet, or iterate through the characters. - //! - //! ``` - //! use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; - //! - //! let mut builder = UnicodeSetBuilder::new(); - //! builder.add_range(&('A'..'Z')); - //! let set: UnicodeSet = builder.build(); - //! - //! assert!(set.contains('A')); - //! assert!(set.contains_range(&('A'..='C'))); - //! assert_eq!(set.iter_chars().next(), Some('A')); - //! ``` - pub use icu_uniset::*; + //! [`ICU4X`]: ../icu/index.html + //! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html + //! [`UnicodeSet`]: ../../icu_uniset/struct.UnicodeSet.html + //! [`sets`]: sets + pub use icu_properties::*; } diff --git a/components/properties/Cargo.toml b/components/properties/Cargo.toml new file mode 100644 index 00000000000..e07a28102eb --- /dev/null +++ b/components/properties/Cargo.toml @@ -0,0 +1,52 @@ +# This file is part of ICU4X. For terms of use, please see the file +# called LICENSE at the top level of the ICU4X source tree +# (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +[package] +name = "icu_properties" +description = "Definitions for Unicode properties" +version = "0.3.0" +authors = ["The ICU4X Project Developers"] +edition = "2018" +readme = "README.md" +repository = "https://github.com/unicode-org/icu4x" +license-file = "LICENSE" +categories = ["internationalization"] +# Keep this in sync with other crates unless there are exceptions +include = [ + "src/**/*", + "examples/**/*", + "benches/**/*", + "tests/**/*", + "Cargo.toml", + "LICENSE", + "README.md" +] + +[package.metadata.cargo-all-features] +skip_optional_dependencies = true +# Bench feature gets tested separately and is only relevant for CI +denylist = ["bench"] + +[package.metadata.docs.rs] +all-features = true + +[dependencies] +icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } +icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["serde"] } +num_enum = { version = "0.5.4", default-features = false } +serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } +tinystr = { version = "0.4.10", features = ["alloc", "serde"], default-features = false } +zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } + +[dev-dependencies] +icu = { path = "../../components/icu", default-features = false } + +[lib] +bench = false # This option is required for Benchmark CI +path = "src/lib.rs" + +[features] +std = ["icu_provider/std"] +default = ["provider_serde"] +provider_serde = ["serde"] diff --git a/components/uniset/LICENSE b/components/properties/LICENSE similarity index 100% rename from components/uniset/LICENSE rename to components/properties/LICENSE diff --git a/components/properties/README.md b/components/properties/README.md new file mode 100644 index 00000000000..7b57f0d79af --- /dev/null +++ b/components/properties/README.md @@ -0,0 +1,18 @@ +# icu_properties 
[![crates.io](http://meritbadge.herokuapp.com/icu_properties)](https://crates.io/crates/icu_properties) + +`icu_properties` is a utility crate of the [`ICU4X`] project. + +This component provides definitions of [Unicode Properties] and APIs for +retrieving property data in an appropriate data structure. + +Currently, only binary property APIs are supported, with APIs that return +a [`UnicodeSet`]. See the [`sets`] module for more details. + +[`ICU4X`]: ../icu/index.html +[Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html +[`UnicodeSet`]: ../icu_uniset/struct.UnicodeSet.html +[`sets`]: crate::sets + +## More Information + +For more information on development, authorship, contributing etc. please visit [`ICU4X home page`](https://github.com/unicode-org/icu4x). diff --git a/components/properties/src/lib.rs b/components/properties/src/lib.rs new file mode 100644 index 00000000000..175a965cb81 --- /dev/null +++ b/components/properties/src/lib.rs @@ -0,0 +1,26 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +//! `icu_properties` is a utility crate of the [`ICU4X`] project. +//! +//! This component provides definitions of [Unicode Properties] and APIs for +//! retrieving property data in an appropriate data structure. +//! +//! Currently, only binary property APIs are supported, with APIs that return +//! a [`UnicodeSet`]. See the [`sets`] module for more details. +//! +//! [`ICU4X`]: ../icu/index.html +//! [Unicode Properties]: https://unicode-org.github.io/icu/userguide/strings/properties.html +//! [`UnicodeSet`]: ../icu_uniset/struct.UnicodeSet.html +//! 
[`sets`]: crate::sets + +#![no_std] + +mod props; +#[allow(unused)] +pub mod provider; +pub mod sets; +mod ule; + +pub use props::*; diff --git a/components/uniset/src/enum_props.rs b/components/properties/src/props.rs similarity index 98% rename from components/uniset/src/enum_props.rs rename to components/properties/src/props.rs index 05d4b7f4501..53b55b217be 100644 --- a/components/uniset/src/enum_props.rs +++ b/components/properties/src/props.rs @@ -9,7 +9,7 @@ use tinystr::TinyStr16; /// Selection constants for Unicode properties. /// These constants are used to select one of the Unicode properties. -/// See UProperty in ICU4C. +/// See `UProperty` in ICU4C. #[derive(Clone, PartialEq, Debug, TryFromPrimitive)] #[non_exhaustive] #[repr(i32)] @@ -36,8 +36,8 @@ impl From<&TinyStr16> for EnumeratedProperty { } /// Enumerated Unicode general category types. -/// GeneralSubcategory only supports specific subcategories (eg UppercaseLetter). -/// It does not support grouped categories (eg Letter). For grouped categories, use GeneralCategory. +/// GeneralSubcategory only supports specific subcategories (eg `UppercaseLetter`). +/// It does not support grouped categories (eg `Letter`). For grouped categories, use [`GeneralCategory`]. #[derive(Copy, Clone, PartialEq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] #[repr(u8)] pub enum GeneralSubcategory { @@ -114,8 +114,8 @@ pub enum GeneralSubcategory { /// The discriminants correspond to the U_GC_XX_MASK constants in ICU4C. /// Unlike GeneralSubcategory, this supports groups of general categories: for example, `Letter` /// is the union of `UppercaseLetter`, `LowercaseLetter`, etc... -/// See https://www.unicode.org/reports/tr44/ . -/// See UCharCategory and U_GET_GC_MASK in ICU4C. +/// See <https://www.unicode.org/reports/tr44/> . +/// See `UCharCategory` and `U_GET_GC_MASK` in ICU4C.
#[derive(Copy, Clone, PartialEq, Debug, Eq)] #[repr(transparent)] pub struct GeneralCategory(pub(crate) u32); diff --git a/components/uniset/src/provider.rs b/components/properties/src/provider.rs similarity index 99% rename from components/uniset/src/provider.rs rename to components/properties/src/provider.rs index 34a2f82bb4f..7a4dc93a125 100644 --- a/components/uniset/src/provider.rs +++ b/components/properties/src/provider.rs @@ -6,9 +6,9 @@ //! //! Read more about data providers: [`icu_provider`] -use crate::builder::UnicodeSetBuilder; -use crate::uniset::UnicodeSet; use icu_provider::yoke::{self, *}; +use icu_uniset::UnicodeSet; +use icu_uniset::UnicodeSetBuilder; // // resource key structs - the structs used directly by users of data provider diff --git a/components/uniset/src/props.rs b/components/properties/src/sets.rs similarity index 76% rename from components/uniset/src/props.rs rename to components/properties/src/sets.rs index 7e4c5fbd082..d22ba9b877a 100644 --- a/components/uniset/src/props.rs +++ b/components/properties/src/sets.rs @@ -10,18 +10,19 @@ //! documentation for Unicode regular expressions. In particular, Annex C of this document //! defines properties for POSIX compatibility. //! +//! [`UnicodeSet`]: icu_uniset::UnicodeSet //! [`TR44`]: https://www.unicode.org/reports/tr44 //! 
[`TR18`]: https://www.unicode.org/reports/tr18 -use crate::enum_props::*; use crate::provider::*; -use crate::UnicodeSetError; +use crate::*; use icu_provider::prelude::*; +use icu_uniset::UnicodeSetError; type UnisetResult<'data> = Result, UnicodeSetError>; // helper fn -fn get_prop<'data, D>(provider: &D, resc_key: ResourceKey) -> UnisetResult<'data> +fn get_uniset<'data, D>(provider: &D, resc_key: ResourceKey) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { @@ -46,561 +47,561 @@ where // /// ASCII characters commonly used for the representation of hexadecimal numbers -pub fn get_ascii_hex_digit_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_ascii_hex_digit<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::ASCII_HEX_DIGIT_V1) + get_uniset(provider, key::ASCII_HEX_DIGIT_V1) } /// Characters with the Alphabetic or Decimal_Number property /// This is defined for POSIX compatibility. 
-pub fn get_alnum_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_alnum<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::ALNUM_V1) + get_uniset(provider, key::ALNUM_V1) } /// Alphabetic characters -pub fn get_alphabetic_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_alphabetic<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::ALPHABETIC_V1) + get_uniset(provider, key::ALPHABETIC_V1) } /// Format control characters which have specific functions in the Unicode Bidirectional /// Algorithm -pub fn get_bidi_control_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_bidi_control<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::BIDI_CONTROL_V1) + get_uniset(provider, key::BIDI_CONTROL_V1) } /// Characters that are mirrored in bidirectional text -pub fn get_bidi_mirrored_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_bidi_mirrored<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::BIDI_MIRRORED_V1) + get_uniset(provider, key::BIDI_MIRRORED_V1) } /// Horizontal whitespace characters -pub fn get_blank_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_blank<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::BLANK_V1) + get_uniset(provider, key::BLANK_V1) } /// Uppercase, lowercase, and titlecase characters -pub fn get_cased_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_cased<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CASED_V1) + 
get_uniset(provider, key::CASED_V1) } /// Characters which are ignored for casing purposes -pub fn get_case_ignorable_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_case_ignorable<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CASE_IGNORABLE_V1) + get_uniset(provider, key::CASE_IGNORABLE_V1) } /// Characters that are excluded from composition /// See -pub fn get_full_composition_exclusion_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_full_composition_exclusion<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::FULL_COMPOSITION_EXCLUSION_V1) + get_uniset(provider, key::FULL_COMPOSITION_EXCLUSION_V1) } /// Characters whose normalized forms are not stable under case folding -pub fn get_changes_when_casefolded_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_casefolded<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CHANGES_WHEN_CASEFOLDED_V1) + get_uniset(provider, key::CHANGES_WHEN_CASEFOLDED_V1) } /// Characters which may change when they undergo case mapping -pub fn get_changes_when_casemapped_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_casemapped<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CHANGES_WHEN_CASEMAPPED_V1) + get_uniset(provider, key::CHANGES_WHEN_CASEMAPPED_V1) } /// Characters which are not identical to their NFKC_Casefold mapping -pub fn get_changes_when_nfkc_casefolded_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_nfkc_casefolded<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - 
get_prop(provider, key::CHANGES_WHEN_NFKC_CASEFOLDED_V1) + get_uniset(provider, key::CHANGES_WHEN_NFKC_CASEFOLDED_V1) } /// Characters whose normalized forms are not stable under a toLowercase mapping -pub fn get_changes_when_lowercased_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_lowercased<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CHANGES_WHEN_LOWERCASED_V1) + get_uniset(provider, key::CHANGES_WHEN_LOWERCASED_V1) } /// Characters whose normalized forms are not stable under a toTitlecase mapping -pub fn get_changes_when_titlecased_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_titlecased<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CHANGES_WHEN_TITLECASED_V1) + get_uniset(provider, key::CHANGES_WHEN_TITLECASED_V1) } /// Characters whose normalized forms are not stable under a toUppercase mapping -pub fn get_changes_when_uppercased_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_changes_when_uppercased<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CHANGES_WHEN_UPPERCASED_V1) + get_uniset(provider, key::CHANGES_WHEN_UPPERCASED_V1) } /// Punctuation characters explicitly called out as dashes in the Unicode Standard, plus /// their compatibility equivalents -pub fn get_dash_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_dash<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::DASH_V1) + get_uniset(provider, key::DASH_V1) } /// Deprecated characters. No characters will ever be removed from the standard, but the /// usage of deprecated characters is strongly discouraged. 
-pub fn get_deprecated_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_deprecated<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::DEPRECATED_V1) + get_uniset(provider, key::DEPRECATED_V1) } /// For programmatic determination of default ignorable code points. New characters that /// should be ignored in rendering (unless explicitly supported) will be assigned in these /// ranges, permitting programs to correctly handle the default rendering of such /// characters when not otherwise supported. -pub fn get_default_ignorable_code_point_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_default_ignorable_code_point<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::DEFAULT_IGNORABLE_CODE_POINT_V1) + get_uniset(provider, key::DEFAULT_IGNORABLE_CODE_POINT_V1) } /// Characters that linguistically modify the meaning of another character to which they apply -pub fn get_diacritic_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_diacritic<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::DIACRITIC_V1) + get_uniset(provider, key::DIACRITIC_V1) } /// Characters that can serve as a base for emoji modifiers -pub fn get_emoji_modifier_base_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_emoji_modifier_base<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EMOJI_MODIFIER_BASE_V1) + get_uniset(provider, key::EMOJI_MODIFIER_BASE_V1) } /// Characters used in emoji sequences that normally do not appear on emoji keyboards as /// separate choices, such as base characters for emoji keycaps -pub fn get_emoji_component_property<'data, D>(provider: &D) -> 
UnisetResult<'data> +pub fn get_emoji_component<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EMOJI_COMPONENT_V1) + get_uniset(provider, key::EMOJI_COMPONENT_V1) } /// Characters that are emoji modifiers -pub fn get_emoji_modifier_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_emoji_modifier<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EMOJI_MODIFIER_V1) + get_uniset(provider, key::EMOJI_MODIFIER_V1) } /// Characters that are emoji -pub fn get_emoji_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_emoji<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EMOJI_V1) + get_uniset(provider, key::EMOJI_V1) } /// Characters that have emoji presentation by default -pub fn get_emoji_presentation_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_emoji_presentation<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EMOJI_PRESENTATION_V1) + get_uniset(provider, key::EMOJI_PRESENTATION_V1) } /// Characters whose principal function is to extend the value of a preceding alphabetic /// character or to extend the shape of adjacent characters. 
-pub fn get_extender_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_extender<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EXTENDER_V1) + get_uniset(provider, key::EXTENDER_V1) } /// Pictographic symbols, as well as reserved ranges in blocks largely associated with /// emoji characters -pub fn get_extended_pictographic_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_extended_pictographic<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::EXTENDED_PICTOGRAPHIC_V1) + get_uniset(provider, key::EXTENDED_PICTOGRAPHIC_V1) } /// Visible characters. /// This is defined for POSIX compatibility. -pub fn get_graph_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_graph<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::GRAPH_V1) + get_uniset(provider, key::GRAPH_V1) } /// Property used together with the definition of Standard Korean Syllable Block to define /// "Grapheme base". See D58 in Chapter 3, Conformance in the Unicode Standard. -pub fn get_grapheme_base_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_grapheme_base<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::GRAPHEME_BASE_V1) + get_uniset(provider, key::GRAPHEME_BASE_V1) } /// Property used to define "Grapheme extender". See D59 in Chapter 3, Conformance in the /// Unicode Standard. 
-pub fn get_grapheme_extend_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_grapheme_extend<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::GRAPHEME_EXTEND_V1) + get_uniset(provider, key::GRAPHEME_EXTEND_V1) } /// Deprecated property. Formerly proposed for programmatic determination of grapheme /// cluster boundaries. -pub fn get_grapheme_link_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_grapheme_link<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::GRAPHEME_LINK_V1) + get_uniset(provider, key::GRAPHEME_LINK_V1) } /// Characters commonly used for the representation of hexadecimal numbers, plus their /// compatibility equivalents -pub fn get_hex_digit_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_hex_digit<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::HEX_DIGIT_V1) + get_uniset(provider, key::HEX_DIGIT_V1) } /// Deprecated property. Dashes which are used to mark connections between pieces of /// words, plus the Katakana middle dot. -pub fn get_hyphen_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_hyphen<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::HYPHEN_V1) + get_uniset(provider, key::HYPHEN_V1) } /// Characters that can come after the first character in an identifier. If using NFKC to /// fold differences between characters, use [`get_xid_continue_property`] instead. See /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for /// more details. 
-pub fn get_id_continue_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_id_continue<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::ID_CONTINUE_V1) + get_uniset(provider, key::ID_CONTINUE_V1) } /// Characters considered to be CJKV (Chinese, Japanese, Korean, and Vietnamese) /// ideographs, or related siniform ideographs -pub fn get_ideographic_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_ideographic<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::IDEOGRAPHIC_V1) + get_uniset(provider, key::IDEOGRAPHIC_V1) } /// Characters that can begin an identifier. If using NFKC to fold differences between /// characters, use [`get_xid_start_property`] instead. See [`Unicode Standard Annex /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details. -pub fn get_id_start_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_id_start<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::ID_START_V1) + get_uniset(provider, key::ID_START_V1) } /// Characters used in Ideographic Description Sequences -pub fn get_ids_binary_operator_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_ids_binary_operator<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::IDS_BINARY_OPERATOR_V1) + get_uniset(provider, key::IDS_BINARY_OPERATOR_V1) } /// Characters used in Ideographic Description Sequences -pub fn get_ids_trinary_operator_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_ids_trinary_operator<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, 
key::IDS_TRINARY_OPERATOR_V1) + get_uniset(provider, key::IDS_TRINARY_OPERATOR_V1) } /// Format control characters which have specific functions for control of cursive joining /// and ligation -pub fn get_join_control_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_join_control<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::JOIN_CONTROL_V1) + get_uniset(provider, key::JOIN_CONTROL_V1) } /// A small number of spacing vowel letters occurring in certain Southeast Asian scripts such as Thai and Lao -pub fn get_logical_order_exception_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_logical_order_exception<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::LOGICAL_ORDER_EXCEPTION_V1) + get_uniset(provider, key::LOGICAL_ORDER_EXCEPTION_V1) } /// Lowercase characters -pub fn get_lowercase_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_lowercase<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::LOWERCASE_V1) + get_uniset(provider, key::LOWERCASE_V1) } /// Characters used in mathematical notation -pub fn get_math_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_math<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::MATH_V1) + get_uniset(provider, key::MATH_V1) } /// Code points permanently reserved for internal use -pub fn get_noncharacter_code_point_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_noncharacter_code_point<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::NONCHARACTER_CODE_POINT_V1) + get_uniset(provider, 
key::NONCHARACTER_CODE_POINT_V1) } /// Characters that are inert under NFC, i.e., they do not interact with adjacent characters -pub fn get_nfc_inert_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_nfc_inert<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::NFC_INERT_V1) + get_uniset(provider, key::NFC_INERT_V1) } /// Characters that are inert under NFD, i.e., they do not interact with adjacent characters -pub fn get_nfd_inert_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_nfd_inert<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::NFD_INERT_V1) + get_uniset(provider, key::NFD_INERT_V1) } /// Characters that are inert under NFKC, i.e., they do not interact with adjacent characters -pub fn get_nfkc_inert_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_nfkc_inert<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::NFKC_INERT_V1) + get_uniset(provider, key::NFKC_INERT_V1) } /// Characters that are inert under NFKD, i.e., they do not interact with adjacent characters -pub fn get_nfkd_inert_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_nfkd_inert<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::NFKD_INERT_V1) + get_uniset(provider, key::NFKD_INERT_V1) } /// Characters used as syntax in patterns (such as regular expressions). See [`Unicode /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more /// details. 
-pub fn get_pattern_syntax_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_pattern_syntax<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::PATTERN_SYNTAX_V1) + get_uniset(provider, key::PATTERN_SYNTAX_V1) } /// Characters used as whitespace in patterns (such as regular expressions). See /// [`Unicode Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for /// more details. -pub fn get_pattern_white_space_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_pattern_white_space<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::PATTERN_WHITE_SPACE_V1) + get_uniset(provider, key::PATTERN_WHITE_SPACE_V1) } /// A small class of visible format controls, which precede and then span a sequence of /// other characters, usually digits. -pub fn get_prepended_concatenation_mark_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_prepended_concatenation_mark<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::PREPENDED_CONCATENATION_MARK_V1) + get_uniset(provider, key::PREPENDED_CONCATENATION_MARK_V1) } /// Printable characters (visible characters and whitespace). /// This is defined for POSIX compatibility. -pub fn get_print_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_print<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::PRINT_V1) + get_uniset(provider, key::PRINT_V1) } /// Punctuation characters that function as quotation marks. 
-pub fn get_quotation_mark_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_quotation_mark<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::QUOTATION_MARK_V1) + get_uniset(provider, key::QUOTATION_MARK_V1) } /// Characters used in the definition of Ideographic Description Sequences -pub fn get_radical_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_radical<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::RADICAL_V1) + get_uniset(provider, key::RADICAL_V1) } /// Regional indicator characters, U+1F1E6..U+1F1FF -pub fn get_regional_indicator_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_regional_indicator<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::REGIONAL_INDICATOR_V1) + get_uniset(provider, key::REGIONAL_INDICATOR_V1) } /// Characters with a "soft dot", like i or j. An accent placed on these characters causes /// the dot to disappear. 
-pub fn get_soft_dotted_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_soft_dotted<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::SOFT_DOTTED_V1) + get_uniset(provider, key::SOFT_DOTTED_V1) } /// Characters that are starters in terms of Unicode normalization and combining character /// sequences -pub fn get_segment_starter_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_segment_starter<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::SEGMENT_STARTER_V1) + get_uniset(provider, key::SEGMENT_STARTER_V1) } /// Characters that are either the source of a case mapping or in the target of a case /// mapping -pub fn get_case_sensitive_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_case_sensitive<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::CASE_SENSITIVE_V1) + get_uniset(provider, key::CASE_SENSITIVE_V1) } /// Punctuation characters that generally mark the end of sentences -pub fn get_sentence_terminal_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_sentence_terminal<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::SENTENCE_TERMINAL_V1) + get_uniset(provider, key::SENTENCE_TERMINAL_V1) } /// Punctuation characters that generally mark the end of textual units -pub fn get_terminal_punctuation_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_terminal_punctuation<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::TERMINAL_PUNCTUATION_V1) + get_uniset(provider, key::TERMINAL_PUNCTUATION_V1) } /// A property which specifies the exact 
set of Unified CJK Ideographs in the standard -pub fn get_unified_ideograph_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_unified_ideograph<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::UNIFIED_IDEOGRAPH_V1) + get_uniset(provider, key::UNIFIED_IDEOGRAPH_V1) } /// Uppercase characters -pub fn get_uppercase_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_uppercase<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::UPPERCASE_V1) + get_uniset(provider, key::UPPERCASE_V1) } /// Characters that are Variation Selectors. -pub fn get_variation_selector_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_variation_selector<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::VARIATION_SELECTOR_V1) + get_uniset(provider, key::VARIATION_SELECTOR_V1) } /// Spaces, separator characters and other control characters which should be treated by /// programming languages as "white space" for the purpose of parsing elements -pub fn get_white_space_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_white_space<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::WHITE_SPACE_V1) + get_uniset(provider, key::WHITE_SPACE_V1) } /// Hexadecimal digits /// This is defined for POSIX compatibility. -pub fn get_xdigit_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_xdigit<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::XDIGIT_V1) + get_uniset(provider, key::XDIGIT_V1) } /// Characters that can begin an identifier. 
See [`Unicode Standard Annex /// #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more details. -pub fn get_xid_continue_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_xid_continue<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::XID_CONTINUE_V1) + get_uniset(provider, key::XID_CONTINUE_V1) } /// Characters that can come after the first character in an identifier. See [`Unicode /// Standard Annex #31`](https://www.unicode.org/reports/tr31/tr31-35.html) for more /// details. -pub fn get_xid_start_property<'data, D>(provider: &D) -> UnisetResult<'data> +pub fn get_xid_start<'data, D>(provider: &D) -> UnisetResult<'data> where D: DataProvider<'data, UnicodePropertyV1Marker> + ?Sized, { - get_prop(provider, key::XID_START_V1) + get_uniset(provider, key::XID_START_V1) } // @@ -658,7 +659,7 @@ where GeneralCategory::SpaceSeparator => key::GENERAL_CATEGORY_SPACE_SEPARATOR_V1, _ => return Err(UnicodeSetError::UnknownGeneralCategorySet(enum_val.0)), }; - get_prop(provider, key) + get_uniset(provider, key) } /// Return a [`UnicodeSet`] for a particular value of the Script Unicode enumerated property @@ -832,5 +833,5 @@ where Script::ZanabazarSquare => key::SCRIPT_ZANABAZAR_SQUARE_V1, _ => return Err(UnicodeSetError::UnknownScriptId(enum_val.0)), }; - get_prop(provider, key) + get_uniset(provider, key) } diff --git a/components/uniset/src/ule.rs b/components/properties/src/ule.rs similarity index 97% rename from components/uniset/src/ule.rs rename to components/properties/src/ule.rs index e4c62f0739c..73057d7b0e6 100644 --- a/components/uniset/src/ule.rs +++ b/components/properties/src/ule.rs @@ -2,7 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
-use crate::enum_props::{GeneralSubcategory, Script}; +use crate::{GeneralSubcategory, Script}; + use core::convert::TryFrom; use num_enum::TryFromPrimitiveError; use zerovec::ule::{AsULE, PlainOldULE, ULE}; diff --git a/provider/uprops/Cargo.toml b/provider/uprops/Cargo.toml index daec0ecdff0..36b33f1f479 100644 --- a/provider/uprops/Cargo.toml +++ b/provider/uprops/Cargo.toml @@ -29,8 +29,9 @@ all-features = true [dependencies] displaydoc = { version = "0.2.3", default-features = false } icu_codepointtrie = { version = "0.3", path = "../../utils/codepointtrie", features = ["serde"] } +icu_properties = { version = "0.3", path = "../../components/properties", features = ["provider_serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["provider_serde"] } -icu_uniset = { version = "0.3", path = "../../components/uniset", features = ["provider_serde"] } +icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["provider_serde"] } serde = { version = "1.0", features = ["derive"] } toml = { version = "0.5" } zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", "yoke"] } diff --git a/provider/uprops/README.md b/provider/uprops/README.md index fa1de03fee6..37a3db968a5 100644 --- a/provider/uprops/README.md +++ b/provider/uprops/README.md @@ -14,7 +14,6 @@ for production use. 
It is much more efficient if you use [`DataProvider`]: icu_provider::prelude::DataProvider [`FsDataProvider`]: ../icu_provider_fs/struct.FsDataProvider.html [`StaticDataProvider`]: ../icu_provider_blob/struct.StaticDataProvider.html -[`PropertiesDataProvider`]: binary::PropertiesDataProvider ## More Information diff --git a/provider/uprops/src/bin_uniset.rs b/provider/uprops/src/bin_uniset.rs index 74262428a34..952df2799b7 100644 --- a/provider/uprops/src/bin_uniset.rs +++ b/provider/uprops/src/bin_uniset.rs @@ -4,9 +4,10 @@ use crate::error::Error; use crate::uprops_serde; +use icu_properties::provider::UnicodePropertyV1; +use icu_properties::provider::UnicodePropertyV1Marker; use icu_provider::iter::IterableDataProviderCore; use icu_provider::prelude::*; -use icu_uniset::provider::*; use icu_uniset::UnicodeSetBuilder; use std::fs; use std::path::PathBuf; @@ -70,6 +71,7 @@ impl IterableDataProviderCore for BinaryPropertyUnicodeSetDataProvider { #[test] fn test_basic() { + use icu_properties::provider::key; use icu_uniset::UnicodeSet; use std::convert::TryInto; diff --git a/provider/uprops/src/enum_uniset.rs b/provider/uprops/src/enum_uniset.rs index b410c81e375..c4c77e4e6bd 100644 --- a/provider/uprops/src/enum_uniset.rs +++ b/provider/uprops/src/enum_uniset.rs @@ -4,9 +4,10 @@ use crate::error::Error; use crate::uprops_serde; +use icu_properties::provider::UnicodePropertyV1; +use icu_properties::provider::UnicodePropertyV1Marker; use icu_provider::iter::IterableDataProviderCore; use icu_provider::prelude::*; -use icu_uniset::provider::*; use icu_uniset::UnicodeSetBuilder; use std::fs; use std::path::PathBuf; @@ -120,199 +121,207 @@ impl IterableDataProviderCore for EnumeratedPropertyUnicodeSetDataProvider { } } -#[test] -fn test_general_category() { - use icu_uniset::UnicodeSet; - use std::convert::TryInto; +#[cfg(test)] +mod tests { + use super::*; + use icu_properties::provider::key; - let root_dir = icu_testdata::paths::data_root().join("uprops"); - let 
provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); + #[test] + fn test_general_category() { + use icu_uniset::UnicodeSet; + use std::convert::TryInto; - let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider - .load_payload(&DataRequest { - resource_path: ResourcePath { - key: key::GENERAL_CATEGORY_NUMBER_V1, - options: ResourceOptions::default(), - }, - }) - .expect("The data should be valid") - .take_payload() - .expect("Loading was successful"); - - let digits: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); - - assert!(digits.contains('5')); - assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE - assert!(digits.contains('\u{096b}')); // U+0969 DEVANAGARI DIGIT FIVE - - assert!(!digits.contains('A')); -} - -#[test] -fn test_script() { - use icu_uniset::UnicodeSet; - use std::convert::TryInto; - - let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); - let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider - .load_payload(&DataRequest { - resource_path: ResourcePath { - key: key::SCRIPT_THAI_V1, - options: ResourceOptions::default(), - }, - }) - .expect("The data should be valid") - .take_payload() - .expect("Loading was successful"); + let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: key::GENERAL_CATEGORY_NUMBER_V1, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be valid") + .take_payload() + .expect("Loading was successful"); - let thai: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); + let digits: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); - assert!(thai.contains('\u{0e01}')); // U+0E01 
THAI CHARACTER KO KAI - assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO + assert!(digits.contains('5')); + assert!(digits.contains('\u{0665}')); // U+0665 ARABIC-INDIC DIGIT FIVE + assert!(digits.contains('\u{096b}')); // U+0969 DEVANAGARI DIGIT FIVE - assert!(!thai.contains('A')); - assert!(!thai.contains('\u{0e3f}')); // U+0E50 THAI CURRENCY SYMBOL BAHT -} + assert!(!digits.contains('A')); + } -#[test] -fn test_gc_groupings() { - use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; - use std::convert::TryInto; + #[test] + fn test_script() { + use icu_uniset::UnicodeSet; + use std::convert::TryInto; - fn get_uniset_payload<'data>(key: ResourceKey) -> DataPayload<'data, UnicodePropertyV1Marker> { let root_dir = icu_testdata::paths::data_root().join("uprops"); let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); + let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { resource_path: ResourcePath { - key, + key: key::SCRIPT_THAI_V1, options: ResourceOptions::default(), }, }) .expect("The data should be valid") .take_payload() .expect("Loading was successful"); - payload + + let thai: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); + + assert!(thai.contains('\u{0e01}')); // U+0E01 THAI CHARACTER KO KAI + assert!(thai.contains('\u{0e50}')); // U+0E50 THAI DIGIT ZERO + + assert!(!thai.contains('A')); + assert!(!thai.contains('\u{0e3f}')); // U+0E50 THAI CURRENCY SYMBOL BAHT } - let test_group = |category: ResourceKey, subcategories: &[ResourceKey]| { - let category_set_payload = get_uniset_payload(category); - let category_set: UnicodeSet = category_set_payload - .get() - .clone() - .try_into() - .expect("Valid unicode set"); - let mut builder = UnicodeSetBuilder::new(); - for subcategory in subcategories { - builder.add_set( - &get_uniset_payload(*subcategory) - .get() - .clone() - .try_into() - .expect("Valid unicode set"), - ); + #[test] + fn test_gc_groupings() { + use 
icu_uniset::{UnicodeSet, UnicodeSetBuilder}; + use std::convert::TryInto; + + fn get_uniset_payload<'data>( + key: ResourceKey, + ) -> DataPayload<'data, UnicodePropertyV1Marker> { + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); + let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be valid") + .take_payload() + .expect("Loading was successful"); + payload } - let combined_set = builder.build(); - println!("{:?} {:?}", category, subcategories); - assert_eq!( - category_set.get_inversion_list(), - combined_set.get_inversion_list() + + let test_group = |category: ResourceKey, subcategories: &[ResourceKey]| { + let category_set_payload = get_uniset_payload(category); + let category_set: UnicodeSet = category_set_payload + .get() + .clone() + .try_into() + .expect("Valid unicode set"); + let mut builder = UnicodeSetBuilder::new(); + for subcategory in subcategories { + builder.add_set( + &get_uniset_payload(*subcategory) + .get() + .clone() + .try_into() + .expect("Valid unicode set"), + ); + } + let combined_set = builder.build(); + println!("{:?} {:?}", category, subcategories); + assert_eq!( + category_set.get_inversion_list(), + combined_set.get_inversion_list() + ); + }; + + test_group( + key::GENERAL_CATEGORY_LETTER_V1, + &[ + key::GENERAL_CATEGORY_UPPERCASE_LETTER_V1, + key::GENERAL_CATEGORY_LOWERCASE_LETTER_V1, + key::GENERAL_CATEGORY_TITLECASE_LETTER_V1, + key::GENERAL_CATEGORY_MODIFIER_LETTER_V1, + key::GENERAL_CATEGORY_OTHER_LETTER_V1, + ], ); - }; - - test_group( - key::GENERAL_CATEGORY_LETTER_V1, - &[ - key::GENERAL_CATEGORY_UPPERCASE_LETTER_V1, - key::GENERAL_CATEGORY_LOWERCASE_LETTER_V1, - key::GENERAL_CATEGORY_TITLECASE_LETTER_V1, - key::GENERAL_CATEGORY_MODIFIER_LETTER_V1, - 
key::GENERAL_CATEGORY_OTHER_LETTER_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_OTHER_V1, - &[ - key::GENERAL_CATEGORY_CONTROL_V1, - key::GENERAL_CATEGORY_FORMAT_V1, - key::GENERAL_CATEGORY_UNASSIGNED_V1, - key::GENERAL_CATEGORY_PRIVATE_USE_V1, - key::GENERAL_CATEGORY_SURROGATE_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_MARK_V1, - &[ - key::GENERAL_CATEGORY_SPACING_MARK_V1, - key::GENERAL_CATEGORY_ENCLOSING_MARK_V1, - key::GENERAL_CATEGORY_NONSPACING_MARK_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_NUMBER_V1, - &[ - key::GENERAL_CATEGORY_DIGIT_V1, - key::GENERAL_CATEGORY_LETTER_NUMBER_V1, - key::GENERAL_CATEGORY_OTHER_NUMBER_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_PUNCTUATION_V1, - &[ - key::GENERAL_CATEGORY_CONNECTOR_PUNCTUATION_V1, - key::GENERAL_CATEGORY_DASH_PUNCTUATION_V1, - key::GENERAL_CATEGORY_CLOSE_PUNCTUATION_V1, - key::GENERAL_CATEGORY_FINAL_PUNCTUATION_V1, - key::GENERAL_CATEGORY_INITIAL_PUNCTUATION_V1, - key::GENERAL_CATEGORY_OTHER_PUNCTUATION_V1, - key::GENERAL_CATEGORY_OPEN_PUNCTUATION_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_SYMBOL_V1, - &[ - key::GENERAL_CATEGORY_CURRENCY_SYMBOL_V1, - key::GENERAL_CATEGORY_MODIFIER_SYMBOL_V1, - key::GENERAL_CATEGORY_MATH_SYMBOL_V1, - key::GENERAL_CATEGORY_OTHER_SYMBOL_V1, - ], - ); - test_group( - key::GENERAL_CATEGORY_SEPARATOR_V1, - &[ - key::GENERAL_CATEGORY_LINE_SEPARATOR_V1, - key::GENERAL_CATEGORY_PARAGRAPH_SEPARATOR_V1, - key::GENERAL_CATEGORY_SPACE_SEPARATOR_V1, - ], - ); -} + test_group( + key::GENERAL_CATEGORY_OTHER_V1, + &[ + key::GENERAL_CATEGORY_CONTROL_V1, + key::GENERAL_CATEGORY_FORMAT_V1, + key::GENERAL_CATEGORY_UNASSIGNED_V1, + key::GENERAL_CATEGORY_PRIVATE_USE_V1, + key::GENERAL_CATEGORY_SURROGATE_V1, + ], + ); + test_group( + key::GENERAL_CATEGORY_MARK_V1, + &[ + key::GENERAL_CATEGORY_SPACING_MARK_V1, + key::GENERAL_CATEGORY_ENCLOSING_MARK_V1, + key::GENERAL_CATEGORY_NONSPACING_MARK_V1, + ], + ); + test_group( + key::GENERAL_CATEGORY_NUMBER_V1, + &[ 
+ key::GENERAL_CATEGORY_DIGIT_V1, + key::GENERAL_CATEGORY_LETTER_NUMBER_V1, + key::GENERAL_CATEGORY_OTHER_NUMBER_V1, + ], + ); + test_group( + key::GENERAL_CATEGORY_PUNCTUATION_V1, + &[ + key::GENERAL_CATEGORY_CONNECTOR_PUNCTUATION_V1, + key::GENERAL_CATEGORY_DASH_PUNCTUATION_V1, + key::GENERAL_CATEGORY_CLOSE_PUNCTUATION_V1, + key::GENERAL_CATEGORY_FINAL_PUNCTUATION_V1, + key::GENERAL_CATEGORY_INITIAL_PUNCTUATION_V1, + key::GENERAL_CATEGORY_OTHER_PUNCTUATION_V1, + key::GENERAL_CATEGORY_OPEN_PUNCTUATION_V1, + ], + ); + test_group( + key::GENERAL_CATEGORY_SYMBOL_V1, + &[ + key::GENERAL_CATEGORY_CURRENCY_SYMBOL_V1, + key::GENERAL_CATEGORY_MODIFIER_SYMBOL_V1, + key::GENERAL_CATEGORY_MATH_SYMBOL_V1, + key::GENERAL_CATEGORY_OTHER_SYMBOL_V1, + ], + ); + test_group( + key::GENERAL_CATEGORY_SEPARATOR_V1, + &[ + key::GENERAL_CATEGORY_LINE_SEPARATOR_V1, + key::GENERAL_CATEGORY_PARAGRAPH_SEPARATOR_V1, + key::GENERAL_CATEGORY_SPACE_SEPARATOR_V1, + ], + ); + } -#[test] -fn test_gc_surrogate() { - use icu_uniset::UnicodeSet; - use std::convert::TryInto; + #[test] + fn test_gc_surrogate() { + use icu_uniset::UnicodeSet; + use std::convert::TryInto; - let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); - let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider - .load_payload(&DataRequest { - resource_path: ResourcePath { - key: key::GENERAL_CATEGORY_SURROGATE_V1, - options: ResourceOptions::default(), - }, - }) - .expect("The data should be valid") - .take_payload() - .expect("Loading was successful"); + let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: key::GENERAL_CATEGORY_SURROGATE_V1, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be 
valid") + .take_payload() + .expect("Loading was successful"); - let surrogates: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); + let surrogates: UnicodeSet = payload.get().clone().try_into().expect("Valid unicode set"); - assert!(surrogates.contains_u32(0xd800)); - assert!(surrogates.contains_u32(0xd900)); - assert!(surrogates.contains_u32(0xdfff)); + assert!(surrogates.contains_u32(0xd800)); + assert!(surrogates.contains_u32(0xd900)); + assert!(surrogates.contains_u32(0xdfff)); - assert!(!surrogates.contains('A')); + assert!(!surrogates.contains('A')); + } } diff --git a/provider/uprops/src/lib.rs b/provider/uprops/src/lib.rs index 65a86b11e59..e5997cef97b 100644 --- a/provider/uprops/src/lib.rs +++ b/provider/uprops/src/lib.rs @@ -16,7 +16,6 @@ //! [`DataProvider`]: icu_provider::prelude::DataProvider //! [`FsDataProvider`]: ../icu_provider_fs/struct.FsDataProvider.html //! [`StaticDataProvider`]: ../icu_provider_blob/struct.StaticDataProvider.html -//! [`PropertiesDataProvider`]: binary::PropertiesDataProvider mod bin_uniset; mod enum_codepointtrie; diff --git a/provider/uprops/src/provider.rs b/provider/uprops/src/provider.rs new file mode 100644 index 00000000000..b54d2674ae5 --- /dev/null +++ b/provider/uprops/src/provider.rs @@ -0,0 +1,51 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
+ +use crate::binary::BinaryPropertiesDataProvider; +use crate::enumerated::EnumeratedPropertiesDataProvider; +use icu_properties::provider::UnicodePropertyV1Marker; +use icu_provider::iter::IterableDataProviderCore; +use icu_provider::prelude::*; + +use std::path::PathBuf; + +pub struct PropertiesDataProvider { + binary: BinaryPropertiesDataProvider, + enumerated: EnumeratedPropertiesDataProvider, +} + +impl PropertiesDataProvider { + pub fn new(root_dir: PathBuf) -> Self { + let binary = BinaryPropertiesDataProvider::new(root_dir.clone()); + let enumerated = EnumeratedPropertiesDataProvider::new(root_dir); + Self { binary, enumerated } + } +} + +impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for PropertiesDataProvider { + fn load_payload( + &self, + req: &DataRequest, + ) -> Result, DataError> { + if req.resource_path.key.sub_category.contains('=') { + self.enumerated.load_payload(req) + } else { + self.binary.load_payload(req) + } + } +} + +icu_provider::impl_dyn_provider!(PropertiesDataProvider, { + _ => UnicodePropertyV1Marker, +}, SERDE_SE, 'data); + +impl IterableDataProviderCore for PropertiesDataProvider { + fn supported_options_for_key( + &self, + _resc_key: &ResourceKey, + ) -> Result>, DataError> { + let list: Vec = vec![ResourceOptions::default()]; + Ok(Box::new(list.into_iter())) + } +} diff --git a/components/uniset/Cargo.toml b/utils/uniset/Cargo.toml similarity index 91% rename from components/uniset/Cargo.toml rename to utils/uniset/Cargo.toml index 8c000d5f9e5..b5026352308 100644 --- a/components/uniset/Cargo.toml +++ b/utils/uniset/Cargo.toml @@ -34,7 +34,6 @@ all-features = true [dependencies] icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } litemap = { version = "0.2", path = "../../utils/litemap" } -num_enum = { version = "0.5.4", default-features = false } serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true } tinystr = { version = "0.4.10", 
features = ["alloc"], default-features = false } displaydoc = { version = "0.2.3", default-features = false } @@ -42,9 +41,9 @@ zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] [dev-dependencies] criterion = "0.3.3" -icu = { path = "../icu", default-features = false } icu_benchmark_macros = { version = "0.3", path = "../../tools/benchmark/macros" } postcard = { version = "0.7", features = ["use-std", "alloc"] } +serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" [lib] @@ -52,8 +51,6 @@ bench = false # This option is required for Benchmark CI path = "src/lib.rs" [features] -std = ["icu_provider/std"] -default = ["provider_serde"] bench = [] provider_serde = ["serde"] diff --git a/utils/uniset/LICENSE b/utils/uniset/LICENSE new file mode 100644 index 00000000000..5ab1f57507b --- /dev/null +++ b/utils/uniset/LICENSE @@ -0,0 +1,331 @@ +Except as otherwise noted below, ICU4X is licensed under the Apache +License, Version 2.0 (included below) or the MIT license (included +below), at your option. Unless importing data or code in the manner +stated below, any contribution intentionally submitted for inclusion +in ICU4X by you, as defined in the Apache-2.0 license, shall be dual +licensed in the foregoing manner, without any additional terms or +conditions. + +As exceptions to the above: +* Portions of ICU4X that have been adapted from ICU4C and/or ICU4J are +under the Unicode license (included below) and/or the ICU license +(included below) as indicated by source code comments. +* Unicode data incorporated in ICU4X is under the Unicode license +(included below). +* Your contributions may import code from ICU4C and/or ICU4J and +Unicode data under these licenses. Indicate the license and the ICU4C +or ICU4J origin in source code comments. 
+ +- - - - + +Apache License, version 2.0 + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. 
For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +- - - - + +MIT License + +Copyright The ICU4X Authors + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +- - - - + +Unicode License + +COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later) + +Copyright © 1991-2020 Unicode, Inc. All rights reserved. +Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of the Unicode data files and any associated documentation +(the "Data Files") or Unicode software and any associated documentation +(the "Software") to deal in the Data Files or Software +without restriction, including without limitation the rights to use, +copy, modify, merge, publish, distribute, and/or sell copies of +the Data Files or Software, and to permit persons to whom the Data Files +or Software are furnished to do so, provided that either +(a) this copyright and permission notice appear with all copies +of the Data Files or Software, or +(b) this copyright and permission notice appear in associated +Documentation. + +THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE +WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS +NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL +DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, +DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +PERFORMANCE OF THE DATA FILES OR SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, +use or other dealings in these Data Files or Software without prior +written authorization of the copyright holder. + +- - - - + +ICU License - ICU 1.8.1 to ICU 57.1 + +COPYRIGHT AND PERMISSION NOTICE + +Copyright (c) 1995-2016 International Business Machines Corporation and others +All rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, and/or sell copies of the Software, and to permit persons +to whom the Software is furnished to do so, provided that the above +copyright notice(s) and this permission notice appear in all copies of +the Software and that both the above copyright notice(s) and this +permission notice appear in supporting documentation. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT +OF THIRD PARTY RIGHTS. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR +HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY +SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER +RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + +Except as contained in this notice, the name of a copyright holder +shall not be used in advertising or otherwise to promote the sale, use +or other dealings in this Software without prior written authorization +of the copyright holder. + +All trademarks and registered trademarks mentioned herein are the +property of their respective owners. + +- - - - diff --git a/components/uniset/README.md b/utils/uniset/README.md similarity index 94% rename from components/uniset/README.md rename to utils/uniset/README.md index ee6192e8050..2a325cb7aaa 100644 --- a/components/uniset/README.md +++ b/utils/uniset/README.md @@ -20,7 +20,7 @@ represented by [inversion lists](http://userguide.icu-project.org/strings/proper the [`UnicodeSetBuilder`], or from the TBA Properties API. ```rust -use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; +use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; let mut builder = UnicodeSetBuilder::new(); builder.add_range(&('A'..'Z')); @@ -34,7 +34,7 @@ assert!(set.contains('A')); Currently, you can check if a character/range of characters exists in the [`UnicodeSet`], or iterate through the characters. 
```rust -use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; +use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; let mut builder = UnicodeSetBuilder::new(); builder.add_range(&('A'..'Z')); diff --git a/components/uniset/benches/inv_list.rs b/utils/uniset/benches/inv_list.rs similarity index 100% rename from components/uniset/benches/inv_list.rs rename to utils/uniset/benches/inv_list.rs diff --git a/components/uniset/examples/unicode_bmp_blocks_selector.rs b/utils/uniset/examples/unicode_bmp_blocks_selector.rs similarity index 100% rename from components/uniset/examples/unicode_bmp_blocks_selector.rs rename to utils/uniset/examples/unicode_bmp_blocks_selector.rs diff --git a/components/uniset/src/builder.rs b/utils/uniset/src/builder.rs similarity index 97% rename from components/uniset/src/builder.rs rename to utils/uniset/src/builder.rs index f1345e1f60b..f58fa5cdc05 100644 --- a/components/uniset/src/builder.rs +++ b/utils/uniset/src/builder.rs @@ -85,7 +85,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_char('a'); /// let check = builder.build(); @@ -106,7 +106,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_u32(0x41); /// let check = builder.build(); @@ -124,7 +124,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='Z')); /// let check = builder.build(); @@ -140,7 +140,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// 
builder.add_range_u32(&(0xd800..=0xdfff)); /// let check = builder.build(); @@ -158,7 +158,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; + /// use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; /// let mut builder = UnicodeSetBuilder::new(); /// let set = UnicodeSet::from_inversion_list_slice(&[0x41, 0x4C]).unwrap(); /// builder.add_set(&set); @@ -200,7 +200,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='Z')); /// builder.remove_char('A'); @@ -216,7 +216,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='Z')); /// builder.remove_range(&('A'..='C')); @@ -232,7 +232,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; + /// use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; /// let mut builder = UnicodeSetBuilder::new(); /// let set = UnicodeSet::from_inversion_list_slice(&[0x41, 0x46]).unwrap(); /// builder.add_range(&('A'..='Z')); @@ -257,7 +257,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='Z')); /// builder.retain_char('A'); @@ -277,7 +277,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='Z')); /// builder.retain_range(&('A'..='B')); @@ -298,7 +298,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSetBuilder, UnicodeSet}; + /// use 
icu_uniset::{UnicodeSetBuilder, UnicodeSet}; /// let mut builder = UnicodeSetBuilder::new(); /// let set = UnicodeSet::from_inversion_list_slice(&[65, 70]).unwrap(); /// builder.add_range(&('A'..='Z')); @@ -363,7 +363,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSetBuilder, UnicodeSet}; + /// use icu_uniset::{UnicodeSetBuilder, UnicodeSet}; /// let mut builder = UnicodeSetBuilder::new(); /// let set = UnicodeSet::from_inversion_list_slice(&[0x0, 0x41, 0x46, (std::char::MAX as u32) + 1]).unwrap(); /// builder.add_set(&set); @@ -394,7 +394,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='D')); /// builder.complement_char('A'); @@ -415,7 +415,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSetBuilder; + /// use icu_uniset::UnicodeSetBuilder; /// let mut builder = UnicodeSetBuilder::new(); /// builder.add_range(&('A'..='D')); /// builder.complement_range(&('C'..='F')); @@ -435,7 +435,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSetBuilder, UnicodeSet}; + /// use icu_uniset::{UnicodeSetBuilder, UnicodeSet}; /// let mut builder = UnicodeSetBuilder::new(); /// let set = UnicodeSet::from_inversion_list_slice(&[0x41, 0x46, 0x4B, 0x5A]).unwrap(); /// builder.add_range(&('C'..='N')); // 67 - 78 @@ -458,7 +458,7 @@ impl UnicodeSetBuilder { /// # Examples /// /// ``` - /// use icu::uniset::{UnicodeSetBuilder, UnicodeSet}; + /// use icu_uniset::{UnicodeSetBuilder, UnicodeSet}; /// let mut builder = UnicodeSetBuilder::new(); /// let check = builder.build(); /// assert!(check.is_empty()); diff --git a/components/uniset/src/conversions.rs b/utils/uniset/src/conversions.rs similarity index 100% rename from components/uniset/src/conversions.rs rename to utils/uniset/src/conversions.rs diff 
--git a/components/uniset/src/lib.rs b/utils/uniset/src/lib.rs similarity index 94% rename from components/uniset/src/lib.rs rename to utils/uniset/src/lib.rs index 1bc6eb7bd33..908c4dc187b 100644 --- a/components/uniset/src/lib.rs +++ b/utils/uniset/src/lib.rs @@ -22,7 +22,7 @@ //! the [`UnicodeSetBuilder`], or from the TBA Properties API. //! //! ``` -//! use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; +//! use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; //! //! let mut builder = UnicodeSetBuilder::new(); //! builder.add_range(&('A'..'Z')); @@ -36,7 +36,7 @@ //! Currently, you can check if a character/range of characters exists in the [`UnicodeSet`], or iterate through the characters. //! //! ``` -//! use icu::uniset::{UnicodeSet, UnicodeSetBuilder}; +//! use icu_uniset::{UnicodeSet, UnicodeSetBuilder}; //! //! let mut builder = UnicodeSetBuilder::new(); //! builder.add_range(&('A'..'Z')); @@ -61,10 +61,6 @@ extern crate alloc; #[macro_use] mod builder; mod conversions; -pub mod enum_props; -pub mod props; -pub mod provider; -mod ule; mod uniset; mod utils; diff --git a/components/uniset/src/uniset.rs b/utils/uniset/src/uniset.rs similarity index 97% rename from components/uniset/src/uniset.rs rename to utils/uniset/src/uniset.rs index 439863e1926..da39a4a0b97 100644 --- a/components/uniset/src/uniset.rs +++ b/utils/uniset/src/uniset.rs @@ -37,7 +37,7 @@ pub struct UnicodeSet<'data> { size: usize, } -#[cfg(feature = "serde")] +#[cfg(any(feature = "serde", test))] impl<'de: 'a, 'a> serde::Deserialize<'de> for UnicodeSet<'a> { fn deserialize(deserializer: D) -> Result where @@ -59,7 +59,7 @@ impl<'de: 'a, 'a> serde::Deserialize<'de> for UnicodeSet<'a> { // to replace the struct when serializing. The error message from the default // serialization is: "can only flatten structs and maps (got a sequence)". 
-#[cfg(feature = "serde")] +#[cfg(any(feature = "serde", test))] impl<'data> serde::Serialize for UnicodeSet<'data> { fn serialize(&self, serializer: S) -> Result where @@ -79,8 +79,8 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; - /// use icu::uniset::UnicodeSetError; + /// use icu_uniset::UnicodeSet; + /// use icu_uniset::UnicodeSetError; /// use zerovec::ZeroVec; /// let valid = [0x0, 0x10000]; /// let inv_list: ZeroVec = ZeroVec::from_slice(&valid); @@ -122,8 +122,8 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; - /// use icu::uniset::UnicodeSetError; + /// use icu_uniset::UnicodeSet; + /// use icu_uniset::UnicodeSetError; /// use zerovec::ZeroVec; /// let valid = [0x0, 0x10000]; /// let result = UnicodeSet::from_inversion_list_slice(&valid); @@ -150,8 +150,8 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; - /// use icu::uniset::UnicodeSetError; + /// use icu_uniset::UnicodeSet; + /// use icu_uniset::UnicodeSetError; /// use zerovec::ZeroVec; /// /// use std::vec::Vec; @@ -195,7 +195,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// use zerovec::ZeroVec; /// /// let expected = vec![0x0, (char::MAX as u32) + 1]; @@ -219,7 +219,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// use zerovec::ZeroVec; /// /// const BMP_MAX: u32 = 0xFFFF; @@ -250,7 +250,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x44, 0x45, 0x46]; /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// let mut ex_iter_chars = example.iter_chars(); @@ -278,7 +278,7 @@ impl<'data> UnicodeSet<'data> { /// # Example /// /// ``` - /// use 
icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x44, 0x45, 0x46]; /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// let mut example_iter_ranges = example.iter_ranges(); @@ -353,7 +353,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x43, 0x44, 0x45]; /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// assert!(example.contains('A')); @@ -376,7 +376,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x43, 0x44, 0x45]; /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// assert!(example.contains_u32(0x41)); @@ -395,7 +395,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x43, 0x44, 0x45]; /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// assert!(example.contains_range(&('A'..'C'))); @@ -414,7 +414,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// use std::char; /// let check = char::from_u32(0xD7FE).unwrap() .. 
char::from_u32(0xE001).unwrap(); /// let example_list = [0xD7FE, 0xD7FF, 0xE000, 0xE001]; @@ -447,7 +447,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x46, 0x55, 0x5B]; // A - E, U - Z /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// let a_to_d = UnicodeSet::from_inversion_list_slice(&[0x41, 0x45]).unwrap(); @@ -487,7 +487,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x44]; // {A, B, C} /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// assert_eq!(example.span("CABXYZ", true), 3); @@ -508,7 +508,7 @@ impl<'data> UnicodeSet<'data> { /// # Examples /// /// ``` - /// use icu::uniset::UnicodeSet; + /// use icu_uniset::UnicodeSet; /// let example_list = [0x41, 0x44]; // {A, B, C} /// let example = UnicodeSet::from_inversion_list_slice(&example_list).unwrap(); /// assert_eq!(example.span_back("XYZCAB", true), 3); diff --git a/components/uniset/src/utils.rs b/utils/uniset/src/utils.rs similarity index 100% rename from components/uniset/src/utils.rs rename to utils/uniset/src/utils.rs From ee4e8e9661270445ab50f9de07b6e3f662294142 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 19 Oct 2021 15:28:04 -0700 Subject: [PATCH 12/28] Update path to uniset crate in CI job for benchmarking --- .github/workflows/build-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index 4c73a48ca42..db23b5ece31 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -460,9 +460,9 @@ jobs: matrix: component: - components/locid - - components/uniset - components/plurals - components/datetime + - utils/uniset - utils/fixed_decimal From 
c5432001298aa975abc113bc172ddfa113de9276 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Tue, 12 Oct 2021 16:27:29 -0700 Subject: [PATCH 13/28] Implement TrieValue for GeneralSubcategory --- Cargo.lock | 1 + components/properties/Cargo.toml | 1 + components/properties/src/lib.rs | 1 + components/properties/src/props.rs | 2 +- components/properties/src/provider.rs | 16 ++++++++-- components/properties/src/trievalue.rs | 20 ++++++++++++ provider/uprops/src/enum_codepointtrie.rs | 38 +++++++++++++++++++++-- utils/codepointtrie/src/codepointtrie.rs | 19 ++++++++---- 8 files changed, 86 insertions(+), 12 deletions(-) create mode 100644 components/properties/src/trievalue.rs diff --git a/Cargo.lock b/Cargo.lock index 298ddab4996..433a8874f2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1220,6 +1220,7 @@ name = "icu_properties" version = "0.3.0" dependencies = [ "icu", + "icu_codepointtrie", "icu_provider", "icu_uniset", "num_enum", diff --git a/components/properties/Cargo.toml b/components/properties/Cargo.toml index e07a28102eb..f497a452360 100644 --- a/components/properties/Cargo.toml +++ b/components/properties/Cargo.toml @@ -32,6 +32,7 @@ denylist = ["bench"] all-features = true [dependencies] +icu_codepointtrie = { version = "0.3", path = "../../utils/codepointtrie", features = ["serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["serde"] } num_enum = { version = "0.5.4", default-features = false } diff --git a/components/properties/src/lib.rs b/components/properties/src/lib.rs index 951b019ff83..24327b43db1 100644 --- a/components/properties/src/lib.rs +++ b/components/properties/src/lib.rs @@ -20,6 +20,7 @@ mod props; pub mod provider; pub mod sets; +mod trievalue; mod ule; pub use props::*; diff --git a/components/properties/src/props.rs b/components/properties/src/props.rs index c2ef3c9aa4b..949978c5249 100644 --- 
a/components/properties/src/props.rs +++ b/components/properties/src/props.rs @@ -38,7 +38,7 @@ impl From<&TinyStr16> for EnumeratedProperty { /// Enumerated Unicode general category types. /// GeneralSubcategory only supports specific subcategories (eg `UppercaseLetter`). /// It does not support grouped categories (eg `Letter`). For grouped categories, use [`GeneralCategory`]. -#[derive(Copy, Clone, PartialEq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] #[repr(u8)] pub enum GeneralSubcategory { /// A reserved unassigned code point or a noncharacter diff --git a/components/properties/src/provider.rs b/components/properties/src/provider.rs index 7a4dc93a125..35e044fef7a 100644 --- a/components/properties/src/provider.rs +++ b/components/properties/src/provider.rs @@ -32,10 +32,10 @@ pub mod key { }; } - define_resource_keys!(265; + define_resource_keys!(267; // - // Binary properties + // Binary property UnicodeSets // (ASCII_HEX_DIGIT_V1, "AHex"), @@ -105,7 +105,7 @@ pub mod key { (XID_START_V1, "XIDS"), // - // Enumerated properties + // Enumerated property prop=val UnicodeSets // // Note: The ResourceKey subcategory strings are determined from the @@ -312,6 +312,16 @@ pub mod key { (SCRIPT_YEZIDI_V1, "sc=Yezi"), (SCRIPT_YI_V1, "sc=Yiii"), (SCRIPT_ZANABAZAR_SQUARE_V1, "sc=Zanb"), + + // + // Enumerated property CodePointMaps + // + + // ResourceKey subcategory string is the short alias of the property + + (GENERAL_CATEGORY_V1, "gc"), + (SCRIPT_V1, "sc"), + ); } diff --git a/components/properties/src/trievalue.rs b/components/properties/src/trievalue.rs new file mode 100644 index 00000000000..756b782490b --- /dev/null +++ b/components/properties/src/trievalue.rs @@ -0,0 +1,20 @@ +// This file is part of ICU4X. 
For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::{GeneralSubcategory, Script}; +use icu_codepointtrie::codepointtrie::TrieValue; +use num_enum::TryFromPrimitiveError; + +use core::convert::TryFrom; + +impl TrieValue for GeneralSubcategory { + const DATA_GET_ERROR_VALUE: GeneralSubcategory = GeneralSubcategory::Unassigned; + type Error = TryFromPrimitiveError; + // Values from CodePointTrie data from ICU will use `u8` values, for + // which we use the `GeneralSubcategory` enum. The values will be widened + // to `u32` at the call site, so it is okay to truncate to convert to `u8`. + fn parse_from_u32(i: u32) -> Result { + GeneralSubcategory::try_from(i as u8) + } +} diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index cf2b84542d0..94e690cc7fe 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -21,7 +21,7 @@ pub struct EnumeratedPropertyCodePointTrieProvider { } impl EnumeratedPropertyCodePointTrieProvider { - pub fn _new(root_dir: PathBuf) -> Self { + pub fn new(root_dir: PathBuf) -> Self { EnumeratedPropertyCodePointTrieProvider { root_dir } } @@ -52,7 +52,7 @@ impl TryFrom = ZeroVec::clone_from_slice(&cpt_data.index); - let data: Result, String> = if let Some(data_8) = cpt_data.data_8 { + let data: Result, T::Error> = if let Some(data_8) = cpt_data.data_8 { data_8 .iter() .map(|i| T::parse_from_u32(*i as u32)) @@ -111,3 +111,37 @@ impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> }) } } + +#[cfg(test)] +mod tests { + use super::*; + use icu_codepointtrie::codepointtrie::CodePointTrie; + use icu_properties::GeneralSubcategory; + use icu_properties::provider::key; + + // A test of the UnicodeProperty General_Category is truly a test of the + // `GeneralSubcategory` Rust enum, not the `GeneralCategory` 
Rust enum, + // since we must match the representation and value width of the data from + // the ICU CodePointTrie that ICU4X is reading from. + #[test] + fn test_general_category() { + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyCodePointTrieProvider::new(root_dir); + + let payload: DataPayload<'_, UnicodePropertyMapV1Marker> = provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: key::GENERAL_CATEGORY_V1, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be valid") + .take_payload() + .expect("Loading was successful"); + + let trie: &CodePointTrie = &payload.get().codepoint_trie; + + assert_eq!(trie.get('꣓' as u32), GeneralSubcategory::Digit); + assert_eq!(trie.get('≈' as u32), GeneralSubcategory::MathSymbol); + } +} \ No newline at end of file diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index b4886468caf..157720ccc5c 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -6,9 +6,11 @@ use crate::error::Error; use crate::impl_const::*; use core::convert::TryFrom; +use core::fmt::Display; use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; +use core::num::TryFromIntError; use zerovec::ZeroVec; /// The type of trie represents whether the trie has an optimization that @@ -36,29 +38,34 @@ pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static { /// /// In most cases, the error value is read from the last element of the `data` array. const DATA_GET_ERROR_VALUE: Self; + // TODO: comment + type Error: Display; /// A parsing function that is primarily motivated by deserialization contexts. /// When the serialization type width is smaller than 32 bits, then it is expected /// that the call site will widen the value to a `u32` first. 
- fn parse_from_u32(i: u32) -> Result; + fn parse_from_u32(i: u32) -> Result; } impl TrieValue for u8 { const DATA_GET_ERROR_VALUE: u8 = u8::MAX; - fn parse_from_u32(i: u32) -> Result { - Self::try_from(i).map_err(|e| e.to_string()) + type Error = TryFromIntError; + fn parse_from_u32(i: u32) -> Result { + Self::try_from(i) } } impl TrieValue for u16 { const DATA_GET_ERROR_VALUE: u16 = u16::MAX; - fn parse_from_u32(i: u32) -> Result { - Self::try_from(i).map_err(|e| e.to_string()) + type Error = TryFromIntError; + fn parse_from_u32(i: u32) -> Result { + Self::try_from(i) } } impl TrieValue for u32 { const DATA_GET_ERROR_VALUE: u32 = u32::MAX; - fn parse_from_u32(i: u32) -> Result { + type Error = TryFromIntError; + fn parse_from_u32(i: u32) -> Result { Ok(i) } } From 40381be31c38ff37e60a2af657efc6d18778cb31 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 13:27:21 -0700 Subject: [PATCH 14/28] Implement TrieValue for Script --- components/properties/src/props.rs | 2 +- components/properties/src/trievalue.rs | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/components/properties/src/props.rs b/components/properties/src/props.rs index 949978c5249..70f7fcccd54 100644 --- a/components/properties/src/props.rs +++ b/components/properties/src/props.rs @@ -250,7 +250,7 @@ impl From for GeneralCategory { /// See UScriptCode in ICU4C. #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[repr(transparent)] -pub struct Script(pub(crate) u16); +pub struct Script(pub u16); #[allow(missing_docs)] // These constants don't need individual documentation. #[allow(non_upper_case_globals)] diff --git a/components/properties/src/trievalue.rs b/components/properties/src/trievalue.rs index 756b782490b..941a4d2cc0c 100644 --- a/components/properties/src/trievalue.rs +++ b/components/properties/src/trievalue.rs @@ -3,6 +3,8 @@ // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). 
use crate::{GeneralSubcategory, Script}; +use core::convert::TryInto; +use core::num::TryFromIntError; use icu_codepointtrie::codepointtrie::TrieValue; use num_enum::TryFromPrimitiveError; @@ -10,11 +12,18 @@ use core::convert::TryFrom; impl TrieValue for GeneralSubcategory { const DATA_GET_ERROR_VALUE: GeneralSubcategory = GeneralSubcategory::Unassigned; - type Error = TryFromPrimitiveError; - // Values from CodePointTrie data from ICU will use `u8` values, for - // which we use the `GeneralSubcategory` enum. The values will be widened - // to `u32` at the call site, so it is okay to truncate to convert to `u8`. - fn parse_from_u32(i: u32) -> Result { - GeneralSubcategory::try_from(i as u8) + type TryFromU32Error = TryFromPrimitiveError; + fn try_from_u32(i: u32) -> Result { + // If the u32 is out of range, fall back to u8::MAX, which is out of range of the GeneralSubcategory enum. + GeneralSubcategory::try_from(i.try_into().unwrap_or(u8::MAX)) + } +} + +impl TrieValue for Script { + const DATA_GET_ERROR_VALUE: Script = Script::Unknown; + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result { + u16::try_from(i).map(Script) } } From d4bbcbd44d49b3c7fe8e838264490acb8584531e Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 13:39:34 -0700 Subject: [PATCH 15/28] Rename TrieValue trait's associate type for Result errors --- provider/uprops/src/enum_codepointtrie.rs | 8 ++++---- utils/codepointtrie/src/codepointtrie.rs | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 94e690cc7fe..b3922c1d897 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -52,20 +52,20 @@ impl TryFrom = ZeroVec::clone_from_slice(&cpt_data.index); - let data: Result, T::Error> = if let Some(data_8) = cpt_data.data_8 { + let data: Result, T::TryFromU32Error> = if let 
Some(data_8) = cpt_data.data_8 { data_8 .iter() - .map(|i| T::parse_from_u32(*i as u32)) + .map(|i| T::try_from_u32(*i as u32)) .collect() } else if let Some(data_16) = cpt_data.data_16 { data_16 .iter() - .map(|i| T::parse_from_u32(*i as u32)) + .map(|i| T::try_from_u32(*i as u32)) .collect() } else if let Some(data_32) = cpt_data.data_32 { data_32 .iter() - .map(|i| T::parse_from_u32(*i as u32)) + .map(|i| T::try_from_u32(*i as u32)) .collect() } else { return Err(DataError::new_resc_error( diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index 157720ccc5c..e59138f7e67 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -38,34 +38,34 @@ pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static { /// /// In most cases, the error value is read from the last element of the `data` array. const DATA_GET_ERROR_VALUE: Self; - // TODO: comment - type Error: Display; + /// Error type when converting from a u32 to this TrieValue. + type TryFromU32Error: Display; /// A parsing function that is primarily motivated by deserialization contexts. /// When the serialization type width is smaller than 32 bits, then it is expected /// that the call site will widen the value to a `u32` first. 
- fn parse_from_u32(i: u32) -> Result; + fn try_from_u32(i: u32) -> Result; } impl TrieValue for u8 { const DATA_GET_ERROR_VALUE: u8 = u8::MAX; - type Error = TryFromIntError; - fn parse_from_u32(i: u32) -> Result { + type TryFromU32Error = TryFromIntError; + fn try_from_u32(i: u32) -> Result { Self::try_from(i) } } impl TrieValue for u16 { const DATA_GET_ERROR_VALUE: u16 = u16::MAX; - type Error = TryFromIntError; - fn parse_from_u32(i: u32) -> Result { + type TryFromU32Error = TryFromIntError; + fn try_from_u32(i: u32) -> Result { Self::try_from(i) } } impl TrieValue for u32 { const DATA_GET_ERROR_VALUE: u32 = u32::MAX; - type Error = TryFromIntError; - fn parse_from_u32(i: u32) -> Result { + type TryFromU32Error = TryFromIntError; + fn try_from_u32(i: u32) -> Result { Ok(i) } } From 997db9347347b153c6ea5338f623909f05d0f139 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 13:40:45 -0700 Subject: [PATCH 16/28] Remove unneeded dependency --- utils/codepointtrie/Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml index 1f102a894b8..10ba039cb67 100644 --- a/utils/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -42,7 +42,6 @@ zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", postcard = { version = "0.7", features = ["alloc"] } toml = "0.5" serde = { version = "1.0", features = ["derive"] } -zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] } # TODO: Remove? 
[lib] bench = false # This option is required for Benchmark CI From 28b51c0201d43305d7bced4c694e8d011fb11b48 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 13:41:14 -0700 Subject: [PATCH 17/28] Revert version number of icu_codepointtrie --- Cargo.lock | 2 +- components/properties/Cargo.toml | 2 +- provider/uprops/Cargo.toml | 2 +- utils/codepointtrie/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 433a8874f2b..b432c12bbe1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1071,7 +1071,7 @@ dependencies = [ [[package]] name = "icu_codepointtrie" -version = "0.3.0" +version = "0.2.0" dependencies = [ "displaydoc", "icu_provider", diff --git a/components/properties/Cargo.toml b/components/properties/Cargo.toml index f497a452360..ebbd09133cc 100644 --- a/components/properties/Cargo.toml +++ b/components/properties/Cargo.toml @@ -32,7 +32,7 @@ denylist = ["bench"] all-features = true [dependencies] -icu_codepointtrie = { version = "0.3", path = "../../utils/codepointtrie", features = ["serde"] } +icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["serde"] } num_enum = { version = "0.5.4", default-features = false } diff --git a/provider/uprops/Cargo.toml b/provider/uprops/Cargo.toml index a4f9bcddd89..9e493b43711 100644 --- a/provider/uprops/Cargo.toml +++ b/provider/uprops/Cargo.toml @@ -28,7 +28,7 @@ all-features = true [dependencies] displaydoc = { version = "0.2.3", default-features = false } -icu_codepointtrie = { version = "0.3", path = "../../utils/codepointtrie", features = ["provider_serde"] } +icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["provider_serde"] } icu_properties = { version = "0.3", path = "../../components/properties", features 
= ["provider_serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["provider_serde"] } icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["provider_serde"] } diff --git a/utils/codepointtrie/Cargo.toml b/utils/codepointtrie/Cargo.toml index 10ba039cb67..62d0808e3d6 100644 --- a/utils/codepointtrie/Cargo.toml +++ b/utils/codepointtrie/Cargo.toml @@ -5,7 +5,7 @@ [package] name = "icu_codepointtrie" description = "API for an efficient trie of data for Unicode code points" -version = "0.3.0" +version = "0.2.0" authors = ["The ICU4X Project Developers"] edition = "2018" readme = "README.md" From e10eb66ec9d0a9b2a962ff87bf013b81cd5f78c5 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 13:57:51 -0700 Subject: [PATCH 18/28] Move data structs for UnicodePropertyMap from icu_codepointtrie to icu_properties --- components/properties/src/provider.rs | 43 ++++++++++++++++ provider/uprops/src/enum_codepointtrie.rs | 60 ++++++++++------------- utils/codepointtrie/src/codepointtrie.rs | 2 +- utils/codepointtrie/src/lib.rs | 1 - utils/codepointtrie/src/provider.rs | 44 ----------------- 5 files changed, 70 insertions(+), 80 deletions(-) delete mode 100644 utils/codepointtrie/src/provider.rs diff --git a/components/properties/src/provider.rs b/components/properties/src/provider.rs index 35e044fef7a..d4e2d5abaa8 100644 --- a/components/properties/src/provider.rs +++ b/components/properties/src/provider.rs @@ -6,6 +6,7 @@ //! //! Read more about data providers: [`icu_provider`] +use icu_codepointtrie::codepointtrie::{CodePointTrie, TrieValue}; use icu_provider::yoke::{self, *}; use icu_uniset::UnicodeSet; use icu_uniset::UnicodeSetBuilder; @@ -325,6 +326,10 @@ pub mod key { ); } +// +// UnicodeProperty +// + /// A set of characters with a particular property. 
#[icu_provider::data_struct] #[derive(Debug, Eq, PartialEq, Clone)] @@ -359,3 +364,41 @@ impl<'data> From> for UnicodeSet<'data> { prop.inv_list } } + +// +// UnicodePropertyMap +// + +/// A map efficiently storing data about individual characters. +#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)] +#[cfg_attr( + feature = "provider_serde", + derive(serde::Serialize, serde::Deserialize) +)] +pub struct UnicodePropertyMapV1<'data, T: TrieValue> { + /// A codepoint trie storing the data + #[cfg_attr(feature = "provider_serde", serde(borrow))] + pub codepoint_trie: CodePointTrie<'data, T>, +} + +impl<'data, T: TrieValue> Clone for UnicodePropertyMapV1<'data, T> +where + ::ULE: Clone, +{ + fn clone(&self) -> Self { + UnicodePropertyMapV1 { + codepoint_trie: self.codepoint_trie.clone(), + } + } +} + +/// Marker type for UnicodePropertyMapV1. +/// This is generated by hand because icu_provider::data_struct doesn't support generics yet. +pub struct UnicodePropertyMapV1Marker { + _phantom: core::marker::PhantomData, +} + +impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker { + type Yokeable = UnicodePropertyMapV1<'static, T>; + type Cart = UnicodePropertyMapV1<'data, T>; +} diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index b3922c1d897..12b85c95a54 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -7,7 +7,7 @@ use crate::uprops_serde; use crate::uprops_serde::enumerated::EnumeratedPropertyCodePointTrie; use icu_codepointtrie::codepointtrie::{CodePointTrie, CodePointTrieHeader, TrieType, TrieValue}; -use icu_codepointtrie::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; +use icu_properties::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; use icu_provider::prelude::*; use zerovec::ZeroVec; @@ -52,28 +52,20 @@ impl TryFrom = ZeroVec::clone_from_slice(&cpt_data.index); - let data: Result, 
T::TryFromU32Error> = if let Some(data_8) = cpt_data.data_8 { - data_8 - .iter() - .map(|i| T::try_from_u32(*i as u32)) - .collect() - } else if let Some(data_16) = cpt_data.data_16 { - data_16 - .iter() - .map(|i| T::try_from_u32(*i as u32)) - .collect() - } else if let Some(data_32) = cpt_data.data_32 { - data_32 - .iter() - .map(|i| T::try_from_u32(*i as u32)) - .collect() - } else { - return Err(DataError::new_resc_error( - icu_codepointtrie::error::Error::FromDeserialized { - reason: "Cannot deserialize data array for CodePointTrie in TOML", - }, - )); - }; + let data: Result, T::TryFromU32Error> = + if let Some(data_8) = cpt_data.data_8 { + data_8.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else if let Some(data_16) = cpt_data.data_16 { + data_16.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else if let Some(data_32) = cpt_data.data_32 { + data_32.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else { + return Err(DataError::new_resc_error( + icu_codepointtrie::error::Error::FromDeserialized { + reason: "Cannot deserialize data array for CodePointTrie in TOML", + }, + )); + }; let data = data.map_err(DataError::new_resc_error)?; let trie = @@ -116,8 +108,8 @@ impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> mod tests { use super::*; use icu_codepointtrie::codepointtrie::CodePointTrie; - use icu_properties::GeneralSubcategory; use icu_properties::provider::key; + use icu_properties::GeneralSubcategory; // A test of the UnicodeProperty General_Category is truly a test of the // `GeneralSubcategory` Rust enum, not the `GeneralCategory` Rust enum, @@ -129,19 +121,19 @@ mod tests { let provider = EnumeratedPropertyCodePointTrieProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyMapV1Marker> = provider - .load_payload(&DataRequest { - resource_path: ResourcePath { - key: key::GENERAL_CATEGORY_V1, - options: ResourceOptions::default(), - }, - }) - .expect("The data should be valid") - 
.take_payload() - .expect("Loading was successful"); + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: key::GENERAL_CATEGORY_V1, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be valid") + .take_payload() + .expect("Loading was successful"); let trie: &CodePointTrie = &payload.get().codepoint_trie; assert_eq!(trie.get('꣓' as u32), GeneralSubcategory::Digit); assert_eq!(trie.get('≈' as u32), GeneralSubcategory::MathSymbol); } -} \ No newline at end of file +} diff --git a/utils/codepointtrie/src/codepointtrie.rs b/utils/codepointtrie/src/codepointtrie.rs index e59138f7e67..c70fc2cc12a 100644 --- a/utils/codepointtrie/src/codepointtrie.rs +++ b/utils/codepointtrie/src/codepointtrie.rs @@ -7,10 +7,10 @@ use crate::impl_const::*; use core::convert::TryFrom; use core::fmt::Display; +use core::num::TryFromIntError; use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use core::num::TryFromIntError; use zerovec::ZeroVec; /// The type of trie represents whether the trie has an optimization that diff --git a/utils/codepointtrie/src/lib.rs b/utils/codepointtrie/src/lib.rs index 357d35fe833..18c104ff904 100644 --- a/utils/codepointtrie/src/lib.rs +++ b/utils/codepointtrie/src/lib.rs @@ -39,4 +39,3 @@ pub mod codepointtrie; pub mod error; mod impl_const; pub mod planes; -pub mod provider; diff --git a/utils/codepointtrie/src/provider.rs b/utils/codepointtrie/src/provider.rs deleted file mode 100644 index 1909da5c796..00000000000 --- a/utils/codepointtrie/src/provider.rs +++ /dev/null @@ -1,44 +0,0 @@ -// This file is part of ICU4X. For terms of use, please see the file -// called LICENSE at the top level of the ICU4X source tree -// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). - -//! Data provider struct definitions for this ICU4X component. -//! -//! 
Read more about data providers: [`icu_provider`] - -use crate::codepointtrie::{CodePointTrie, TrieValue}; -use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom}; - -/// A map efficiently storing data about individual characters. -#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)] -#[cfg_attr( - feature = "provider_serde", - derive(serde::Serialize, serde::Deserialize) -)] -pub struct UnicodePropertyMapV1<'data, T: TrieValue> { - /// A codepoint trie storing the data - #[cfg_attr(feature = "provider_serde", serde(borrow))] - pub codepoint_trie: CodePointTrie<'data, T>, -} - -impl<'data, T: TrieValue> Clone for UnicodePropertyMapV1<'data, T> -where - ::ULE: Clone, -{ - fn clone(&self) -> Self { - UnicodePropertyMapV1 { - codepoint_trie: self.codepoint_trie.clone(), - } - } -} - -/// Marker type for UnicodePropertyMapV1. -/// This is generated by hand because icu_provider::data_struct doesn't support generics yet. -pub struct UnicodePropertyMapV1Marker { - _phantom: core::marker::PhantomData, -} - -impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker { - type Yokeable = UnicodePropertyMapV1<'static, T>; - type Cart = UnicodePropertyMapV1<'data, T>; -} From 5b32a4d3f202811cdc7e0716a795a035af250f7e Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 14:08:42 -0700 Subject: [PATCH 19/28] Error message rewording --- provider/uprops/src/enum_codepointtrie.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 12b85c95a54..9dfc31275a5 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -62,7 +62,7 @@ impl TryFrom Date: Wed, 20 Oct 2021 14:19:10 -0700 Subject: [PATCH 20/28] Finish reverting unneeded renaming/refactoring in icu_properties --- provider/uprops/src/lib.rs | 4 ++-- provider/uprops/src/provider.rs | 12 ++++++------ 2 files changed, 8 insertions(+), 8 
deletions(-) diff --git a/provider/uprops/src/lib.rs b/provider/uprops/src/lib.rs index e5997cef97b..12749c30a3a 100644 --- a/provider/uprops/src/lib.rs +++ b/provider/uprops/src/lib.rs @@ -21,7 +21,7 @@ mod bin_uniset; mod enum_codepointtrie; mod enum_uniset; mod error; +mod provider; mod uprops_serde; -pub use bin_uniset::BinaryPropertyUnicodeSetDataProvider; -pub use enum_uniset::EnumeratedPropertyUnicodeSetDataProvider; +pub use provider::PropertiesDataProvider; diff --git a/provider/uprops/src/provider.rs b/provider/uprops/src/provider.rs index b54d2674ae5..35bc74325f5 100644 --- a/provider/uprops/src/provider.rs +++ b/provider/uprops/src/provider.rs @@ -2,8 +2,8 @@ // called LICENSE at the top level of the ICU4X source tree // (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). -use crate::binary::BinaryPropertiesDataProvider; -use crate::enumerated::EnumeratedPropertiesDataProvider; +use crate::bin_uniset::BinaryPropertyUnicodeSetDataProvider; +use crate::enum_uniset::EnumeratedPropertyUnicodeSetDataProvider; use icu_properties::provider::UnicodePropertyV1Marker; use icu_provider::iter::IterableDataProviderCore; use icu_provider::prelude::*; @@ -11,14 +11,14 @@ use icu_provider::prelude::*; use std::path::PathBuf; pub struct PropertiesDataProvider { - binary: BinaryPropertiesDataProvider, - enumerated: EnumeratedPropertiesDataProvider, + binary: BinaryPropertyUnicodeSetDataProvider, + enumerated: EnumeratedPropertyUnicodeSetDataProvider, } impl PropertiesDataProvider { pub fn new(root_dir: PathBuf) -> Self { - let binary = BinaryPropertiesDataProvider::new(root_dir.clone()); - let enumerated = EnumeratedPropertiesDataProvider::new(root_dir); + let binary = BinaryPropertyUnicodeSetDataProvider::new(root_dir.clone()); + let enumerated = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); Self { binary, enumerated } } } From 7139775b1b6ccb81f96827a3cbc9d53f48febc59 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 
14:35:43 -0700 Subject: [PATCH 21/28] Add docstrings for the uprops data providers --- provider/uprops/src/enum_codepointtrie.rs | 6 ++++++ provider/uprops/src/provider.rs | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 9dfc31275a5..9d97289836b 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -16,7 +16,13 @@ use core::convert::TryFrom; use std::fs; use std::path::PathBuf; +/// This data provider returns `CodePointTrie` data inside a +/// `UnicodePropertyMap` data struct. The source data is the same as that of +/// [crate::provider::PropertiesDataProvider], which is a TOML file of data +/// for the property(-ies) desired, as given by the ICU4C property data +/// exporter tool. pub struct EnumeratedPropertyCodePointTrieProvider { + /// Path to the root directory containing the property data TOML files. root_dir: PathBuf, } diff --git a/provider/uprops/src/provider.rs b/provider/uprops/src/provider.rs index 35bc74325f5..21430826e18 100644 --- a/provider/uprops/src/provider.rs +++ b/provider/uprops/src/provider.rs @@ -10,12 +10,18 @@ use icu_provider::prelude::*; use std::path::PathBuf; +/// This data provider returns `UnicodeSet` data inside a `UnicodeProperty` +/// data struct. The source data is in the form of a directory of TOML file(s) +/// of data for the property(-ies) desired, as given by the ICU4C property data +/// exporter tool. pub struct PropertiesDataProvider { binary: BinaryPropertyUnicodeSetDataProvider, enumerated: EnumeratedPropertyUnicodeSetDataProvider, } impl PropertiesDataProvider { + /// Construct a new data provider instance. `root_dir` is the path to the + /// root directory containing the property data TOML files. 
pub fn new(root_dir: PathBuf) -> Self { let binary = BinaryPropertyUnicodeSetDataProvider::new(root_dir.clone()); let enumerated = EnumeratedPropertyUnicodeSetDataProvider::new(root_dir); From fd0f74821d8cf64d62ec6721b70245a38a668b63 Mon Sep 17 00:00:00 2001 From: Elango Cheran Date: Wed, 20 Oct 2021 14:44:53 -0700 Subject: [PATCH 22/28] Add test for Script using data provider for CodePointTrie data --- provider/uprops/src/enum_codepointtrie.rs | 24 ++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs index 9d97289836b..a8cf5f2a6ad 100644 --- a/provider/uprops/src/enum_codepointtrie.rs +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -115,7 +115,7 @@ mod tests { use super::*; use icu_codepointtrie::codepointtrie::CodePointTrie; use icu_properties::provider::key; - use icu_properties::GeneralSubcategory; + use icu_properties::{GeneralSubcategory, Script}; // A test of the UnicodeProperty General_Category is truly a test of the // `GeneralSubcategory` Rust enum, not the `GeneralCategory` Rust enum, @@ -142,4 +142,26 @@ mod tests { assert_eq!(trie.get('꣓' as u32), GeneralSubcategory::Digit); assert_eq!(trie.get('≈' as u32), GeneralSubcategory::MathSymbol); } + + #[test] + fn test_script() { + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyCodePointTrieProvider::new(root_dir); + + let payload: DataPayload<'_, UnicodePropertyMapV1Marker