From 3ea4f805888821d2ca52d821a868806ae2fa8abb Mon Sep 17 00:00:00 2001 From: Elango Date: Wed, 20 Oct 2021 18:01:02 -0700 Subject: [PATCH] CodePointTrie data provider (#1167) --- Cargo.lock | 7 +- components/properties/Cargo.toml | 1 + components/properties/src/lib.rs | 3 +- components/properties/src/props.rs | 9 +- components/properties/src/provider.rs | 59 ++++++- components/properties/src/trievalue.rs | 29 +++ docs/tutorials/writing_a_new_data_struct.md | 2 +- provider/uprops/Cargo.toml | 2 + .../uprops/src/{binary.rs => bin_uniset.rs} | 14 +- provider/uprops/src/enum_codepointtrie.rs | 167 ++++++++++++++++++ .../src/{enumerated.rs => enum_uniset.rs} | 22 +-- provider/uprops/src/lib.rs | 6 +- provider/uprops/src/provider.rs | 18 +- provider/uprops/src/uprops_serde.rs | 33 +++- utils/codepointtrie/Cargo.toml | 5 +- utils/codepointtrie/src/codepointtrie.rs | 23 ++- utils/codepointtrie/src/error.rs | 6 +- utils/codepointtrie/src/lib.rs | 5 +- utils/codepointtrie/src/provider.rs | 44 ----- 19 files changed, 367 insertions(+), 88 deletions(-) create mode 100644 components/properties/src/trievalue.rs rename provider/uprops/src/{binary.rs => bin_uniset.rs} (88%) create mode 100644 provider/uprops/src/enum_codepointtrie.rs rename provider/uprops/src/{enumerated.rs => enum_uniset.rs} (93%) delete mode 100644 utils/codepointtrie/src/provider.rs diff --git a/Cargo.lock b/Cargo.lock index 7f2f593e260..a6e2e5ca52c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1073,11 +1073,11 @@ dependencies = [ name = "icu_codepointtrie" version = "0.2.0" dependencies = [ - "icu_provider", + "displaydoc", "postcard", "serde", - "thiserror", "toml", + "yoke", "zerovec", ] @@ -1219,6 +1219,7 @@ name = "icu_properties" version = "0.3.0" dependencies = [ "icu", + "icu_codepointtrie", "icu_provider", "icu_uniset", "num_enum", @@ -1327,12 +1328,14 @@ name = "icu_provider_uprops" version = "0.3.0" dependencies = [ "displaydoc", + "icu_codepointtrie", "icu_properties", "icu_provider", "icu_testdata", "icu_uniset", "serde", "toml", + "zerovec", ] [[package]] diff --git a/components/properties/Cargo.toml b/components/properties/Cargo.toml index 40c19b4662b..767cd07fd81 100644 --- a/components/properties/Cargo.toml +++ b/components/properties/Cargo.toml @@ -32,6 +32,7 @@ denylist = ["bench"] all-features = true [dependencies] +icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] } icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["serde"] } num_enum = { version = "0.5.4", default-features = false } diff --git a/components/properties/src/lib.rs b/components/properties/src/lib.rs index 951b019ff83..8695e175ae5 100644 --- a/components/properties/src/lib.rs +++ b/components/properties/src/lib.rs @@ -15,11 +15,12 @@ //! [`UnicodeSet`]: icu_uniset::UnicodeSet //! [`sets`]: crate::sets -#![no_std] +#![cfg_attr(not(any(test, feature = "std")), no_std)] mod props; pub mod provider; pub mod sets; +mod trievalue; mod ule; pub use props::*; diff --git a/components/properties/src/props.rs b/components/properties/src/props.rs index db0bb77f59f..4a27bdf2ae6 100644 --- a/components/properties/src/props.rs +++ b/components/properties/src/props.rs @@ -11,19 +11,22 @@ use num_enum::{TryFromPrimitive, UnsafeFromPrimitive}; /// See `UProperty` in ICU4C. #[derive(Clone, PartialEq, Debug)] #[non_exhaustive] +#[repr(i32)] pub enum EnumeratedProperty { /// The General Category property. GeneralCategory = 0x1005, /// The Script property. See [`Script`]. Script = 0x100A, /// The Script_Extensions property. See [`Script`]. - ScriptExtensions = 0x7000, + ScriptExtensions = 0x7000, // TODO(#1160) - this is a Miscellaneous property, not Enumerated + /// Represents an invalid or unknown Unicode property. + InvalidCode = -1, // TODO(#1160) - taken from ICU4C UProperty::UCHAR_INVALID_CODE } /// Enumerated Unicode general category types. /// GeneralSubcategory only supports specific subcategories (eg `UppercaseLetter`). /// It does not support grouped categories (eg `Letter`). For grouped categories, use [`GeneralCategory`]. -#[derive(Copy, Clone, PartialEq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] +#[derive(Copy, Clone, PartialEq, Eq, Debug, TryFromPrimitive, UnsafeFromPrimitive)] #[repr(u8)] pub enum GeneralSubcategory { /// A reserved unassigned code point or a noncharacter @@ -235,7 +238,7 @@ impl From for GeneralCategory { /// See UScriptCode in ICU4C. #[derive(Copy, Clone, Debug, Eq, PartialEq)] #[repr(transparent)] -pub struct Script(pub(crate) u16); +pub struct Script(pub u16); #[allow(missing_docs)] // These constants don't need individual documentation. #[allow(non_upper_case_globals)] diff --git a/components/properties/src/provider.rs b/components/properties/src/provider.rs index 7a4dc93a125..d4e2d5abaa8 100644 --- a/components/properties/src/provider.rs +++ b/components/properties/src/provider.rs @@ -6,6 +6,7 @@ //! //! Read more about data providers: [`icu_provider`] +use icu_codepointtrie::codepointtrie::{CodePointTrie, TrieValue}; use icu_provider::yoke::{self, *}; use icu_uniset::UnicodeSet; use icu_uniset::UnicodeSetBuilder; @@ -32,10 +33,10 @@ pub mod key { }; } - define_resource_keys!(265; + define_resource_keys!(267; // - // Binary properties + // Binary property UnicodeSets // (ASCII_HEX_DIGIT_V1, "AHex"), @@ -105,7 +106,7 @@ pub mod key { (XID_START_V1, "XIDS"), // - // Enumerated properties + // Enumerated property prop=val UnicodeSets // // Note: The ResourceKey subcategory strings are determined from the @@ -312,9 +313,23 @@ pub mod key { (SCRIPT_YEZIDI_V1, "sc=Yezi"), (SCRIPT_YI_V1, "sc=Yiii"), (SCRIPT_ZANABAZAR_SQUARE_V1, "sc=Zanb"), + + // + // Enumerated property CodePointMaps + // + + // ResourceKey subcategory string is the short alias of the property + + (GENERAL_CATEGORY_V1, "gc"), + (SCRIPT_V1, "sc"), + ); } +// +// UnicodeProperty +// + /// A set of characters with a particular property. #[icu_provider::data_struct] #[derive(Debug, Eq, PartialEq, Clone)] @@ -349,3 +364,41 @@ impl<'data> From> for UnicodeSet<'data> { prop.inv_list } } + +// +// UnicodePropertyMap +// + +/// A map efficiently storing data about individual characters. +#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)] +#[cfg_attr( + feature = "provider_serde", + derive(serde::Serialize, serde::Deserialize) +)] +pub struct UnicodePropertyMapV1<'data, T: TrieValue> { + /// A codepoint trie storing the data + #[cfg_attr(feature = "provider_serde", serde(borrow))] + pub codepoint_trie: CodePointTrie<'data, T>, +} + +impl<'data, T: TrieValue> Clone for UnicodePropertyMapV1<'data, T> +where + ::ULE: Clone, +{ + fn clone(&self) -> Self { + UnicodePropertyMapV1 { + codepoint_trie: self.codepoint_trie.clone(), + } + } +} + +/// Marker type for UnicodePropertyMapV1. +/// This is generated by hand because icu_provider::data_struct doesn't support generics yet. +pub struct UnicodePropertyMapV1Marker { + _phantom: core::marker::PhantomData, +} + +impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker { + type Yokeable = UnicodePropertyMapV1<'static, T>; + type Cart = UnicodePropertyMapV1<'data, T>; +} diff --git a/components/properties/src/trievalue.rs b/components/properties/src/trievalue.rs new file mode 100644 index 00000000000..941a4d2cc0c --- /dev/null +++ b/components/properties/src/trievalue.rs @@ -0,0 +1,29 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::{GeneralSubcategory, Script}; +use core::convert::TryInto; +use core::num::TryFromIntError; +use icu_codepointtrie::codepointtrie::TrieValue; +use num_enum::TryFromPrimitiveError; + +use core::convert::TryFrom; + +impl TrieValue for GeneralSubcategory { + const DATA_GET_ERROR_VALUE: GeneralSubcategory = GeneralSubcategory::Unassigned; + type TryFromU32Error = TryFromPrimitiveError; + fn try_from_u32(i: u32) -> Result { + // If the u32 is out of range, fall back to u8::MAX, which is out of range of the GeneralSubcategory enum. + GeneralSubcategory::try_from(i.try_into().unwrap_or(u8::MAX)) + } +} + +impl TrieValue for Script { + const DATA_GET_ERROR_VALUE: Script = Script::Unknown; + type TryFromU32Error = TryFromIntError; + + fn try_from_u32(i: u32) -> Result { + u16::try_from(i).map(Script) + } +} diff --git a/docs/tutorials/writing_a_new_data_struct.md b/docs/tutorials/writing_a_new_data_struct.md index 438a1646ee0..dd09362a584 100644 --- a/docs/tutorials/writing_a_new_data_struct.md +++ b/docs/tutorials/writing_a_new_data_struct.md @@ -60,7 +60,7 @@ Examples of source data providers include: - [`PluralsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.PluralsProvider.html) - [`DateSymbolsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.DateSymbolsProvider.html) - [… more examples](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/index.html) -- `BinaryPropertiesDataProvider` +- `BinaryPropertyUnicodeSetDataProvider` - [`HelloWorldProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider/hello_world/struct.HelloWorldProvider.html) Source data providers must implement the following traits: diff --git a/provider/uprops/Cargo.toml b/provider/uprops/Cargo.toml index d51b2f20ce4..8e25dd1290f 100644 --- a/provider/uprops/Cargo.toml +++ b/provider/uprops/Cargo.toml @@ -28,11 +28,13 @@ all-features = true [dependencies] displaydoc = { version = "0.2.3", default-features = false } +icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["provider_serde"] } icu_properties = { version = "0.3", path = "../../components/properties", features = ["provider_serde"] } icu_provider = { version = "0.3", path = "../../provider/core", features = ["provider_serde"] } icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["provider_serde"] } serde = { version = "1.0", features = ["derive"] } toml = { version = "0.5" } +zerovec = { version = "0.4", path = "../../utils/zerovec", features = ["serde", "yoke"] } [dev-dependencies] icu_testdata = { version = "0.3", path = "../../provider/testdata" } diff --git a/provider/uprops/src/binary.rs b/provider/uprops/src/bin_uniset.rs similarity index 88% rename from provider/uprops/src/binary.rs rename to provider/uprops/src/bin_uniset.rs index 2150b77c591..952df2799b7 100644 --- a/provider/uprops/src/binary.rs +++ b/provider/uprops/src/bin_uniset.rs @@ -12,14 +12,14 @@ use icu_uniset::UnicodeSetBuilder; use std::fs; use std::path::PathBuf; -pub struct BinaryPropertiesDataProvider { +pub struct BinaryPropertyUnicodeSetDataProvider { root_dir: PathBuf, } /// A data provider reading from .toml files produced by the ICU4C icuwriteuprops tool. -impl BinaryPropertiesDataProvider { +impl BinaryPropertyUnicodeSetDataProvider { pub fn new(root_dir: PathBuf) -> Self { - BinaryPropertiesDataProvider { root_dir } + BinaryPropertyUnicodeSetDataProvider { root_dir } } fn get_toml_data(&self, name: &str) -> Result { let mut path: PathBuf = self.root_dir.clone().join(name); @@ -29,7 +29,7 @@ impl BinaryPropertiesDataProvider { } } -impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDataProvider { +impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertyUnicodeSetDataProvider { fn load_payload( &self, req: &DataRequest, @@ -55,11 +55,11 @@ impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDat } } -icu_provider::impl_dyn_provider!(BinaryPropertiesDataProvider, { +icu_provider::impl_dyn_provider!(BinaryPropertyUnicodeSetDataProvider, { _ => UnicodePropertyV1Marker, }, SERDE_SE, 'data); -impl IterableDataProviderCore for BinaryPropertiesDataProvider { +impl IterableDataProviderCore for BinaryPropertyUnicodeSetDataProvider { fn supported_options_for_key( &self, _resc_key: &ResourceKey, @@ -76,7 +76,7 @@ fn test_basic() { use std::convert::TryInto; let root_dir = icu_testdata::paths::data_root().join("uprops"); - let provider = BinaryPropertiesDataProvider::new(root_dir); + let provider = BinaryPropertyUnicodeSetDataProvider::new(root_dir); let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider .load_payload(&DataRequest { diff --git a/provider/uprops/src/enum_codepointtrie.rs b/provider/uprops/src/enum_codepointtrie.rs new file mode 100644 index 00000000000..a8cf5f2a6ad --- /dev/null +++ b/provider/uprops/src/enum_codepointtrie.rs @@ -0,0 +1,167 @@ +// This file is part of ICU4X. For terms of use, please see the file +// called LICENSE at the top level of the ICU4X source tree +// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ). + +use crate::error::Error; +use crate::uprops_serde; +use crate::uprops_serde::enumerated::EnumeratedPropertyCodePointTrie; + +use icu_codepointtrie::codepointtrie::{CodePointTrie, CodePointTrieHeader, TrieType, TrieValue}; +use icu_properties::provider::{UnicodePropertyMapV1, UnicodePropertyMapV1Marker}; +use icu_provider::prelude::*; +use zerovec::ZeroVec; + +use core::convert::TryFrom; + +use std::fs; +use std::path::PathBuf; + +/// This data provider returns `CodePointTrie` data inside a +/// `UnicodePropertyMap` data struct. The source data is the same as that of +/// [crate::provider::PropertiesDataProvider], which is a TOML file of data +/// for the property(-ies) desired, as given by the ICU4C property data +/// exporter tool. +pub struct EnumeratedPropertyCodePointTrieProvider { + /// Path to the root directory containing the property data TOML files. + root_dir: PathBuf, +} + +impl EnumeratedPropertyCodePointTrieProvider { + pub fn new(root_dir: PathBuf) -> Self { + EnumeratedPropertyCodePointTrieProvider { root_dir } + } + + fn get_toml_data(&self, name: &str) -> Result { + let mut path: PathBuf = self.root_dir.clone().join(name); + path.set_extension("toml"); + let toml_str = fs::read_to_string(&path).map_err(|e| Error::Io(e, path.clone()))?; + toml::from_str(&toml_str).map_err(|e| Error::Toml(e, path)) + } +} + +impl TryFrom + for UnicodePropertyMapV1<'static, T> +{ + type Error = DataError; + + fn try_from( + cpt_data: EnumeratedPropertyCodePointTrie, + ) -> Result, DataError> { + let trie_type_enum: TrieType = + TrieType::try_from(cpt_data.trie_type_enum_val).map_err(DataError::new_resc_error)?; + let header = CodePointTrieHeader { + high_start: cpt_data.high_start, + shifted12_high_start: cpt_data.shifted12_high_start, + index3_null_offset: cpt_data.index3_null_offset, + data_null_offset: cpt_data.data_null_offset, + null_value: cpt_data.null_value, + trie_type: trie_type_enum, + }; + let index: ZeroVec = ZeroVec::clone_from_slice(&cpt_data.index); + let data: Result, T::TryFromU32Error> = + if let Some(data_8) = cpt_data.data_8 { + data_8.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else if let Some(data_16) = cpt_data.data_16 { + data_16.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else if let Some(data_32) = cpt_data.data_32 { + data_32.iter().map(|i| T::try_from_u32(*i as u32)).collect() + } else { + return Err(DataError::new_resc_error( + icu_codepointtrie::error::Error::FromDeserialized { + reason: "Did not find data array for CodePointTrie in TOML", + }, + )); + }; + + let data = data.map_err(DataError::new_resc_error)?; + let trie = + CodePointTrie::::try_new(header, index, data).map_err(DataError::new_resc_error); + trie.map(|t| UnicodePropertyMapV1 { codepoint_trie: t }) + } +} + +impl<'data, T: TrieValue> DataProvider<'data, UnicodePropertyMapV1Marker> + for EnumeratedPropertyCodePointTrieProvider +{ + fn load_payload( + &self, + req: &DataRequest, + ) -> Result>, DataError> { + // For data resource keys that represent the CodePointTrie data for an enumerated + // property, the ResourceKey sub-category string will just be the short alias + // for the property. + let prop_name = &req.resource_path.key.sub_category; + + let toml_data: uprops_serde::enumerated::Main = self + .get_toml_data(prop_name) + .map_err(DataError::new_resc_error)?; + + let source_cpt_data: uprops_serde::enumerated::EnumeratedPropertyCodePointTrie = + toml_data.enum_property.data.code_point_trie; + + let data_struct = UnicodePropertyMapV1::::try_from(source_cpt_data)?; + + Ok(DataResponse { + metadata: DataResponseMetadata { + data_langid: req.resource_path.options.langid.clone(), + }, + payload: Some(DataPayload::from_owned(data_struct)), + }) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use icu_codepointtrie::codepointtrie::CodePointTrie; + use icu_properties::provider::key; + use icu_properties::{GeneralSubcategory, Script}; + + // A test of the UnicodeProperty General_Category is truly a test of the + // `GeneralSubcategory` Rust enum, not the `GeneralCategory` Rust enum, + // since we must match the representation and value width of the data from + // the ICU CodePointTrie that ICU4X is reading from. + #[test] + fn test_general_category() { + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyCodePointTrieProvider::new(root_dir); + + let payload: DataPayload<'_, UnicodePropertyMapV1Marker> = provider + .load_payload(&DataRequest { + resource_path: ResourcePath { + key: key::GENERAL_CATEGORY_V1, + options: ResourceOptions::default(), + }, + }) + .expect("The data should be valid") + .take_payload() + .expect("Loading was successful"); + + let trie: &CodePointTrie = &payload.get().codepoint_trie; + + assert_eq!(trie.get('꣓' as u32), GeneralSubcategory::Digit); + assert_eq!(trie.get('≈' as u32), GeneralSubcategory::MathSymbol); + } + + #[test] + fn test_script() { + let root_dir = icu_testdata::paths::data_root().join("uprops"); + let provider = EnumeratedPropertyCodePointTrieProvider::new(root_dir); + + let payload: DataPayload<'_, UnicodePropertyMapV1Marker