Skip to content

Commit

Permalink
CodePointTrie data provider (#1167)
Browse files Browse the repository at this point in the history
  • Loading branch information
echeran authored Oct 21, 2021
1 parent 2611e3a commit 3ea4f80
Show file tree
Hide file tree
Showing 19 changed files with 367 additions and 88 deletions.
7 changes: 5 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions components/properties/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ denylist = ["bench"]
all-features = true

[dependencies]
icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["serde"] }
icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] }
icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["serde"] }
num_enum = { version = "0.5.4", default-features = false }
Expand Down
3 changes: 2 additions & 1 deletion components/properties/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
//! [`UnicodeSet`]: icu_uniset::UnicodeSet
//! [`sets`]: crate::sets
#![no_std]
#![cfg_attr(not(any(test, feature = "std")), no_std)]

mod props;
pub mod provider;
pub mod sets;
mod trievalue;
mod ule;

pub use props::*;
9 changes: 6 additions & 3 deletions components/properties/src/props.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,22 @@ use num_enum::{TryFromPrimitive, UnsafeFromPrimitive};
/// See `UProperty` in ICU4C.
#[derive(Clone, PartialEq, Debug)]
#[non_exhaustive]
#[repr(i32)]
pub enum EnumeratedProperty {
/// The General Category property.
GeneralCategory = 0x1005,
/// The Script property. See [`Script`].
Script = 0x100A,
/// The Script_Extensions property. See [`Script`].
ScriptExtensions = 0x7000,
ScriptExtensions = 0x7000, // TODO(#1160) - this is a Miscellaneous property, not Enumerated
/// Represents an invalid or unknown Unicode property.
InvalidCode = -1, // TODO(#1160) - taken from ICU4C UProperty::UCHAR_INVALID_CODE
}

/// Enumerated Unicode general category types.
/// GeneralSubcategory only supports specific subcategories (eg `UppercaseLetter`).
/// It does not support grouped categories (eg `Letter`). For grouped categories, use [`GeneralCategory`].
#[derive(Copy, Clone, PartialEq, Debug, TryFromPrimitive, UnsafeFromPrimitive)]
#[derive(Copy, Clone, PartialEq, Eq, Debug, TryFromPrimitive, UnsafeFromPrimitive)]
#[repr(u8)]
pub enum GeneralSubcategory {
/// A reserved unassigned code point or a noncharacter
Expand Down Expand Up @@ -235,7 +238,7 @@ impl From<GeneralSubcategory> for GeneralCategory {
/// See UScriptCode in ICU4C.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
#[repr(transparent)]
pub struct Script(pub(crate) u16);
pub struct Script(pub u16);

#[allow(missing_docs)] // These constants don't need individual documentation.
#[allow(non_upper_case_globals)]
Expand Down
59 changes: 56 additions & 3 deletions components/properties/src/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
//!
//! Read more about data providers: [`icu_provider`]
use icu_codepointtrie::codepointtrie::{CodePointTrie, TrieValue};
use icu_provider::yoke::{self, *};
use icu_uniset::UnicodeSet;
use icu_uniset::UnicodeSetBuilder;
Expand All @@ -32,10 +33,10 @@ pub mod key {
};
}

define_resource_keys!(265;
define_resource_keys!(267;

//
// Binary properties
// Binary property UnicodeSets
//

(ASCII_HEX_DIGIT_V1, "AHex"),
Expand Down Expand Up @@ -105,7 +106,7 @@ pub mod key {
(XID_START_V1, "XIDS"),

//
// Enumerated properties
// Enumerated property prop=val UnicodeSets
//

// Note: The ResourceKey subcategory strings are determined from the
Expand Down Expand Up @@ -312,9 +313,23 @@ pub mod key {
(SCRIPT_YEZIDI_V1, "sc=Yezi"),
(SCRIPT_YI_V1, "sc=Yiii"),
(SCRIPT_ZANABAZAR_SQUARE_V1, "sc=Zanb"),

//
// Enumerated property CodePointMaps
//

// ResourceKey subcategory string is the short alias of the property

(GENERAL_CATEGORY_V1, "gc"),
(SCRIPT_V1, "sc"),

);
}

//
// UnicodeProperty
//

/// A set of characters with a particular property.
#[icu_provider::data_struct]
#[derive(Debug, Eq, PartialEq, Clone)]
Expand Down Expand Up @@ -349,3 +364,41 @@ impl<'data> From<UnicodePropertyV1<'data>> for UnicodeSet<'data> {
prop.inv_list
}
}

//
// UnicodePropertyMap
//

/// A map efficiently storing data about individual characters.
///
/// The struct is a thin wrapper around a single [`CodePointTrie`] whose values
/// have type `T`; `'data` is the lifetime parameter passed on to that trie.
///
/// `Clone` is not part of the derive list below; it is implemented by hand
/// separately with a bound on the ULE type of `T`.
#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
// Serde (de)serialization is only derived when the `provider_serde` feature is enabled.
#[cfg_attr(
feature = "provider_serde",
derive(serde::Serialize, serde::Deserialize)
)]
pub struct UnicodePropertyMapV1<'data, T: TrieValue> {
/// A codepoint trie storing the data
// With `provider_serde` enabled, the field is marked `serde(borrow)`.
#[cfg_attr(feature = "provider_serde", serde(borrow))]
pub codepoint_trie: CodePointTrie<'data, T>,
}

/// Hand-written `Clone` for [`UnicodePropertyMapV1`].
///
/// The implementation is available whenever the ULE type associated with `T`
/// is `Clone`; the bound is placed on that ULE type rather than on `T` itself.
impl<'data, T: TrieValue> Clone for UnicodePropertyMapV1<'data, T>
where
    <T as zerovec::ule::AsULE>::ULE: Clone,
{
    /// Returns a new map holding a clone of the wrapped code point trie.
    fn clone(&self) -> Self {
        let codepoint_trie = self.codepoint_trie.clone();
        Self { codepoint_trie }
    }
}

/// Marker type for UnicodePropertyMapV1.
/// This is generated by hand because icu_provider::data_struct doesn't support generics yet.
///
/// The marker stores no value of type `T`; `T` only appears through
/// [`core::marker::PhantomData`].
pub struct UnicodePropertyMapV1Marker<T: TrieValue> {
// Zero-sized field whose only role is to mention the type parameter `T`.
_phantom: core::marker::PhantomData<T>,
}

/// Ties [`UnicodePropertyMapV1Marker`] to the data struct it represents,
/// [`UnicodePropertyMapV1`], for every value type `T`.
impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker<T> {
// The data struct instantiated with the `'static` lifetime.
type Yokeable = UnicodePropertyMapV1<'static, T>;
// The same data struct instantiated with the `'data` lifetime.
type Cart = UnicodePropertyMapV1<'data, T>;
}
29 changes: 29 additions & 0 deletions components/properties/src/trievalue.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

use crate::{GeneralSubcategory, Script};
use core::convert::TryInto;
use core::num::TryFromIntError;
use icu_codepointtrie::codepointtrie::TrieValue;
use num_enum::TryFromPrimitiveError;

use core::convert::TryFrom;

impl TrieValue for GeneralSubcategory {
const DATA_GET_ERROR_VALUE: GeneralSubcategory = GeneralSubcategory::Unassigned;
type TryFromU32Error = TryFromPrimitiveError<Self>;
fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
// If the u32 is out of range, fall back to u8::MAX, which is out of range of the GeneralSubcategory enum.
GeneralSubcategory::try_from(i.try_into().unwrap_or(u8::MAX))
}
}

impl TrieValue for Script {
const DATA_GET_ERROR_VALUE: Script = Script::Unknown;
type TryFromU32Error = TryFromIntError;

fn try_from_u32(i: u32) -> Result<Self, Self::TryFromU32Error> {
u16::try_from(i).map(Script)
}
}
2 changes: 1 addition & 1 deletion docs/tutorials/writing_a_new_data_struct.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ Examples of source data providers include:
- [`PluralsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.PluralsProvider.html)
- [`DateSymbolsProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/struct.DateSymbolsProvider.html)
- [&hellip; more examples](https://unicode-org.github.io/icu4x-docs/doc/icu_provider_cldr/transform/index.html)
- `BinaryPropertiesDataProvider`
- `BinaryPropertyUnicodeSetDataProvider`
- [`HelloWorldProvider`](https://unicode-org.github.io/icu4x-docs/doc/icu_provider/hello_world/struct.HelloWorldProvider.html)

Source data providers must implement the following traits:
Expand Down
2 changes: 2 additions & 0 deletions provider/uprops/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,13 @@ all-features = true

[dependencies]
displaydoc = { version = "0.2.3", default-features = false }
icu_codepointtrie = { version = "0.2", path = "../../utils/codepointtrie", features = ["provider_serde"] }
icu_properties = { version = "0.3", path = "../../components/properties", features = ["provider_serde"] }
icu_provider = { version = "0.3", path = "../../provider/core", features = ["provider_serde"] }
icu_uniset = { version = "0.3", path = "../../utils/uniset", features = ["provider_serde"] }
serde = { version = "1.0", features = ["derive"] }
toml = { version = "0.5" }
zerovec = { version = "0.4", path = "../../utils/zerovec", features = ["serde", "yoke"] }

[dev-dependencies]
icu_testdata = { version = "0.3", path = "../../provider/testdata" }
Original file line number Diff line number Diff line change
Expand Up @@ -12,14 +12,14 @@ use icu_uniset::UnicodeSetBuilder;
use std::fs;
use std::path::PathBuf;

pub struct BinaryPropertiesDataProvider {
pub struct BinaryPropertyUnicodeSetDataProvider {
root_dir: PathBuf,
}

/// A data provider reading from .toml files produced by the ICU4C icuwriteuprops tool.
impl BinaryPropertiesDataProvider {
impl BinaryPropertyUnicodeSetDataProvider {
pub fn new(root_dir: PathBuf) -> Self {
BinaryPropertiesDataProvider { root_dir }
BinaryPropertyUnicodeSetDataProvider { root_dir }
}
fn get_toml_data(&self, name: &str) -> Result<uprops_serde::binary::Main, Error> {
let mut path: PathBuf = self.root_dir.clone().join(name);
Expand All @@ -29,7 +29,7 @@ impl BinaryPropertiesDataProvider {
}
}

impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDataProvider {
impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertyUnicodeSetDataProvider {
fn load_payload(
&self,
req: &DataRequest,
Expand All @@ -55,11 +55,11 @@ impl<'data> DataProvider<'data, UnicodePropertyV1Marker> for BinaryPropertiesDat
}
}

icu_provider::impl_dyn_provider!(BinaryPropertiesDataProvider, {
icu_provider::impl_dyn_provider!(BinaryPropertyUnicodeSetDataProvider, {
_ => UnicodePropertyV1Marker,
}, SERDE_SE, 'data);

impl IterableDataProviderCore for BinaryPropertiesDataProvider {
impl IterableDataProviderCore for BinaryPropertyUnicodeSetDataProvider {
fn supported_options_for_key(
&self,
_resc_key: &ResourceKey,
Expand All @@ -76,7 +76,7 @@ fn test_basic() {
use std::convert::TryInto;

let root_dir = icu_testdata::paths::data_root().join("uprops");
let provider = BinaryPropertiesDataProvider::new(root_dir);
let provider = BinaryPropertyUnicodeSetDataProvider::new(root_dir);

let payload: DataPayload<'_, UnicodePropertyV1Marker> = provider
.load_payload(&DataRequest {
Expand Down
Loading

0 comments on commit 3ea4f80

Please sign in to comment.