Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds UnicodePropertyMapV1 data struct for enumerated properties #1161

Merged
merged 9 commits into from
Oct 18, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 6 additions & 1 deletion utils/codepointtrie/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ denylist = ["bench"]
all-features = true

[dependencies]
icu_provider = { version = "0.3", path = "../../provider/core", features = ["macros"] }
serde = { version = "1.0", default-features = false, features = ["derive", "alloc"], optional = true }
thiserror = "1.0"
zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"] }
zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde", "yoke"] }

[dev-dependencies]
postcard = { version = "0.7", features = ["alloc"] }
Expand All @@ -45,3 +46,7 @@ zerovec = { version = "0.3", path = "../../utils/zerovec", features = ["serde"]
[lib]
bench = false # This option is required for Benchmark CI
path = "src/lib.rs"

[features]
default = ["provider_serde"]
provider_serde = ["serde"]
112 changes: 48 additions & 64 deletions utils/codepointtrie/src/codepointtrie.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,17 @@ use crate::error::Error;
use crate::impl_const::*;

use core::convert::TryFrom;
use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom};
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use zerovec::ZeroVec;

// Enums

/// The width of the elements in the data array of a [`CodePointTrie`].
/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
#[derive(Clone, Copy, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum ValueWidthEnum {
Bits16 = 0,
Bits32 = 1,
Bits8 = 2,
}

/// The type of trie represents whether the trie has an optimization that
/// would make it small or fast.
/// See [`UCPTrieType`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
#[derive(Clone, Copy, PartialEq)]
#[derive(Clone, Copy, PartialEq, Debug, Eq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum TrieTypeEnum {
pub enum TrieType {
/// Represents the "fast" type of code point trie.
/// The "fast max" limit is set to `0xffff`.
Fast = 0,
Expand All @@ -36,51 +25,29 @@ pub enum TrieTypeEnum {
Small = 1,
}

// ValueWidth trait
// TrieValue trait

// AsULE is AsUnalignedLittleEndian, i.e. "allowed in a zerovec"

/// A trait representing the width of the values stored in the data array of a
/// [`CodePointTrie`]. This trait is used as a type parameter in constructing
/// a `CodePointTrie`.
pub trait ValueWidth: Copy + zerovec::ule::AsULE + 'static {
/// This enum variant represents the specific instance of `ValueWidth` such
/// that the enum discriminant values matches ICU4C's enum integer value.
const ENUM_VALUE: ValueWidthEnum;
/// This value is used to indicate an error in the Rust code in accessing
/// a position in the trie's `data` array. In normal cases, the position in
/// the `data` array will return either the correct value, or in case of a
/// logical error in the trie's computation, the trie's own error value
/// which is stored that in the `data` array.
/// A trait representing the values stored in the data array of a [`CodePointTrie`].
/// This trait is used as a type parameter in constructing a `CodePointTrie`.
pub trait TrieValue: Copy + Eq + PartialEq + zerovec::ule::AsULE + 'static {
/// Last-resort fallback value to return if we cannot read data from the trie.
///
/// In most cases, the error value is read from the last element of the `data` array.
const DATA_GET_ERROR_VALUE: Self;
fn cast_to_widest(self) -> u32;
}

impl ValueWidth for u8 {
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits8;
impl TrieValue for u8 {
const DATA_GET_ERROR_VALUE: u8 = u8::MAX;

fn cast_to_widest(self) -> u32 {
self as u32
}
}

impl ValueWidth for u16 {
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits16;
impl TrieValue for u16 {
const DATA_GET_ERROR_VALUE: u16 = u16::MAX;

fn cast_to_widest(self) -> u32 {
self as u32
}
}

impl ValueWidth for u32 {
const ENUM_VALUE: ValueWidthEnum = ValueWidthEnum::Bits32;
impl TrieValue for u32 {
const DATA_GET_ERROR_VALUE: u32 = u32::MAX;

fn cast_to_widest(self) -> u32 {
self
}
}

/// This struct represents a de-serialized CodePointTrie that was exported from
Expand All @@ -90,16 +57,18 @@ impl ValueWidth for u32 {
/// - [ICU Site design doc](http://site.icu-project.org/design/struct/utrie)
/// - [ICU User Guide section on Properties lookup](https://unicode-org.github.io/icu/userguide/strings/properties.html#lookup)
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub struct CodePointTrie<'trie, W: ValueWidth> {
#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
pub struct CodePointTrie<'trie, T: TrieValue> {
header: CodePointTrieHeader,
#[cfg_attr(feature = "serde", serde(borrow))]
index: ZeroVec<'trie, u16>,
#[cfg_attr(feature = "serde", serde(borrow))]
data: ZeroVec<'trie, W>,
data: ZeroVec<'trie, T>,
}

/// This struct contains the fixed-length header fields of a [`CodePointTrie`].
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[derive(Copy, Clone, Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
pub struct CodePointTrieHeader {
/// The code point of the start of the last range of the trie. A
/// range is defined as a partition of the code point space such that the
Expand Down Expand Up @@ -130,31 +99,31 @@ pub struct CodePointTrieHeader {
pub null_value: u32,
/// The enum value representing the type of trie, where trie type is as it
/// is defined in ICU (ex: Fast, Small).
pub trie_type: TrieTypeEnum,
pub trie_type: TrieType,
}

impl TryFrom<u8> for TrieTypeEnum {
impl TryFrom<u8> for TrieType {
type Error = crate::error::Error;

fn try_from(trie_type_int: u8) -> Result<TrieTypeEnum, crate::error::Error> {
fn try_from(trie_type_int: u8) -> Result<TrieType, crate::error::Error> {
match trie_type_int {
0 => Ok(TrieTypeEnum::Fast),
1 => Ok(TrieTypeEnum::Small),
0 => Ok(TrieType::Fast),
1 => Ok(TrieType::Small),
_ => Err(crate::error::Error::FromDeserialized {
reason: "Cannot parse value for trie_type",
}),
}
}
}

impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
impl<'trie, T: TrieValue> CodePointTrie<'trie, T> {
/// Returns a new [`CodePointTrie`] backed by borrowed data for the `index`
/// array and `data` array, whose data values have type `T`.
pub fn try_new(
header: CodePointTrieHeader,
index: ZeroVec<'trie, u16>,
data: ZeroVec<'trie, W>,
) -> Result<CodePointTrie<'trie, W>, Error> {
data: ZeroVec<'trie, T>,
) -> Result<CodePointTrie<'trie, T>, Error> {
// Validation invariants are not needed here when constructing a new
// `CodePointTrie` because:
//
Expand All @@ -167,7 +136,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
// - The `ZeroVec` serializer stores the length of the array along with the
// ZeroVec data, meaning that a deserializer would also see that length info.

let trie: CodePointTrie<'trie, W> = CodePointTrie {
let trie: CodePointTrie<'trie, T> = CodePointTrie {
header,
index,
data,
Expand All @@ -183,7 +152,7 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {

fn internal_small_index(&self, code_point: u32) -> u32 {
let mut index1_pos: u32 = code_point >> SHIFT_1;
if self.header.trie_type == TrieTypeEnum::Fast {
if self.header.trie_type == TrieType::Fast {
debug_assert!(
FAST_TYPE_FAST_INDEXING_MAX < code_point && code_point < self.header.high_start
);
Expand Down Expand Up @@ -290,14 +259,14 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
/// assert_eq!(0, trie.get(0x13E0)); // 'Ꮰ' as u32
/// assert_eq!(1, trie.get(0x10044)); // '𐁄' as u32
/// ```
pub fn get(&self, code_point: u32) -> W {
pub fn get(&self, code_point: u32) -> T {
// All code points up to the fast max limit are represented
// individually in the `index` array to hold their `data` array position, and
// thus only need 2 lookups for a [CodePointTrie::get()](`crate::codepointtrie::CodePointTrie::get`).
// Code points above the "fast max" limit require 4 lookups.
let fast_max = match self.header.trie_type {
TrieTypeEnum::Fast => FAST_TYPE_FAST_INDEXING_MAX,
TrieTypeEnum::Small => SMALL_TYPE_FAST_INDEXING_MAX,
TrieType::Fast => FAST_TYPE_FAST_INDEXING_MAX,
TrieType::Small => SMALL_TYPE_FAST_INDEXING_MAX,
};
let data_pos: u32 = if code_point <= fast_max {
Self::fast_index(self, code_point)
Expand All @@ -308,12 +277,14 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
};
// Returns the trie value (or trie's error value).
// If we cannot read from the data array, then return the associated constant
// DATA_GET_ERROR_VALUE for the instance type for W: ValueWidth.
// DATA_GET_ERROR_VALUE for the instance type for T: TrieValue.
self.data
.get(data_pos as usize)
.unwrap_or(W::DATA_GET_ERROR_VALUE)
.unwrap_or(T::DATA_GET_ERROR_VALUE)
}
}

impl<'trie, T: TrieValue + Into<u32>> CodePointTrie<'trie, T> {
/// Returns the value that is associated with `code_point` for this [`CodePointTrie`]
/// as a `u32`.
///
Expand All @@ -333,7 +304,20 @@ impl<'trie, W: ValueWidth> CodePointTrie<'trie, W> {
// Note: This API method maintains consistency with the corresponding
// original ICU APIs.
pub fn get_u32(&self, code_point: u32) -> u32 {
self.get(code_point).cast_to_widest()
self.get(code_point).into()
}
}

/// Clones the trie field by field: the header is copied (it is `Copy`), and the
/// `index` and `data` vectors are cloned through their own `Clone` impls.
impl<'trie, T> Clone for CodePointTrie<'trie, T>
where
    T: TrieValue,
    <T as zerovec::ule::AsULE>::ULE: Clone,
{
    fn clone(&self) -> Self {
        let header = self.header;
        let index = self.index.clone();
        let data = self.data.clone();
        Self {
            header,
            index,
            data,
        }
    }
}

Expand Down
1 change: 1 addition & 0 deletions utils/codepointtrie/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@ pub mod codepointtrie;
pub mod error;
mod impl_const;
pub mod planes;
pub mod provider;
2 changes: 1 addition & 1 deletion utils/codepointtrie/src/planes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ pub fn get_planes_trie() -> CodePointTrie<'static, u8> {
let index3_null_offset = 0x2;
let data_null_offset = 0x0;
let null_value = 0x0;
let trie_type = TrieTypeEnum::Small;
let trie_type = TrieType::Small;

let trie_header = CodePointTrieHeader {
high_start,
Expand Down
44 changes: 44 additions & 0 deletions utils/codepointtrie/src/provider.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
// This file is part of ICU4X. For terms of use, please see the file
// called LICENSE at the top level of the ICU4X source tree
// (online at: https://github.com/unicode-org/icu4x/blob/main/LICENSE ).

//! Data provider struct definitions for this ICU4X component.
//!
//! Read more about data providers: [`icu_provider`]

use crate::codepointtrie::{CodePointTrie, TrieValue};
use icu_provider::yoke::{self, Yokeable, ZeroCopyFrom};

/// A map efficiently storing data about individual characters.
#[derive(Debug, Eq, PartialEq, Yokeable, ZeroCopyFrom)]
#[cfg_attr(
feature = "provider_serde",
derive(serde::Serialize, serde::Deserialize)
)]
pub struct UnicodePropertyMapV1<'data, T: TrieValue> {
sffc marked this conversation as resolved.
Show resolved Hide resolved
/// A codepoint trie storing the data
#[cfg_attr(feature = "provider_serde", serde(borrow))]
pub codepoint_trie: CodePointTrie<'data, T>,
}

/// Clones the map by cloning the single code point trie it wraps.
impl<'data, T> Clone for UnicodePropertyMapV1<'data, T>
where
    T: TrieValue,
    <T as zerovec::ule::AsULE>::ULE: Clone,
{
    fn clone(&self) -> Self {
        let codepoint_trie = self.codepoint_trie.clone();
        Self { codepoint_trie }
    }
}

/// Marker type for [`UnicodePropertyMapV1`], generic over the trie value type `T`.
///
/// This is generated by hand because icu_provider::data_struct doesn't support generics yet.
pub struct UnicodePropertyMapV1Marker<T: TrieValue> {
// Zero-sized private field whose only job is to make the type parameter `T` "used";
// the marker itself stores no data.
_phantom: core::marker::PhantomData<T>,
}

// Ties the marker to its data struct: `Yokeable` names the `'static` form of
// `UnicodePropertyMapV1`, and `Cart` names the form that borrows for `'data`.
impl<'data, T: TrieValue> icu_provider::DataMarker<'data> for UnicodePropertyMapV1Marker<T> {
type Yokeable = UnicodePropertyMapV1<'static, T>;
type Cart = UnicodePropertyMapV1<'data, T>;
}
2 changes: 1 addition & 1 deletion utils/codepointtrie/tests/planes_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ fn planes_trie_deserialize_check_test() {

let code_point_trie_struct = planes_enum_prop.code_point_trie.trie_struct;

let trie_type_enum = match TrieTypeEnum::try_from(code_point_trie_struct.trie_type_enum_val) {
let trie_type_enum = match TrieType::try_from(code_point_trie_struct.trie_type_enum_val) {
Ok(enum_val) => enum_val,
_ => {
panic!(
Expand Down
16 changes: 14 additions & 2 deletions utils/codepointtrie/tests/test_util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,24 @@ use icu_codepointtrie::codepointtrie::*;
use icu_codepointtrie::error::Error;

use core::convert::TryFrom;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use std::fs::File;
use std::io::Read;
use std::path::Path;
use zerovec::ZeroVec;

pub fn check_trie<W: ValueWidth>(trie: &CodePointTrie<W>, check_ranges: &[u32]) {
/// The width of the elements in the data array of a [`CodePointTrie`].
/// See [`UCPTrieValueWidth`](https://unicode-org.github.io/icu-docs/apidoc/dev/icu4c/ucptrie_8h.html) in ICU4C.
///
/// NOTE(review): the explicit discriminants appear intended to match the integer
/// values of ICU4C's `UCPTrieValueWidth` — confirm against the ICU4C header.
#[derive(Clone, Copy, PartialEq)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
pub enum ValueWidthEnum {
/// Data array elements are 16 bits wide.
Bits16 = 0,
/// Data array elements are 32 bits wide.
Bits32 = 1,
/// Data array elements are 8 bits wide.
Bits8 = 2,
}

pub fn check_trie<T: TrieValue + Into<u32>>(trie: &CodePointTrie<T>, check_ranges: &[u32]) {
assert_eq!(
0,
check_ranges.len() % 2,
Expand Down Expand Up @@ -152,7 +164,7 @@ pub fn run_deserialize_test_from_test_data(test_file_path: &str) {
test_struct.name
);

let trie_type_enum = match TrieTypeEnum::try_from(test_struct.trie_type_enum_val) {
let trie_type_enum = match TrieType::try_from(test_struct.trie_type_enum_val) {
Ok(enum_val) => enum_val,
_ => {
panic!(
Expand Down