Skip to content

Commit

Permalink
Full transliterator datagen (#5712)
Browse files Browse the repository at this point in the history
Follow-up issues filed in CLDR.
  • Loading branch information
robertbastian authored Oct 22, 2024
1 parent c676494 commit 74ae3a9
Show file tree
Hide file tree
Showing 35 changed files with 5,138 additions and 482 deletions.
3 changes: 2 additions & 1 deletion components/experimental/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ criterion = { workspace = true }

[dev-dependencies]
icu = { path = "../../components/icu", features = ["experimental"]}
icu_provider = { path = "../../provider/core", features = ["std"]}

icu_locale_data = { workspace = true }
icu_properties_data = { workspace = true }
Expand All @@ -72,7 +73,7 @@ default = ["compiled_data"]
compiled_data = ["dep:icu_experimental_data", "icu_decimal/compiled_data", "icu_list/compiled_data", "icu_plurals/compiled_data", "icu_properties/compiled_data", "icu_normalizer/compiled_data"]
datagen = ["serde", "std", "dep:databake", "zerovec/databake", "zerotrie/databake", "tinystr/databake", "icu_collections/databake", "std", "log", "icu_pattern/databake", "icu_plurals/datagen", "icu_pattern/alloc"]
ryu = ["fixed_decimal/ryu"]
serde = ["dep:serde", "zerovec/serde", "potential_utf/serde", "tinystr/serde", "icu_collections/serde", "icu_decimal/serde", "icu_list/serde", "icu_pattern/serde", "icu_plurals/serde", "icu_provider/serde", "zerotrie/serde"]
serde = ["dep:serde", "zerovec/serde", "potential_utf/serde", "tinystr/serde", "icu_collections/serde", "icu_decimal/serde", "icu_list/serde", "icu_pattern/serde", "icu_plurals/serde", "icu_provider/serde", "zerotrie/serde", "icu_normalizer/serde"]
std = ["fixed_decimal/std", "icu_decimal/std", "icu_pattern/std", "icu_plurals/std", "icu_provider/std", "icu_locale_core/std"]

bench = []
Expand Down
6 changes: 1 addition & 5 deletions components/experimental/benches/transliterate/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,6 @@ use icu_locale_core::Locale;

use icu_experimental::transliterate::Transliterator;

#[allow(clippy::single_component_path_imports)]
use icu_experimental;
include!("../../tests/transliterate/data/provider.rs");

struct BenchDataContent {
pub num: usize,
pub name: String,
Expand All @@ -36,7 +32,7 @@ fn bench_data_from_sources(locale_str: &str, source: &str) -> Vec<BenchDataConte
.map(|(idx, input)| BenchDataContent {
num: idx + 1,
name: locale_str.to_string(),
translit: Transliterator::try_new_unstable(locale.clone(), &TestingProvider).unwrap(),
translit: Transliterator::try_new(&locale).unwrap(),
input,
})
.collect()
Expand Down
3 changes: 3 additions & 0 deletions components/experimental/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ pub mod provider {
use icu_experimental_data::*;
// Local `icu` facade: re-exports this crate and its sibling crates under
// short `icu::<name>` aliases (`experimental`, `collections`, `locale`, `plurals`).
// NOTE(review): presumably the `impl_*_marker!(Baked)` macros invoked in this
// module resolve their type paths through `icu::…` — confirm against the
// generated baked data before renaming or removing any alias.
pub mod icu {
pub use crate as experimental;
pub use icu_collections as collections;
pub use icu_experimental_data::icu_locale as locale;
pub use icu_plurals as plurals;
}
Expand Down Expand Up @@ -85,6 +86,7 @@ pub mod provider {
impl_short_second_relative_time_format_data_v1_marker!(Baked);
impl_short_week_relative_time_format_data_v1_marker!(Baked);
impl_short_year_relative_time_format_data_v1_marker!(Baked);
impl_transliterator_rules_v1_marker!(Baked);
impl_units_info_v1_marker!(Baked);
impl_units_trie_v1_marker!(Baked);
};
Expand Down Expand Up @@ -139,6 +141,7 @@ pub mod provider {
super::relativetime::provider::ShortSecondRelativeTimeFormatDataV1Marker::INFO,
super::relativetime::provider::ShortWeekRelativeTimeFormatDataV1Marker::INFO,
super::relativetime::provider::ShortYearRelativeTimeFormatDataV1Marker::INFO,
super::transliterate::provider::TransliteratorRulesV1Marker::INFO,
super::units::provider::UnitsInfoV1Marker::INFO,
];
}
45 changes: 30 additions & 15 deletions components/experimental/src/transliterate/compile/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ impl Direction {
/// true,
/// );
///
/// let t = Transliterator::try_new_unstable("de-t-de-d0-ascii".parse().unwrap(), &collection.as_provider()).unwrap();
/// let t = Transliterator::try_new_unstable(&collection.as_provider(), &collection.as_provider(), &"de-t-de-d0-ascii".parse().unwrap()).unwrap();
/// assert_eq!(t.transliterate("Käse".into()), "Kaese");
#[allow(clippy::type_complexity)] // well
pub struct RuleCollection {
Expand Down Expand Up @@ -126,11 +126,22 @@ impl RuleCollection {
id: &icu_locale_core::Locale,
aliases: impl IntoIterator<Item = &'a str>,
) {
self.id_mapping.extend(
aliases
.into_iter()
.map(|alias| (alias.to_ascii_lowercase(), id.clone())),
)
for alias in aliases {
self.id_mapping
.entry(alias.to_ascii_lowercase())
.and_modify(|prev| {
if prev != id {
icu_provider::log::warn!(
"Duplicate entry for alias for {alias}: {prev}, {id}"
);
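// Stability: on conflict keep the ID whose string form sorts first, so the
// resulting mapping does not depend on the order aliases were registered in.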
if prev.to_string() > id.to_string() {
*prev = id.clone();
}
}
})
.or_insert(id.clone());
}
}

/// Returns a provider that is usable by [`Transliterator::try_new_unstable`](crate::transliterate::Transliterator::try_new_unstable).
Expand Down Expand Up @@ -256,6 +267,8 @@ where
+ DataProvider<AlphabeticV1Marker>
+ DataProvider<BidiControlV1Marker>
+ DataProvider<BidiMirroredV1Marker>
+ DataProvider<CanonicalCombiningClassV1Marker>
+ DataProvider<CanonicalCombiningClassNameToValueV2Marker>
+ DataProvider<CaseIgnorableV1Marker>
+ DataProvider<CasedV1Marker>
+ DataProvider<ChangesWhenCasefoldedV1Marker>
Expand Down Expand Up @@ -417,6 +430,8 @@ where
+ DataProvider<AlphabeticV1Marker>
+ DataProvider<BidiControlV1Marker>
+ DataProvider<BidiMirroredV1Marker>
+ DataProvider<CanonicalCombiningClassV1Marker>
+ DataProvider<CanonicalCombiningClassNameToValueV2Marker>
+ DataProvider<CaseIgnorableV1Marker>
+ DataProvider<CasedV1Marker>
+ DataProvider<ChangesWhenCasefoldedV1Marker>
Expand Down Expand Up @@ -604,7 +619,7 @@ mod tests {
use crate::transliterate::provider as ds;
use icu_locale_core::locale;
use std::collections::HashSet;
use zerovec::VarZeroVec;
use zerovec::{vecs::Index32, VarZeroVec};

fn parse_set(source: &str) -> super::parse::UnicodeSet {
crate::unicodeset_parse::parse_unstable(source, &icu_properties::provider::Baked)
Expand Down Expand Up @@ -680,7 +695,7 @@ mod tests {
}];
let expected_id_group2 = vec![ds::SimpleId {
filter: parse_set_cp(r"[\ ]"),
id: Cow::Borrowed("x-any-remove"),
id: Cow::Borrowed("any-remove"),
}];
let expected_id_group3 = vec![
ds::SimpleId {
Expand All @@ -689,7 +704,7 @@ mod tests {
},
ds::SimpleId {
filter: parse::FilterSet::all(),
id: Cow::Borrowed("x-any-nfc"),
id: Cow::Borrowed("any-nfc"),
},
];

Expand Down Expand Up @@ -726,7 +741,7 @@ mod tests {
replacer: Cow::Borrowed("splitsuprulegroups"),
}];

let expected_rule_group_list: Vec<VarZeroVec<'_, ds::RuleULE>> = vec![
let expected_rule_group_list: Vec<VarZeroVec<'_, ds::RuleULE, Index32>> = vec![
VarZeroVec::from(&expected_rule_group1),
VarZeroVec::from(&expected_rule_group2),
VarZeroVec::new(), // empty rule group after the last transform rule
Expand Down Expand Up @@ -767,8 +782,8 @@ mod tests {
assert_eq!(
forward.payload.get().deps().collect::<HashSet<_>>(),
HashSet::from_iter([
Cow::Borrowed("x-any-nfc"),
Cow::Borrowed("x-any-remove"),
Cow::Borrowed("any-nfc"),
Cow::Borrowed("any-remove"),
Cow::Borrowed("x-interindic-devanagari"),
Cow::Borrowed("x-latin-interindic"),
])
Expand All @@ -780,7 +795,7 @@ mod tests {
let expected_id_group1 = vec![
ds::SimpleId {
filter: parse::FilterSet::all(),
id: Cow::Borrowed("x-any-nfd"),
id: Cow::Borrowed("any-nfd"),
},
ds::SimpleId {
filter: parse::FilterSet::all(),
Expand Down Expand Up @@ -816,7 +831,7 @@ mod tests {
},
];

let expected_rule_group_list: Vec<VarZeroVec<'_, ds::RuleULE>> =
let expected_rule_group_list: Vec<VarZeroVec<'_, ds::RuleULE, Index32>> =
vec![VarZeroVec::from(&expected_rule_group1), VarZeroVec::new()];

let expected_compounds = vec![
Expand Down Expand Up @@ -873,7 +888,7 @@ mod tests {
reverse.payload.get().deps().collect::<HashSet<_>>(),
HashSet::from_iter([
Cow::Borrowed("und-t-d0-addrndsp-m0-fifty-s0-anyrev"),
Cow::Borrowed("x-any-nfd"),
Cow::Borrowed("any-nfd"),
Cow::Borrowed("x-any-revfncall"),
Cow::Borrowed("x-devanagari-interindic"),
Cow::Borrowed("x-interindic-latin"),
Expand Down
2 changes: 2 additions & 0 deletions components/experimental/src/transliterate/compile/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,8 @@ where
+ DataProvider<AlphabeticV1Marker>
+ DataProvider<BidiControlV1Marker>
+ DataProvider<BidiMirroredV1Marker>
+ DataProvider<CanonicalCombiningClassV1Marker>
+ DataProvider<CanonicalCombiningClassNameToValueV2Marker>
+ DataProvider<CaseIgnorableV1Marker>
+ DataProvider<CasedV1Marker>
+ DataProvider<ChangesWhenCasefoldedV1Marker>
Expand Down
33 changes: 26 additions & 7 deletions components/experimental/src/transliterate/compile/pass2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use alloc::string::ToString;
use core::fmt::{self, Display, Formatter};
use icu_collections::codepointinvlist::CodePointInversionList;
use icu_locale_core::Locale;
use zerovec::VarZeroVec;
use zerovec::{vecs::Index32, VarZeroVec};

type Result<T> = core::result::Result<T, CompileError>;

Expand Down Expand Up @@ -253,7 +253,8 @@ impl<'a, 'p> Pass2<'a, 'p> {
curr_segment: 0,
};
let mut compiled_transform_groups: Vec<VarZeroVec<'static, ds::SimpleIdULE>> = Vec::new();
let mut compiled_conversion_groups: Vec<VarZeroVec<'static, ds::RuleULE>> = Vec::new();
let mut compiled_conversion_groups: Vec<VarZeroVec<'static, ds::RuleULE, Index32>> =
Vec::new();

for (transform_group, conversion_group) in pass1.groups {
let compiled_transform_group: Vec<_> = transform_group
Expand Down Expand Up @@ -389,13 +390,31 @@ impl<'a, 'p> Pass2<'a, 'p> {
}

fn compile_single_id(&self, id: parse::SingleId) -> ds::SimpleId<'static> {
let mut unparsed = id.basic_id.to_string();
let string = if let Some(bcp47_id) = self.id_mapping.get(&unparsed) {
let unparsed = id.basic_id.to_string();

let string = if matches!(
unparsed.as_str(),
"any-nfc"
| "any-nfkc"
| "any-nfd"
| "any-nfkd"
| "any-null"
| "any-remove"
| "any-lower"
| "any-upper"
| "any-title"
| "any-hex/unicode"
| "any-hex/rust"
| "any-hex/xml"
| "any-hex/perl"
| "any-hex/plain"
) {
unparsed
} else if let Some(bcp47_id) = self.id_mapping.get(&unparsed) {
bcp47_id.to_string()
} else {
// Non-BCP47 ids get prefixed with `x-`.
unparsed.replace_range(0..0, "x-");
unparsed
icu_provider::log::warn!("Reference to unknown transliterator: {unparsed}");
format!("x-{unparsed}")
};
ds::SimpleId {
id: string.into(),
Expand Down
2 changes: 1 addition & 1 deletion components/experimental/src/transliterate/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ mod compile;
#[allow(clippy::indexing_slicing, clippy::unwrap_used)] // TODO(#3958): Remove.
mod transliterator;

pub use transliterator::*;
pub use transliterator::{CustomTransliterator, Transliterator};

pub use compile::RuleCollection;
pub use compile::RuleCollectionProvider;
19 changes: 10 additions & 9 deletions components/experimental/src/transliterate/provider.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use icu_collections::{
codepointinvliststringlist::CodePointInversionListAndStringListULE,
};
use icu_provider::prelude::*;
use vecs::Index32;
use zerovec::*;

// TODO(#3776): Improve the documentation of this datastruct.
Expand All @@ -45,7 +46,7 @@ pub struct RuleBasedTransliterator<'a> {
/// The list of transform rule groups this transliterator uses.
pub id_group_list: VarZeroVec<'a, VarZeroSlice<SimpleIdULE>>,
/// The list of conversion rule groups this transliterator uses.
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE>>,
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE, Index32>, Index32>,
}

#[cfg(feature = "serde")]
Expand All @@ -65,7 +66,7 @@ impl<'de> serde::Deserialize<'de> for RuleBasedTransliterator<'de> {
#[serde(borrow)]
pub id_group_list: VarZeroVec<'a, VarZeroSlice<SimpleIdULE>>,
#[serde(borrow)]
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE>>,
pub rule_group_list: VarZeroVec<'a, VarZeroSlice<RuleULE, Index32>, Index32>,
}

let Raw {
Expand Down Expand Up @@ -172,25 +173,25 @@ pub struct Rule<'a> {
pub struct VarTable<'a> {
/// Variable definitions.
#[cfg_attr(feature = "serde", serde(borrow))]
pub compounds: VarZeroVec<'a, str>,
pub compounds: VarZeroVec<'a, str, Index32>,
/// Zero or one quantifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
pub quantifiers_opt: VarZeroVec<'a, str>,
pub quantifiers_opt: VarZeroVec<'a, str, Index32>,
/// Zero or more quantifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
pub quantifiers_kleene: VarZeroVec<'a, str>,
pub quantifiers_kleene: VarZeroVec<'a, str, Index32>,
/// One or more quantifiers.
#[cfg_attr(feature = "serde", serde(borrow))]
pub quantifiers_kleene_plus: VarZeroVec<'a, str>,
pub quantifiers_kleene_plus: VarZeroVec<'a, str, Index32>,
/// Segments.
#[cfg_attr(feature = "serde", serde(borrow))]
pub segments: VarZeroVec<'a, SegmentULE>,
pub segments: VarZeroVec<'a, SegmentULE, Index32>,
/// UnicodeSets. These are represented as a [`CodePointInversionListAndStringList`](icu_collections::codepointinvliststringlist::CodePointInversionListAndStringList)
#[cfg_attr(feature = "serde", serde(borrow))]
pub unicode_sets: VarZeroVec<'a, CodePointInversionListAndStringListULE>,
pub unicode_sets: VarZeroVec<'a, CodePointInversionListAndStringListULE, Index32>,
/// Function calls.
#[cfg_attr(feature = "serde", serde(borrow))]
pub function_calls: VarZeroVec<'a, FunctionCallULE>,
pub function_calls: VarZeroVec<'a, FunctionCallULE, Index32>,
/// The maximum number of _left_ placeholders (`rest @@@ |`) in any rule.
pub max_left_placeholder_count: u16,
/// The maximum number of _right_ placeholders (`| @@@ rest`) in any rule.
Expand Down
Loading

0 comments on commit 74ae3a9

Please sign in to comment.