Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LocaleData parameter for word/sentence segmenter #5318

Merged
merged 12 commits into from
Sep 3, 2024
Merged
4 changes: 4 additions & 0 deletions components/segmenter/src/grapheme.rs
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,7 @@ impl GraphemeClusterSegmenter {
data: payload,
complex: None,
boundary_property: 0,
locale_override: None,
})
}

Expand All @@ -214,6 +215,7 @@ impl GraphemeClusterSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override: None,
})
}
/// Creates a grapheme cluster break iterator for a Latin-1 (8-bit) string.
Expand All @@ -231,6 +233,7 @@ impl GraphemeClusterSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override: None,
})
}

Expand All @@ -257,6 +260,7 @@ impl GraphemeClusterSegmenter {
data: payload,
complex: None,
boundary_property: 0,
locale_override: None,
})
}
}
Expand Down
2 changes: 2 additions & 0 deletions components/segmenter/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ pub use crate::word::WordSegmenter;
pub use crate::line::LineBreakOptions;
pub use crate::line::LineBreakStrictness;
pub use crate::line::LineBreakWordOption;
pub use crate::sentence::SentenceBreakOptions;
pub use crate::word::WordBreakOptions;
pub use crate::word::WordType;

// Typedefs
Expand Down
23 changes: 23 additions & 0 deletions components/segmenter/src/provider/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ const _: () = {
pub mod icu {
pub use crate as segmenter;
pub use icu_collections as collections;
pub use icu_segmenter_data::icu_locale as locale;
}
make_provider!(Baked);
impl_dictionary_for_word_only_auto_v1_marker!(Baked);
Expand All @@ -49,7 +50,9 @@ const _: () = {
impl_line_break_data_v2_marker!(Baked);
#[cfg(feature = "lstm")]
impl_lstm_for_word_line_auto_v1_marker!(Baked);
impl_sentence_break_data_override_v1_marker!(Baked);
impl_sentence_break_data_v2_marker!(Baked);
impl_word_break_data_override_v1_marker!(Baked);
impl_word_break_data_v2_marker!(Baked);
};

Expand All @@ -61,7 +64,9 @@ pub const MARKERS: &[DataMarkerInfo] = &[
GraphemeClusterBreakDataV2Marker::INFO,
LineBreakDataV2Marker::INFO,
LstmForWordLineAutoV1Marker::INFO,
SentenceBreakDataOverrideV1Marker::INFO,
SentenceBreakDataV2Marker::INFO,
WordBreakDataOverrideV1Marker::INFO,
WordBreakDataV2Marker::INFO,
];

Expand Down Expand Up @@ -154,6 +159,24 @@ impl DynamicDataMarker for UCharDictionaryBreakDataV1Marker {
type DataStruct = UCharDictionaryBreakDataV1<'static>;
}

/// Code point trie data holding the break property values that differ from
/// the default table for a specific locale.
///
/// It is exposed through two data markers, one for sentence break overrides
/// and one for word break overrides.
#[icu_provider::data_struct(
marker(SentenceBreakDataOverrideV1Marker, "segmenter/sentence/override@1",),
marker(WordBreakDataOverrideV1Marker, "segmenter/word/override@1")
)]
#[derive(Debug, PartialEq, Clone)]
#[cfg_attr(
feature = "datagen",
derive(serde::Serialize,databake::Bake),
databake(path = icu_segmenter::provider),
)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize))]
pub struct RuleBreakDataOverrideV1<'data> {
/// Property table entries that replace the default property values for a
/// specific locale.
///
/// The rule break iterator only applies an entry when it is non-zero; a
/// zero entry falls back to the default property table.
#[cfg_attr(feature = "serde", serde(borrow))]
pub property_table_override: CodePointTrie<'data, u8>,
}

#[derive(Clone, Copy, PartialEq, Debug)]
#[cfg_attr(
feature = "datagen",
Expand Down
9 changes: 9 additions & 0 deletions components/segmenter/src/rule_segmenter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ pub struct RuleBreakIterator<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> {
pub(crate) data: &'l RuleBreakDataV2<'l>,
pub(crate) complex: Option<&'l ComplexPayloads>,
pub(crate) boundary_property: u8,
pub(crate) locale_override: Option<&'l RuleBreakDataOverrideV1<'l>>,
}

impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> Iterator for RuleBreakIterator<'l, 's, Y> {
Expand Down Expand Up @@ -210,6 +211,14 @@ impl<'l, 's, Y: RuleBreakType<'l, 's> + ?Sized> RuleBreakIterator<'l, 's, Y> {

/// Looks up the break property value for `codepoint`.
///
/// A non-zero entry in the locale override table, when one is present, takes
/// precedence; otherwise the value comes from the main property table.
fn get_break_property(&self, codepoint: Y::CharType) -> u8 {
    // Note: Default value is 0 == UNKNOWN, so a zero override entry is
    // treated as "not overridden".
    let overridden = self
        .locale_override
        .map(|table| table.property_table_override.get32(codepoint.into()))
        .filter(|&property| property != 0);
    match overridden {
        Some(property) => property,
        None => self.data.property_table.get32(codepoint.into()),
    }
}

Expand Down
85 changes: 84 additions & 1 deletion components/segmenter/src/sentence.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@ use crate::provider::*;
use crate::rule_segmenter::*;
use utf8_iter::Utf8CharIndices;

/// Options to tailor sentence breaking behavior.
///
/// The default value leaves every option unset.
#[non_exhaustive]
#[derive(Clone, PartialEq, Eq, Debug, Default)]
pub struct SentenceBreakOptions {
/// Locale of the content to be segmented.
///
/// When this is `Some`, the options-taking constructor requests
/// locale-specific override data for it; when it is `None`, no override
/// data is requested.
pub content_locale: Option<DataLocale>,
}

/// Implements the [`Iterator`] trait over the sentence boundaries of the given string.
///
/// Lifetimes:
Expand Down Expand Up @@ -100,6 +108,7 @@ pub type SentenceBreakIteratorUtf16<'l, 's> = SentenceBreakIterator<'l, 's, Rule
#[derive(Debug)]
pub struct SentenceSegmenter {
// Base sentence break data; every iterator created by this segmenter reads it.
payload: DataPayload<SentenceBreakDataV2Marker>,
// Optional locale-specific override data. `None` when no content locale was
// requested or when the provider had no override data for that locale.
payload_locale_override: Option<DataPayload<SentenceBreakDataOverrideV1Marker>>,
}

#[cfg(feature = "compiled_data")]
Expand All @@ -121,6 +130,7 @@ impl SentenceSegmenter {
payload: DataPayload::from_static_ref(
crate::provider::Baked::SINGLETON_SENTENCE_BREAK_DATA_V2_MARKER,
),
payload_locale_override: None,
}
}

Expand All @@ -140,13 +150,70 @@ impl SentenceSegmenter {
D: DataProvider<SentenceBreakDataV2Marker> + ?Sized,
{
let payload = provider.load(Default::default())?.payload;
Ok(Self { payload })
Ok(Self {
payload,
payload_locale_override: None,
})
}

icu_provider::gen_any_buffer_data_constructors!(
(options: SentenceBreakOptions) -> error: DataError,
/// Constructs a [`SentenceSegmenter`] for the given options, using compiled data.
///
/// ✨ *Enabled with the `compiled_data` Cargo feature.*
///
/// [📚 Help choosing a constructor](icu_provider::constructors)
functions: [
try_new_with_options,
try_new_with_options_with_any_provider,
try_new_with_options_with_buffer_provider,
try_new_with_options_unstable,
Self
]
);

#[doc = icu_provider::gen_any_buffer_unstable_docs!(UNSTABLE, Self::try_new_with_options)]
pub fn try_new_with_options_unstable<D>(
provider: &D,
options: SentenceBreakOptions,
) -> Result<Self, DataError>
where
D: DataProvider<SentenceBreakDataV2Marker>
+ DataProvider<SentenceBreakDataOverrideV1Marker>
+ ?Sized,
{
let payload = provider.load(Default::default())?.payload;
let payload_locale_override = if let Some(locale) = options.content_locale {
let req = DataRequest {
id: DataIdentifierBorrowed::for_locale(&locale),
..Default::default()
};
match provider.load(req) {
Ok(response) => Ok(Some(response.payload)),
Err(DataError {
kind: DataErrorKind::IdentifierNotFound,
..
}) => Ok(None),
Err(e) => Err(e),
}
} else {
Ok(None)
};

Ok(Self {
payload,
payload_locale_override: payload_locale_override?,
})
}

/// Creates a sentence break iterator for an `str` (a UTF-8 string).
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_str<'l, 's>(&'l self, input: &'s str) -> SentenceBreakIteratorUtf8<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: input.char_indices(),
len: input.len(),
Expand All @@ -155,6 +222,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
/// Creates a sentence break iterator for a potentially ill-formed UTF8 string
Expand All @@ -166,6 +234,10 @@ impl SentenceSegmenter {
&'l self,
input: &'s [u8],
) -> SentenceBreakIteratorPotentiallyIllFormedUtf8<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Utf8CharIndices::new(input),
len: input.len(),
Expand All @@ -174,6 +246,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
/// Creates a sentence break iterator for a Latin-1 (8-bit) string.
Expand All @@ -183,6 +256,10 @@ impl SentenceSegmenter {
&'l self,
input: &'s [u8],
) -> SentenceBreakIteratorLatin1<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Latin1Indices::new(input),
len: input.len(),
Expand All @@ -191,13 +268,18 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}

/// Creates a sentence break iterator for a UTF-16 string.
///
/// There are always breakpoints at 0 and the string length, or only at 0 for the empty string.
pub fn segment_utf16<'l, 's>(&'l self, input: &'s [u16]) -> SentenceBreakIteratorUtf16<'l, 's> {
let locale_override = self
.payload_locale_override
.as_ref()
.map(|payload| payload.get());
SentenceBreakIterator(RuleBreakIterator {
iter: Utf16Indices::new(input),
len: input.len(),
Expand All @@ -206,6 +288,7 @@ impl SentenceSegmenter {
data: self.payload.get(),
complex: None,
boundary_property: 0,
locale_override,
})
}
}
Expand Down
Loading