Skip to content

Commit

Permalink
Merge pull request #24 from tomcumming/master
Browse files Browse the repository at this point in the history
Unicode sentence boundaries
  • Loading branch information
Manishearth committed May 15, 2019
2 parents 8ca8e23 + 9c7abf2 commit c7a6b6f
Show file tree
Hide file tree
Showing 8 changed files with 1,757 additions and 2 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
keywords = ["text", "unicode", "grapheme", "word", "boundary"]
readme = "README.md"
description = """
This crate provides Grapheme Cluster and Word boundaries
This crate provides Grapheme Cluster, Word and Sentence boundaries
according to Unicode Standard Annex #29 rules.
"""

Expand Down
7 changes: 7 additions & 0 deletions scripts/unicode.py
Original file line number Diff line number Diff line change
Expand Up @@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
word_table.sort(key=lambda w: w[0])
emit_break_module(rf, word_table, word_cats.keys(), "word")

# Build and emit the sentence-break table.
# `sentence_cats` is indexed by category below and yields (x, y) pairs per
# category -- presumably code point range bounds; confirm against
# load_properties.
sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
sentence_table = []
# Flatten every category's pairs into (x, y, category) triples.
for cat in sentence_cats:
sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
# Order the triples by their first element only.
sentence_table.sort(key=lambda w: w[0])
emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
15 changes: 15 additions & 0 deletions scripts/unicode_gen_breaktests.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,23 @@ def create_words_data(f):
f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)

def create_sentence_data(f):
    """Write the `TEST_SENTENCE` table to `f`.

    The entries come from `load_test_data("auxiliary/SentenceBreakTest.txt")`.
    Only the first element of each loaded pair is used: it is treated as a
    sequence of segments, each itself a sequence of items. Every emitted
    entry pairs the flattened items (in order) with the original segments.
    """
    loaded = load_test_data("auxiliary/SentenceBreakTest.txt")

    # (all items of all segments, the segments themselves); the second
    # element of each loaded pair is ignored.
    test = [
        ([item for segment in segments for item in segment], segments)
        for (segments, _) in loaded
    ]

    # Rust type of the emitted table.
    wtype = "&'static [(&'static str, &'static [&'static str])]"
    f.write(" // official Unicode test data\n")
    f.write(" // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)

# Script entry point: (re)write testdata.rs with the shared preamble followed
# by the output of the grapheme, word and sentence generators, in that order.
if __name__ == "__main__":
with open("testdata.rs", "w") as rf:
rf.write(unicode.preamble)
create_grapheme_data(rf)
create_words_data(rf)
create_sentence_data(rf)
40 changes: 39 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
//! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
//!
//! ```rust
Expand Down Expand Up @@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
pub use tables::UNICODE_VERSION;
pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};

mod grapheme;
mod tables;
mod word;
mod sentence;

#[cfg(test)]
mod test;
Expand Down Expand Up @@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
/// assert_eq!(&swi1[..], b);
/// ```
fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// Here, "sentences" are just those substrings which, after splitting on
/// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
/// substring must contain at least one character with the
/// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
/// property, or with
/// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;

/// Returns an iterator over substrings of `self` separated on
/// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
///
/// The concatenation of the substrings returned by this function is just the original string.
fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;

/// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
/// and their offsets. See `split_sentence_bounds()` for more information.
fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
}

impl UnicodeSegmentation for str {
Expand Down Expand Up @@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
fn split_word_bound_indices(&self) -> UWordBoundIndices {
word::new_word_bound_indices(self)
}

// Thin wrapper: construction is delegated to `sentence::new_unicode_sentences`.
#[inline]
fn unicode_sentences(&self) -> UnicodeSentences {
sentence::new_unicode_sentences(self)
}

// Thin wrapper: construction is delegated to `sentence::new_sentence_bounds`.
#[inline]
fn split_sentence_bounds(&self) -> USentenceBounds {
sentence::new_sentence_bounds(self)
}

// Thin wrapper: construction is delegated to `sentence::new_sentence_bound_indices`.
#[inline]
fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
sentence::new_sentence_bound_indices(self)
}
}
Loading

0 comments on commit c7a6b6f

Please sign in to comment.