Merge pull request #24 from tomcumming/master

Unicode sentence boundaries
unicode-rs · May 15, 2019 · c7a6b6f · c7a6b6f
2 parents 8ca8e23 + 9c7abf2
commit c7a6b6f
Show file tree

Hide file tree

Showing 8 changed files with 1,757 additions and 2 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -12,7 +12,7 @@ license = "MIT/Apache-2.0"
 keywords = ["text", "unicode", "grapheme", "word", "boundary"]
 readme = "README.md"
 description = """
-This crate provides Grapheme Cluster and Word boundaries
+This crate provides Grapheme Cluster, Word and Sentence boundaries
 according to Unicode Standard Annex #29 rules.
 """
 

diff --git a/scripts/unicode.py b/scripts/unicode.py
@@ -351,3 +351,10 @@ def emit_break_module(f, break_table, break_cats, name):
             word_table.extend([(x, y, cat) for (x, y) in word_cats[cat]])
         word_table.sort(key=lambda w: w[0])
         emit_break_module(rf, word_table, word_cats.keys(), "word")
+
+        sentence_cats = load_properties("auxiliary/SentenceBreakProperty.txt", [])
+        sentence_table = []
+        for cat in sentence_cats:
+            sentence_table.extend([(x, y, cat) for (x, y) in sentence_cats[cat]])
+        sentence_table.sort(key=lambda w: w[0])
+        emit_break_module(rf, sentence_table, sentence_cats.keys(), "sentence")
diff --git a/scripts/unicode_gen_breaktests.py b/scripts/unicode_gen_breaktests.py
@@ -190,8 +190,23 @@ def create_words_data(f):
     f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/WordBreakTest.txt\n")
     unicode.emit_table(f, "TEST_WORD", test, wtype, True, showfun, True)
 
+def create_sentence_data(f):
+    d = load_test_data("auxiliary/SentenceBreakTest.txt")
+
+    test = []
+
+    for (c, i) in d:
+        allchars = [cn for s in c for cn in s]
+        test.append((allchars, c))
+
+    wtype = "&'static [(&'static str, &'static [&'static str])]"
+    f.write("    // official Unicode test data\n")
+    f.write("    // http://www.unicode.org/Public/UNIDATA/auxiliary/SentenceBreakTest.txt\n")
+    unicode.emit_table(f, "TEST_SENTENCE", test, wtype, True, showfun, True)
+
 if __name__ == "__main__":
     with open("testdata.rs", "w") as rf:
         rf.write(unicode.preamble)
         create_grapheme_data(rf)
         create_words_data(rf)
+        create_sentence_data(rf)
diff --git a/src/lib.rs b/src/lib.rs
@@ -8,7 +8,7 @@
 // option. This file may not be copied, modified, or distributed
 // except according to those terms.
 
-//! Iterators which split strings on Grapheme Cluster or Word boundaries, according
+//! Iterators which split strings on Grapheme Cluster, Word or Sentence boundaries, according
 //! to the [Unicode Standard Annex #29](http://www.unicode.org/reports/tr29/) rules.
 //!
 //! ```rust
@@ -67,10 +67,12 @@ pub use grapheme::{Graphemes, GraphemeIndices};
 pub use grapheme::{GraphemeCursor, GraphemeIncomplete};
 pub use tables::UNICODE_VERSION;
 pub use word::{UWordBounds, UWordBoundIndices, UnicodeWords};
+pub use sentence::{USentenceBounds, USentenceBoundIndices, UnicodeSentences};
 
 mod grapheme;
 mod tables;
 mod word;
+mod sentence;
 
 #[cfg(test)]
 mod test;
@@ -174,6 +176,27 @@ pub trait UnicodeSegmentation {
     /// assert_eq!(&swi1[..], b);
     /// ```
     fn split_word_bound_indices<'a>(&'a self) -> UWordBoundIndices<'a>;
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+    ///
+    /// Here, "sentences" are just those substrings which, after splitting on
+    /// UAX#29 sentence boundaries, contain any alphanumeric characters. That is, the
+    /// substring must contain at least one character with the
+    /// [Alphabetic](http://unicode.org/reports/tr44/#Alphabetic)
+    /// property, or with
+    /// [General_Category=Number](http://unicode.org/reports/tr44/#General_Category_Values).
+    fn unicode_sentences<'a>(&'a self) -> UnicodeSentences<'a>;
+
+    /// Returns an iterator over substrings of `self` separated on
+    /// [UAX#29 sentence boundaries](http://www.unicode.org/reports/tr29/#Sentence_Boundaries).
+    ///
+    /// The concatenation of the substrings returned by this function is just the original string.
+    fn split_sentence_bounds<'a>(&'a self) -> USentenceBounds<'a>;
+
+    /// Returns an iterator over substrings of `self`, split on UAX#29 sentence boundaries,
+    /// and their offsets. See `split_sentence_bounds()` for more information.
+    fn split_sentence_bound_indices<'a>(&'a self) -> USentenceBoundIndices<'a>;
 }
 
 impl UnicodeSegmentation for str {
@@ -201,4 +224,19 @@ impl UnicodeSegmentation for str {
     fn split_word_bound_indices(&self) -> UWordBoundIndices {
         word::new_word_bound_indices(self)
     }
+
+    #[inline]
+    fn unicode_sentences(&self) -> UnicodeSentences {
+        sentence::new_unicode_sentences(self)
+    }
+
+    #[inline]
+    fn split_sentence_bounds(&self) -> USentenceBounds {
+        sentence::new_sentence_bounds(self)
+    }
+
+    #[inline]
+    fn split_sentence_bound_indices(&self) -> USentenceBoundIndices {
+        sentence::new_sentence_bound_indices(self)
+    }
 }