explosion · honnibal · Mar 23, 2019 · Mar 23, 2019 · Mar 23, 2019 · Mar 23, 2019
diff --git a/netlify.toml b/netlify.toml
@@ -43,8 +43,9 @@ redirects = [
     {from = "/usage/lightning-tour", to = "/usage/spacy-101#lightning-tour"},
     {from = "/usage/linguistic-features#rule-based-matching", to = "/usage/rule-based-matching"},
     {from = "/models/comparison", to = "/models"},
-    {from = "/api/#section-cython", to = "/api/cython"},
-    {from = "/api/#cython", to = "/api/cython"},
+    {from = "/api/#section-cython", to = "/api/cython", force = true},
+    {from = "/api/#cython", to = "/api/cython", force = true},
+    {from = "/api/sentencesegmenter", to="/api/sentencizer"},
     {from = "/universe", to = "/universe/project/:id", query = {id = ":id"}, force = true},
     {from = "/universe", to = "/universe/category/:category", query = {category = ":category"}, force = true},
 ]
diff --git a/spacy/language.py b/spacy/language.py
@@ -15,7 +15,7 @@
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .pipeline import DependencyParser, Tensorizer, Tagger, EntityRecognizer
-from .pipeline import SimilarityHook, TextCategorizer, SentenceSegmenter
+from .pipeline import SimilarityHook, TextCategorizer, Sentencizer
 from .pipeline import merge_noun_chunks, merge_entities, merge_subtokens
 from .pipeline import EntityRuler
 from .compat import izip, basestring_
@@ -119,7 +119,7 @@ class Language(object):
         "ner": lambda nlp, **cfg: EntityRecognizer(nlp.vocab, **cfg),
         "similarity": lambda nlp, **cfg: SimilarityHook(nlp.vocab, **cfg),
         "textcat": lambda nlp, **cfg: TextCategorizer(nlp.vocab, **cfg),
-        "sentencizer": lambda nlp, **cfg: SentenceSegmenter(nlp.vocab, **cfg),
+        "sentencizer": lambda nlp, **cfg: Sentencizer(**cfg),
         "merge_noun_chunks": lambda nlp, **cfg: merge_noun_chunks,
         "merge_entities": lambda nlp, **cfg: merge_entities,
         "merge_subtokens": lambda nlp, **cfg: merge_subtokens,

diff --git a/spacy/pipeline/__init__.py b/spacy/pipeline/__init__.py
@@ -2,7 +2,7 @@
 from __future__ import unicode_literals
 
 from .pipes import Tagger, DependencyParser, EntityRecognizer
-from .pipes import TextCategorizer, Tensorizer, Pipe
+from .pipes import TextCategorizer, Tensorizer, Pipe, Sentencizer
 from .entityruler import EntityRuler
 from .hooks import SentenceSegmenter, SimilarityHook
 from .functions import merge_entities, merge_noun_chunks, merge_subtokens
@@ -15,6 +15,7 @@
     "Tensorizer",
     "Pipe",
     "EntityRuler",
+    "Sentencizer",
     "SentenceSegmenter",
     "SimilarityHook",
     "merge_entities",

diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
@@ -191,7 +191,7 @@ def to_disk(self, path, **kwargs):
         **kwargs: Other config paramters, mostly for consistency.
         RETURNS (EntityRuler): The loaded entity ruler.
 
-        DOCS: https://spacy.io/api/entityruler
+        DOCS: https://spacy.io/api/entityruler#to_disk
         """
         path = ensure_path(path)
         path = path.with_suffix(".jsonl")

diff --git a/spacy/pipeline/hooks.py b/spacy/pipeline/hooks.py
@@ -15,8 +15,6 @@ class SentenceSegmenter(object):
     initialization, or assign a new strategy to the .strategy attribute.
     Sentence detection strategies should be generators that take `Doc` objects
     and yield `Span` objects for each sentence.
-
-    DOCS: https://spacy.io/api/sentencesegmenter
     """
 
     name = "sentencizer"
@@ -35,12 +33,12 @@ def __call__(self, doc):
     def split_on_punct(doc):
         start = 0
         seen_period = False
-        for i, word in enumerate(doc):
-            if seen_period and not word.is_punct:
-                yield doc[start : word.i]
-                start = word.i
+        for i, token in enumerate(doc):
+            if seen_period and not token.is_punct:
+                yield doc[start : token.i]
+                start = token.i
                 seen_period = False
-            elif word.text in [".", "!", "?"]:
+            elif token.text in [".", "!", "?"]:
                 seen_period = True
         if start < len(doc):
             yield doc[start : len(doc)]

diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
@@ -1058,4 +1058,90 @@ cdef class EntityRecognizer(Parser):
                 if move[0] in ("B", "I", "L", "U")))
 
 
-__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer"]
+class Sentencizer(object):
+    """Segment the Doc into sentences using a rule-based strategy.
+
+    DOCS: https://spacy.io/api/sentencizer
+    """
+
+    name = "sentencizer"
+    default_punct_chars = [".", "!", "?"]
+
+    def __init__(self, punct_chars=None, **kwargs):
+        """Initialize the sentencizer.
+
+        punct_chars (list): Punctuation characters to split on. Will be
+            serialized with the nlp object.
+        RETURNS (Sentencizer): The sentencizer component.
+
+        DOCS: https://spacy.io/api/sentencizer#init
+        """
+        # Copy the class default so in-place edits stay local to this instance.
+        self.punct_chars = punct_chars or list(self.default_punct_chars)
+
+    def __call__(self, doc):
+        """Apply the sentencizer to a Doc and set Token.is_sent_start.
+
+        doc (Doc): The document to process.
+        RETURNS (Doc): The processed Doc.
+
+        DOCS: https://spacy.io/api/sentencizer#call
+        """
+        start = 0
+        seen_period = False
+        for i, token in enumerate(doc):
+            is_in_punct_chars = token.text in self.punct_chars
+            token.is_sent_start = i == 0
+            # Boundary found: confirm the pending start, then track the new one.
+            if seen_period and not token.is_punct and not is_in_punct_chars:
+                doc[start].is_sent_start = True
+                start = token.i
+                seen_period = False
+            elif is_in_punct_chars:
+                seen_period = True
+        # Flag the last pending sentence start; the guard skips empty docs.
+        if start < len(doc):
+            doc[start].is_sent_start = True
+        return doc
+
+    def to_bytes(self, **kwargs):
+        """Serialize the sentencizer to a bytestring.
+
+        RETURNS (bytes): The serialized object.
+
+        DOCS: https://spacy.io/api/sentencizer#to_bytes
+        """
+        return srsly.msgpack_dumps({"punct_chars": self.punct_chars})
+
+    def from_bytes(self, bytes_data, **kwargs):
+        """Load the sentencizer from a bytestring.
+
+        bytes_data (bytes): The data to load.
+        RETURNS (Sentencizer): The loaded object.
+
+        DOCS: https://spacy.io/api/sentencizer#from_bytes
+        """
+        cfg = srsly.msgpack_loads(bytes_data)
+        self.punct_chars = cfg.get("punct_chars", list(self.default_punct_chars))
+        return self
+
+    def to_disk(self, path, exclude=tuple(), **kwargs):
+        """Serialize the sentencizer to disk.
+
+        DOCS: https://spacy.io/api/sentencizer#to_disk
+        """
+        path = util.ensure_path(path).with_suffix(".json")
+        srsly.write_json(path, {"punct_chars": self.punct_chars})
+
+    def from_disk(self, path, exclude=tuple(), **kwargs):
+        """Load the sentencizer from disk.
+
+        DOCS: https://spacy.io/api/sentencizer#from_disk
+        """
+        path = util.ensure_path(path).with_suffix(".json")
+        cfg = srsly.read_json(path)
+        self.punct_chars = cfg.get("punct_chars", list(self.default_punct_chars))
+        return self
+
+
+__all__ = ["Tagger", "DependencyParser", "EntityRecognizer", "Tensorizer", "TextCategorizer", "Sentencizer"]
diff --git a/spacy/tests/pipeline/test_sentencizer.py b/spacy/tests/pipeline/test_sentencizer.py
@@ -0,0 +1,87 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+import pytest
+from spacy.pipeline import Sentencizer
+from spacy.tokens import Doc
+
+
+def test_sentencizer(en_vocab):
+    """Default punctuation yields two sentences, starting at tokens 0 and 2."""
+    words = ["Hello", "!", "This", "is", "a", "test", "."]
+    processed = Sentencizer()(Doc(en_vocab, words=words))
+    assert processed.is_sentenced
+    flags = [token.is_sent_start for token in processed]
+    assert flags == [True, False, True, False, False, False, False]
+    assert len(list(processed.sents)) == 2
+
+
+@pytest.mark.parametrize(
+    "words,sent_starts,n_sents",
+    [
+        # Consecutive terminators stay attached to the sentence they close, so
+        # none of them ends up as a one-token sentence of its own.
+        (
+            ["Hello", "!", ".", "Test", ".", ".", "ok"],
+            [True, False, False, True, False, False, True],
+            3,
+        ),
+        # Inverted marks (¡ and ¿) are punctuation, but they must not be
+        # treated as sentence terminators.
+        (
+            ["¡", "Buen", "día", "!", "Hola", ",", "¿", "qué", "tal", "?"],
+            [True, False, False, False, True, False, False, False, False, False],
+            2,
+        ),
+        # Thanks to the Token.is_punct check, a closing quote stays with its sentence
+        (
+            ['"', "Nice", "!", '"', "I", "am", "happy", "."],
+            [True, False, False, False, True, False, False, False],
+            2,
+        ),
+    ],
+)
+def test_sentencizer_complex(en_vocab, words, sent_starts, n_sents):
+    """Check boundary flags and sentence counts for tricky punctuation."""
+    processed = Sentencizer()(Doc(en_vocab, words=words))
+    assert processed.is_sentenced
+    flags = [token.is_sent_start for token in processed]
+    assert flags == sent_starts
+    assert len(list(processed.sents)) == n_sents
+
+
+@pytest.mark.parametrize(
+    "punct_chars,words,sent_starts,n_sents",
+    [
+        (
+            ["~", "?"],
+            ["Hello", "world", "~", "A", ".", "B", "."],
+            [True, False, False, True, False, False, False],
+            2,
+        ),
+        # Although uncommon, punct_chars should accept arbitrary token texts,
+        # not only punctuation.
+        (
+            [".", "ö"],
+            ["Hello", ".", "Test", "ö", "Ok", "."],
+            [True, False, True, False, True, False],
+            3,
+        ),
+    ],
+)
+def test_sentencizer_custom_punct(en_vocab, punct_chars, words, sent_starts, n_sents):
+    """Check that user-supplied punct_chars drive the sentence boundaries."""
+    splitter = Sentencizer(punct_chars=punct_chars)
+    processed = splitter(Doc(en_vocab, words=words))
+    assert processed.is_sentenced
+    assert [token.is_sent_start for token in processed] == sent_starts
+    assert len(list(processed.sents)) == n_sents
+
+
+def test_sentencizer_serialize_bytes(en_vocab):
+    """Custom punct_chars must survive a to_bytes/from_bytes round trip."""
+    expected = [".", "~", "+"]
+    original = Sentencizer(punct_chars=expected)
+    assert original.punct_chars == expected
+    restored = Sentencizer().from_bytes(original.to_bytes())
+    assert restored.punct_chars == expected
diff --git a/spacy/tests/regression/test_issue3468.py b/spacy/tests/regression/test_issue3468.py
@@ -6,10 +6,9 @@
 from spacy.tokens import Doc
 
 
-@pytest.mark.xfail
 def test_issue3468():
-    """Test that sentence boundaries are serialized if they're not set by the
-    dependency parser."""
+    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
+    be restored after serialization."""
     nlp = English()
     nlp.add_pipe(nlp.create_pipe("sentencizer"))
     doc = nlp("Hello world")

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
@@ -230,7 +230,7 @@ cdef class Doc:
         defined as having at least one of the following:
 
         a) An entry "sents" in doc.user_hooks";
-        b) sent.is_parsed is set to True;
+        b) Doc.is_parsed is set to True;
         c) At least one token other than the first where sent_start is not None.
         """
         if "sents" in self.user_hooks:

diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
@@ -441,6 +441,7 @@ cdef class Token:
 
     property sent_start:
         def __get__(self):
+            """Deprecated: use Token.is_sent_start instead."""
             # Raising a deprecation warning here causes errors for autocomplete
             # Handle broken backwards compatibility case: doc[0].sent_start
             # was False.

diff --git a/website/docs/api/sentencesegmenter.md b/website/docs/api/sentencesegmenter.md