Update docstring for gensim.summarization.summarize. Fix piskvorky#1575 (piskvorky#1702)
VaiyeBe · Nov 26, 2017 · acb3825 · acb3825
1 parent 4de55b9
commit acb3825
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 7 deletions.
diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py
@@ -173,14 +173,18 @@ def summarize_corpus(corpus, ratio=0.2):
 def summarize(text, ratio=0.2, word_count=None, split=False):
     """
     Returns a summarized version of the given text using a variation of
-    the TextRank algorithm.
-    The input must be longer than INPUT_MIN_LENGTH sentences for the
-    summary to make sense and must be given as a string.
+    the TextRank algorithm (see https://arxiv.org/abs/1602.03606).
 
     The output summary will consist of the most representative sentences
-    and will also be returned as a string, divided by newlines. If the
-    split parameter is set to True, a list of sentences will be
-    returned.
+    and will be returned as a string, divided by newlines.
+    If the split parameter is set to True, a list of sentences will be
+    returned instead.
+
+    The input should be a string, and must be longer than
+    INPUT_MIN_LENGTH sentences for the summary to make sense. The text
+    will be split into sentences using the split_sentences method in the
+    summarization.textcleaner module.
+    Note that newlines divide sentences.
 
     The length of the output can be specified using the ratio and
     word_count parameters:

diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py
@@ -22,7 +22,7 @@
 
 
 SEPARATOR = r'@'
-RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)  # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)
+RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)
 AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)
 AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
 AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)