diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index d187330a58..2e2d4ed45e 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -173,14 +173,18 @@ def summarize_corpus(corpus, ratio=0.2): def summarize(text, ratio=0.2, word_count=None, split=False): """ Returns a summarized version of the given text using a variation of - the TextRank algorithm. - The input must be longer than INPUT_MIN_LENGTH sentences for the - summary to make sense and must be given as a string. + the TextRank algorithm (see https://arxiv.org/abs/1602.03606). The output summary will consist of the most representative sentences - and will also be returned as a string, divided by newlines. If the - split parameter is set to True, a list of sentences will be - returned. + and will be returned as a string, divided by newlines. + If the split parameter is set to True, a list of sentences will be + returned instead. + + The input should be a string, and must be longer than + INPUT_MIN_LENGTH sentences for the summary to make sense. The text + will be split into sentences using the split_sentences method in the + summarization.textcleaner module. + Note that newlines divide sentences. The length of the output can be specified using the ratio and word_count parameters: diff --git a/gensim/summarization/textcleaner.py b/gensim/summarization/textcleaner.py index 5f33bbcea9..4829d9f892 100644 --- a/gensim/summarization/textcleaner.py +++ b/gensim/summarization/textcleaner.py @@ -22,7 +22,7 @@ SEPARATOR = r'@' -RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) # backup (\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$) +RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE) AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE) AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE) AB_ACRONYM_LETTERS = re.compile(r'([a-zA-Z])\.([a-zA-Z])\.', re.UNICODE)