diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 234dcec377..11b1e48d95 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) + # Cannot calculate eigenvectors if fewer than 3 nodes (distinct documents) remain in the graph. Warn the user to add more text and return None. + if len(graph.nodes()) < 3: + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + return + pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) diff --git a/gensim/test/test_data/testlowdistinctwords.txt b/gensim/test/test_data/testlowdistinctwords.txt new file mode 100644 index 0000000000..70e20fa3d3 --- /dev/null +++ b/gensim/test/test_data/testlowdistinctwords.txt @@ -0,0 +1,10 @@ +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index fde845dc93..70d2dcada4 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -160,6 +160,20 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) self.assertTrue(len(kwds_lst)) + def test_low_distinct_words_corpus_summarization_is_none(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f: + text = f.read() + + # Generate the corpus. 
+ sentences = text.split("\n") + tokens = [sentence.split() for sentence in sentences] + dictionary = Dictionary(tokens) + corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] + + self.assertTrue(summarize_corpus(corpus) is None) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 77c4212831..39e205002b 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -27,10 +27,11 @@ class TestWikiCorpus(unittest.TestCase): def setUp(self): wc = WikiCorpus(datapath(FILENAME)) - def test_get_texts_returns_generator_of_lists(self): + if sys.version_info < (2, 7, 0): return + wc = WikiCorpus(datapath(FILENAME)) l = wc.get_texts() self.assertEqual(type(l), types.GeneratorType)