From 0157ab885a11a06d3a4cfb862bd27fe931b5aaa0 Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Sun, 25 Sep 2016 23:57:07 +0530 Subject: [PATCH 1/6] Added check in summarize_corpus to fix bug in summarizer --- gensim/summarization/summarizer.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 234dcec377..3d0ae91656 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,6 +158,11 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) + # If the number of nodes < 3, the function ends. + if len(graph.nodes) < 3: + logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") + return + pagerank_scores = _pagerank(graph) hashable_corpus.sort(key=lambda doc: pagerank_scores.get(doc, 0), reverse=True) From d74dc16dcf2e3c0c17b1e396dc57990ad750685e Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Mon, 26 Sep 2016 00:38:03 +0530 Subject: [PATCH 2/6] Fix #805: Added check in summarizing text --- gensim/summarization/summarizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 3d0ae91656..703f2b8115 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -159,7 +159,7 @@ def summarize_corpus(corpus, ratio=0.2): _remove_unreachable_nodes(graph) # If the number of nodes < 3, the function ends. - if len(graph.nodes) < 3: + if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return From f24533bb8e681dbe5c1c4df29f221a6dab8cf9fe Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Wed, 28 Sep 2016 08:37:22 +0530 Subject: [PATCH 3/6] Added test for checking low number of distinct words in text --- gensim/summarization/summarizer.py | 2 +- gensim/test/test_data/testlowdistinctwords.txt | 10 ++++++++++ gensim/test/test_summarization.py | 14 ++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) create mode 100644 gensim/test/test_data/testlowdistinctwords.txt diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index 703f2b8115..11b1e48d95 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -158,7 +158,7 @@ def summarize_corpus(corpus, ratio=0.2): _set_graph_edge_weights(graph) _remove_unreachable_nodes(graph) - # If the number of nodes < 3, the function ends. + # Cannot calculate eigenvectors if number of unique words in text < 3. Warns user to add more text. The function ends. if len(graph.nodes()) < 3: logger.warning("Please add more sentences to the text. The number of reachable nodes is below 3") return diff --git a/gensim/test/test_data/testlowdistinctwords.txt b/gensim/test/test_data/testlowdistinctwords.txt new file mode 100644 index 0000000000..70e20fa3d3 --- /dev/null +++ b/gensim/test/test_data/testlowdistinctwords.txt @@ -0,0 +1,10 @@ +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. +here here. +there there. \ No newline at end of file diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index fde845dc93..0535e09e32 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -160,6 +160,20 @@ def test_keywords_runs(self): kwds_lst = keywords(text, split=True) self.assertTrue(len(kwds_lst)) + def test_low_distinct_words_corpus_summarization_is_none(self): + pre_path = os.path.join(os.path.dirname(__file__), 'test_data') + + with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt")) as f: + text = f.read() + + # Generate the corpus. + sentences = text.split('\n') + tokens = [sentence.split() for sentence in sentences] + dictionary = Dictionary(tokens) + corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] + + self.assertTrue(summarize_corpus(corpus) is None) + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 990f70c72d9f3d8bbc6bce63da954d669de62707 Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Wed, 28 Sep 2016 09:32:16 +0530 Subject: [PATCH 4/6] Text split method changed to allow running in Python 3.3 and above. --- gensim/test/test_summarization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 0535e09e32..0af739be03 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -163,11 +163,11 @@ def test_keywords_runs(self): def test_low_distinct_words_corpus_summarization_is_none(self): pre_path = os.path.join(os.path.dirname(__file__), 'test_data') - with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt")) as f: + with utils.smart_open(os.path.join(pre_path, "testlowdistinctwords.txt"), mode="r") as f: text = f.read() # Generate the corpus. - sentences = text.split('\n') + sentences = text.split(b'\n') tokens = [sentence.split() for sentence in sentences] dictionary = Dictionary(tokens) corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] From 343b9a0f8712c889b45953182bf8b859b3d9ad23 Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Wed, 28 Sep 2016 11:06:46 +0530 Subject: [PATCH 5/6] Change to fix test in python versions 3.3 and higher --- gensim/test/test_summarization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py index 0af739be03..70d2dcada4 100644 --- a/gensim/test/test_summarization.py +++ b/gensim/test/test_summarization.py @@ -167,7 +167,7 @@ def test_low_distinct_words_corpus_summarization_is_none(self): text = f.read() # Generate the corpus. - sentences = text.split(b'\n') + sentences = text.split("\n") tokens = [sentence.split() for sentence in sentences] dictionary = Dictionary(tokens) corpus = [dictionary.doc2bow(sentence_tokens) for sentence_tokens in tokens] From df238ef1bc71568819ba92502f0e9df46b933698 Mon Sep 17 00:00:00 2001 From: anmol01gulati Date: Thu, 29 Sep 2016 18:56:14 +0530 Subject: [PATCH 6/6] Added blank line test_wikicorpus.py file Added blank line to fix issue with travis CI --- gensim/test/test_wikicorpus.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_wikicorpus.py b/gensim/test/test_wikicorpus.py index 77c4212831..39e205002b 100644 --- a/gensim/test/test_wikicorpus.py +++ b/gensim/test/test_wikicorpus.py @@ -27,10 +27,11 @@ class TestWikiCorpus(unittest.TestCase): def setUp(self): wc = WikiCorpus(datapath(FILENAME)) - def test_get_texts_returns_generator_of_lists(self): + if sys.version_info < (2, 7, 0): return + wc = WikiCorpus(datapath(FILENAME)) l = wc.get_texts() self.assertEqual(type(l), types.GeneratorType)