diff --git a/gensim/summarization/keywords.py b/gensim/summarization/keywords.py
index 5146c560ad..155c448a80 100644
--- a/gensim/summarization/keywords.py
+++ b/gensim/summarization/keywords.py
@@ -11,13 +11,16 @@
 from itertools import combinations as _combinations
 from six.moves.queue import Queue as _Queue
 from six.moves import xrange
+from six import iteritems
 
 WINDOW_SIZE = 2
 
-"""Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
+"""
+Check tags in http://www.clips.ua.ac.be/pages/mbsp-tags and use only first two letters
 Example: filter for nouns and adjectives:
-INCLUDING_FILTER = ['NN', 'JJ']"""
+INCLUDING_FILTER = ['NN', 'JJ']
+"""
 INCLUDING_FILTER = ['NN', 'JJ']
 EXCLUDING_FILTER = []
 
 
@@ -26,13 +29,17 @@ def _get_pos_filters():
     return frozenset(INCLUDING_FILTER), frozenset(EXCLUDING_FILTER)
 
 
-def _get_words_for_graph(tokens):
-    include_filters, exclude_filters = _get_pos_filters()
+def _get_words_for_graph(tokens, pos_filter):
+    if pos_filter is None:
+        include_filters, exclude_filters = _get_pos_filters()
+    else:
+        include_filters = set(pos_filter)
+        exclude_filters = frozenset([])
     if include_filters and exclude_filters:
         raise ValueError("Can't use both include and exclude filters, should use only one")
 
     result = []
-    for word, unit in tokens.iteritems():
+    for word, unit in iteritems(tokens):
         if exclude_filters and unit.tag in exclude_filters:
             continue
         if (include_filters and unit.tag in include_filters) or not include_filters or not unit.tag:
@@ -111,7 +118,7 @@ def _extract_tokens(lemmas, scores, ratio, words):
 
 def _lemmas_to_words(tokens):
     lemma_to_word = {}
-    for word, unit in tokens.iteritems():
+    for word, unit in iteritems(tokens):
         lemma = unit.token
         if lemma in lemma_to_word:
             lemma_to_word[lemma].append(word)
@@ -189,13 +196,13 @@
         return "\n".join(combined_keywords)
 
 
-def keywords(text, ratio=0.2, words=None, split=False, scores=False):
+def keywords(text, ratio=0.2, words=None, split=False, scores=False, pos_filter=['NN', 'JJ'], lemmatize=False):
     # Gets a dict of word -> lemma
     tokens = _clean_text_by_word(text)
     split_text = list(_tokenize_by_word(text))
 
     # Creates the graph and adds the edges
-    graph = _build_graph(_get_words_for_graph(tokens))
+    graph = _build_graph(_get_words_for_graph(tokens, pos_filter))
     _set_graph_edges(graph, tokens, split_text)
     del split_text # It's no longer used
 
@@ -206,7 +213,14 @@ def keywords(text, ratio=0.2, words=None, split=False, scores=False):
 
     extracted_lemmas = _extract_tokens(graph.nodes(), pagerank_scores, ratio, words)
 
-    lemmas_to_word = _lemmas_to_words(tokens)
+    # The results can be polluted by many variations of the same word
+    if lemmatize:
+        lemmas_to_word = {}
+        for word, unit in iteritems(tokens):
+            lemmas_to_word[unit.token] = [word]
+    else:
+        lemmas_to_word = _lemmas_to_words(tokens)
+
     keywords = _get_keywords_with_score(extracted_lemmas, lemmas_to_word)
 
     # text.split() to keep numbers and punctuation marks, so separeted concepts are not combined
diff --git a/gensim/summarization/pagerank_weighted.py b/gensim/summarization/pagerank_weighted.py
index e5a31357d0..1978c6e1c7 100644
--- a/gensim/summarization/pagerank_weighted.py
+++ b/gensim/summarization/pagerank_weighted.py
@@ -2,10 +2,9 @@
 # -*- coding: utf-8 -*-
 #
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
 from numpy import empty as empty_matrix
 from scipy.sparse import csr_matrix
-from scipy.linalg import eig
+from scipy.sparse.linalg import eigs
 from six.moves import xrange
 
 try:
@@ -21,8 +20,10 @@ def pagerank_weighted(graph, damping=0.85):
     probability_matrix = build_probability_matrix(graph)
 
     pagerank_matrix = damping * adjacency_matrix.todense() + (1 - damping) * probability_matrix
-    vals, vecs = eig(pagerank_matrix, left=True, right=False) # TODO optimize this.
-    return process_results(graph, vecs)
+
+    vals, vecs = eigs(pagerank_matrix.T, k=1) # TODO raise an error if matrix has complex eigenvectors?
+
+    return process_results(graph, vecs.real)
 
 
 def build_adjacency_matrix(graph):
@@ -37,7 +38,7 @@
         neighbors_sum = sum(graph.edge_weight((current_node, neighbor)) for neighbor in graph.neighbors(current_node))
         for j in xrange(length):
             edge_weight = float(graph.edge_weight((current_node, nodes[j])))
-            if i != j and edge_weight != 0:
+            if i != j and edge_weight != 0.0:
                 row.append(i)
                 col.append(j)
                 data.append(edge_weight / neighbors_sum)
@@ -49,7 +50,7 @@ def build_probability_matrix(graph):
     dimension = len(graph.nodes())
     matrix = empty_matrix((dimension, dimension))
 
-    probability = 1 / float(dimension)
+    probability = 1.0 / float(dimension)
     matrix.fill(probability)
 
     return matrix
@@ -58,6 +59,6 @@ def process_results(graph, vecs):
     scores = {}
     for i, node in enumerate(graph.nodes()):
-        scores[node] = abs(vecs[i][0]) # TODO: this is wasteful (only compute the principal component).
+        scores[node] = abs(vecs[i, :])
 
     return scores
 
diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py
index 9e5e343299..2d46716b4a 100644
--- a/gensim/summarization/summarizer.py
+++ b/gensim/summarization/summarizer.py
@@ -10,13 +10,14 @@
 from gensim.summarization.commons import remove_unreachable_nodes as _remove_unreachable_nodes
 from gensim.summarization.bm25 import get_bm25_weights as _bm25_weights
 from gensim.corpora import Dictionary
-from scipy.sparse import csr_matrix
 from math import log10 as _log10
 from six.moves import xrange
 
 
 INPUT_MIN_LENGTH = 10
 
+WEIGHT_THRESHOLD = 1.e-3
+
 logger = logging.getLogger(__name__)
 
 
@@ -26,7 +27,7 @@ def _set_graph_edge_weights(graph):
 
     for i in xrange(len(documents)):
         for j in xrange(len(documents)):
-            if i == j:
+            if i == j or weights[i][j] < WEIGHT_THRESHOLD:
                 continue
 
             sentence_1 = documents[i]
diff --git a/gensim/test/test_data/mihalcea_tarau.kw.txt b/gensim/test/test_data/mihalcea_tarau.kw.txt
new file mode 100644
index 0000000000..9ba4443c91
--- /dev/null
+++ b/gensim/test/test_data/mihalcea_tarau.kw.txt
@@ -0,0 +1,14 @@
+hurricane
+coast
+saturday
+storm
+flood
+flooding
+gilbert
+winds heavy
+strong
+defense
+puerto
+north
+weather
+southeast
\ No newline at end of file
diff --git a/gensim/test/test_data/mihalcea_tarau.kwpos.txt b/gensim/test/test_data/mihalcea_tarau.kwpos.txt
new file mode 100644
index 0000000000..b9597ee237
--- /dev/null
+++ b/gensim/test/test_data/mihalcea_tarau.kwpos.txt
@@ -0,0 +1,17 @@
+hurricane
+gilbert
+storm
+coast
+saturday
+winds heavy
+weather
+flood
+flooding
+strong
+defense
+people
+cabral said
+associated
+north
+residents
+southeast
\ No newline at end of file
diff --git a/gensim/test/test_keywords.py b/gensim/test/test_keywords.py
new file mode 100644
index 0000000000..2e7c3a48f5
--- /dev/null
+++ b/gensim/test/test_keywords.py
@@ -0,0 +1,93 @@
+#!/usr/bin/env python
+# encoding: utf-8
+#
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""
+Automated test to reproduce the results of Mihalcea and Tarau (2004).
+
+Mihalcea and Tarau (2004) introduces the TextRank summarization algorithm.
+As a validation of the gensim implementation we reproduced its results
+in this test.
+
+"""
+
+import os.path
+import logging
+import unittest
+
+from gensim import utils
+from gensim.corpora import Dictionary
+from gensim.summarization import keywords
+
+
+class TestKeywordsTest(unittest.TestCase):
+
+    def test_text_keywords(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate keywords
+        generated_keywords = keywords(text, split=True)
+
+        # To be compared to the reference.
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kw.txt"), mode="r") as f:
+            kw = f.read().strip().split("\n")
+
+        self.assertEqual(set(map(str, generated_keywords)), set(map(str, kw)))
+
+    def test_text_keywords_words(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate exactly 15 keywords
+        generated_keywords = keywords(text, words=15, split=True)
+
+        self.assertEqual(len(generated_keywords), 15)
+
+    def test_text_keywords_pos(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # calculate keywords using only certain parts of speech
+        generated_keywords_NNVBJJ = keywords(text, pos_filter=['NN', 'VB', 'JJ'], split=True)
+
+        # To be compared to the reference.
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.kwpos.txt"), mode="r") as f:
+            kw = f.read().strip().split("\n")
+
+        self.assertEqual(set(map(str, generated_keywords_NNVBJJ)), set(map(str, kw)))
+
+    def test_text_summarization_raises_exception_on_short_input_text(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "testsummarization_unrelated.txt"), mode="r") as f:
+            text = f.read()
+
+        # Keeps the first 8 sentences to make the text shorter.
+        text = "\n".join(text.split('\n')[:8])
+
+        self.assertTrue(keywords(text) is not None)
+
+    def test_keywords_ratio(self):
+        pre_path = os.path.join(os.path.dirname(__file__), 'test_data')
+
+        with utils.smart_open(os.path.join(pre_path, "mihalcea_tarau.txt"), mode="r") as f:
+            text = f.read()
+
+        # Check ratio parameter is well behaved. Because length is taken on tokenized clean text
+        # we just check that ratio 40% is twice as long as ratio 20%
+        selected_docs_20 = keywords(text, ratio=0.2, split=True)
+        selected_docs_40 = keywords(text, ratio=0.4, split=True)
+
+        self.assertAlmostEqual(float(len(selected_docs_40))/len(selected_docs_20), 1.9, places=1)
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()
diff --git a/gensim/test/test_summarization.py b/gensim/test/test_summarization.py
index 909a3c60ac..b739c49e57 100644
--- a/gensim/test/test_summarization.py
+++ b/gensim/test/test_summarization.py
@@ -13,6 +13,7 @@
 """
 
 import os.path
+import logging
 import unittest
 
 from gensim import utils
@@ -128,3 +129,7 @@ def test_corpus_summarization_ratio(self):
         expected_summary_length = int(len(corpus) * ratio)
 
         self.assertEqual(len(selected_docs), expected_summary_length)
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()
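Illustrative usage, not part of the patch itself: a minimal sketch of calling keywords() with the new pos_filter and lemmatize parameters introduced above, using the gensim.summarization import path exercised by the tests; the input file name is hypothetical.

    from gensim.summarization import keywords

    # Hypothetical plain-text input; any sufficiently long document works.
    with open("some_document.txt") as f:
        text = f.read()

    # Default behaviour: candidates limited to nouns and adjectives ('NN', 'JJ'),
    # returning roughly the top 20% of scored lemmas as a list of strings.
    print(keywords(text, split=True))

    # Widen the POS filter to nouns, verbs and adjectives, and collapse the
    # inflected variants of a word onto a single lemma before reporting.
    print(keywords(text, pos_filter=['NN', 'VB', 'JJ'], lemmatize=True, split=True))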