From 3abcb9f94f9bae01aed265395f21fe40d7341c64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Wed, 8 Jan 2020 04:51:13 +0100 Subject: [PATCH] Refactor bm25 to include model parametrization (cont.) (#2722) * Refactor bm25 to include model parametrization * Refactor constants back and fix typo * Refactor parameters order and description * Add BM25 tests This closes #2597 and closes #2606 * Simplify asserts in BM25 tests * Refactor BM25.get_score Co-authored-by: Marcelo d'Almeida --- gensim/summarization/bm25.py | 80 +++++++++++++++++++++++++++----- gensim/test/test_BM25.py | 90 +++++++++++++++++++++++++++++++++++- 2 files changed, 158 insertions(+), 12 deletions(-) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index 896d45cc94..beabbf728c 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -69,14 +69,36 @@ class BM25(object): List of document lengths. """ - def __init__(self, corpus): + def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """ Parameters ---------- corpus : list of list of str Given corpus. + k1 : float + Constant used for influencing the term frequency saturation. After saturation is reached, additional + occurrences of the term add significantly less to the score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. + b : float + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. + epsilon : float + Constant used as floor value for idf of a document in the corpus. 
When epsilon is positive, it restricts + negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. """ + + self.k1 = k1 + self.b = b + self.epsilon = epsilon + self.corpus_size = 0 self.avgdl = 0 self.doc_freqs = [] @@ -126,7 +148,7 @@ def _initialize(self, corpus): ' unintuitive results.'.format(self.corpus_size) ) - eps = EPSILON * self.average_idf + eps = self.epsilon * self.average_idf for word in negative_idfs: self.idf[word] = eps @@ -146,13 +168,15 @@ def get_score(self, document, index): BM25 score. """ - score = 0 + score = 0.0 doc_freqs = self.doc_freqs[index] + numerator_constant = self.k1 + 1 + denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl) for word in document: - if word not in doc_freqs: - continue - score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1) - / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl))) + if word in doc_freqs: + df = self.doc_freqs[index][word] + idf = self.idf[word] + score += (idf * df * numerator_constant) / (df + denominator_constant) return score def get_scores(self, document): @@ -236,7 +260,7 @@ def _get_scores(bm25, document): return bm25.get_scores(document) -def iter_bm25_bow(corpus, n_jobs=1): +def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """Yield BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. @@ -246,6 +270,23 @@ def iter_bm25_bow(corpus, n_jobs=1): Corpus of documents. 
n_jobs : int The number of processes to use for computing bm25. + k1 : float + Constant used for influencing the term frequency saturation. After saturation is reached, additional + occurrences of the term add significantly less to the score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. + b : float + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. + epsilon : float + Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts + negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. Yields ------- @@ -265,7 +306,7 @@ def iter_bm25_bow(corpus, n_jobs=1): >>> result = iter_bm25_weights(corpus, n_jobs=-1) """ - bm25 = BM25(corpus) + bm25 = BM25(corpus, k1, b, epsilon) n_processes = effective_n_jobs(n_jobs) if n_processes == 1: @@ -282,7 +323,7 @@ def iter_bm25_bow(corpus, n_jobs=1): pool.join() -def get_bm25_weights(corpus, n_jobs=1): +def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON): """Returns BM25 scores (weights) of documents in corpus. Each document has to be weighted with every document in given corpus. 
@@ -292,6 +333,23 @@ def get_bm25_weights(corpus, n_jobs=1): Corpus of documents. n_jobs : int The number of processes to use for computing bm25. + k1 : float + Constant used for influencing the term frequency saturation. After saturation is reached, additional + occurrences of the term add significantly less to the score. According to [1]_, experiments suggest + that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as + the type of documents or queries. + b : float + Constant used for influencing the effects of different document lengths relative to average document length. + When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to + [1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value + depends on factors such as the type of documents or queries. + epsilon : float + Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts + negative idf values. Negative idf implies that adding a very common term to a document penalizes the overall + score (with 'very common' meaning that it is present in more than half of the documents). That can be + undesirable as it means that an identical document would score less than an almost identical one (by + removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among + different documents) to receive an extra score. 
Returns ------- @@ -311,7 +369,7 @@ def get_bm25_weights(corpus, n_jobs=1): >>> result = get_bm25_weights(corpus, n_jobs=-1) """ - bm25 = BM25(corpus) + bm25 = BM25(corpus, k1, b, epsilon) n_processes = effective_n_jobs(n_jobs) if n_processes == 1: diff --git a/gensim/test/test_BM25.py b/gensim/test/test_BM25.py index d575ae9433..eb63ddc328 100644 --- a/gensim/test/test_BM25.py +++ b/gensim/test/test_BM25.py @@ -11,7 +11,7 @@ import logging import unittest -from gensim.summarization.bm25 import get_bm25_weights +from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25 from gensim.test.utils import common_texts @@ -62,6 +62,94 @@ def test_multiprocessing(self): self.assertAlmostEqual(weights1, weights3) self.assertAlmostEqual(weights2, weights3) + def test_k1(self): + """ Changing the k1 parameter should give consistent results """ + corpus = common_texts + index = 0 + doc = corpus[index] + first_k1 = 1.0 + second_k1 = 2.0 + + first_bm25 = BM25(corpus, k1=first_k1) + second_bm25 = BM25(corpus, k1=second_k1) + first_score = first_bm25.get_score(doc, index) + second_score = second_bm25.get_score(doc, index) + self.assertLess(first_score, second_score) + + first_iter = iter_bm25_bow(corpus, k1=first_k1) + second_iter = iter_bm25_bow(corpus, k1=second_k1) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertLess(first_score, second_score) + + first_weights = get_bm25_weights(corpus, k1=first_k1) + second_weights = get_bm25_weights(corpus, k1=second_k1) + first_score = first_weights[index] + second_score = second_weights[index] + self.assertLess(first_score, second_score) + + def test_b(self): + """ Changing the b parameter should give consistent results """ + corpus = common_texts + index = 0 + doc = corpus[index] + first_b = 1.0 + second_b = 2.0 + + first_bm25 = BM25(corpus, b=first_b) + second_bm25 = BM25(corpus, b=second_b) + first_score = first_bm25.get_score(doc, index) + 
second_score = second_bm25.get_score(doc, index) + self.assertLess(first_score, second_score) + + first_iter = iter_bm25_bow(corpus, b=first_b) + second_iter = iter_bm25_bow(corpus, b=second_b) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertLess(first_score, second_score) + + first_weights = get_bm25_weights(corpus, b=first_b) + second_weights = get_bm25_weights(corpus, b=second_b) + first_score = first_weights[index] + second_score = second_weights[index] + self.assertLess(first_score, second_score) + + def test_epsilon(self): + """ Changing the epsilon parameter should give consistent results """ + corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']] + first_epsilon = 1.0 + second_epsilon = 2.0 + bm25 = BM25(corpus) + words_with_negative_idfs = set([ + word + for word, idf in bm25.idf.items() + if idf < 0 + ]) + index, doc = [ + (index, document) + for index, document + in enumerate(corpus) + if words_with_negative_idfs & set(document) + ][0] + + first_bm25 = BM25(corpus, epsilon=first_epsilon) + second_bm25 = BM25(corpus, epsilon=second_epsilon) + first_score = first_bm25.get_score(doc, index) + second_score = second_bm25.get_score(doc, index) + self.assertGreater(first_score, second_score) + + first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon) + second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon) + first_score = dict(next(iter(first_iter)))[index] + second_score = dict(next(iter(second_iter)))[index] + self.assertGreater(first_score, second_score) + + first_weights = get_bm25_weights(corpus, epsilon=first_epsilon) + second_weights = get_bm25_weights(corpus, epsilon=second_epsilon) + first_score = first_weights[index] + second_score = second_weights[index] + self.assertGreater(first_score, second_score) + if __name__ == '__main__': logging.basicConfig(level=logging.DEBUG)