Skip to content

Commit

Permalink
Refactor bm25 to include model parametrization (cont.) (#2722)
Browse files Browse the repository at this point in the history
* Refactor bm25 to include model parametrization

* Refactor constants back and fix typo

* Refactor parameters order and description

* Add BM25 tests
This closes #2597 and closes #2606

* Simplify asserts in BM25 tests

* Refactor BM25.get_score

Co-authored-by: Marcelo d'Almeida <md@id.uff.br>
  • Loading branch information
2 people authored and mpenkov committed Jan 8, 2020
1 parent 3d129de commit 3abcb9f
Show file tree
Hide file tree
Showing 2 changed files with 158 additions and 12 deletions.
80 changes: 69 additions & 11 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,36 @@ class BM25(object):
List of document lengths.
"""

def __init__(self, corpus):
def __init__(self, corpus, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""
Parameters
----------
corpus : list of list of str
Given corpus.
k1 : float
Constant used for influencing the term frequency saturation. After saturation is reached, additional
presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effects of different document lengths relative to average document length.
When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
negative idf values. Negative idf implies that adding a very common term to a document penalize the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable as it means that an identical document would score less than an almost identical one (by
removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
different documents) to receive an extra score.
"""

self.k1 = k1
self.b = b
self.epsilon = epsilon

self.corpus_size = 0
self.avgdl = 0
self.doc_freqs = []
Expand Down Expand Up @@ -126,7 +148,7 @@ def _initialize(self, corpus):
' unintuitive results.'.format(self.corpus_size)
)

eps = EPSILON * self.average_idf
eps = self.epsilon * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

Expand All @@ -146,13 +168,15 @@ def get_score(self, document, index):
BM25 score.
"""
score = 0
score = 0.0
doc_freqs = self.doc_freqs[index]
numerator_constant = self.k1 + 1
denominator_constant = self.k1 * (1 - self.b + self.b * self.doc_len[index] / self.avgdl)
for word in document:
if word not in doc_freqs:
continue
score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
if word in doc_freqs:
df = self.doc_freqs[index][word]
idf = self.idf[word]
score += (idf * df * numerator_constant) / (df + denominator_constant)
return score

def get_scores(self, document):
Expand Down Expand Up @@ -236,7 +260,7 @@ def _get_scores(bm25, document):
return bm25.get_scores(document)


def iter_bm25_bow(corpus, n_jobs=1):
def iter_bm25_bow(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""Yield BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Expand All @@ -246,6 +270,23 @@ def iter_bm25_bow(corpus, n_jobs=1):
Corpus of documents.
n_jobs : int
The number of processes to use for computing bm25.
k1 : float
Constant used for influencing the term frequency saturation. After saturation is reached, additional
presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effects of different document lengths relative to average document length.
When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
negative idf values. Negative idf implies that adding a very common term to a document penalize the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable as it means that an identical document would score less than an almost identical one (by
removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
different documents) to receive an extra score.
Yields
-------
Expand All @@ -265,7 +306,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
>>> result = iter_bm25_weights(corpus, n_jobs=-1)
"""
bm25 = BM25(corpus)
bm25 = BM25(corpus, k1, b, epsilon)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
Expand All @@ -282,7 +323,7 @@ def iter_bm25_bow(corpus, n_jobs=1):
pool.join()


def get_bm25_weights(corpus, n_jobs=1):
def get_bm25_weights(corpus, n_jobs=1, k1=PARAM_K1, b=PARAM_B, epsilon=EPSILON):
"""Returns BM25 scores (weights) of documents in corpus.
Each document has to be weighted with every document in given corpus.
Expand All @@ -292,6 +333,23 @@ def get_bm25_weights(corpus, n_jobs=1):
Corpus of documents.
n_jobs : int
The number of processes to use for computing bm25.
k1 : float
Constant used for influencing the term frequency saturation. After saturation is reached, additional
presence for the term adds a significantly less additional score. According to [1]_, experiments suggest
that 1.2 < k1 < 2 yields reasonably good results, although the optimal value depends on factors such as
the type of documents or queries.
b : float
Constant used for influencing the effects of different document lengths relative to average document length.
When b is bigger, lengthier documents (compared to average) have more impact on its effect. According to
[1]_, experiments suggest that 0.5 < b < 0.8 yields reasonably good results, although the optimal value
depends on factors such as the type of documents or queries.
epsilon : float
Constant used as floor value for idf of a document in the corpus. When epsilon is positive, it restricts
negative idf values. Negative idf implies that adding a very common term to a document penalize the overall
score (with 'very common' meaning that it is present in more than half of the documents). That can be
undesirable as it means that an identical document would score less than an almost identical one (by
removing the referred term). Increasing epsilon above 0 raises the sense of how rare a word has to be (among
different documents) to receive an extra score.
Returns
-------
Expand All @@ -311,7 +369,7 @@ def get_bm25_weights(corpus, n_jobs=1):
>>> result = get_bm25_weights(corpus, n_jobs=-1)
"""
bm25 = BM25(corpus)
bm25 = BM25(corpus, k1, b, epsilon)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
Expand Down
90 changes: 89 additions & 1 deletion gensim/test/test_BM25.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import logging
import unittest

from gensim.summarization.bm25 import get_bm25_weights
from gensim.summarization.bm25 import get_bm25_weights, iter_bm25_bow, BM25
from gensim.test.utils import common_texts


Expand Down Expand Up @@ -62,6 +62,94 @@ def test_multiprocessing(self):
self.assertAlmostEqual(weights1, weights3)
self.assertAlmostEqual(weights2, weights3)

def test_k1(self):
    """Increasing k1 should consistently increase a document's score for itself
    across BM25.get_score, iter_bm25_bow and get_bm25_weights."""
    corpus = common_texts
    index = 0
    doc = corpus[index]
    small_k1, large_k1 = 1.0, 2.0

    # Direct scoring through the BM25 class.
    score_small = BM25(corpus, k1=small_k1).get_score(doc, index)
    score_large = BM25(corpus, k1=large_k1).get_score(doc, index)
    self.assertLess(score_small, score_large)

    # Same comparison through the streaming bag-of-words interface.
    bow_small = dict(next(iter(iter_bm25_bow(corpus, k1=small_k1))))
    bow_large = dict(next(iter(iter_bm25_bow(corpus, k1=large_k1))))
    self.assertLess(bow_small[index], bow_large[index])

    # Same comparison through the materialized weights interface.
    weights_small = get_bm25_weights(corpus, k1=small_k1)
    weights_large = get_bm25_weights(corpus, k1=large_k1)
    self.assertLess(weights_small[index], weights_large[index])

def test_b(self):
    """Increasing b should consistently increase a document's score for itself
    across BM25.get_score, iter_bm25_bow and get_bm25_weights."""
    corpus = common_texts
    index = 0
    doc = corpus[index]
    small_b, large_b = 1.0, 2.0

    # Direct scoring through the BM25 class.
    score_small = BM25(corpus, b=small_b).get_score(doc, index)
    score_large = BM25(corpus, b=large_b).get_score(doc, index)
    self.assertLess(score_small, score_large)

    # Same comparison through the streaming bag-of-words interface.
    bow_small = dict(next(iter(iter_bm25_bow(corpus, b=small_b))))
    bow_large = dict(next(iter(iter_bm25_bow(corpus, b=large_b))))
    self.assertLess(bow_small[index], bow_large[index])

    # Same comparison through the materialized weights interface.
    weights_small = get_bm25_weights(corpus, b=small_b)
    weights_large = get_bm25_weights(corpus, b=large_b)
    self.assertLess(weights_small[index], weights_large[index])

def test_epsilon(self):
    """ Changing the epsilon parameter should give consistent results """
    # Small corpus deliberately built so some words ('cat', 'lion') appear in
    # more than half of the documents and therefore get a negative idf.
    corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
    first_epsilon = 1.0
    second_epsilon = 2.0
    bm25 = BM25(corpus)
    # Collect the words whose idf went negative under the default parameters.
    words_with_negative_idfs = set([
        word
        for word, idf in bm25.idf.items()
        if idf < 0
    ])
    # Pick the first document that contains at least one such word, so the
    # epsilon-derived idf floor actually affects its score.
    index, doc = [
        (index, document)
        for index, document
        in enumerate(corpus)
        if words_with_negative_idfs & set(document)
    ][0]

    first_bm25 = BM25(corpus, epsilon=first_epsilon)
    second_bm25 = BM25(corpus, epsilon=second_epsilon)
    first_score = first_bm25.get_score(doc, index)
    second_score = second_bm25.get_score(doc, index)
    # NOTE(review): for this corpus a larger epsilon yields a LOWER score
    # (the assertions below expect the epsilon=1.0 score to be greater),
    # presumably because the floor is epsilon * average_idf and the average
    # idf here is negative — confirm against BM25._initialize.
    self.assertGreater(first_score, second_score)

    # The same ordering must hold through the streaming interface ...
    first_iter = iter_bm25_bow(corpus, epsilon=first_epsilon)
    second_iter = iter_bm25_bow(corpus, epsilon=second_epsilon)
    first_score = dict(next(iter(first_iter)))[index]
    second_score = dict(next(iter(second_iter)))[index]
    self.assertGreater(first_score, second_score)

    # ... and through the materialized weights interface.
    first_weights = get_bm25_weights(corpus, epsilon=first_epsilon)
    second_weights = get_bm25_weights(corpus, epsilon=second_epsilon)
    first_score = first_weights[index]
    second_score = second_weights[index]
    self.assertGreater(first_score, second_score)


if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
Expand Down

0 comments on commit 3abcb9f

Please sign in to comment.