Skip to content

Commit

Permalink
Add and unit-test gensim.models.bm25model.BM25PlusModel
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed Mar 31, 2022
1 parent 869f07b commit be7a0e6
Show file tree
Hide file tree
Showing 3 changed files with 112 additions and 2 deletions.
2 changes: 1 addition & 1 deletion gensim/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .ldamodel import LdaModel # noqa:F401
from .lsimodel import LsiModel # noqa:F401
from .tfidfmodel import TfidfModel # noqa:F401
from .bm25model import OkapiBM25Model # noqa:F401
from .bm25model import OkapiBM25Model, BM25PlusModel # noqa:F401
from .rpmodel import RpModel # noqa:F401
from .logentropy_model import LogEntropyModel # noqa:F401
from .word2vec import Word2Vec, FAST_VERSION # noqa:F401
Expand Down
38 changes: 38 additions & 0 deletions gensim/models/bm25model.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,41 @@ def __getitem__(self, bow):
in zip(term_ids, term_weights)
]
return vector


class BM25PlusModel(BM25ABC):
    """BM25+ weighting of bag-of-words vectors.

    Every term of a document is assigned the weight::

        idf * (delta + tf * (k1 + 1) / (tf + k1 * (1 - b + b * dl / avgdl)))

    where ``tf`` is the frequency of the term in the document, ``dl`` is the
    sum of all term frequencies in the document, ``avgdl`` is ``self.avgdl``
    and ``idf = log(num_docs + 1) - log(df)``.

    NOTE(review): ``self.idfs``, ``self.avgdl`` and ``self._apply`` are not
    defined in this class; presumably they come from ``BM25ABC`` (with
    ``self.idfs`` built through :meth:`precompute_idfs`) -- confirm against
    the base class.

    """

    def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75, delta=1.0):
        """

        Parameters
        ----------
        corpus : optional
            Forwarded unchanged to ``BM25ABC.__init__``.
        dictionary : optional
            Forwarded unchanged to ``BM25ABC.__init__``.
        k1 : float, optional
            Multiplier of the length-normalization term in the denominator of
            the weighting formula; ``k1 + 1`` scales the numerator.
        b : float, optional
            Interpolates between no length normalization (``b = 0``) and
            normalization by ``dl / avgdl`` (``b = 1``).
        delta : float, optional
            Constant added to the normalized term frequency, so every term
            present in a document receives at least ``delta * idf``.

        """
        # The hyper-parameters are stored before delegating to the base
        # constructor (original ordering kept; presumably the base class may
        # already use the model while initializing -- TODO confirm).
        self.k1, self.b, self.delta = k1, b, delta
        super().__init__(corpus, dictionary)

    def precompute_idfs(self, dfs, num_docs):
        """Compute the inverse document frequency of every term.

        Parameters
        ----------
        dfs : dict
            Mapping from term id to the term's document frequency.
        num_docs : int
            Number of documents the frequencies in `dfs` were counted over.

        Returns
        -------
        dict
            Mapping from term id to ``log(num_docs + 1) - log(df)``.

        Raises
        ------
        ValueError
            Raised by :func:`math.log` if a document frequency is not positive.

        """
        log_num_docs = math.log(num_docs + 1)  # loop-invariant
        return {
            term_id: log_num_docs - math.log(freq)
            for term_id, freq in dfs.items()
        }

    def __getitem__(self, bow):
        """Compute the BM25+ weights of the terms of a document.

        Parameters
        ----------
        bow : iterable of (int, number), or a corpus
            A single document as ``(term_id, term_frequency)`` pairs, or
            whatever ``utils.is_corpus`` recognizes as a corpus.

        Returns
        -------
        list of (int, float)
            One ``(term_id, weight)`` pair per input pair, in input order.
            Terms that are missing from ``self.idfs`` get the weight 0.0.
            For a corpus, the result of ``self._apply(bow)`` is returned
            instead.

        """
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # Traverse the document exactly once: a one-shot iterable would be
        # exhausted by a separate pass that only counts the tokens, which
        # would silently produce an empty vector.
        term_ids, term_frequencies, idfs = [], [], []
        for term_id, term_frequency in bow:
            term_ids.append(term_id)
            term_frequencies.append(term_frequency)
            idfs.append(self.idfs.get(term_id, 0.0))
        num_tokens = sum(term_frequencies)
        term_frequencies, idfs = np.array(term_frequencies), np.array(idfs)

        # Vectorized BM25+ formula; see the class docstring.
        length_norm = self.k1 * (1 - self.b + self.b * num_tokens / self.avgdl)
        term_weights = idfs * (
            self.delta + term_frequencies * (self.k1 + 1) / (term_frequencies + length_norm)
        )

        # Convert NumPy scalars to plain Python floats for the sparse vector.
        return [
            (term_id, float(weight))
            for term_id, weight
            in zip(term_ids, term_weights)
        ]
74 changes: 73 additions & 1 deletion gensim/test/test_bm25model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import unittest

from gensim.models.bm25model import BM25ABC
from gensim.models import OkapiBM25Model
from gensim.models import OkapiBM25Model, BM25PlusModel

from gensim.corpora import Dictionary

Expand Down Expand Up @@ -116,3 +116,75 @@ def get_expected_weight(word):
self.assertAlmostEqual(expected_cat_weight, actual_cat_weight)
self.assertAlmostEqual(expected_mouse_weight, actual_mouse_weight)
self.assertAlmostEqual(expected_lion_weight, actual_lion_weight)


class BM25PlusModelTest(unittest.TestCase):
    """Checks BM25PlusModel IDFs and term weights against hand-computed values
    on a three-document toy corpus."""

    def setUp(self):
        """Build the toy corpus, its dictionary and the reference IDFs."""
        self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
        self.dictionary = Dictionary(self.documents)
        self.k1 = 1.5
        self.b = 0.75
        self.delta = 1.0

        def reference_idf(term):
            # Document frequency: number of documents that contain the term.
            doc_freq = sum(1 for doc in self.documents if term in doc)
            return math.log(len(self.documents) + 1) - math.log(doc_freq)

        self.expected_dog_idf = reference_idf('dog')
        self.expected_cat_idf = reference_idf('cat')
        self.expected_mouse_idf = reference_idf('mouse')
        self.expected_lion_idf = reference_idf('lion')

    def _assert_idfs_match(self, model):
        """Compare the IDFs stored in `model` with the reference IDFs."""
        token2id = self.dictionary.token2id
        # All actual values are looked up before the first assertion runs.
        pairs = [
            (self.expected_dog_idf, model.idfs[token2id['dog']]),
            (self.expected_cat_idf, model.idfs[token2id['cat']]),
            (self.expected_mouse_idf, model.idfs[token2id['mouse']]),
            (self.expected_lion_idf, model.idfs[token2id['lion']]),
        ]
        for expected_idf, actual_idf in pairs:
            self.assertAlmostEqual(expected_idf, actual_idf)

    def test_idfs_from_corpus(self):
        """IDFs of a model built from a bag-of-words corpus."""
        corpus = [self.dictionary.doc2bow(document) for document in self.documents]
        model = BM25PlusModel(corpus=corpus, k1=self.k1, b=self.b, delta=self.delta)
        self._assert_idfs_match(model)

    def test_idfs_from_dictionary(self):
        """IDFs of a model built from a dictionary only."""
        model = BM25PlusModel(dictionary=self.dictionary, k1=self.k1, b=self.b, delta=self.delta)
        self._assert_idfs_match(model)

    def test_score(self):
        """Term weights of the first document; absent terms must weigh 0.0."""
        model = BM25PlusModel(dictionary=self.dictionary, k1=self.k1, b=self.b, delta=self.delta)

        first_document = self.documents[0]
        first_bow = self.dictionary.doc2bow(first_document)
        # Terms missing from the returned vector default to 0.0.
        weights = defaultdict(float, model[first_bow])

        words = ('dog', 'cat', 'mouse', 'lion')
        token2id = self.dictionary.token2id
        actual_weights = [weights[token2id[word]] for word in words]

        def reference_weight(word):
            # Every word occurs exactly once in the first document (tf == 1).
            idf = model.idfs[token2id[word]]
            numerator = self.k1 + 1
            denominator = 1 + self.k1 * (1 - self.b + self.b * len(first_document) / model.avgdl)
            return idf * (numerator / denominator + self.delta)

        expected_weights = [
            reference_weight(word) if word in first_document else 0.0
            for word in words
        ]

        for expected_weight, actual_weight in zip(expected_weights, actual_weights):
            self.assertAlmostEqual(expected_weight, actual_weight)

0 comments on commit be7a0e6

Please sign in to comment.