Skip to content

Commit

Permalink
Add and unit-test gensim.models.bm25model.BM25LModel
Browse files Browse the repository at this point in the history
  • Loading branch information
Witiko committed Mar 15, 2022
1 parent 9ab6f52 commit f43806d
Show file tree
Hide file tree
Showing 3 changed files with 120 additions and 2 deletions.
2 changes: 1 addition & 1 deletion gensim/models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from .ldamodel import LdaModel # noqa:F401
from .lsimodel import LsiModel # noqa:F401
from .tfidfmodel import TfidfModel # noqa:F401
from .bm25model import OkapiBM25Model # noqa:F401
from .bm25model import OkapiBM25Model, BM25LModel # noqa:F401
from .rpmodel import RpModel # noqa:F401
from .logentropy_model import LogEntropyModel # noqa:F401
from .word2vec import Word2Vec, FAST_VERSION # noqa:F401
Expand Down
37 changes: 37 additions & 0 deletions gensim/models/bm25model.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,3 +99,40 @@ def __getitem__(self, bow):
in zip(term_ids, term_weights)
]
return vector


class BM25LModel(BM25ABC):
    """Term-weighting model that turns bag-of-words vectors into BM25L-style weights.

    The per-term weight computed by :meth:`__getitem__` is::

        idf * (k1 + 1) * (ctd + delta) / (k1 + ctd + delta)

    where ``ctd = tf / (1 - b + b * doc_length / avgdl)``.

    ``self.idfs``, ``self.avgdl`` and ``self._apply`` are not defined in this
    class; presumably they are provided by :class:`BM25ABC` -- verify against
    the base class.
    """

    def __init__(self, corpus=None, dictionary=None, k1=1.5, b=0.75, epsilon=0.25, delta=0.5):
        """Store the model parameters and delegate initialization to the base class.

        Parameters
        ----------
        corpus : optional
            Forwarded unchanged to ``BM25ABC.__init__``.
        dictionary : optional
            Forwarded unchanged to ``BM25ABC.__init__``.
        k1 : float
            Saturation parameter used in the term-weight formula.
        b : float
            Weight of the document-length normalization in ``ctd``.
        epsilon : float
            Stored on the instance; not read by any method defined in this class.
        delta : float
            Constant added to the length-normalized term frequency.
        """
        # The parameters are assigned before the base initializer runs,
        # exactly as in the original statement order.
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.delta = delta
        super().__init__(corpus, dictionary)

    def precompute_idfs(self, dfs, num_docs):
        """Return a dict mapping each term id to ``log(num_docs + 1) - log(df + 0.5)``.

        Parameters
        ----------
        dfs : dict
            Mapping of term id to document frequency.
        num_docs : int
            Number of documents used in the first logarithm.
        """
        log_total = math.log(num_docs + 1)
        return {
            term_id: log_total - math.log(doc_freq + 0.5)
            for term_id, doc_freq in dfs.items()
        }

    def __getitem__(self, bow):
        """Weight a single bag-of-words document, or lazily transform a whole corpus.

        Parameters
        ----------
        bow : iterable of (term id, term frequency) pairs, or a corpus
            When ``utils.is_corpus`` reports a corpus, the result of
            ``self._apply`` is returned instead of a weighted vector.

        Returns
        -------
        list of (term id, float)
            One entry per entry of `bow`, in the same order.
        """
        is_corpus, bow = utils.is_corpus(bow)
        if is_corpus:
            return self._apply(bow)

        # Document length is the total number of tokens in the document.
        doc_length = sum(count for _, count in bow)

        entries = list(bow)
        ids = [token_id for token_id, _ in entries]
        counts = np.array([count for _, count in entries])
        # Terms without a stored IDF (or with a falsy one) contribute a zero weight.
        idf_values = np.array([self.idfs.get(token_id) or 0.0 for token_id in ids])

        length_norm = 1 - self.b + self.b * doc_length / self.avgdl
        ctd = counts / length_norm
        shifted = ctd + self.delta
        weights = idf_values * (self.k1 + 1) * shifted / (self.k1 + ctd + self.delta)

        return [(token_id, float(weight)) for token_id, weight in zip(ids, weights)]
83 changes: 82 additions & 1 deletion gensim/test/test_bm25model.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import unittest

from gensim.models.bm25model import BM25ABC
from gensim.models import OkapiBM25Model
from gensim.models import OkapiBM25Model, BM25LModel

from gensim.corpora import Dictionary

Expand Down Expand Up @@ -116,3 +116,84 @@ def get_expected_weight(word):
self.assertAlmostEqual(expected_cat_weight, actual_cat_weight)
self.assertAlmostEqual(expected_mouse_weight, actual_mouse_weight)
self.assertAlmostEqual(expected_lion_weight, actual_lion_weight)


class BM25LModelTest(unittest.TestCase):
    """Unit tests for the IDFs and term weights produced by ``BM25LModel``."""

    # Words whose IDFs and weights are checked, in assertion order.
    _WORDS = ('dog', 'cat', 'mouse', 'lion')

    def setUp(self):
        """Build a three-document fixture and the IDF expected for each word."""
        self.documents = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
        self.dictionary = Dictionary(self.documents)
        self.k1, self.b, self.epsilon, self.delta = 1.5, 0.75, 0.25, 0.5

        num_documents = len(self.documents)

        def expected_idf(word):
            # Document frequency: number of fixture documents containing the word.
            document_frequency = sum(1 for document in self.documents if word in document)
            return math.log(num_documents + 1) - math.log(document_frequency + 0.5)

        self.expected_dog_idf = expected_idf('dog')
        self.expected_cat_idf = expected_idf('cat')
        self.expected_mouse_idf = expected_idf('mouse')
        self.expected_lion_idf = expected_idf('lion')

    def _make_model(self, **source):
        """Construct a ``BM25LModel`` with the fixture parameters and the given data source."""
        return BM25LModel(
            k1=self.k1, b=self.b, epsilon=self.epsilon, delta=self.delta, **source
        )

    def _assert_expected_idfs(self, model):
        """Check the model's IDF for every fixture word against the expected value."""
        expected = {
            'dog': self.expected_dog_idf,
            'cat': self.expected_cat_idf,
            'mouse': self.expected_mouse_idf,
            'lion': self.expected_lion_idf,
        }
        # All lookups happen before the first assertion, so a missing term id
        # surfaces as an error rather than after a partial run of assertions.
        actual = {word: model.idfs[self.dictionary.token2id[word]] for word in self._WORDS}
        for word in self._WORDS:
            self.assertAlmostEqual(expected[word], actual[word])

    def test_idfs_from_corpus(self):
        """IDFs computed from a bag-of-words corpus match the expected values."""
        corpus = [self.dictionary.doc2bow(document) for document in self.documents]
        self._assert_expected_idfs(self._make_model(corpus=corpus))

    def test_idfs_from_dictionary(self):
        """IDFs computed from a dictionary match the expected values."""
        self._assert_expected_idfs(self._make_model(dictionary=self.dictionary))

    def test_score(self):
        """Weights of the first document follow the BM25L formula; absent words weigh 0."""
        model = self._make_model(dictionary=self.dictionary)

        first_document = self.documents[0]
        first_bow = self.dictionary.doc2bow(first_document)
        weights = defaultdict(lambda: 0.0)
        weights.update(model[first_bow])

        actual = {word: weights[self.dictionary.token2id[word]] for word in self._WORDS}

        def expected_weight(word):
            idf = model.idfs[self.dictionary.token2id[word]]
            # Every word occurs exactly once in the first document, hence tf = 1.0.
            length_norm = 1 - self.b + self.b * (len(first_document) / model.avgdl)
            ctd = 1.0 / length_norm
            numerator = (self.k1 + 1) * (ctd + self.delta)
            denominator = self.k1 + ctd + self.delta
            return idf * numerator / denominator

        expected = {
            word: expected_weight(word) if word in first_document else 0.0
            for word in self._WORDS
        }

        for word in self._WORDS:
            self.assertAlmostEqual(expected[word], actual[word])

0 comments on commit f43806d

Please sign in to comment.