Skip to content

Commit

Permalink
Refactor BM25
Browse files Browse the repository at this point in the history
- remove unnecessary attributes
- move calculation of average_idf into _initialize
- more readable names
  • Loading branch information
horpto committed Nov 22, 2018
1 parent 7e4965e commit d2f2311
Showing 1 changed file with 42 additions and 46 deletions.
88 changes: 42 additions & 46 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,10 @@ class BM25(object):
Size of corpus (number of documents).
avgdl : float
Average length of document in `corpus`.
corpus : list of list of str
Corpus of documents.
f : list of dicts of int
doc_freqs : list of dicts of int
List of dictionaries with term frequencies for each document in `corpus`. Words used as keys and frequencies as values.
df : dict
Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
idf : dict
Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
Dictionary with inverse document frequencies for whole `corpus`. Words used as keys and their idf scores as values.
doc_len : list of int
List of document lengths.
"""
Expand All @@ -80,33 +76,47 @@ def __init__(self, corpus):
"""
self.corpus_size = len(corpus)
self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
self.corpus = corpus
self.f = []
self.df = {}
self.doc_freqs = []
self.idf = {}
self.doc_len = []
self.initialize()
self._initialize(corpus)

def initialize(self):
def _initialize(self, corpus):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
for document in self.corpus:
frequencies = {}

nd = {} # word -> number of documents with word
for document in corpus:
self.doc_len.append(len(document))
frequencies = {}
for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.f.append(frequencies)
self.doc_freqs.append(frequencies)

for word, freq in iteritems(frequencies):
if word not in self.df:
self.df[word] = 0
self.df[word] += 1

for word, freq in iteritems(self.df):
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

def get_score(self, document, index, average_idf):
if word not in nd:
nd[word] = 0
nd[word] += 1

# collect idf sum to calculate an average idf for epsilon value
idf_sum = 0
# collect words with negative idf so they can be assigned a special epsilon value.
# idf is negative when a word occurs in more than half of the documents
negative_idfs = []
for word, freq in iteritems(nd):
idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
self.idf[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs.append(word)
self.average_idf = idf_sum / len(self.idf)

eps = EPSILON * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

def get_score(self, document, index):
"""Computes the BM25 score of the given `document` in relation to the corpus item selected by `index`.
Parameters
Expand All @@ -115,8 +125,6 @@ def get_score(self, document, index, average_idf):
Document to be scored.
index : int
Index of document in corpus selected to score with `document`.
average_idf : float
Average idf in corpus.
Returns
-------
Expand All @@ -125,39 +133,34 @@ def get_score(self, document, index, average_idf):
"""
score = 0
doc_freqs = self.doc_freqs[index]
for word in document:
if word not in self.f[index]:
if word not in doc_freqs:
continue
idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
score += (idf * self.f[index][word] * (PARAM_K1 + 1)
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
return score

def get_scores(self, document, average_idf):
def get_scores(self, document):
"""Computes and returns BM25 scores of given `document` in relation to
every item in corpus.
Parameters
----------
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.
Returns
-------
list of float
BM25 scores.
"""
scores = []
for index in xrange(self.corpus_size):
score = self.get_score(document, index, average_idf)
scores.append(score)
scores = [self.get_score(document, index) for index in xrange(self.corpus_size)]
return scores


def _get_scores(bm25, document, average_idf):
def _get_scores(bm25, document):
"""Helper function for retrieving bm25 scores of given `document` in parallel
in relation to every item in corpus.
Expand All @@ -167,20 +170,14 @@ def _get_scores(bm25, document, average_idf):
BM25 object fitted on the corpus where documents are retrieved.
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.
Returns
-------
list of float
BM25 scores.
"""
scores = []
for index in xrange(bm25.corpus_size):
score = bm25.get_score(document, index, average_idf)
scores.append(score)
return scores
return bm25.get_scores(document)


def get_bm25_weights(corpus, n_jobs=1):
Expand Down Expand Up @@ -213,14 +210,13 @@ def get_bm25_weights(corpus, n_jobs=1):
"""
bm25 = BM25(corpus)
average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
weights = [bm25.get_scores(doc, average_idf) for doc in corpus]
weights = [bm25.get_scores(doc) for doc in corpus]
return weights

get_score = partial(_get_scores, bm25, average_idf=average_idf)
get_score = partial(_get_scores, bm25)
pool = Pool(n_processes)
weights = pool.map(get_score, corpus)
pool.close()
Expand Down

0 comments on commit d2f2311

Please sign in to comment.