Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor BM25 #2275

Merged
merged 6 commits into from
Jan 9, 2019
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 40 additions & 46 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,14 +58,10 @@ class BM25(object):
Size of corpus (number of documents).
avgdl : float
Average length of document in `corpus`.
corpus : list of list of str
Corpus of documents.
f : list of dicts of int
doc_freqs : list of dicts of int
Term frequencies for each document in `corpus`: one dictionary per document, with words as keys and their counts in that document as values.
df : dict
Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
idf : dict
Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
Dictionary with inverse document frequencies for the whole `corpus`. Words are used as keys and their IDF values as values.
doc_len : list of int
List of document lengths.
"""
Expand All @@ -80,38 +76,50 @@ def __init__(self, corpus):
"""
self.corpus_size = len(corpus)
self.avgdl = 0
self.corpus = corpus
self.f = []
self.df = {}
self.doc_freqs = []
self.idf = {}
self.doc_len = []
self.initialize()
self._initialize(corpus)

def initialize(self):
def _initialize(self, corpus):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
nd = {} # word -> number of documents with word
num_doc = 0
for document in self.corpus:
num_doc += len(document)
for document in corpus:
self.doc_len.append(len(document))
num_doc += len(document)

frequencies = {}
for word in document:
if word not in frequencies:
frequencies[word] = 0
frequencies[word] += 1
self.f.append(frequencies)
self.doc_freqs.append(frequencies)

for word, freq in iteritems(frequencies):
if word not in self.df:
self.df[word] = 0
self.df[word] += 1
if word not in nd:
nd[word] = 0
nd[word] += 1

self.avgdl = float(num_doc) / self.corpus_size

for word, freq in iteritems(self.df):
self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)

def get_score(self, document, index, average_idf):
# collect idf sum to calculate an average idf for epsilon value
idf_sum = 0
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Safer to initialize this accumulator as a float (`0.0`), or to explicitly cast it to float in the division below.

This both guards against integer-vs-float division errors in future refactorings and, more importantly, makes the intent clear to readers.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, yes, you're right. Damn Python 2.

# collect words with negative idf so they can be assigned a special epsilon value.
# idf is negative when a word is contained in more than half of the documents
negative_idfs = []
for word, freq in iteritems(nd):
idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
self.idf[word] = idf
idf_sum += idf
if idf < 0:
negative_idfs.append(word)
self.average_idf = float(idf_sum) / len(self.idf)

eps = EPSILON * self.average_idf
for word in negative_idfs:
self.idf[word] = eps

def get_score(self, document, index):
"""Computes BM25 score of given `document` in relation to item of corpus selected by `index`.

Parameters
Expand All @@ -120,8 +128,6 @@ def get_score(self, document, index, average_idf):
Document to be scored.
index : int
Index of document in corpus selected to score with `document`.
average_idf : float
Average idf in corpus.

Returns
-------
Expand All @@ -130,39 +136,34 @@ def get_score(self, document, index, average_idf):

"""
score = 0
doc_freqs = self.doc_freqs[index]
for word in document:
if word not in self.f[index]:
if word not in doc_freqs:
continue
idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
score += (idf * self.f[index][word] * (PARAM_K1 + 1)
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
/ (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
return score

def get_scores(self, document, average_idf):
def get_scores(self, document):
"""Computes and returns BM25 scores of given `document` in relation to
every item in corpus.

Parameters
----------
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.

Returns
-------
list of float
BM25 scores.

"""
scores = []
for index in range(self.corpus_size):
score = self.get_score(document, index, average_idf)
scores.append(score)
scores = [self.get_score(document, index) for index in range(self.corpus_size)]
return scores


def _get_scores(bm25, document, average_idf):
def _get_scores(bm25, document):
"""Helper function for retrieving bm25 scores of given `document` in parallel
in relation to every item in corpus.

Expand All @@ -172,20 +173,14 @@ def _get_scores(bm25, document, average_idf):
BM25 object fitted on the corpus where documents are retrieved.
document : list of str
Document to be scored.
average_idf : float
Average idf in corpus.

Returns
-------
list of float
BM25 scores.

"""
scores = []
for index in range(bm25.corpus_size):
score = bm25.get_score(document, index, average_idf)
scores.append(score)
return scores
return bm25.get_scores(document)


def get_bm25_weights(corpus, n_jobs=1):
Expand Down Expand Up @@ -218,14 +213,13 @@ def get_bm25_weights(corpus, n_jobs=1):

"""
bm25 = BM25(corpus)
average_idf = float(sum(val for val in bm25.idf.values())) / len(bm25.idf)

n_processes = effective_n_jobs(n_jobs)
if n_processes == 1:
weights = [bm25.get_scores(doc, average_idf) for doc in corpus]
weights = [bm25.get_scores(doc) for doc in corpus]
return weights

get_score = partial(_get_scores, bm25, average_idf=average_idf)
get_score = partial(_get_scores, bm25)
pool = Pool(n_processes)
weights = pool.map(get_score, corpus)
pool.close()
Expand Down