diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py
index e516d625f6..7910fe71ab 100644
--- a/gensim/summarization/bm25.py
+++ b/gensim/summarization/bm25.py
@@ -58,14 +58,10 @@ class BM25(object):
         Size of corpus (number of documents).
     avgdl : float
         Average length of document in `corpus`.
-    corpus : list of list of str
-        Corpus of documents.
-    f : list of dicts of int
+    doc_freqs : list of dicts of int
         Dictionary with terms frequencies for each document in `corpus`. Words used as keys and frequencies as values.
-    df : dict
-        Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
     idf : dict
-        Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
+        Dictionary with inversed documents frequencies for whole `corpus`. Words used as keys and frequencies as values.
     doc_len : list of int
         List of document lengths.
     """
@@ -80,33 +76,47 @@ def __init__(self, corpus):
         """
         self.corpus_size = len(corpus)
         self.avgdl = sum(float(len(x)) for x in corpus) / self.corpus_size
-        self.corpus = corpus
-        self.f = []
-        self.df = {}
+        self.doc_freqs = []
         self.idf = {}
         self.doc_len = []
-        self.initialize()
+        self._initialize(corpus)
 
-    def initialize(self):
+    def _initialize(self, corpus):
         """Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
-        for document in self.corpus:
-            frequencies = {}
+
+        nd = {}  # word -> number of documents with word
+        for document in corpus:
             self.doc_len.append(len(document))
+            frequencies = {}
             for word in document:
                 if word not in frequencies:
                     frequencies[word] = 0
                 frequencies[word] += 1
-            self.f.append(frequencies)
+            self.doc_freqs.append(frequencies)
 
             for word, freq in iteritems(frequencies):
-                if word not in self.df:
-                    self.df[word] = 0
-                self.df[word] += 1
-
-        for word, freq in iteritems(self.df):
-            self.idf[word] = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
-
-    def get_score(self, document, index, average_idf):
+                if word not in nd:
+                    nd[word] = 0
+                nd[word] += 1
+
+        # collect idf sum to calculate an average idf for epsilon value
+        idf_sum = 0
+        # collect words with negative idf to set them a special epsilon value.
+        # idf can be negative if word is contained in more than half of documents
+        negative_idfs = []
+        for word, freq in iteritems(nd):
+            idf = math.log(self.corpus_size - freq + 0.5) - math.log(freq + 0.5)
+            self.idf[word] = idf
+            idf_sum += idf
+            if idf < 0:
+                negative_idfs.append(word)
+        self.average_idf = idf_sum / len(self.idf)
+
+        eps = EPSILON * self.average_idf
+        for word in negative_idfs:
+            self.idf[word] = eps
+
+    def get_score(self, document, index):
         """Computes BM25 score of given `document` in relation to item of corpus selected by `index`.
 
         Parameters
@@ -115,8 +125,6 @@ def get_score(self, document, index, average_idf):
             Document to be scored.
         index : int
             Index of document in corpus selected to score with `document`.
-        average_idf : float
-            Average idf in corpus.
 
         Returns
         -------
@@ -125,15 +133,15 @@ def get_score(self, document, index, average_idf):
 
         """
         score = 0
+        doc_freqs = self.doc_freqs[index]
         for word in document:
-            if word not in self.f[index]:
+            if word not in doc_freqs:
                 continue
-            idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
-            score += (idf * self.f[index][word] * (PARAM_K1 + 1)
-                      / (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
+            score += (self.idf[word] * doc_freqs[word] * (PARAM_K1 + 1)
+                      / (doc_freqs[word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
         return score
 
-    def get_scores(self, document, average_idf):
+    def get_scores(self, document):
         """Computes and returns BM25 scores of given `document`
         in relation to every item in corpus.
 
@@ -141,8 +149,6 @@ def get_scores(self, document, average_idf):
         ----------
         document : list of str
             Document to be scored.
-        average_idf : float
-            Average idf in corpus.
 
         Returns
         -------
@@ -150,14 +156,11 @@ def get_scores(self, document, average_idf):
             BM25 scores.
 
         """
-        scores = []
-        for index in xrange(self.corpus_size):
-            score = self.get_score(document, index, average_idf)
-            scores.append(score)
+        scores = [self.get_score(document, index) for index in xrange(self.corpus_size)]
        return scores
 
 
-def _get_scores(bm25, document, average_idf):
+def _get_scores(bm25, document):
     """Helper function for retrieving bm25 scores of given `document` in parallel
     in relation to every item in corpus.
 
@@ -167,8 +170,6 @@ def _get_scores(bm25, document, average_idf):
         BM25 object fitted on the corpus where documents are retrieved.
     document : list of str
         Document to be scored.
-    average_idf : float
-        Average idf in corpus.
 
     Returns
     -------
@@ -176,11 +177,7 @@ def _get_scores(bm25, document, average_idf):
         BM25 scores.
 
     """
-    scores = []
-    for index in xrange(bm25.corpus_size):
-        score = bm25.get_score(document, index, average_idf)
-        scores.append(score)
-    return scores
+    return bm25.get_scores(document)
 
 
 def get_bm25_weights(corpus, n_jobs=1):
@@ -213,14 +210,13 @@ def get_bm25_weights(corpus, n_jobs=1):
 
     """
     bm25 = BM25(corpus)
-    average_idf = sum(float(val) for val in bm25.idf.values()) / len(bm25.idf)
 
     n_processes = effective_n_jobs(n_jobs)
     if n_processes == 1:
-        weights = [bm25.get_scores(doc, average_idf) for doc in corpus]
+        weights = [bm25.get_scores(doc) for doc in corpus]
         return weights
 
-    get_score = partial(_get_scores, bm25, average_idf=average_idf)
+    get_score = partial(_get_scores, bm25)
     pool = Pool(n_processes)
     weights = pool.map(get_score, corpus)
     pool.close()
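
Usage note (a minimal sketch, not part of the patch): with this change, callers no longer compute or pass `average_idf`; the flooring of negative idf values to `EPSILON * average_idf` happens once inside `_initialize`, so scoring reduces to constructing the model and calling `get_scores`. The tokenized toy corpus and query below are hypothetical.

    from gensim.summarization.bm25 import BM25, get_bm25_weights

    corpus = [
        ["black", "cat", "white", "cat"],
        ["cat", "outer", "space"],
        ["wag", "dog"],
    ]
    bm25 = BM25(corpus)                           # idf (with epsilon smoothing) computed in _initialize
    scores = bm25.get_scores(["cat", "dog"])      # no average_idf argument anymore
    weights = get_bm25_weights(corpus, n_jobs=1)  # pairwise document-vs-document scores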