Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix formula in gensim.summarization.bm25. Fix #1828 #1833

Merged
merged 16 commits into from
Jan 11, 2018
Merged
Binary file added Keras-2.1.2-py2.7.egg
Binary file not shown.
9 changes: 6 additions & 3 deletions gensim/summarization/bm25.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please remove `Keras-2.1.2-py2.7.egg from PR (comment here, because I can't add comment to binary file)


"""This module contains function of computing rank scores for documents in
corpus and helper class `BM25` used in calculations. Original alhorithm
corpus and helper class `BM25` used in calculations. Original algorithm
descibed in [1]_, also you may check Wikipedia page [2]_.


Expand Down Expand Up @@ -61,7 +61,8 @@ class BM25(object):
Dictionary with terms frequencies for whole `corpus`. Words used as keys and frequencies as values.
idf : dict
Dictionary with inversed terms frequencies for whole `corpus`. Words used as keys and frequencies as values.

doc_len : list of int
List of document lengths.
"""

def __init__(self, corpus):
Expand All @@ -78,12 +79,14 @@ def __init__(self, corpus):
self.f = []
self.df = {}
self.idf = {}
self.doc_len = []
self.initialize()

def initialize(self):
"""Calculates frequencies of terms in documents and in corpus. Also computes inverse document frequencies."""
for document in self.corpus:
frequencies = {}
self.doc_len.append(len(document))
for word in document:
if word not in frequencies:
frequencies[word] = 0
Expand Down Expand Up @@ -122,7 +125,7 @@ def get_score(self, document, index, average_idf):
continue
idf = self.idf[word] if self.idf[word] >= 0 else EPSILON * average_idf
score += (idf * self.f[index][word] * (PARAM_K1 + 1)
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * len(document) / self.avgdl)))
/ (self.f[index][word] + PARAM_K1 * (1 - PARAM_B + PARAM_B * self.doc_len[index] / self.avgdl)))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please add several simple tests for BM25

return score

def get_scores(self, document, average_idf):
Expand Down
50 changes: 50 additions & 0 deletions gensim/test/test_BM25.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Automated tests for checking transformation algorithms (the models package).
"""

import logging
import unittest

from gensim.summarization.bm25 import get_bm25_weights
from gensim.test.utils import common_texts


class TestBM25(unittest.TestCase):
def test_max_match_with_itself(self):
""" Document should show maximum matching with itself """
weights = get_bm25_weights(common_texts)
for index, doc_weights in enumerate(weights):
expected = max(doc_weights)
predicted = doc_weights[index]
self.assertEqual(expected, predicted)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be assertAlmostEqual (because scores are float)


def test_nonnegative_weights(self):
""" All the weights for a partiular document should be non negative """
weights = get_bm25_weights(common_texts)
for doc_weights in weights:
for weight in doc_weights:
self.assertTrue(weight >= 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

0 -> 0.


def test_same_match_with_same_document(self):
""" A document should always get the same weight when matched with a particular document """
corpus = [['cat', 'dog', 'mouse'], ['cat', 'lion'], ['cat', 'lion']]
weights = get_bm25_weights(corpus)
self.assertEqual(weights[0][1], weights[0][2])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be assertAlmostEqual (because scores are float)


def test_disjoint_docs_if_weight_zero(self):
""" Two disjoint documents should have zero matching"""
corpus = [['cat', 'dog', 'lion'], ['goat', 'fish', 'tiger']]
weights = get_bm25_weights(corpus)
self.assertTrue(weights[0][1] == 0)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should be assertAlmostEqual (because scores are float), same for next line.

self.assertTrue(weights[1][0] == 0)


if __name__ == '__main__':
logging.basicConfig(level=logging.DEBUG)
unittest.main()