From 3c4ac0abc1e23fe15ef1bc55c523707456683a38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?V=C3=ADt=20Novotn=C3=BD?= Date: Tue, 26 Nov 2019 20:42:53 +0100 Subject: [PATCH] Warn when BM25.average_idf < 0 Closes #2684 --- gensim/summarization/bm25.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/gensim/summarization/bm25.py b/gensim/summarization/bm25.py index b9621aeb1e..0477ca7430 100644 --- a/gensim/summarization/bm25.py +++ b/gensim/summarization/bm25.py @@ -37,6 +37,7 @@ """ +import logging import math from six import iteritems from six.moves import range @@ -48,6 +49,8 @@ PARAM_B = 0.75 EPSILON = 0.25 +logger = logging.getLogger(__name__) + class BM25(object): """Implementation of Best Matching 25 ranking function. @@ -116,6 +119,13 @@ def _initialize(self, corpus): negative_idfs.append(word) self.average_idf = float(idf_sum) / len(self.idf) + if self.average_idf < 0: + logger.warning( + 'Average inverse document frequency is less than zero. Your corpus of {} documents' + ' is either too small or it does not originate from actual text documents. BM25' + ' will likely produce "wrong" results.'.format(self.corpus_size) + ) + eps = EPSILON * self.average_idf for word in negative_idfs: self.idf[word] = eps