From 5e1830bd1d3946504c773ff825b2c262e1fe89cb Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 20 Oct 2016 23:17:18 +0530 Subject: [PATCH 01/25] fixing appveyor --- gensim/test/{test_basemodel.py => basetests.py} | 0 gensim/test/test_hdpmodel.py | 4 ++-- gensim/test/test_ldamodel.py | 4 ++-- gensim/test/test_lsimodel.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) rename gensim/test/{test_basemodel.py => basetests.py} (100%) diff --git a/gensim/test/test_basemodel.py b/gensim/test/basetests.py similarity index 100% rename from gensim/test/test_basemodel.py rename to gensim/test/basetests.py diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 8c0495cb9a..2fb4fb8a80 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -22,7 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -48,7 +48,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestHdpModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index cc2cececc9..a96d96ae6f 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -23,7 +23,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -55,7 +55,7 @@ def testRandomState(): assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState)) -class TestLdaModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestLdaModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index ab86c18d4f..cb2052773c 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -22,7 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import lsimodel from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -51,7 +51,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestLsiModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestLsiModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) From e8a3f1671e2416845353822513366909fddddc45 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 12:14:05 +0530 Subject: [PATCH 02/25] verify weights --- gensim/models/tfidfmodel.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 
50320ad747..75716f86bc 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -14,6 +14,23 @@ logger = logging.getLogger(__name__) +def resolve_weights(smartirs): + if not isinstance(smartirs, str) or len(smartirs)!=3: + raise ValueError('Expected a string of length 3 except got ' + smartirs): + + w_tf, w_df, w_n = smartirs + + if w_tf not in 'nlabL': + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + n_tf) + + if w_idf not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + n_idf) + + if w_n not in 'ncb': + raise ValueError('Expected normalization weight to be one of ncb, except got ' + n_n) + + return w_tf, w_idf, w_n + def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ @@ -50,7 +67,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + wlocal=utils.identity, wglobal=df2idf, normalize=True, smartirs="nnc"): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,6 +99,8 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, self.id2word = id2word self.wlocal, self.wglobal = wlocal, wglobal self.num_docs, self.num_nnz, self.idfs = None, None, None + self.smartirs = smartirs + if dictionary is not None: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the From 648bf21573b59e66b86e2c16b2ed4e0b0bfd0c55 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 12:14:05 +0530 Subject: [PATCH 03/25] verify weights --- gensim/models/tfidfmodel.py | 40 ++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 50320ad747..e4844f377e 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -15,6 +15,24 @@ logger = logging.getLogger(__name__) +def resolve_weights(smartirs): + if not isinstance(smartirs, str) or len(smartirs) != 3: + raise ValueError('Expected a string of length 3 except got ' + smartirs) + + w_tf, w_df, w_n = smartirs + + if w_tf not in 'nlabL': + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) + + if w_df not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) + + if w_n not in 'ncb': + raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) + + return w_tf, w_df, w_n + + def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: @@ -49,8 +67,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
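    Loading restores the same mapping; a minimal round-trip sketch (the printed
    weights depend entirely on the training corpus, so no output is shown):

    >>> tfidf2 = TfidfModel.load('/tmp/foo.tfidf_model')
    >>> print(tfidf2[some_doc])  # same weights as tfidf[some_doc]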
""" - def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", + wlocal=None, wglobal=None, normalize=True): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,6 +100,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, self.id2word = id2word self.wlocal, self.wglobal = wlocal, wglobal self.num_docs, self.num_nnz, self.idfs = None, None, None + n_tf, n_df, n_n = smartirs + + if n_tf == "n": + pass + elif n_tf == "": + pass + + self.wlocal = utils.identity + self.wglobal = df2idf + if dictionary is not None: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the @@ -92,6 +120,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, ) self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary @@ -113,6 +142,7 @@ def initialize(self, corpus): logger.info("collecting document frequencies") dfs = {} numnnz, docno = 0, -1 + for docno, bow in enumerate(corpus): if docno % 10000 == 0: logger.info("PROGRESS: processing document #%i", docno) @@ -131,7 +161,7 @@ def initialize(self, corpus): "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", self.num_docs, n_features, self.num_nnz ) - self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) + #self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -145,8 +175,8 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) vector = [ - (termid, self.wlocal(tf) * self.idfs.get(termid)) - for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 + (termid, self.wlocal(tf) * self.wglobal(self.dfs[termid], self.num_docs)) + for termid, tf in bow if self.wglobal(self.dfs[termid], self.num_docs) != 0.0 ] # and finally, normalize the vector either to unit length, or use a From a6f1afbe0fb218f07aafbf5a7648792a0ae65f81 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 15:11:29 +0530 Subject: [PATCH 04/25] smartirs ready --- gensim/models/tfidfmodel.py | 69 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e4844f377e..e396618c1f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -11,6 +11,7 @@ from gensim import interfaces, matutils, utils from six import iteritems +import numpy as np logger = logging.getLogger(__name__) @@ -33,22 +34,6 @@ def resolve_weights(smartirs): return w_tf, w_df, w_n -def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): - """ - Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: - - idf = add + log(totaldocs / doc_freq) - """ - return add + math.log(1.0 * totaldocs / docfreq, log_base) - - -def precompute_idfs(wglobal, dfs, total_docs): - """Precompute the inverse document frequency mapping for all terms.""" - # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. 
- # this method is here just to speed things up a little. - return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} - - class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -67,8 +52,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", - wlocal=None, wglobal=None, normalize=True): + def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", + wlocal=None, wglobal=None, wnormalize=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -96,19 +81,38 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", and it will be used to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). """ - self.normalize = normalize self.id2word = id2word - self.wlocal, self.wglobal = wlocal, wglobal + self.wlocal, self.wglobal, self.wnormalize = wlocal, wglobal, wnormalize self.num_docs, self.num_nnz, self.idfs = None, None, None n_tf, n_df, n_n = smartirs - if n_tf == "n": - pass - elif n_tf == "": - pass - - self.wlocal = utils.identity - self.wglobal = df2idf + if self.wlocal is None: + if n_tf == "n": + self.wlocal = lambda tf, mean=None, _max=None: tf + elif n_tf == "l": + self.wlocal = lambda tf, mean=None, _max=None: 1 + math.log(tf) + elif n_tf == "a": + self.wlocal = lambda tf, mean=None, _max=None: 0.5 + (0.5 * tf / _max) + elif n_tf == "b": + self.wlocal = lambda tf, mean=None, _max=None: 1 if tf > 0 else 0 + elif n_tf == "L": + self.wlocal = lambda tf, mean=None, _max=None: (1 + math.log(tf)) / (1 + math.log(mean)) + + if self.wglobal is None: + if n_df == "n": + self.wglobal = utils.identity + elif n_df == "t": + self.wglobal = lambda docfreq, totaldocs: math.log(1.0 * totaldocs / docfreq, 10) + elif n_tf == "p": + self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) + + if self.wnormalize is None: + if n_n == "n": + self.wnormalize = lambda x: x + elif n_n == "c": + self.wnormalize = matutils.unitvec + elif n_n == "t": + self.wnormalize = matutils.unitvec if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -121,7 +125,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() - self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary elif corpus is not None: @@ -161,7 +164,6 @@ def initialize(self, corpus): "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", self.num_docs, n_features, self.num_nnz ) - #self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -174,17 +176,16 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) + vector = [ - (termid, self.wlocal(tf) * self.wglobal(self.dfs[termid], self.num_docs)) + (termid, self.wlocal(tf, mean=np.mean(np.array(bow), axis=1), _max=np.max(bow, axis=1)) * self.wglobal(self.dfs[termid], self.num_docs)) for termid, tf in bow if self.wglobal(self.dfs[termid], 
self.num_docs) != 0.0 ] # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - if self.normalize is True: - vector = matutils.unitvec(vector) - elif self.normalize: - vector = self.normalize(vector) + + vector = self.wnormalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] From d091138ee30798483919b5977db707ad36d4eb9c Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 16:10:47 +0530 Subject: [PATCH 05/25] change old tests --- gensim/models/tfidfmodel.py | 26 ++++++++++++-------------- gensim/sklearn_api/tfidf.py | 7 ++++--- gensim/test/test_sklearn_api.py | 7 +++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e396618c1f..e408dd9118 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -53,7 +53,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", - wlocal=None, wglobal=None, wnormalize=None): + wlocal=None, wglobal=None, normalize=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,9 +82,10 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", mapping (then `corpus`, if specified, is ignored). """ self.id2word = id2word - self.wlocal, self.wglobal, self.wnormalize = wlocal, wglobal, wnormalize + self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None n_tf, n_df, n_n = smartirs + self.smartirs = smartirs if self.wlocal is None: if n_tf == "n": @@ -106,13 +107,14 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", elif n_tf == "p": self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) - if self.wnormalize is None: - if n_n == "n": - self.wnormalize = lambda x: x - elif n_n == "c": - self.wnormalize = matutils.unitvec - elif n_n == "t": - self.wnormalize = matutils.unitvec + if self.normalize is None or isinstance(self.normalize, bool): + if n_n == "n" or self.normalize is False: + self.normalize = lambda x: x + elif n_n == "c" or self.normalize is True: + self.normalize = matutils.unitvec + # TODO write byte-size normalisation + # elif n_n == "b": + # self.normalize = matutils.unitvec if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -160,10 +162,6 @@ def initialize(self, corpus): # and finally compute the idf weights n_features = max(dfs) if dfs else 0 - logger.info( - "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, n_features, self.num_nnz - ) def __getitem__(self, bow, eps=1e-12): """ @@ -185,7 +183,7 @@ def __getitem__(self, bow, eps=1e-12): # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.wnormalize(vector) + vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index c0a45f1823..28bd908329 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -21,14 +21,15 @@ class 
TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + def __init__(self, id2word=None, dictionary=None, smartirs="ntc", wlocal=None, + wglobal=None, normalize=True): """ Sklearn wrapper for Tf-Idf model. """ self.gensim_model = None self.id2word = id2word self.dictionary = dictionary + self.smartirs = smartirs self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize @@ -38,7 +39,7 @@ def fit(self, X, y=None): Fit the model according to the given training data. """ self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, + corpus=X, id2word=self.id2word, dictionary=self.dictionary, smartirs=self.smartirs, wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize ) return self diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 3793c79948..947804c59d 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -498,7 +498,6 @@ def testPersistence(self): original_matrix = self.model.transform(original_bow) passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) self.assertTrue(passed) - def testModelNotFitted(self): lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) texts_new = ['graph', 'eulerian'] @@ -973,13 +972,13 @@ def testTransform(self): def testSetGetParams(self): # updating only one param - self.model.set_params(normalize=False) + self.model.set_params(smartirs='nnn') model_params = self.model.get_params() - self.assertEqual(model_params["normalize"], False) + self.assertEqual(model_params["smartirs"], 'nnn') # verify that the attributes values are also changed for `gensim_model` after fitting self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: From 951c549dff26c49f105c70686c4063c3c267c59f Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 18:31:18 +0530 Subject: [PATCH 06/25] remove lambdas --- gensim/models/tfidfmodel.py | 58 ++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e408dd9118..a418f1334f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -87,34 +87,40 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", n_tf, n_df, n_n = smartirs self.smartirs = smartirs - if self.wlocal is None: - if n_tf == "n": - self.wlocal = lambda tf, mean=None, _max=None: tf - elif n_tf == "l": - self.wlocal = lambda tf, mean=None, _max=None: 1 + math.log(tf) - elif n_tf == "a": - self.wlocal = lambda tf, mean=None, _max=None: 0.5 + (0.5 * tf / _max) - elif n_tf == "b": - self.wlocal = lambda tf, mean=None, _max=None: 1 if tf > 0 else 0 - elif n_tf == "L": - self.wlocal = lambda tf, mean=None, _max=None: (1 + math.log(tf)) / (1 + math.log(mean)) - - if self.wglobal is None: - if n_df == "n": - self.wglobal = utils.identity - elif n_df == "t": - self.wglobal = lambda docfreq, totaldocs: math.log(1.0 * totaldocs / docfreq, 10) - elif n_tf == "p": - self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) + if wlocal is None: + def wlocal(tf, mean=None, _max=None): + if n_tf == "n": + return 
tf + elif n_tf == "l": + return 1 + math.log(tf) + elif n_tf == "a": + return 0.5 + (0.5 * tf / _max) + elif n_tf == "b": + return 1 if tf > 0 else 0 + elif n_tf == "L": + return (1 + math.log(tf)) / (1 + math.log(mean)) + self.wlocal = wlocal + + if wglobal is None: + def wglobal(docfreq, totaldocs): + if n_df == "n": + return utils.identity(docfreq) + elif n_df == "t": + return math.log(1.0 * totaldocs / docfreq, 10) + elif n_tf == "p": + return math.log((float(totaldocs) - docfreq) / docfreq) + self.wglobal = wglobal if self.normalize is None or isinstance(self.normalize, bool): - if n_n == "n" or self.normalize is False: - self.normalize = lambda x: x - elif n_n == "c" or self.normalize is True: - self.normalize = matutils.unitvec - # TODO write byte-size normalisation - # elif n_n == "b": - # self.normalize = matutils.unitvec + def normalize(x): + if n_n == "n" or self.normalize is False: + return x + elif n_n == "c" or self.normalize is True: + return matutils.unitvec(x) + # TODO write byte-size normalisation + # elif n_n == "b": + # pass + self.normalize = normalize if dictionary is not None: # user supplied a Dictionary object, which already contains all the From 40c0558e67e3679b2797e419b75fe3852e225905 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Sun, 17 Dec 2017 02:14:40 +0530 Subject: [PATCH 07/25] address suggestions --- gensim/models/tfidfmodel.py | 95 ++++++++++++++++++++++++--------- gensim/sklearn_api/tfidf.py | 12 ++--- gensim/test/test_sklearn_api.py | 1 + 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a418f1334f..3c381ca715 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,11 +6,11 @@ import logging -import math from gensim import interfaces, matutils, utils from six import iteritems +import math import numpy as np logger = logging.getLogger(__name__) @@ -23,17 +23,32 @@ def resolve_weights(smartirs): w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) + raise ValueError('Expected term frequency weight to be one of \'nlabL\', except got ' + w_tf + '\'') if w_df not in 'ntp': - raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) + raise ValueError('Expected inverse document frequency weight to be one of \'ntp\', except got ' + w_df + '\'') if w_n not in 'ncb': - raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) + raise ValueError('Expected normalization weight to be one of \'ncb\', except got \'' + w_n + '\'') return w_tf, w_df, w_n +def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): + """ + Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: + idf = add + log(totaldocs / doc_freq) + """ + return add + np.log(float(totaldocs) / docfreq) / np.log(2) + + +def precompute_idfs(wglobal, dfs, total_docs): + """Precompute the inverse document frequency mapping for all terms.""" + # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. + # this method is here just to speed things up a little. 
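+    # as a worked example with the default df2idf above (log base 2, add=0.0):
+    # a term present in 10 of 1000 documents gets idf = log2(1000 / 10) ~= 6.64.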
+ return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} + + class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -52,8 +67,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", - wlocal=None, wglobal=None, normalize=None): + def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, + wglobal=df2idf, normalize=True, smartirs=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -80,42 +95,63 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`, byte size - `b`. + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System """ + self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - n_tf, n_df, n_n = smartirs self.smartirs = smartirs - if wlocal is None: - def wlocal(tf, mean=None, _max=None): + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + + # If smartirs is not None, override wlocal, wglobal and normalize + if smartirs is not None: + n_tf, n_df, n_n = resolve_weights(smartirs) + + def wlocal(tf): if n_tf == "n": return tf elif n_tf == "l": - return 1 + math.log(tf) + return 1 + np.log(tf) / np.log(2) elif n_tf == "a": - return 0.5 + (0.5 * tf / _max) + return 0.5 + (0.5 * tf / tf.max(axis=0)) elif n_tf == "b": - return 1 if tf > 0 else 0 + return tf.astype('bool').astype('int') elif n_tf == "L": - return (1 + math.log(tf)) / (1 + math.log(mean)) + return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) self.wlocal = wlocal - if wglobal is None: def wglobal(docfreq, totaldocs): if n_df == "n": return utils.identity(docfreq) elif n_df == "t": - return math.log(1.0 * totaldocs / docfreq, 10) - elif n_tf == "p": - return math.log((float(totaldocs) - docfreq) / docfreq) + return np.log(1.0 * totaldocs / docfreq) / np.log(2) + elif n_df == "p": + return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) self.wglobal = wglobal - if self.normalize is None or isinstance(self.normalize, bool): def normalize(x): - if n_n == "n" or self.normalize is False: + if n_n == "n": return x - elif n_n == "c" or self.normalize is True: + elif n_n == "c": return matutils.unitvec(x) # TODO write byte-size normalisation # elif n_n == "b": @@ -132,7 +168,7 @@ def normalize(x): ) self.num_docs, self.num_nnz = 
dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() - + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary elif corpus is not None: @@ -165,9 +201,13 @@ def initialize(self, corpus): self.num_docs = docno + 1 self.num_nnz = numnnz self.dfs = dfs - # and finally compute the idf weights n_features = max(dfs) if dfs else 0 + logger.info( + "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", + self.num_docs, n_features, self.num_nnz + ) + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -181,9 +221,16 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) + termid_array, tf_array = [], [] + for termid, tf in bow: + termid_array.append(termid) + tf_array.append(tf) + + tf_array = self.wlocal(np.array(tf_array)) + vector = [ - (termid, self.wlocal(tf, mean=np.mean(np.array(bow), axis=1), _max=np.max(bow, axis=1)) * self.wglobal(self.dfs[termid], self.num_docs)) - for termid, tf in bow if self.wglobal(self.dfs[termid], self.num_docs) != 0.0 + (termid, tf * self.idfs.get(termid)) + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] # and finally, normalize the vector either to unit length, or use a diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 28bd908329..dc4ab86c01 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -12,8 +12,8 @@ from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError -import gensim from gensim.models import TfidfModel +import gensim class TfIdfTransformer(TransformerMixin, BaseEstimator): @@ -21,26 +21,26 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, id2word=None, dictionary=None, smartirs="ntc", wlocal=None, - wglobal=None, normalize=True): + def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"): """ Sklearn wrapper for Tf-Idf model. """ self.gensim_model = None self.id2word = id2word self.dictionary = dictionary - self.smartirs = smartirs self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize + self.smartirs = smartirs def fit(self, X, y=None): """ Fit the model according to the given training data. 
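        A minimal usage sketch (`common_corpus` here stands for any iterable of
        bag-of-words documents and is illustrative only):

        >>> transformer = TfIdfTransformer(smartirs='ntc')
        >>> weighted = transformer.fit(common_corpus).transform(common_corpus)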
""" self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, smartirs=self.smartirs, - wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize + corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal, + wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs, ) return self diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 947804c59d..5e0511aa5c 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -498,6 +498,7 @@ def testPersistence(self): original_matrix = self.model.transform(original_bow) passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) self.assertTrue(passed) + def testModelNotFitted(self): lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) texts_new = ['graph', 'eulerian'] From b35344c43b0009cbaae5b46936800bf58ae578bb Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 19 Dec 2017 16:00:43 +0530 Subject: [PATCH 08/25] minor fix --- gensim/models/tfidfmodel.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 75716f86bc..b308aaef04 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -14,22 +14,23 @@ logger = logging.getLogger(__name__) + def resolve_weights(smartirs): - if not isinstance(smartirs, str) or len(smartirs)!=3: - raise ValueError('Expected a string of length 3 except got ' + smartirs): + if not isinstance(smartirs, str) or len(smartirs) != 3: + raise ValueError('Expected a string of length 3 except got ' + smartirs) w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + n_tf) + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) - if w_idf not in 'ntp': - raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + n_idf) + if w_df not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) if w_n not in 'ncb': - raise ValueError('Expected normalization weight to be one of ncb, except got ' + n_n) + raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) - return w_tf, w_idf, w_n + return w_tf, w_df, w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): From 0917e75c03bec7273cc264a405c28b6a48f7b59e Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 19 Dec 2017 16:24:43 +0530 Subject: [PATCH 09/25] pep8 fix --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3c381ca715..4546672ddd 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -10,7 +10,6 @@ from gensim import interfaces, matutils, utils from six import iteritems -import math import numpy as np logger = logging.getLogger(__name__) From d3d431c5a4274ec6c0143c9f89742389a6935244 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 21 Dec 2017 11:48:27 +0500 Subject: [PATCH 10/25] fix pickle problem --- gensim/models/tfidfmodel.py | 67 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index fa5a7aa8b1..7fb25430ee 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,6 +6,7 @@ import logging +from functools import partial from 
gensim import interfaces, matutils, utils from six import iteritems @@ -53,6 +54,38 @@ def precompute_idfs(wglobal, dfs, total_docs): return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} +def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) + if n_tf == "n": + return tf + elif n_tf == "l": + return 1 + np.log(tf) / np.log(2) + elif n_tf == "a": + return 0.5 + (0.5 * tf / tf.max(axis=0)) + elif n_tf == "b": + return tf.astype('bool').astype('int') + elif n_tf == "L": + return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + + +def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) + if n_df == "n": + return utils.identity(docfreq) + elif n_df == "t": + return np.log(1.0 * totaldocs / docfreq) / np.log(2) + elif n_df == "p": + return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) + + +def normalize_g(x, n_n): # TODO rename it (to avoid confusion) + if n_n == "n": + return x + elif n_n == "c": + return matutils.unitvec(x) + # TODO write byte-size normalisation + # elif n_n == "b": + # pass + + class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -148,37 +181,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - def wlocal(tf): - if n_tf == "n": - return tf - elif n_tf == "l": - return 1 + np.log(tf) / np.log(2) - elif n_tf == "a": - return 0.5 + (0.5 * tf / tf.max(axis=0)) - elif n_tf == "b": - return tf.astype('bool').astype('int') - elif n_tf == "L": - return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) - self.wlocal = wlocal - - def wglobal(docfreq, totaldocs): - if n_df == "n": - return utils.identity(docfreq) - elif n_df == "t": - return np.log(1.0 * totaldocs / docfreq) / np.log(2) - elif n_df == "p": - return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) - self.wglobal = wglobal - - def normalize(x): - if n_n == "n": - return x - elif n_n == "c": - return matutils.unitvec(x) - # TODO write byte-size normalisation - # elif n_n == "b": - # pass - self.normalize = normalize + self.wlocal = partial(wlocal_g, n_tf=n_tf) + self.wglobal = partial(wglobal_g, n_df=n_df) + self.normalize = partial(normalize_g, n_n=n_n) if dictionary is not None: # user supplied a Dictionary object, which already contains all the From 0e6f21e24149ee33c46323f0db9ac818f5cc53bc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Thu, 21 Dec 2017 12:42:38 +0530 Subject: [PATCH 11/25] flake8 fix --- gensim/models/tfidfmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 7fb25430ee..8b3d2697ed 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -104,6 +104,7 @@ class TfidfModel(interfaces.TransformationABC): >>> tfidf.save('/tmp/foo.tfidf_model') Model persistency is achieved via its load/save methods. 
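    Because the smartirs weighting callables are built from the module-level
    helpers wlocal_g, wglobal_g and normalize_g via functools.partial (rather
    than nested functions), a configured model stays picklable on Python 3;
    a small sketch of the binding (the value follows from wglobal_g above):

    >>> from functools import partial
    >>> wglobal = partial(wglobal_g, n_df='t')
    >>> round(float(wglobal(10, 1000)), 2)  # log2(1000 / 10)
    6.64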
+ """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, From 7ee75602157528110455a21da1ef0128a42f186c Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 21 Dec 2017 14:21:48 +0500 Subject: [PATCH 12/25] fix bug in docstring --- gensim/models/tfidfmodel.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 8b3d2697ed..f4f762c879 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -91,14 +91,8 @@ class TfidfModel(interfaces.TransformationABC): Objects of this class realize the transformation between word-document co-occurrence matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). - Methods - ------- - __init__(corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, - wglobal=df2idf, normalize=True, smartirs=None): - Calculates inverse document counts for all terms in the training corpus. - __getitem__(bow, eps=1e-12) - which transforms a simple count representation into the TfIdf space. - + Examples + -------- >>> tfidf = TfidfModel(corpus) >>> print(tfidf[some_doc]) >>> tfidf.save('/tmp/foo.tfidf_model') From b2def84db38ef7b6158b92eb6ae3f5e48b0a2299 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 13:01:42 +0530 Subject: [PATCH 13/25] added few tests --- gensim/test/test_sklearn_api.py | 203 ++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 5e0511aa5c..ec2c287356 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1014,6 +1014,209 @@ def testModelNotFitted(self): tfidf_wrapper = TfIdfTransformer() self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) + def testConsistency(self): + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + expected_docs = [ + [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), + (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + + # nnn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='nnn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2), (1, 2), (2, 2)], + [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # nnc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='nnc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], + [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ntn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntn') + self.model.fit(self.corpus) + transformed_docs = 
self.model.transform(docs) + + expected_docs = [ + [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], + [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ntc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548), (8, 0.44424552527467476)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # npn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='npn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 1.8073549220576042), (1, 1.8073549220576042), (2, 1.8073549220576042)], + [(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # npc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='npc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], + [(3, 0.46563179782533826), (4, 0.46563179782533826), (5, 0.25763163180767745), (6, 0.46563179782533826), (7, 0.25763163180767745), (8, 0.46563179782533826)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lnn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lnn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2.0), (1, 2.0), (2, 2.0)], + [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lnc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lnc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], + [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ltn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ltn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], + [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + ] + 
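+        # sanity check on the `t` idf column, assuming the 9-document test
+        # corpus: wglobal = log2(num_docs / df), so df = 2 gives
+        # log2(9 / 2) ~= 2.1699 and df = 3 gives log2(9 / 3) ~= 1.5850,
+        # matching the constants above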
self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ltc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ltc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.44424552527467476), + (4, 0.44424552527467476), + (5, 0.32448702061385548), + (6, 0.44424552527467476), + (7, 0.32448702061385548), + (8, 0.44424552527467476)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 1.8073549220576042), + (1, 1.8073549220576042), + (2, 1.8073549220576042)], + [(3, 1.8073549220576042), + (4, 1.8073549220576042), + (5, 1.0), + (6, 1.8073549220576042), + (7, 1.0), + (8, 1.8073549220576042)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.46563179782533826), + (4, 0.46563179782533826), + (5, 0.25763163180767745), + (6, 0.46563179782533826), + (7, 0.25763163180767745), + (8, 0.46563179782533826)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + #pprint.pprint(transformed_docs) + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.46563179782533826), + (4, 0.46563179782533826), + (5, 0.25763163180767745), + (6, 0.46563179782533826), + (7, 0.25763163180767745), + (8, 0.46563179782533826)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) class TestHdpTransformer(unittest.TestCase): def setUp(self): From 5b2d37afbeb02dfb3b004ea2c7814e6670bbc19a Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 15:05:43 +0530 Subject: [PATCH 14/25] fix normalize issue for pickling --- gensim/models/tfidfmodel.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index f4f762c879..6f6c6e106b 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -167,11 +167,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) @@ -255,9 +250,13 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if 
self.idfs.get(termid, 0.0) != 0.0 ] + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) From ac4b154ef5dbde037c833dd71386676cd84e21fc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 15:05:43 +0530 Subject: [PATCH 15/25] fix normalize issue for pickling --- gensim/models/tfidfmodel.py | 28 ++-- gensim/test/test_sklearn_api.py | 273 ++++++++++++++------------------ 2 files changed, 129 insertions(+), 172 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index f4f762c879..3d32b334fb 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -54,7 +54,7 @@ def precompute_idfs(wglobal, dfs, total_docs): return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} -def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) +def updated_wlocal(tf, n_tf): if n_tf == "n": return tf elif n_tf == "l": @@ -67,7 +67,7 @@ def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) -def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) +def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -76,14 +76,11 @@ def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) -def normalize_g(x, n_n): # TODO rename it (to avoid confusion) +def updated_normalize(x, n_n): # TODO rename it (to avoid confusion) if n_n == "n": return x elif n_n == "c": return matutils.unitvec(x) - # TODO write byte-size normalisation - # elif n_n == "b": - # pass class TfidfModel(interfaces.TransformationABC): @@ -152,7 +149,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. Document normalization: - none - `n`, cosine - `c`, byte size - `b`. + none - `n`, cosine - `c`. 
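        For example, `smartirs='ltc'` selects logarithmic term frequency, idf
        document weighting and cosine normalization. A minimal sketch, assuming
        `corpus` is an iterable of bag-of-words vectors and `doc` is one of them:

        >>> model = TfidfModel(corpus, smartirs='ltc')
        >>> print(model[doc])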
for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System @@ -167,18 +164,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(wlocal_g, n_tf=n_tf) - self.wglobal = partial(wglobal_g, n_df=n_df) - self.normalize = partial(normalize_g, n_n=n_n) + self.wlocal = partial(updated_wlocal, n_tf=n_tf) + self.wglobal = partial(updated_wglobal, n_df=n_df) + self.normalize = partial(updated_normalize, n_n=n_n) if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -255,9 +247,13 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index ec2c287356..33d2d5b777 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1000,6 +1000,9 @@ def testPipeline(self): self.assertGreater(score, 0.40) def testPersistence(self): + # Test current model persistency. + self.model.set_params(smartirs='ntc') + model_dump = pickle.dumps(self.model) model_load = pickle.loads(model_dump) @@ -1010,213 +1013,171 @@ def testPersistence(self): original_transformed_doc = self.model.transform(doc) self.assertEqual(original_transformed_doc, loaded_transformed_doc) + # compare backward model pickle compatibility + with open("test_data/tfidf_model.pkl", "rb") as model_handler: + model_load = pickle.load(model_handler) + + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and new models + original_transformed_doc = self.model.transform(doc) + self.assertEqual(original_transformed_doc, loaded_transformed_doc) + def testModelNotFitted(self): tfidf_wrapper = TfIdfTransformer() self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) def testConsistency(self): - docs = [corpus[0], corpus[1]] + # Test if `ntc` yields the default docs. 
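+        # (with smartirs left as None the model uses the wlocal/wglobal/normalize
+        #  arguments, whose defaults of utils.identity, df2idf and True are
+        #  exactly what the letters 'ntc' spell out)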
+ docs = [corpus[1], corpus[2]] + self.model.set_params(smartirs='ntc') self.model.fit(self.corpus) transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), - (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] - ] + + self.model.set_params(normalize=True) + self.model.fit(self.corpus) + expected_docs = self.model.transform(docs) + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - - # nnn - docs = [corpus[0], corpus[1]] + + # Testing all the variations of `wlocal` + # smartirs=`nnn` self.model.set_params(smartirs='nnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2), (1, 2), (2, 2)], - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # nnc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='nnc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], - [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ntn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ntn') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], - [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ntc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ntc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548), (8, 0.44424552527467476)] + expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], + [(5, 6), (9, 3), (10, 3)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # npn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='npn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`lnn` + self.model.set_params(smartirs='lnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 1.8073549220576042), (1, 1.8073549220576042), (2, 1.8073549220576042)], - [(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)] + expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], + [(5, 
6.0), (9, 3.0), (10, 3.0)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # npc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='npc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`ann` + self.model.set_params(smartirs='ann') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], - [(3, 0.46563179782533826), (4, 0.46563179782533826), (5, 0.25763163180767745), (6, 0.46563179782533826), (7, 0.25763163180767745), (8, 0.46563179782533826)] + [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], + [(5, 3.0), (9, 2.25), (10, 2.25)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lnn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lnn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`bnn` + self.model.set_params(smartirs='bnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 2.0), (1, 2.0), (2, 2.0)], - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)] + [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], + [(5, 3), (9, 3), (10, 3)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lnc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lnc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`Lnn` + self.model.set_params(smartirs='Lnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], - [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + expected_docs = [[(3, 1.4635792826230198), + (4, 1.4635792826230198), + (5, 2.19536892393453), + (6, 1.4635792826230198), + (7, 2.19536892393453), + (8, 1.4635792826230198)], + [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ltn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ltn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `glocal` + # smartirs=`ntn` + self.model.set_params(smartirs='ntn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], - [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + expected_docs = [[(3, 2.1699250014423126), + (4, 2.1699250014423126), + (5, 1.5849625007211563), + (6, 2.1699250014423126), + (7, 1.5849625007211563), + (8, 2.1699250014423126)], + [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], 
expected_docs[1])) - - # ltc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ltc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.44424552527467476), - (4, 0.44424552527467476), - (5, 0.32448702061385548), - (6, 0.44424552527467476), - (7, 0.32448702061385548), - (8, 0.44424552527467476)]] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`npn` + self.model.set_params(smartirs='npn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 1.8073549220576042), - (1, 1.8073549220576042), - (2, 1.8073549220576042)], - [(3, 1.8073549220576042), + expected_docs = [[(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), - (8, 1.8073549220576042)]] + (8, 1.8073549220576042)], + [(5, 2.0), (9, 1.0), (10, 1.0)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `normalize` + # smartirs=`nnc` + self.model.set_params(smartirs='nnc') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.46563179782533826), - (4, 0.46563179782533826), - (5, 0.25763163180767745), - (6, 0.46563179782533826), - (7, 0.25763163180767745), - (8, 0.46563179782533826)]] + expected_docs = [[(3, 0.34299717028501764), + (4, 0.34299717028501764), + (5, 0.51449575542752646), + (6, 0.34299717028501764), + (7, 0.51449575542752646), + (8, 0.34299717028501764)], + [(5, 0.81649658092772603), + (9, 0.40824829046386302), + (10, 0.40824829046386302)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpc') + + # Check if wlocal and wglobal are overridden if smartirs is not None + self.model.set_params(wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - #pprint.pprint(transformed_docs) - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.46563179782533826), - (4, 0.46563179782533826), - (5, 0.25763163180767745), - (6, 0.46563179782533826), - (7, 0.25763163180767745), - (8, 0.46563179782533826)]] + + self.model.set_params(wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc') + self.model.fit(self.corpus) + expected_docs = self.model.transform(docs) + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + class TestHdpTransformer(unittest.TestCase): def setUp(self): From 0bacc081ce60b6e83652c1430a8d479585513319 Mon Sep 17 00:00:00
2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 16:29:37 +0530 Subject: [PATCH 16/25] test without sklearn api --- gensim/test/test_data/tfidf_model.tst | Bin 0 -> 458 bytes gensim/test/test_data/tfidf_model.tst.bz2 | Bin 0 -> 338 bytes gensim/test/test_sklearn_api.py | 170 +--------------------- gensim/test/test_tfidfmodel.py | 163 ++++++++++++++++++++- 4 files changed, 165 insertions(+), 168 deletions(-) create mode 100644 gensim/test/test_data/tfidf_model.tst create mode 100644 gensim/test/test_data/tfidf_model.tst.bz2 diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst new file mode 100644 index 0000000000000000000000000000000000000000..e9e5f3f3cff5372e7e5ce18a89efe58321c84e4f GIT binary patch literal 458 [base85 binary data for tfidf_model.tst and tfidf_model.tst.bz2 omitted; the remaining hunks of this patch and the header of the next patch are truncated here] Date: Tue, 26 Dec 2017 01:34:34 +0530 Subject: [PATCH 17/25] hanging indents and new tests --- gensim/models/tfidfmodel.py | 4 +- gensim/test/test_tfidfmodel.py | 101 +++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3d32b334fb..e52afe25a4 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -67,7 +67,7 @@ def updated_wlocal(tf, n_tf): return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) -def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) +def updated_wglobal(docfreq, totaldocs, n_df): if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -76,7 +76,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confu return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) -def updated_normalize(x, n_n): # TODO rename it (to avoid confusion) +def updated_normalize(x, n_n): if n_n == "n": return x elif n_n == "c": diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 32e9ee4d7e..7bc5d63cd5 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -66,35 +66,55 @@ def testInit(self): self.assertEqual(model1.idfs, model2.idfs) def testPersistence(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - # Test persistence between old and new model. + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + + # Test persistence between Gensim v3.2.0 and current model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) self.assertTrue(model3.idfs == model4.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst.gz') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst.gz') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - # Test persistence between old and new compressed model. + # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) self.assertTrue(model3.idfs == model4.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector def TestConsistency(self): @@ -114,9 +134,15 @@ def TestConsistency(self): # nnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 6), (9, 3), (10, 3)] - ] + expected_docs = [[(3, 2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 6), + (9, 3), + (10, 3)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -124,9 +150,15 @@ def TestConsistency(self): # lnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 6.0), (9, 3.0), (10, 3.0)] - ] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 6.0), + (9, 3.0), + (10, 3.0)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -134,10 +166,15 @@ def TestConsistency(self): # ann model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 3.0), (9, 2.25), (10, 2.25)] - ] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 3.0), + (9, 2.25), + (10, 2.25)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -145,10 +182,15 @@ def TestConsistency(self): # bnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 3), (9, 3), (10, 3)] - ] + expected_docs = [[(3, 
2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 3), + (9, 3), + (10, 3)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -162,8 +204,9 @@ def TestConsistency(self): (6, 1.4635792826230198), (7, 2.19536892393453), (8, 1.4635792826230198)], - [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)] - ] + [(5, 3.627141918134611), + (9, 1.8135709590673055), + (10, 1.8135709590673055)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -178,8 +221,9 @@ def TestConsistency(self): (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)], - [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)] - ] + [(5, 3.1699250014423126), + (9, 1.5849625007211563), + (10, 1.5849625007211563)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -193,8 +237,9 @@ def TestConsistency(self): (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)], - [(5, 2.0), (9, 1.0), (10, 1.0)] - ] + [(5, 2.0), + (9, 1.0), + (10, 1.0)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -211,8 +256,7 @@ def TestConsistency(self): (8, 0.34299717028501764)], [(5, 0.81649658092772603), (9, 0.40824829046386302), - (10, 0.40824829046386302)] - ] + (10, 0.40824829046386302)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -226,7 +270,6 @@ def TestConsistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) -# endclass TestTfidfModel if __name__ == '__main__': From e5140f840ea8015eee8942ac217313519c551587 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 26 Dec 2017 11:35:11 +0530 Subject: [PATCH 18/25] add docstring --- gensim/models/tfidfmodel.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e52afe25a4..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -19,6 +19,34 @@ def resolve_weights(smartirs): """ Checks for validity of smartirs parameter. + + Parameters + ---------- + smartirs : {'None' ,'str'} + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + + Raises + ------ + ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + + Returns + ------- + w_tf, w_df, w_n : str, str, str + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`. 
+ """ if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) From 4afbaddef33e40a74b6370cd820215ae3ce0774d Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 26 Dec 2017 11:35:11 +0530 Subject: [PATCH 19/25] add docstring --- gensim/models/tfidfmodel.py | 28 ++++++++++++++++++++++++++++ gensim/test/test_tfidfmodel.py | 24 ++++++++++++++++++------ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e52afe25a4..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -19,6 +19,34 @@ def resolve_weights(smartirs): """ Checks for validity of smartirs parameter. + + Parameters + ---------- + smartirs : {'None' ,'str'} + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + + Raises + ------ + ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + + Returns + ------- + w_tf, w_df, w_n : str, str, str + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`. + """ if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 7bc5d63cd5..3669e08bfb 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -73,7 +73,9 @@ def testPersistence(self): model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst') @@ -82,14 +84,18 @@ def testPersistence(self): model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current model. 
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) self.assertTrue(model3.idfs == model4.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector def testPersistenceCompressed(self): # Test persistence without using `smartirs` @@ -99,7 +105,9 @@ def testPersistenceCompressed(self): model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst.gz') @@ -108,14 +116,18 @@ def testPersistenceCompressed(self): model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) self.assertTrue(model3.idfs == model4.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector def TestConsistency(self): docs = [corpus[1], corpus[2]] From 52ee3c45691b442dd71278d522faec6632586b5c Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Wed, 27 Dec 2017 11:32:38 +0530 Subject: [PATCH 20/25] better way of comparing floats --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 45dc2fb7f8..c7461899e5 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -272,7 +272,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) > eps ] if self.normalize is True: From 48e84f7c899b2245fee5843ca20b322a726db22b Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Mon, 8 Jan 2018 12:34:41 +0530 Subject: [PATCH 21/25] old way of comparing floats --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index c7461899e5..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -272,7
+272,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) > eps + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] if self.normalize is True: From d0878a41771250678a8170aba93292b335e74fd9 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 10 Jan 2018 23:32:01 +0500 Subject: [PATCH 22/25] doc fix[1] --- gensim/models/tfidfmodel.py | 137 ++++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 29 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 45dc2fb7f8..41786fcf50 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -17,35 +17,37 @@ def resolve_weights(smartirs): - """ - Checks for validity of smartirs parameter. + """Checks for validity of `smartirs` parameter. Parameters ---------- - smartirs : {'None' ,'str'} - `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) - Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting - variants in the vector space model. The mnemonic for representing a combination - of weights takes the form ddd, where the letters represents the term weighting - of the document vector. - + smartirs : str + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represent the term weighting of the document vector. + For more information visit [1]_. - for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + Returns + ------- + w_tf : str + Term frequency weighing: natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + w_df : str + Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. + w_n : str + Document normalization: none - `n`, cosine - `c`. Raises ------ - ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value - doesn't fit the list of permissible values + ValueError + If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + - Returns - ------- - w_tf, w_df, w_n : str, str, str - Term frequency weighing: - natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. - Document frequency weighting: - none - `n`, idf - `t`, prob idf - `p`. - Document normalization: - none - `n`, cosine - `c`. + + References ---------- ..
[1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System """ if not isinstance(smartirs, str) or len(smartirs) != 3: @@ -54,28 +56,58 @@ def resolve_weights(smartirs): w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError("Expected term frequency weight to be one of 'nlabL', except got " + w_tf) + raise ValueError("Expected term frequency weight to be one of 'nlabL', but got {}".format(w_tf)) if w_df not in 'ntp': - raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got " + w_df) + raise ValueError("Expected inverse document frequency weight to be one of 'ntp', but got {}".format(w_df)) if w_n not in 'ncb': - raise ValueError("Expected normalization weight to be one of 'ncb', except got " + w_n) + raise ValueError("Expected normalization weight to be one of 'ncb', but got {}".format(w_n)) return w_tf, w_df, w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): - """ - Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: - idf = add + log(totaldocs / doc_freq) + """Compute default inverse-document-frequency for a term with document frequency: + :math:`idf = add + log_{log\_base} \\frac{totaldocs}{doc\_freq}` + + Parameters + ---------- + docfreq : float + Document frequency. + totaldocs : int + Total number of documents. + log_base : float, optional + Base of logarithm. + add : float, optional + Offset. + + Returns + ------- + float + Inverse document frequency. + """ return add + np.log(float(totaldocs) / docfreq) / np.log(log_base) def precompute_idfs(wglobal, dfs, total_docs): - """ - Precompute the inverse document frequency mapping for all terms. + """Pre-compute the inverse document frequency mapping for all terms. + + Parameters + ---------- + wglobal : function + Custom function for calculating idf, see for example :func:`~gensim.models.tfidfmodel.updated_wglobal`. + dfs : dict + Dictionary mapping term_id to the number of documents the token appeared in. + total_docs : int + Total number of documents. + + Returns + ------- + dict + Precomputed idfs in format {term_id_1: idfs_1, term_id_2: idfs_2, ...} + """ # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. @@ -83,6 +115,21 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): + """Apply needed function based on `n_tf`. + + Parameters + ---------- + tf : int + Term frequency. + n_tf : str + Parameter, that choice concrete function. + + Returns + ------- + float + Calculated wlocal. + + """ if n_tf == "n": return tf elif n_tf == "l": @@ -96,6 +143,23 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): + """Apply needed function based on `n_df`. + + Parameters + ---------- + docfreq : int + Document frequency. + totaldocs : int + Total number of documents. + n_df : str + Parameter, that choice concrete function. + + Returns + ------- + float + Calculated wglobal. + + """ if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -105,6 +169,21 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): + """Apply needed normalization based on `n_n` + + Parameters + ---------- + x : numpy.ndarray + Input array + n_n : str + Parameter, that choice concrete function. + + Returns + ------- + numpy.ndarray + Normalized array.
+ + """ if n_n == "n": return x elif n_n == "c": From b544c9cf6d193dea12494090af22096ea8a40fad Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 11 Jan 2018 11:05:12 +0500 Subject: [PATCH 23/25] doc fix[2] --- docs/src/models/tfidfmodel.rst | 3 +- gensim/models/tfidfmodel.py | 171 +++++++++++++++++++-------------- 2 files changed, 100 insertions(+), 74 deletions(-) diff --git a/docs/src/models/tfidfmodel.rst b/docs/src/models/tfidfmodel.rst index 6b622d7589..55907470d3 100644 --- a/docs/src/models/tfidfmodel.rst +++ b/docs/src/models/tfidfmodel.rst @@ -1,5 +1,5 @@ :mod:`models.tfidfmodel` -- TF-IDF model -====================================================== +======================================== .. automodule:: gensim.models.tfidfmodel :synopsis: TF-IDF model @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 41786fcf50..9507a7aede 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -31,11 +31,21 @@ def resolve_weights(smartirs): Returns ------- w_tf : str - Term frequency weighing: natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. w_df : str - Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. w_n : str - Document normalization: none - `n`, cosine - `c`. + Document normalization: + * `n` - none, + * `c` - cosine. Raises ------ @@ -43,8 +53,6 @@ def resolve_weights(smartirs): If `smartirs` is not a string of length 3 or one of the decomposed value doesn't fit the list of permissible values - - References ---------- .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System @@ -115,13 +123,13 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): - """Apply needed function based on `n_tf`. + """Apply needed function based on `n_tf`. TODO: add better descriptions for function/parameters. Parameters ---------- tf : int Term frequency. - n_tf : str + n_tf : {'n', 'l', 'a', 'b', 'L'} Parameter, that choice concrete function. Returns @@ -143,7 +151,7 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): - """Apply needed function based on `n_df`. + """Apply needed function based on `n_df`. TODO: add better descriptions for function/parameters. Parameters ---------- @@ -151,7 +159,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): Document frequency. totaldocs : int Total number of documents. - n_df : str + n_df : {'n', 't', 'p'} Parameter, that choice concrete function. Returns @@ -169,13 +177,13 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): - """Apply needed normalization based on `n_n` + """Apply needed normalization based on `n_n`. TODO: add better descriptions for function/parameters. Parameters ---------- x : numpy.ndarray Input array - n_n : str + n_n : {'n', 'c'} Parameter, that choice concrete function. Returns @@ -191,78 +199,79 @@ def updated_normalize(x, n_n): class TfidfModel(interfaces.TransformationABC): - """ - Objects of this class realize the transformation between word-document co-occurrence - matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). 
+ """Objects of this class realize the transformation between word-document co-occurrence matrix (int) + into a locally/globally weighted TF_IDF matrix (positive floats). Examples -------- - >>> tfidf = TfidfModel(corpus) - >>> print(tfidf[some_doc]) - >>> tfidf.save('/tmp/foo.tfidf_model') - - Model persistency is achieved via its load/save methods. + >>> import gensim.downloader as api + >>> from gensim.models import TfidfModel + >>> from gensim.corpora import Dictionary + >>> + >>> dataset = api.load("text8") + >>> dct = Dictionary(dataset) # fit dictionary + >>> corpus = [dct.doc2bow(line) for line in dataset] # convert dataset to BoW format + >>> + >>> model = TfidfModel(corpus) # fit model + >>> vector = model[corpus[0]] # apply model """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, wglobal=df2idf, normalize=True, smartirs=None): - """ - Compute tf-idf by multiplying a local component (term frequency) with a - global component (inverse document frequency), and normalizing - the resulting documents to unit length. Formula for unnormalized weight - of term `i` in document `j` in a corpus of D documents:: + """Compute tf-idf by multiplying a local component (term frequency) with a global component + (inverse document frequency), and normalizing the resulting documents to unit length. + Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents - weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i}) + .. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}} - or, more generally:: + or, more generally - weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D) - - so you can plug in your own custom `wlocal` and `wglobal` functions. + .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D) + so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions. Parameters ---------- - corpus : dictionary.doc2bow - Corpus is a list of sets where each set has two elements. First being the termid and - second being the term frequency of each term in the document. - id2word : dict - id2word is an optional dictionary that maps the word_id to a token. - In case id2word isn’t specified the mapping id2word[word_id] = str(word_id) will be used. - dictionary :corpora.Dictionary - If `dictionary` is specified, it must be a `corpora.Dictionary` object - and it will be used to directly construct the inverse document frequency - mapping (then `corpus`, if specified, is ignored). - wlocals : user specified function - Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) - wglobal : user specified function - Default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the - formula above. - normalize : user specified function - It dictates how the final transformed vectors will be normalized. - `normalize=True` means set to unit length (default); `False` means don't - normalize. You can also set `normalize` to your own function that accepts - and returns a sparse vector. - smartirs : {'None' ,'str'} - `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) - Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting - variants in the vector space model. The mnemonic for representing a combination - of weights takes the form ddd, where the letters represents the term weighting - of the document vector. 
- - Term frequency weighing: - natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. - Document frequency weighting: - none - `n`, idf - `t`, prob idf - `p`. - Document normalization: - none - `n`, cosine - `c`. - - for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System - - Returns - ------- - x : gensim.models.tfidfmodel.TfidfModel + corpus : iterable of iterable of (int, int), optional + Input corpus. + id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional + Mapping from tokens to ids, used for converting input data to the bag-of-words format. + dictionary : :class:`~gensim.corpora.Dictionary` + If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used + to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + wlocal : function, optional + Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` + (other options: :func:`math.sqrt`, :func:`math.log1p`, etc.). + wglobal : function, optional + Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. + normalize : bool, optional + It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length + (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts + and returns a sparse vector. + smartirs : str, optional + SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, + a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. + The mnemonic for representing a combination of weights takes the form XYZ, + for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector. + + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. + + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. + + Document normalization: + * `n` - none, + * `c` - cosine. + + For more information visit [1]_. """ @@ -303,9 +312,13 @@ def __str__(self): return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) def initialize(self, corpus): - """ - Compute inverse document weights, which will be used to modify term - frequencies for documents. + """Compute inverse document weights, which will be used to modify term frequencies for documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Input corpus. + """ logger.info("collecting document frequencies") dfs = {} @@ -331,8 +344,20 @@ def initialize(self, corpus): self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): - """ - Return tf-idf representation of the input vector and/or corpus. + """Get tf-idf representation of the input vector and/or corpus. + + Parameters + ---------- + bow : {list of (int, int), iterable of iterable of (int, int)} + Input document or corpus in BoW format. + eps : float + Threshold value; all positions with a tf-idf value less than `eps` will be removed. + + Returns + ------- + vector : list of (int, float) + TfIdf vector, if `bow` is a document **OR** + :class:`~gensim.interfaces.TransformedCorpus` + TfIdf corpus, if `bow` is a corpus.
+ """ # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) From c4e3656d5ba4a568ab811d9404de6c241ad245b4 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Thu, 11 Jan 2018 12:18:31 +0530 Subject: [PATCH 24/25] fix description TODOs --- gensim/models/tfidfmodel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 9507a7aede..ab454c67df 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2012 Radim Rehurek +# Copyright (C) 2017 Mohit Rathore # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -123,14 +124,14 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): - """Apply needed function based on `n_tf`. TODO: add better descriptions for function/parameters. + """A scheme to transform `tf` or term frequency based on the value of `n_tf`. Parameters ---------- tf : int Term frequency. n_tf : {'n', 'l', 'a', 'b', 'L'} - Parameter, that choice concrete function. + Parameter to decide the current transformation scheme. Returns ------- @@ -151,7 +152,7 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): - """Apply needed function based on `n_df`. TODO: add better descriptions for function/parameters. + """A scheme to transform `docfreq` or document frequency based on the value of `n_df`. Parameters ---------- @@ -160,7 +161,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): totaldocs : int Total number of documents. n_df : {'n', 't', 'p'} - Parameter, that choice concrete function. + Parameter to decide the current transformation scheme. Returns ------- @@ -177,14 +178,14 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): - """Apply needed normalization based on `n_n`. TODO: add better descriptions for function/parameters. + """Normalizes the final tf-idf value according to the value of `n_n`. Parameters ---------- x : numpy.ndarray Input array n_n : {'n', 'c'} - Parameter, that choice concrete function. + Parameter that decides the normalizing function to be used. Returns ------- From 98ffde5909e59448ecfdba8f28d8cf973a2298fc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Mon, 15 Jan 2018 19:41:09 +0530 Subject: [PATCH 25/25] fix irksome comparision --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index ab454c67df..a61e993333 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -377,7 +377,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 + for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > eps ] if self.normalize is True: