From 5e1830bd1d3946504c773ff825b2c262e1fe89cb Mon Sep 17 00:00:00 2001 From: markroxor Date: Thu, 20 Oct 2016 23:17:18 +0530 Subject: [PATCH 01/25] fixing appveyor --- gensim/test/{test_basemodel.py => basetests.py} | 0 gensim/test/test_hdpmodel.py | 4 ++-- gensim/test/test_ldamodel.py | 4 ++-- gensim/test/test_lsimodel.py | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) rename gensim/test/{test_basemodel.py => basetests.py} (100%) diff --git a/gensim/test/test_basemodel.py b/gensim/test/basetests.py similarity index 100% rename from gensim/test/test_basemodel.py rename to gensim/test/basetests.py diff --git a/gensim/test/test_hdpmodel.py b/gensim/test/test_hdpmodel.py index 8c0495cb9a..2fb4fb8a80 100644 --- a/gensim/test/test_hdpmodel.py +++ b/gensim/test/test_hdpmodel.py @@ -22,7 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import hdpmodel from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -48,7 +48,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestHdpModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = hdpmodel.HdpModel diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index cc2cececc9..a96d96ae6f 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -23,7 +23,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import ldamodel, ldamulticore from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -55,7 +55,7 @@ def testRandomState(): assert(isinstance(ldamodel.get_random_state(testcase), numpy.random.RandomState)) -class TestLdaModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestLdaModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.class_ = ldamodel.LdaModel diff --git a/gensim/test/test_lsimodel.py b/gensim/test/test_lsimodel.py index ab86c18d4f..cb2052773c 100644 --- a/gensim/test/test_lsimodel.py +++ b/gensim/test/test_lsimodel.py @@ -22,7 +22,7 @@ from gensim.corpora import mmcorpus, Dictionary from gensim.models import lsimodel from gensim import matutils -from gensim.test import test_basemodel +from gensim.test import basetests module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder @@ -51,7 +51,7 @@ def testfile(): return os.path.join(tempfile.gettempdir(), 'gensim_models.tst') -class TestLsiModel(unittest.TestCase, test_basemodel.TestBaseTopicModel): +class TestLsiModel(unittest.TestCase, basetests.TestBaseTopicModel): def setUp(self): self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm')) self.model = lsimodel.LsiModel(self.corpus, num_topics=2) From e8a3f1671e2416845353822513366909fddddc45 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 12:14:05 +0530 Subject: [PATCH 02/25] verify weights --- gensim/models/tfidfmodel.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 
50320ad747..75716f86bc 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -14,6 +14,23 @@ logger = logging.getLogger(__name__) +def resolve_weights(smartirs): + if not isinstance(smartirs, str) or len(smartirs)!=3: + raise ValueError('Expected a string of length 3 except got ' + smartirs): + + w_tf, w_df, w_n = smartirs + + if w_tf not in 'nlabL': + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + n_tf) + + if w_idf not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + n_idf) + + if w_n not in 'ncb': + raise ValueError('Expected normalization weight to be one of ncb, except got ' + n_n) + + return w_tf, w_idf, w_n + def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ @@ -50,7 +67,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + wlocal=utils.identity, wglobal=df2idf, normalize=True, smartirs="nnc"): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,6 +99,8 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, self.id2word = id2word self.wlocal, self.wglobal = wlocal, wglobal self.num_docs, self.num_nnz, self.idfs = None, None, None + self.smartirs = smartirs + if dictionary is not None: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the From 648bf21573b59e66b86e2c16b2ed4e0b0bfd0c55 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 12:14:05 +0530 Subject: [PATCH 03/25] verify weights --- gensim/models/tfidfmodel.py | 40 ++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 50320ad747..e4844f377e 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -15,6 +15,24 @@ logger = logging.getLogger(__name__) +def resolve_weights(smartirs): + if not isinstance(smartirs, str) or len(smartirs) != 3: + raise ValueError('Expected a string of length 3 except got ' + smartirs) + + w_tf, w_df, w_n = smartirs + + if w_tf not in 'nlabL': + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) + + if w_df not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) + + if w_n not in 'ncb': + raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) + + return w_tf, w_df, w_n + + def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): """ Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: @@ -49,8 +67,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. 
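    Loading restores the same mapping; a minimal round-trip sketch (the printed
    weights depend entirely on the training corpus, so no output is shown):

    >>> tfidf2 = TfidfModel.load('/tmp/foo.tfidf_model')
    >>> print(tfidf2[some_doc])  # same weights as tfidf[some_doc]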
""" - def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): + def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", + wlocal=None, wglobal=None, normalize=True): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,6 +100,16 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, self.id2word = id2word self.wlocal, self.wglobal = wlocal, wglobal self.num_docs, self.num_nnz, self.idfs = None, None, None + n_tf, n_df, n_n = smartirs + + if n_tf == "n": + pass + elif n_tf == "": + pass + + self.wlocal = utils.identity + self.wglobal = df2idf + if dictionary is not None: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the @@ -92,6 +120,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, ) self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary @@ -113,6 +142,7 @@ def initialize(self, corpus): logger.info("collecting document frequencies") dfs = {} numnnz, docno = 0, -1 + for docno, bow in enumerate(corpus): if docno % 10000 == 0: logger.info("PROGRESS: processing document #%i", docno) @@ -131,7 +161,7 @@ def initialize(self, corpus): "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", self.num_docs, n_features, self.num_nnz ) - self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) + #self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -145,8 +175,8 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) vector = [ - (termid, self.wlocal(tf) * self.idfs.get(termid)) - for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 + (termid, self.wlocal(tf) * self.wglobal(self.dfs[termid], self.num_docs)) + for termid, tf in bow if self.wglobal(self.dfs[termid], self.num_docs) != 0.0 ] # and finally, normalize the vector either to unit length, or use a From a6f1afbe0fb218f07aafbf5a7648792a0ae65f81 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 15:11:29 +0530 Subject: [PATCH 04/25] smartirs ready --- gensim/models/tfidfmodel.py | 69 +++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 34 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e4844f377e..e396618c1f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -11,6 +11,7 @@ from gensim import interfaces, matutils, utils from six import iteritems +import numpy as np logger = logging.getLogger(__name__) @@ -33,22 +34,6 @@ def resolve_weights(smartirs): return w_tf, w_df, w_n -def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): - """ - Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: - - idf = add + log(totaldocs / doc_freq) - """ - return add + math.log(1.0 * totaldocs / docfreq, log_base) - - -def precompute_idfs(wglobal, dfs, total_docs): - """Precompute the inverse document frequency mapping for all terms.""" - # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. 
- # this method is here just to speed things up a little. - return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} - - class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -67,8 +52,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", - wlocal=None, wglobal=None, normalize=True): + def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", + wlocal=None, wglobal=None, wnormalize=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -96,19 +81,38 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", and it will be used to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). """ - self.normalize = normalize self.id2word = id2word - self.wlocal, self.wglobal = wlocal, wglobal + self.wlocal, self.wglobal, self.wnormalize = wlocal, wglobal, wnormalize self.num_docs, self.num_nnz, self.idfs = None, None, None n_tf, n_df, n_n = smartirs - if n_tf == "n": - pass - elif n_tf == "": - pass - - self.wlocal = utils.identity - self.wglobal = df2idf + if self.wlocal is None: + if n_tf == "n": + self.wlocal = lambda tf, mean=None, _max=None: tf + elif n_tf == "l": + self.wlocal = lambda tf, mean=None, _max=None: 1 + math.log(tf) + elif n_tf == "a": + self.wlocal = lambda tf, mean=None, _max=None: 0.5 + (0.5 * tf / _max) + elif n_tf == "b": + self.wlocal = lambda tf, mean=None, _max=None: 1 if tf > 0 else 0 + elif n_tf == "L": + self.wlocal = lambda tf, mean=None, _max=None: (1 + math.log(tf)) / (1 + math.log(mean)) + + if self.wglobal is None: + if n_df == "n": + self.wglobal = utils.identity + elif n_df == "t": + self.wglobal = lambda docfreq, totaldocs: math.log(1.0 * totaldocs / docfreq, 10) + elif n_tf == "p": + self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) + + if self.wnormalize is None: + if n_n == "n": + self.wnormalize = lambda x: x + elif n_n == "c": + self.wnormalize = matutils.unitvec + elif n_n == "t": + self.wnormalize = matutils.unitvec if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -121,7 +125,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="nnn", self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() - self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary elif corpus is not None: @@ -161,7 +164,6 @@ def initialize(self, corpus): "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", self.num_docs, n_features, self.num_nnz ) - #self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -174,17 +176,16 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) + vector = [ - (termid, self.wlocal(tf) * self.wglobal(self.dfs[termid], self.num_docs)) + (termid, self.wlocal(tf, mean=np.mean(np.array(bow), axis=1), _max=np.max(bow, axis=1)) * self.wglobal(self.dfs[termid], self.num_docs)) for termid, tf in bow if self.wglobal(self.dfs[termid], 
self.num_docs) != 0.0 ] # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - if self.normalize is True: - vector = matutils.unitvec(vector) - elif self.normalize: - vector = self.normalize(vector) + + vector = self.wnormalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] From d091138ee30798483919b5977db707ad36d4eb9c Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 16:10:47 +0530 Subject: [PATCH 05/25] change old tests --- gensim/models/tfidfmodel.py | 26 ++++++++++++-------------- gensim/sklearn_api/tfidf.py | 7 ++++--- gensim/test/test_sklearn_api.py | 7 +++---- 3 files changed, 19 insertions(+), 21 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e396618c1f..e408dd9118 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -53,7 +53,7 @@ class TfidfModel(interfaces.TransformationABC): """ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", - wlocal=None, wglobal=None, wnormalize=None): + wlocal=None, wglobal=None, normalize=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -82,9 +82,10 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", mapping (then `corpus`, if specified, is ignored). """ self.id2word = id2word - self.wlocal, self.wglobal, self.wnormalize = wlocal, wglobal, wnormalize + self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None n_tf, n_df, n_n = smartirs + self.smartirs = smartirs if self.wlocal is None: if n_tf == "n": @@ -106,13 +107,14 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", elif n_tf == "p": self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) - if self.wnormalize is None: - if n_n == "n": - self.wnormalize = lambda x: x - elif n_n == "c": - self.wnormalize = matutils.unitvec - elif n_n == "t": - self.wnormalize = matutils.unitvec + if self.normalize is None or isinstance(self.normalize, bool): + if n_n == "n" or self.normalize is False: + self.normalize = lambda x: x + elif n_n == "c" or self.normalize is True: + self.normalize = matutils.unitvec + # TODO write byte-size normalisation + # elif n_n == "b": + # self.normalize = matutils.unitvec if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -160,10 +162,6 @@ def initialize(self, corpus): # and finally compute the idf weights n_features = max(dfs) if dfs else 0 - logger.info( - "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", - self.num_docs, n_features, self.num_nnz - ) def __getitem__(self, bow, eps=1e-12): """ @@ -185,7 +183,7 @@ def __getitem__(self, bow, eps=1e-12): # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.wnormalize(vector) + vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index c0a45f1823..28bd908329 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -21,14 +21,15 @@ class 
TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + def __init__(self, id2word=None, dictionary=None, smartirs="ntc", wlocal=None, + wglobal=None, normalize=True): """ Sklearn wrapper for Tf-Idf model. """ self.gensim_model = None self.id2word = id2word self.dictionary = dictionary + self.smartirs = smartirs self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize @@ -38,7 +39,7 @@ def fit(self, X, y=None): Fit the model according to the given training data. """ self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, + corpus=X, id2word=self.id2word, dictionary=self.dictionary, smartirs=self.smartirs, wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize ) return self diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 3793c79948..947804c59d 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -498,7 +498,6 @@ def testPersistence(self): original_matrix = self.model.transform(original_bow) passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) self.assertTrue(passed) - def testModelNotFitted(self): lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) texts_new = ['graph', 'eulerian'] @@ -973,13 +972,13 @@ def testTransform(self): def testSetGetParams(self): # updating only one param - self.model.set_params(normalize=False) + self.model.set_params(smartirs='nnn') model_params = self.model.get_params() - self.assertEqual(model_params["normalize"], False) + self.assertEqual(model_params["smartirs"], 'nnn') # verify that the attributes values are also changed for `gensim_model` after fitting self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: From 951c549dff26c49f105c70686c4063c3c267c59f Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 15 Dec 2017 18:31:18 +0530 Subject: [PATCH 06/25] remove lambdas --- gensim/models/tfidfmodel.py | 58 ++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 26 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e408dd9118..a418f1334f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -87,34 +87,40 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", n_tf, n_df, n_n = smartirs self.smartirs = smartirs - if self.wlocal is None: - if n_tf == "n": - self.wlocal = lambda tf, mean=None, _max=None: tf - elif n_tf == "l": - self.wlocal = lambda tf, mean=None, _max=None: 1 + math.log(tf) - elif n_tf == "a": - self.wlocal = lambda tf, mean=None, _max=None: 0.5 + (0.5 * tf / _max) - elif n_tf == "b": - self.wlocal = lambda tf, mean=None, _max=None: 1 if tf > 0 else 0 - elif n_tf == "L": - self.wlocal = lambda tf, mean=None, _max=None: (1 + math.log(tf)) / (1 + math.log(mean)) - - if self.wglobal is None: - if n_df == "n": - self.wglobal = utils.identity - elif n_df == "t": - self.wglobal = lambda docfreq, totaldocs: math.log(1.0 * totaldocs / docfreq, 10) - elif n_tf == "p": - self.wglobal = lambda docfreq, totaldocs: math.log((float(totaldocs) - docfreq) / docfreq) + if wlocal is None: + def wlocal(tf, mean=None, _max=None): + if n_tf == "n": + return 
tf + elif n_tf == "l": + return 1 + math.log(tf) + elif n_tf == "a": + return 0.5 + (0.5 * tf / _max) + elif n_tf == "b": + return 1 if tf > 0 else 0 + elif n_tf == "L": + return (1 + math.log(tf)) / (1 + math.log(mean)) + self.wlocal = wlocal + + if wglobal is None: + def wglobal(docfreq, totaldocs): + if n_df == "n": + return utils.identity(docfreq) + elif n_df == "t": + return math.log(1.0 * totaldocs / docfreq, 10) + elif n_tf == "p": + return math.log((float(totaldocs) - docfreq) / docfreq) + self.wglobal = wglobal if self.normalize is None or isinstance(self.normalize, bool): - if n_n == "n" or self.normalize is False: - self.normalize = lambda x: x - elif n_n == "c" or self.normalize is True: - self.normalize = matutils.unitvec - # TODO write byte-size normalisation - # elif n_n == "b": - # self.normalize = matutils.unitvec + def normalize(x): + if n_n == "n" or self.normalize is False: + return x + elif n_n == "c" or self.normalize is True: + return matutils.unitvec(x) + # TODO write byte-size normalisation + # elif n_n == "b": + # pass + self.normalize = normalize if dictionary is not None: # user supplied a Dictionary object, which already contains all the From 40c0558e67e3679b2797e419b75fe3852e225905 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Sun, 17 Dec 2017 02:14:40 +0530 Subject: [PATCH 07/25] address suggestions --- gensim/models/tfidfmodel.py | 95 ++++++++++++++++++++++++--------- gensim/sklearn_api/tfidf.py | 12 ++--- gensim/test/test_sklearn_api.py | 1 + 3 files changed, 78 insertions(+), 30 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a418f1334f..3c381ca715 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,11 +6,11 @@ import logging -import math from gensim import interfaces, matutils, utils from six import iteritems +import math import numpy as np logger = logging.getLogger(__name__) @@ -23,17 +23,32 @@ def resolve_weights(smartirs): w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) + raise ValueError('Expected term frequency weight to be one of \'nlabL\', except got ' + w_tf + '\'') if w_df not in 'ntp': - raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) + raise ValueError('Expected inverse document frequency weight to be one of \'ntp\', except got ' + w_df + '\'') if w_n not in 'ncb': - raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) + raise ValueError('Expected normalization weight to be one of \'ncb\', except got \'' + w_n + '\'') return w_tf, w_df, w_n +def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): + """ + Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: + idf = add + log(totaldocs / doc_freq) + """ + return add + np.log(float(totaldocs) / docfreq) / np.log(2) + + +def precompute_idfs(wglobal, dfs, total_docs): + """Precompute the inverse document frequency mapping for all terms.""" + # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. + # this method is here just to speed things up a little. 
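+    # as a worked example with the default df2idf above (log base 2, add=0.0):
+    # a term present in 10 of 1000 documents gets idf = log2(1000 / 10) ~= 6.64.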
+ return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} + + class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -52,8 +67,8 @@ class TfidfModel(interfaces.TransformationABC): Model persistency is achieved via its load/save methods. """ - def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", - wlocal=None, wglobal=None, normalize=None): + def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, + wglobal=df2idf, normalize=True, smartirs=None): """ Compute tf-idf by multiplying a local component (term frequency) with a global component (inverse document frequency), and normalizing @@ -80,42 +95,63 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, smartirs="ntc", If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`, byte size - `b`. + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System """ + self.id2word = id2word self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None - n_tf, n_df, n_n = smartirs self.smartirs = smartirs - if wlocal is None: - def wlocal(tf, mean=None, _max=None): + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + + # If smartirs is not None, override wlocal, wglobal and normalize + if smartirs is not None: + n_tf, n_df, n_n = resolve_weights(smartirs) + + def wlocal(tf): if n_tf == "n": return tf elif n_tf == "l": - return 1 + math.log(tf) + return 1 + np.log(tf) / np.log(2) elif n_tf == "a": - return 0.5 + (0.5 * tf / _max) + return 0.5 + (0.5 * tf / tf.max(axis=0)) elif n_tf == "b": - return 1 if tf > 0 else 0 + return tf.astype('bool').astype('int') elif n_tf == "L": - return (1 + math.log(tf)) / (1 + math.log(mean)) + return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) self.wlocal = wlocal - if wglobal is None: def wglobal(docfreq, totaldocs): if n_df == "n": return utils.identity(docfreq) elif n_df == "t": - return math.log(1.0 * totaldocs / docfreq, 10) - elif n_tf == "p": - return math.log((float(totaldocs) - docfreq) / docfreq) + return np.log(1.0 * totaldocs / docfreq) / np.log(2) + elif n_df == "p": + return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) self.wglobal = wglobal - if self.normalize is None or isinstance(self.normalize, bool): def normalize(x): - if n_n == "n" or self.normalize is False: + if n_n == "n": return x - elif n_n == "c" or self.normalize is True: + elif n_n == "c": return matutils.unitvec(x) # TODO write byte-size normalisation # elif n_n == "b": @@ -132,7 +168,7 @@ def normalize(x): ) self.num_docs, self.num_nnz = 
dictionary.num_docs, dictionary.num_nnz self.dfs = dictionary.dfs.copy() - + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if id2word is None: self.id2word = dictionary elif corpus is not None: @@ -165,9 +201,13 @@ def initialize(self, corpus): self.num_docs = docno + 1 self.num_nnz = numnnz self.dfs = dfs - # and finally compute the idf weights n_features = max(dfs) if dfs else 0 + logger.info( + "calculating IDF weights for %i documents and %i features (%i matrix non-zeros)", + self.num_docs, n_features, self.num_nnz + ) + self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): """ @@ -181,9 +221,16 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) + termid_array, tf_array = [], [] + for termid, tf in bow: + termid_array.append(termid) + tf_array.append(tf) + + tf_array = self.wlocal(np.array(tf_array)) + vector = [ - (termid, self.wlocal(tf, mean=np.mean(np.array(bow), axis=1), _max=np.max(bow, axis=1)) * self.wglobal(self.dfs[termid], self.num_docs)) - for termid, tf in bow if self.wglobal(self.dfs[termid], self.num_docs) != 0.0 + (termid, tf * self.idfs.get(termid)) + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] # and finally, normalize the vector either to unit length, or use a diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index 28bd908329..dc4ab86c01 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -12,8 +12,8 @@ from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError -import gensim from gensim.models import TfidfModel +import gensim class TfIdfTransformer(TransformerMixin, BaseEstimator): @@ -21,26 +21,26 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): Base Tf-Idf module """ - def __init__(self, id2word=None, dictionary=None, smartirs="ntc", wlocal=None, - wglobal=None, normalize=True): + def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"): """ Sklearn wrapper for Tf-Idf model. """ self.gensim_model = None self.id2word = id2word self.dictionary = dictionary - self.smartirs = smartirs self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize + self.smartirs = smartirs def fit(self, X, y=None): """ Fit the model according to the given training data. 
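        A minimal usage sketch (`common_corpus` here stands for any iterable of
        bag-of-words documents and is illustrative only):

        >>> transformer = TfIdfTransformer(smartirs='ntc')
        >>> weighted = transformer.fit(common_corpus).transform(common_corpus)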
""" self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, smartirs=self.smartirs, - wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize + corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal, + wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs, ) return self diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 947804c59d..5e0511aa5c 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -498,6 +498,7 @@ def testPersistence(self): original_matrix = self.model.transform(original_bow) passed = numpy.allclose(loaded_matrix, original_matrix, atol=1e-1) self.assertTrue(passed) + def testModelNotFitted(self): lsi_wrapper = LsiTransformer(id2word=dictionary, num_topics=2) texts_new = ['graph', 'eulerian'] From b35344c43b0009cbaae5b46936800bf58ae578bb Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 19 Dec 2017 16:00:43 +0530 Subject: [PATCH 08/25] minor fix --- gensim/models/tfidfmodel.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 75716f86bc..b308aaef04 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -14,22 +14,23 @@ logger = logging.getLogger(__name__) + def resolve_weights(smartirs): - if not isinstance(smartirs, str) or len(smartirs)!=3: - raise ValueError('Expected a string of length 3 except got ' + smartirs): + if not isinstance(smartirs, str) or len(smartirs) != 3: + raise ValueError('Expected a string of length 3 except got ' + smartirs) w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + n_tf) + raise ValueError('Expected term frequency weight to be one of nlabL, except got ' + w_tf) - if w_idf not in 'ntp': - raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + n_idf) + if w_df not in 'ntp': + raise ValueError('Expected inverse document frequency weight to be one of ntp, except got ' + w_df) if w_n not in 'ncb': - raise ValueError('Expected normalization weight to be one of ncb, except got ' + n_n) + raise ValueError('Expected normalization weight to be one of ncb, except got ' + w_n) - return w_tf, w_idf, w_n + return w_tf, w_df, w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): From 0917e75c03bec7273cc264a405c28b6a48f7b59e Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 19 Dec 2017 16:24:43 +0530 Subject: [PATCH 09/25] pep8 fix --- gensim/models/tfidfmodel.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3c381ca715..4546672ddd 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -10,7 +10,6 @@ from gensim import interfaces, matutils, utils from six import iteritems -import math import numpy as np logger = logging.getLogger(__name__) From d3d431c5a4274ec6c0143c9f89742389a6935244 Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 21 Dec 2017 11:48:27 +0500 Subject: [PATCH 10/25] fix pickle problem --- gensim/models/tfidfmodel.py | 67 ++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index fa5a7aa8b1..7fb25430ee 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -6,6 +6,7 @@ import logging +from functools import partial from 
gensim import interfaces, matutils, utils from six import iteritems @@ -53,6 +54,38 @@ def precompute_idfs(wglobal, dfs, total_docs): return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} +def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) + if n_tf == "n": + return tf + elif n_tf == "l": + return 1 + np.log(tf) / np.log(2) + elif n_tf == "a": + return 0.5 + (0.5 * tf / tf.max(axis=0)) + elif n_tf == "b": + return tf.astype('bool').astype('int') + elif n_tf == "L": + return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + + +def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) + if n_df == "n": + return utils.identity(docfreq) + elif n_df == "t": + return np.log(1.0 * totaldocs / docfreq) / np.log(2) + elif n_df == "p": + return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) + + +def normalize_g(x, n_n): # TODO rename it (to avoid confusion) + if n_n == "n": + return x + elif n_n == "c": + return matutils.unitvec(x) + # TODO write byte-size normalisation + # elif n_n == "b": + # pass + + class TfidfModel(interfaces.TransformationABC): """ Objects of this class realize the transformation between word-document co-occurrence @@ -148,37 +181,9 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - def wlocal(tf): - if n_tf == "n": - return tf - elif n_tf == "l": - return 1 + np.log(tf) / np.log(2) - elif n_tf == "a": - return 0.5 + (0.5 * tf / tf.max(axis=0)) - elif n_tf == "b": - return tf.astype('bool').astype('int') - elif n_tf == "L": - return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) - self.wlocal = wlocal - - def wglobal(docfreq, totaldocs): - if n_df == "n": - return utils.identity(docfreq) - elif n_df == "t": - return np.log(1.0 * totaldocs / docfreq) / np.log(2) - elif n_df == "p": - return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) - self.wglobal = wglobal - - def normalize(x): - if n_n == "n": - return x - elif n_n == "c": - return matutils.unitvec(x) - # TODO write byte-size normalisation - # elif n_n == "b": - # pass - self.normalize = normalize + self.wlocal = partial(wlocal_g, n_tf=n_tf) + self.wglobal = partial(wglobal_g, n_df=n_df) + self.normalize = partial(normalize_g, n_n=n_n) if dictionary is not None: # user supplied a Dictionary object, which already contains all the From 0e6f21e24149ee33c46323f0db9ac818f5cc53bc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Thu, 21 Dec 2017 12:42:38 +0530 Subject: [PATCH 11/25] flake8 fix --- gensim/models/tfidfmodel.py | 1 + 1 file changed, 1 insertion(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 7fb25430ee..8b3d2697ed 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -104,6 +104,7 @@ class TfidfModel(interfaces.TransformationABC): >>> tfidf.save('/tmp/foo.tfidf_model') Model persistency is achieved via its load/save methods. 
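    Because the smartirs weighting callables are built from the module-level
    helpers wlocal_g, wglobal_g and normalize_g via functools.partial (rather
    than nested functions), a configured model stays picklable on Python 3;
    a small sketch of the binding (the value follows from wglobal_g above):

    >>> from functools import partial
    >>> wglobal = partial(wglobal_g, n_df='t')
    >>> round(float(wglobal(10, 1000)), 2)  # log2(1000 / 10)
    6.64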
+ """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, From 7ee75602157528110455a21da1ef0128a42f186c Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 21 Dec 2017 14:21:48 +0500 Subject: [PATCH 12/25] fix bug in docstring --- gensim/models/tfidfmodel.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 8b3d2697ed..f4f762c879 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -91,14 +91,8 @@ class TfidfModel(interfaces.TransformationABC): Objects of this class realize the transformation between word-document co-occurrence matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). - Methods - ------- - __init__(corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, - wglobal=df2idf, normalize=True, smartirs=None): - Calculates inverse document counts for all terms in the training corpus. - __getitem__(bow, eps=1e-12) - which transforms a simple count representation into the TfIdf space. - + Examples + -------- >>> tfidf = TfidfModel(corpus) >>> print(tfidf[some_doc]) >>> tfidf.save('/tmp/foo.tfidf_model') From b2def84db38ef7b6158b92eb6ae3f5e48b0a2299 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 13:01:42 +0530 Subject: [PATCH 13/25] added few tests --- gensim/test/test_sklearn_api.py | 203 ++++++++++++++++++++++++++++++++ 1 file changed, 203 insertions(+) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 5e0511aa5c..ec2c287356 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1014,6 +1014,209 @@ def testModelNotFitted(self): tfidf_wrapper = TfIdfTransformer() self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) + def testConsistency(self): + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + expected_docs = [ + [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), + (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + + # nnn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='nnn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2), (1, 2), (2, 2)], + [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # nnc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='nnc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], + [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ntn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntn') + self.model.fit(self.corpus) + transformed_docs = 
self.model.transform(docs) + + expected_docs = [ + [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], + [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ntc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ntc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], + [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548), (8, 0.44424552527467476)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # npn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='npn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 1.8073549220576042), (1, 1.8073549220576042), (2, 1.8073549220576042)], + [(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # npc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='npc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], + [(3, 0.46563179782533826), (4, 0.46563179782533826), (5, 0.25763163180767745), (6, 0.46563179782533826), (7, 0.25763163180767745), (8, 0.46563179782533826)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lnn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lnn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2.0), (1, 2.0), (2, 2.0)], + [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lnc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lnc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], + [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ltn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ltn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [ + [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], + [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + ] + 
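+        # sanity check on the `t` idf column, assuming the 9-document test
+        # corpus: wglobal = log2(num_docs / df), so df = 2 gives
+        # log2(9 / 2) ~= 2.1699 and df = 3 gives log2(9 / 3) ~= 1.5850,
+        # matching the constants above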
self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # ltc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='ltc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.44424552527467476), + (4, 0.44424552527467476), + (5, 0.32448702061385548), + (6, 0.44424552527467476), + (7, 0.32448702061385548), + (8, 0.44424552527467476)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpn + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpn') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 1.8073549220576042), + (1, 1.8073549220576042), + (2, 1.8073549220576042)], + [(3, 1.8073549220576042), + (4, 1.8073549220576042), + (5, 1.0), + (6, 1.8073549220576042), + (7, 1.0), + (8, 1.8073549220576042)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.46563179782533826), + (4, 0.46563179782533826), + (5, 0.25763163180767745), + (6, 0.46563179782533826), + (7, 0.25763163180767745), + (8, 0.46563179782533826)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # lpc + docs = [corpus[0], corpus[1]] + self.model.set_params(smartirs='lpc') + self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) + + #pprint.pprint(transformed_docs) + expected_docs = [[(0, 0.57735026918962573), + (1, 0.57735026918962573), + (2, 0.57735026918962573)], + [(3, 0.46563179782533826), + (4, 0.46563179782533826), + (5, 0.25763163180767745), + (6, 0.46563179782533826), + (7, 0.25763163180767745), + (8, 0.46563179782533826)]] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) class TestHdpTransformer(unittest.TestCase): def setUp(self): From 5b2d37afbeb02dfb3b004ea2c7814e6670bbc19a Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 15:05:43 +0530 Subject: [PATCH 14/25] fix normalize issue for pickling --- gensim/models/tfidfmodel.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index f4f762c879..6f6c6e106b 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -167,11 +167,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) @@ -255,9 +250,13 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if 
self.idfs.get(termid, 0.0) != 0.0 ] + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) From ac4b154ef5dbde037c833dd71386676cd84e21fc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 15:05:43 +0530 Subject: [PATCH 15/25] fix normalize issue for pickling --- gensim/models/tfidfmodel.py | 28 ++-- gensim/test/test_sklearn_api.py | 273 ++++++++++++++------------------ 2 files changed, 129 insertions(+), 172 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index f4f762c879..3d32b334fb 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -54,7 +54,7 @@ def precompute_idfs(wglobal, dfs, total_docs): return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} -def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) +def updated_wlocal(tf, n_tf): if n_tf == "n": return tf elif n_tf == "l": @@ -67,7 +67,7 @@ def wlocal_g(tf, n_tf): # TODO rename it (to avoid confusion) return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) -def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) +def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -76,14 +76,11 @@ def wglobal_g(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) -def normalize_g(x, n_n): # TODO rename it (to avoid confusion) +def updated_normalize(x, n_n): # TODO rename it (to avoid confusion) if n_n == "n": return x elif n_n == "c": return matutils.unitvec(x) - # TODO write byte-size normalisation - # elif n_n == "b": - # pass class TfidfModel(interfaces.TransformationABC): @@ -152,7 +149,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. Document normalization: - none - `n`, cosine - `c`, byte size - `b`. + none - `n`, cosine - `c`. 
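        For example, `smartirs='ltc'` selects logarithmic term frequency, idf
        document weighting and cosine normalization. A minimal sketch, assuming
        `corpus` is an iterable of bag-of-words vectors and `doc` is one of them:

        >>> model = TfidfModel(corpus, smartirs='ltc')
        >>> print(model[doc])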
for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System @@ -167,18 +164,13 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.num_docs, self.num_nnz, self.idfs = None, None, None self.smartirs = smartirs - if self.normalize is True: - self.normalize = matutils.unitvec - elif self.normalize is False: - self.normalize = utils.identity - # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(wlocal_g, n_tf=n_tf) - self.wglobal = partial(wglobal_g, n_df=n_df) - self.normalize = partial(normalize_g, n_n=n_n) + self.wlocal = partial(updated_wlocal, n_tf=n_tf) + self.wglobal = partial(updated_wglobal, n_df=n_df) + self.normalize = partial(updated_normalize, n_n=n_n) if dictionary is not None: # user supplied a Dictionary object, which already contains all the @@ -255,9 +247,13 @@ def __getitem__(self, bow, eps=1e-12): for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index ec2c287356..33d2d5b777 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1000,6 +1000,9 @@ def testPipeline(self): self.assertGreater(score, 0.40) def testPersistence(self): + # Test current model persistency. + self.model.set_params(smartirs='ntc') + model_dump = pickle.dumps(self.model) model_load = pickle.loads(model_dump) @@ -1010,213 +1013,171 @@ def testPersistence(self): original_transformed_doc = self.model.transform(doc) self.assertEqual(original_transformed_doc, loaded_transformed_doc) + # compare backward model pickle compatibility + with open("test_data/tfidf_model.pkl", "rb") as model_handler: + model_load = pickle.load(model_handler) + + loaded_transformed_doc = model_load.transform(doc) + + # comparing the original and new models + original_transformed_doc = self.model.transform(doc) + self.assertEqual(original_transformed_doc, loaded_transformed_doc) + def testModelNotFitted(self): tfidf_wrapper = TfIdfTransformer() self.assertRaises(NotFittedError, tfidf_wrapper.transform, corpus[0]) def testConsistency(self): - docs = [corpus[0], corpus[1]] + # Test if `ntc` yields the default docs. 
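+        # (with smartirs left as None the model uses the wlocal/wglobal/normalize
+        #  arguments, whose defaults of utils.identity, df2idf and True are
+        #  exactly what the letters 'ntc' spell out)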
+ docs = [corpus[1], corpus[2]] + self.model.set_params(smartirs='ntc') self.model.fit(self.corpus) transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 0.5773502691896257), (1, 0.5773502691896257), (2, 0.5773502691896257)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.3244870206138555), - (6, 0.44424552527467476), (7, 0.3244870206138555), (8, 0.44424552527467476)] - ] + + self.model.set_params(normalize=True) + self.model.fit(self.corpus) + expected_docs = self.model.transform(docs) + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - - # nnn - docs = [corpus[0], corpus[1]] + + # Testing all the variations of `wlocal` + # smartirs=`nnn` self.model.set_params(smartirs='nnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2), (1, 2), (2, 2)], - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # nnc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='nnc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], - [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ntn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ntn') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], - [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] - ] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ntc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ntc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], - [(3, 0.44424552527467476), (4, 0.44424552527467476), (5, 0.32448702061385548), (6, 0.44424552527467476), (7, 0.32448702061385548), (8, 0.44424552527467476)] + expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], + [(5, 6), (9, 3), (10, 3)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # npn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='npn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`lnn` + self.model.set_params(smartirs='lnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 1.8073549220576042), (1, 1.8073549220576042), (2, 1.8073549220576042)], - [(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)] + expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], + [(5, 
6.0), (9, 3.0), (10, 3.0)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # npc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='npc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`ann` + self.model.set_params(smartirs='ann') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 0.57735026918962573), (1, 0.57735026918962573), (2, 0.57735026918962573)], - [(3, 0.46563179782533826), (4, 0.46563179782533826), (5, 0.25763163180767745), (6, 0.46563179782533826), (7, 0.25763163180767745), (8, 0.46563179782533826)] + [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], + [(5, 3.0), (9, 2.25), (10, 2.25)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lnn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lnn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`bnn` + self.model.set_params(smartirs='bnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - expected_docs = [ - [(0, 2.0), (1, 2.0), (2, 2.0)], - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)] + [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], + [(5, 3), (9, 3), (10, 3)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lnc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lnc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`Lnn` + self.model.set_params(smartirs='Lnn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 0.57735026918962584), (1, 0.57735026918962584), (2, 0.57735026918962584)], - [(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), (7, 0.51449575542752646), (8, 0.34299717028501764)] + expected_docs = [[(3, 1.4635792826230198), + (4, 1.4635792826230198), + (5, 2.19536892393453), + (6, 1.4635792826230198), + (7, 2.19536892393453), + (8, 1.4635792826230198)], + [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # ltn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ltn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `glocal` + # smartirs=`ntn` + self.model.set_params(smartirs='ntn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [ - [(0, 2.1699250014423126), (1, 2.1699250014423126), (2, 2.1699250014423126)], - [(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)] + expected_docs = [[(3, 2.1699250014423126), + (4, 2.1699250014423126), + (5, 1.5849625007211563), + (6, 2.1699250014423126), + (7, 1.5849625007211563), + (8, 2.1699250014423126)], + [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)] ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], 
expected_docs[1])) - - # ltc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='ltc') - self.model.fit(self.corpus) - transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.44424552527467476), - (4, 0.44424552527467476), - (5, 0.32448702061385548), - (6, 0.44424552527467476), - (7, 0.32448702061385548), - (8, 0.44424552527467476)]] - self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpn - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpn') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # smartirs=`npn` + self.model.set_params(smartirs='npn') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 1.8073549220576042), - (1, 1.8073549220576042), - (2, 1.8073549220576042)], - [(3, 1.8073549220576042), + expected_docs = [[(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), (7, 1.0), - (8, 1.8073549220576042)]] + (8, 1.8073549220576042)], + [(5, 2.0), (9, 1.0), (10, 1.0)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpc') + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `normalize` + # smartirs=`nnc` + self.model.set_params(smartirs='nnc') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.46563179782533826), - (4, 0.46563179782533826), - (5, 0.25763163180767745), - (6, 0.46563179782533826), - (7, 0.25763163180767745), - (8, 0.46563179782533826)]] + expected_docs = [[(3, 0.34299717028501764), + (4, 0.34299717028501764), + (5, 0.51449575542752646), + (6, 0.34299717028501764), + (7, 0.51449575542752646), + (8, 0.34299717028501764)], + [(5, 0.81649658092772603), + (9, 0.40824829046386302), + (10, 0.40824829046386302)] + ] + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) - - # lpc - docs = [corpus[0], corpus[1]] - self.model.set_params(smartirs='lpc') + + # Check if wlocal and wglobal are overridden if smartirs is not None + self.model.set_params(wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc') self.model.fit(self.corpus) + transformed_docs = self.model.transform(docs) - - #pprint.pprint(transformed_docs) - expected_docs = [[(0, 0.57735026918962573), - (1, 0.57735026918962573), - (2, 0.57735026918962573)], - [(3, 0.46563179782533826), - (4, 0.46563179782533826), - (5, 0.25763163180767745), - (6, 0.46563179782533826), - (7, 0.25763163180767745), - (8, 0.46563179782533826)]] + + self.model.set_params(wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc') + self.model.fit(self.corpus) + expected_docs = self.model.transform(docs) + self.assertTrue(numpy.allclose(transformed_docs[0], expected_docs[0])) - self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + self.assertTrue(numpy.allclose(transformed_docs[1], expected_docs[1])) + class TestHdpTransformer(unittest.TestCase): def setUp(self): From 0bacc081ce60b6e83652c1430a8d479585513319 Mon Sep 17 00:00:00
2001 From: Mohit Rathore Date: Fri, 22 Dec 2017 16:29:37 +0530 Subject: [PATCH 16/25] test without sklearn api --- gensim/test/test_data/tfidf_model.tst | Bin 0 -> 458 bytes gensim/test/test_data/tfidf_model.tst.bz2 | Bin 0 -> 338 bytes gensim/test/test_sklearn_api.py | 170 +--------------------- gensim/test/test_tfidfmodel.py | 163 ++++++++++++++++++++- 4 files changed, 165 insertions(+), 168 deletions(-) create mode 100644 gensim/test/test_data/tfidf_model.tst create mode 100644 gensim/test/test_data/tfidf_model.tst.bz2 diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst new file mode 100644 index 0000000000000000000000000000000000000000..e9e5f3f3cff5372e7e5ce18a89efe58321c84e4f GIT binary patch literal 458 [base85 binary data for tfidf_model.tst and tfidf_model.tst.bz2 omitted; the remaining hunks of this patch and the header of the next patch are truncated here] Date: Tue, 26 Dec 2017 01:34:34 +0530 Subject: [PATCH 17/25] hanging indents and new tests --- gensim/models/tfidfmodel.py | 4 +- gensim/test/test_tfidfmodel.py | 101 +++++++++++++++++++++++---------- 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 3d32b334fb..e52afe25a4 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -67,7 +67,7 @@ def updated_wlocal(tf, n_tf): return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) -def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confusion) +def updated_wglobal(docfreq, totaldocs, n_df): if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -76,7 +76,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): # TODO rename it (to avoid confu return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) -def updated_normalize(x, n_n): # TODO rename it (to avoid confusion) +def updated_normalize(x, n_n): if n_n == "n": return x elif n_n == "c": diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 32e9ee4d7e..7bc5d63cd5 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -66,35 +66,55 @@ def testInit(self): self.assertEqual(model1.idfs, model2.idfs) def testPersistence(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - # Test persistence between old and new model. + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + + # Test persistence between Gensim v3.2.0 and current model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) self.assertTrue(model3.idfs == model4.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector def testPersistenceCompressed(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst.gz') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst.gz') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector - # Test persistence between old and new compressed model. + # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) self.assertTrue(model3.idfs == model4.idfs) - tstvec = [] + tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector def TestConsistency(self): @@ -114,9 +134,15 @@ def TestConsistency(self): # nnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 6), (9, 3), (10, 3)] - ] + expected_docs = [[(3, 2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 6), + (9, 3), + (10, 3)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -124,9 +150,15 @@ def TestConsistency(self): # lnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 6.0), (9, 3.0), (10, 3.0)] - ] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 6.0), + (9, 3.0), + (10, 3.0)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -134,10 +166,15 @@ def TestConsistency(self): # ann model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 3.0), (9, 2.25), (10, 2.25)] - ] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 3.0), + (9, 2.25), + (10, 2.25)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -145,10 +182,15 @@ def TestConsistency(self): # bnn model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 3), (9, 3), (10, 3)] - ] + expected_docs = [[(3, 
2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 3), + (9, 3), + (10, 3)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -162,8 +204,9 @@ def TestConsistency(self): (6, 1.4635792826230198), (7, 2.19536892393453), (8, 1.4635792826230198)], - [(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)] - ] + [(5, 3.627141918134611), + (9, 1.8135709590673055), + (10, 1.8135709590673055)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -178,8 +221,9 @@ def TestConsistency(self): (6, 2.1699250014423126), (7, 1.5849625007211563), (8, 2.1699250014423126)], - [(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)] - ] + [(5, 3.1699250014423126), + (9, 1.5849625007211563), + (10, 1.5849625007211563)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -193,8 +237,9 @@ def TestConsistency(self): (6, 1.8073549220576042), (7, 1.0), (8, 1.8073549220576042)], - [(5, 2.0), (9, 1.0), (10, 1.0)] - ] + [(5, 2.0), + (9, 1.0), + (10, 1.0)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -211,8 +256,7 @@ def TestConsistency(self): (8, 0.34299717028501764)], [(5, 0.81649658092772603), (9, 0.40824829046386302), - (10, 0.40824829046386302)] - ] + (10, 0.40824829046386302)]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) @@ -226,7 +270,6 @@ def TestConsistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) -# endclass TestTfidfModel if __name__ == '__main__': From e5140f840ea8015eee8942ac217313519c551587 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 26 Dec 2017 11:35:11 +0530 Subject: [PATCH 18/25] add docstring --- gensim/models/tfidfmodel.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e52afe25a4..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -19,6 +19,34 @@ def resolve_weights(smartirs): """ Checks for validity of smartirs parameter. + + Parameters + ---------- + smartirs : {'None' ,'str'} + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + + Raises + ------ + ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + + Returns + ------- + w_tf, w_df, w_n : str, str, str + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`. 
+ """ if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) From 4afbaddef33e40a74b6370cd820215ae3ce0774d Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Tue, 26 Dec 2017 11:35:11 +0530 Subject: [PATCH 19/25] add docstring --- gensim/models/tfidfmodel.py | 28 ++++++++++++++++++++++++++++ gensim/test/test_tfidfmodel.py | 24 ++++++++++++++++++------ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index e52afe25a4..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -19,6 +19,34 @@ def resolve_weights(smartirs): """ Checks for validity of smartirs parameter. + + Parameters + ---------- + smartirs : {'None' ,'str'} + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting + of the document vector. + + + for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + + Raises + ------ + ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + + Returns + ------- + w_tf, w_df, w_n : str, str, str + Term frequency weighing: + natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Document frequency weighting: + none - `n`, idf - `t`, prob idf - `p`. + Document normalization: + none - `n`, cosine - `c`. + """ if not isinstance(smartirs, str) or len(smartirs) != 3: raise ValueError("Expected a string of length 3 except got " + smartirs) diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index 7bc5d63cd5..3669e08bfb 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -73,7 +73,9 @@ def testPersistence(self): model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst') @@ -82,14 +84,18 @@ def testPersistence(self): model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current model. 
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) self.assertTrue(model3.idfs == model4.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector def testPersistenceCompressed(self): # Test persistence without using `smartirs` @@ -99,7 +105,9 @@ def testPersistenceCompressed(self): model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence with using `smartirs` fname = get_tmpfile('gensim_models_smartirs.tst.gz') @@ -108,14 +116,18 @@ def testPersistenceCompressed(self): model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) self.assertTrue(model3.idfs == model4.idfs) tstvec = [corpus[1], corpus[2]] - self.assertTrue(np.allclose(model3[tstvec], model4[tstvec])) # try projecting an empty vector + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector def TestConsistency(self): docs = [corpus[1], corpus[2]] From 52ee3c45691b442dd71278d522faec6632586b5c Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Wed, 27 Dec 2017 11:32:38 +0530 Subject: [PATCH 20/25] better way of comparing floats --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 45dc2fb7f8..c7461899e5 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -272,7 +272,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) > eps ] if self.normalize is True: From 48e84f7c899b2245fee5843ca20b322a726db22b Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Mon, 8 Jan 2018 12:34:41 +0530 Subject: [PATCH 21/25] old way of comparing floats --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index c7461899e5..45dc2fb7f8 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -272,7
+272,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) > eps + for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 ] if self.normalize is True: From d0878a41771250678a8170aba93292b335e74fd9 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 10 Jan 2018 23:32:01 +0500 Subject: [PATCH 22/25] doc fix[1] --- gensim/models/tfidfmodel.py | 137 ++++++++++++++++++++++++++++-------- 1 file changed, 108 insertions(+), 29 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 45dc2fb7f8..41786fcf50 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -17,35 +17,37 @@ def resolve_weights(smartirs): - """ - Checks for validity of smartirs parameter. + """Checks for validity of `smartirs` parameter. Parameters ---------- - smartirs : {'None' ,'str'} - `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) - Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting - variants in the vector space model. The mnemonic for representing a combination - of weights takes the form ddd, where the letters represents the term weighting - of the document vector. - + smartirs : str + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represent the term weighting of the document vector. + For more information visit [1]_. - for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + Returns + ------- + w_tf : str + Term frequency weighing: natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + w_df : str + Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. + w_n : str + Document normalization: none - `n`, cosine - `c`. Raises ------ - ValueError : If `smartirs` is not a string of length 3 or one of the decomposed value - doesn't fit the list of permissible values + ValueError + If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + - Returns - ------- - w_tf, w_df, w_n : str, str, str - Term frequency weighing: - natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. - Document frequency weighting: - none - `n`, idf - `t`, prob idf - `p`. - Document normalization: - none - `n`, cosine - `c`. + + References ---------- ..
[1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System """ if not isinstance(smartirs, str) or len(smartirs) != 3: @@ -54,28 +56,58 @@ def resolve_weights(smartirs): w_tf, w_df, w_n = smartirs if w_tf not in 'nlabL': - raise ValueError("Expected term frequency weight to be one of 'nlabL', except got " + w_tf) + raise ValueError("Expected term frequency weight to be one of 'nlabL', but got {}".format(w_tf)) if w_df not in 'ntp': - raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got " + w_df) + raise ValueError("Expected inverse document frequency weight to be one of 'ntp', but got {}".format(w_df)) if w_n not in 'ncb': - raise ValueError("Expected normalization weight to be one of 'ncb', except got " + w_n) + raise ValueError("Expected normalization weight to be one of 'ncb', but got {}".format(w_n)) return w_tf, w_df, w_n def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): - """ - Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: - idf = add + log(totaldocs / doc_freq) + """Compute default inverse-document-frequency for a term with document frequency: + :math:`idf = add + log_{log\_base} \\frac{totaldocs}{doc\_freq}` + + Parameters + ---------- + docfreq : float + Document frequency. + totaldocs : int + Total number of documents. + log_base : float, optional + Base of logarithm. + add : float, optional + Offset. + + Returns + ------- + float + Inverse document frequency. + """ return add + np.log(float(totaldocs) / docfreq) / np.log(log_base) def precompute_idfs(wglobal, dfs, total_docs): - """ - Precompute the inverse document frequency mapping for all terms. + """Pre-compute the inverse document frequency mapping for all terms. + + Parameters + ---------- + wglobal : function + Custom function for calculating idf, see for example :func:`~gensim.models.tfidfmodel.updated_wglobal`. + dfs : dict + Dictionary mapping term_id to the number of documents the token appeared in. + total_docs : int + Total number of documents. + + Returns + ------- + dict + Precomputed idfs in format {term_id_1: idfs_1, term_id_2: idfs_2, ...} + """ # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. @@ -83,6 +115,21 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): + """Apply needed function based on `n_tf`. + + Parameters + ---------- + tf : int + Term frequency. + n_tf : str + Parameter, that choice concrete function. + + Returns + ------- + float + Calculated wlocal. + + """ if n_tf == "n": return tf elif n_tf == "l": @@ -96,6 +143,23 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): + """Apply needed function based on `n_df`. + + Parameters + ---------- + docfreq : int + Document frequency. + totaldocs : int + Total number of documents. + n_df : str + Parameter, that choice concrete function. + + Returns + ------- + float + Calculated wglobal. + + """ if n_df == "n": return utils.identity(docfreq) elif n_df == "t": @@ -105,6 +169,21 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): + """Apply needed normalization based on `n_n` + + Parameters + ---------- + x : numpy.ndarray + Input array + n_n : str + Parameter, that choice concrete function. + + Returns + ------- + numpy.ndarray + Normalized array.
+ + """ if n_n == "n": return x elif n_n == "c": From b544c9cf6d193dea12494090af22096ea8a40fad Mon Sep 17 00:00:00 2001 From: ivan Date: Thu, 11 Jan 2018 11:05:12 +0500 Subject: [PATCH 23/25] doc fix[2] --- docs/src/models/tfidfmodel.rst | 3 +- gensim/models/tfidfmodel.py | 171 +++++++++++++++++++-------------- 2 files changed, 100 insertions(+), 74 deletions(-) diff --git a/docs/src/models/tfidfmodel.rst b/docs/src/models/tfidfmodel.rst index 6b622d7589..55907470d3 100644 --- a/docs/src/models/tfidfmodel.rst +++ b/docs/src/models/tfidfmodel.rst @@ -1,5 +1,5 @@ :mod:`models.tfidfmodel` -- TF-IDF model -====================================================== +======================================== .. automodule:: gensim.models.tfidfmodel :synopsis: TF-IDF model @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 41786fcf50..9507a7aede 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -31,11 +31,21 @@ def resolve_weights(smartirs): Returns ------- w_tf : str - Term frequency weighing: natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. w_df : str - Document frequency weighting: none - `n`, idf - `t`, prob idf - `p`. + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. w_n : str - Document normalization: none - `n`, cosine - `c`. + Document normalization: + * `n` - none, + * `c` - cosine. Raises ------ @@ -43,8 +53,6 @@ def resolve_weights(smartirs): If `smartirs` is not a string of length 3 or one of the decomposed value doesn't fit the list of permissible values - - References ---------- .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System @@ -115,13 +123,13 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): - """Apply needed function based on `n_tf`. + """Apply needed function based on `n_tf`. TODO: add better descriptions for function/parameters. Parameters ---------- tf : int Term frequency. - n_tf : str + n_tf : {'n', 'l', 'a', 'b', 'L'} Parameter, that choice concrete function. Returns @@ -143,7 +151,7 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): - """Apply needed function based on `n_df`. + """Apply needed function based on `n_df`. TODO: add better descriptions for function/parameters. Parameters ---------- @@ -151,7 +159,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): Document frequency. totaldocs : int Total number of documents. - n_df : str + n_df : {'n', 't', 'p'} Parameter, that choice concrete function. Returns @@ -169,13 +177,13 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): - """Apply needed normalization based on `n_n` + """Apply needed normalization based on `n_n`. TODO: add better descriptions for function/parameters. Parameters ---------- x : numpy.ndarray Input array - n_n : str + n_n : {'n', 'c'} Parameter, that choice concrete function. Returns @@ -191,78 +199,79 @@ def updated_normalize(x, n_n): class TfidfModel(interfaces.TransformationABC): - """ - Objects of this class realize the transformation between word-document co-occurrence - matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). 
+ """Objects of this class realize the transformation between word-document co-occurrence matrix (int) + into a locally/globally weighted TF_IDF matrix (positive floats). Examples -------- - >>> tfidf = TfidfModel(corpus) - >>> print(tfidf[some_doc]) - >>> tfidf.save('/tmp/foo.tfidf_model') - - Model persistency is achieved via its load/save methods. + >>> import gensim.downloader as api + >>> from gensim.models import TfidfModel + >>> from gensim.corpora import Dictionary + >>> + >>> dataset = api.load("text8") + >>> dct = Dictionary(dataset) # fit dictionary + >>> corpus = [dct.doc2bow(line) for line in dataset] # convert dataset to BoW format + >>> + >>> model = TfidfModel(corpus) # fit model + >>> vector = model[corpus[0]] # apply model """ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, wglobal=df2idf, normalize=True, smartirs=None): - """ - Compute tf-idf by multiplying a local component (term frequency) with a - global component (inverse document frequency), and normalizing - the resulting documents to unit length. Formula for unnormalized weight - of term `i` in document `j` in a corpus of D documents:: + """Compute tf-idf by multiplying a local component (term frequency) with a global component + (inverse document frequency), and normalizing the resulting documents to unit length. + Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents - weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i}) + .. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}} - or, more generally:: + or, more generally - weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D) - - so you can plug in your own custom `wlocal` and `wglobal` functions. + .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D) + so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions. Parameters ---------- - corpus : dictionary.doc2bow - Corpus is a list of sets where each set has two elements. First being the termid and - second being the term frequency of each term in the document. - id2word : dict - id2word is an optional dictionary that maps the word_id to a token. - In case id2word isn’t specified the mapping id2word[word_id] = str(word_id) will be used. - dictionary :corpora.Dictionary - If `dictionary` is specified, it must be a `corpora.Dictionary` object - and it will be used to directly construct the inverse document frequency - mapping (then `corpus`, if specified, is ignored). - wlocals : user specified function - Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) - wglobal : user specified function - Default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the - formula above. - normalize : user specified function - It dictates how the final transformed vectors will be normalized. - `normalize=True` means set to unit length (default); `False` means don't - normalize. You can also set `normalize` to your own function that accepts - and returns a sparse vector. - smartirs : {'None' ,'str'} - `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) - Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting - variants in the vector space model. The mnemonic for representing a combination - of weights takes the form ddd, where the letters represents the term weighting - of the document vector. 
- - Term frequency weighing: - natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. - Document frequency weighting: - none - `n`, idf - `t`, prob idf - `p`. - Document normalization: - none - `n`, cosine - `c`. - - for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System - - Returns - ------- - x : gensim.models.tfidfmodel.TfidfModel + corpus : iterable of iterable of (int, int), optional + Input corpus. + id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional + Mapping from tokens to ids, used for converting input data to the bag-of-words format. + dictionary : :class:`~gensim.corpora.Dictionary` + If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used + to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + wlocal : function, optional + Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` + (other options: :func:`math.sqrt`, :func:`math.log1p`, etc.). + wglobal : function, optional + Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. + normalize : bool, optional + It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length + (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts + and returns a sparse vector. + smartirs : str, optional + SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, + a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. + The mnemonic for representing a combination of weights takes the form XYZ, + for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector. + + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. + + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. + + Document normalization: + * `n` - none, + * `c` - cosine. + + For more information visit [1]_. """ @@ -303,9 +312,13 @@ def __str__(self): return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) def initialize(self, corpus): - """ - Compute inverse document weights, which will be used to modify term - frequencies for documents. + """Compute inverse document weights, which will be used to modify term frequencies for documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Input corpus. + """ logger.info("collecting document frequencies") dfs = {} @@ -331,8 +344,20 @@ def initialize(self, corpus): self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): - """ - Return tf-idf representation of the input vector and/or corpus. + """Get tf-idf representation of the input vector and/or corpus. + + Parameters + ---------- + bow : {list of (int, int), iterable of iterable of (int, int)} + Input document or corpus in BoW format. + eps : float + Threshold value; all positions with a tf-idf value less than `eps` will be removed. + + Returns + ------- + vector : list of (int, float) + TfIdf vector, if `bow` is a document **OR** + :class:`~gensim.interfaces.TransformedCorpus` + TfIdf corpus, if `bow` is a corpus.
+ """ # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) From c4e3656d5ba4a568ab811d9404de6c241ad245b4 Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Thu, 11 Jan 2018 12:18:31 +0530 Subject: [PATCH 24/25] fix description TODOs --- gensim/models/tfidfmodel.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 9507a7aede..ab454c67df 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -2,6 +2,7 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2012 Radim Rehurek +# Copyright (C) 2017 Mohit Rathore # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -123,14 +124,14 @@ def precompute_idfs(wglobal, dfs, total_docs): def updated_wlocal(tf, n_tf): - """Apply needed function based on `n_tf`. TODO: add better descriptions for function/parameters. + """A scheme to transform `tf` or term frequency based on the value of `n_tf`. Parameters ---------- tf : int Term frequency. n_tf : {'n', 'l', 'a', 'b', 'L'} - Parameter, that choice concrete function. + Parameter to decide the current transformation scheme. Returns ------- @@ -151,7 +152,7 @@ def updated_wlocal(tf, n_tf): def updated_wglobal(docfreq, totaldocs, n_df): - """Apply needed function based on `n_df`. TODO: add better descriptions for function/parameters. + """A scheme to transform `docfreq` or document frequency based on the value of `n_df`. Parameters ---------- @@ -160,7 +161,7 @@ def updated_wglobal(docfreq, totaldocs, n_df): totaldocs : int Total number of documents. n_df : {'n', 't', 'p'} - Parameter, that choice concrete function. + Parameter to decide the current transformation scheme. Returns ------- @@ -177,14 +178,14 @@ def updated_wglobal(docfreq, totaldocs, n_df): def updated_normalize(x, n_n): - """Apply needed normalization based on `n_n`. TODO: add better descriptions for function/parameters. + """Normalizes the final tf-idf value according to the value of `n_n`. Parameters ---------- x : numpy.ndarray Input array n_n : {'n', 'c'} - Parameter, that choice concrete function. + Parameter that decides the normalizing function to be used. Returns ------- From 98ffde5909e59448ecfdba8f28d8cf973a2298fc Mon Sep 17 00:00:00 2001 From: Mohit Rathore Date: Mon, 15 Jan 2018 19:41:09 +0530 Subject: [PATCH 25/25] fix irksome comparision --- gensim/models/tfidfmodel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index ab454c67df..a61e993333 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -377,7 +377,7 @@ def __getitem__(self, bow, eps=1e-12): vector = [ (termid, tf * self.idfs.get(termid)) - for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 + for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > eps ] if self.normalize is True: