From b72f36618297172bd8e8402182ec30e83d3e377a Mon Sep 17 00:00:00 2001
From: Mohit Rathore <mrmohitrathoremr@gmail.com>
Date: Mon, 15 Jan 2018 20:32:28 +0530
Subject: [PATCH] Add smart information retrieval system for `TfidfModel`. Fix
 #1785 (#1791)

* fixing appveyor

* verify weights

* verify weights

* smartirs ready

* change old tests

* remove lambdas

* address suggestions

* minor fix

* pep8 fix

* fix pickle problem

* flake8 fix

* fix bug in docstring

* added few tests

* fix normalize issue for pickling

* fix normalize issue for pickling

* test without sklearn api

* hanging indents and new tests

* add docstring

* add docstring

* better way of comparing floats

* old way of cmp floats

* doc fix[1]

* doc fix[2]

* fix description TODOs

* fix irksome comparison
---
 docs/src/models/tfidfmodel.rst            |   3 +-
 gensim/models/tfidfmodel.py               | 339 ++++++++++++++++++----
 gensim/sklearn_api/tfidf.py               |   9 +-
 gensim/test/test_data/tfidf_model.tst     | Bin 0 -> 458 bytes
 gensim/test/test_data/tfidf_model.tst.bz2 | Bin 0 -> 338 bytes
 gensim/test/test_sklearn_api.py           |   6 +-
 gensim/test/test_tfidfmodel.py            | 225 +++++++++++++-
 7 files changed, 516 insertions(+), 66 deletions(-)
 create mode 100644 gensim/test/test_data/tfidf_model.tst
 create mode 100644 gensim/test/test_data/tfidf_model.tst.bz2

diff --git a/docs/src/models/tfidfmodel.rst b/docs/src/models/tfidfmodel.rst
index 6b622d7589..55907470d3 100644
--- a/docs/src/models/tfidfmodel.rst
+++ b/docs/src/models/tfidfmodel.rst
@@ -1,5 +1,5 @@
 :mod:`models.tfidfmodel` -- TF-IDF model
-======================================================
+========================================
 
 .. automodule:: gensim.models.tfidfmodel
     :synopsis: TF-IDF model
@@ -7,3 +7,4 @@
     :inherited-members:
     :undoc-members:
     :show-inheritance:
+    :special-members: __getitem__
diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py
index 50320ad747..a61e993333 100644
--- a/gensim/models/tfidfmodel.py
+++ b/gensim/models/tfidfmodel.py
@@ -2,86 +2,293 @@
 # -*- coding: utf-8 -*-
 #
 # Copyright (C) 2012 Radim Rehurek <radimrehurek@seznam.cz>
+# Copyright (C) 2017 Mohit Rathore <mrmohitrathoremr@gmail.com>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
 import logging
-import math
+from functools import partial
 
 from gensim import interfaces, matutils, utils
 from six import iteritems
 
+import numpy as np
 
 logger = logging.getLogger(__name__)
 
 
-def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
+def resolve_weights(smartirs):
+    """Checks for validity of `smartirs` parameter.
+
+    Parameters
+    ----------
+    smartirs : str
+        `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text)
+        Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting
+        variants in the vector space model. The mnemonic for representing a combination
+        of weights takes the form ddd, where the letters represent the term weighting of the document vector.
+        For more information visit [1]_.
+
+    Returns
+    -------
+    w_tf : str
+        Term frequency weighting:
+            * `n` - natural,
+            * `l` - logarithm,
+            * `a` - augmented,
+            * `b` - boolean,
+            * `L` - log average.
+    w_df : str
+        Document frequency weighting:
+            * `n` - none,
+            * `t` - idf,
+            * `p` - prob idf.
+    w_n : str
+        Document normalization:
+            * `n` - none,
+            * `c` - cosine.
+
+    Raises
+    ------
+    ValueError
+        If `smartirs` is not a string of length 3, or if any of its three letters
+        is not among the permissible values for its position.
+
+    References
+    ----------
+    .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System
+
     """
-    Compute default inverse-document-frequency for a term with document frequency `doc_freq`::
+    if not isinstance(smartirs, str) or len(smartirs) != 3:
+        raise ValueError("Expected a string of length 3 except got " + smartirs)
+
+    w_tf, w_df, w_n = smartirs
+
+    if w_tf not in 'nlabL':
+        raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf))
+
+    if w_df not in 'ntp':
+        raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df))
+
+    if w_n not in 'ncb':
+        raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n))
+
+    return w_tf, w_df, w_n
+
+
+def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
+    """Compute default inverse-document-frequency for a term with document frequency:
+    :math:`idf = add + log_{log\_base} \\frac{totaldocs}{docfreq}`
+
+    Parameters
+    ----------
+    docfreq : float
+        Document frequency.
+    totaldocs : int
+        Total number of documents.
+    log_base : float, optional
+        Base of logarithm.
+    add : float, optional
+        Offset.
+
+    Returns
+    -------
+    float
+        Inverse document frequency.
 
-      idf = add + log(totaldocs / doc_freq)
     """
-    return add + math.log(1.0 * totaldocs / docfreq, log_base)
+    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)
 
 
 def precompute_idfs(wglobal, dfs, total_docs):
-    """Precompute the inverse document frequency mapping for all terms."""
+    """Pre-compute the inverse document frequency mapping for all terms.
+
+    Parameters
+    ----------
+    wglobal : function
+        Custom function for calculating idf, see "universal" :func:`~gensim.models.tfidfmodel.updated_wglobal`.
+    dfs : dict
+        Dictionary mapping each term_id to the number of documents in which that term appeared.
+    total_docs : int
+        Total number of documents.
+
+    Returns
+    -------
+    dict
+        Precomputed idfs in format {term_id_1: idfs_1, term_id_2: idfs_2, ...}
+
+    """
     # not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
     # this method is here just to speed things up a little.
     return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}
 
 
-class TfidfModel(interfaces.TransformationABC):
-    """
-    Objects of this class realize the transformation between word-document co-occurrence
-    matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats).
+def updated_wlocal(tf, n_tf):
+    """A scheme to transform `tf` or term frequency based on the value of `n_tf`.
 
-    The main methods are:
+    Parameters
+    ----------
+    tf : numpy.ndarray
+        Term frequencies of a single document.
+    n_tf : {'n', 'l', 'a', 'b', 'L'}
+        Parameter to decide the current transformation scheme.
 
-    1. constructor, which calculates inverse document counts for all terms in the training corpus.
-    2. the [] method, which transforms a simple count representation into the TfIdf
-       space.
+    Returns
+    -------
+    numpy.ndarray
+        Locally weighted term frequencies.
 
-    >>> tfidf = TfidfModel(corpus)
-    >>> print(tfidf[some_doc])
-    >>> tfidf.save('/tmp/foo.tfidf_model')
+    """
+    if n_tf == "n":
+        return tf
+    elif n_tf == "l":
+        return 1 + np.log(tf) / np.log(2)
+    elif n_tf == "a":
+        return 0.5 + (0.5 * tf / tf.max(axis=0))
+    elif n_tf == "b":
+        return tf.astype('bool').astype('int')
+    elif n_tf == "L":
+        return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2)))
+
+
+def updated_wglobal(docfreq, totaldocs, n_df):
+    """A scheme to transform `docfreq` or document frequency based on the value of `n_df`.
+
+    Parameters
+    ----------
+    docfreq : int
+        Document frequency.
+    totaldocs : int
+        Total number of documents.
+    n_df : {'n', 't', 'p'}
+        Parameter to decide the current transformation scheme.
+
+    Returns
+    -------
+    float
+        Calculated wglobal.
 
-    Model persistency is achieved via its load/save methods.
     """
+    if n_df == "n":
+        return utils.identity(docfreq)
+    elif n_df == "t":
+        return np.log(1.0 * totaldocs / docfreq) / np.log(2)
+    elif n_df == "p":
+        return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2)
 
-    def __init__(self, corpus=None, id2word=None, dictionary=None,
-                 wlocal=utils.identity, wglobal=df2idf, normalize=True):
-        """
-        Compute tf-idf by multiplying a local component (term frequency) with a
-        global component (inverse document frequency), and normalizing
-        the resulting documents to unit length. Formula for unnormalized weight
-        of term `i` in document `j` in a corpus of D documents::
 
-          weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i})
+def updated_normalize(x, n_n):
+    """Normalizes the final tf-idf value according to the value of `n_n`.
+
+    Parameters
+    ----------
+    x : numpy.ndarray
+        Input array
+    n_n : {'n', 'c'}
+        Parameter that decides the normalizing function to be used.
 
-        or, more generally::
+    Returns
+    -------
+    numpy.ndarray
+        Normalized array.
 
-          weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D)
+    """
+    if n_n == "n":
+        return x
+    elif n_n == "c":
+        return matutils.unitvec(x)
 
-        so you can plug in your own custom `wlocal` and `wglobal` functions.
 
-        Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...)
-        and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the
-        formula above.
+class TfidfModel(interfaces.TransformationABC):
+    """Objects of this class realize the transformation between word-document co-occurrence matrix (int)
+    into a locally/globally weighted TF_IDF matrix (positive floats).
+
+    Examples
+    --------
+    >>> import gensim.downloader as api
+    >>> from gensim.models import TfidfModel
+    >>> from gensim.corpora import Dictionary
+    >>>
+    >>> dataset = api.load("text8")
+    >>> dct = Dictionary(dataset)  # fit dictionary
+    >>> corpus = [dct.doc2bow(line) for line in dataset]  # convert dataset to BoW format
+    >>>
+    >>> model = TfidfModel(corpus)  # fit model
+    >>> vector = model[corpus[0]]  # apply model
+
+    """
 
-        `normalize` dictates how the final transformed vectors will be normalized.
-        `normalize=True` means set to unit length (default); `False` means don't
-        normalize. You can also set `normalize` to your own function that accepts
-        and returns a sparse vector.
+    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
+                 wglobal=df2idf, normalize=True, smartirs=None):
+        """Compute tf-idf by multiplying a local component (term frequency) with a global component
+        (inverse document frequency), and normalizing the resulting documents to unit length.
+        Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents
+
+        .. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}}
+
+        or, more generally
+
+        .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D)
+
+        so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int), optional
+            Input corpus
+        id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional
+            Mapping token - id, that was used for converting input data to bag of words format.
+        dictionary : :class:`~gensim.corpora.Dictionary`
+            If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used
+            to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored).
+        wlocal : function, optional
+            Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity`
+            (other options: :func:`math.sqrt`, :func:`math.log1p`, etc).
+        wglobal : function, optional
+            Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`.
+        normalize : bool, optional
+            It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length
+            (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts
+            and returns a sparse vector.
+        smartirs : str, optional
+            SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
+            a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
+            The mnemonic for representing a combination of weights takes the form XYZ,
+            for example 'ntc', 'bpn' and so on, where the letters represent the term weighting of the document vector.
+
+            Term frequency weighting:
+                * `n` - natural,
+                * `l` - logarithm,
+                * `a` - augmented,
+                * `b` - boolean,
+                * `L` - log average.
+
+            Document frequency weighting:
+                * `n` - none,
+                * `t` - idf,
+                * `p` - prob idf.
+
+            Document normalization:
+                * `n` - none,
+                * `c` - cosine.
+
+            For more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System.
 
-        If `dictionary` is specified, it must be a `corpora.Dictionary` object
-        and it will be used to directly construct the inverse document frequency
-        mapping (then `corpus`, if specified, is ignored).
         """
-        self.normalize = normalize
+
         self.id2word = id2word
-        self.wlocal, self.wglobal = wlocal, wglobal
+        self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize
         self.num_docs, self.num_nnz, self.idfs = None, None, None
+        self.smartirs = smartirs
+
+        # If smartirs is not None, override wlocal, wglobal and normalize
+        if smartirs is not None:
+            n_tf, n_df, n_n = resolve_weights(smartirs)
+
+            self.wlocal = partial(updated_wlocal, n_tf=n_tf)
+            self.wglobal = partial(updated_wglobal, n_df=n_df)
+            self.normalize = partial(updated_normalize, n_n=n_n)
+
         if dictionary is not None:
             # user supplied a Dictionary object, which already contains all the
             # statistics we need to construct the IDF mapping. we can skip the
@@ -106,13 +313,18 @@ def __str__(self):
         return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz)
 
     def initialize(self, corpus):
-        """
-        Compute inverse document weights, which will be used to modify term
-        frequencies for documents.
+        """Compute inverse document weights, which will be used to modify term frequencies for documents.
+
+        Parameters
+        ----------
+        corpus : iterable of iterable of (int, int)
+            Input corpus.
+
         """
         logger.info("collecting document frequencies")
         dfs = {}
         numnnz, docno = 0, -1
+
         for docno, bow in enumerate(corpus):
             if docno % 10000 == 0:
                 logger.info("PROGRESS: processing document #%i", docno)
@@ -124,7 +336,6 @@ def initialize(self, corpus):
         self.num_docs = docno + 1
         self.num_nnz = numnnz
         self.dfs = dfs
-
         # and finally compute the idf weights
         n_features = max(dfs) if dfs else 0
         logger.info(
@@ -134,8 +345,20 @@ def initialize(self, corpus):
         self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
 
     def __getitem__(self, bow, eps=1e-12):
-        """
-        Return tf-idf representation of the input vector and/or corpus.
+        """Get tf-idf representation of the input vector and/or corpus.
+
+        bow : {list of (int, int), iterable of iterable of (int, int)}
+            Input document or corpus in BoW format.
+        eps : float
+            Threshold value; entries whose absolute tf-idf weight does not exceed `eps` are removed.
+
+        Returns
+        -------
+        vector : list of (int, float)
+            TfIdf vector, if `bow` is document **OR**
+        :class:`~gensim.interfaces.TransformedCorpus`
+            TfIdf corpus, if `bow` is corpus.
+
         """
         # if the input vector is in fact a corpus, return a transformed corpus as a result
         is_corpus, bow = utils.is_corpus(bow)
@@ -144,17 +367,27 @@ def __getitem__(self, bow, eps=1e-12):
 
         # unknown (new) terms will be given zero weight (NOT infinity/huge weight,
         # as strict application of the IDF formula would dictate)
+
+        termid_array, tf_array = [], []
+        for termid, tf in bow:
+            termid_array.append(termid)
+            tf_array.append(tf)
+
+        tf_array = self.wlocal(np.array(tf_array))
+
         vector = [
-            (termid, self.wlocal(tf) * self.idfs.get(termid))
-            for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0
+            (termid, tf * self.idfs.get(termid))
+            for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > eps
         ]
 
+        if self.normalize is True:
+            self.normalize = matutils.unitvec
+        elif self.normalize is False:
+            self.normalize = utils.identity
+
         # and finally, normalize the vector either to unit length, or use a
         # user-defined normalization function
-        if self.normalize is True:
-            vector = matutils.unitvec(vector)
-        elif self.normalize:
-            vector = self.normalize(vector)
+        vector = self.normalize(vector)
 
         # make sure there are no explicit zeroes in the vector (must be sparse)
         vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps]
diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py
index c0a45f1823..dc4ab86c01 100644
--- a/gensim/sklearn_api/tfidf.py
+++ b/gensim/sklearn_api/tfidf.py
@@ -12,8 +12,8 @@
 from sklearn.base import TransformerMixin, BaseEstimator
 from sklearn.exceptions import NotFittedError
 
-import gensim
 from gensim.models import TfidfModel
+import gensim
 
 
 class TfIdfTransformer(TransformerMixin, BaseEstimator):
@@ -22,7 +22,7 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator):
     """
 
     def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
-                 wglobal=gensim.models.tfidfmodel.df2idf, normalize=True):
+                 wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"):
         """
         Sklearn wrapper for Tf-Idf model.
         """
@@ -32,14 +32,15 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity,
         self.wlocal = wlocal
         self.wglobal = wglobal
         self.normalize = normalize
+        self.smartirs = smartirs
 
     def fit(self, X, y=None):
         """
         Fit the model according to the given training data.
         """
         self.gensim_model = TfidfModel(
-            corpus=X, id2word=self.id2word, dictionary=self.dictionary,
-            wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize
+            corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal,
+            wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs,
         )
         return self
 
diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst
new file mode 100644
index 0000000000000000000000000000000000000000..e9e5f3f3cff5372e7e5ce18a89efe58321c84e4f
GIT binary patch
literal 458
zcmZ{gy-ve06otu8^W*T3*m!`lWataP*1>`aZ$wrayOkv;=Eg~@R;mOWZ^6KG@L+6Q
zBft}~j_&89W6AeChs{cMR<&hogw(bimP#zYnLIeWb2fqC`US#0L{pY`rf+Mlb`s)O
zmT}&V?bFu6q+&7VEhKwLr}SWc8$1T@Zy$;V9z&0j$Jpa!e)IYBEb(sY-ORhWcZtX1
zFEFwU|Hg%BEMzoisk?RE4sv?ME;;XIGxk=kCAaljYEugfq+*0~wZbJOUW<xFZl}XN
z9N|thqyfsE2o<9yND0mqXNEJ!Nl*n1N0urv+n8R!@jXj7D{U6FhLisr7t0w|NZ^zu
h8*Lg4pZ$i%L1{~rklmn$ErD}ay=|1M{-PQe<rl|zm%IP~

literal 0
HcmV?d00001

diff --git a/gensim/test/test_data/tfidf_model.tst.bz2 b/gensim/test/test_data/tfidf_model.tst.bz2
new file mode 100644
index 0000000000000000000000000000000000000000..1cb3b2513f43939bf32a3897ae2bb28ec9838143
GIT binary patch
literal 338
zcmV-Y0j>T*T4*^jL0KkKS=gsPrT_rX|M>s^`UC)B5CFi71_r;U|FU2}004*p00ICY
zFaa9dHX0(C2dU_R=pX<9XwVrRpa`CzC#q@c05oW502%-Q07_|7Aw4Ecn?f0)W{HWT
zL4W|#L;Vr~C^N8ePzY=&PU1tj<FXyu$4D_EQjUkdcSEEDsILp}N)!)rq|yO0q*1X@
zE*PV?luF=cqBAo%8uz{IU;<!BNK6S(454w-mF|s~)-~03YYa@1LXJ9lRezNy@<x@i
zcfPD1aF;&w!vt4dgj7Lwb9JNPBS9d?ayFSw<RA+q)&3$Ht52-9*i#8erk-|6(b#<<
z`T)w<W5RHksqCZ%3r;oSg9vGg?gUng=f$uf?t~^8t^smR1x<)iJr%mwjuJ|uBXt}D
kN|0?_ac{dP!fDc<&MuP74fqW(G{M|m$rRy2Lt>o%n8<UFApigX

literal 0
HcmV?d00001

diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py
index 3793c79948..5e0511aa5c 100644
--- a/gensim/test/test_sklearn_api.py
+++ b/gensim/test/test_sklearn_api.py
@@ -973,13 +973,13 @@ def testTransform(self):
 
     def testSetGetParams(self):
         # updating only one param
-        self.model.set_params(normalize=False)
+        self.model.set_params(smartirs='nnn')
         model_params = self.model.get_params()
-        self.assertEqual(model_params["normalize"], False)
+        self.assertEqual(model_params["smartirs"], 'nnn')
 
         # verify that the attributes values are also changed for `gensim_model` after fitting
         self.model.fit(self.corpus)
-        self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False)
+        self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn')
 
     def testPipeline(self):
         with open(datapath('mini_newsgroup'), 'rb') as f:
diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py
index c308923c29..35fbb4a4af 100644
--- a/gensim/test/test_tfidfmodel.py
+++ b/gensim/test/test_tfidfmodel.py
@@ -18,6 +18,22 @@
 from gensim.models import tfidfmodel
 from gensim.test.utils import datapath, get_tmpfile, common_dictionary, common_corpus
 
+from gensim.corpora import Dictionary
+
+texts = [
+    ['complier', 'system', 'computer'],
+    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
+    ['graph', 'flow', 'network', 'graph'],
+    ['loading', 'computer', 'system'],
+    ['user', 'server', 'system'],
+    ['tree', 'hamiltonian'],
+    ['graph', 'trees'],
+    ['computer', 'kernel', 'malfunction', 'computer'],
+    ['server', 'system', 'computer'],
+]
+dictionary = Dictionary(texts)
+corpus = [dictionary.doc2bow(text) for text in texts]
+
 
 class TestTfidfModel(unittest.TestCase):
     def setUp(self):
@@ -50,23 +66,222 @@ def testInit(self):
         self.assertEqual(model1.idfs, model2.idfs)
 
     def testPersistence(self):
+        # Test persistence without using `smartirs`
         fname = get_tmpfile('gensim_models.tst')
         model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
         model.save(fname)
         model2 = tfidfmodel.TfidfModel.load(fname)
         self.assertTrue(model.idfs == model2.idfs)
-        tstvec = []
-        self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector
+
+        # Test persistence with using `smartirs`
+        fname = get_tmpfile('gensim_models_smartirs.tst')
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
+        model.save(fname)
+        model2 = tfidfmodel.TfidfModel.load(fname)
+        self.assertTrue(model.idfs == model2.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector
+
+        # Test persistence between Gensim v3.2.0 and current model.
+        model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
+        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
+        self.assertTrue(model3.idfs == model4.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
+        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
+        self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector
 
     def testPersistenceCompressed(self):
+        # Test persistence without using `smartirs`
         fname = get_tmpfile('gensim_models.tst.gz')
         model = tfidfmodel.TfidfModel(self.corpus, normalize=True)
         model.save(fname)
         model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
         self.assertTrue(model.idfs == model2.idfs)
-        tstvec = []
-        self.assertTrue(np.allclose(model[tstvec], model2[tstvec]))  # try projecting an empty vector
-# endclass TestTfidfModel
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector
+
+        # Test persistence with using `smartirs`
+        fname = get_tmpfile('gensim_models_smartirs.tst.gz')
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
+        model.save(fname)
+        model2 = tfidfmodel.TfidfModel.load(fname, mmap=None)
+        self.assertTrue(model.idfs == model2.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]]))
+        self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]]))
+        self.assertTrue(np.allclose(model[[]], model2[[]]))  # try projecting an empty vector
+
+        # Test persistence between Gensim v3.2.0 and current compressed model.
+        model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
+        model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
+        self.assertTrue(model3.idfs == model4.idfs)
+        tstvec = [corpus[1], corpus[2]]
+        self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
+        self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
+        self.assertTrue(np.allclose(model3[[]], model4[[]]))  # try projecting an empty vector
+
+    def TestConsistency(self):
+        docs = [corpus[1], corpus[2]]
+
+        # Test if `ntc` yields the default docs.
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+
+        model = tfidfmodel.TfidfModel(self.corpus)
+        expected_docs = [model[docs[0]], model[docs[1]]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # Testing all the variations of `wlocal`
+        # nnn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 2),
+                          (4, 2),
+                          (5, 3),
+                          (6, 2),
+                          (7, 3),
+                          (8, 2)],
+                         [(5, 6),
+                          (9, 3),
+                          (10, 3)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # lnn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 2.0),
+                          (4, 2.0),
+                          (5, 3.0),
+                          (6, 2.0),
+                          (7, 3.0),
+                          (8, 2.0)],
+                         [(5, 6.0),
+                          (9, 3.0),
+                          (10, 3.0)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # ann
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 2.0),
+                          (4, 2.0),
+                          (5, 3.0),
+                          (6, 2.0),
+                          (7, 3.0),
+                          (8, 2.0)],
+                         [(5, 3.0),
+                          (9, 2.25),
+                          (10, 2.25)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # bnn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 2),
+                          (4, 2),
+                          (5, 3),
+                          (6, 2),
+                          (7, 3),
+                          (8, 2)],
+                         [(5, 3),
+                          (9, 3),
+                          (10, 3)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # Lnn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 1.4635792826230198),
+                          (4, 1.4635792826230198),
+                          (5, 2.19536892393453),
+                          (6, 1.4635792826230198),
+                          (7, 2.19536892393453),
+                          (8, 1.4635792826230198)],
+                         [(5, 3.627141918134611),
+                          (9, 1.8135709590673055),
+                          (10, 1.8135709590673055)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # Testing all the variations of `wglobal`
+        # ntn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 2.1699250014423126),
+                          (4, 2.1699250014423126),
+                          (5, 1.5849625007211563),
+                          (6, 2.1699250014423126),
+                          (7, 1.5849625007211563),
+                          (8, 2.1699250014423126)],
+                         [(5, 3.1699250014423126),
+                          (9, 1.5849625007211563),
+                          (10, 1.5849625007211563)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # npn
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 1.8073549220576042),
+                          (4, 1.8073549220576042),
+                          (5, 1.0),
+                          (6, 1.8073549220576042),
+                          (7, 1.0),
+                          (8, 1.8073549220576042)],
+                         [(5, 2.0),
+                          (9, 1.0),
+                          (10, 1.0)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        # Testing all the variations of `normalize`
+        # nnc
+        model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc')
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+        expected_docs = [[(3, 0.34299717028501764),
+                          (4, 0.34299717028501764),
+                          (5, 0.51449575542752646),
+                          (6, 0.34299717028501764),
+                          (7, 0.51449575542752646),
+                          (8, 0.34299717028501764)],
+                         [(5, 0.81649658092772603),
+                          (9, 0.40824829046386302),
+                          (10, 0.40824829046386302)]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
+
+        model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
+
+        transformed_docs = [model[docs[0]], model[docs[1]]]
+
+        model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
+        expected_docs = [model[docs[0]], model[docs[1]]]
+
+        self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
+        self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))
 
 
 if __name__ == '__main__':