-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add smart information retrieval system for TfidfModel
. Fix #1785
#1791
Changes from 17 commits
5e1830b
6cef4b1
e8a3f16
648bf21
a6f1afb
d091138
951c549
40c0558
b35344c
0917e75
bef79cc
d3d431c
0e6f21e
7ee7560
f2251a4
b2def84
5b2d37a
ac4b154
0bacc08
51e0eb9
3039732
99e6a6f
7d63d9c
e5140f8
4afbadd
d2fe235
52ee3c4
48e84f7
6d2f47b
607ba61
d0878a4
b544c9c
c4e3656
98ffde5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,51 +6,100 @@ | |
|
||
|
||
import logging | ||
import math | ||
from functools import partial | ||
|
||
from gensim import interfaces, matutils, utils | ||
from six import iteritems | ||
|
||
import numpy as np | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def resolve_weights(smartirs):
    """
    Check the validity of a SMART information-retrieval scheme string.

    Parameters
    ----------
    smartirs : str
        A 3-letter mnemonic of the form ``ddd``: term-frequency weight,
        document-frequency weight, and document normalization, in that order.

    Returns
    -------
    (str, str, str)
        The three individual weight letters ``(w_tf, w_df, w_n)``.

    Raises
    ------
    ValueError
        If `smartirs` is not a string of length 3, or any letter is not a
        supported scheme.
    """
    # Use %r formatting instead of string concatenation: concatenating a
    # non-string argument would raise TypeError and mask the ValueError.
    if not isinstance(smartirs, str) or len(smartirs) != 3:
        raise ValueError("Expected a string of length 3, but got %r" % (smartirs,))

    w_tf, w_df, w_n = smartirs

    if w_tf not in 'nlabL':
        raise ValueError("Expected term frequency weight to be one of 'nlabL', but got %r" % (w_tf,))

    if w_df not in 'ntp':
        raise ValueError("Expected inverse document frequency weight to be one of 'ntp', but got %r" % (w_df,))

    if w_n not in 'ncb':
        raise ValueError("Expected normalization weight to be one of 'ncb', but got %r" % (w_n,))

    return w_tf, w_df, w_n
|
||
|
||
def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    """
    Compute the default inverse-document-frequency for a term::

        idf = add + log_{log_base}(totaldocs / docfreq)

    Parameters
    ----------
    docfreq : int
        Number of documents the term appears in (must be > 0).
    totaldocs : int
        Total number of documents in the corpus.
    log_base : float, optional
        Base of the logarithm, default 2.0.
    add : float, optional
        Constant offset added to the result, default 0.0.

    Returns
    -------
    float
        The idf weight for the term.
    """
    # np.log is used (rather than math.log) for consistency with the other
    # weighting helpers in this module; dividing by np.log(log_base)
    # implements the change of base.
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What's a reason to use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consistency, I am using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No problem this is only a question :) |
||
|
||
|
||
def precompute_idfs(wglobal, dfs, total_docs):
    """
    Precompute the inverse document frequency mapping for all terms.

    Parameters
    ----------
    wglobal : callable
        Global weighting function with signature ``wglobal(docfreq, total_docs)``.
    dfs : dict of (int, int)
        Mapping from term id to the term's document frequency.
    total_docs : int
        Total number of documents in the corpus.

    Returns
    -------
    dict of (int, float)
        Mapping from term id to its precomputed idf weight.
    """
    # Not strictly necessary -- idfs could be computed on the fly in
    # TfidfModel.__getitem__; precomputing here just speeds things up a little.
    # dict.items() behaves correctly on both py2 (list) and py3 (view),
    # so the six.iteritems indirection is unnecessary for this helper.
    return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}
|
||
|
||
def updated_wlocal(tf, n_tf):
    """
    Apply the SMART term-frequency weighting scheme `n_tf` to raw counts.

    Parameters
    ----------
    tf : numpy.ndarray
        Array of raw term frequencies for one document.
    n_tf : {'n', 'l', 'a', 'b', 'L'}
        Scheme: natural, logarithm, augmented, boolean, or log average.

    Returns
    -------
    numpy.ndarray
        The weighted term frequencies.

    Raises
    ------
    ValueError
        If `n_tf` is not a supported scheme.
    """
    if n_tf == "n":
        return tf
    elif n_tf == "l":
        return 1 + np.log2(tf)
    elif n_tf == "a":
        return 0.5 + (0.5 * tf / tf.max(axis=0))
    elif n_tf == "b":
        return tf.astype('bool').astype('int')
    elif n_tf == "L":
        # log average: (1 + log2(tf)) / (1 + log2(mean(tf))).
        # NOTE: the base-change divisor belongs OUTSIDE np.log in the
        # denominator; the previous code computed np.log(tf.mean() / np.log(2)).
        return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))
    # previously an unknown scheme fell through and silently returned None
    raise ValueError("Expected term frequency weight to be one of 'nlabL', but got %r" % (n_tf,))
|
||
|
||
def updated_wglobal(docfreq, totaldocs, n_df):
    """
    Apply the SMART document-frequency weighting scheme `n_df`.

    Parameters
    ----------
    docfreq : int
        Number of documents the term appears in.
    totaldocs : int
        Total number of documents in the corpus.
    n_df : {'n', 't', 'p'}
        Scheme: none, idf, or probabilistic idf.

    Returns
    -------
    float
        The global (document-frequency) weight for the term.

    Raises
    ------
    ValueError
        If `n_df` is not a supported scheme.
    """
    if n_df == "n":
        # 'none' is the identity; no need to route through utils.identity
        return docfreq
    elif n_df == "t":
        return np.log2(1.0 * totaldocs / docfreq)
    elif n_df == "p":
        return np.log2((1.0 * totaldocs - docfreq) / docfreq)
    # previously an unknown scheme fell through and silently returned None
    raise ValueError("Expected inverse document frequency weight to be one of 'ntp', but got %r" % (n_df,))
|
||
|
||
def updated_normalize(x, n_n):
    """
    Apply the SMART document-normalization scheme `n_n` to vector `x`.

    Parameters
    ----------
    x : list of (int, float)
        Document vector in sparse (termid, weight) format.
    n_n : {'n', 'c'}
        Scheme: 'n' -- no normalization, 'c' -- cosine (unit length).

    Returns
    -------
    list of (int, float)
        The (possibly normalized) vector.

    Raises
    ------
    ValueError
        If `n_n` is not an implemented scheme.
    """
    if n_n == "n":
        return x
    elif n_n == "c":
        return matutils.unitvec(x)
    # NOTE(review): resolve_weights also accepts 'b' for the normalization
    # slot, but no 'b' scheme is implemented here -- previously that case
    # fell through and silently returned None; fail loudly instead.
    raise ValueError("Expected normalization weight to be one of 'nc', but got %r" % (n_n,))
|
||
|
||
class TfidfModel(interfaces.TransformationABC): | ||
""" | ||
Objects of this class realize the transformation between word-document co-occurrence | ||
matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). | ||
|
||
The main methods are: | ||
|
||
1. constructor, which calculates inverse document counts for all terms in the training corpus. | ||
2. the [] method, which transforms a simple count representation into the TfIdf | ||
space. | ||
|
||
Examples | ||
-------- | ||
>>> tfidf = TfidfModel(corpus) | ||
>>> print(tfidf[some_doc]) | ||
>>> tfidf.save('/tmp/foo.tfidf_model') | ||
|
||
Model persistency is achieved via its load/save methods. | ||
|
||
""" | ||
|
||
def __init__(self, corpus=None, id2word=None, dictionary=None, | ||
wlocal=utils.identity, wglobal=df2idf, normalize=True): | ||
def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, | ||
wglobal=df2idf, normalize=True, smartirs=None): | ||
""" | ||
Compute tf-idf by multiplying a local component (term frequency) with a | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you convert all docstrings in this file to numpy-style, according to my previous comment #1780 (comment) |
||
global component (inverse document frequency), and normalizing | ||
|
@@ -65,23 +114,64 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, | |
|
||
so you can plug in your own custom `wlocal` and `wglobal` functions. | ||
|
||
Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) | ||
and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the | ||
formula above. | ||
|
||
`normalize` dictates how the final transformed vectors will be normalized. | ||
`normalize=True` means set to unit length (default); `False` means don't | ||
normalize. You can also set `normalize` to your own function that accepts | ||
and returns a sparse vector. | ||
Parameters | ||
---------- | ||
corpus : dictionary.doc2bow | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. type should be |
||
Corpus is a list of sets where each set has two elements. First being the termid and | ||
second being the term frequency of each term in the document. | ||
id2word : dict | ||
id2word is an optional dictionary that maps the word_id to a token. | ||
In case id2word isn’t specified the mapping id2word[word_id] = str(word_id) will be used. | ||
dictionary :corpora.Dictionary | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. type should be
|
||
If `dictionary` is specified, it must be a `corpora.Dictionary` object | ||
and it will be used to directly construct the inverse document frequency | ||
mapping (then `corpus`, if specified, is ignored). | ||
wlocals : user specified function | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of
should be
everywhere |
||
Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) | ||
wglobal : user specified function | ||
Default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the | ||
formula above. | ||
normalize : user specified function | ||
It dictates how the final transformed vectors will be normalized. | ||
`normalize=True` means set to unit length (default); `False` means don't | ||
normalize. You can also set `normalize` to your own function that accepts | ||
and returns a sparse vector. | ||
smartirs : {'None' ,'str'} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. str, optional |
||
`smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) | ||
Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting | ||
variants in the vector space model. The mnemonic for representing a combination | ||
of weights takes the form ddd, where the letters represents the term weighting | ||
of the document vector. | ||
|
||
Term frequency weighing: | ||
natural - `n`, logarithm - `l` , augmented - `a`, boolean `b`, log average - `L`. | ||
Document frequency weighting: | ||
none - `n`, idf - `t`, prob idf - `p`. | ||
Document normalization: | ||
none - `n`, cosine - `c`. | ||
|
||
for more information visit https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System | ||
|
||
Returns | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
------- | ||
x : gensim.models.tfidfmodel.TfidfModel | ||
|
||
If `dictionary` is specified, it must be a `corpora.Dictionary` object | ||
and it will be used to directly construct the inverse document frequency | ||
mapping (then `corpus`, if specified, is ignored). | ||
""" | ||
self.normalize = normalize | ||
|
||
self.id2word = id2word | ||
self.wlocal, self.wglobal = wlocal, wglobal | ||
self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize | ||
self.num_docs, self.num_nnz, self.idfs = None, None, None | ||
self.smartirs = smartirs | ||
|
||
# If smartirs is not None, override wlocal, wglobal and normalize | ||
if smartirs is not None: | ||
n_tf, n_df, n_n = resolve_weights(smartirs) | ||
|
||
self.wlocal = partial(updated_wlocal, n_tf=n_tf) | ||
self.wglobal = partial(updated_wglobal, n_df=n_df) | ||
self.normalize = partial(updated_normalize, n_n=n_n) | ||
|
||
if dictionary is not None: | ||
# user supplied a Dictionary object, which already contains all the | ||
# statistics we need to construct the IDF mapping. we can skip the | ||
|
@@ -113,6 +203,7 @@ def initialize(self, corpus): | |
logger.info("collecting document frequencies") | ||
dfs = {} | ||
numnnz, docno = 0, -1 | ||
|
||
for docno, bow in enumerate(corpus): | ||
if docno % 10000 == 0: | ||
logger.info("PROGRESS: processing document #%i", docno) | ||
|
@@ -124,7 +215,6 @@ def initialize(self, corpus): | |
self.num_docs = docno + 1 | ||
self.num_nnz = numnnz | ||
self.dfs = dfs | ||
|
||
# and finally compute the idf weights | ||
n_features = max(dfs) if dfs else 0 | ||
logger.info( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why you remove this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This showed the progress of the |
||
|
@@ -144,17 +234,27 @@ def __getitem__(self, bow, eps=1e-12): | |
|
||
# unknown (new) terms will be given zero weight (NOT infinity/huge weight, | ||
# as strict application of the IDF formula would dictate) | ||
|
||
termid_array, tf_array = [], [] | ||
for termid, tf in bow: | ||
termid_array.append(termid) | ||
tf_array.append(tf) | ||
|
||
tf_array = self.wlocal(np.array(tf_array)) | ||
|
||
vector = [ | ||
(termid, self.wlocal(tf) * self.idfs.get(termid)) | ||
for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 | ||
(termid, tf * self.idfs.get(termid)) | ||
for termid, tf in zip(termid_array, tf_array) if self.idfs.get(termid, 0.0) != 0.0 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is brittle; better compare floats for equality using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm confused, #1791 (comment) and #1791 (comment) contradict each other, what you mean @piskvorky? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To compare floats for equality, use |
||
] | ||
|
||
if self.normalize is True: | ||
self.normalize = matutils.unitvec | ||
elif self.normalize is False: | ||
self.normalize = utils.identity | ||
|
||
# and finally, normalize the vector either to unit length, or use a | ||
# user-defined normalization function | ||
if self.normalize is True: | ||
vector = matutils.unitvec(vector) | ||
elif self.normalize: | ||
vector = self.normalize(vector) | ||
vector = self.normalize(vector) | ||
|
||
# make sure there are no explicit zeroes in the vector (must be sparse) | ||
vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -973,13 +973,13 @@ def testTransform(self): | |
|
||
def testSetGetParams(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't forget to add more tests (also, check situations, when you pass |
||
# updating only one param | ||
self.model.set_params(normalize=False) | ||
self.model.set_params(smartirs='nnn') | ||
model_params = self.model.get_params() | ||
self.assertEqual(model_params["normalize"], False) | ||
self.assertEqual(model_params["smartirs"], 'nnn') | ||
|
||
# verify that the attributes values are also changed for `gensim_model` after fitting | ||
self.model.fit(self.corpus) | ||
self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) | ||
self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') | ||
|
||
def testPipeline(self): | ||
with open(datapath('mini_newsgroup'), 'rb') as f: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
docstrings needed too (for all stuff here)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think that
Checks for validity of smartirs parameter.
is enough. Do you have anything else in mind as well?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@markroxor need to add "Parameters" (type, description), "Raises" (type, reason), "Returns" (type, description)