diff --git a/docs/src/models/tfidfmodel.rst b/docs/src/models/tfidfmodel.rst index 6b622d7589..55907470d3 100644 --- a/docs/src/models/tfidfmodel.rst +++ b/docs/src/models/tfidfmodel.rst @@ -1,5 +1,5 @@ :mod:`models.tfidfmodel` -- TF-IDF model -====================================================== +======================================== .. automodule:: gensim.models.tfidfmodel :synopsis: TF-IDF model @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index 50320ad747..a61e993333 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -2,86 +2,293 @@ # -*- coding: utf-8 -*- # # Copyright (C) 2012 Radim Rehurek +# Copyright (C) 2017 Mohit Rathore # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html import logging -import math +from functools import partial from gensim import interfaces, matutils, utils from six import iteritems +import numpy as np logger = logging.getLogger(__name__) -def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): +def resolve_weights(smartirs): + """Checks for validity of `smartirs` parameter. + + Parameters + ---------- + smartirs : str + `smartirs` or SMART (System for the Mechanical Analysis and Retrieval of Text) + Information Retrieval System, a mnemonic scheme for denoting tf-idf weighting + variants in the vector space model. The mnemonic for representing a combination + of weights takes the form ddd, where the letters represents the term weighting of the document vector. + for more information visit [1]_. + + Returns + ------- + w_tf : str + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. + w_df : str + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. + w_n : str + Document normalization: + * `n` - none, + * `c` - cosine. + + Raises + ------ + ValueError + If `smartirs` is not a string of length 3 or one of the decomposed value + doesn't fit the list of permissible values + + References + ---------- + .. [1] https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System + """ - Compute default inverse-document-frequency for a term with document frequency `doc_freq`:: + if not isinstance(smartirs, str) or len(smartirs) != 3: + raise ValueError("Expected a string of length 3 except got " + smartirs) + + w_tf, w_df, w_n = smartirs + + if w_tf not in 'nlabL': + raise ValueError("Expected term frequency weight to be one of 'nlabL', except got {}".format(w_tf)) + + if w_df not in 'ntp': + raise ValueError("Expected inverse document frequency weight to be one of 'ntp', except got {}".format(w_df)) + + if w_n not in 'ncb': + raise ValueError("Expected normalization weight to be one of 'ncb', except got {}".format(w_n)) + + return w_tf, w_df, w_n + + +def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0): + """Compute default inverse-document-frequency for a term with document frequency: + :math:`idf = add + log_{log\_base} \\frac{totaldocs}{doc\_freq}` + + Parameters + ---------- + docfreq : float + Document frequency. + totaldocs : int + Total number of documents. + log_base : float, optional + Base of logarithm. + add : float, optional + Offset. + + Returns + ------- + float + Inverse document frequency. - idf = add + log(totaldocs / doc_freq) """ - return add + math.log(1.0 * totaldocs / docfreq, log_base) + return add + np.log(float(totaldocs) / docfreq) / np.log(log_base) def precompute_idfs(wglobal, dfs, total_docs): - """Precompute the inverse document frequency mapping for all terms.""" + """Pre-compute the inverse document frequency mapping for all terms. + + Parameters + ---------- + wglobal : function + Custom function for calculation idf, look at "universal" :func:`~gensim.models.tfidfmodel.updated_wglobal`. + dfs : dict + Dictionary with term_id and how many documents this token appeared. + total_docs : int + Total number of document. + + Returns + ------- + dict + Precomputed idfs in format {term_id_1: idfs_1, term_id_2: idfs_2, ...} + + """ # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} -class TfidfModel(interfaces.TransformationABC): - """ - Objects of this class realize the transformation between word-document co-occurrence - matrix (integers) into a locally/globally weighted TF_IDF matrix (positive floats). +def updated_wlocal(tf, n_tf): + """A scheme to transform `tf` or term frequency based on the value of `n_tf`. - The main methods are: + Parameters + ---------- + tf : int + Term frequency. + n_tf : {'n', 'l', 'a', 'b', 'L'} + Parameter to decide the current transformation scheme. - 1. constructor, which calculates inverse document counts for all terms in the training corpus. - 2. the [] method, which transforms a simple count representation into the TfIdf - space. + Returns + ------- + float + Calculated wlocal. - >>> tfidf = TfidfModel(corpus) - >>> print(tfidf[some_doc]) - >>> tfidf.save('/tmp/foo.tfidf_model') + """ + if n_tf == "n": + return tf + elif n_tf == "l": + return 1 + np.log(tf) / np.log(2) + elif n_tf == "a": + return 0.5 + (0.5 * tf / tf.max(axis=0)) + elif n_tf == "b": + return tf.astype('bool').astype('int') + elif n_tf == "L": + return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + + +def updated_wglobal(docfreq, totaldocs, n_df): + """A scheme to transform `docfreq` or document frequency based on the value of `n_df`. + + Parameters + ---------- + docfreq : int + Document frequency. + totaldocs : int + Total number of documents. + n_df : {'n', 't', 'p'} + Parameter to decide the current transformation scheme. + + Returns + ------- + float + Calculated wglobal. - Model persistency is achieved via its load/save methods. """ + if n_df == "n": + return utils.identity(docfreq) + elif n_df == "t": + return np.log(1.0 * totaldocs / docfreq) / np.log(2) + elif n_df == "p": + return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) - def __init__(self, corpus=None, id2word=None, dictionary=None, - wlocal=utils.identity, wglobal=df2idf, normalize=True): - """ - Compute tf-idf by multiplying a local component (term frequency) with a - global component (inverse document frequency), and normalizing - the resulting documents to unit length. Formula for unnormalized weight - of term `i` in document `j` in a corpus of D documents:: - weight_{i,j} = frequency_{i,j} * log_2(D / document_freq_{i}) +def updated_normalize(x, n_n): + """Normalizes the final tf-idf value according to the value of `n_n`. + + Parameters + ---------- + x : numpy.ndarray + Input array + n_n : {'n', 'c'} + Parameter that decides the normalizing function to be used. - or, more generally:: + Returns + ------- + numpy.ndarray + Normalized array. - weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document_freq_{i}, D) + """ + if n_n == "n": + return x + elif n_n == "c": + return matutils.unitvec(x) - so you can plug in your own custom `wlocal` and `wglobal` functions. - Default for `wlocal` is identity (other options: math.sqrt, math.log1p, ...) - and default for `wglobal` is `log_2(total_docs / doc_freq)`, giving the - formula above. +class TfidfModel(interfaces.TransformationABC): + """Objects of this class realize the transformation between word-document co-occurrence matrix (int) + into a locally/globally weighted TF_IDF matrix (positive floats). + + Examples + -------- + >>> import gensim.downloader as api + >>> from gensim.models import TfidfModel + >>> from gensim.corpora import Dictionary + >>> + >>> dataset = api.load("text8") + >>> dct = Dictionary(dataset) # fit dictionary + >>> corpus = [dct.doc2bow(line) for line in dataset] # convert dataset to BoW format + >>> + >>> model = TfidfModel(corpus) # fit model + >>> vector = model[corpus[0]] # apply model + + """ - `normalize` dictates how the final transformed vectors will be normalized. - `normalize=True` means set to unit length (default); `False` means don't - normalize. You can also set `normalize` to your own function that accepts - and returns a sparse vector. + def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity, + wglobal=df2idf, normalize=True, smartirs=None): + """Compute tf-idf by multiplying a local component (term frequency) with a global component + (inverse document frequency), and normalizing the resulting documents to unit length. + Formula for non-normalized weight of term :math:`i` in document :math:`j` in a corpus of :math:`D` documents + + .. math:: weight_{i,j} = frequency_{i,j} * log_2 \\frac{D}{document\_freq_{i}} + + or, more generally + + .. math:: weight_{i,j} = wlocal(frequency_{i,j}) * wglobal(document\_freq_{i}, D) + + so you can plug in your own custom :math:`wlocal` and :math:`wglobal` functions. + + Parameters + ---------- + corpus : iterable of iterable of (int, int), optional + Input corpus + id2word : {dict, :class:`~gensim.corpora.Dictionary`}, optional + Mapping token - id, that was used for converting input data to bag of words format. + dictionary : :class:`~gensim.corpora.Dictionary` + If `dictionary` is specified, it must be a `corpora.Dictionary` object and it will be used. + to directly construct the inverse document frequency mapping (then `corpus`, if specified, is ignored). + wlocals : function, optional + Function for local weighting, default for `wlocal` is :func:`~gensim.utils.identity` + (other options: :func:`math.sqrt`, :func:`math.log1p`, etc). + wglobal : function, optional + Function for global weighting, default is :func:`~gensim.models.tfidfmodel.df2idf`. + normalize : bool, optional + It dictates how the final transformed vectors will be normalized. `normalize=True` means set to unit length + (default); `False` means don't normalize. You can also set `normalize` to your own function that accepts + and returns a sparse vector. + smartirs : str, optional + SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System, + a mnemonic scheme for denoting tf-idf weighting variants in the vector space model. + The mnemonic for representing a combination of weights takes the form XYZ, + for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector. + + Term frequency weighing: + * `n` - natural, + * `l` - logarithm, + * `a` - augmented, + * `b` - boolean, + * `L` - log average. + + Document frequency weighting: + * `n` - none, + * `t` - idf, + * `p` - prob idf. + + Document normalization: + * `n` - none, + * `c` - cosine. + + For more information visit [1]_. - If `dictionary` is specified, it must be a `corpora.Dictionary` object - and it will be used to directly construct the inverse document frequency - mapping (then `corpus`, if specified, is ignored). """ - self.normalize = normalize + self.id2word = id2word - self.wlocal, self.wglobal = wlocal, wglobal + self.wlocal, self.wglobal, self.normalize = wlocal, wglobal, normalize self.num_docs, self.num_nnz, self.idfs = None, None, None + self.smartirs = smartirs + + # If smartirs is not None, override wlocal, wglobal and normalize + if smartirs is not None: + n_tf, n_df, n_n = resolve_weights(smartirs) + + self.wlocal = partial(updated_wlocal, n_tf=n_tf) + self.wglobal = partial(updated_wglobal, n_df=n_df) + self.normalize = partial(updated_normalize, n_n=n_n) + if dictionary is not None: # user supplied a Dictionary object, which already contains all the # statistics we need to construct the IDF mapping. we can skip the @@ -106,13 +313,18 @@ def __str__(self): return "TfidfModel(num_docs=%s, num_nnz=%s)" % (self.num_docs, self.num_nnz) def initialize(self, corpus): - """ - Compute inverse document weights, which will be used to modify term - frequencies for documents. + """Compute inverse document weights, which will be used to modify term frequencies for documents. + + Parameters + ---------- + corpus : iterable of iterable of (int, int) + Input corpus. + """ logger.info("collecting document frequencies") dfs = {} numnnz, docno = 0, -1 + for docno, bow in enumerate(corpus): if docno % 10000 == 0: logger.info("PROGRESS: processing document #%i", docno) @@ -124,7 +336,6 @@ def initialize(self, corpus): self.num_docs = docno + 1 self.num_nnz = numnnz self.dfs = dfs - # and finally compute the idf weights n_features = max(dfs) if dfs else 0 logger.info( @@ -134,8 +345,20 @@ def initialize(self, corpus): self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) def __getitem__(self, bow, eps=1e-12): - """ - Return tf-idf representation of the input vector and/or corpus. + """Get tf-idf representation of the input vector and/or corpus. + + bow : {list of (int, int), iterable of iterable of (int, int)} + Input document or copus in BoW format. + eps : float + Threshold value, will remove all position that have tfidf-value less than `eps`. + + Returns + ------- + vector : list of (int, float) + TfIdf vector, if `bow` is document **OR** + :class:`~gensim.interfaces.TransformedCorpus` + TfIdf corpus, if `bow` is corpus. + """ # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) @@ -144,17 +367,27 @@ def __getitem__(self, bow, eps=1e-12): # unknown (new) terms will be given zero weight (NOT infinity/huge weight, # as strict application of the IDF formula would dictate) + + termid_array, tf_array = [], [] + for termid, tf in bow: + termid_array.append(termid) + tf_array.append(tf) + + tf_array = self.wlocal(np.array(tf_array)) + vector = [ - (termid, self.wlocal(tf) * self.idfs.get(termid)) - for termid, tf in bow if self.idfs.get(termid, 0.0) != 0.0 + (termid, tf * self.idfs.get(termid)) + for termid, tf in zip(termid_array, tf_array) if abs(self.idfs.get(termid, 0.0)) > eps ] + if self.normalize is True: + self.normalize = matutils.unitvec + elif self.normalize is False: + self.normalize = utils.identity + # and finally, normalize the vector either to unit length, or use a # user-defined normalization function - if self.normalize is True: - vector = matutils.unitvec(vector) - elif self.normalize: - vector = self.normalize(vector) + vector = self.normalize(vector) # make sure there are no explicit zeroes in the vector (must be sparse) vector = [(termid, weight) for termid, weight in vector if abs(weight) > eps] diff --git a/gensim/sklearn_api/tfidf.py b/gensim/sklearn_api/tfidf.py index c0a45f1823..dc4ab86c01 100644 --- a/gensim/sklearn_api/tfidf.py +++ b/gensim/sklearn_api/tfidf.py @@ -12,8 +12,8 @@ from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError -import gensim from gensim.models import TfidfModel +import gensim class TfIdfTransformer(TransformerMixin, BaseEstimator): @@ -22,7 +22,7 @@ class TfIdfTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, - wglobal=gensim.models.tfidfmodel.df2idf, normalize=True): + wglobal=gensim.models.tfidfmodel.df2idf, normalize=True, smartirs="ntc"): """ Sklearn wrapper for Tf-Idf model. """ @@ -32,14 +32,15 @@ def __init__(self, id2word=None, dictionary=None, wlocal=gensim.utils.identity, self.wlocal = wlocal self.wglobal = wglobal self.normalize = normalize + self.smartirs = smartirs def fit(self, X, y=None): """ Fit the model according to the given training data. """ self.gensim_model = TfidfModel( - corpus=X, id2word=self.id2word, dictionary=self.dictionary, - wlocal=self.wlocal, wglobal=self.wglobal, normalize=self.normalize + corpus=X, id2word=self.id2word, dictionary=self.dictionary, wlocal=self.wlocal, + wglobal=self.wglobal, normalize=self.normalize, smartirs=self.smartirs, ) return self diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst new file mode 100644 index 0000000000..e9e5f3f3cf Binary files /dev/null and b/gensim/test/test_data/tfidf_model.tst differ diff --git a/gensim/test/test_data/tfidf_model.tst.bz2 b/gensim/test/test_data/tfidf_model.tst.bz2 new file mode 100644 index 0000000000..1cb3b2513f Binary files /dev/null and b/gensim/test/test_data/tfidf_model.tst.bz2 differ diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 3793c79948..5e0511aa5c 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -973,13 +973,13 @@ def testTransform(self): def testSetGetParams(self): # updating only one param - self.model.set_params(normalize=False) + self.model.set_params(smartirs='nnn') model_params = self.model.get_params() - self.assertEqual(model_params["normalize"], False) + self.assertEqual(model_params["smartirs"], 'nnn') # verify that the attributes values are also changed for `gensim_model` after fitting self.model.fit(self.corpus) - self.assertEqual(getattr(self.model.gensim_model, 'normalize'), False) + self.assertEqual(getattr(self.model.gensim_model, 'smartirs'), 'nnn') def testPipeline(self): with open(datapath('mini_newsgroup'), 'rb') as f: diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index c308923c29..35fbb4a4af 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -18,6 +18,22 @@ from gensim.models import tfidfmodel from gensim.test.utils import datapath, get_tmpfile, common_dictionary, common_corpus +from gensim.corpora import Dictionary + +texts = [ + ['complier', 'system', 'computer'], + ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'], + ['graph', 'flow', 'network', 'graph'], + ['loading', 'computer', 'system'], + ['user', 'server', 'system'], + ['tree', 'hamiltonian'], + ['graph', 'trees'], + ['computer', 'kernel', 'malfunction', 'computer'], + ['server', 'system', 'computer'], +] +dictionary = Dictionary(texts) +corpus = [dictionary.doc2bow(text) for text in texts] + class TestTfidfModel(unittest.TestCase): def setUp(self): @@ -50,23 +66,222 @@ def testInit(self): self.assertEqual(model1.idfs, model2.idfs) def testPersistence(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector + + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector + + # Test persistence between Gensim v3.2.0 and current model. + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) + self.assertTrue(model3.idfs == model4.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector def testPersistenceCompressed(self): + # Test persistence without using `smartirs` fname = get_tmpfile('gensim_models.tst.gz') model = tfidfmodel.TfidfModel(self.corpus, normalize=True) model.save(fname) model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) self.assertTrue(model.idfs == model2.idfs) - tstvec = [] - self.assertTrue(np.allclose(model[tstvec], model2[tstvec])) # try projecting an empty vector -# endclass TestTfidfModel + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector + + # Test persistence with using `smartirs` + fname = get_tmpfile('gensim_models_smartirs.tst.gz') + model = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model.save(fname) + model2 = tfidfmodel.TfidfModel.load(fname, mmap=None) + self.assertTrue(model.idfs == model2.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model[tstvec[0]], model2[tstvec[0]])) + self.assertTrue(np.allclose(model[tstvec[1]], model2[tstvec[1]])) + self.assertTrue(np.allclose(model[[]], model2[[]])) # try projecting an empty vector + + # Test persistence between Gensim v3.2.0 and current compressed model. + model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") + model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) + self.assertTrue(model3.idfs == model4.idfs) + tstvec = [corpus[1], corpus[2]] + self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) + self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) + self.assertTrue(np.allclose(model3[[]], model4[[]])) # try projecting an empty vector + + def TestConsistency(self): + docs = [corpus[1], corpus[2]] + + # Test if `ntc` yields the default docs. + model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc') + transformed_docs = [model[docs[0]], model[docs[1]]] + + model = tfidfmodel.TfidfModel(self.corpus) + expected_docs = [model[docs[0]], model[docs[1]]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `wlocal` + # nnn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 6), + (9, 3), + (10, 3)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # lnn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 6.0), + (9, 3.0), + (10, 3.0)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # ann + model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 2.0), + (4, 2.0), + (5, 3.0), + (6, 2.0), + (7, 3.0), + (8, 2.0)], + [(5, 3.0), + (9, 2.25), + (10, 2.25)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # bnn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 2), + (4, 2), + (5, 3), + (6, 2), + (7, 3), + (8, 2)], + [(5, 3), + (9, 3), + (10, 3)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # Lnn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 1.4635792826230198), + (4, 1.4635792826230198), + (5, 2.19536892393453), + (6, 1.4635792826230198), + (7, 2.19536892393453), + (8, 1.4635792826230198)], + [(5, 3.627141918134611), + (9, 1.8135709590673055), + (10, 1.8135709590673055)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `glocal` + # ntn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 2.1699250014423126), + (4, 2.1699250014423126), + (5, 1.5849625007211563), + (6, 2.1699250014423126), + (7, 1.5849625007211563), + (8, 2.1699250014423126)], + [(5, 3.1699250014423126), + (9, 1.5849625007211563), + (10, 1.5849625007211563)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # npn + model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 1.8073549220576042), + (4, 1.8073549220576042), + (5, 1.0), + (6, 1.8073549220576042), + (7, 1.0), + (8, 1.8073549220576042)], + [(5, 2.0), + (9, 1.0), + (10, 1.0)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + # Testing all the variations of `normalize` + # nnc + model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc') + transformed_docs = [model[docs[0]], model[docs[1]]] + expected_docs = [[(3, 0.34299717028501764), + (4, 0.34299717028501764), + (5, 0.51449575542752646), + (6, 0.34299717028501764), + (7, 0.51449575542752646), + (8, 0.34299717028501764)], + [(5, 0.81649658092772603), + (9, 0.40824829046386302), + (10, 0.40824829046386302)]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) + + model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc') + + transformed_docs = [model[docs[0]], model[docs[1]]] + + model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc') + expected_docs = [model[docs[0]], model[docs[1]]] + + self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) + self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) if __name__ == '__main__':