diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index a736849b4e..9a9edf24ed 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -245,7 +245,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False): # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! token2id[w] = len(token2id) - result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id} if allow_update: diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a9e12c995a..68e83d8e6f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -139,13 +139,13 @@ def updated_wlocal(tf, n_tf): if n_tf == "n": return tf elif n_tf == "l": - return 1 + np.log(tf) / np.log(2) + return 1 + np.log2(tf) elif n_tf == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) elif n_tf == "b": return tf.astype('bool').astype('int') elif n_tf == "L": - return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0))) def updated_wglobal(docfreq, totaldocs, n_df): @@ -166,12 +166,13 @@ def updated_wglobal(docfreq, totaldocs, n_df): Calculated wglobal. """ + if n_df == "n": - return utils.identity(docfreq) + return 1. elif n_df == "t": - return np.log(1.0 * totaldocs / docfreq) / np.log(2) + return np.log2(1.0 * totaldocs / docfreq) elif n_df == "p": - return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) + return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) def updated_normalize(x, n_n, return_norm=False): @@ -303,7 +304,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(updated_wlocal, n_tf=n_tf) self.wglobal = partial(updated_wglobal, n_df=n_df) # also return norm factor if pivot is not none @@ -371,7 +371,6 @@ def initialize(self, corpus): numnnz += len(bow) for termid, _ in bow: dfs[termid] = dfs.get(termid, 0) + 1 - # keep some stats about the training corpus self.num_docs = docno + 1 self.num_nnz = numnnz diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst index e9e5f3f3cf..8d3c60c73e 100644 Binary files a/gensim/test/test_data/tfidf_model.tst and b/gensim/test/test_data/tfidf_model.tst differ diff --git a/gensim/test/test_data/tfidf_model.tst.bz2 b/gensim/test/test_data/tfidf_model.tst.bz2 index 1cb3b2513f..f25e0399f4 100644 Binary files a/gensim/test/test_data/tfidf_model.tst.bz2 and b/gensim/test/test_data/tfidf_model.tst.bz2 differ diff --git a/gensim/test/test_tfidfmodel.py b/gensim/test/test_tfidfmodel.py index b15e892c9c..79e3742d48 100644 --- a/gensim/test/test_tfidfmodel.py +++ b/gensim/test/test_tfidfmodel.py @@ -20,6 +20,7 @@ from gensim.corpora import Dictionary + texts = [ ['complier', 'system', 'computer'], ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'], @@ -91,7 +92,9 @@ def test_persistence(self): # Test persistence between Gensim v3.2.0 and current model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) - self.assertTrue(model3.idfs == model4.idfs) + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] + self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) @@ -110,7 +113,9 @@ def test_persistence(self): # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1) model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst')) - self.assertTrue(model3.idfs == model4.idfs) + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] + self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) @@ -141,7 +146,9 @@ def test_persistence_compressed(self): # Test persistence between Gensim v3.2.0 and current compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc") model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) - self.assertTrue(model3.idfs == model4.idfs) + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] + self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) @@ -160,7 +167,9 @@ def test_persistence_compressed(self): # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model. model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1) model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2')) - self.assertTrue(model3.idfs == model4.idfs) + idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())] + idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())] + self.assertTrue(np.allclose(idfs3, idfs4)) tstvec = [corpus[1], corpus[2]] self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]])) self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]])) @@ -169,10 +178,10 @@ def test_consistency(self): docs = [corpus[1], corpus[2]] # Test if `ntc` yields the default docs. - model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc') + model = tfidfmodel.TfidfModel(corpus, smartirs='ntc') transformed_docs = [model[docs[0]], model[docs[1]]] - model = tfidfmodel.TfidfModel(self.corpus) + model = tfidfmodel.TfidfModel(corpus) expected_docs = [model[docs[0]], model[docs[1]]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) @@ -180,59 +189,56 @@ def test_consistency(self): # Testing all the variations of `wlocal` # nnn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn') + model = tfidfmodel.TfidfModel(corpus, smartirs='nnn') transformed_docs = [model[docs[0]], model[docs[1]]] - expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 6), (9, 3), (10, 3)] - ] + expected_docs = docs[:] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # lnn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn') + model = tfidfmodel.TfidfModel(corpus, smartirs='lnn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 6.0), (9, 3.0), (10, 3.0)] + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 2.0), (9, 1.0), (10, 1.0)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # ann - model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann') + model = tfidfmodel.TfidfModel(corpus, smartirs='ann') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)], - [(5, 3.0), (9, 2.25), (10, 2.25)] + [(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)], + [(5, 1.0), (9, 0.75), (10, 0.75)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # bnn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn') + model = tfidfmodel.TfidfModel(corpus, smartirs='bnn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ - [(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)], - [(5, 3), (9, 3), (10, 3)] + [(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)], + [(5, 1), (9, 1), (10, 1)] ] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # Lnn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn') + model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 1.4635792826230198), (4, 1.4635792826230198), (5, 2.19536892393453), (6, 1.4635792826230198), - (7, 2.19536892393453), (8, 1.4635792826230198) + (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), + (7, 1.0), (8, 1.0) ], [ - (5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055) + (5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526) ] ] @@ -241,15 +247,15 @@ def test_consistency(self): # Testing all the variations of `glocal` # ntn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn') + model = tfidfmodel.TfidfModel(corpus, smartirs='ntn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126), - (7, 1.5849625007211563), (8, 2.1699250014423126) + (3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156), (6, 3.169925001442312), + (7, 3.169925001442312), (8, 2.169925001442312) ], [ - (5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563) + (5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312) ] ] @@ -257,15 +263,15 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) # npn - model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn') + model = tfidfmodel.TfidfModel(corpus, smartirs='npn') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042), - (7, 1.0), (8, 1.8073549220576042) + (3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0), + (7, 3.0), (8, 1.8073549220576042) ], [ - (5, 2.0), (9, 1.0), (10, 1.0) + (5, 2.0), (9, 3.0), (10, 3.0) ] ] @@ -274,12 +280,12 @@ def test_consistency(self): # Testing all the variations of `normalize` # nnc - model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc') + model = tfidfmodel.TfidfModel(corpus, smartirs='nnc') transformed_docs = [model[docs[0]], model[docs[1]]] expected_docs = [ [ - (3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764), - (7, 0.51449575542752646), (8, 0.34299717028501764) + (3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905), (6, 0.4082482905), + (7, 0.4082482905), (8, 0.4082482905) ], [ (5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302) @@ -289,11 +295,11 @@ def test_consistency(self): self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0])) self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1])) - model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc') + model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc') transformed_docs = [model[docs[0]], model[docs[1]]] - model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc') + model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc') expected_docs = [model[docs[0]], model[docs[1]]] self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))