Skip to content

Commit

Permalink
Fix SMART from TfidfModel for case when df == "n". Fix #2020 (#2021)
Browse files Browse the repository at this point in the history
* Added Montemurro and Zanette's entropy-based keyword extraction algorithm

* Improved Docstrings

* Fixed numerical bugs due to zero frequencies

* Coding style changes, test and tutorial

* I hate git

* Summarization tutorial

* Fixed some failing tests

* Tests, demo, nan_to_num and a few last flake8 issues

* Further flake8 issues

* Further flake8 issues

* Removed Jupyter checkpoint

* Removed trailing whitespace

* Trailing whitespace

* Speed up test and add comment to explain threshold value

* Flake8 again

* rename vars + style fixes

* fix operation order

* Update docs with Montemurro and Zanette's algorithm

* Revert "Update docs with Montemurro and Zanette's algorithm"

This reverts commit 6add3ba.

* Fixed bug in TfidfModel, as described in Issue #2020

* Fix return type

* Updated unit tests for TfidfModel

* Updated unit tests for TfidfModel

* Changed log(x)/log(2) to log2(x) since this is clearer. Fixed the placement of a parenthesis. Updated predicted values for unit tests

* Fixed persistence tests

* Flake 8
  • Loading branch information
PeteBleackley authored and menshikh-iv committed Apr 10, 2018
1 parent 49e6abd commit 06f5f5c
Show file tree
Hide file tree
Showing 5 changed files with 49 additions and 45 deletions.
1 change: 0 additions & 1 deletion gensim/corpora/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)

result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}

if allow_update:
Expand Down
13 changes: 6 additions & 7 deletions gensim/models/tfidfmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,13 +139,13 @@ def updated_wlocal(tf, n_tf):
if n_tf == "n":
return tf
elif n_tf == "l":
return 1 + np.log(tf) / np.log(2)
return 1 + np.log2(tf)
elif n_tf == "a":
return 0.5 + (0.5 * tf / tf.max(axis=0))
elif n_tf == "b":
return tf.astype('bool').astype('int')
elif n_tf == "L":
return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2)))
return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))


def updated_wglobal(docfreq, totaldocs, n_df):
Expand All @@ -166,12 +166,13 @@ def updated_wglobal(docfreq, totaldocs, n_df):
Calculated wglobal.
"""

if n_df == "n":
return utils.identity(docfreq)
return 1.
elif n_df == "t":
return np.log(1.0 * totaldocs / docfreq) / np.log(2)
return np.log2(1.0 * totaldocs / docfreq)
elif n_df == "p":
return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2)
return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq))


def updated_normalize(x, n_n, return_norm=False):
Expand Down Expand Up @@ -303,7 +304,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
# If smartirs is not None, override wlocal, wglobal and normalize
if smartirs is not None:
n_tf, n_df, n_n = resolve_weights(smartirs)

self.wlocal = partial(updated_wlocal, n_tf=n_tf)
self.wglobal = partial(updated_wglobal, n_df=n_df)
# also return norm factor if pivot is not none
Expand Down Expand Up @@ -371,7 +371,6 @@ def initialize(self, corpus):
numnnz += len(bow)
for termid, _ in bow:
dfs[termid] = dfs.get(termid, 0) + 1

# keep some stats about the training corpus
self.num_docs = docno + 1
self.num_nnz = numnnz
Expand Down
Binary file modified gensim/test/test_data/tfidf_model.tst
Binary file not shown.
Binary file modified gensim/test/test_data/tfidf_model.tst.bz2
Binary file not shown.
80 changes: 43 additions & 37 deletions gensim/test/test_tfidfmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

from gensim.corpora import Dictionary


texts = [
['complier', 'system', 'computer'],
['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
Expand Down Expand Up @@ -91,7 +92,9 @@ def test_persistence(self):
# Test persistence between Gensim v3.2.0 and current model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
self.assertTrue(model3.idfs == model4.idfs)
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
Expand All @@ -110,7 +113,9 @@ def test_persistence(self):
# Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
self.assertTrue(model3.idfs == model4.idfs)
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
Expand Down Expand Up @@ -141,7 +146,9 @@ def test_persistence_compressed(self):
# Test persistence between Gensim v3.2.0 and current compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
self.assertTrue(model3.idfs == model4.idfs)
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
Expand All @@ -160,7 +167,9 @@ def test_persistence_compressed(self):
# Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
self.assertTrue(model3.idfs == model4.idfs)
idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
self.assertTrue(np.allclose(idfs3, idfs4))
tstvec = [corpus[1], corpus[2]]
self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
Expand All @@ -169,70 +178,67 @@ def test_consistency(self):
docs = [corpus[1], corpus[2]]

# Test if `ntc` yields the default docs.
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc')
model = tfidfmodel.TfidfModel(corpus, smartirs='ntc')
transformed_docs = [model[docs[0]], model[docs[1]]]

model = tfidfmodel.TfidfModel(self.corpus)
model = tfidfmodel.TfidfModel(corpus)
expected_docs = [model[docs[0]], model[docs[1]]]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# Testing all the variations of `wlocal`
# nnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 6), (9, 3), (10, 3)]
]
expected_docs = docs[:]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# lnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 6.0), (9, 3.0), (10, 3.0)]
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 2.0), (9, 1.0), (10, 1.0)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# ann
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann')
model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 3.0), (9, 2.25), (10, 2.25)]
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 1.0), (9, 0.75), (10, 0.75)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# bnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 3), (9, 3), (10, 3)]
[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
[(5, 1), (9, 1), (10, 1)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# Lnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 1.4635792826230198), (4, 1.4635792826230198), (5, 2.19536892393453), (6, 1.4635792826230198),
(7, 2.19536892393453), (8, 1.4635792826230198)
(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0),
(7, 1.0), (8, 1.0)
],
[
(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)
(5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526)
]
]

Expand All @@ -241,31 +247,31 @@ def test_consistency(self):

# Testing all the variations of `glocal`
# ntn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn')
model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126),
(7, 1.5849625007211563), (8, 2.1699250014423126)
(3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156), (6, 3.169925001442312),
(7, 3.169925001442312), (8, 2.169925001442312)
],
[
(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)
(5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312)
]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# npn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn')
model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042),
(7, 1.0), (8, 1.8073549220576042)
(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0),
(7, 3.0), (8, 1.8073549220576042)
],
[
(5, 2.0), (9, 1.0), (10, 1.0)
(5, 2.0), (9, 3.0), (10, 3.0)
]
]

Expand All @@ -274,12 +280,12 @@ def test_consistency(self):

# Testing all the variations of `normalize`
# nnc
model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764),
(7, 0.51449575542752646), (8, 0.34299717028501764)
(3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905), (6, 0.4082482905),
(7, 0.4082482905), (8, 0.4082482905)
],
[
(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)
Expand All @@ -289,11 +295,11 @@ def test_consistency(self):
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')

transformed_docs = [model[docs[0]], model[docs[1]]]

model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
expected_docs = [model[docs[0]], model[docs[1]]]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
Expand Down

0 comments on commit 06f5f5c

Please sign in to comment.