Fix SMART from TfidfModel for case when df == "n". Fix #2020 #2021

Merged: 31 commits, Apr 10, 2018

Commits
3160dea
Added Montemurro and Zanette's entropy-based keyword extraction algor…
PeteBleackley Nov 23, 2017
0550651
Improved Docstrings
PeteBleackley Nov 23, 2017
a072f93
Fixed numerical bugs due to zero frequencies
PeteBleackley Nov 23, 2017
02428fa
Merge branch 'develop' into develop
menshikh-iv Nov 24, 2017
c8a3792
Coding style changes, test and tutorial
PeteBleackley Nov 27, 2017
8a8264e
Trying to fix a merge conflict
PeteBleackley Nov 27, 2017
e763f3c
I hate git
PeteBleackley Nov 27, 2017
195c3c0
Summarization tutorial
PeteBleackley Nov 27, 2017
5b9a3ad
Fixed some failing tests
PeteBleackley Nov 27, 2017
4c2d8de
Tests, demo, nan_to_num and a few last flake8 issues
PeteBleackley Nov 28, 2017
d9c290a
Further flake8 issues
PeteBleackley Nov 28, 2017
8809e5a
Further flake8 issues
PeteBleackley Nov 28, 2017
a97fd82
Removed Jupyter checkpoint
PeteBleackley Nov 28, 2017
0d4e31c
Removed trailing whitespace
PeteBleackley Nov 28, 2017
4d18223
Trailing whitespace
PeteBleackley Nov 28, 2017
dc42cee
Speed up test and add comment to explain threshold value
PeteBleackley Nov 29, 2017
fdddf02
Flake8 again
PeteBleackley Nov 29, 2017
86db65c
rename vars + style fixes
menshikh-iv Nov 30, 2017
28ae7cb
fix operation order
menshikh-iv Nov 30, 2017
6add3ba
Update docs with Montemurro and Zanette's algorithm
PeteBleackley Jan 3, 2018
86590bb
Revert "Update docs with Montemurro and Zanette's algorithm"
PeteBleackley Jan 4, 2018
eb65041
Merge remote-tracking branch 'upstream/master' into develop
PeteBleackley Jan 4, 2018
7711451
Merge remote-tracking branch 'upstream/develop' into develop
PeteBleackley Apr 6, 2018
bdc1a6d
Fixed bug in TfidfModel, as described in Issue #2020
PeteBleackley Apr 6, 2018
590b52a
Fix return type
menshikh-iv Apr 6, 2018
3b64a78
Updated unit tests for TfidfModel
PeteBleackley Apr 6, 2018
3588b88
Merge branch 'develop' of https://github.com/PeteBleackley/gensim int…
PeteBleackley Apr 6, 2018
5f9aa93
Updated unit tests for TfidfModel
PeteBleackley Apr 6, 2018
fdab1e8
Changed log(x)/log(2) to log2(x) since this is clearer. Fixed the pla…
PeteBleackley Apr 7, 2018
d2f3ea8
Fixed persistence tests
PeteBleackley Apr 8, 2018
5aff83b
Flake 8
PeteBleackley Apr 8, 2018
1 change: 0 additions & 1 deletion gensim/corpora/dictionary.py
@@ -245,7 +245,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)

result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}

if allow_update:
13 changes: 6 additions & 7 deletions gensim/models/tfidfmodel.py
@@ -139,13 +139,13 @@ def updated_wlocal(tf, n_tf):
     if n_tf == "n":
         return tf
     elif n_tf == "l":
-        return 1 + np.log(tf) / np.log(2)
+        return 1 + np.log2(tf)
     elif n_tf == "a":
         return 0.5 + (0.5 * tf / tf.max(axis=0))
     elif n_tf == "b":
         return tf.astype('bool').astype('int')
     elif n_tf == "L":
-        return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2)))
+        return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))


 def updated_wglobal(docfreq, totaldocs, n_df):
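
A quick sanity check of the two wlocal changes above (a sketch with a made-up term-frequency vector, not part of the diff): the "l" branch only rewrites log(x)/log(2) as the equivalent log2(x), while the "L" branch additionally fixes a misplaced parenthesis: in the old expression the division by np.log(2) sat inside the logarithm of the mean, so the denominator was not a base-2 logarithm at all.

import numpy as np

tf = np.array([2.0, 1.0, 1.0])  # hypothetical term frequencies of one document

# "l": pure change of base, old and new expressions agree.
assert np.allclose(1 + np.log(tf) / np.log(2), 1 + np.log2(tf))

# "L": the old denominator computed log(mean / log(2)) rather than log2(mean).
old_L = (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2)))
new_L = (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0)))
print(new_L)  # ~[1.4134, 0.7067, 0.7067], matching the updated Lnn expectations in the tests
print(old_L)  # noticeably different, because the division by log(2) happened inside the log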
@@ -166,12 +166,13 @@ def updated_wglobal(docfreq, totaldocs, n_df):
         Calculated wglobal.

     """
+
     if n_df == "n":
-        return utils.identity(docfreq)
+        return 1.
     elif n_df == "t":
-        return np.log(1.0 * totaldocs / docfreq) / np.log(2)
+        return np.log2(1.0 * totaldocs / docfreq)
     elif n_df == "p":
-        return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2)
+        return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq))


 def updated_normalize(x, n_n, return_norm=False):
@@ -303,7 +304,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
         # If smartirs is not None, override wlocal, wglobal and normalize
         if smartirs is not None:
             n_tf, n_df, n_n = resolve_weights(smartirs)
-
             self.wlocal = partial(updated_wlocal, n_tf=n_tf)
             self.wglobal = partial(updated_wglobal, n_df=n_df)
             # also return norm factor if pivot is not none
@@ -371,7 +371,6 @@ def initialize(self, corpus):
             numnnz += len(bow)
             for termid, _ in bow:
                 dfs[termid] = dfs.get(termid, 0) + 1
-
         # keep some stats about the training corpus
         self.num_docs = docno + 1
         self.num_nnz = numnnz
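
The updated_wglobal changes are the heart of issue #2020: for the SMART document-frequency letter "n" (no idf weighting), the old branch returned utils.identity(docfreq), i.e. the document frequency itself, so term weights were silently multiplied by df instead of being left untouched; it now returns 1. The "t" and "p" branches switch to np.log2, and "p" is clamped with max(0, ...) so a term occurring in more than half of the documents gets weight 0 rather than a negative weight. A minimal sketch of the fixed behaviour (not part of the diff; the toy corpus reuses the first two documents from the top of test_tfidfmodel.py):

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import tfidfmodel

texts = [
    ['complier', 'system', 'computer'],
    ['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# smartirs='nnn' = raw tf, no idf, no normalization: after the fix the
# transformed vector is the bag-of-words itself, which is exactly what the
# updated test_consistency expects (expected_docs = docs[:]).
model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
for bow in corpus:
    assert np.allclose(model[bow], bow)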
Binary file modified gensim/test/test_data/tfidf_model.tst
Binary file modified gensim/test/test_data/tfidf_model.tst.bz2
80 changes: 43 additions & 37 deletions gensim/test/test_tfidfmodel.py
@@ -20,6 +20,7 @@

from gensim.corpora import Dictionary


texts = [
['complier', 'system', 'computer'],
['eulerian', 'node', 'cycle', 'graph', 'tree', 'path'],
@@ -91,7 +92,9 @@ def test_persistence(self):
         # Test persistence between Gensim v3.2.0 and current model.
         model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
         model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
-        self.assertTrue(model3.idfs == model4.idfs)
+        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
+        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
+        self.assertTrue(np.allclose(idfs3, idfs4))
         tstvec = [corpus[1], corpus[2]]
         self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
@@ -110,7 +113,9 @@ def test_persistence(self):
         # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
         model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
         model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst'))
-        self.assertTrue(model3.idfs == model4.idfs)
+        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
+        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
+        self.assertTrue(np.allclose(idfs3, idfs4))
         tstvec = [corpus[1], corpus[2]]
         self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
@@ -141,7 +146,9 @@ def test_persistence_compressed(self):
         # Test persistence between Gensim v3.2.0 and current compressed model.
         model3 = tfidfmodel.TfidfModel(self.corpus, smartirs="ntc")
         model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
-        self.assertTrue(model3.idfs == model4.idfs)
+        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
+        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
+        self.assertTrue(np.allclose(idfs3, idfs4))
         tstvec = [corpus[1], corpus[2]]
         self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
@@ -160,7 +167,9 @@ def test_persistence_compressed(self):
         # Test persistence between Gensim v3.2.0 and pivoted normalization compressed model.
         model3 = tfidfmodel.TfidfModel(self.corpus, pivot=0, slope=1)
         model4 = tfidfmodel.TfidfModel.load(datapath('tfidf_model.tst.bz2'))
-        self.assertTrue(model3.idfs == model4.idfs)
+        idfs3 = [model3.idfs[key] for key in sorted(model3.idfs.keys())]
+        idfs4 = [model4.idfs[key] for key in sorted(model4.idfs.keys())]
+        self.assertTrue(np.allclose(idfs3, idfs4))
         tstvec = [corpus[1], corpus[2]]
         self.assertTrue(np.allclose(model3[tstvec[0]], model4[tstvec[0]]))
         self.assertTrue(np.allclose(model3[tstvec[1]], model4[tstvec[1]]))
@@ -169,70 +178,67 @@ def test_consistency(self):
docs = [corpus[1], corpus[2]]

Review comments on this line:

Contributor: Please add a self. to corpus here and everywhere.

Contributor (author): I changed self.corpus to corpus because my calculations for expected_docs are based on the corpus given at the top of test_tfidfmodel.py, and self.corpus (based on test_data/test_corpus.mm) doesn't give matching results.

Contributor: If self.corpus is not used anywhere, I suggest removing it from the setUp method. @menshikh-iv, please suggest which corpus should be chosen.

Contributor: Up to you guys, it is not so important.

markroxor (Contributor), Apr 9, 2018: Looks good. Just one last step: since we are not using testcorpus.mm (defined in the setUp method), can you replace it with the texts, dictionary and corpus defined at the top of the file? After that it is ready to merge. @menshikh-iv

Contributor (author): I'm still using self.corpus in the persistence tests.

Contributor: Please use either self.corpus or corpus; the redundancy is polluting the code.

# Test if `ntc` yields the default docs.
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntc')
model = tfidfmodel.TfidfModel(corpus, smartirs='ntc')
transformed_docs = [model[docs[0]], model[docs[1]]]

model = tfidfmodel.TfidfModel(self.corpus)
model = tfidfmodel.TfidfModel(corpus)
expected_docs = [model[docs[0]], model[docs[1]]]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# Testing all the variations of `wlocal`
# nnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='nnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 6), (9, 3), (10, 3)]
]
expected_docs = docs[:]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# lnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='lnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 6.0), (9, 3.0), (10, 3.0)]
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 2.0), (9, 1.0), (10, 1.0)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# ann
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ann')
model = tfidfmodel.TfidfModel(corpus, smartirs='ann')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2.0), (4, 2.0), (5, 3.0), (6, 2.0), (7, 3.0), (8, 2.0)],
[(5, 3.0), (9, 2.25), (10, 2.25)]
[(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0)],
[(5, 1.0), (9, 0.75), (10, 0.75)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# bnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='bnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='bnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[(3, 2), (4, 2), (5, 3), (6, 2), (7, 3), (8, 2)],
[(5, 3), (9, 3), (10, 3)]
[(3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
[(5, 1), (9, 1), (10, 1)]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# Lnn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='Lnn')
model = tfidfmodel.TfidfModel(corpus, smartirs='Lnn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 1.4635792826230198), (4, 1.4635792826230198), (5, 2.19536892393453), (6, 1.4635792826230198),
(7, 2.19536892393453), (8, 1.4635792826230198)
(3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0),
(7, 1.0), (8, 1.0)
],
[
(5, 3.627141918134611), (9, 1.8135709590673055), (10, 1.8135709590673055)
(5, 1.4133901052), (9, 0.7066950526), (10, 0.7066950526)
]
]

@@ -241,31 +247,31 @@ def test_consistency(self):

# Testing all the variations of `glocal`
# ntn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='ntn')
model = tfidfmodel.TfidfModel(corpus, smartirs='ntn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 2.1699250014423126), (4, 2.1699250014423126), (5, 1.5849625007211563), (6, 2.1699250014423126),
(7, 1.5849625007211563), (8, 2.1699250014423126)
(3, 3.169925001442312), (4, 3.169925001442312), (5, 1.584962500721156), (6, 3.169925001442312),
(7, 3.169925001442312), (8, 2.169925001442312)
],
[
(5, 3.1699250014423126), (9, 1.5849625007211563), (10, 1.5849625007211563)
(5, 3.169925001442312), (9, 3.169925001442312), (10, 3.169925001442312)
]
]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

# npn
model = tfidfmodel.TfidfModel(self.corpus, smartirs='npn')
model = tfidfmodel.TfidfModel(corpus, smartirs='npn')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 1.8073549220576042), (4, 1.8073549220576042), (5, 1.0), (6, 1.8073549220576042),
(7, 1.0), (8, 1.8073549220576042)
(3, 3.0), (4, 3.0), (5, 1.0), (6, 3.0),
(7, 3.0), (8, 1.8073549220576042)
],
[
(5, 2.0), (9, 1.0), (10, 1.0)
(5, 2.0), (9, 3.0), (10, 3.0)
]
]

@@ -274,12 +280,12 @@ def test_consistency(self):

# Testing all the variations of `normalize`
# nnc
model = tfidfmodel.TfidfModel(self.corpus, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, smartirs='nnc')
transformed_docs = [model[docs[0]], model[docs[1]]]
expected_docs = [
[
(3, 0.34299717028501764), (4, 0.34299717028501764), (5, 0.51449575542752646), (6, 0.34299717028501764),
(7, 0.51449575542752646), (8, 0.34299717028501764)
(3, 0.4082482905), (4, 0.4082482905), (5, 0.4082482905), (6, 0.4082482905),
(7, 0.4082482905), (8, 0.4082482905)
],
[
(5, 0.81649658092772603), (9, 0.40824829046386302), (10, 0.40824829046386302)
@@ -289,11 +295,11 @@ def test_consistency(self):
self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
self.assertTrue(np.allclose(transformed_docs[1], expected_docs[1]))

model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x, wglobal=lambda x, y: x * x, smartirs='nnc')

transformed_docs = [model[docs[0]], model[docs[1]]]

model = tfidfmodel.TfidfModel(self.corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
model = tfidfmodel.TfidfModel(corpus, wlocal=lambda x: x * x, wglobal=lambda x, y: x, smartirs='nnc')
expected_docs = [model[docs[0]], model[docs[1]]]

self.assertTrue(np.allclose(transformed_docs[0], expected_docs[0]))
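
One more note on the persistence tests above: the old assertions compared the idfs dictionaries with ==, which requires bit-for-bit identical float values, while the new assertions align the values by key and compare them with np.allclose, presumably to tolerate tiny floating-point differences between the stored Gensim 3.2.0 fixtures and the values recomputed by the reworked weighting functions. The same pattern as a standalone sketch (idfs_close is a hypothetical helper, not part of gensim):

import numpy as np

def idfs_close(idfs_a, idfs_b, rtol=1e-05, atol=1e-08):
    """Compare two {term_id: idf} dicts up to floating-point tolerance."""
    keys = sorted(idfs_a.keys())
    if keys != sorted(idfs_b.keys()):
        return False
    return np.allclose([idfs_a[k] for k in keys],
                       [idfs_b[k] for k in keys], rtol=rtol, atol=atol)

a = {0: 1.0, 1: 2.1699250014423126}
b = {0: 1.0, 1: 2.1699250014423126 + 1e-12}
assert a != b            # exact dict equality is brittle for floats
assert idfs_close(a, b)  # the tolerance-based comparison still passes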