From 06f5f5c4fa9fb54a169e53034a3bf3fa035cbc3c Mon Sep 17 00:00:00 2001 From: Pete Bleackley Date: Tue, 10 Apr 2018 10:03:02 +0100 Subject: [PATCH] Fix SMART from TfidfModel for case when `df == "n"`. Fix #2020 (#2021) * Added Montemurro and Zanette's entropy-based keyword extraction algorithm * Improved Docstrings * Fixed numerical bugs due to zero frequencies * Coding style changes, test and tutorial * I hate git * Summarization tutorial * Fixed some failing tests * Tests, demo, nan_to_num and a few last flake8 issues * Further flake8 issues * Further flake8 issues * Removed Jupyter checkpoint * Removed trailing whitespace * Trailing whitespace * Speed up test and add comment to explain threshold value * Flake8 again * rename vars + style fixes * fix operation order * Update docs with Montemurro and Zanette's algorithm * Revert "Update docs with Montemurro and Zanette's algorithm" This reverts commit 6add3bac515d9e27c0d0bb9e251b980c185c8951. * Fixed bug in TfidfModel, as described in Issue #2020 * Fix return type * Updated unit tests for TfidfModel * Updated unit tests for TfidfModel * Changed log(x)/log(2) to log2(x) since this is clearer. Fixed the placement of a parenthesis. Updated predicted values for unit tests * Fixed persistence tests * Flake 8 --- gensim/corpora/dictionary.py | 1 - gensim/models/tfidfmodel.py | 13 ++-- gensim/test/test_data/tfidf_model.tst | Bin 458 -> 1261 bytes gensim/test/test_data/tfidf_model.tst.bz2 | Bin 338 -> 822 bytes gensim/test/test_tfidfmodel.py | 80 ++++++++++++---------- 5 files changed, 49 insertions(+), 45 deletions(-) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index a736849b4e..9a9edf24ed 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -245,7 +245,6 @@ def doc2bow(self, document, allow_update=False, return_missing=False): # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! token2id[w] = len(token2id) - result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id} if allow_update: diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a9e12c995a..68e83d8e6f 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -139,13 +139,13 @@ def updated_wlocal(tf, n_tf): if n_tf == "n": return tf elif n_tf == "l": - return 1 + np.log(tf) / np.log(2) + return 1 + np.log2(tf) elif n_tf == "a": return 0.5 + (0.5 * tf / tf.max(axis=0)) elif n_tf == "b": return tf.astype('bool').astype('int') elif n_tf == "L": - return (1 + np.log(tf) / np.log(2)) / (1 + np.log(tf.mean(axis=0) / np.log(2))) + return (1 + np.log2(tf)) / (1 + np.log2(tf.mean(axis=0))) def updated_wglobal(docfreq, totaldocs, n_df): @@ -166,12 +166,13 @@ def updated_wglobal(docfreq, totaldocs, n_df): Calculated wglobal. """ + if n_df == "n": - return utils.identity(docfreq) + return 1. elif n_df == "t": - return np.log(1.0 * totaldocs / docfreq) / np.log(2) + return np.log2(1.0 * totaldocs / docfreq) elif n_df == "p": - return np.log((1.0 * totaldocs - docfreq) / docfreq) / np.log(2) + return max(0, np.log2((1.0 * totaldocs - docfreq) / docfreq)) def updated_normalize(x, n_n, return_norm=False): @@ -303,7 +304,6 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden # If smartirs is not None, override wlocal, wglobal and normalize if smartirs is not None: n_tf, n_df, n_n = resolve_weights(smartirs) - self.wlocal = partial(updated_wlocal, n_tf=n_tf) self.wglobal = partial(updated_wglobal, n_df=n_df) # also return norm factor if pivot is not none @@ -371,7 +371,6 @@ def initialize(self, corpus): numnnz += len(bow) for termid, _ in bow: dfs[termid] = dfs.get(termid, 0) + 1 - # keep some stats about the training corpus self.num_docs = docno + 1 self.num_nnz = numnnz diff --git a/gensim/test/test_data/tfidf_model.tst b/gensim/test/test_data/tfidf_model.tst index e9e5f3f3cff5372e7e5ce18a89efe58321c84e4f..8d3c60c73ed35f9d341a1a9e65fd490301539739 100644 GIT binary patch literal 1261 zcma)++fvg&7{`00ZHfm}JRlxXDJoU)gbIR3l$EM0YE)`WleCGXN&ndt24RWi_OX%*>ah!27!)7+S+5PQz-n>m24a;Lrv)uGe%Vp)T?wEBsSLVcY zRxB$>4822g6{(V%OC%CryQz8J3eu!sP4jQF&X8F}wnP#nMUuRWC(V0?_bi`v@iE88 zJRb{uRCq6vR9nsR<1J^=57AXo75<@VPQ&v9%Vfw^kyi^spC2@J*IBVpFzRj32z}pW zN=pwyN9V>V@jkR$rXE_R_E&;};u^Xswv_DR&?>e{`#sIWHZ{pL8Q9J$A$n@kPsbcx z@&gk)DiX6=3_GVM@ZsaT;a@#!ue3I>jCR1BMN4D)qUHKJH+)wWy=q==^H8;x8Fp8( zN3Mv_nZCiWmlULqrH1R*bQk;VO}Np474)rPKSh7Z?&D4zP%|P(npwv|=|zY`!i&Q- zxxur@aAXxnd3p?KPubuBE;rjQk6#ez%L+4eR}T~nZ0sv$xZJW798*(VaD5ENd190A zae`v7MAA~nI7*dDW#%_;@z~#?C+Cw>IHh*+ z6;}@(Zv>|!3{#XNoS`^NlJ`aZ$wrayOkv;=Eg~@R;mOWZ^6KG@L+6Q zBft}~j_&89W6AeChs{cMR<&hogw(bimP#zYnLIeWb2fqC`US#0L{pY`rf+Mlb`s)O zmT}&V?bFu6q+&7VEhKwLr}SWc8$1T@Zy$;V9z&0j$Jpa!e)IYBEb(sY-ORhWcZtX1 zFEFwU|Hg%BEMzoisk?RE4sv?ME;;XIGxk=kCAaljYEugfq+*0~wZbJOUW0iguRHi~%x zA?h?~p^>#T$)GeeXaE2MLn8zLWMmp-4Ff_df0U{Enw!-F)Mx+z0004?0000001W^% zGy$LiibXU44FDPd27mwn00000001-q02vN9)ipFWKm-7iISr==DeZtoswe^=J%_3~ z0RS^3l!ka3U4JX`ly^4FHH~1{4^;n1ulaz#yUm2to+vQ`CO>VQBO|C{`Rn zq9aWt13)4lAQ*^k7SSvuAQ2(WHCu9SoxE++w~_6_5)ed%=@kG3Twol|0PI~Tj-*5$ zlMmPoWXJ&nODMo6Fv5Zi(NhK|2g$uk8Y|o;qx2&$j#+4HolcJ6e|@L> zEci%H0Fs%3jsPh@EGcs@R|P;?sw=X(QioL0LLqT)Zt(DlvtElFONUOqFB#Q<%9R{; zc~@lSWmc(IF<(m(H7mx2T$n@_LGDDT+fw~sVY+CNk)Z_`4blQb&p_qoS&#=16rmCi zF-pIG%^>Zq#F%MFkdunWtVw-f@;v07l7RP>5jon!nCQG=of}JuSFP+^e>m2|?WGZYN zcQ9L36s5KTHcYg29Jm>r(Ba-|HnR|(nMqn>o>M$}tU5*-Bg5ix#);VdCLVr%lTXo7 zsio?>SF-nC3*~((H#1*T*T4*^jL0KkKS=gsPrT_rX|M>s^`UC)B5CFi71_r;U|FU2}004*p00ICY zFaa9dHX0(C2dU_R=pX<9XwVrRpa`CzC#q@c05oW502%-Q07_|7Aw4Ecn?f0)W{HWT zL4W|#L;Vr~C^N8ePzY=&PU1tjo%n8