From a5518e9f560a3269606179c469e377fa5ad63deb Mon Sep 17 00:00:00 2001 From: pmlk Date: Tue, 29 May 2018 22:42:29 +0200 Subject: [PATCH 01/10] add common_terms parameter This parameter is being propagated to the underlying models.Phrases class. --- gensim/sklearn_api/phrases.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 7579a09cc9..b6da043442 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -41,7 +41,7 @@ class PhrasesTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, - delimiter=b'_', progress_per=10000, scoring='default'): + delimiter=b'_', progress_per=10000, scoring='default', common_terms=frozenset()): """ Parameters @@ -84,6 +84,9 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, A scoring function without any of these parameters (even if the parameters are not used) will raise a ValueError on initialization of the Phrases class. The scoring function must be pickleable. + common_terms : set of str, optional + List of "stop words" that won't affect frequency count of expressions containing them. + Allow to detect expressions like "bank_of_america" or "eye_of_the_beholder". """ self.gensim_model = None @@ -93,6 +96,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, self.delimiter = delimiter self.progress_per = progress_per self.scoring = scoring + self.common_terms = common_terms def fit(self, X, y=None): """Fit the model according to the given training data. @@ -111,7 +115,7 @@ def fit(self, X, y=None): self.gensim_model = models.Phrases( sentences=X, min_count=self.min_count, threshold=self.threshold, max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, - progress_per=self.progress_per, scoring=self.scoring + progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms ) return self @@ -163,7 +167,7 @@ def partial_fit(self, X): self.gensim_model = models.Phrases( sentences=X, min_count=self.min_count, threshold=self.threshold, max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, - progress_per=self.progress_per, scoring=self.scoring + progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms ) self.gensim_model.add_vocab(X) From 2db1a2590463810efc173099fef8ba1b4ccfe2d9 Mon Sep 17 00:00:00 2001 From: pmlk Date: Wed, 30 May 2018 12:19:23 +0200 Subject: [PATCH 02/10] add tests for new common_terms parameter --- gensim/test/test_sklearn_api.py | 60 +++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index ed5516df37..d280badd18 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -286,6 +286,14 @@ ['graph', 'minors', 'survey', 'human', 'interface'] ] +common_terms = ["of", "the", "was", "are"] +phrases_w_common_terms = [ + [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there'], + [u'the', u'mayor', u'of', u'new', u'orleans', u'was', u'there'], + [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'], + [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed'] +] + class TestLdaWrapper(unittest.TestCase): def setUp(self): @@ -1151,6 +1159,58 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) +class TestPhrasesTransformerCommonTerms(unittest.TestCase): + def setUp(self): + numpy.random.seed(0) + self.model = PhrasesTransformer(min_count=1, threshold=1, common_terms=common_terms) + self.expected_transformations = [ + [u'the', u'mayor_of_new', u'york', u'was', u'there'], + [u'the', u'mayor_of_new', u'orleans', u'was', u'there'], + [u'the', u'bank_of_america', u'offices', u'are', u'open'], + [u'the', u'bank_of_america', u'offices', u'are', u'closed'] + ] + + def testFitAndTransform(self): + self.model.fit(phrases_w_common_terms) + + transformed = self.model.transform(phrases_w_common_terms) + self.assertEqual(transformed, self.expected_transformations) + + def testFitTransform(self): + transformed = self.model.fit_transform(phrases_w_common_terms) + self.assertEqual(transformed, self.expected_transformations) + + def testPartialFit(self): + # fit half of the sentences + self.model.fit(phrases_w_common_terms[:2]) + + expected_transformations_0 = [ + [u'the', u'mayor_of_new', u'york', u'was', u'there'], + [u'the', u'mayor_of_new', u'orleans', u'was', u'there'], + [u'the', u'bank', u'of', u'america', u'offices', u'are', u'open'], + [u'the', u'bank', u'of', u'america', u'offices', u'are', u'closed'] + ] + # transform all sentences, second half should be same as original + transformed_0 = self.model.transform(phrases_w_common_terms) + self.assertEqual(transformed_0, expected_transformations_0) + + # fit remaining sentences, result should be the same as in the other tests + self.model.partial_fit(phrases_w_common_terms[2:]) + transformed_1 = self.model.fit_transform(phrases_w_common_terms) + self.assertEqual(transformed_1, self.expected_transformations) + + new_phrases = [[u'offices', u'are', u'open'], [u'offices', u'are', u'closed']] + self.model.partial_fit(new_phrases) + expected_transformations_2 = [ + [u'the', u'mayor_of_new', u'york', u'was', u'there'], + [u'the', u'mayor_of_new', u'orleans', u'was', u'there'], + [u'the', u'bank_of_america', u'offices_are_open'], + [u'the', u'bank_of_america', u'offices_are_closed'] + ] + transformed_2 = self.model.transform(phrases_w_common_terms) + self.assertEqual(transformed_2, expected_transformations_2) + + # specifically test pluggable scoring in Phrases, because possible pickling issues with function parameter # this is intentionally in main rather than a class method to support pickling From cbe6061bfc743482597f54457cc7130e80f80fc9 Mon Sep 17 00:00:00 2001 From: pmlk Date: Wed, 30 May 2018 12:44:46 +0200 Subject: [PATCH 03/10] utilize models.phrases.Phraser class this avoids the following warning: "UserWarning: For a faster implementation, use the gensim.models.phrases.Phraser class" --- gensim/sklearn_api/phrases.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index b6da043442..119143f8d9 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -30,6 +30,7 @@ from sklearn.exceptions import NotFittedError from gensim import models +from gensim.models.phrases import Phraser class PhrasesTransformer(TransformerMixin, BaseEstimator): @@ -90,6 +91,7 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, """ self.gensim_model = None + self.phraser = None self.min_count = min_count self.threshold = threshold self.max_vocab_size = max_vocab_size @@ -117,6 +119,7 @@ def fit(self, X, y=None): max_vocab_size=self.max_vocab_size, delimiter=self.delimiter, progress_per=self.progress_per, scoring=self.scoring, common_terms=self.common_terms ) + self.phraser = Phraser(self.gensim_model) return self def transform(self, docs): @@ -135,7 +138,7 @@ def transform(self, docs): Phrase representation for each of the input sentences. """ - if self.gensim_model is None: + if self.gensim_model is None or self.phraser is None: raise NotFittedError( "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." ) @@ -143,7 +146,8 @@ def transform(self, docs): # input as python lists if isinstance(docs[0], string_types): docs = [docs] - return [self.gensim_model[doc] for doc in docs] + + return [self.phraser[doc] for doc in docs] def partial_fit(self, X): """Train model over a potentially incomplete set of sentences. @@ -171,4 +175,5 @@ def partial_fit(self, X): ) self.gensim_model.add_vocab(X) + self.phraser = Phraser(self.gensim_model) return self From 35ab612c52fda3817e872ea1dc818cfc7e3db5bc Mon Sep 17 00:00:00 2001 From: pmlk Date: Mon, 13 Aug 2018 23:29:10 +0200 Subject: [PATCH 04/10] add testCompareToOld, add pre-trained Phrases model --- .../phrases_for_phrases_transformer.model | Bin 0 -> 1401 bytes gensim/test/test_sklearn_api.py | 18 +++++++++++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 gensim/test/test_data/phrases_for_phrases_transformer.model diff --git a/gensim/test/test_data/phrases_for_phrases_transformer.model b/gensim/test/test_data/phrases_for_phrases_transformer.model new file mode 100644 index 0000000000000000000000000000000000000000..65b831439f00d863af5a56fb503f3320eb087736 GIT binary patch literal 1401 zcmZ{k*?$yO5QS%wur-7PFbW8=$P!e-BBBBc3T_N*xwV1e+S7eA)7snV+dYsVf)Dt2 zTUFhe(FdPqe)XMGx9V2)eKivfZRX;1CCww7xRqj52F^O~LHmo&nZM9|fth80Tc^`; zVP3}B5VQYdV>*npATc7erH46x2iwv(Ghtq56}I%c8mUI5b)!6qu$3dbMH4(T<2(!o z#>E>J+irJO2Q&T_CK|=|FwB$0hE<$r&Wr3?P$yLshn0sNp)rFxPAWdecuAfOJ$SFN zldx+!ry(4D( z2yZA`V2j$#RIwzAHwka`X4t)*6=9whH7CG+K?ev2742m-v2kgw=xq@lA{_2@*)K)8 zx*Xeeyd&TU;iw)x$Dnqc^q7Q?6W-PE&WYYAJt6cY;gn{ZZ%xzFP77ZqoSB}s?kbz& zJpu0%KIqMH8Pe!xg~m0Cv!YlboKveb%5yAuA{OTbT_9Z4M#bEC%tY~_C@v8`QnNHu z#i_+*34cuZM2%AJro$-zRQMIbRpqnHZQ*cDqMs2y*Qiv&(Wd|Ff^HDL&~s$Dm%##)(g@Jnd&os$a4( zS)CT^4$t~nZzC>@C9=`SbFJQ(c*vJ(BZoiwc;PQevD{Bsmo6S# Date: Tue, 14 Aug 2018 10:51:23 +0200 Subject: [PATCH 05/10] use pickle to load old PhrasesTransformer --- .../phrases_for_phrases_transformer.model | Bin 1401 -> 0 bytes gensim/test/test_data/phrases_transformer.pkl | Bin 0 -> 982 bytes gensim/test/test_sklearn_api.py | 8 +------- 3 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 gensim/test/test_data/phrases_for_phrases_transformer.model create mode 100644 gensim/test/test_data/phrases_transformer.pkl diff --git a/gensim/test/test_data/phrases_for_phrases_transformer.model b/gensim/test/test_data/phrases_for_phrases_transformer.model deleted file mode 100644 index 65b831439f00d863af5a56fb503f3320eb087736..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1401 zcmZ{k*?$yO5QS%wur-7PFbW8=$P!e-BBBBc3T_N*xwV1e+S7eA)7snV+dYsVf)Dt2 zTUFhe(FdPqe)XMGx9V2)eKivfZRX;1CCww7xRqj52F^O~LHmo&nZM9|fth80Tc^`; zVP3}B5VQYdV>*npATc7erH46x2iwv(Ghtq56}I%c8mUI5b)!6qu$3dbMH4(T<2(!o z#>E>J+irJO2Q&T_CK|=|FwB$0hE<$r&Wr3?P$yLshn0sNp)rFxPAWdecuAfOJ$SFN zldx+!ry(4D( z2yZA`V2j$#RIwzAHwka`X4t)*6=9whH7CG+K?ev2742m-v2kgw=xq@lA{_2@*)K)8 zx*Xeeyd&TU;iw)x$Dnqc^q7Q?6W-PE&WYYAJt6cY;gn{ZZ%xzFP77ZqoSB}s?kbz& zJpu0%KIqMH8Pe!xg~m0Cv!YlboKveb%5yAuA{OTbT_9Z4M#bEC%tY~_C@v8`QnNHu z#i_+*34cuZM2%AJro$-zRQMIbRpqnHZQ*cDqMs2y*Qiv&(Wd|Ff^HDL&~s$Dm%##)(g@Jnd&os$a4( zS)CT^4$t~nZzC>@C9=`SbFJQ(c*vJ(BZoiwc;PQevD{Bsmo6S#@71>o1%jXlp+G^%GHnr+L=7pWO@ILHA^*Ei zB5eaXY37?Z_KyUM$w#Hryy*iLeBui?oM!3|5>QW?r#0p`tb=&Kp zEfxt2WS(TyajPZCLdA6`qd7SsgF@r~VJ9;1XH1JTPTjASAG4a*M5UzE3j!HMJjjGh zv@_=mKaaAx2r`HI!7LY1Ccs64d~iX-V;YU8%`lJsgtp!Kw1Wy)3qRnrgLGI{*alv4 zYtwch<1~kit|Hr67csl#*WBuKOXuo@pVD Date: Tue, 14 Aug 2018 11:02:07 +0200 Subject: [PATCH 06/10] allow setting Phrases model without setting Phraser model A pre-trained Phrases model (self.gensim_model) may be set to avoid using the fit() method. In transform(), the also necessary Phraser model (self.phraser) will be instantiated if it hasn't been before. --- gensim/sklearn_api/phrases.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index 119143f8d9..b12ad7a40b 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -138,11 +138,14 @@ def transform(self, docs): Phrase representation for each of the input sentences. """ - if self.gensim_model is None or self.phraser is None: + if self.gensim_model is None: raise NotFittedError( "This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method." ) + if self.phraser is None: + self.phraser = Phraser(self.gensim_model) + # input as python lists if isinstance(docs[0], string_types): docs = [docs] From 86f3d57705d5a1061fc2b99b20b8342d9afc508b Mon Sep 17 00:00:00 2001 From: pmlk Date: Tue, 14 Aug 2018 11:18:44 +0200 Subject: [PATCH 07/10] open pickle file --- gensim/test/test_sklearn_api.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 422a0b8757..44f57f6a8b 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1170,7 +1170,8 @@ def setUp(self): ] def testCompareToOld(self): - old_phrases_transformer = pickle.load(datapath("phrases_transformer.pkl")) + with open(datapath("phrases_transformer.pkl"), "rb") as old_phrases_transformer_pkl: + old_phrases_transformer = pickle.load(old_phrases_transformer_pkl) doc = phrases_sentences[-1] phrase_tokens = old_phrases_transformer.transform(doc)[0] expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] From c0b4a958c408db4de4da1a53bc4242248ef512e6 Mon Sep 17 00:00:00 2001 From: pmlk Date: Thu, 6 Sep 2018 00:01:43 +0200 Subject: [PATCH 08/10] add __setstate__ for backward compatibility --- gensim/sklearn_api/phrases.py | 5 + .../test_data/phrases-transformer-v3-5-0.pkl | 174 ++++++++++++++++++ gensim/test/test_data/phrases_transformer.pkl | Bin 982 -> 0 bytes gensim/test/test_sklearn_api.py | 2 +- 4 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 gensim/test/test_data/phrases-transformer-v3-5-0.pkl delete mode 100644 gensim/test/test_data/phrases_transformer.pkl diff --git a/gensim/sklearn_api/phrases.py b/gensim/sklearn_api/phrases.py index b12ad7a40b..5c1cfa83b0 100644 --- a/gensim/sklearn_api/phrases.py +++ b/gensim/sklearn_api/phrases.py @@ -100,6 +100,11 @@ def __init__(self, min_count=5, threshold=10.0, max_vocab_size=40000000, self.scoring = scoring self.common_terms = common_terms + def __setstate__(self, state): + self.__dict__ = state + self.common_terms = frozenset() + self.phraser = None + def fit(self, X, y=None): """Fit the model according to the given training data. diff --git a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-v3-5-0.pkl new file mode 100644 index 0000000000..dd8b50ae72 --- /dev/null +++ b/gensim/test/test_data/phrases-transformer-v3-5-0.pkl @@ -0,0 +1,174 @@ +ccopy_reg +_reconstructor +p0 +(cgensim.sklearn_api.phrases +PhrasesTransformer +p1 +c__builtin__ +object +p2 +Ntp3 +Rp4 +(dp5 +S'scoring' +p6 +S'default' +p7 +sS'gensim_model' +p8 +g0 +(cgensim.models.phrases +Phrases +p9 +g2 +Ntp10 +Rp11 +(dp12 +g6 +cgensim.models.phrases +original_scorer +p13 +sS'vocab' +p14 +ccollections +defaultdict +p15 +(c__builtin__ +int +p16 +tp17 +Rp18 +S'minors' +p19 +I3 +sS'human_system' +p20 +I1 +sS'user_response' +p21 +I1 +sS'survey_user' +p22 +I1 +sS'human' +p23 +I3 +sS'system_eps' +p24 +I1 +sS'graph_minors' +p25 +I3 +sS'graph' +p26 +I4 +sS'survey_human' +p27 +I1 +sS'system' +p28 +I4 +sS'human_interface' +p29 +I2 +sS'minors_trees' +p30 +I1 +sS'interface_computer' +p31 +I1 +sS'trees' +p32 +I3 +sS'interface_system' +p33 +I1 +sS'user_computer' +p34 +I1 +sS'user' +p35 +I3 +sS'interface' +p36 +I3 +sS'eps_user' +p37 +I1 +sS'response' +p38 +I2 +sS'response_time' +p39 +I2 +sS'system_response' +p40 +I1 +sS'graph_trees' +p41 +I1 +sS'user_interface' +p42 +I1 +sS'system_human' +p43 +I1 +sS'eps' +p44 +I2 +sS'computer' +p45 +I2 +sS'survey' +p46 +I3 +sS'minors_survey' +p47 +I2 +sS'time' +p48 +I2 +sS'computer_system' +p49 +I1 +ssS'min_reduce' +p50 +I1 +sS'common_terms' +p51 +c__builtin__ +frozenset +p52 +((lp53 +tp54 +Rp55 +sS'threshold' +p56 +I1 +sS'delimiter' +p57 +S'_' +p58 +sS'progress_per' +p59 +I10000 +sS'max_vocab_size' +p60 +I40000000 +sS'min_count' +p61 +I1 +sS'corpus_word_count' +p62 +I34 +sbsg57 +g58 +sg59 +I10000 +sg56 +I1 +sg60 +I40000000 +sg61 +I1 +sb. \ No newline at end of file diff --git a/gensim/test/test_data/phrases_transformer.pkl b/gensim/test/test_data/phrases_transformer.pkl deleted file mode 100644 index 144943b652621e1e62ca6939245257eda05a9c68..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 982 zcmZ9LNpIUQ5QV+Qw!9|Y_cmSPba9(>@71>o1%jXlp+G^%GHnr+L=7pWO@ILHA^*Ei zB5eaXY37?Z_KyUM$w#Hryy*iLeBui?oM!3|5>QW?r#0p`tb=&Kp zEfxt2WS(TyajPZCLdA6`qd7SsgF@r~VJ9;1XH1JTPTjASAG4a*M5UzE3j!HMJjjGh zv@_=mKaaAx2r`HI!7LY1Ccs64d~iX-V;YU8%`lJsgtp!Kw1Wy)3qRnrgLGI{*alv4 zYtwch<1~kit|Hr67csl#*WBuKOXuo@pVD Date: Thu, 6 Sep 2018 11:38:55 +0200 Subject: [PATCH 09/10] use pickle protocol 2 --- .../test_data/phrases-transformer-v3-5-0.pkl | Bin 1333 -> 1432 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/gensim/test/test_data/phrases-transformer-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-v3-5-0.pkl index dd8b50ae72f6ff3761d7b235afa926f09a5eeb44..8ffef6763b539975f1035f50d458da257b6e183c 100644 GIT binary patch literal 1432 zcmZ{kYj+ey5QcY?kidjXz#HBHF9g&m;9U_F5M)KML>y6UXJ>nN2Ils5&jb?X9Q**k zo2r`GiE@sgX1AWI>Z-S@`@upyvxQId4Sy%Ikt9)n*D}$Tk z1x;HgTt-r66YLue)sm+L=WmJ{``IGJww;=s&&qIM-njntPxc&OjT=39ABWbHrT=l| zCTD38Wk&iHZJr8u@iAH;o{r-(%WPbyW#NNPgGn0K0iKDC88>Oh?HLnfWiboz>_a?9 zIJ};$gu6KhH&bz(#q$aG{{rE~(E=k$pEP+?;3cuYOn7Cq$o2zsyGaX{b{fSNuZqVJ z!qGl*=AtUWF|of!czv|Qk*gf3c2hfRJ>C%dCgJ#K$W&Bg9ZTmsye{|z;iO_Irww~c z>~9m^(Q;&ZzHys&3mXF7CA_C#NdnT`;*^+A6V7OHhuGW~&O^(qMi%6Kar%Jpp&ll& zjc4;mV*Z%$iKY~_`iM^jeMb0P(@OPz%hxtHt!QTje?d5>2^SgL%HvD1ent3NgVxx( zQ%f6;^Wt)WaIsfK6PgZxBlr^GvSL|ODfC~n_*NXQ5Ju_{?9?Uae<$!N;d_Pq7pP-rxA))QoZ~`;PIOu-^%{ zG^xaDFTKQn2sea3wOU!cIhv~xiN}}__a5C4=0#v49+ogw551bQnx^Y(CO9Fa{eW2} zqqDy)=nf(4wbJVP%(?J_P%4+@ZbfXPE1@8`{@jkX>){30gl05slI3JI(FaYY#f2GX zCkC%P`~ZO!4h$1(l=g^?=xg z%#KQm#;+{MzmQ2V)e$&V#Xwa}=0ruqmxI7JcIpPnR1)`wvEQBQt^lnBDdmO%{4GqO zOtH7Dr)pu%UZ|ZodZO7^XNJE()MzIcBDZ>>*VOyIjNK&TN1XMDczvYTiZPd{1I&*e zK$;U(PQ)d=>kr9!4%a!H`f#Ahjn|e^dxon;E;FI?ce&B1+gzRETzH~a!T1?wYkMy6 z8`Dd!TopjA4!2Gvt`JvqlNCqKL2G8w6meUzRBrSOPI9&=+{&RXdGsjXd}<)`;nTiL zQJM{Eio&f4LqTQg40no?O;NZry@AFgq{QTGM^@Mrh1unjM@kfttJOt}DC*K+_hdvF zVeVBd7e(SaLIGik%TBC-xD;Eb#pp#@w*HZTyi}SMh$5}Dd)z?*aqxDXAMRkkq^Y(= z7*33Cl9<}aQ*FO2_$|0kd?7S$e_PYxF!;8?60`zK(6)rO^BsQO-O%%C5Ke={Ufa>G zSD`&)S0raITM?9PxyT3B+r;PTKM{@#A!C5@10AC&DmWjV4r!oYr jw_l?C<3j&P@^!=AzeWRm!_a02WinIqKY+ Date: Thu, 4 Oct 2018 11:26:17 +0200 Subject: [PATCH 10/10] test loading new phrases transformer --- .../test_data/phrases-transformer-new-v3-5-0.pkl | Bin 0 -> 1896 bytes gensim/test/test_sklearn_api.py | 12 ++++++++++++ 2 files changed, 12 insertions(+) create mode 100644 gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl diff --git a/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl b/gensim/test/test_data/phrases-transformer-new-v3-5-0.pkl new file mode 100644 index 0000000000000000000000000000000000000000..77994180586f35f8f185943fc7a1c191dbd29c18 GIT binary patch literal 1896 zcmZ`)hjtrP5S3-8t)0XPp_f1?aVP>O0Zcd55D^GN01*g})#^#RWQ*=TtrHtW62PK| z-g_^<#g8!acC|)6WRK2~G@b)Ih21NPTknF zlQhz)gUX)M80(`t)7Vt0RH7hOUXsN_jKBJJ&h{_*+sZ= zfU8Cbt}X~T*RGFiL~S==rf^?GwMWF)60RHI`uh07{EolvGr$d*^}}TkqJVv&#)f0; zS>Z;pxrs1qdu6;N4dM>=o~2iec7oUql_*N{_cf-N#!I3oi4`+*BlderHM1b(8Y|_r z(`1?FN%tJwd=$5=;8wz1)?Cxj+lpBk`oG;q)6#7>3wyZ3I_aiKhZ|{BmlL@2;Pmds z7Iq!EC)8r%xlLt)WsSQIRE{*OYqGuDpS2A6U@JR+xaTY>79;Lpp2zXtKJJ_Gc>w*r z{yu-dKj+s9u90Z&muRHFHWEDGc}W;*uNUypIYkz&z~dS`I8vN2i8~G+TEPP0VQJ>p zyqMjLOR3C~>FEfMNQVv(>Kw&XV>4@HhS74S9~Jx{;W3L#J90yn58LAcA0j+q;cc)`eL_$l=d4y2?j?~Q zBZO8i3FT(JRBMAs(3p@|r(%`MyCMf6E!pzzGa~O1vibyju?4s5Ule>%;Nyf7maaT4 zOCnw-ylg|CW-n#h!%n{1I0G`=BxYqOS`+vy%w?mL0MC;U*tMIL!4 ze-!v9!p{~K{@l9k0DcjTUkSg}tC=samu;-XH?Er*bv#L1#XIfy`YyXH%+3qg%J^je Z!OH)0i2k&Ph!35_I&vTX^7;PN{slS=U%&tW literal 0 HcmV?d00001 diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index 252e1aa783..014e3526c5 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -1181,6 +1181,18 @@ def testCompareToOld(self): new_phrase_tokens = self.model.transform(doc)[0] self.assertEqual(new_phrase_tokens, phrase_tokens) + def testLoadNew(self): + with open(datapath("phrases-transformer-new-v3-5-0.pkl"), "rb") as new_phrases_transformer_pkl: + old_phrases_transformer = pickle.load(new_phrases_transformer_pkl) + doc = phrases_sentences[-1] + phrase_tokens = old_phrases_transformer.transform(doc)[0] + expected_phrase_tokens = [u'graph_minors', u'survey', u'human_interface'] + self.assertEqual(phrase_tokens, expected_phrase_tokens) + + self.model.fit(phrases_sentences) + new_phrase_tokens = self.model.transform(doc)[0] + self.assertEqual(new_phrase_tokens, phrase_tokens) + def testFitAndTransform(self): self.model.fit(phrases_w_common_terms)