From f67b095435ab91e37bb07c5139e33ded631bea6b Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Sun, 18 Jun 2017 13:39:08 -0700 Subject: [PATCH 1/8] updated ipynb for new sklearn wrappers --- docs/notebooks/sklearn_wrapper.ipynb | 308 +++++++++++++-------------- 1 file changed, 150 insertions(+), 158 deletions(-) diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index cc5e85d3a2..6bf336b08e 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -18,17 +18,21 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "The wrapper available (as of now) are :\n", - "* LdaModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel.SklearnWrapperLdaModel```),which implements gensim's ```LdaModel``` in a scikit-learn interface\n", + "The wrappers available (as of now) are :\n", + "* LdaModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel.SklLdaModel```),which implements gensim's ```LDA Model``` in a scikit-learn interface\n", "\n", - "* LsiModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_lsiModel.SklearnWrapperLsiModel```),which implements gensim's ```LsiModel``` in a scikit-learn interface" + "* LsiModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_lsiModel.SklLsiModel```),which implements gensim's ```LSI Model``` in a scikit-learn interface\n", + "\n", + "* RpModel (```gensim.sklearn_integration.sklearn_wrapper_gensim_rpmodel.SklRpModel```),which implements gensim's ```Random Projections Model``` in a scikit-learn interface\n", + "\n", + "* LDASeq Model (```gensim.sklearn_integration.sklearn_wrapper_gensim_lsiModel.SklLdaSeqModel```),which implements gensim's ```LdaSeqModel``` in a scikit-learn interface" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### LdaModel" + "### LDA Model" ] }, { @@ -40,13 +44,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel" + "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklLdaModel" ] }, { @@ -58,7 +62,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "collapsed": true }, @@ -89,41 +93,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" - ] - }, - { - "data": { - "text/plain": [ - "array([[ 0.85275314, 0.14724686],\n", - " [ 0.12390183, 0.87609817],\n", - " [ 0.4612995 , 0.5387005 ],\n", - " [ 0.84924177, 0.15075823],\n", - " [ 0.49180096, 0.50819904],\n", - " [ 0.40086923, 0.59913077],\n", - " [ 0.28454427, 0.71545573],\n", - " [ 0.88776198, 0.11223802],\n", - " [ 0.84210373, 0.15789627]])" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "model=SklearnWrapperLdaModel(num_topics=2, id2word=dictionary, iterations=20, random_state=1)\n", + "outputs": [], + "source": [ + "model = SklLdaModel(num_topics=2, id2word=dictionary, iterations=20, random_state=1)\n", "model.fit(corpus)\n", - "model.print_topics(2)\n", "model.transform(corpus)" ] }, @@ -145,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "collapsed": false }, @@ -156,12 +133,12 @@ "from gensim.models.ldamodel import LdaModel\n", "from sklearn.datasets import fetch_20newsgroups\n", "from sklearn.feature_extraction.text import CountVectorizer\n", - "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel" + "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklLdaModel" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "collapsed": false }, @@ -181,7 +158,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "collapsed": false }, @@ -204,35 +181,14 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "[(0,\n", - " u'0.025*\"456\" + 0.021*\"argue\" + 0.016*\"bitnet\" + 0.015*\"beastmaster\" + 0.014*\"cryptography\" + 0.013*\"false\" + 0.012*\"digex\" + 0.011*\"cover\" + 0.011*\"classified\" + 0.010*\"disk\"'),\n", - " (1,\n", - " u'0.142*\"abroad\" + 0.113*\"asking\" + 0.088*\"cryptography\" + 0.044*\"ciphertext\" + 0.043*\"arithmetic\" + 0.032*\"courtesy\" + 0.030*\"facts\" + 0.021*\"argue\" + 0.019*\"amolitor\" + 0.018*\"agree\"'),\n", - " (2,\n", - " u'0.034*\"certain\" + 0.027*\"69\" + 0.025*\"book\" + 0.025*\"demand\" + 0.024*\"87\" + 0.024*\"cracking\" + 0.021*\"farm\" + 0.019*\"fierkelab\" + 0.015*\"face\" + 0.011*\"abroad\"'),\n", - " (3,\n", - " u'0.017*\"decipher\" + 0.017*\"example\" + 0.016*\"cases\" + 0.016*\"follow\" + 0.008*\"considering\" + 0.006*\"forgot\" + 0.006*\"cellular\" + 0.005*\"evans\" + 0.005*\"computed\" + 0.005*\"cia\"'),\n", - " (4,\n", - " u'0.022*\"accurate\" + 0.021*\"corporate\" + 0.013*\"chance\" + 0.012*\"clark\" + 0.009*\"consideration\" + 0.009*\"candidates\" + 0.008*\"dawson\" + 0.008*\"authentication\" + 0.008*\"assess\" + 0.008*\"attempt\"')]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "obj = SklearnWrapperLdaModel(id2word=id2word, num_topics=5, passes=20)\n", - "lda = obj.fit(X)\n", - "lda.print_topics()" + "outputs": [], + "source": [ + "obj = SklLdaModel(id2word=id2word, num_topics=5, passes=20)\n", + "lda = obj.fit(X)" ] }, { @@ -246,19 +202,19 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.model_selection import GridSearchCV\n", "from gensim.models.coherencemodel import CoherenceModel" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": { "collapsed": true }, @@ -271,33 +227,13 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "GridSearchCV(cv=5, error_score='raise',\n", - " estimator=SklearnWrapperLdaModel(alpha='symmetric', chunksize=2000, corpus=None,\n", - " decay=0.5, eta=None, eval_every=10, gamma_threshold=0.001,\n", - " id2word=,\n", - " iterations=50, minimum_probability=0.01, num_topics=5,\n", - " offset=1.0, passes=20, random_state=None, update_every=1),\n", - " fit_params={}, iid=True, n_jobs=1,\n", - " param_grid={'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)},\n", - " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", - " scoring=, verbose=0)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "obj = SklearnWrapperLdaModel(id2word=dictionary, num_topics=5, passes=20)\n", + "outputs": [], + "source": [ + "obj = SklLdaModel(id2word=dictionary, num_topics=5, passes=20)\n", "parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}\n", "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n", "model.fit(corpus)" @@ -305,22 +241,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'iterations': 20, 'num_topics': 3}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "model.best_params_" ] @@ -334,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": { "collapsed": false }, @@ -343,7 +268,6 @@ "from sklearn.pipeline import Pipeline\n", "from sklearn import linear_model\n", "\n", - "\n", "def print_features_pipe(clf, vocab, n=10):\n", " ''' Better printing for sorted list '''\n", " coef = clf.named_steps['classifier'].coef_[0]\n", @@ -354,7 +278,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": { "collapsed": false }, @@ -366,46 +290,25 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ -2.95020466e-01 -1.04115352e-01 5.19570267e-01 1.03817059e-01\n", - " 2.72881013e-02 1.35738501e-02 1.89246630e-13 1.89246630e-13\n", - " 1.89246630e-13 1.89246630e-13 1.89246630e-13 1.89246630e-13\n", - " 1.89246630e-13 1.89246630e-13 1.89246630e-13]\n", - "Positive features: Fame,:0.52 Keach:0.10 comp.org.eff.talk,:0.03 comp.org.eff.talk.:0.01 >Pat:0.00 dome.:0.00 internet...:0.00 trawling:0.00 hanging:0.00 red@redpoll.neoucom.edu:0.00\n", - "Negative features: Fame.:-0.30 considered,:-0.10\n", - "0.531040268456\n" - ] - } - ], - "source": [ - "model = SklearnWrapperLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n", + "outputs": [], + "source": [ + "model = SklLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", - "print pipe.score(corpus, data.target)" + "print(pipe.score(corpus, data.target))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### LsiModel" + "### LSI Model" ] }, { @@ -417,13 +320,61 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklLsiModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "model = SklLsiModel(num_topics=15, id2word=id2word)\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", + "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe.fit(corpus, data.target)\n", + "print_features_pipe(pipe, id2word.values())\n", + "print(pipe.score(corpus, data.target))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Random Projections Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use RpModel begin with importing RpModel wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": null, "metadata": { "collapsed": true }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel" + "from gensim.sklearn_integration.sklearn_wrapper_gensim_rpmodel import SklRpModel" ] }, { @@ -435,31 +386,72 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 0.13652819 0.00383696 0.02635504 -0.08454895 -0.02356143 0.60020084\n", - " 1.07026252 -0.04072257 0.43732847 0.54913549 -0.20242834 -0.21855402\n", - " -1.30546283 -0.08690711 0.17606255]\n", - "Positive features: 01101001B:1.07 comp.org.eff.talk.:0.60 red@redpoll.neoucom.edu:0.55 circuitry:0.44 >Pat:0.18 Fame.:0.14 Fame,:0.03 considered,:0.00\n", - "Negative features: internet...:-1.31 trawling:-0.22 hanging:-0.20 dome.:-0.09 Keach:-0.08 *best*:-0.04 comp.org.eff.talk,:-0.02\n", - "0.865771812081\n" - ] - } - ], - "source": [ - "model = SklearnWrapperLsiModel(num_topics=15, id2word=id2word)\n", + "outputs": [], + "source": [ + "model = SklRpModel(num_topics=2)\n", + "numpy.random.mtrand.RandomState(1) # set seed for getting same result\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", "print_features_pipe(pipe, id2word.values())\n", - "print pipe.score(corpus, data.target)" + "print(pipe.score(corpus, data.target))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### LDASeq Model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use LdaSeqModel begin with importing LdaSeqModel wrapper" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Example of Using Pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "test_data = data.data[0:2]\n", + "test_target = data.target[0:2]\n", + "id2word = Dictionary(map(lambda x: x.split(), test_data))\n", + "corpus = [id2word.doc2bow(i.split()) for i in test_data]\n", + "\n", + "model = SklLdaSeqModel(id2word=id2word, num_topics=2, time_slice=[1, 1, 1], initialize='gensim')\n", + "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", + "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", + "pipe.fit(corpus, test_target)\n", + "print_features_pipe(pipe, id2word.values())\n", + "print(pipe.score(corpus, test_target))" ] } ], From 861dc8bd1739de3d0ee84eea9f9cef868b00813f Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 01:36:49 +0530 Subject: [PATCH 2/8] included skl wrapper classes in '__init__.py' --- gensim/sklearn_integration/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index c3227cbdef..bad381a306 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -8,3 +8,10 @@ See [1] for complete set of conventions. [1] http://scikit-learn.org/stable/developers/ """ + + +from .base_sklearn_wrapper import BaseSklearnWrapper +from .sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel +from .sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel +from .sklearn_wrapper_gensim_rpmodel import SklRpModel +from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel From 539adb4aeacc7e0ff8197ba690dc438b3593fb05 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 01:37:30 +0530 Subject: [PATCH 3/8] shortened import statements for skl wrapper classes --- docs/notebooks/sklearn_wrapper.ipynb | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index 6bf336b08e..8340239669 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -50,7 +50,7 @@ }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklLdaModel" + "from gensim.sklearn_integration import SklLdaModel" ] }, { @@ -326,7 +326,7 @@ }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_lsimodel import SklLsiModel" + "from gensim.sklearn_integration import SklLsiModel" ] }, { @@ -374,7 +374,7 @@ }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_rpmodel import SklRpModel" + "from gensim.sklearn_integration import SklRpModel" ] }, { @@ -423,7 +423,7 @@ }, "outputs": [], "source": [ - "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel" + "from gensim.sklearn_integration import SklLdaSeqModel" ] }, { From 554f941c1ac3d36255ff99064518a563936bdbc1 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 14:55:35 -0700 Subject: [PATCH 4/8] changes for __init__.py file --- gensim/sklearn_integration/__init__.py | 4 ++-- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 4 ++-- .../sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py | 4 ++-- gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 4 ++-- gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index bad381a306..f7f5e749c5 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -11,7 +11,7 @@ from .base_sklearn_wrapper import BaseSklearnWrapper -from .sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel -from .sklearn_wrapper_gensim_lsimodel import SklearnWrapperLsiModel +from .sklearn_wrapper_gensim_ldamodel import SklLdaModel +from .sklearn_wrapper_gensim_lsimodel import SklLsiModel from .sklearn_wrapper_gensim_rpmodel import SklRpModel from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index a158e5f71d..3e5cae4c9c 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -16,10 +16,10 @@ from gensim import models from gensim import matutils -from gensim.sklearn_integration import base_sklearn_wrapper +from gensim.sklearn_integration import BaseSklearnWrapper -class SklLdaModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class SklLdaModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base LDA module """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py index fdf9e58a10..e783784d31 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py @@ -14,10 +14,10 @@ from sklearn.exceptions import NotFittedError from gensim import models -from gensim.sklearn_integration import base_sklearn_wrapper +from gensim.sklearn_integration import BaseSklearnWrapper -class SklLdaSeqModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class SklLdaSeqModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base LdaSeq module """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index 9b93e3a37f..b5f2809a74 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -16,10 +16,10 @@ from gensim import models from gensim import matutils -from gensim.sklearn_integration import base_sklearn_wrapper +from gensim.sklearn_integration import BaseSklearnWrapper -class SklLsiModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class SklLsiModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base LSI module """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py index e98d64aa97..5ece879bed 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py @@ -14,10 +14,10 @@ from sklearn.exceptions import NotFittedError from gensim import models -from gensim.sklearn_integration import base_sklearn_wrapper +from gensim.sklearn_integration import BaseSklearnWrapper -class SklRpModel(base_sklearn_wrapper.BaseSklearnWrapper, TransformerMixin, BaseEstimator): +class SklRpModel(BaseSklearnWrapper, TransformerMixin, BaseEstimator): """ Base RP module """ From 088d3d7b89e761c6f5c3fc4024d0631abc51aec8 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 14:56:06 -0700 Subject: [PATCH 5/8] added output in sklearn wrappers ipynb --- docs/notebooks/sklearn_wrapper.ipynb | 169 ++++++++++++++++++++++----- 1 file changed, 138 insertions(+), 31 deletions(-) diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index 8340239669..4f4635389f 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -44,11 +44,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using TensorFlow backend.\n" + ] + } + ], "source": [ "from gensim.sklearn_integration import SklLdaModel" ] @@ -62,7 +70,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "collapsed": true }, @@ -93,11 +101,37 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" + ] + }, + { + "data": { + "text/plain": [ + "array([[ 0.85275314, 0.14724686],\n", + " [ 0.12390183, 0.87609817],\n", + " [ 0.4612995 , 0.5387005 ],\n", + " [ 0.84924177, 0.15075823],\n", + " [ 0.49180096, 0.50819904],\n", + " [ 0.40086923, 0.59913077],\n", + " [ 0.28454427, 0.71545573],\n", + " [ 0.88776198, 0.11223802],\n", + " [ 0.84210373, 0.15789627]])" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model = SklLdaModel(num_topics=2, id2word=dictionary, iterations=20, random_state=1)\n", "model.fit(corpus)\n", @@ -122,7 +156,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "collapsed": false }, @@ -138,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "collapsed": false }, @@ -158,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "collapsed": false }, @@ -181,13 +215,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "collapsed": false }, - "outputs": [], - "source": [ - "obj = SklLdaModel(id2word=id2word, num_topics=5, passes=20)\n", + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" + ] + } + ], + "source": [ + "obj = SklLdaModel(id2word=id2word, num_topics=5, iterations=20)\n", "lda = obj.fit(X)" ] }, @@ -202,7 +244,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -214,14 +256,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": { "collapsed": true }, "outputs": [], "source": [ "def scorer(estimator, X, y=None):\n", - " goodcm = CoherenceModel(model=estimator, texts= texts, dictionary=estimator.id2word, coherence='c_v')\n", + " goodcm = CoherenceModel(model=estimator.gensim_model, texts= texts, dictionary=estimator.gensim_model.id2word, coherence='c_v')\n", " return goodcm.get_coherence()" ] }, @@ -233,9 +275,9 @@ }, "outputs": [], "source": [ - "obj = SklLdaModel(id2word=dictionary, num_topics=5, passes=20)\n", + "obj = SklLdaModel(id2word=dictionary, num_topics=5, iterations=20)\n", "parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}\n", - "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n", + "model = GridSearchCV(obj, parameters, scoring=scorer, cv=2)\n", "model.fit(corpus)" ] }, @@ -259,7 +301,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": { "collapsed": false }, @@ -278,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -290,11 +332,32 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ -2.95020466e-01 -1.04115352e-01 5.19570267e-01 1.03817059e-01\n", + " 2.72881013e-02 1.35738501e-02 1.89246630e-13 1.89246630e-13\n", + " 1.89246630e-13 1.89246630e-13 1.89246630e-13 1.89246630e-13\n", + " 1.89246630e-13 1.89246630e-13 1.89246630e-13]\n", + "Positive features: Fame,:0.52 Keach:0.10 comp.org.eff.talk,:0.03 comp.org.eff.talk.:0.01 >Pat:0.00 dome.:0.00 internet...:0.00 trawling:0.00 hanging:0.00 red@redpoll.neoucom.edu:0.00\n", + "Negative features: Fame.:-0.30 considered,:-0.10\n", + "0.531040268456\n" + ] + } + ], "source": [ "model = SklLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", @@ -320,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -338,11 +401,24 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.13651844 -0.0038155 0.0264238 0.08494405 -0.02384796 -0.60051921\n", + " -1.07079081 0.04000798 0.43845983 -0.54894361 0.2017333 -0.21800463\n", + " 1.3045325 0.08672903 -0.17578455]\n", + "Positive features: internet...:1.30 circuitry:0.44 hanging:0.20 Fame.:0.14 dome.:0.09 Keach:0.08 *best*:0.04 Fame,:0.03\n", + "Negative features: 01101001B:-1.07 comp.org.eff.talk.:-0.60 red@redpoll.neoucom.edu:-0.55 trawling:-0.22 >Pat:-0.18 comp.org.eff.talk,:-0.02 considered,:-0.00\n", + "0.865771812081\n" + ] + } + ], "source": [ "model = SklLsiModel(num_topics=15, id2word=id2word)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", @@ -368,7 +444,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "metadata": { "collapsed": true }, @@ -386,14 +462,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.01241958 -0.01335879]\n", + "Positive features: Fame.:0.01\n", + "Negative features: considered,:-0.01\n", + "0.59144295302\n" + ] + } + ], "source": [ "model = SklRpModel(num_topics=2)\n", - "numpy.random.mtrand.RandomState(1) # set seed for getting same result\n", + "np.random.mtrand.RandomState(1) # set seed for getting same result\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", @@ -417,7 +504,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "metadata": { "collapsed": true }, @@ -435,11 +522,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.04877324 -0.04877324]\n", + "Positive features: What:0.05\n", + "Negative features: NLCS:-0.05\n", + "1.0\n" + ] + } + ], "source": [ "test_data = data.data[0:2]\n", "test_target = data.target[0:2]\n", @@ -453,6 +551,15 @@ "print_features_pipe(pipe, id2word.values())\n", "print(pipe.score(corpus, test_target))" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] } ], "metadata": { From e2f2a22b0c10b58c927f706410e5a4368c29d3ac Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 17:02:27 -0700 Subject: [PATCH 6/8] updated 'set_params' function in skl wrappers --- gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py | 1 + gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py | 1 + gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py | 1 + gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py | 1 + 4 files changed, 4 insertions(+) diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 3e5cae4c9c..1ad1fabccf 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -64,6 +64,7 @@ def set_params(self, **parameters): Set all parameters. """ super(SklLdaModel, self).set_params(**parameters) + return self def fit(self, X, y=None): """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py index e783784d31..32a732f145 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldaseqmodel.py @@ -61,6 +61,7 @@ def set_params(self, **parameters): Set all parameters. """ super(SklLdaSeqModel, self).set_params(**parameters) + return self def fit(self, X, y=None): """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py index b5f2809a74..5bd4fd0362 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_lsimodel.py @@ -51,6 +51,7 @@ def set_params(self, **parameters): Set all parameters. """ super(SklLsiModel, self).set_params(**parameters) + return self def fit(self, X, y=None): """ diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py index 5ece879bed..19e5739b33 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_rpmodel.py @@ -41,6 +41,7 @@ def set_params(self, **parameters): Set all parameters. """ super(SklRpModel, self).set_params(**parameters) + return self def fit(self, X, y=None): """ From 7aa385b5693f083ddcf10955accd8114f3637313 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 17:02:59 -0700 Subject: [PATCH 7/8] updated sklearn ipynb for GridSearch --- docs/notebooks/sklearn_wrapper.ipynb | 147 ++++++++++++++++++++++----- 1 file changed, 122 insertions(+), 25 deletions(-) diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb index 4f4635389f..fb2008df72 100644 --- a/docs/notebooks/sklearn_wrapper.ipynb +++ b/docs/notebooks/sklearn_wrapper.ipynb @@ -256,7 +256,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 9, "metadata": { "collapsed": true }, @@ -269,25 +269,123 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n", + "WARNING:gensim.models.ldamodel:too few updates, training might not converge; consider increasing the number of passes or iterations to improve accuracy\n" + ] + }, + { + "data": { + "text/plain": [ + "GridSearchCV(cv=5, error_score='raise',\n", + " estimator=SklLdaModel(alpha='symmetric', chunksize=2000, decay=0.5, eta=None,\n", + " eval_every=10, gamma_threshold=0.001,\n", + " id2word=,\n", + " iterations=20, minimum_probability=0.01, num_topics=5, offset=1.0,\n", + " passes=1, random_state=None, update_every=1),\n", + " fit_params={}, iid=True, n_jobs=1,\n", + " param_grid={'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)},\n", + " pre_dispatch='2*n_jobs', refit=True, return_train_score=True,\n", + " scoring=, verbose=0)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "obj = SklLdaModel(id2word=dictionary, num_topics=5, iterations=20)\n", "parameters = {'num_topics': (2, 3, 5, 10), 'iterations': (1, 20, 50)}\n", - "model = GridSearchCV(obj, parameters, scoring=scorer, cv=2)\n", + "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n", "model.fit(corpus)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'iterations': 20, 'num_topics': 10}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "model.best_params_" ] @@ -348,18 +446,17 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ -2.95020466e-01 -1.04115352e-01 5.19570267e-01 1.03817059e-01\n", - " 2.72881013e-02 1.35738501e-02 1.89246630e-13 1.89246630e-13\n", - " 1.89246630e-13 1.89246630e-13 1.89246630e-13 1.89246630e-13\n", - " 1.89246630e-13 1.89246630e-13 1.89246630e-13]\n", - "Positive features: Fame,:0.52 Keach:0.10 comp.org.eff.talk,:0.03 comp.org.eff.talk.:0.01 >Pat:0.00 dome.:0.00 internet...:0.00 trawling:0.00 hanging:0.00 red@redpoll.neoucom.edu:0.00\n", - "Negative features: Fame.:-0.30 considered,:-0.10\n", - "0.531040268456\n" + "[-0.91085778 -0.48036135 -0.41265981 -0.66310168 -0.01339967 -0.12794711\n", + " 0.01611456 0.15208847 0.21579624 0.25621594 0.54796235 0.43618653\n", + " 0.56767608 0.39267377 0.27554429]\n", + "Positive features: internet...:0.57 hanging:0.55 trawling:0.44 dome.:0.39 >Pat:0.28 red@redpoll.neoucom.edu:0.26 circuitry:0.22 *best*:0.15 01101001B:0.02\n", + "Negative features: Fame.:-0.91 Keach:-0.66 considered,:-0.48 Fame,:-0.41 comp.org.eff.talk.:-0.13 comp.org.eff.talk,:-0.01\n", + "0.640939597315\n" ] } ], "source": [ - "model = SklLdaModel(num_topics=15, id2word=id2word, iterations=50, random_state=37)\n", + "model = SklLdaModel(num_topics=15, id2word=id2word, iterations=10, random_state=37)\n", "clf = linear_model.LogisticRegression(penalty='l2', C=0.1) # l2 penalty used\n", "pipe = Pipeline((('features', model,), ('classifier', clf)))\n", "pipe.fit(corpus, data.target)\n", @@ -401,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -410,11 +507,11 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ 0.13651844 -0.0038155 0.0264238 0.08494405 -0.02384796 -0.60051921\n", - " -1.07079081 0.04000798 0.43845983 -0.54894361 0.2017333 -0.21800463\n", - " 1.3045325 0.08672903 -0.17578455]\n", - "Positive features: internet...:1.30 circuitry:0.44 hanging:0.20 Fame.:0.14 dome.:0.09 Keach:0.08 *best*:0.04 Fame,:0.03\n", - "Negative features: 01101001B:-1.07 comp.org.eff.talk.:-0.60 red@redpoll.neoucom.edu:-0.55 trawling:-0.22 >Pat:-0.18 comp.org.eff.talk,:-0.02 considered,:-0.00\n", + "[ 0.13650375 -0.00382155 -0.0264042 -0.08478659 -0.02379243 -0.6006137\n", + " 1.07099917 0.03998737 0.43831279 -0.54905248 0.20204591 -0.2185433\n", + " -1.3051437 -0.08704868 0.17599105]\n", + "Positive features: 01101001B:1.07 circuitry:0.44 hanging:0.20 >Pat:0.18 Fame.:0.14 *best*:0.04\n", + "Negative features: internet...:-1.31 comp.org.eff.talk.:-0.60 red@redpoll.neoucom.edu:-0.55 trawling:-0.22 dome.:-0.09 Keach:-0.08 Fame,:-0.03 comp.org.eff.talk,:-0.02 considered,:-0.00\n", "0.865771812081\n" ] } @@ -444,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "metadata": { "collapsed": true }, @@ -471,10 +568,10 @@ "name": "stdout", "output_type": "stream", "text": [ - "[ 0.01241958 -0.01335879]\n", - "Positive features: Fame.:0.01\n", - "Negative features: considered,:-0.01\n", - "0.59144295302\n" + "[-0.00071555 0.00913274]\n", + "Positive features: considered,:0.01\n", + "Negative features: Fame.:-0.00\n", + "0.543624161074\n" ] } ], From 4547c1e5f9f89d611fa9169559259d08198c3ad3 Mon Sep 17 00:00:00 2001 From: Chinmaya Pancholi Date: Tue, 20 Jun 2017 17:07:35 -0700 Subject: [PATCH 8/8] added comments to skip flake8 checks --- gensim/sklearn_integration/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/gensim/sklearn_integration/__init__.py b/gensim/sklearn_integration/__init__.py index f7f5e749c5..d351e625fc 100644 --- a/gensim/sklearn_integration/__init__.py +++ b/gensim/sklearn_integration/__init__.py @@ -10,8 +10,8 @@ """ -from .base_sklearn_wrapper import BaseSklearnWrapper -from .sklearn_wrapper_gensim_ldamodel import SklLdaModel -from .sklearn_wrapper_gensim_lsimodel import SklLsiModel -from .sklearn_wrapper_gensim_rpmodel import SklRpModel -from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel +from .base_sklearn_wrapper import BaseSklearnWrapper # noqa: F401 +from .sklearn_wrapper_gensim_ldamodel import SklLdaModel # noqa: F401 +from .sklearn_wrapper_gensim_lsimodel import SklLsiModel # noqa: F401 +from .sklearn_wrapper_gensim_rpmodel import SklRpModel # noqa: F401 +from .sklearn_wrapper_gensim_ldaseqmodel import SklLdaSeqModel # noqa: F401