Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Pipeline #1213

Merged
merged 12 commits into from
Mar 21, 2017
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ install:
- pip install annoy
- pip install testfixtures
- pip install unittest2
- pip install sklearn
- pip install Morfessor==2.0.2a4
- python setup.py install
script: python setup.py test
171 changes: 147 additions & 24 deletions docs/notebooks/sklearn_wrapper.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -38,13 +38,52 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sys\n",
"sys.path.insert(0, '/home/kris/Desktop/GsoC2K17/gensim/gensim/')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"WARNING:gensim.models.doc2vec:Slow version of gensim.models.doc2vec is being used\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"1.0.1\n"
]
}
],
"source": [
"import gensim\n",
"print gensim.__version__"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel import SklearnWrapperLdaModel"
"from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel"
]
},
{
Expand All @@ -56,7 +95,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 6,
"metadata": {
"collapsed": true
},
Expand Down Expand Up @@ -85,7 +124,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 7,
"metadata": {
"collapsed": false
},
Expand All @@ -106,7 +145,7 @@
" u'0.102*\"graph\" + 0.083*\"system\" + 0.072*\"tree\" + 0.064*\"server\" + 0.059*\"user\" + 0.059*\"computer\" + 0.057*\"trees\" + 0.056*\"eulerian\" + 0.055*\"node\" + 0.052*\"flow\"')]"
]
},
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -135,9 +174,9 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 8,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
Expand All @@ -146,14 +185,14 @@
"from gensim.models.ldamodel import LdaModel\n",
"from sklearn.datasets import fetch_20newsgroups\n",
"from sklearn.feature_extraction.text import CountVectorizer\n",
"from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel import SklearnWrapperLdaModel"
"from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
Expand All @@ -173,9 +212,9 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 38,
"metadata": {
"collapsed": true
"collapsed": false
},
"outputs": [],
"source": [
Expand All @@ -196,27 +235,34 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 61,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"here\n"
]
},
{
"data": {
"text/plain": [
"[(0,\n",
" u'0.018*\"cryptography\" + 0.018*\"face\" + 0.017*\"fierkelab\" + 0.008*\"abuse\" + 0.007*\"constitutional\" + 0.007*\"collection\" + 0.007*\"finish\" + 0.007*\"150\" + 0.007*\"fast\" + 0.006*\"difference\"'),\n",
" u'0.061*\"ciphertext\" + 0.028*\"abroad\" + 0.027*\"amolitor\" + 0.022*\"brian\" + 0.020*\"agree\" + 0.016*\"argue\" + 0.013*\"facts\" + 0.010*\"crypto\" + 0.010*\"asking\" + 0.009*\"capabilities\"'),\n",
" (1,\n",
" u'0.022*\"corporate\" + 0.022*\"accurate\" + 0.012*\"chance\" + 0.008*\"decipher\" + 0.008*\"example\" + 0.008*\"basically\" + 0.008*\"dawson\" + 0.008*\"cases\" + 0.008*\"consideration\" + 0.008*\"follow\"'),\n",
" u'0.031*\"accurate\" + 0.022*\"corporate\" + 0.019*\"clark\" + 0.017*\"decipher\" + 0.017*\"example\" + 0.015*\"cases\" + 0.015*\"follow\" + 0.015*\"basically\" + 0.015*\"consideration\" + 0.013*\"authentication\"'),\n",
" (2,\n",
" u'0.034*\"argue\" + 0.031*\"456\" + 0.031*\"arithmetic\" + 0.024*\"courtesy\" + 0.020*\"beastmaster\" + 0.019*\"bitnet\" + 0.015*\"false\" + 0.015*\"classified\" + 0.014*\"cubs\" + 0.014*\"digex\"'),\n",
" u'0.014*\"face\" + 0.013*\"bitnet\" + 0.011*\"constitutional\" + 0.011*\"false\" + 0.010*\"digex\" + 0.008*\"abuse\" + 0.008*\"effort\" + 0.008*\"costs\" + 0.008*\"breaking\" + 0.007*\"cover\"'),\n",
" (3,\n",
" u'0.108*\"abroad\" + 0.089*\"asking\" + 0.060*\"cryptography\" + 0.035*\"certain\" + 0.030*\"ciphertext\" + 0.030*\"book\" + 0.028*\"69\" + 0.028*\"demand\" + 0.028*\"87\" + 0.027*\"cracking\"'),\n",
" u'0.091*\"abroad\" + 0.079*\"asking\" + 0.059*\"cryptography\" + 0.032*\"certain\" + 0.030*\"arithmetic\" + 0.027*\"69\" + 0.026*\"book\" + 0.025*\"87\" + 0.024*\"cracking\" + 0.024*\"demand\"'),\n",
" (4,\n",
" u'0.022*\"clark\" + 0.019*\"authentication\" + 0.017*\"candidates\" + 0.016*\"decryption\" + 0.015*\"attempt\" + 0.013*\"creation\" + 0.013*\"1993apr5\" + 0.013*\"acceptable\" + 0.013*\"algorithms\" + 0.013*\"employer\"')]"
" u'0.014*\"cryptography\" + 0.013*\"corporate\" + 0.013*\"chance\" + 0.010*\"accurate\" + 0.008*\"dawson\" + 0.006*\"afford\" + 0.005*\"broad\" + 0.005*\"clark\" + 0.005*\"brett\" + 0.005*\"entirely\"')]"
]
},
"execution_count": 7,
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -245,7 +291,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 21,
"metadata": {
"collapsed": true
},
Expand All @@ -256,7 +302,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 22,
"metadata": {
"collapsed": true
},
Expand Down Expand Up @@ -291,14 +337,91 @@
"print_features(clf,vocab)"
]
},
{
"cell_type": "markdown",
"metadata": {
"collapsed": true
},
"source": [
"### Example for Using Grid Search"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"from gensim.models.coherencemodel import CoherenceModel"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
"source": [
"def scorer(estimator, X,y=None):\n",
" goodcm = CoherenceModel(model=estimator, texts= texts, dictionary=estimator.id2word, coherence='c_v')\n",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This grid search raises an exception in the notebook (ipynb). Is it possible to have it fixed?

" return goodcm.get_coherence()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "NameError",
"evalue": "global name 'get_params' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-aa8da1d8855d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0mparameters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'num_topics'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'iterations'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscorer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m 943\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mtest\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 944\u001b[0m \"\"\"\n\u001b[0;32m--> 945\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 946\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, groups, parameter_iterable)\u001b[0m\n\u001b[1;32m 548\u001b[0m n_candidates * n_splits))\n\u001b[1;32m 549\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 550\u001b[0;31m \u001b[0mbase_estimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 551\u001b[0m \u001b[0mpre_dispatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpre_dispatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 552\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/base.pyc\u001b[0m in \u001b[0;36mclone\u001b[0;34m(estimator, safe)\u001b[0m\n\u001b[1;32m 65\u001b[0m % (repr(estimator), type(estimator)))\n\u001b[1;32m 66\u001b[0m \u001b[0mklass\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m \u001b[0mnew_object_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 68\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_object_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0mnew_object_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msafe\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m/home/kris/Desktop/GsoC2K17/gensim/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py\u001b[0m in \u001b[0;36mget_params\u001b[0;34m(self, deep)\u001b[0m\n\u001b[1;32m 66\u001b[0m \"random_state\": self.random_state}\n\u001b[1;32m 67\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdeep\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mget_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mNameError\u001b[0m: global name 'get_params' is not defined"
]
}
],
"source": [
"obj=SklearnWrapperLdaModel(id2word=dictionary,num_topics=5,passes=20)\n",
"parameters = {'num_topics':(2, 3, 5, 10), 'iterations':(1,20,50)}\n",
"model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n",
"model.fit(corpus)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"{'iterations': 20, 'num_topics': 3}"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.best_params_"
]
}
],
"metadata": {
Expand All @@ -317,7 +440,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
"version": "2.7.12"
}
},
"nbformat": 4,
Expand Down
11 changes: 5 additions & 6 deletions gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,15 +56,14 @@ def get_params(self, deep=True):
"""
Returns all parameters as dictionary.
"""
if deep:
return {
"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
return {"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
"chunksize": self.chunksize, "passes": self.passes,
"update_every": self.update_every, "alpha": self.alpha, " eta": self.eta, " decay": self.decay,
"offset": self.offset, "eval_every": self.eval_every, " iterations": self.iterations,
"update_every": self.update_every, "alpha": self.alpha, "eta": self.eta, "decay": self.decay,
"offset": self.offset, "eval_every": self.eval_every, "iterations": self.iterations,
"gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability,
"random_state": self.random_state}


def set_params(self, **parameters):
"""
Set all parameters.
Expand All @@ -73,7 +72,7 @@ def set_params(self, **parameters):
self.parameter = value
return self

def fit(self, X):
def fit(self, X, y=None):
"""
For fitting corpus into the class object.
Calls gensim.model.LdaModel:
Expand Down
11 changes: 11 additions & 0 deletions gensim/test/test_sklearn_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import numpy

from scipy import sparse
from sklearn.pipeline import Pipeline
from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel
from gensim.corpora import Dictionary
from gensim import matutils
Expand Down Expand Up @@ -67,5 +68,15 @@ def testCSRMatrixConversion(self):
self.assertTrue(isinstance(v, six.string_types))
self.assertTrue(isinstance(k, int))

def testPipline(self):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo in name of the function

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo in test name

model = SklearnWrapperLdaModel(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
text_lda = Pipeline([('model', model)])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can a pipeline contain two steps? Going from LDA to logistic regression would be good. Also, could you please add it to the tutorial?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you mean that we use LDA as a feature extractor and then use its output in the logistic regression? I thought of this and modified the transform function accordingly.

text_lda.fit(corpus)
topic = text_lda.named_steps['model'].print_topics(2)
for k, v in topic:
self.assertTrue(isinstance(v, six.string_types))
self.assertTrue(isinstance(k, int))


if __name__ == '__main__':
unittest.main()