piskvorky · tmylk · Mar 21, 2017 · Mar 14, 2017 · Mar 14, 2017 · Mar 15, 2017
diff --git a/.travis.yml b/.travis.yml
@@ -18,6 +18,7 @@ install:
   - pip install annoy
   - pip install testfixtures
   - pip install unittest2
+  - pip install sklearn
   - pip install Morfessor==2.0.2a4
   - python setup.py install
 script: python setup.py test
diff --git a/docs/notebooks/sklearn_wrapper.ipynb b/docs/notebooks/sklearn_wrapper.ipynb
@@ -38,13 +38,52 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "sys.path.insert(0, '/home/kris/Desktop/GsoC2K17/gensim/gensim/')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "WARNING:gensim.models.doc2vec:Slow version of gensim.models.doc2vec is being used\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1.0.1\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gensim\n",
+    "print gensim.__version__"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
    "metadata": {
     "collapsed": false
    },
    "outputs": [],
    "source": [
-    "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel import SklearnWrapperLdaModel"
+    "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel"
    ]
   },
   {
@@ -56,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 6,
    "metadata": {
     "collapsed": true
    },
@@ -85,7 +124,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 7,
    "metadata": {
     "collapsed": false
    },
@@ -106,7 +145,7 @@
        "  u'0.102*\"graph\" + 0.083*\"system\" + 0.072*\"tree\" + 0.064*\"server\" + 0.059*\"user\" + 0.059*\"computer\" + 0.057*\"trees\" + 0.056*\"eulerian\" + 0.055*\"node\" + 0.052*\"flow\"')]"
       ]
      },
-     "execution_count": 3,
+     "execution_count": 7,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -135,9 +174,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "metadata": {
-    "collapsed": true
+    "collapsed": false
    },
    "outputs": [],
    "source": [
@@ -146,14 +185,14 @@
     "from gensim.models.ldamodel import LdaModel\n",
     "from sklearn.datasets import fetch_20newsgroups\n",
     "from sklearn.feature_extraction.text import CountVectorizer\n",
-    "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldaModel import SklearnWrapperLdaModel"
+    "from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 9,
    "metadata": {
-    "collapsed": true
+    "collapsed": false
    },
    "outputs": [],
    "source": [
@@ -173,9 +212,9 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 38,
    "metadata": {
-    "collapsed": true
+    "collapsed": false
    },
    "outputs": [],
    "source": [
@@ -196,27 +235,34 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 61,
    "metadata": {
     "collapsed": false
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "here\n"
+     ]
+    },
     {
      "data": {
       "text/plain": [
        "[(0,\n",
-       "  u'0.018*\"cryptography\" + 0.018*\"face\" + 0.017*\"fierkelab\" + 0.008*\"abuse\" + 0.007*\"constitutional\" + 0.007*\"collection\" + 0.007*\"finish\" + 0.007*\"150\" + 0.007*\"fast\" + 0.006*\"difference\"'),\n",
+       "  u'0.061*\"ciphertext\" + 0.028*\"abroad\" + 0.027*\"amolitor\" + 0.022*\"brian\" + 0.020*\"agree\" + 0.016*\"argue\" + 0.013*\"facts\" + 0.010*\"crypto\" + 0.010*\"asking\" + 0.009*\"capabilities\"'),\n",
        " (1,\n",
-       "  u'0.022*\"corporate\" + 0.022*\"accurate\" + 0.012*\"chance\" + 0.008*\"decipher\" + 0.008*\"example\" + 0.008*\"basically\" + 0.008*\"dawson\" + 0.008*\"cases\" + 0.008*\"consideration\" + 0.008*\"follow\"'),\n",
+       "  u'0.031*\"accurate\" + 0.022*\"corporate\" + 0.019*\"clark\" + 0.017*\"decipher\" + 0.017*\"example\" + 0.015*\"cases\" + 0.015*\"follow\" + 0.015*\"basically\" + 0.015*\"consideration\" + 0.013*\"authentication\"'),\n",
        " (2,\n",
-       "  u'0.034*\"argue\" + 0.031*\"456\" + 0.031*\"arithmetic\" + 0.024*\"courtesy\" + 0.020*\"beastmaster\" + 0.019*\"bitnet\" + 0.015*\"false\" + 0.015*\"classified\" + 0.014*\"cubs\" + 0.014*\"digex\"'),\n",
+       "  u'0.014*\"face\" + 0.013*\"bitnet\" + 0.011*\"constitutional\" + 0.011*\"false\" + 0.010*\"digex\" + 0.008*\"abuse\" + 0.008*\"effort\" + 0.008*\"costs\" + 0.008*\"breaking\" + 0.007*\"cover\"'),\n",
        " (3,\n",
-       "  u'0.108*\"abroad\" + 0.089*\"asking\" + 0.060*\"cryptography\" + 0.035*\"certain\" + 0.030*\"ciphertext\" + 0.030*\"book\" + 0.028*\"69\" + 0.028*\"demand\" + 0.028*\"87\" + 0.027*\"cracking\"'),\n",
+       "  u'0.091*\"abroad\" + 0.079*\"asking\" + 0.059*\"cryptography\" + 0.032*\"certain\" + 0.030*\"arithmetic\" + 0.027*\"69\" + 0.026*\"book\" + 0.025*\"87\" + 0.024*\"cracking\" + 0.024*\"demand\"'),\n",
        " (4,\n",
-       "  u'0.022*\"clark\" + 0.019*\"authentication\" + 0.017*\"candidates\" + 0.016*\"decryption\" + 0.015*\"attempt\" + 0.013*\"creation\" + 0.013*\"1993apr5\" + 0.013*\"acceptable\" + 0.013*\"algorithms\" + 0.013*\"employer\"')]"
+       "  u'0.014*\"cryptography\" + 0.013*\"corporate\" + 0.013*\"chance\" + 0.010*\"accurate\" + 0.008*\"dawson\" + 0.006*\"afford\" + 0.005*\"broad\" + 0.005*\"clark\" + 0.005*\"brett\" + 0.005*\"entirely\"')]"
       ]
      },
-     "execution_count": 7,
+     "execution_count": 61,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -245,7 +291,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 21,
    "metadata": {
     "collapsed": true
    },
@@ -256,7 +302,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 22,
    "metadata": {
     "collapsed": true
    },
@@ -291,14 +337,91 @@
     "print_features(clf,vocab)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "collapsed": true
+   },
+   "source": [
+    "### Example for Using Grid Search"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.model_selection import GridSearchCV\n",
+    "from gensim.models.coherencemodel import CoherenceModel"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 9,
    "metadata": {
     "collapsed": true
    },
    "outputs": [],
-   "source": []
+   "source": [
+    "def scorer(estimator, X,y=None):\n",
+    "    goodcm = CoherenceModel(model=estimator, texts= texts, dictionary=estimator.id2word, coherence='c_v')\n",
+    "    return goodcm.get_coherence()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "global name 'get_params' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-10-aa8da1d8855d>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0mparameters\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m{\u001b[0m\u001b[0;34m'num_topics'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'iterations'\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m20\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m50\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mmodel\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mGridSearchCV\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mobj\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparameters\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscoring\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mscorer\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcv\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcorpus\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, groups)\u001b[0m\n\u001b[1;32m    943\u001b[0m             \u001b[0mtrain\u001b[0m\u001b[0;34m/\u001b[0m\u001b[0mtest\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    944\u001b[0m         \"\"\"\n\u001b[0;32m--> 945\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_fit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgroups\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mParameterGrid\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparam_grid\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    946\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    947\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.pyc\u001b[0m in \u001b[0;36m_fit\u001b[0;34m(self, X, y, groups, parameter_iterable)\u001b[0m\n\u001b[1;32m    548\u001b[0m                                      n_candidates * n_splits))\n\u001b[1;32m    549\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 550\u001b[0;31m         \u001b[0mbase_estimator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mestimator\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    551\u001b[0m         \u001b[0mpre_dispatch\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpre_dispatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    552\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/home/kris/anaconda2/lib/python2.7/site-packages/sklearn/base.pyc\u001b[0m in \u001b[0;36mclone\u001b[0;34m(estimator, safe)\u001b[0m\n\u001b[1;32m     65\u001b[0m                             % (repr(estimator), type(estimator)))\n\u001b[1;32m     66\u001b[0m     \u001b[0mklass\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__class__\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 67\u001b[0;31m     \u001b[0mnew_object_params\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mestimator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     68\u001b[0m     \u001b[0;32mfor\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparam\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_object_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     69\u001b[0m         \u001b[0mnew_object_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mclone\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparam\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0msafe\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/home/kris/Desktop/GsoC2K17/gensim/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py\u001b[0m in \u001b[0;36mget_params\u001b[0;34m(self, deep)\u001b[0m\n\u001b[1;32m     66\u001b[0m                 \"random_state\": self.random_state}\n\u001b[1;32m     67\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdeep\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 68\u001b[0;31m             \u001b[0;32mreturn\u001b[0m \u001b[0mget_params\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdeep\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     69\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     70\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: global name 'get_params' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "obj=SklearnWrapperLdaModel(id2word=dictionary,num_topics=5,passes=20)\n",
+    "parameters = {'num_topics':(2, 3, 5, 10), 'iterations':(1,20,50)}\n",
+    "model = GridSearchCV(obj, parameters, scoring=scorer, cv=5)\n",
+    "model.fit(corpus)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'iterations': 20, 'num_topics': 3}"
+      ]
+     },
+     "execution_count": 60,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.best_params_"
+   ]
   }
  ],
  "metadata": {
@@ -317,7 +440,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython2",
-   "version": "2.7.6"
+   "version": "2.7.12"
   }
  },
  "nbformat": 4,

diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py
@@ -56,15 +56,14 @@ def get_params(self, deep=True):
         """
         Returns all parameters as dictionary.
         """
-        if deep:
-            return {
-                "corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
+        return {"corpus": self.corpus, "num_topics": self.num_topics, "id2word": self.id2word,
                 "chunksize": self.chunksize, "passes": self.passes,
-                "update_every": self.update_every, "alpha": self.alpha, " eta": self.eta, " decay": self.decay,
-                "offset": self.offset, "eval_every": self.eval_every, " iterations": self.iterations,
+                "update_every": self.update_every, "alpha": self.alpha, "eta": self.eta, "decay": self.decay,
+                "offset": self.offset, "eval_every": self.eval_every, "iterations": self.iterations,
                 "gamma_threshold": self.gamma_threshold, "minimum_probability": self.minimum_probability,
                 "random_state": self.random_state}
 
+
     def set_params(self, **parameters):
         """
         Set all parameters.
@@ -73,7 +72,7 @@ def set_params(self, **parameters):
             self.parameter = value
         return self
 
-    def fit(self, X):
+    def fit(self, X,  y=None):
         """
         For fitting corpus into the class object.
         Calls gensim.model.LdaModel:

diff --git a/gensim/test/test_sklearn_integration.py b/gensim/test/test_sklearn_integration.py
@@ -3,6 +3,7 @@
 import numpy
 
 from scipy import sparse
+from sklearn.pipeline import Pipeline
 from gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel import SklearnWrapperLdaModel
 from gensim.corpora import Dictionary
 from gensim import matutils
@@ -67,5 +68,15 @@ def testCSRMatrixConversion(self):
             self.assertTrue(isinstance(v, six.string_types))
             self.assertTrue(isinstance(k, int))
 
+    def testPipline(self):
+        model = SklearnWrapperLdaModel(id2word=dictionary, num_topics=2, passes=100, minimum_probability=0, random_state=numpy.random.seed(0))
+        text_lda = Pipeline([('model', model)])
+        text_lda.fit(corpus)
+        topic = text_lda.named_steps['model'].print_topics(2)
+        for k, v in topic:
+            self.assertTrue(isinstance(v, six.string_types))
+            self.assertTrue(isinstance(k, int))
+
+
 if __name__ == '__main__':
     unittest.main()