diff --git a/docs/notebooks/Corpora_and_Vector_Spaces.ipynb b/docs/notebooks/Corpora_and_Vector_Spaces.ipynb index 7f62262b0f..44560f952d 100644 --- a/docs/notebooks/Corpora_and_Vector_Spaces.ipynb +++ b/docs/notebooks/Corpora_and_Vector_Spaces.ipynb @@ -40,26 +40,18 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 13:52:33,796 : INFO : 'pattern' package not found; tag filters are not available for English\n" - ] - } - ], + "outputs": [], "source": [ "from gensim import corpora" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "collapsed": false }, @@ -87,7 +79,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "collapsed": false }, @@ -142,21 +134,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:04:55,398 : INFO : adding document #0 to Dictionary(0 unique tokens: [])\n", - "2017-05-07 14:04:55,400 : INFO : built Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...) from 9 documents (total 29 corpus positions)\n", - "2017-05-07 14:04:55,402 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None\n", - "2017-05-07 14:04:55,404 : INFO : saved /tmp/deerwester.dict\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -234,22 +216,11 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 5, "metadata": { "collapsed": false }, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:15:59,996 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm\n", - "2017-05-07 14:15:59,999 : INFO : saving sparse matrix to /tmp/deerwester.mm\n", - "2017-05-07 14:16:00,001 : INFO : PROGRESS: saving document #0\n", - "2017-05-07 14:16:00,003 : INFO : saved 9x12 matrix, density=25.926% (28/108)\n", - "2017-05-07 14:16:00,005 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -286,7 +257,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 6, "metadata": { "collapsed": true }, @@ -308,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 7, "metadata": { "collapsed": false }, @@ -317,7 +288,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "<__main__.MyCorpus object at 0x112c5acf8>\n" + "<__main__.MyCorpus object at 0x10f48a240>\n" ] } ], @@ -335,7 +306,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 8, "metadata": { "collapsed": false }, @@ -372,7 +343,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 9, "metadata": { "collapsed": false }, @@ -381,7 +352,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dictionary(12 unique tokens: ['response', 'computer', 'survey', 'user', 'minors']...)\n" + "Dictionary(12 unique tokens: ['human', 'interface', 'computer', 'survey', 'user']...)\n" ] } ], @@ -419,23 +390,11 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 10, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:34:16,166 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm\n", - "2017-05-07 14:34:16,169 : INFO : saving sparse matrix to /tmp/corpus.mm\n", - "2017-05-07 14:34:16,170 : INFO : PROGRESS: saving document #0\n", - "2017-05-07 14:34:16,172 : INFO : saved 2x2 matrix, density=25.000% (1/4)\n", - "2017-05-07 14:34:16,173 : INFO : saving MmCorpus index to /tmp/corpus.mm.index\n" - ] - } - ], + "outputs": [], "source": [ "# create a toy corpus of 2 documents, as a plain Python list\n", "corpus = [[(1, 0.5)], []] # make one document empty, for the heck of it\n", @@ -452,28 +411,11 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:34:29,173 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight\n", - "2017-05-07 14:34:29,176 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index\n", - "2017-05-07 14:34:29,178 : INFO : no word id mapping provided; initializing from corpus\n", - "2017-05-07 14:34:29,179 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n", - "2017-05-07 14:34:29,181 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n", - "2017-05-07 14:34:29,183 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n", - "2017-05-07 14:34:29,184 : INFO : no word id mapping provided; initializing from corpus\n", - "2017-05-07 14:34:29,186 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low\n", - "2017-05-07 14:34:29,188 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value\n", - "2017-05-07 14:34:29,190 : INFO : saving LowCorpus index to /tmp/corpus.low.index\n" - ] - } - ], + "outputs": [], "source": [ "corpora.SvmLightCorpus.serialize('/tmp/corpus.svmlight', corpus)\n", "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)\n", @@ -489,21 +431,11 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 12, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:34:40,151 : INFO : loaded corpus index from /tmp/corpus.mm.index\n", - "2017-05-07 14:34:40,153 : INFO : initializing corpus reader from /tmp/corpus.mm\n", - "2017-05-07 14:34:40,156 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries\n" - ] - } - ], + "outputs": [], "source": [ "corpus = corpora.MmCorpus('/tmp/corpus.mm')" ] @@ -517,7 +449,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 13, "metadata": { "collapsed": false }, @@ -543,7 +475,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 14, "metadata": { "collapsed": false }, @@ -570,7 +502,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 15, "metadata": { "collapsed": false }, @@ -601,22 +533,11 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 16, "metadata": { "collapsed": false }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2017-05-07 14:35:00,740 : INFO : no word id mapping provided; initializing from corpus\n", - "2017-05-07 14:35:00,743 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c\n", - "2017-05-07 14:35:00,745 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab\n", - "2017-05-07 14:35:00,747 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index\n" - ] - } - ], + "outputs": [], "source": [ "corpora.BleiCorpus.serialize('/tmp/corpus.lda-c', corpus)" ] @@ -634,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 17, "metadata": { "collapsed": false }, @@ -656,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "metadata": { "collapsed": false }, @@ -675,15 +596,6 @@ "For a complete reference (want to prune the dictionary to a smaller size? Optimize converting between corpora and NumPy/SciPy arrays?), see the [API documentation](https://radimrehurek.com/gensim/apiref.html). Or continue to the next tutorial on Topics and Transformations ([notebook](https://github.com/piskvorky/gensim/tree/develop/docs/notebooks/Topics_and_Transformations.ipynb) \n", "or [website](https://radimrehurek.com/gensim/tut2.html))." ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] } ], "metadata": {