From 1aa3f33b7a225bea0b535cb8b8e67f519ddf9772 Mon Sep 17 00:00:00 2001 From: robotcator Date: Fri, 17 Mar 2017 22:53:50 +0800 Subject: [PATCH 1/2] fix the compatibility between python2 & 3 --- docs/notebooks/doc2vec-IMDB.ipynb | 259 ++++++++++++++++++++++++------ 1 file changed, 206 insertions(+), 53 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index d68b68b1a6..97468b2465 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -2,14 +2,20 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# gensim doc2vec & IMDB sentiment dataset" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "TODO: section on introduction & motivation\n", "\n", @@ -24,14 +30,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Load corpus" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Fetch and prep exactly as in Mikolov's go.sh shell script. (Note this cell tests for existence of required files, so steps won't repeat once the final summary file (`aclImdb/alldata-id.txt`) is available alongside this notebook.)" ] @@ -39,7 +51,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import locale\n", @@ -118,7 +134,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import os.path\n", @@ -127,7 +147,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The data is small enough to be read into memory. " ] @@ -135,7 +158,11 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -171,14 +198,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Set-up Doc2Vec Training & Evaluation Models" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Approximating experiment of Le & Mikolov [\"Distributed Representations of Sentences and Documents\"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf), also with guidance from Mikolov's [example go.sh](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ):\n", "\n", @@ -196,7 +229,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -238,7 +275,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Following the paper, we also evaluate models in pairs. These wrappers return the concatenation of the vectors from each model. 
(Only the singular models are trained.)" ] @@ -246,7 +286,11 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n", @@ -256,14 +300,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Predictive Evaluation Methods" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Helper methods for evaluating error rate." ] @@ -271,7 +321,11 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -323,14 +377,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Bulk Training" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Using explicit multiple-pass, alpha-reduction approach as sketched in [gensim doc2vec blog post](http://radimrehurek.com/2014/12/doc2vec-tutorial/) – with added shuffling of corpus on each pass.\n", "\n", @@ -344,7 +404,11 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from collections import defaultdict\n", @@ -354,7 +418,11 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -555,7 +623,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Achieved Sentiment-Prediction Accuracy" ] @@ -563,7 +634,11 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -590,21 +665,30 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In my testing, unlike the paper's report, DBOW performs best. Concatenating vectors from different models only offers a small predictive improvement. The best results I've seen are still just under 10% error rate, still a ways from the paper's 7.42%.\n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Examining Results" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Are inferred vectors close to the precalculated ones?" ] @@ -612,7 +696,11 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -638,14 +726,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "(Yes, here the stored vector from 20 epochs of training is usually one of the closest to a freshly-inferred vector for the same words. 
Note the defaults for inference are very abbreviated – just 3 steps starting at a high alpha – and likely need tuning for other applications.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Do close documents seem more related than distant ones?" ] @@ -653,7 +747,11 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -686,14 +784,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "(Somewhat, in terms of reviewer tone, movie genre, etc... the MOST cosine-similar docs usually seem more like the TARGET than the MEDIAN or LEAST.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Do the word vectors show useful similarities?" ] @@ -701,7 +805,11 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "word_models = simple_models[:]" @@ -710,7 +818,11 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -782,8 +894,8 @@ "('mystery/comedy', 0.5020694732666016)]" ] }, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -808,7 +920,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Do the DBOW words look meaningless? That's because the gensim DBOW model doesn't train word vectors – they remain at their random initialized values – unless you ask with the `dbow_words=1` initialization parameter. Concurrent word-training slows DBOW mode significantly, and offers little improvement (and sometimes a little worsening) of the error rate on this IMDB sentiment-prediction task. \n", "\n", @@ -817,7 +932,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Are the word vectors from this dataset any good at analogies?" ] @@ -825,7 +943,11 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -850,14 +972,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Even though this is a tiny, domain-specific dataset, it shows some meager capability on the general word analogies – at least for the DM/concat and DM/mean models which actually train word vectors. (The untrained random-initialized words of the DBOW model of course fail miserably.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Slop" ] @@ -865,7 +993,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "This cell left intentionally erroneous." 
@@ -873,7 +1005,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To mix the Google dataset (if locally available) into the word tests..." ] @@ -881,7 +1016,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from gensim.models import KeyedVectors\n", @@ -892,7 +1031,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To get copious logging output from above steps..." ] @@ -900,7 +1042,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import logging\n", @@ -911,7 +1057,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To auto-reload python code while developing..." ] @@ -919,7 +1068,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -929,23 +1082,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.0 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From bf48e8d4d382ddfb22f9a0d05e3077736bec901e Mon Sep 17 00:00:00 2001 From: robotcator Date: Fri, 17 Mar 2017 23:22:03 +0800 Subject: [PATCH 2/2] fix the compatibility between python2 & 3 --- docs/notebooks/doc2vec-IMDB.ipynb | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index 97468b2465..92f48e24c5 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -63,11 +63,17 @@ "import os.path\n", "import requests\n", "import tarfile\n", + "import sys\n", + "import codecs\n", "\n", "dirname = 'aclImdb'\n", "filename = 'aclImdb_v1.tar.gz'\n", "locale.setlocale(locale.LC_ALL, 'C')\n", "\n", + "if sys.version > '3':\n", + " control_chars = [chr(0x85)]\n", + "else:\n", + " control_chars = [unichr(0x85)]\n", "\n", "# Convert text to lower-case and strip punctuation/symbols from words\n", "def normalize_text(text):\n", @@ -82,12 +88,14 @@ "\n", " return norm_text\n", "\n", + "import time\n", + "start = time.clock()\n", "\n", "if not os.path.isfile('aclImdb/alldata-id.txt'):\n", " if not os.path.isdir(dirname):\n", " if not os.path.isfile(filename):\n", " # Download IMDB archive\n", - " url = 'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", + " url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", " r = requests.get(url)\n", " with open(filename, 'wb') as f:\n", " f.write(r.content)\n", @@ -108,8 +116,7 @@ " txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n", "\n", " for txt in txt_files:\n", - " with open(txt, 'r', encoding='utf-8') as t:\n", - " control_chars = [chr(0x85)]\n", + " with 
codecs.open(txt, 'r', encoding='utf-8') as t:\n", " t_clean = t.read()\n", "\n", " for c in control_chars:\n", @@ -120,15 +127,18 @@ " temp += \"\\n\"\n", "\n", " temp_norm = normalize_text(temp)\n", - " with open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n", + " with codecs.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n", " n.write(temp_norm)\n", "\n", " alldata += temp_norm\n", "\n", - " with open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n", + " with codecs.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n", " for idx, line in enumerate(alldata.splitlines()):\n", - " num_line = \"_*{0} {1}\\n\".format(idx, line)\n", - " f.write(num_line)" + " num_line = u\"_*{0} {1}\\n\".format(idx, line)\n", + " f.write(num_line)\n", + "\n", + "end = time.clock()\n", + "print (\"total running time: \", end-start)" ] }, {
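Note for readers who want to reuse the Python 2/3 compatibility approach outside the notebook: a minimal standalone sketch of what the patch does follows. It relies on codecs.open for explicit UTF-8 decoding on both interpreters and picks chr/unichr by version to build the U+0085 control character that is stripped from the reviews. The helper name clean_review_file and the sample path are illustrative assumptions, not part of the patch.

    import codecs
    import sys

    # U+0085 (NEL) appears in some IMDB reviews; unichr() exists only on
    # Python 2, while chr() covers it on Python 3.
    if sys.version_info[0] >= 3:
        control_chars = [chr(0x85)]
    else:
        control_chars = [unichr(0x85)]  # Python 2 branch only

    def normalize_text(text):
        # Lower-case and pad punctuation with spaces, mirroring the notebook cell.
        norm_text = text.lower().replace('<br />', ' ')
        for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
            norm_text = norm_text.replace(char, ' ' + char + ' ')
        return norm_text

    def clean_review_file(path):
        # codecs.open behaves the same on Python 2 and 3, whereas
        # open(..., encoding='utf-8') is Python 3 only.
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        for c in control_chars:
            text = text.replace(c, ' ')
        return normalize_text(text)

    # e.g. clean_review_file('aclImdb/train/pos/0_9.txt')  # hypothetical sample path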
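The alldata-id.txt file the patch writes follows Mikolov's go.sh line convention: each line is "_*<id>" followed by the normalized tokens. The notebook itself reads this back into its own SentimentDocument namedtuple; as an alternative sketch under that same file format, the corpus can also be loaded with gensim's standard TaggedDocument class, tagging each document by line number.

    import codecs
    from gensim.models.doc2vec import TaggedDocument

    def read_alldata(path='aclImdb/alldata-id.txt'):
        # Each line is "_*<id> token token ..."; drop the leading "_*<id>"
        # marker and tag the document with its line number, as the notebook does.
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f):
                tokens = line.split()[1:]
                yield TaggedDocument(words=tokens, tags=[line_no])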