From 1aa3f33b7a225bea0b535cb8b8e67f519ddf9772 Mon Sep 17 00:00:00 2001 From: robotcator Date: Fri, 17 Mar 2017 22:53:50 +0800 Subject: [PATCH 1/2] fix the compatibility between python2 & 3 --- docs/notebooks/doc2vec-IMDB.ipynb | 259 ++++++++++++++++++++++++------ 1 file changed, 206 insertions(+), 53 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index d68b68b1a6..97468b2465 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -2,14 +2,20 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# gensim doc2vec & IMDB sentiment dataset" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "TODO: section on introduction & motivation\n", "\n", @@ -24,14 +30,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Load corpus" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Fetch and prep exactly as in Mikolov's go.sh shell script. (Note this cell tests for existence of required files, so steps won't repeat once the final summary file (`aclImdb/alldata-id.txt`) is available alongside this notebook.)" ] @@ -39,7 +51,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import locale\n", @@ -118,7 +134,11 @@ { "cell_type": "code", "execution_count": 3, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import os.path\n", @@ -127,7 +147,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The data is small enough to be read into memory. " ] @@ -135,7 +158,11 @@ { "cell_type": "code", "execution_count": 1, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -171,14 +198,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Set-up Doc2Vec Training & Evaluation Models" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Approximating experiment of Le & Mikolov [\"Distributed Representations of Sentences and Documents\"](http://cs.stanford.edu/~quocle/paragraph_vector.pdf), also with guidance from Mikolov's [example go.sh](https://groups.google.com/d/msg/word2vec-toolkit/Q49FIrNOQRo/J6KG8mUj45sJ):\n", "\n", @@ -196,7 +229,11 @@ { "cell_type": "code", "execution_count": 2, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -238,7 +275,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Following the paper, we also evaluate models in pairs. These wrappers return the concatenation of the vectors from each model. 
(Only the singular models are trained.)" ] @@ -246,7 +286,11 @@ { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from gensim.test.test_doc2vec import ConcatenatedDoc2Vec\n", @@ -256,14 +300,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Predictive Evaluation Methods" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Helper methods for evaluating error rate." ] @@ -271,7 +321,11 @@ { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import numpy as np\n", @@ -323,14 +377,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Bulk Training" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Using explicit multiple-pass, alpha-reduction approach as sketched in [gensim doc2vec blog post](http://radimrehurek.com/2014/12/doc2vec-tutorial/) – with added shuffling of corpus on each pass.\n", "\n", @@ -344,7 +404,11 @@ { "cell_type": "code", "execution_count": 9, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from collections import defaultdict\n", @@ -354,7 +418,11 @@ { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -555,7 +623,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Achieved Sentiment-Prediction Accuracy" ] @@ -563,7 +634,11 @@ { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -590,21 +665,30 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In my testing, unlike the paper's report, DBOW performs best. Concatenating vectors from different models only offers a small predictive improvement. The best results I've seen are still just under 10% error rate, still a ways from the paper's 7.42%.\n" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Examining Results" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Are inferred vectors close to the precalculated ones?" ] @@ -612,7 +696,11 @@ { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -638,14 +726,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "(Yes, here the stored vector from 20 epochs of training is usually one of the closest to a freshly-inferred vector for the same words. 
Note the defaults for inference are very abbreviated – just 3 steps starting at a high alpha – and likely need tuning for other applications.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Do close documents seem more related than distant ones?" ] @@ -653,7 +747,11 @@ { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -686,14 +784,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "(Somewhat, in terms of reviewer tone, movie genre, etc... the MOST cosine-similar docs usually seem more like the TARGET than the MEDIAN or LEAST.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Do the word vectors show useful similarities?" ] @@ -701,7 +805,11 @@ { "cell_type": "code", "execution_count": 15, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "word_models = simple_models[:]" @@ -710,7 +818,11 @@ { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -782,8 +894,8 @@ "('mystery/comedy', 0.5020694732666016)]" ] }, - "output_type": "execute_result", - "metadata": {} + "metadata": {}, + "output_type": "execute_result" } ], "source": [ @@ -808,7 +920,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Do the DBOW words look meaningless? That's because the gensim DBOW model doesn't train word vectors – they remain at their random initialized values – unless you ask with the `dbow_words=1` initialization parameter. Concurrent word-training slows DBOW mode significantly, and offers little improvement (and sometimes a little worsening) of the error rate on this IMDB sentiment-prediction task. \n", "\n", @@ -817,7 +932,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Are the word vectors from this dataset any good at analogies?" ] @@ -825,7 +943,11 @@ { "cell_type": "code", "execution_count": 26, - "metadata": {}, + "metadata": { + "collapsed": false, + "deletable": true, + "editable": true + }, "outputs": [ { "name": "stdout", @@ -850,14 +972,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Even though this is a tiny, domain-specific dataset, it shows some meager capability on the general word analogies – at least for the DM/concat and DM/mean models which actually train word vectors. (The untrained random-initialized words of the DBOW model of course fail miserably.)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Slop" ] @@ -865,7 +993,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "This cell left intentionally erroneous." 
@@ -873,7 +1005,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To mix the Google dataset (if locally available) into the word tests..." ] @@ -881,7 +1016,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "from gensim.models import KeyedVectors\n", @@ -892,7 +1031,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To get copious logging output from above steps..." ] @@ -900,7 +1042,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "import logging\n", @@ -911,7 +1057,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To auto-reload python code while developing..." ] @@ -919,7 +1068,11 @@ { "cell_type": "code", "execution_count": null, - "metadata": {}, + "metadata": { + "collapsed": true, + "deletable": true, + "editable": true + }, "outputs": [], "source": [ "%load_ext autoreload\n", @@ -929,23 +1082,23 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 2", "language": "python", - "name": "python3" + "name": "python2" }, "language_info": { "codemirror_mode": { "name": "ipython", - "version": 3.0 + "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.5.1" + "pygments_lexer": "ipython2", + "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 -} \ No newline at end of file +} From bf48e8d4d382ddfb22f9a0d05e3077736bec901e Mon Sep 17 00:00:00 2001 From: robotcator Date: Fri, 17 Mar 2017 23:22:03 +0800 Subject: [PATCH 2/2] fix the compatibility between python2 & 3 --- docs/notebooks/doc2vec-IMDB.ipynb | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index 97468b2465..92f48e24c5 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -63,11 +63,17 @@ "import os.path\n", "import requests\n", "import tarfile\n", + "import sys\n", + "import codecs\n", "\n", "dirname = 'aclImdb'\n", "filename = 'aclImdb_v1.tar.gz'\n", "locale.setlocale(locale.LC_ALL, 'C')\n", "\n", + "if sys.version > '3':\n", + " control_chars = [chr(0x85)]\n", + "else:\n", + " control_chars = [unichr(0x85)]\n", "\n", "# Convert text to lower-case and strip punctuation/symbols from words\n", "def normalize_text(text):\n", @@ -82,12 +88,14 @@ "\n", " return norm_text\n", "\n", + "import time\n", + "start = time.clock()\n", "\n", "if not os.path.isfile('aclImdb/alldata-id.txt'):\n", " if not os.path.isdir(dirname):\n", " if not os.path.isfile(filename):\n", " # Download IMDB archive\n", - " url = 'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", + " url = u'http://ai.stanford.edu/~amaas/data/sentiment/' + filename\n", " r = requests.get(url)\n", " with open(filename, 'wb') as f:\n", " f.write(r.content)\n", @@ -108,8 +116,7 @@ " txt_files = glob.glob('/'.join([dirname, fol, '*.txt']))\n", "\n", " for txt in txt_files:\n", - " with open(txt, 'r', encoding='utf-8') as t:\n", - " control_chars = [chr(0x85)]\n", + " with 
codecs.open(txt, 'r', encoding='utf-8') as t:\n", " t_clean = t.read()\n", "\n", " for c in control_chars:\n", @@ -120,15 +127,18 @@ " temp += \"\\n\"\n", "\n", " temp_norm = normalize_text(temp)\n", - " with open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n", + " with codecs.open('/'.join([dirname, output]), 'w', encoding='utf-8') as n:\n", " n.write(temp_norm)\n", "\n", " alldata += temp_norm\n", "\n", - " with open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n", + " with codecs.open('/'.join([dirname, 'alldata-id.txt']), 'w', encoding='utf-8') as f:\n", " for idx, line in enumerate(alldata.splitlines()):\n", - " num_line = \"_*{0} {1}\\n\".format(idx, line)\n", - " f.write(num_line)" + " num_line = u\"_*{0} {1}\\n\".format(idx, line)\n", + " f.write(num_line)\n", + "\n", + "end = time.clock()\n", + "print (\"total running time: \", end-start)" ] }, {
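Note for readers who want to reuse the Python 2/3 compatibility approach outside the notebook: a minimal standalone sketch of what the patch does follows. It relies on codecs.open for explicit UTF-8 decoding on both interpreters and picks chr/unichr by version to build the U+0085 control character that is stripped from the reviews. The helper name clean_review_file and the sample path are illustrative assumptions, not part of the patch.

    import codecs
    import sys

    # U+0085 (NEL) appears in some IMDB reviews; unichr() exists only on
    # Python 2, while chr() covers it on Python 3.
    if sys.version_info[0] >= 3:
        control_chars = [chr(0x85)]
    else:
        control_chars = [unichr(0x85)]  # Python 2 branch only

    def normalize_text(text):
        # Lower-case and pad punctuation with spaces, mirroring the notebook cell.
        norm_text = text.lower().replace('<br />', ' ')
        for char in ['.', '"', ',', '(', ')', '!', '?', ';', ':']:
            norm_text = norm_text.replace(char, ' ' + char + ' ')
        return norm_text

    def clean_review_file(path):
        # codecs.open behaves the same on Python 2 and 3, whereas
        # open(..., encoding='utf-8') is Python 3 only.
        with codecs.open(path, 'r', encoding='utf-8') as handle:
            text = handle.read()
        for c in control_chars:
            text = text.replace(c, ' ')
        return normalize_text(text)

    # e.g. clean_review_file('aclImdb/train/pos/0_9.txt')  # hypothetical sample path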
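The alldata-id.txt file the patch writes follows Mikolov's go.sh line convention: each line is "_*<id>" followed by the normalized tokens. The notebook itself reads this back into its own SentimentDocument namedtuple; as an alternative sketch under that same file format, the corpus can also be loaded with gensim's standard TaggedDocument class, tagging each document by line number.

    import codecs
    from gensim.models.doc2vec import TaggedDocument

    def read_alldata(path='aclImdb/alldata-id.txt'):
        # Each line is "_*<id> token token ..."; drop the leading "_*<id>"
        # marker and tag the document with its line number, as the notebook does.
        with codecs.open(path, 'r', encoding='utf-8') as f:
            for line_no, line in enumerate(f):
                tokens = line.split()[1:]
                yield TaggedDocument(words=tokens, tags=[line_no])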