Copyedit and fix outdated statements in translation matrix tutorial #3375

Merged
merged 2 commits into from Aug 22, 2022
92 changes: 42 additions & 50 deletions docs/notebooks/translation_matrix.ipynb
@@ -4,7 +4,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Tranlation Matrix Tutorial"
"# Translation Matrix Tutorial"
]
},
{
@@ -34,22 +34,23 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Tomas Mikolov, Quoc V Le, Ilya Sutskever. 2013.[Exploiting Similarities among Languages for Machine Translation](https://arxiv.org/pdf/1309.4168.pdf)\n",
"Tomas Mikolov, Quoc V Le, Ilya Sutskever. 2013. [Exploiting Similarities among Languages for Machine Translation](https://arxiv.org/pdf/1309.4168.pdf)\n",
"\n",
"Georgiana Dinu, Angelikie Lazaridou and Marco Baroni. 2014.[Improving zero-shot learning by mitigating the hubness problem](https://arxiv.org/pdf/1309.4168.pdf)"
"Georgiana Dinu, Angelikie Lazaridou and Marco Baroni. 2014. [Improving zero-shot learning by mitigating the hubness problem](https://arxiv.org/pdf/1309.4168.pdf)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"from gensim import utils\n",
"from gensim.models import translation_matrix\n",
"from gensim.models import KeyedVectors"
"from gensim.models import KeyedVectors\n",
"import smart_open"
]
},
{
@@ -65,7 +66,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -74,30 +75,23 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'OPUS_en_it_europarl_train_5K.txt'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-2-a21913f3bec7>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtrain_file\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"OPUS_en_it_europarl_train_5K.txt\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0;32mwith\u001b[0m \u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msmart_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_file\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"r\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 4\u001b[0m \u001b[0mword_pair\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0mtuple\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mutils\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_unicode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mline\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msplit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mline\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mword_pair\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36msmart_open\u001b[0;34m(uri, mode, **kw)\u001b[0m\n\u001b[1;32m 437\u001b[0m \u001b[0mtransport_params\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 438\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 439\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0muri\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mignore_ext\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mignore_extension\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtransport_params\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mtransport_params\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mscrubbed_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 440\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 441\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36mopen\u001b[0;34m(uri, mode, buffering, encoding, errors, newline, closefd, opener, ignore_ext, transport_params)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mencoding\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 307\u001b[0;31m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 308\u001b[0m )\n\u001b[1;32m 309\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mfobj\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/envs/gensim/lib/python3.7/site-packages/smart_open/smart_open_lib.py\u001b[0m in \u001b[0;36m_shortcut_open\u001b[0;34m(uri, mode, ignore_ext, buffering, encoding, errors)\u001b[0m\n\u001b[1;32m 496\u001b[0m \u001b[0;31m#\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 497\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0msix\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mPY3\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 498\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_builtin_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mopen_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 499\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mopen_kwargs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 500\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0m_builtin_open\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mparsed_uri\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0muri_path\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmode\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbuffering\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbuffering\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'OPUS_en_it_europarl_train_5K.txt'"
"name": "stdout",
"output_type": "stream",
"text": [
"[('for', 'per'), ('that', 'che'), ('with', 'con'), ('are', 'are'), ('are', 'sono'), ('this', 'questa'), ('this', 'questo'), ('you', 'lei'), ('not', 'non'), ('which', 'che')]\n"
]
}
],
"source": [
"train_file = \"OPUS_en_it_europarl_train_5K.txt\"\n",
"\n",
"with utils.smart_open(train_file, \"r\") as f:\n",
"with smart_open.open(train_file, \"r\") as f:\n",
" word_pair = [tuple(utils.to_unicode(line).strip().split()) for line in f]\n",
"print (word_pair[:10])"
"print(word_pair[:10])"
]
},
{
@@ -151,14 +145,14 @@
"source": [
"transmat = translation_matrix.TranslationMatrix(source_word_vec, target_word_vec, word_pair)\n",
"transmat.train(word_pair)\n",
"print (\"the shape of translation matrix is: \", transmat.translation_matrix.shape)"
"print(\"the shape of translation matrix is: \", transmat.translation_matrix.shape)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Prediction Time: For any given new word, we can map it to the other language space by coputing $z = Wx$, then we find the word whose representation is closet to z in the target language space, using consine similarity as the distance metric."
"Prediction Time: For any given new word, we can map it to the other language space by computing $z = Wx$, then we find the word whose representation is closet to z in the target language space, using cosine similarity as the distance metric."
]
},
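
To make the prediction step concrete, here is a minimal NumPy sketch of the nearest-neighbour lookup it describes. The names (`W`, `target_vectors`, `target_words`) are illustrative placeholders, not variables from this notebook:

```python
import numpy as np

def translate_nearest(x, W, target_vectors, target_words, topn=3):
    """Map a source-language vector x into the target space as z = Wx, then
    return the topn target words whose vectors are closest to z by cosine similarity."""
    z = W.dot(x)
    z = z / np.linalg.norm(z)
    # normalise the target vectors row-wise so plain dot products become cosine similarities
    t = target_vectors / np.linalg.norm(target_vectors, axis=1, keepdims=True)
    sims = t.dot(z)
    best = np.argsort(-sims)[:topn]
    return [(target_words[i], float(sims[i])) for i in best]
```
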
{
@@ -190,7 +184,7 @@
"outputs": [],
"source": [
"for k, v in translated_word.iteritems():\n",
" print (\"word \", k, \" and translated word\", v)"
" print(\"word \", k, \" and translated word\", v)"
]
},
{
@@ -211,7 +205,7 @@
"source_word, target_word = zip(*words)\n",
"translated_word = transmat.translate(source_word, 5)\n",
"for k, v in translated_word.iteritems():\n",
" print (\"word \", k, \" and translated word\", v)"
" print(\"word \", k, \" and translated word\", v)"
]
},
{
@@ -232,7 +226,7 @@
"source_word, target_word = zip(*words)\n",
"translated_word = transmat.translate(source_word, 5)\n",
"for k, v in translated_word.iteritems():\n",
" print (\"word \", k, \" and translated word\", v)"
" print(\"word \", k, \" and translated word\", v)"
]
},
{
@@ -246,7 +240,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing the creation time, we extracted more word pairs from a dictionary built from Europarl([Europara, en-it](http://opus.lingfil.uu.se/)). We obtain about 20K word pairs and their coresponding word vectors or you can download from this.[word_dict.pkl](https://pan.baidu.com/s/1dF8HUX7)"
"Testing the creation time, we extracted more word pairs from a dictionary built from Europarl([Europara, en-it](http://opus.lingfil.uu.se/)). We obtain about 20K word pairs and their corresponding word vectors or you can download from this: [word_dict.pkl](https://pan.baidu.com/s/1dF8HUX7)"
]
},
{
@@ -257,9 +251,9 @@
"source": [
"import pickle\n",
"word_dict = \"word_dict.pkl\"\n",
"with utils.smart_open(word_dict, \"r\") as f:\n",
"with smart_open.open(word_dict, \"r\") as f:\n",
" word_pair = pickle.load(f)\n",
"print (\"the length of word pair \", len(word_pair))"
"print(\"the length of word pair \", len(word_pair))"
]
},
{
@@ -423,7 +417,7 @@
"\n",
"# Translate the English word five to Italian word\n",
"translated_word = transmat.translate([en_words[4]], 3)\n",
"print \"translation of five: \", translated_word\n",
"print(\"translation of five: \", translated_word)\n",
"\n",
"# the translated words of five\n",
"for item in translated_word[en_words[4]]:\n",
@@ -518,7 +512,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see some animal words, the figue shows that most of words are also share the similar geometric arrangements."
"Let's see some animal words, the figure shows that most of the words also have similar geometric arrangements."
]
},
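
As a rough sketch of how such a figure can be produced (the PCA projection and all variable names here are illustrative assumptions, not necessarily what the notebook's plotting cell actually uses):

```python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

def plot_translation_pairs(en_words, it_words, source_kv, target_kv, W):
    # map the English vectors into the Italian space, then project both sets
    # of vectors down to 2D with a shared PCA so they are directly comparable
    mapped = np.array([W.dot(source_kv[w]) for w in en_words])
    target = np.array([target_kv[w] for w in it_words])
    xy = PCA(n_components=2).fit_transform(np.vstack([mapped, target]))
    n = len(en_words)
    plt.scatter(xy[:n, 0], xy[:n, 1], color="blue", label="English (mapped)")
    plt.scatter(xy[n:, 0], xy[n:, 1], color="red", label="Italian")
    for i, word in enumerate(list(en_words) + list(it_words)):
        plt.annotate(word, xy[i])
    plt.legend()
    plt.show()
```
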
{
@@ -593,7 +587,7 @@
"\n",
"# Translate the English word birds to Italian word\n",
"translated_word = transmat.translate([en_words[4]], 3)\n",
"print \"translation of birds: \", translated_word\n",
"print(\"translation of birds: \", translated_word)\n",
"\n",
"# the translated words of birds\n",
"for item in translated_word[en_words[4]]:\n",
@@ -700,7 +694,7 @@
"source": [
"As dicussion in this [PR](https://github.com/RaRe-Technologies/gensim/pull/1434), Translation Matrix not only can used to translate the words from one source language to another target lanuage, but also to translate new document vectors back to old model space.\n",
"\n",
"For example, if we have trained 15k documents using doc2vec (we called this as model1), and we are going to train new 35k documents using doc2vec(we called this as model2). So we can include those 15k documents as reference documents into the new 35k documents. Then we can get 15k document vectors from model1 and 50k document vectors from model2, but both of the two models have vectors for those 15k documents. We can use those vectors to build a mapping from model1 to model2. Finally, with this relation, we can back-mapping the model2's vector to model1. Therefore, 35k document vectors are learned using this method."
"For example, if we have trained 15k documents using doc2vec (we called this as model1), and we are going to train new 35k documents using doc2vec (we called this as model2). So we can include those 15k documents as reference documents into the new 35k documents. Then we can get 15k document vectors from model1 and 50k document vectors from model2, but both of the two models have vectors for those 15k documents. We can use those vectors to build a mapping from model1 to model2. Finally, with this relation, we can back-map the model2's vector to model1. Therefore, 35k document vectors are learned using this method."
]
},
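
At its core, the back-mapping is just another linear map fitted on the vectors the two models share. A minimal sketch with NumPy least squares (variable names are illustrative; gensim's `translation_matrix` module also provides a `BackMappingTranslationMatrix` class for this purpose):

```python
import numpy as np

def fit_back_mapping(shared_model2, shared_model1):
    """Learn W such that shared_model2 @ W ~= shared_model1, using the
    document vectors that appear in both models as anchor points."""
    W, _, _, _ = np.linalg.lstsq(shared_model2, shared_model1, rcond=None)
    return W

# Illustrative usage: collect the vectors of the 15k shared documents from each
# model, fit the map, then send any model2 vector back into model1's space.
# (model.dv on gensim 4.x, model.docvecs on older versions)
# shared_m2 = np.array([model2.dv[tag] for tag in shared_tags])
# shared_m1 = np.array([model1.dv[tag] for tag in shared_tags])
# W = fit_back_mapping(shared_m2, shared_m1)
# back_mapped_vec = model2.dv[new_tag] @ W
```
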
{
@@ -720,13 +714,13 @@
"from gensim.models.doc2vec import TaggedDocument\n",
"from gensim.models import Doc2Vec\n",
"from collections import namedtuple\n",
"from gensim import utils\n",
"import smart_open\n",
"\n",
"def read_sentimentDocs():\n",
" SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')\n",
"\n",
" alldocs = [] # will hold all docs in original order\n",
" with utils.smart_open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
" with smart_open.open('aclImdb/alldata-id.txt', encoding='utf-8') as alldata:\n",
" for line_no, line in enumerate(alldata):\n",
" tokens = gensim.utils.to_unicode(line).split()\n",
" words = tokens[1:]\n",
@@ -748,14 +742,14 @@
"small_corpus = train_docs[:15000]\n",
"large_corpus = train_docs + test_docs\n",
"\n",
"print len(train_docs), len(test_docs), len(doc_list), len(small_corpus), len(large_corpus)"
"print(len(train_docs), len(test_docs), len(doc_list), len(small_corpus), len(large_corpus))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Here, we train two Doc2vec model, the parameters can be determined by yourself. We trained on 15k documents for the `model1` and 50k documents for the `model2`. But you should mixed some documents which from the 15k document in `model` to the `model2` as dicussed before. "
"Here, we train two Doc2vec model, the parameters can be determined by yourself. We trained on 15k documents for the `model1` and 50k documents for the `model2`. But you should mix some documents which from the 15k document in `model` to the `model2`, as discussed before. "
]
},
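
For reference, a rough sketch of that setup, reusing the `small_corpus` and `large_corpus` built above (the hyperparameters here are placeholders, not the ones used in the collapsed training cell):

```python
from gensim.models import Doc2Vec

# model1: trained only on the 15k-document small corpus
model1 = Doc2Vec(vector_size=100, min_count=2, epochs=20)
model1.build_vocab(small_corpus)
model1.train(small_corpus, total_examples=model1.corpus_count, epochs=model1.epochs)

# model2: trained on the full 50k-document corpus, which contains the same
# 15k documents, so both models end up with vectors for those shared anchors
model2 = Doc2Vec(vector_size=100, min_count=2, epochs=20)
model2.build_vocab(large_corpus)
model2.train(large_corpus, total_examples=model2.corpus_count, epochs=model2.epochs)
```
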
{
@@ -795,7 +789,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"For the IMDB training dataset, we train an classifier on the train data which has 25k documents with positive and negative label. Then using this classifier to predict the test data. To see what accuracy can the document vectors which learned by different method achieve."
"For the IMDB training dataset, we train an classifier on the train data which has 25k documents with positive and negative label. Then using this classifier to predict the test data, we see what accuracy can be achieved by the document vectors learned by different methods."
]
},
{
@@ -812,7 +806,7 @@
" classifier = LogisticRegression()\n",
" classifier.fit(train, train_label)\n",
" score = classifier.score(test, test_label)\n",
" print \"the classifier score :\", score\n",
" print(\"the classifier score :\", score)\n",
" return score"
]
},
@@ -855,7 +849,7 @@
" test_array[i + 12500] = m2[i + 37500]\n",
" test_label[i + 12500] = 0\n",
"\n",
"print \"The vectors are learned by doc2vec method\"\n",
"print(\"The vectors are learned by doc2vec method\")\n",
"test_classifier_error(train_array, train_label, test_array, test_label)"
]
},
@@ -910,15 +904,15 @@
" test_array[i + 12500] = m1[i + 37500]\n",
" test_label[i + 12500] = 0\n",
"\n",
"print \"The vectors are learned by back-mapping method\"\n",
"print(\"The vectors are learned by back-mapping method\")\n",
"test_classifier_error(train_array, train_label, test_array, test_label)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see that, the vectors learned by back-mapping method performed not bad but still need improved."
"As we can see that, the vectors learned by back-mapping method performed not bad but still need to be improved."
]
},
{
@@ -1026,18 +1020,11 @@
"source": [
"You probably will see kinds of colors point. One for the `model1`, the `sdoc0` to `sdoc4` document vector are learned by Doc2vec and `sdoc5` and `sdoc6` are learned by back-mapping. One for the `model2`, the `tdoc0` to `tdoc6` are learned by Doc2vec. We can see that some of points learned from the back-mapping method still have the relative position with the point learned by Doc2vec."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3.10.2 64-bit",
"language": "python",
"name": "python3"
},
@@ -1051,7 +1038,12 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
"version": "3.10.2"
},
"vscode": {
"interpreter": {
"hash": "901b79e026e03396fd1ffa7133844e9ea80e258ce34c66e1aabb5896bcb18463"
}
}
},
"nbformat": 4,