From becc6d3be63627ff7b05906fca8a113ab1e128a0 Mon Sep 17 00:00:00 2001 From: robotcator Date: Thu, 30 Mar 2017 21:50:25 +0800 Subject: [PATCH] Explicit epochs and corpus size in word2vec train(). Continuing #1139. Fix #1052. (#1237) * fix the compatibility between python2 & 3 * require explicit corpus size, epochs for train() * make all train() calls use explicit count, epochs * add tests to make sure that ValueError is indeed thrown * update test * fix the word2vec's reset_from() * require explicit corpus size, epochs for train() * make all train() calls use explicit count, epochs * fix some error * fix test error --- docs/notebooks/doc2vec-IMDB.ipynb | 2 +- docs/notebooks/doc2vec-lee.ipynb | 192 +++++++++++++++----- docs/notebooks/doc2vec-wikipedia.ipynb | 123 ++++++++++--- docs/notebooks/online_w2v_tutorial.ipynb | 95 +++++++--- docs/notebooks/word2vec.ipynb | 217 +++++++++++++++++------ gensim/models/doc2vec.py | 2 +- gensim/models/word2vec.py | 47 ++--- gensim/test/test_doc2vec.py | 6 +- gensim/test/test_word2vec.py | 37 ++-- 9 files changed, 543 insertions(+), 178 deletions(-) diff --git a/docs/notebooks/doc2vec-IMDB.ipynb b/docs/notebooks/doc2vec-IMDB.ipynb index 92f48e24c5..4fb30b7b93 100644 --- a/docs/notebooks/doc2vec-IMDB.ipynb +++ b/docs/notebooks/doc2vec-IMDB.ipynb @@ -600,7 +600,7 @@ " duration = 'na'\n", " train_model.alpha, train_model.min_alpha = alpha, alpha\n", " with elapsed_timer() as elapsed:\n", - " train_model.train(doc_list)\n", + " train_model.train(doc_list, total_examples=train_model.corpus_count, epochs=train_model.iter)\n", " duration = '%.1f' % elapsed()\n", " \n", " # evaluate\n", diff --git a/docs/notebooks/doc2vec-lee.ipynb b/docs/notebooks/doc2vec-lee.ipynb index cc6279fe86..92d01aa133 100644 --- a/docs/notebooks/doc2vec-lee.ipynb +++ b/docs/notebooks/doc2vec-lee.ipynb @@ -2,7 +2,10 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Doc2Vec Tutorial on the Lee Dataset" ] @@ -11,7 +14,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -24,7 +29,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## What is it?\n", "\n", @@ -33,7 +41,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Resources\n", "\n", @@ -46,14 +57,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Getting Started" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To get going, we'll need to have a set of documents to train our doc2vec model. In theory, a document could be anything from a short 140 character tweet, a single paragraph (i.e., journal article abstract), a news article, or a book. In NLP parlance a collection or set of documents is often referred to as a corpus. 
\n", "\n", @@ -67,7 +84,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -79,14 +98,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Define a Function to Read and Preprocess Text" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Below, we define a function to open the train/test file (with latin encoding), read the file line-by-line, pre-process each line using a simple gensim pre-processing tool (i.e., tokenize text into individual words, remove punctuation, set to lowercase, etc), and return a list of words. Note that, for a given file (aka corpus), each continuous line constitutes a single document and the length of each line (i.e., document) can vary. Also, to train the model, we'll need to associate a tag/number with each document of the training corpus. In our case, the tag is simply the zero-based line number." ] @@ -95,7 +120,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -113,7 +140,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -123,7 +152,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's take a look at the training corpus" ] @@ -132,7 +164,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -153,7 +187,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "And the testing corpus looks like this:" ] @@ -162,7 +199,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -179,28 +218,40 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice that the testing corpus is just a list of lists and does not contain any tags." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Training the Model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Instantiate a Doc2Vec Object " ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now, we'll instantiate a Doc2Vec model with a vector size with 50 words and iterating over the training corpus 55 times. We set the minimum word count to 2 in order to give higher frequency words more weighting. Model accuracy can be improved by increasing the number of iterations but this generally increases the training time. Small datasets with short documents, like this one, can benefit from more training passes." 
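
Read end to end, the flow these Lee-tutorial cells describe is roughly the sketch below. The `read_corpus` helper, the file paths, and the use of `io.open` are illustrative reconstructions rather than the notebook's exact cells; only the hyperparameters named in the prose (vector size 50, `min_count` 2, 55 passes) and the explicit-count `train()` call introduced by this patch are taken from the surrounding text.

    import io
    import os

    import gensim

    # Paths assume gensim's bundled Lee corpus (illustrative, not the notebook's cell).
    test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
    lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')

    def read_corpus(fname, tokens_only=False):
        # One document per line; tokenize, lowercase and strip punctuation,
        # and tag each training document with its zero-based line number.
        with io.open(fname, encoding='iso-8859-1') as f:
            for i, line in enumerate(f):
                words = gensim.utils.simple_preprocess(line)
                if tokens_only:
                    yield words
                else:
                    yield gensim.models.doc2vec.TaggedDocument(words, [i])

    train_corpus = list(read_corpus(lee_train_file))

    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=2, iter=55)
    model.build_vocab(train_corpus)

    # train() now needs the corpus size and epoch count spelled out explicitly;
    # both values are cached on the model after build_vocab() / construction.
    model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)
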
] @@ -209,7 +260,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -218,7 +271,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Build a Vocabulary" ] @@ -227,7 +283,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -236,14 +294,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Essentially, the vocabulary is a dictionary (accessible via `model.wv.vocab`) of all of the unique words extracted from the training corpus along with the count (e.g., `model.wv.vocab['penalty'].count` for counts for the word `penalty`)." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Time to Train\n", "\n", @@ -255,7 +319,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -278,19 +344,25 @@ } ], "source": [ - "%time model.train(train_corpus)" + "%time model.train(train_corpus, total_examples=model.corpus_count, epochs=model.iter)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Inferring a Vector" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "One important thing to note is that you can now infer a vector for any piece of text without having to re-train the model by passing a list of words to the `model.infer_vector` function. This vector can then be compared with other vectors via cosine similarity." ] @@ -299,7 +371,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -328,14 +402,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Assessing Model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "To assess our new model, we'll first infer new vectors for each document of the training corpus, compare the inferred vectors with the training corpus, and then returning the rank of the document based on self-similarity. Basically, we're pretending as if the training corpus is some new unseen data and then seeing how they compare with the trained model. The expectation is that we've likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. Additionally, we'll keep track of the second ranks for a comparison of less similar documents. 
" ] @@ -344,7 +424,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -361,7 +443,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's count how each document ranks with respect to the training corpus " ] @@ -371,6 +456,8 @@ "execution_count": 12, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": true }, "outputs": [ @@ -391,7 +478,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Basically, greater than 95% of the inferred documents are found to be most similar to itself and about 5% of the time it is mistakenly most similar to another document. the checking of an inferred-vector against a training-vector is a sort of 'sanity check' as to whether the model is behaving in a usefully consistent manner, though not a real 'accuracy' value.\n", "\n", @@ -402,7 +492,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -431,7 +523,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Notice above that the most similar document is has a similarity score of ~80% (or higher). However, the similarity score for the second ranked documents should be significantly lower (assuming the documents are in fact different) and the reasoning becomes obvious when we examine the text itself" ] @@ -440,7 +535,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -466,14 +563,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Testing the Model" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Using the same approach above, we'll infer the vector for a randomly chosen test document, and compare the document to our model by eye." ] @@ -482,7 +585,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -517,7 +622,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Wrapping Up\n", "\n", @@ -541,7 +649,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.6" } }, "nbformat": 4, diff --git a/docs/notebooks/doc2vec-wikipedia.ipynb b/docs/notebooks/doc2vec-wikipedia.ipynb index c5cd3e70f5..4e09fa318d 100644 --- a/docs/notebooks/doc2vec-wikipedia.ipynb +++ b/docs/notebooks/doc2vec-wikipedia.ipynb @@ -2,14 +2,20 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Doc2Vec to wikipedia articles" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "We conduct the replication to **Document Embedding with Paragraph Vectors** (http://arxiv.org/abs/1507.07998).\n", "In this paper, they showed only DBOW results to Wikipedia data. 
So we replicate this experiments using not only DBOW but also DM." @@ -17,14 +23,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Basic Setup" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Let's import Doc2Vec module." ] @@ -33,7 +45,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -45,14 +59,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Preparing the corpus" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "First, download the dump of all Wikipedia articles from [here](http://download.wikimedia.org/enwiki/) (you want the file enwiki-latest-pages-articles.xml.bz2, or enwiki-YYYYMMDD-pages-articles.xml.bz2 for date-specific dumps).\n", "\n", @@ -65,7 +85,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -75,7 +97,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Define **TaggedWikiDocument** class to convert WikiCorpus into suitable form for Doc2Vec." ] @@ -84,7 +109,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -101,7 +128,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -110,7 +139,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Preprocessing\n", "To set the same vocabulary size with original papar. We first calculate the optimal **min_count** parameter." @@ -120,7 +152,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -132,7 +166,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -169,14 +205,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In the original paper, they set the vocabulary size 915,715. It seems similar size of vocabulary if we set min_count = 19. (size of vocab = 898,725)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Training the Doc2Vec Model\n", "To train Doc2Vec model by several method, DBOW and DM, we define the list of models." 
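
A minimal sketch of what that model list and the updated training loop amount to under this change. The hyperparameter values below (size, window, worker count, epoch count) are illustrative placeholders rather than the notebook's hidden cells; `min_count=19` comes from the vocabulary discussion above, `documents` is the `TaggedWikiDocument` iterable defined earlier, and sharing the scanned vocabulary via `reset_from()` is one convenient option rather than a requirement.

    import multiprocessing

    from gensim.models.doc2vec import Doc2Vec

    cores = multiprocessing.cpu_count()

    models = [
        # PV-DBOW (also trains word vectors alongside the document vectors)
        Doc2Vec(dm=0, dbow_words=1, size=200, window=8, min_count=19, iter=10, workers=cores),
        # PV-DM, averaging the context word vectors
        Doc2Vec(dm=1, dm_mean=1, size=200, window=8, min_count=19, iter=10, workers=cores),
    ]

    models[0].build_vocab(documents)   # one full vocabulary scan over the Wikipedia corpus
    models[1].reset_from(models[0])    # reuse that vocabulary instead of scanning again

    for model in models:
        # the corpus size and epoch count must now be passed explicitly
        model.train(documents, total_examples=model.corpus_count, epochs=model.iter)
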
@@ -187,6 +229,8 @@ "execution_count": 7, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": false }, "outputs": [], @@ -205,7 +249,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -226,7 +272,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Now we’re ready to train Doc2Vec of the English Wikipedia. " ] @@ -236,6 +285,8 @@ "execution_count": 9, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": true }, "outputs": [ @@ -252,19 +303,25 @@ ], "source": [ "for model in models:\n", - " %%time model.train(documents)" + " %%time model.train(documents, total_examples=model.corpus_count, epochs=model.iter)" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Similarity interface" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "After that, let's test both models! DBOW model show the simillar results with the original paper. First, calculating cosine simillarity of \"Machine learning\" using Paragraph Vector. Word Vector and Document Vector are separately stored. We have to add .docvecs after model name to extract Document Vector from Doc2Vec Model." ] @@ -273,7 +330,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -333,7 +392,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "DBOW model interpret the word 'Machine Learning' as a part of Computer Science field, and DM model as Data Science related field.\n", "\n", @@ -345,6 +407,8 @@ "execution_count": 11, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": false }, "outputs": [ @@ -386,7 +450,9 @@ { "cell_type": "markdown", "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "source": [ "DBOW model reveal the similar singer in the U.S., and DM model understand that many of Lady Gaga's songs are similar with the word \"Lady Gaga\".\n", @@ -399,6 +465,8 @@ "execution_count": 12, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": false }, "outputs": [ @@ -440,7 +508,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "As a result, DBOW model demonstrate the similar artists with Lady Gaga in Japan such as 'Perfume', which is the Most famous Idol in Japan. On the other hand, DM model results don't include the Japanese aritsts in top 10 simillar documents. 
It's almost same with no vector calculated results.\n", "\n", @@ -464,7 +535,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.2" + "version": "3.4.3" } }, "nbformat": 4, diff --git a/docs/notebooks/online_w2v_tutorial.ipynb b/docs/notebooks/online_w2v_tutorial.ipynb index b233c67817..ed51565272 100644 --- a/docs/notebooks/online_w2v_tutorial.ipynb +++ b/docs/notebooks/online_w2v_tutorial.ipynb @@ -2,7 +2,10 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Online word2vec tutorial\n", "\n", @@ -15,7 +18,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -28,7 +33,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Download wikipedia dump files\n", "\n", @@ -39,7 +47,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -50,7 +60,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Convert two wikipedia dump files\n", "To avoid alert when convert old verision of wikipedia dump, you should download alternative wikicorpus.py in my repo." @@ -60,7 +73,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -71,7 +86,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -89,7 +106,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -101,7 +120,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -110,7 +131,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "### Initial training\n", "At first we train word2vec using \"enwiki-20101011-pages-articles.xml.bz2\". After that, we update model using \"enwiki-20160820-pages-articles.xml.bz2\"." @@ -120,7 +144,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -142,7 +168,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Japanese new idol group, [\"Babymetal\"](https://en.wikipedia.org/wiki/Babymetal), weren't known worldwide in 2010, so that the word, \"babymetal\", is not in oldmodel vocaburary.\n", "Note: In recent years, they became the famous idol group not only in Japan. They won many music awards and run world tour." 
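
A quick way to see this for yourself, assuming `oldmodel` is the Word2Vec model trained and saved in the "Initial training" cells above (a hypothetical check, not one of the notebook's own cells):

    # The 2010 dump never mentions the group, so the token is absent.
    print('babymetal' in oldmodel.wv.vocab)    # expected: False

    try:
        oldmodel.most_similar('babymetal')
    except KeyError as err:
        print(err)    # looking up an out-of-vocabulary word raises KeyError
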
@@ -152,7 +181,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -172,7 +203,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Online update\n", "To use online word2vec feature, set update=True when you use build_vocab using new documents." @@ -182,7 +216,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -197,14 +233,17 @@ "source": [ "%%time\n", "model.build_vocab(newwiki, update=True)\n", - "model.train(newwiki)\n", + "model.train(newwiki, total_examples=model.corpus_count, epochs=model.iter)\n", "model.save('newmodel')\n", "# model = Word2Vec.load('newmodel')" ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### Model Comparison\n", "By the online training, the size of vocaburaries are increased about 3 millions." @@ -214,7 +253,9 @@ "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -233,7 +274,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "#### After online training, the word, \"babymetal\", is added in model. This word is simillar with rock and metal bands." ] @@ -242,7 +286,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -271,7 +317,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## The word, \"Zootopia\", become disney movie through the years.\n", "In the past, the word, \"Zootopia\", was used just for an annual summer concert put on by New York top-40 radio station Z100, so that the word, \"zootopia\", is simillar with music festival.\n", @@ -284,6 +333,8 @@ "execution_count": 11, "metadata": { "collapsed": false, + "deletable": true, + "editable": true, "scrolled": false }, "outputs": [ @@ -343,7 +394,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.12" + "version": "2.7.6" } }, "nbformat": 4, diff --git a/docs/notebooks/word2vec.ipynb b/docs/notebooks/word2vec.ipynb index 4d7344c11c..1f490950fa 100644 --- a/docs/notebooks/word2vec.ipynb +++ b/docs/notebooks/word2vec.ipynb @@ -2,7 +2,10 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "# Word2Vec Tutorial\n", "\n", @@ -19,7 +22,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Preparing the Input\n", "Starting from the beginning, gensim’s `word2vec` expects a sequence of sentences as its input. 
Each sentence a list of words (utf8 strings):" @@ -29,7 +35,9 @@ "cell_type": "code", "execution_count": 1, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -42,7 +50,9 @@ "cell_type": "code", "execution_count": 2, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -61,7 +71,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Keeping the input as a Python built-in list is convenient, but can use up a lot of RAM when the input is large.\n", "\n", @@ -74,7 +87,9 @@ "cell_type": "code", "execution_count": 3, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -96,7 +111,9 @@ "cell_type": "code", "execution_count": 4, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -114,7 +131,9 @@ "cell_type": "code", "execution_count": 5, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -134,7 +153,9 @@ "cell_type": "code", "execution_count": 6, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -154,7 +175,9 @@ "cell_type": "code", "execution_count": 7, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -173,7 +196,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Say we want to further preprocess the words from the files — convert to unicode, lowercase, remove numbers, extract named entities… All of this can be done inside the `MySentences` iterator and `word2vec` doesn’t need to know. 
All that is required is that the input yields one sentence (list of utf8 words) after another.\n", "\n", @@ -188,7 +214,9 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -213,14 +241,17 @@ "# build the same model, making the 2 steps explicit\n", "new_model = gensim.models.Word2Vec(min_count=1) # an empty model, no training\n", "new_model.build_vocab(sentences) # can be a non-repeatable, 1-pass generator \n", - "new_model.train(sentences) # can be a non-repeatable, 1-pass generator" + "new_model.train(sentences, total_examples=new_model.corpus_count, epochs=new_model.iter) \n", + "# can be a non-repeatable, 1-pass generator" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -239,7 +270,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## More data would be nice\n", "For the following examples, we'll use the [Lee Corpus](https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/test/test_data/lee_background.cor) (which you already have if you've installed gensim):" @@ -249,7 +283,9 @@ "cell_type": "code", "execution_count": 10, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -262,7 +298,9 @@ "cell_type": "code", "execution_count": 11, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -287,7 +325,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Training\n", "`Word2Vec` accepts several parameters that affect both training speed and quality.\n", @@ -299,7 +340,9 @@ "cell_type": "code", "execution_count": 12, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -311,7 +354,9 @@ "cell_type": "code", "execution_count": 13, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -321,7 +366,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Bigger size values require more training data, but can lead to better (more accurate) models. Reasonable values are in the tens to hundreds.\n", "\n", @@ -332,7 +380,9 @@ "cell_type": "code", "execution_count": 14, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -350,14 +400,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The `workers` parameter only has an effect if you have [Cython](http://cython.org/) installed. Without Cython, you’ll only be able to use one core because of the [GIL](https://wiki.python.org/moin/GlobalInterpreterLock) (and `word2vec` training will be [miserably slow](http://rare-technologies.com/word2vec-in-python-part-two-optimizing/))." ] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Memory\n", "At its core, `word2vec` model parameters are stored as matrices (NumPy arrays). 
Each array is **#vocabulary** (controlled by min_count parameter) times **#size** (size parameter) of floats (single precision aka 4 bytes).\n", @@ -369,7 +425,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Evaluating\n", "`Word2Vec` training is an unsupervised task, there’s no good way to objectively evaluate the result. Evaluation depends on your end application.\n", @@ -383,7 +442,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Gensim support the same evaluation set, in exactly the same format:" ] @@ -392,7 +454,9 @@ "cell_type": "code", "execution_count": 15, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -630,7 +694,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "This `accuracy` takes an \n", "[optional parameter](http://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec.accuracy) `restrict_vocab` \n", @@ -640,7 +707,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "In the December 2016 release of Gensim we added a better way to evaluate semantic similarity.\n", "\n", @@ -651,7 +721,9 @@ "cell_type": "code", "execution_count": 16, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -673,14 +745,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "Once again, **good performance on Google's or WS-353 test set doesn’t mean word2vec will work well in your application, or vice versa**. It’s always best to evaluate directly on your intended task. For an example of how to use word2vec in a classifier pipeline, see this [tutorial](https://github.com/RaRe-Technologies/movie-plots-by-genre)." 
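
For reference, the two evaluation entry points discussed in this section boil down to the following sketch; the dataset paths are placeholders for wherever `questions-words.txt` and `wordsim353.tsv` live locally.

    # Analogy-style evaluation ("king" - "man" + "woman" ~= "queen"); results are
    # reported per section plus an overall total. restrict_vocab caps the lookup
    # to the most frequent words, as described above.
    analogy_results = model.accuracy('./datasets/questions-words.txt', restrict_vocab=30000)

    # Word-pair similarity against human judgements (WordSim-353). Returns the
    # Pearson and Spearman correlations and the out-of-vocabulary ratio.
    pearson, spearman, oov_ratio = model.wv.evaluate_word_pairs('./datasets/wordsim353.tsv')
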
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Storing and loading models\n", "You can store/load models using the standard gensim methods:" @@ -690,7 +768,9 @@ "cell_type": "code", "execution_count": 17, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -705,7 +785,9 @@ "cell_type": "code", "execution_count": 18, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [ @@ -714,7 +796,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "which uses pickle internally, optionally `mmap`‘ing the model’s internal large NumPy matrices into virtual memory directly from disk files, for inter-process memory sharing.\n", "\n", @@ -727,7 +812,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "## Online training / Resuming training\n", "Advanced users can load a model and continue training it with more sentences and [new vocabulary words](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/online_w2v_tutorial.ipynb):" @@ -737,7 +825,9 @@ "cell_type": "code", "execution_count": 19, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -753,7 +843,7 @@ "more_sentences = [['Advanced', 'users', 'can', 'load', 'a', 'model', 'and', 'continue', \n", " 'training', 'it', 'with', 'more', 'sentences']]\n", "model.build_vocab(more_sentences, update=True)\n", - "model.train(more_sentences, )\n", + "model.train(more_sentences, total_examples=model.corpus_count, epochs=model.iter)\n", "\n", "# cleaning up temp\n", "os.close(fs)\n", @@ -762,7 +852,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "You may need to tweak the `total_words` parameter to `train()`, depending on what learning rate decay you want to simulate.\n", "\n", @@ -776,7 +869,9 @@ "cell_type": "code", "execution_count": 20, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -798,7 +893,9 @@ "cell_type": "code", "execution_count": 21, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -827,7 +924,9 @@ "cell_type": "code", "execution_count": 22, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -846,7 +945,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "You can get the probability distribution for the center word given the context words as input:" ] @@ -855,7 +957,9 @@ "cell_type": "code", "execution_count": 23, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -872,14 +976,20 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "The results here don't look good because the training corpus is very small. To get meaningful results one needs to train on 500k+ words." 
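
For concreteness, the call behind that probability-distribution cell looks something like the line below; the context words are arbitrary placeholder tokens already in the toy vocabulary, and `predict_output_word` relies on the model's default negative-sampling configuration.

    # Probability distribution over centre words given a bag of context words;
    # returns the topn most probable (word, probability) pairs.
    print(model.predict_output_word(['training', 'it', 'with'], topn=10))
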
] }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "If you need the raw output vectors in your application, you can access these either on a word-by-word basis:" ] @@ -888,7 +998,9 @@ "cell_type": "code", "execution_count": 24, "metadata": { - "collapsed": false + "collapsed": false, + "deletable": true, + "editable": true }, "outputs": [ { @@ -927,7 +1039,10 @@ }, { "cell_type": "markdown", - "metadata": {}, + "metadata": { + "deletable": true, + "editable": true + }, "source": [ "…or en-masse as a 2D NumPy matrix from `model.wv.syn0`.\n", "\n", @@ -945,7 +1060,9 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "deletable": true, + "editable": true }, "outputs": [], "source": [] @@ -968,7 +1085,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", - "version": "2.7.10" + "version": "2.7.6" } }, "nbformat": 4, diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index fdf00e430c..a166d17687 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -632,7 +632,7 @@ def __init__(self, documents=None, dm_mean=None, self.comment = comment if documents is not None: self.build_vocab(documents, trim_rule=trim_rule) - self.train(documents) + self.train(documents, total_examples=self.corpus_count, epochs=self.iter) @property def dm(self): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index c29d61126c..d14894ef8a 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -474,8 +474,8 @@ def __init__( if isinstance(sentences, GeneratorType): raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") self.build_vocab(sentences, trim_rule=trim_rule) - self.train(sentences) - + self.train(sentences, total_examples=self.corpus_count, epochs=self.iter, + start_alpha=self.alpha, end_alpha=self.min_alpha) else : if trim_rule is not None : logger.warning("The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part of the model. ") @@ -761,16 +761,23 @@ def _raw_word_count(self, job): """Return the number of words in a given job.""" return sum(len(sentence) for sentence in job) - def train(self, sentences, total_words=None, word_count=0, - total_examples=None, queue_factor=2, report_delay=1.0): + def train(self, sentences, total_examples=None, total_words=None, + epochs=None, start_alpha=None, end_alpha=None, + word_count=0, + queue_factor=2, report_delay=1.0): """ Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples - (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the - sentences are the same as those that were used to initially build the vocabulary. + To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate + progres-percentage logging, either total_examples (count of sentences) or total_words (count of + raw words in sentences) MUST be provided. (If the corpus is the same as was provided to + `build_vocab()`, the count of examples in that corpus will be available in the model's + `corpus_count` property.) 
+ To avoid common mistakes around the model's ability to do multiple training passes itself, an + explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` + is only called once, the model's cached `iter` value should be supplied as `epochs` value. """ if (self.model_trimmed_post_training): raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") @@ -802,18 +809,18 @@ def train(self, sentences, total_words=None, word_count=0, "Instead start with a blank model, scan_vocab on the new corpus, intersect_word2vec_format with the old model, then train.") if total_words is None and total_examples is None: - if self.corpus_count: - total_examples = self.corpus_count - logger.info("expecting %i sentences, matching count from corpus used for vocabulary survey", total_examples) - else: - raise ValueError("you must provide either total_words or total_examples, to enable alpha and progress calculations") + raise ValueError("you must specify either total_examples or total_words, for proper alpha and progress calculations") + if epochs is None: + raise ValueError("you must specify an explict epochs count") + start_alpha = start_alpha or self.alpha + end_alpha = end_alpha or self.min_alpha job_tally = 0 - if self.iter > 1: - sentences = utils.RepeatCorpusNTimes(sentences, self.iter) - total_words = total_words and total_words * self.iter - total_examples = total_examples and total_examples * self.iter + if epochs > 1: + sentences = utils.RepeatCorpusNTimes(sentences, epochs) + total_words = total_words and total_words * epochs + total_examples = total_examples and total_examples * epochs def worker_loop(): """Train the model, lifting lists of sentences from the job_queue.""" @@ -835,7 +842,7 @@ def job_producer(): """Fill jobs queue using the input `sentences` iterator.""" job_batch, batch_size = [], 0 pushed_words, pushed_examples = 0, 0 - next_alpha = self.alpha + next_alpha = start_alpha if next_alpha > self.min_alpha_yet_reached: logger.warn("Effective 'alpha' higher than previous training cycles") self.min_alpha_yet_reached = next_alpha @@ -858,7 +865,7 @@ def job_producer(): job_queue.put((job_batch, next_alpha)) # update the learning rate for the next job - if self.min_alpha < next_alpha: + if end_alpha < next_alpha: if total_examples: # examples-based decay pushed_examples += len(job_batch) @@ -867,8 +874,8 @@ def job_producer(): # words-based decay pushed_words += self._raw_word_count(job_batch) progress = 1.0 * pushed_words / total_words - next_alpha = self.alpha - (self.alpha - self.min_alpha) * progress - next_alpha = max(self.min_alpha, next_alpha) + next_alpha = start_alpha - (start_alpha - end_alpha) * progress + next_alpha = max(end_alpha, next_alpha) # add the sentence that didn't fit as the first item of a new job job_batch, batch_size = [sentence], sentence_length diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 55b9b5f3f0..1cc32f0095 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -183,7 +183,7 @@ def model_sanity(self, model, keep_training=True): if keep_training: model.save(testfile()) loaded = doc2vec.Doc2Vec.load(testfile()) - loaded.train(sentences) + loaded.train(sentences, total_examples=loaded.corpus_count, epochs=loaded.iter) def test_training(self): """Test doc2vec training.""" @@ -191,7 +191,7 @@ def test_training(self): model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1) model.build_vocab(corpus) 
self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100)) - model.train(corpus) + model.train(corpus, total_examples=model.corpus_count, epochs=model.iter) self.model_sanity(model) @@ -347,7 +347,7 @@ def testTrainWarning(self, l): model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, min_count=1, workers=8, size=5) model.build_vocab(sentences) for epoch in range(10): - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) model.alpha -= 0.002 model.min_alpha = model.alpha if epoch == 5: diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index e37968218c..bbb09a21ab 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -103,7 +103,7 @@ def testOnlineLearningAfterSave(self): model_neg = word2vec.Word2Vec.load(testfile()) self.assertTrue(len(model_neg.wv.vocab), 12) model_neg.build_vocab(new_sentences, update=True) - model_neg.train(new_sentences) + model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.iter) self.assertEqual(len(model_neg.wv.vocab), 14) @@ -116,12 +116,12 @@ def onlineSanity(self, model): others.append(l) self.assertTrue(all(['terrorism' not in l for l in others])) model.build_vocab(others) - model.train(others) + model.train(others, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse('terrorism' in model.wv.vocab) model.build_vocab(terro, update=True) self.assertTrue('terrorism' in model.wv.vocab) orig0 = np.copy(model.wv.syn0) - model.train(terro) + model.train(terro, total_examples=len(terro), epochs=model.iter) self.assertFalse(np.allclose(model.wv.syn0, orig0)) sim = model.n_similarity(['war'], ['terrorism']) self.assertLess(0., sim) @@ -363,7 +363,7 @@ def testTraining(self): self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -399,7 +399,7 @@ def testLocking(self): # lock the vector in slot 0 against change model.syn0_lockf[0] = 0.0 - model.train(corpus) + model.train(corpus, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse((unlocked1 == model.wv.syn0[1]).all()) # unlocked vector should vary self.assertTrue((locked0 == model.wv.syn0[0]).all()) # locked vector should not vary @@ -428,7 +428,7 @@ def model_sanity(self, model, train=True): if train: model.build_vocab(list_corpus) orig0 = np.copy(model.wv.syn0[0]) - model.train(list_corpus) + model.train(list_corpus, total_examples=model.corpus_count, epochs=model.iter) self.assertFalse((orig0 == model.wv.syn0[1]).all()) # vector should vary after training sims = model.most_similar('war', topn=len(model.wv.index2word)) t_rank = [word for word, score in sims].index('terrorism') @@ -481,7 +481,7 @@ def testTrainingCbow(self): self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -504,7 +504,7 @@ def testTrainingSgNegative(self): self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) - model.train(sentences) + 
model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -527,7 +527,7 @@ def testTrainingCbowNegative(self): self.assertTrue(model.wv.syn0.shape == (len(model.wv.vocab), 2)) self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) sims = model.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -546,7 +546,7 @@ def testSimilarities(self): # The model is trained using CBOW model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) self.assertTrue(model.n_similarity(['graph', 'trees'], ['trees', 'graph'])) self.assertTrue(model.n_similarity(['graph'], ['trees']) == model.similarity('graph', 'trees')) @@ -660,7 +660,7 @@ def testTrainWarning(self, l): model = word2vec.Word2Vec(min_count=1) model.build_vocab(sentences) for epoch in range(10): - model.train(sentences) + model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) model.alpha -= 0.002 model.min_alpha = model.alpha if epoch == 5: @@ -668,6 +668,17 @@ def testTrainWarning(self, l): warning = "Effective 'alpha' higher than previous training cycles" self.assertTrue(warning in str(l)) + def test_train_with_explicit_param(self): + model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model.build_vocab(sentences) + with self.assertRaises(ValueError): + model.train(sentences, total_examples=model.corpus_count) + + with self.assertRaises(ValueError): + model.train(sentences, epochs=model.iter) + + with self.assertRaises(ValueError): + model.train(sentences) def test_sentences_should_not_be_a_generator(self): """ Is sentences a generator object? @@ -682,11 +693,11 @@ def testLoadOnClassError(self): def test_reset_from(self): """Test if reset_from() uses pre-built structures from other model""" model = word2vec.Word2Vec(sentences, min_count=1) - other_model = word2vec.Word2Vec(new_sentences, min_count=1) + other_model = word2vec.Word2Vec(new_sentences, min_count=1) other_vocab = other_model.wv.vocab model.reset_from(other_model) self.assertEqual(model.wv.vocab, other_vocab) - + #endclass TestWord2VecModel
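
Finally, the contract that the new `test_train_with_explicit_param` pins down can be exercised directly. The snippet below is a standalone sketch (its toy corpus is illustrative, not the test module's fixture): `train()` now refuses to guess the corpus size or the number of passes.

    from gensim.models import word2vec

    sentences = [['first', 'tiny', 'sentence'], ['second', 'tiny', 'sentence']]

    model = word2vec.Word2Vec(size=10, min_count=1)
    model.build_vocab(sentences)

    # The supported call: corpus size and epoch count given explicitly.
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)

    # Each of these omits required information and now raises ValueError
    # instead of silently falling back to cached or default values.
    for bad_call in (
            lambda: model.train(sentences, total_examples=model.corpus_count),  # no epochs
            lambda: model.train(sentences, epochs=model.iter),                  # no total_examples/total_words
            lambda: model.train(sentences),                                     # neither
    ):
        try:
            bad_call()
        except ValueError as err:
            print(err)
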