diff --git a/docs/notebooks/downloader_api_tutorial.ipynb b/docs/notebooks/downloader_api_tutorial.ipynb new file mode 100644 index 0000000000..73eba4475b --- /dev/null +++ b/docs/notebooks/downloader_api_tutorial.ipynb @@ -0,0 +1,495 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Tutorial for using Gensim's API for downloading corpuses/models\n", + "Let's start by importing the api module." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import logging\n", + "import gensim.downloader as api\n", + "\n", + "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, lets download the text8 corpus and load it to memory (automatically)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[==================================================] 100.0% 31.6/31.6MB downloaded\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-10 14:49:45,787 : INFO : text8 downloaded\n" + ] + } + ], + "source": [ + "corpus = api.load('text8')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As the corpus has been downloaded and loaded, let's create a word2vec model of our corpus." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-10 14:50:02,458 : INFO : collecting all words and their counts\n", + "2017-11-10 14:50:02,461 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types\n", + "2017-11-10 14:50:08,402 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences\n", + "2017-11-10 14:50:08,403 : INFO : Loading a fresh vocabulary\n", + "2017-11-10 14:50:08,693 : INFO : min_count=5 retains 71290 unique words (28% of original 253854, drops 182564)\n", + "2017-11-10 14:50:08,694 : INFO : min_count=5 leaves 16718844 word corpus (98% of original 17005207, drops 286363)\n", + "2017-11-10 14:50:08,870 : INFO : deleting the raw counts dictionary of 253854 items\n", + "2017-11-10 14:50:08,898 : INFO : sample=0.001 downsamples 38 most-common words\n", + "2017-11-10 14:50:08,899 : INFO : downsampling leaves estimated 12506280 word corpus (74.8% of prior 16718844)\n", + "2017-11-10 14:50:08,900 : INFO : estimated required memory for 71290 words and 100 dimensions: 92677000 bytes\n", + "2017-11-10 14:50:09,115 : INFO : resetting layer weights\n", + "2017-11-10 14:50:09,703 : INFO : training model with 3 workers on 71290 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5\n", + "2017-11-10 14:50:10,718 : INFO : PROGRESS: at 1.66% examples, 1020519 words/s, in_qsize 5, out_qsize 0\n", + "2017-11-10 14:50:11,715 : INFO : PROGRESS: at 3.29% examples, 1017921 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:12,715 : INFO : PROGRESS: at 4.71% examples, 976739 words/s, in_qsize 4, out_qsize 0\n", + "2017-11-10 14:50:13,729 : INFO : PROGRESS: at 6.35% examples, 989118 words/s, in_qsize 4, out_qsize 1\n", + "2017-11-10 14:50:14,729 : INFO : PROGRESS: at 8.02% examples, 999982 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:15,734 : INFO : PROGRESS: at 9.65% examples, 1003821 words/s, in_qsize 1, out_qsize 1\n", + "2017-11-10 14:50:16,740 : INFO : 
PROGRESS: at 11.41% examples, 1017517 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:17,738 : INFO : PROGRESS: at 13.17% examples, 1027943 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:18,740 : INFO : PROGRESS: at 14.80% examples, 1027654 words/s, in_qsize 4, out_qsize 0\n", + "2017-11-10 14:50:19,744 : INFO : PROGRESS: at 16.53% examples, 1030328 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:20,747 : INFO : PROGRESS: at 18.21% examples, 1032126 words/s, in_qsize 0, out_qsize 1\n", + "2017-11-10 14:50:21,750 : INFO : PROGRESS: at 19.85% examples, 1030455 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:22,755 : INFO : PROGRESS: at 21.54% examples, 1031582 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:23,760 : INFO : PROGRESS: at 23.20% examples, 1031237 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:24,764 : INFO : PROGRESS: at 24.84% examples, 1031195 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:25,769 : INFO : PROGRESS: at 26.56% examples, 1034213 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:26,771 : INFO : PROGRESS: at 28.14% examples, 1031534 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:27,777 : INFO : PROGRESS: at 29.82% examples, 1032589 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:28,780 : INFO : PROGRESS: at 31.42% examples, 1030998 words/s, in_qsize 1, out_qsize 0\n", + "2017-11-10 14:50:29,783 : INFO : PROGRESS: at 33.15% examples, 1033447 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:30,783 : INFO : PROGRESS: at 34.85% examples, 1035303 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:31,789 : INFO : PROGRESS: at 36.50% examples, 1033770 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:32,795 : INFO : PROGRESS: at 38.17% examples, 1034073 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:33,798 : INFO : PROGRESS: at 39.81% examples, 1033387 words/s, in_qsize 2, out_qsize 0\n", + "2017-11-10 14:50:34,800 : INFO : PROGRESS: at 41.33% examples, 1029575 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:35,801 : INFO : PROGRESS: at 43.03% examples, 1030736 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:36,801 : INFO : PROGRESS: at 44.70% examples, 1031367 words/s, in_qsize 0, out_qsize 1\n", + "2017-11-10 14:50:37,802 : INFO : PROGRESS: at 46.41% examples, 1032986 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:38,805 : INFO : PROGRESS: at 48.09% examples, 1033731 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:39,807 : INFO : PROGRESS: at 49.82% examples, 1035440 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:40,817 : INFO : PROGRESS: at 51.49% examples, 1035681 words/s, in_qsize 3, out_qsize 0\n", + "2017-11-10 14:50:41,811 : INFO : PROGRESS: at 53.16% examples, 1036024 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:42,817 : INFO : PROGRESS: at 54.86% examples, 1036910 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:43,820 : INFO : PROGRESS: at 56.51% examples, 1035966 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:44,822 : INFO : PROGRESS: at 58.07% examples, 1034360 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:45,822 : INFO : PROGRESS: at 59.54% examples, 1030906 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:46,823 : INFO : PROGRESS: at 61.12% examples, 1029543 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:47,827 : INFO : PROGRESS: at 62.77% examples, 1029390 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:48,833 : INFO : 
PROGRESS: at 64.50% examples, 1030528 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:49,833 : INFO : PROGRESS: at 66.15% examples, 1030820 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:50,836 : INFO : PROGRESS: at 67.83% examples, 1031459 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:51,850 : INFO : PROGRESS: at 69.47% examples, 1030985 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:52,857 : INFO : PROGRESS: at 71.18% examples, 1031954 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:53,862 : INFO : PROGRESS: at 72.83% examples, 1031823 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:54,864 : INFO : PROGRESS: at 74.46% examples, 1031628 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:55,866 : INFO : PROGRESS: at 76.17% examples, 1031962 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:56,870 : INFO : PROGRESS: at 77.77% examples, 1031167 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:57,875 : INFO : PROGRESS: at 79.37% examples, 1030337 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:58,880 : INFO : PROGRESS: at 80.99% examples, 1029831 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:50:59,881 : INFO : PROGRESS: at 82.67% examples, 1030029 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:00,881 : INFO : PROGRESS: at 84.39% examples, 1030874 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:01,886 : INFO : PROGRESS: at 86.03% examples, 1030988 words/s, in_qsize 2, out_qsize 0\n", + "2017-11-10 14:51:02,892 : INFO : PROGRESS: at 87.72% examples, 1031570 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:03,895 : INFO : PROGRESS: at 89.41% examples, 1031964 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:04,902 : INFO : PROGRESS: at 91.09% examples, 1032271 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:05,910 : INFO : PROGRESS: at 92.53% examples, 1029888 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:06,912 : INFO : PROGRESS: at 94.03% examples, 1028192 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:07,916 : INFO : PROGRESS: at 95.74% examples, 1028660 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:08,919 : INFO : PROGRESS: at 97.47% examples, 1029434 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:09,923 : INFO : PROGRESS: at 99.18% examples, 1029952 words/s, in_qsize 0, out_qsize 0\n", + "2017-11-10 14:51:10,409 : INFO : worker thread finished; awaiting finish of 2 more threads\n", + "2017-11-10 14:51:10,409 : INFO : worker thread finished; awaiting finish of 1 more threads\n", + "2017-11-10 14:51:10,415 : INFO : worker thread finished; awaiting finish of 0 more threads\n", + "2017-11-10 14:51:10,416 : INFO : training on 85026035 raw words (62530433 effective words) took 60.7s, 1029968 effective words/s\n" + ] + } + ], + "source": [ + "from gensim.models.word2vec import Word2Vec\n", + "\n", + "model = Word2Vec(corpus)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that we have our word2vec model, let's find words that are similar to 'tree'" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-10 14:51:10,422 : INFO : precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "[(u'trees', 0.7245415449142456),\n", + " (u'leaf', 0.6882676482200623),\n", + " (u'bark', 0.645646333694458),\n", + " (u'avl', 0.6076173782348633),\n", + " 
(u'cactus', 0.6019535064697266),\n", + " (u'flower', 0.6010029315948486),\n", + " (u'fruit', 0.5908031463623047),\n", + " (u'bird', 0.5886812806129456),\n", + " (u'leaves', 0.5771278142929077),\n", + " (u'pond', 0.5627825856208801)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model.most_similar('tree')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can use the API to download many corpora and models. You can get the list of all the models and corpora that are provided, by using the code below:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"models\": {\n", + " \"glove-twitter-25\": {\n", + " \"description\": \"Pre-trained vectors, 2B tweets, 27B tokens, 1.2M vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 25\", \n", + " \"file_name\": \"glove-twitter-25.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-twitter-25.txt`\", \n", + " \"checksum\": \"50db0211d7e7a2dcd362c6b774762793\"\n", + " }, \n", + " \"glove-twitter-100\": {\n", + " \"description\": \"Pre-trained vectors, 2B tweets, 27B tokens, 1.2M vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 100\", \n", + " \"file_name\": \"glove-twitter-100.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-twitter-100.txt`\", \n", + " \"checksum\": \"b04f7bed38756d64cf55b58ce7e97b15\"\n", + " }, \n", + " \"glove-wiki-gigaword-100\": {\n", + " \"description\": \"Pre-trained vectors ,Wikipedia 2014 + Gigaword 5,6B tokens, 400K vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 100\", \n", + " \"file_name\": \"glove-wiki-gigaword-100.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-wiki-gigaword-100.txt`\", \n", + " \"checksum\": \"40ec481866001177b8cd4cb0df92924f\"\n", + " }, \n", + " \"glove-twitter-200\": {\n", + " \"description\": \"Pre-trained vectors, 2B tweets, 27B tokens, 1.2M vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 200\", \n", + " \"file_name\": \"glove-twitter-200.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-twitter-200.txt`\", \n", + " \"checksum\": \"e52e8392d1860b95d5308a525817d8f9\"\n", + " }, \n", + " \"glove-wiki-gigaword-50\": {\n", + " \"description\": \"Pre-trained vectors ,Wikipedia 2014 + Gigaword 5,6B tokens, 400K vocab, uncased. 
https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimension = 50\", \n", + " \"file_name\": \"glove-wiki-gigaword-50.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-wiki-gigaword-50.txt`\", \n", + " \"checksum\": \"c289bc5d7f2f02c6dc9f2f9b67641813\"\n", + " }, \n", + " \"glove-twitter-50\": {\n", + " \"description\": \"Pre-trained vectors, 2B tweets, 27B tokens, 1.2M vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 50\", \n", + " \"file_name\": \"glove-twitter-50.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-twitter-50.txt`\", \n", + " \"checksum\": \"c168f18641f8c8a00fe30984c4799b2b\"\n", + " }, \n", + " \"__testing_word2vec-matrix-synopsis\": {\n", + " \"description\": \"Word vecrors of the movie matrix\", \n", + " \"parameters\": \"dimentions = 50\", \n", + " \"file_name\": \"__testing_word2vec-matrix-synopsis.gz\", \n", + " \"papers\": \"\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v using a preprocessed corpus. Converted to w2v format with `python3.5 -m gensim.models.word2vec -train -iter 50 -output `\", \n", + " \"checksum\": \"534dcb8b56a360977a269b7bfc62d124\"\n", + " }, \n", + " \"glove-wiki-gigaword-200\": {\n", + " \"description\": \"Pre-trained vectors ,Wikipedia 2014 + Gigaword 5,6B tokens, 400K vocab, uncased. https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimentions = 200\", \n", + " \"file_name\": \"glove-wiki-gigaword-200.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-wiki-gigaword-200.txt`\", \n", + " \"checksum\": \"59652db361b7a87ee73834a6c391dfc1\"\n", + " }, \n", + " \"word2vec-google-news-300\": {\n", + " \"description\": \"Pre-trained vectors trained on part of Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases. The phrases were obtained using a simple data-driven approach described in 'Distributed Representations of Words and Phrases and their Compositionality', https://code.google.com/archive/p/word2vec/\", \n", + " \"parameters\": \"dimension = 300\", \n", + " \"file_name\": \"word2vec-google-news-300.gz\", \n", + " \"papers\": \"https://arxiv.org/abs/1301.3781, https://arxiv.org/abs/1310.4546, https://www.microsoft.com/en-us/research/publication/linguistic-regularities-in-continuous-space-word-representations/?from=http%3A%2F%2Fresearch.microsoft.com%2Fpubs%2F189726%2Frvecs.pdf\", \n", + " \"parts\": 1, \n", + " \"checksum\": \"a5e5354d40acb95f9ec66d5977d140ef\"\n", + " }, \n", + " \"glove-wiki-gigaword-300\": {\n", + " \"description\": \"Pre-trained vectors, Wikipedia 2014 + Gigaword 5, 6B tokens, 400K vocab, uncased. 
https://nlp.stanford.edu/projects/glove/\", \n", + " \"parameters\": \"dimensions = 300\", \n", + " \"file_name\": \"glove-wiki-gigaword-300.gz\", \n", + " \"papers\": \"https://nlp.stanford.edu/pubs/glove.pdf\", \n", + " \"parts\": 1, \n", + " \"preprocessing\": \"Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i -o glove-wiki-gigaword-300.txt`\", \n", + " \"checksum\": \"29e9329ac2241937d55b852e8284e89b\"\n", + " }\n", + " }, \n", + " \"corpora\": {\n", + " \"__testing_matrix-synopsis\": {\n", + " \"source\": \"http://www.imdb.com/title/tt0133093/plotsummary?ref_=ttpl_pl_syn#synopsis\", \n", + " \"checksum\": \"1767ac93a089b43899d54944b07d9dc5\", \n", + " \"parts\": 1, \n", + " \"description\": \"Synopsis of the movie matrix\", \n", + " \"file_name\": \"__testing_matrix-synopsis.gz\"\n", + " }, \n", + " \"fake-news\": {\n", + " \"source\": \"Kaggle\", \n", + " \"checksum\": \"5e64e942df13219465927f92dcefd5fe\", \n", + " \"parts\": 1, \n", + " \"description\": \"It contains text and metadata scraped from 244 websites tagged as 'bullshit' here by the BS Detector Chrome Extension by Daniel Sieradski.\", \n", + " \"file_name\": \"fake-news.gz\"\n", + " }, \n", + " \"__testing_multipart-matrix-synopsis\": {\n", + " \"description\": \"Synopsis of the movie matrix\", \n", + " \"source\": \"http://www.imdb.com/title/tt0133093/plotsummary?ref_=ttpl_pl_syn#synopsis\", \n", + " \"file_name\": \"__testing_multipart-matrix-synopsis.gz\", \n", + " \"checksum-0\": \"c8b0c7d8cf562b1b632c262a173ac338\", \n", + " \"checksum-1\": \"5ff7fc6818e9a5d9bc1cf12c35ed8b96\", \n", + " \"checksum-2\": \"966db9d274d125beaac7987202076cba\", \n", + " \"parts\": 3\n", + " }, \n", + " \"text8\": {\n", + " \"source\": \"http://mattmahoney.net/dc/text8.zip\", \n", + " \"checksum\": \"68799af40b6bda07dfa47a32612e5364\", \n", + " \"parts\": 1, \n", + " \"description\": \"Cleaned small sample from wikipedia\", \n", + " \"file_name\": \"text8.gz\"\n", + " }, \n", + " \"wiki-en\": {\n", + " \"description\": \"Extracted Wikipedia dump from October 2017. 
Produced by `python -m gensim.scripts.segment_wiki -f enwiki-20171001-pages-articles.xml.bz2 -o wiki-en.gz`\", \n", + " \"source\": \"https://dumps.wikimedia.org/enwiki/20171001/\", \n", + " \"file_name\": \"wiki-en.gz\", \n", + " \"parts\": 4, \n", + " \"checksum-0\": \"a7d7d7fd41ea7e2d7fa32ec1bb640d71\", \n", + " \"checksum-1\": \"b2683e3356ffbca3b6c2dca6e9801f9f\", \n", + " \"checksum-2\": \"c5cde2a9ae77b3c4ebce804f6df542c2\", \n", + " \"checksum-3\": \"00b71144ed5e3aeeb885de84f7452b81\"\n", + " }, \n", + " \"20-newsgroups\": {\n", + " \"source\": \"http://qwone.com/~jason/20Newsgroups/\", \n", + " \"checksum\": \"c92fd4f6640a86d5ba89eaad818a9891\", \n", + " \"parts\": 1, \n", + " \"description\": \"The 20 Newsgroups data set is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups\", \n", + " \"file_name\": \"20-newsgroups.gz\"\n", + " }\n", + " }\n", + "}\n" + ] + } + ], + "source": [ + "import json\n", + "data_list = api.info()\n", + "print(json.dumps(data_list, indent=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to get detailed information about the model/corpus, use:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n", + " \"source\": \"Kaggle\", \n", + " \"checksum\": \"5e64e942df13219465927f92dcefd5fe\", \n", + " \"parts\": 1, \n", + " \"description\": \"It contains text and metadata scraped from 244 websites tagged as 'bullshit' here by the BS Detector Chrome Extension by Daniel Sieradski.\", \n", + " \"file_name\": \"fake-news.gz\"\n", + "}\n" + ] + } + ], + "source": [ + "fake_news_info = api.info('fake-news')\n", + "print(json.dumps(fake_news_info, indent=4))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Sometimes, you do not want to load the model to memory. You would just want to get the path to the model. 
For that, use :" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/home/ivan/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n" + ] + } + ], + "source": [ + "print(api.load('glove-wiki-gigaword-50', return_path=True))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to load the model to memory, then:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2017-11-10 14:51:59,199 : INFO : loading projection weights from /home/ivan/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2017-11-10 14:52:18,380 : INFO : loaded (400000, 50) matrix from /home/ivan/gensim-data/glove-wiki-gigaword-50/glove-wiki-gigaword-50.gz\n", + "2017-11-10 14:52:18,405 : INFO : precomputing L2-norms of word weight vectors\n" + ] + }, + { + "data": { + "text/plain": [ + "[(u'plastic', 0.7942505478858948),\n", + " (u'metal', 0.770871639251709),\n", + " (u'walls', 0.7700636386871338),\n", + " (u'marble', 0.7638524174690247),\n", + " (u'wood', 0.7624281048774719),\n", + " (u'ceramic', 0.7602593302726746),\n", + " (u'pieces', 0.7589111924171448),\n", + " (u'stained', 0.7528817057609558),\n", + " (u'tile', 0.748193621635437),\n", + " (u'furniture', 0.746385931968689)]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "model = api.load(\"glove-wiki-gigaword-50\")\n", + "model.most_similar(\"glass\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In corpora, the corpus is never loaded to memory, all corpuses wrapped to special class `Dataset` and provide `__iter__` method" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index 3538dca954..b781ef16d6 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -11,6 +11,7 @@ Modules: interfaces utils matutils + downloader corpora/bleicorpus corpora/csvcorpus corpora/dictionary diff --git a/docs/src/downloader.rst b/docs/src/downloader.rst new file mode 100644 index 0000000000..901a3748f3 --- /dev/null +++ b/docs/src/downloader.rst @@ -0,0 +1,9 @@ +:mod:`downloader` -- Downloader API for gensim +============================================== + +.. automodule:: gensim.downloader + :synopsis: Downloader API for gensim + :members: + :inherited-members: + :undoc-members: + :show-inheritance: diff --git a/gensim/downloader.py b/gensim/downloader.py new file mode 100644 index 0000000000..f8521a8747 --- /dev/null +++ b/gensim/downloader.py @@ -0,0 +1,435 @@ +""" +This module is an API for downloading, getting information and loading datasets/models. 
+ +Give information about available models/datasets: + +>>> import gensim.downloader as api +>>> +>>> api.info() # return dict with info about available models/datasets +>>> api.info("text8") # return dict with info about "text8" dataset + + +Model example: + +>>> import gensim.downloader as api +>>> +>>> model = api.load("glove-twitter-25") # load glove vectors +>>> model.most_similar("cat") # show words that similar to word 'cat' + + +Dataset example: + +>>> import gensim.downloader as api +>>> from gensim.models import Word2Vec +>>> +>>> dataset = api.load("text8") # load dataset as iterable +>>> model = Word2Vec(dataset) # train w2v model + + +Also, this API available via CLI:: + + python -m gensim.downloader --info # same as api.info(dataname) + python -m gensim.downloader --download # same as api.load(dataname, return_path=True) + +""" +from __future__ import absolute_import +import argparse +import os +import json +import logging +import sys +import errno +import hashlib +import math +import shutil +import tempfile +from functools import partial + +if sys.version_info[0] == 2: + import urllib + from urllib2 import urlopen +else: + import urllib.request as urllib + from urllib.request import urlopen + +user_dir = os.path.expanduser('~') +base_dir = os.path.join(user_dir, 'gensim-data') +logger = logging.getLogger('gensim.api') + +DATA_LIST_URL = "https://raw.githubusercontent.com/RaRe-Technologies/gensim-data/master/list.json" + + +def _progress(chunks_downloaded, chunk_size, total_size, part=1, total_parts=1): + """Reporthook for :func:`urllib.urlretrieve`. + + Parameters + ---------- + chunks_downloaded : int + Number of chunks of data that have been downloaded. + chunk_size : int + Size of each chunk of data. + total_size : int + Total size of the dataset/model. + part : int, optional + Number of current part, used only if `no_parts` > 1. + total_parts : int, optional + Total number of parts. + + + References + ---------- + [1] https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 + + """ + bar_len = 50 + size_downloaded = float(chunks_downloaded * chunk_size) + filled_len = int(math.floor((bar_len * size_downloaded) / total_size)) + percent_downloaded = round(((size_downloaded * 100) / total_size), 1) + bar = '=' * filled_len + '-' * (bar_len - filled_len) + if total_parts == 1: + sys.stdout.write( + '\r[%s] %s%s %s/%sMB downloaded' % ( + bar, percent_downloaded, "%", + round(size_downloaded / (1024 * 1024), 1), + round(float(total_size) / (1024 * 1024), 1)) + ) + sys.stdout.flush() + else: + sys.stdout.write( + '\r Part %s/%s [%s] %s%s %s/%sMB downloaded' % ( + part + 1, total_parts, bar, percent_downloaded, "%", + round(size_downloaded / (1024 * 1024), 1), + round(float(total_size) / (1024 * 1024), 1)) + ) + sys.stdout.flush() + + +def _create_base_dir(): + """Create the gensim-data directory in home directory, if it has not been already created. + + Raises + ------ + Exception + An exception is raised when read/write permissions are not available or a file named gensim-data + already exists in the home directory. + + """ + if not os.path.isdir(base_dir): + try: + logger.info("Creating %s", base_dir) + os.makedirs(base_dir) + except OSError as e: + if e.errno == errno.EEXIST: + raise Exception( + "Not able to create folder gensim-data in {}. File gensim-data " + "exists in the direcory already.".format(user_dir) + ) + else: + raise Exception( + "Can't create {}. 
Make sure you have the read/write permissions " + "to the directory or you can try creating the folder manually" + .format(base_dir) + ) + + +def _calculate_md5_checksum(fname): + """Calculate the checksum of the file, exactly same as md5-sum linux util. + + Parameters + ---------- + fname : str + Path to the file. + + Returns + ------- + str + MD5-hash of file names as `fname`. + + """ + hash_md5 = hashlib.md5() + with open(fname, "rb") as f: + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + return hash_md5.hexdigest() + + +def info(name=None): + """Provide the information related to model/dataset. + + Parameters + ---------- + name : str, optional + Name of model/dataset. + + Returns + ------- + dict + Detailed information about one or all models/datasets. + If name is specified, return full information about concrete dataset/model, + otherwise, return information about all available datasets/models. + + Raises + ------ + Exception + If name that has been passed is incorrect. + + Examples + -------- + >>> import gensim.downloader as api + >>> api.info("text8") # retrieve information about text8 dataset + {u'checksum': u'68799af40b6bda07dfa47a32612e5364', + u'description': u'Cleaned small sample from wikipedia', + u'file_name': u'text8.gz', + u'parts': 1, + u'source': u'http://mattmahoney.net/dc/text8.zip'} + >>> + >>> api.info() # retrieve information about all available datasets and models + + """ + information = json.loads(urlopen(DATA_LIST_URL).read().decode("utf-8")) + + if name is not None: + corpora = information['corpora'] + models = information['models'] + if name in corpora: + return information['corpora'][name] + elif name in models: + return information['models'][name] + else: + raise ValueError("Incorrect model/corpus name") + else: + return information + + +def _get_checksum(name, part=None): + """Retrieve the checksum of the model/dataset from gensim-data repository. + + Parameters + ---------- + name : str + Dataset/model name. + part : int, optional + Number of part (for multipart data only). + + Returns + ------- + str + Retrieved checksum of dataset/model. + + """ + information = info() + corpora = information['corpora'] + models = information['models'] + if part is None: + if name in corpora: + return information['corpora'][name]["checksum"] + elif name in models: + return information['models'][name]["checksum"] + else: + if name in corpora: + return information['corpora'][name]["checksum-{}".format(part)] + elif name in models: + return information['models'][name]["checksum-{}".format(part)] + + +def _get_parts(name): + """Retrieve the number of parts in which dataset/model has been split. + + Parameters + ---------- + name: str + Dataset/model name. + + Returns + ------- + int + Number of parts in which dataset/model has been split. + + """ + information = info() + corpora = information['corpora'] + models = information['models'] + if name in corpora: + return information['corpora'][name]["parts"] + elif name in models: + return information['models'][name]["parts"] + + +def _download(name): + """Download and extract the dataset/model. + + Parameters + ---------- + name: str + Dataset/model name which has to be downloaded. + + Raises + ------ + Exception + If md5sum on client and in repo are different. 
+ + """ + url_load_file = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/__init__.py".format(f=name) + data_folder_dir = os.path.join(base_dir, name) + tmp_dir = tempfile.mkdtemp() + init_path = os.path.join(tmp_dir, "__init__.py") + urllib.urlretrieve(url_load_file, init_path) + total_parts = _get_parts(name) + if total_parts > 1: + concatenated_folder_name = "{f}.gz".format(f=name) + concatenated_folder_dir = os.path.join(tmp_dir, concatenated_folder_name) + for part in range(0, total_parts): + url_data = \ + "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.gz_0{p}" \ + .format(f=name, p=part) + + fname = "{f}.gz_0{p}".format(f=name, p=part) + dst_path = os.path.join(tmp_dir, fname) + urllib.urlretrieve( + url_data, dst_path, + reporthook=partial(_progress, part=part, total_parts=total_parts) + ) + if _calculate_md5_checksum(dst_path) == _get_checksum(name, part): + sys.stdout.write("\n") + sys.stdout.flush() + logger.info("Part %s/%s downloaded", part + 1, total_parts) + else: + shutil.rmtree(tmp_dir) + raise Exception("Checksum comparison failed, try again") + with open(concatenated_folder_dir, 'wb') as wfp: + for part in range(0, total_parts): + part_path = os.path.join(tmp_dir, "{f}.gz_0{p}".format(f=name, p=part)) + with open(part_path, "rb") as rfp: + shutil.copyfileobj(rfp, wfp) + os.remove(part_path) + os.rename(tmp_dir, data_folder_dir) + else: + url_data = "https://github.com/RaRe-Technologies/gensim-data/releases/download/{f}/{f}.gz".format(f=name) + fname = "{f}.gz".format(f=name) + dst_path = os.path.join(tmp_dir, fname) + urllib.urlretrieve(url_data, dst_path, reporthook=_progress) + if _calculate_md5_checksum(dst_path) == _get_checksum(name): + sys.stdout.write("\n") + sys.stdout.flush() + logger.info("%s downloaded", name) + else: + shutil.rmtree(tmp_dir) + raise Exception("Checksum comparison failed, try again") + os.rename(tmp_dir, data_folder_dir) + + +def _get_filename(name): + """Retrieve the filename of the dataset/model. + + Parameters + ---------- + name: str + Name of dataset/model. + + Returns + ------- + str: + Filename of the dataset/model. + + """ + information = info() + corpora = information['corpora'] + models = information['models'] + if name in corpora: + return information['corpora'][name]["file_name"] + elif name in models: + return information['models'][name]["file_name"] + + +def load(name, return_path=False): + """Download (if needed) dataset/model and load it to memory (unless `return_path` is set). + + Parameters + ---------- + name: str + Name of the model/dataset. + return_path: bool, optional + If True, return full path to file, otherwise, return loaded model / iterable dataset. + + Returns + ------- + Model + Requested model, if `name` is model and `return_path` == False. + Dataset (iterable) + Requested dataset, if `name` is dataset and `return_path` == False. + str + Path to file with dataset / model, only when `return_path` == True. + + Raises + ------ + Exception + Raised if `name` is incorrect. + + Examples + -------- + Model example: + + >>> import gensim.downloader as api + >>> + >>> model = api.load("glove-twitter-25") # load glove vectors + >>> model.most_similar("cat") # show words that similar to word 'cat' + + Dataset example: + + >>> import gensim.downloader as api + >>> + >>> wiki = api.load("wiki-en") # load extracted Wikipedia dump, around 6 Gb + >>> for article in wiki: # iterate over all wiki script + >>> ... 
+ + Download only example + >>> import gensim.downloader as api + >>> + >>> print(api.load("wiki-en", return_path=True)) # output: /home/user/gensim-data/wiki-en/wiki-en.gz + + """ + _create_base_dir() + file_name = _get_filename(name) + if file_name is None: + raise ValueError("Incorrect model/corpus name") + folder_dir = os.path.join(base_dir, name) + path = os.path.join(folder_dir, file_name) + if not os.path.exists(folder_dir): + _download(name) + + if return_path: + return path + else: + sys.path.insert(0, base_dir) + module = __import__(name) + return module.load_data() + + +if __name__ == '__main__': + logging.basicConfig( + format='%(asctime)s :%(name)s :%(levelname)s :%(message)s', stream=sys.stdout, level=logging.INFO + ) + parser = argparse.ArgumentParser( + description="Gensim console API", + usage="python -m gensim.api.downloader [-h] [-d data_name | -i data_name | -c]" + ) + + group = parser.add_mutually_exclusive_group() + group.add_argument( + "-d", "--download", metavar="data_name", nargs=1, + help="To download a corpus/model : python -m gensim.downloader -d " + ) + + full_information = 1 + group.add_argument( + "-i", "--info", metavar="data_name", nargs='?', const=full_information, + help="To get information about a corpus/model : python -m gensim.downloader -i " + ) + + args = parser.parse_args() + if args.download is not None: + data_path = load(args.download[0], return_path=True) + logger.info("Data has been installed and data path is %s", data_path) + elif args.info is not None: + output = info() if (args.info == full_information) else info(name=args.info) + print(json.dumps(output, indent=4)) diff --git a/gensim/test/test_api.py b/gensim/test/test_api.py new file mode 100644 index 0000000000..7cb7b8d202 --- /dev/null +++ b/gensim/test/test_api.py @@ -0,0 +1,75 @@ +import logging +import unittest +import os +import gensim.downloader as api +from gensim.downloader import base_dir +import shutil +import numpy as np + + +class TestApi(unittest.TestCase): + def test_base_dir_creation(self): + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + api._create_base_dir() + self.assertTrue(os.path.isdir(base_dir)) + os.rmdir(base_dir) + + def test_load_dataset(self): + dataset_path = os.path.join(base_dir, "__testing_matrix-synopsis", "__testing_matrix-synopsis.gz") + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(api.load("__testing_matrix-synopsis", return_path=True), dataset_path) + shutil.rmtree(base_dir) + self.assertEqual(len(list(api.load("__testing_matrix-synopsis"))), 1) + shutil.rmtree(base_dir) + + def test_load_model(self): + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + vector_dead = np.array([ + 0.17403787, -0.10167074, -0.00950371, -0.10367849, -0.14034484, + -0.08751217, 0.10030612, 0.07677923, -0.32563496, 0.01929072, + 0.20521086, -0.1617067, 0.00475458, 0.21956187, -0.08783089, + -0.05937332, 0.26528183, -0.06771874, -0.12369668, 0.12020949, + 0.28731, 0.36735833, 0.28051138, -0.10407482, 0.2496888, + -0.19372769, -0.28719661, 0.11989869, -0.00393865, -0.2431484, + 0.02725661, -0.20421691, 0.0328669, -0.26947051, -0.08068217, + -0.10245913, 0.1170633, 0.16583319, 0.1183883, -0.11217165, + 0.1261425, -0.0319365, -0.15787181, 0.03753783, 0.14748634, + 0.00414471, -0.02296237, 0.18336892, -0.23840059, 0.17924534 + ]) + dataset_path = os.path.join( + base_dir, "__testing_word2vec-matrix-synopsis", "__testing_word2vec-matrix-synopsis.gz" + ) + model = api.load("__testing_word2vec-matrix-synopsis") + vector_dead_calc = 
model["dead"] + self.assertTrue(np.allclose(vector_dead, vector_dead_calc)) + shutil.rmtree(base_dir) + self.assertEqual(api.load("__testing_word2vec-matrix-synopsis", return_path=True), dataset_path) + shutil.rmtree(base_dir) + + def test_multipart_load(self): + dataset_path = os.path.join( + base_dir, '__testing_multipart-matrix-synopsis', '__testing_multipart-matrix-synopsis.gz' + ) + if os.path.isdir(base_dir): + shutil.rmtree(base_dir) + self.assertEqual(dataset_path, api.load("__testing_multipart-matrix-synopsis", return_path=True)) + shutil.rmtree(base_dir) + dataset = api.load("__testing_multipart-matrix-synopsis") + self.assertEqual(len(list(dataset)), 1) + + def test_info(self): + data = api.info("text8") + self.assertEqual(data["parts"], 1) + self.assertEqual(data["file_name"], 'text8.gz') + data = api.info() + self.assertEqual(sorted(data.keys()), sorted(['models', 'corpora'])) + self.assertTrue(len(data['models'])) + self.assertTrue(len(data['corpora'])) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) + unittest.main()