diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb index 8d51ffc0e3..998115a80e 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb @@ -224,7 +224,7 @@ }, "outputs": [], "source": [ - "from six import iteritems\n# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)" + "# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)" ] }, { diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py index 426ecf2407..5a77b4e637 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py @@ -179,7 +179,6 @@ def __iter__(self): # # Similarly, to construct the dictionary without loading all texts into memory: -from six import iteritems # collect statistics about all tokens dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt')) # remove stop words and words that appear only once @@ -188,7 +187,7 @@ def __iter__(self): for stopword in stoplist if stopword in dictionary.token2id ] -once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1] +once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1] dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once dictionary.compactify() # remove gaps in id sequence after words that were removed print(dictionary) diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 index 1a6c2797e8..9e8401aae5 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 @@ -1 +1 @@ -e017de81683bfd2f6005a3186bfc1eb3 \ No newline at end of file +c239d5c523ea2b3af1f6d4c6c51e7925 \ No newline at end of file diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst index a95de20d11..4b55ff959e 100644 --- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst +++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst @@ 
-159,10 +159,10 @@ between the questions and ids is called a dictionary: .. code-block:: none - 2020-09-30 12:28:00,819 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-09-30 12:28:00,820 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions) - 2020-09-30 12:28:00,821 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None - 2020-09-30 12:28:00,822 : INFO : saved /tmp/deerwester.dict + 2020-10-19 01:23:37,722 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-19 01:23:37,722 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions) + 2020-10-19 01:23:37,722 : INFO : saving Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) under /tmp/deerwester.dict, separately None + 2020-10-19 01:23:37,723 : INFO : saved /tmp/deerwester.dict Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) @@ -244,11 +244,11 @@ therefore reads: in the document `"Human computer interaction"`, the words `comp .. code-block:: none - 2020-09-30 12:28:01,181 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm - 2020-09-30 12:28:01,182 : INFO : saving sparse matrix to /tmp/deerwester.mm - 2020-09-30 12:28:01,182 : INFO : PROGRESS: saving document #0 - 2020-09-30 12:28:01,182 : INFO : saved 9x12 matrix, density=25.926% (28/108) - 2020-09-30 12:28:01,183 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index + 2020-10-19 01:23:38,012 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm + 2020-10-19 01:23:38,013 : INFO : saving sparse matrix to /tmp/deerwester.mm + 2020-10-19 01:23:38,013 : INFO : PROGRESS: saving document #0 + 2020-10-19 01:23:38,016 : INFO : saved 9x12 matrix, density=25.926% (28/108) + 2020-10-19 01:23:38,016 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index [[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]] @@ -334,7 +334,7 @@ then convert the tokens via a dictionary to their ids and yield the resulting sp .. code-block:: none - <__main__.MyCorpus object at 0x125b5a128> + <__main__.MyCorpus object at 0x117e06828> @@ -383,7 +383,6 @@ Similarly, to construct the dictionary without loading all texts into memory: .. code-block:: default - from six import iteritems # collect statistics about all tokens dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt')) # remove stop words and words that appear only once @@ -392,7 +391,7 @@ Similarly, to construct the dictionary without loading all texts into memory: for stopword in stoplist if stopword in dictionary.token2id ] - once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1] + once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1] dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once dictionary.compactify() # remove gaps in id sequence after words that were removed print(dictionary) @@ -407,8 +406,8 @@ Similarly, to construct the dictionary without loading all texts into memory: .. 
code-block:: none - 2020-09-30 12:28:02,652 : INFO : adding document #0 to Dictionary(0 unique tokens: []) - 2020-09-30 12:28:02,653 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions) + 2020-10-19 01:23:38,980 : INFO : adding document #0 to Dictionary(0 unique tokens: []) + 2020-10-19 01:23:38,981 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions) Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) @@ -455,11 +454,11 @@ create a toy corpus of 2 documents, as a plain Python list .. code-block:: none - 2020-09-30 12:28:02,781 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm - 2020-09-30 12:28:02,782 : INFO : saving sparse matrix to /tmp/corpus.mm - 2020-09-30 12:28:02,783 : INFO : PROGRESS: saving document #0 - 2020-09-30 12:28:02,783 : INFO : saved 2x2 matrix, density=25.000% (1/4) - 2020-09-30 12:28:02,783 : INFO : saving MmCorpus index to /tmp/corpus.mm.index + 2020-10-19 01:23:39,099 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm + 2020-10-19 01:23:39,100 : INFO : saving sparse matrix to /tmp/corpus.mm + 2020-10-19 01:23:39,100 : INFO : PROGRESS: saving document #0 + 2020-10-19 01:23:39,101 : INFO : saved 2x2 matrix, density=25.000% (1/4) + 2020-10-19 01:23:39,101 : INFO : saving MmCorpus index to /tmp/corpus.mm.index @@ -487,16 +486,16 @@ Other formats include `Joachim's SVMlight format .. code-block:: none - 2020-09-30 12:28:02,842 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight - 2020-09-30 12:28:02,844 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index - 2020-09-30 12:28:02,844 : INFO : no word id mapping provided; initializing from corpus - 2020-09-30 12:28:02,844 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2020-09-30 12:28:02,844 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2020-09-30 12:28:02,845 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index - 2020-09-30 12:28:02,904 : INFO : no word id mapping provided; initializing from corpus - 2020-09-30 12:28:02,905 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low - 2020-09-30 12:28:02,906 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value - 2020-09-30 12:28:02,906 : INFO : saving LowCorpus index to /tmp/corpus.low.index + 2020-10-19 01:23:39,152 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight + 2020-10-19 01:23:39,153 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index + 2020-10-19 01:23:39,154 : INFO : no word id mapping provided; initializing from corpus + 2020-10-19 01:23:39,154 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2020-10-19 01:23:39,154 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2020-10-19 01:23:39,154 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2020-10-19 01:23:39,206 : INFO : no word id mapping provided; initializing from corpus + 2020-10-19 01:23:39,207 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low + 2020-10-19 01:23:39,207 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value + 2020-10-19 01:23:39,207 : INFO : saving LowCorpus index to /tmp/corpus.low.index @@ -519,9 +518,9 @@ 
Conversely, to load a corpus iterator from a Matrix Market file: .. code-block:: none - 2020-09-30 12:28:02,968 : INFO : loaded corpus index from /tmp/corpus.mm.index - 2020-09-30 12:28:02,969 : INFO : initializing cython corpus reader from /tmp/corpus.mm - 2020-09-30 12:28:02,970 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries + 2020-10-19 01:23:39,260 : INFO : loaded corpus index from /tmp/corpus.mm.index + 2020-10-19 01:23:39,262 : INFO : initializing cython corpus reader from /tmp/corpus.mm + 2020-10-19 01:23:39,262 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries @@ -620,10 +619,10 @@ To save the same Matrix Market document stream in Blei's LDA-C format, .. code-block:: none - 2020-09-30 12:28:03,395 : INFO : no word id mapping provided; initializing from corpus - 2020-09-30 12:28:03,397 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c - 2020-09-30 12:28:03,397 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab - 2020-09-30 12:28:03,398 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index + 2020-10-19 01:23:39,634 : INFO : no word id mapping provided; initializing from corpus + 2020-10-19 01:23:39,636 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c + 2020-10-19 01:23:39,636 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab + 2020-10-19 01:23:39,636 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index @@ -711,9 +710,9 @@ Optimize converting between corpora and NumPy/SciPy arrays?), see the :ref:`apir .. rst-class:: sphx-glr-timing - **Total running time of the script:** ( 0 minutes 3.219 seconds) + **Total running time of the script:** ( 0 minutes 2.979 seconds) -**Estimated memory usage:** 10 MB +**Estimated memory usage:** 39 MB .. 
_sphx_glr_download_auto_examples_core_run_corpora_and_vector_spaces.py: diff --git a/docs/src/auto_examples/core/sg_execution_times.rst b/docs/src/auto_examples/core/sg_execution_times.rst index 419b52b786..d346e546cb 100644 --- a/docs/src/auto_examples/core/sg_execution_times.rst +++ b/docs/src/auto_examples/core/sg_execution_times.rst @@ -5,14 +5,14 @@ Computation times ================= -**00:06.698** total execution time for **auto_examples_core** files: +**00:02.979** total execution time for **auto_examples_core** files: +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:03.219 | 9.7 MB | +| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:02.979 | 38.7 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:01.675 | 36.8 MB | +| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.970 | 7.2 MB | +| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ -| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.834 | 6.5 MB | +| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.000 | 0.0 MB | +--------------------------------------------------------------------------------------------------------------+-----------+---------+ diff --git a/docs/src/gallery/core/run_corpora_and_vector_spaces.py b/docs/src/gallery/core/run_corpora_and_vector_spaces.py index 426ecf2407..5a77b4e637 100644 --- a/docs/src/gallery/core/run_corpora_and_vector_spaces.py +++ b/docs/src/gallery/core/run_corpora_and_vector_spaces.py @@ -179,7 +179,6 @@ def __iter__(self): # # Similarly, to construct the dictionary without loading all texts into memory: -from six import iteritems # collect statistics about all tokens dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt')) # remove stop words and words that appear only once @@ -188,7 +187,7 @@ def __iter__(self): for stopword in stoplist if stopword in dictionary.token2id ] -once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1] +once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1] dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once dictionary.compactify() # remove gaps in id sequence after words that were removed print(dictionary) diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py index 1afde870d2..15d79aeffd 100644 --- a/gensim/corpora/bleicorpus.py +++ b/gensim/corpora/bleicorpus.py @@ -14,7 +14,6 @@ from gensim import utils from 
gensim.corpora import IndexedCorpus -from six.moves import range logger = logging.getLogger(__name__) diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py index ba3795062d..e046134250 100644 --- a/gensim/corpora/dictionary.py +++ b/gensim/corpora/dictionary.py @@ -6,22 +6,13 @@ """This module implements the concept of a Dictionary -- a mapping between words and their integer ids.""" -from __future__ import with_statement - from collections import defaultdict from collections.abc import Mapping -import sys import logging import itertools from gensim import utils -from six import PY3, iteritems, iterkeys, itervalues, string_types -from six.moves import zip, range - -if sys.version_info[0] >= 3: - unicode = str - logger = logging.getLogger(__name__) @@ -116,15 +107,14 @@ def __iter__(self): """Iterate over all tokens.""" return iter(self.keys()) - if PY3: - # restore Py2-style dict API - iterkeys = __iter__ + # restore Py2-style dict API + iterkeys = __iter__ - def iteritems(self): - return self.items() + def iteritems(self): + return self.items() - def itervalues(self): - return self.values() + def itervalues(self): + return self.values() def keys(self): """Get all stored ids. @@ -149,7 +139,7 @@ def __len__(self): return len(self.token2id) def __str__(self): - some_keys = list(itertools.islice(iterkeys(self.token2id), 5)) + some_keys = list(itertools.islice(self.token2id.keys(), 5)) return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '') @staticmethod @@ -245,35 +235,35 @@ def doc2bow(self, document, allow_update=False, return_missing=False): ([(2, 1)], {u'this': 1, u'is': 1}) """ - if isinstance(document, string_types): + if isinstance(document, str): raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string") # Construct (word, frequency) mapping. counter = defaultdict(int) for w in document: - counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1 + counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1 token2id = self.token2id if allow_update or return_missing: - missing = sorted(x for x in iteritems(counter) if x[0] not in token2id) + missing = sorted(x for x in counter.items() if x[0] not in token2id) if allow_update: for w, _ in missing: # new id = number of ids made so far; # NOTE this assumes there are no gaps in the id sequence! 
token2id[w] = len(token2id) - result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id} + result = {token2id[w]: freq for w, freq in counter.items() if w in token2id} if allow_update: self.num_docs += 1 - self.num_pos += sum(itervalues(counter)) + self.num_pos += sum(counter.values()) self.num_nnz += len(result) # keep track of document and collection frequencies - for tokenid, freq in iteritems(result): + for tokenid, freq in result.items(): self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1 # return tokenids, in ascending id order - result = sorted(iteritems(result)) + result = sorted(result.items()) if return_missing: return result, dict(missing) else: @@ -307,10 +297,10 @@ def doc2idx(self, document, unknown_word_index=-1): [0, 0, 2, -1, 2] """ - if isinstance(document, string_types): + if isinstance(document, str): raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string") - document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document] + document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document] return [self.token2id.get(word, unknown_word_index) for word in document] def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None): @@ -361,13 +351,13 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N if keep_tokens: keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id} good_ids = [ - v for v in itervalues(self.token2id) + v for v in self.token2id.values() if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids ] good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True) else: good_ids = [ - v for v in itervalues(self.token2id) + v for v in self.token2id.values() if no_below <= self.dfs.get(v, 0) <= no_above_abs ] good_ids.sort(key=self.dfs.get, reverse=True) @@ -408,7 +398,7 @@ def filter_n_most_frequent(self, remove_n): """ # determine which tokens to keep - most_frequent_ids = (v for v in itervalues(self.token2id)) + most_frequent_ids = (v for v in self.token2id.values()) most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True) most_frequent_ids = most_frequent_ids[:remove_n] # do the actual filtering, then rebuild dictionary to remove gaps in ids @@ -452,14 +442,14 @@ def filter_tokens(self, bad_ids=None, good_ids=None): """ if bad_ids is not None: bad_ids = set(bad_ids) - self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids} - self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids} - self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids} + self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids} + self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids} + self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids} if good_ids is not None: good_ids = set(good_ids) - self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids} - self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids} - self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids} + self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in 
good_ids} + self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids} + self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids} self.compactify() def compactify(self): @@ -467,13 +457,13 @@ def compactify(self): logger.debug("rebuilding dictionary, shrinking gaps") # build mapping from old id -> new id - idmap = dict(zip(sorted(itervalues(self.token2id)), range(len(self.token2id)))) + idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id)))) # reassign mappings to new ids - self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)} + self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()} self.id2token = {} - self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)} - self.cfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.cfs)} + self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()} + self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()} def save_as_text(self, fname, sort_by_word=True): """Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file. @@ -527,11 +517,11 @@ def save_as_text(self, fname, sort_by_word=True): numdocs_line = "%d\n" % self.num_docs fout.write(utils.to_utf8(numdocs_line)) if sort_by_word: - for token, tokenid in sorted(iteritems(self.token2id)): + for token, tokenid in sorted(self.token2id.items()): line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0)) fout.write(utils.to_utf8(line)) else: - for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]): + for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]): line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq) fout.write(utils.to_utf8(line)) @@ -573,7 +563,7 @@ def merge_with(self, other): """ old2new = {} - for other_id, other_token in iteritems(other): + for other_id, other_token in other.items(): if other_token in self.token2id: new_id = self.token2id[other_token] else: @@ -748,11 +738,11 @@ def from_corpus(corpus, id2word=None): if id2word is None: # make sure length(result) == get_max_id(corpus) + 1 - result.token2id = {unicode(i): i for i in range(max_id + 1)} + result.token2id = {str(i): i for i in range(max_id + 1)} else: # id=>word mapping given: simply copy it - result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)} - for idx in itervalues(result.token2id): + result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()} + for idx in result.token2id.values(): # make sure all token ids have a valid `dfs` entry result.dfs[idx] = result.dfs.get(idx, 0) diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py index cb3f4053ea..8eb6d87dd1 100644 --- a/gensim/corpora/hashdictionary.py +++ b/gensim/corpora/hashdictionary.py @@ -27,14 +27,11 @@ """ -from __future__ import with_statement - import logging import itertools import zlib from gensim import utils -from six import iteritems, iterkeys logger = logging.getLogger(__name__) @@ -252,11 +249,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False): if self.debug: # increment document count for each unique tokenid that appeared in the document # done here, because several words may map to the same tokenid - for tokenid in iterkeys(result): + for tokenid in result.keys(): self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1 # return tokenids, in ascending id order - result = sorted(iteritems(result)) + result = 
sorted(result.items()) if return_missing: return result, missing else: @@ -293,16 +290,16 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000): """ no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold - ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs] + ok = [item for item in self.dfs_debug.items() if no_below <= item[1] <= no_above_abs] ok = frozenset(word for word, freq in sorted(ok, key=lambda x: -x[1])[:keep_n]) - self.dfs_debug = {word: freq for word, freq in iteritems(self.dfs_debug) if word in ok} - self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug} + self.dfs_debug = {word: freq for word, freq in self.dfs_debug.items() if word in ok} + self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if token in self.dfs_debug} self.id2token = { tokenid: {token for token in tokens if token in self.dfs_debug} - for tokenid, tokens in iteritems(self.id2token) + for tokenid, tokens in self.id2token.items() } - self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)} + self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if self.id2token.get(tokenid, False)} # for word->document frequency logger.info( diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py index 1ab8331a0a..8624c54fbf 100644 --- a/gensim/corpora/indexedcorpus.py +++ b/gensim/corpora/indexedcorpus.py @@ -8,7 +8,6 @@ """Base Indexed Corpus class.""" import logging -import six import numpy @@ -182,7 +181,7 @@ def __getitem__(self, docno): raise RuntimeError("Cannot call corpus[docid] without an index") if isinstance(docno, (slice, list, numpy.ndarray)): return utils.SlicedCorpus(self, docno) - elif isinstance(docno, six.integer_types + (numpy.integer,)): + elif isinstance(docno, (int, numpy.integer,)): return self.docbyoffset(self.index[docno]) # TODO: no `docbyoffset` method, should be defined in this class else: diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py index d52f190187..80dacf8ec0 100644 --- a/gensim/corpora/lowcorpus.py +++ b/gensim/corpora/lowcorpus.py @@ -4,17 +4,13 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - """Corpus in `GibbsLda++ format `_.""" -from __future__ import with_statement - import logging from collections import Counter from gensim import utils from gensim.corpora import IndexedCorpus -from six.moves import zip, range logger = logging.getLogger(__name__) diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py index c9ebf1841b..b8858f93c3 100644 --- a/gensim/corpora/sharded_corpus.py +++ b/gensim/corpora/sharded_corpus.py @@ -26,7 +26,6 @@ import numpy import scipy.sparse as sparse -from six.moves import range import gensim from gensim.corpora import IndexedCorpus @@ -290,7 +289,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp for i, doc in enumerate(doc_chunk): doc = dict(doc) - current_shard[i][list(doc)] = list(gensim.matutils.itervalues(doc)) + current_shard[i][list(doc)] = list(doc.values()) # Handles the updating as well. 
if self.sparse_serialization: diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py index 6f5f2f85f3..fd81f3ef2a 100644 --- a/gensim/corpora/ucicorpus.py +++ b/gensim/corpora/ucicorpus.py @@ -7,7 +7,6 @@ """Corpus in `UCI format `_.""" -from __future__ import with_statement import logging from collections import defaultdict @@ -17,7 +16,6 @@ from gensim.corpora import IndexedCorpus from gensim.matutils import MmReader from gensim.matutils import MmWriter -from six.moves import range logger = logging.getLogger(__name__) diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py index 78affc1028..8c3c94b5ff 100644 --- a/gensim/corpora/wikicorpus.py +++ b/gensim/corpora/wikicorpus.py @@ -34,8 +34,6 @@ from gensim.corpora.dictionary import Dictionary from gensim.corpora.textcorpus import TextCorpus -from six import raise_from - logger = logging.getLogger(__name__) @@ -704,14 +702,16 @@ def get_texts(self): yield tokens except KeyboardInterrupt: - logger.warn( + logger.warning( "user terminated iteration over Wikipedia corpus after %i documents with %i positions " "(total %i articles, %i positions before pruning articles shorter than %i words)", articles, positions, articles_all, positions_all, self.article_min_tokens ) except PicklingError as exc: - raise_from(PicklingError('Can not send filtering function {} to multiprocessing, ' - 'make sure the function can be pickled.'.format(self.filter_articles)), exc) + raise PicklingError( + f'Can not send filtering function {self.filter_articles} to multiprocessing, ' + 'make sure the function can be pickled.' + ) from exc else: logger.info( "finished iterating over Wikipedia corpus of %i documents with %i positions " diff --git a/gensim/interfaces.py b/gensim/interfaces.py index 3fd266eb62..3358adaab5 100644 --- a/gensim/interfaces.py +++ b/gensim/interfaces.py @@ -14,12 +14,9 @@ """ -from __future__ import with_statement - import logging from gensim import utils, matutils -from six.moves import range logger = logging.getLogger(__name__) diff --git a/gensim/matutils.py b/gensim/matutils.py index c9d0e19f59..dbdd3f1439 100644 --- a/gensim/matutils.py +++ b/gensim/matutils.py @@ -22,9 +22,6 @@ from scipy.linalg.special_matrices import triu from scipy.special import psi # gamma function utils -from six import iteritems, itervalues -from six.moves import zip, range - logger = logging.getLogger(__name__) @@ -398,7 +395,7 @@ def sparse2full(doc, length): doc = dict(doc) # overwrite some of the zeroes with explicit values - result[list(doc)] = list(itervalues(doc)) + result[list(doc)] = list(doc.values()) return result @@ -807,12 +804,12 @@ def cossim(vec1, vec2): vec1, vec2 = dict(vec1), dict(vec2) if not vec1 or not vec2: return 0.0 - vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1))) - vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2))) + vec1len = 1.0 * math.sqrt(sum(val * val for val in vec1.values())) + vec2len = 1.0 * math.sqrt(sum(val * val for val in vec2.values())) assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries" if len(vec2) < len(vec1): vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector - result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1)) + result = sum(value * vec2.get(index, 0.0) for index, value in vec1.items()) result /= vec1len * vec2len # rescale by vector lengths return result @@ -982,7 +979,7 @@ def jaccard(vec1, vec2): union = sum(weight for id_, weight in 
vec1) + sum(weight for id_, weight in vec2) vec1, vec2 = dict(vec1), dict(vec2) intersection = 0.0 - for feature_id, feature_weight in iteritems(vec1): + for feature_id, feature_weight in vec1.items(): intersection += min(feature_weight, vec2.get(feature_id, 0.0)) return 1 - float(intersection) / float(union) else: diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py index 77549b1351..5fbce5d926 100644 --- a/gensim/models/_fasttext_bin.py +++ b/gensim/models/_fasttext_bin.py @@ -35,7 +35,6 @@ """ -import codecs import collections import gzip import io @@ -43,7 +42,6 @@ import struct import numpy as np -import six _END_OF_WORD_MARKER = b'\x00' @@ -674,7 +672,3 @@ def save(model, fout, fb_fasttext_parameters, encoding): _save_to_stream(model, fout_stream, fb_fasttext_parameters, encoding) else: _save_to_stream(model, fout, fb_fasttext_parameters, encoding) - - -if six.PY2: - codecs.register_error('backslashreplace', _backslashreplace_backport) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index afe529a395..fdc3f3c0dd 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -51,27 +51,27 @@ >>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()] """ + # TODO: this class inherits LdaModel and overwrites some methods. There is some code # duplication still, and a refactor could be made to avoid this. Comments with "TODOs" # are included in the code where this is the case, for example in the log_perplexity # and do_estep methods. import logging -import numpy as np # for arrays, array broadcasting etc. +from itertools import chain from copy import deepcopy from shutil import copyfile from os.path import isfile from os import remove +import numpy as np # for arrays, array broadcasting etc. +from scipy.special import gammaln # gamma function utils + from gensim import utils from gensim.models import LdaModel from gensim.models.ldamodel import LdaState from gensim.matutils import dirichlet_expectation, mean_absolute_difference from gensim.corpora import MmCorpus -from itertools import chain -from scipy.special import gammaln # gamma function utils -from six.moves import range -import six logger = logging.getLogger(__name__) @@ -283,7 +283,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) - if isinstance(eta, six.string_types): + if isinstance(eta, str): if eta == 'asymmetric': raise ValueError("The 'asymmetric' option cannot be used for eta") @@ -458,7 +458,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c doc_no = d # Get the IDs and counts of all the words in the current document. # TODO: this is duplication of code in LdaModel. Refactor. 
- if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)): + if doc and not isinstance(doc[0][0], (int, np.integer,)): # make sure the term IDs are ints, otherwise np will get upset ids = [int(idx) for idx, _ in doc] else: diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index d8d7185487..c29e0a0737 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -49,7 +49,6 @@ >>> hdp.update([[(1, 2)], [(1, 1), (4, 5)]]) """ -from __future__ import with_statement import logging import time @@ -57,13 +56,13 @@ import numpy as np from scipy.special import gammaln, psi # gamma function utils -from six.moves import zip, range from gensim import interfaces, utils, matutils from gensim.matutils import dirichlet_expectation, mean_absolute_difference from gensim.models import basemodel, ldamodel from gensim.utils import deprecated + logger = logging.getLogger(__name__) meanchangethresh = 0.00001 diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 8ca05ba36d..f81cd806eb 100755 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -56,20 +56,16 @@ """ -from __future__ import with_statement import argparse import os import sys import logging import threading import time -from six import iteritems, itervalues +from queue import Queue -try: - from Queue import Queue -except ImportError: - from queue import Queue import Pyro4 + from gensim import utils from gensim.models.lda_worker import LDA_WORKER_PREFIX @@ -143,7 +139,7 @@ def initialize(self, **model_params): self.workers = {} with utils.getNS(**self.ns_conf) as ns: self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) - for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)): + for name, uri in ns.list(prefix=LDA_WORKER_PREFIX).items(): try: worker = Pyro4.Proxy(uri) workerid = len(self.workers) @@ -168,7 +164,7 @@ def getworkers(self): The pyro URIs for each worker. """ - return [worker._pyroUri for worker in itervalues(self.workers)] + return [worker._pyroUri for worker in self.workers.values()] @Pyro4.expose def getjob(self, worker_id): @@ -223,7 +219,7 @@ def getstate(self): i += 1 if i > count: i = 0 - for workerid, worker in iteritems(self.workers): + for workerid, worker in self.workers.items(): logger.info("checking aliveness for worker %s", workerid) worker.ping() @@ -246,7 +242,7 @@ def reset(self, state): State of :class:`~gensim.models.lda.LdaModel`. 
""" - for workerid, worker in iteritems(self.workers): + for workerid, worker in self.workers.items(): logger.info("resetting worker %s", workerid) worker.reset(state) worker.requestjob() @@ -289,7 +285,7 @@ def jobsdone(self): @Pyro4.oneway def exit(self): """Terminate all registered workers and then the dispatcher.""" - for workerid, worker in iteritems(self.workers): + for workerid, worker in self.workers.items(): logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index c4a26e6967..0863bcc578 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -90,10 +90,8 @@ import os import numpy as np -import six from scipy.special import gammaln, psi # gamma function utils from scipy.special import polygamma -from six.moves import range from collections import defaultdict from gensim import interfaces, utils, matutils @@ -464,7 +462,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, assert self.alpha.shape == (self.num_topics,), \ "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics) - if isinstance(eta, six.string_types): + if isinstance(eta, str): if eta == 'asymmetric': raise ValueError("The 'asymmetric' option cannot be used for eta") @@ -557,7 +555,7 @@ def init_dir_prior(self, prior, name): is_auto = False - if isinstance(prior, six.string_types): + if isinstance(prior, str): if prior == 'symmetric': logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) init_prior = np.fromiter( @@ -674,7 +672,7 @@ def inference(self, chunk, collect_sstats=False): # Inference code copied from Hoffman's `onlineldavb.py` (esp. the # Lee&Seung trick which speeds things up by an order of magnitude, compared # to Blei's original LDA-C code, cool!). - integer_types = six.integer_types + (np.integer,) + integer_types = (int, np.integer,) epsilon = np.finfo(self.dtype).eps for d, doc in enumerate(chunk): if len(doc) > 0 and not isinstance(doc[0][0], integer_types): @@ -1585,7 +1583,7 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, ** # make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if # someone sets the ignore list themselves if ignore is not None and ignore: - if isinstance(ignore, six.string_types): + if isinstance(ignore, str): ignore = [ignore] ignore = [e for e in ignore if e] # make sure None and '' are not in the list ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore)) @@ -1597,15 +1595,15 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, ** separately_explicit = ['expElogbeta', 'sstats'] # Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some # array manually. - if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or \ + if (isinstance(self.alpha, str) and self.alpha == 'auto') or \ (isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1): separately_explicit.append('alpha') - if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or \ + if (isinstance(self.eta, str) and self.eta == 'auto') or \ (isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1): separately_explicit.append('eta') # Merge separately_explicit with separately. 
if separately: - if isinstance(separately, six.string_types): + if isinstance(separately, str): separately = [separately] separately = [e for e in separately if e] # make sure None and '' are not in the list separately = list(set(separately_explicit) | set(separately)) diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py index a4a0c297de..6ef8d5ea82 100644 --- a/gensim/models/ldamulticore.py +++ b/gensim/models/ldamulticore.py @@ -84,15 +84,14 @@ """ import logging +import queue +from multiprocessing import Pool, Queue, cpu_count import numpy as np from gensim import utils from gensim.models.ldamodel import LdaModel, LdaState -import six -from six.moves import queue, range -from multiprocessing import Pool, Queue, cpu_count logger = logging.getLogger(__name__) @@ -173,7 +172,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None, self.workers = max(1, cpu_count() - 1) if workers is None else workers self.batch = batch - if isinstance(alpha, six.string_types) and alpha == 'auto': + if isinstance(alpha, str) and alpha == 'auto': raise NotImplementedError("auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.") super(LdaMulticore, self).__init__( diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 527530cf51..100aed748f 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -52,22 +52,26 @@ """ -from gensim import utils, matutils -from gensim.models import ldamodel +import logging + import numpy as np from scipy.special import digamma, gammaln from scipy import optimize -import logging -from six.moves import range, zip + +from gensim import utils, matutils +from gensim.models import ldamodel + logger = logging.getLogger(__name__) class LdaSeqModel(utils.SaveLoad): """Estimate Dynamic Topic Model parameters based on a training corpus.""" - def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, - random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100): + def __init__( + self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, + random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100, + ): """ Parameters diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py index cb7fd4c053..b593e94cd3 100755 --- a/gensim/models/lsi_dispatcher.py +++ b/gensim/models/lsi_dispatcher.py @@ -1,290 +1,287 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -"""Dispatcher process which orchestrates distributed :class:`~gensim.models.lsimodel.LsiModel` computations. -Run this script only once, on any node in your cluster. - -Notes ------ -The dispatcher expects to find worker scripts already running. Make sure you run as many workers as you like on -your machines **before** launching the dispatcher. - - -How to use distributed LSI --------------------------- - -#. Install needed dependencies (Pyro4) :: - - pip install gensim[distributed] - -#. Setup serialization (on each machine) :: - - export PYRO_SERIALIZERS_ACCEPTED=pickle - export PYRO_SERIALIZER=pickle - -#. Run nameserver :: - - python -m Pyro4.naming -n 0.0.0.0 & - -#. 
Run workers (on each machine) :: - - python -m gensim.models.lsi_worker & - -#. Run dispatcher :: - - python -m gensim.models.lsi_dispatcher & - -#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode: - - .. sourcecode:: pycon - - >>> from gensim.test.utils import common_corpus, common_dictionary - >>> from gensim.models import LsiModel - >>> - >>> model = LsiModel(common_corpus, id2word=common_dictionary, distributed=True) - -Command line arguments ----------------------- - -.. program-output:: python -m gensim.models.lsi_dispatcher --help - :ellipsis: 0, -5 - -""" - -from __future__ import with_statement -import os -import sys -import logging -import argparse -import threading -import time -from six import iteritems, itervalues - -try: - from Queue import Queue -except ImportError: - from queue import Queue -import Pyro4 -from gensim import utils - -logger = logging.getLogger(__name__) - -# How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? -# A small number is usually enough, unless iteration over the corpus is very very -# slow (slower than the actual computation of LSI), in which case you can override -# this value from command line. ie. run "python ./lsi_dispatcher.py 100" -MAX_JOBS_QUEUE = 10 - -# timeout for the Queue object put/get blocking methods. -# it should really be infinity, but then keyboard interrupts don't work. -# so this is really just a hack, see http://bugs.python.org/issue1360 -HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year - - -class Dispatcher(object): - """Dispatcher object that communicates and coordinates individual workers. - - Warnings - -------- - There should never be more than one dispatcher running at any one time. - - """ - def __init__(self, maxsize=0): - """Partly initialize the dispatcher. - - A full initialization (including initialization of the workers) requires a call to - :meth:`~gensim.models.lsi_dispatcher.Dispatcher.initialize` - - Parameters - ---------- - maxsize : int, optional - Maximum number of jobs to be kept pre-fetched in the queue. - - """ - self.maxsize = maxsize - self.workers = {} - self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) - - @Pyro4.expose - def initialize(self, **model_params): - """Fully initialize the dispatcher and all its workers. - - Parameters - ---------- - **model_params - Keyword parameters used to initialize individual workers - (gets handed all the way down to :meth:`gensim.models.lsi_worker.Worker.initialize`). - See :class:`~gensim.models.lsimodel.LsiModel`. - - Raises - ------ - RuntimeError - When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand). 
- - """ - self.jobs = Queue(maxsize=self.maxsize) - self.lock_update = threading.Lock() - self._jobsdone = 0 - self._jobsreceived = 0 - - # locate all available workers and store their proxies, for subsequent RMI calls - self.workers = {} - with utils.getNS() as ns: - self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self - for name, uri in iteritems(ns.list(prefix='gensim.lsi_worker')): - try: - worker = Pyro4.Proxy(uri) - workerid = len(self.workers) - # make time consuming methods work asynchronously - logger.info("registering worker #%i from %s", workerid, uri) - worker.initialize(workerid, dispatcher=self.callback, **model_params) - self.workers[workerid] = worker - except Pyro4.errors.PyroError: - logger.exception("unresponsive worker at %s, deleting it from the name server", uri) - ns.remove(name) - - if not self.workers: - raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!') - - @Pyro4.expose - def getworkers(self): - """Get pyro URIs of all registered workers. - - Returns - ------- - list of URIs - The pyro URIs for each worker. - - """ - return [worker._pyroUri for worker in itervalues(self.workers)] - - @Pyro4.expose - def getjob(self, worker_id): - """Atomically pop a job from the queue. - - Parameters - ---------- - worker_id : int - The worker that requested the job. - - Returns - ------- - iterable of iterable of (int, float) - The corpus in BoW format. - - """ - logger.info("worker #%i requesting a new job", worker_id) - job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) - return job - - @Pyro4.expose - def putjob(self, job): - """Atomically add a job to the queue. - - Parameters - ---------- - job : iterable of list of (int, float) - The corpus in BoW format. - - """ - self._jobsreceived += 1 - self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) - - @Pyro4.expose - def getstate(self): - """Merge projections from across all workers and get the final projection. - - Returns - ------- - :class:`~gensim.models.lsimodel.Projection` - The current projection of the total model. - - """ - logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) - while self._jobsdone < self._jobsreceived: - time.sleep(0.5) # check every half a second - - # TODO: merge in parallel, so that we're done in `log_2(workers)` merges, - # and not `workers - 1` merges! - # but merging only takes place once, after all input data has been processed, - # so the overall effect would be small... compared to the amount of coding :-) - logger.info("merging states from %i workers", len(self.workers)) - workers = list(self.workers.items()) - result = workers[0][1].getstate() - for workerid, worker in workers[1:]: - logger.info("pulling state from worker %s", workerid) - result.merge(worker.getstate()) - logger.info("sending out merged projection") - return result - - @Pyro4.expose - def reset(self): - """Re-initialize all workers for a new decomposition.""" - for workerid, worker in iteritems(self.workers): - logger.info("resetting worker %s", workerid) - worker.reset() - worker.requestjob() - self._jobsdone = 0 - self._jobsreceived = 0 - - @Pyro4.expose - @Pyro4.oneway - @utils.synchronous('lock_update') - def jobdone(self, workerid): - """A worker has finished its job. 
Log this event and then asynchronously transfer control back to the worker. - - Callback used by workers to notify when their job is done. - - The job done event is logged and then control is asynchronously transfered back to the worker - (who can then request another job). In this way, control flow basically oscillates between - :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lsi_worker.Worker.requestjob`. - - Parameters - ---------- - workerid : int - The ID of the worker that finished the job (used for logging). - - """ - self._jobsdone += 1 - logger.info("worker #%s finished job #%i", workerid, self._jobsdone) - worker = self.workers[workerid] - worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) - - def jobsdone(self): - """Wrap :attr:`~gensim.models.lsi_dispatcher.Dispatcher._jobsdone`, needed for remote access through proxies. - - Returns - ------- - int - Number of jobs already completed. - - """ - return self._jobsdone - - @Pyro4.oneway - def exit(self): - """Terminate all registered workers and then the dispatcher.""" - for workerid, worker in iteritems(self.workers): - logger.info("terminating worker %s", workerid) - worker.exit() - logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO) - parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawTextHelpFormatter) - parser.add_argument( - 'maxsize', type=int, help='Maximum number of jobs to be kept pre-fetched in the queue.', default=MAX_JOBS_QUEUE - ) - args = parser.parse_args() - - logger.info("running %s", " ".join(sys.argv)) - utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=args.maxsize)) - logger.info("finished running %s", parser.prog) +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (C) 2010 Radim Rehurek +# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html + +"""Dispatcher process which orchestrates distributed :class:`~gensim.models.lsimodel.LsiModel` computations. +Run this script only once, on any node in your cluster. + +Notes +----- +The dispatcher expects to find worker scripts already running. Make sure you run as many workers as you like on +your machines **before** launching the dispatcher. + + +How to use distributed LSI +-------------------------- + +#. Install needed dependencies (Pyro4) :: + + pip install gensim[distributed] + +#. Setup serialization (on each machine) :: + + export PYRO_SERIALIZERS_ACCEPTED=pickle + export PYRO_SERIALIZER=pickle + +#. Run nameserver :: + + python -m Pyro4.naming -n 0.0.0.0 & + +#. Run workers (on each machine) :: + + python -m gensim.models.lsi_worker & + +#. Run dispatcher :: + + python -m gensim.models.lsi_dispatcher & + +#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode: + + .. sourcecode:: pycon + + >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.models import LsiModel + >>> + >>> model = LsiModel(common_corpus, id2word=common_dictionary, distributed=True) + +Command line arguments +---------------------- + +.. 
program-output:: python -m gensim.models.lsi_dispatcher --help + :ellipsis: 0, -5 + +""" + +import os +import sys +import logging +import argparse +import threading +import time +from queue import Queue + +import Pyro4 + +from gensim import utils + + +logger = logging.getLogger(__name__) + +# How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? +# A small number is usually enough, unless iteration over the corpus is very very +# slow (slower than the actual computation of LSI), in which case you can override +# this value from command line. ie. run "python ./lsi_dispatcher.py 100" +MAX_JOBS_QUEUE = 10 + +# timeout for the Queue object put/get blocking methods. +# it should really be infinity, but then keyboard interrupts don't work. +# so this is really just a hack, see http://bugs.python.org/issue1360 +HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year + + +class Dispatcher: + """Dispatcher object that communicates and coordinates individual workers. + + Warnings + -------- + There should never be more than one dispatcher running at any one time. + + """ + def __init__(self, maxsize=0): + """Partly initialize the dispatcher. + + A full initialization (including initialization of the workers) requires a call to + :meth:`~gensim.models.lsi_dispatcher.Dispatcher.initialize` + + Parameters + ---------- + maxsize : int, optional + Maximum number of jobs to be kept pre-fetched in the queue. + + """ + self.maxsize = maxsize + self.workers = {} + self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) + + @Pyro4.expose + def initialize(self, **model_params): + """Fully initialize the dispatcher and all its workers. + + Parameters + ---------- + **model_params + Keyword parameters used to initialize individual workers + (gets handed all the way down to :meth:`gensim.models.lsi_worker.Worker.initialize`). + See :class:`~gensim.models.lsimodel.LsiModel`. + + Raises + ------ + RuntimeError + When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand). + + """ + self.jobs = Queue(maxsize=self.maxsize) + self.lock_update = threading.Lock() + self._jobsdone = 0 + self._jobsreceived = 0 + + # locate all available workers and store their proxies, for subsequent RMI calls + self.workers = {} + with utils.getNS() as ns: + self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self + for name, uri in ns.list(prefix='gensim.lsi_worker').items(): + try: + worker = Pyro4.Proxy(uri) + workerid = len(self.workers) + # make time consuming methods work asynchronously + logger.info("registering worker #%i from %s", workerid, uri) + worker.initialize(workerid, dispatcher=self.callback, **model_params) + self.workers[workerid] = worker + except Pyro4.errors.PyroError: + logger.exception("unresponsive worker at %s, deleting it from the name server", uri) + ns.remove(name) + + if not self.workers: + raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!') + + @Pyro4.expose + def getworkers(self): + """Get pyro URIs of all registered workers. + + Returns + ------- + list of URIs + The pyro URIs for each worker. + + """ + return [worker._pyroUri for worker in self.workers.values()] + + @Pyro4.expose + def getjob(self, worker_id): + """Atomically pop a job from the queue. + + Parameters + ---------- + worker_id : int + The worker that requested the job. + + Returns + ------- + iterable of iterable of (int, float) + The corpus in BoW format. 
+ + """ + logger.info("worker #%i requesting a new job", worker_id) + job = self.jobs.get(block=True, timeout=1) + logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) + return job + + @Pyro4.expose + def putjob(self, job): + """Atomically add a job to the queue. + + Parameters + ---------- + job : iterable of list of (int, float) + The corpus in BoW format. + + """ + self._jobsreceived += 1 + self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) + logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) + + @Pyro4.expose + def getstate(self): + """Merge projections from across all workers and get the final projection. + + Returns + ------- + :class:`~gensim.models.lsimodel.Projection` + The current projection of the total model. + + """ + logger.info("end of input, assigning all remaining jobs") + logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) + while self._jobsdone < self._jobsreceived: + time.sleep(0.5) # check every half a second + + # TODO: merge in parallel, so that we're done in `log_2(workers)` merges, + # and not `workers - 1` merges! + # but merging only takes place once, after all input data has been processed, + # so the overall effect would be small... compared to the amount of coding :-) + logger.info("merging states from %i workers", len(self.workers)) + workers = list(self.workers.items()) + result = workers[0][1].getstate() + for workerid, worker in workers[1:]: + logger.info("pulling state from worker %s", workerid) + result.merge(worker.getstate()) + logger.info("sending out merged projection") + return result + + @Pyro4.expose + def reset(self): + """Re-initialize all workers for a new decomposition.""" + for workerid, worker in self.workers.items(): + logger.info("resetting worker %s", workerid) + worker.reset() + worker.requestjob() + self._jobsdone = 0 + self._jobsreceived = 0 + + @Pyro4.expose + @Pyro4.oneway + @utils.synchronous('lock_update') + def jobdone(self, workerid): + """A worker has finished its job. Log this event and then asynchronously transfer control back to the worker. + + Callback used by workers to notify when their job is done. + + The job done event is logged and then control is asynchronously transfered back to the worker + (who can then request another job). In this way, control flow basically oscillates between + :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lsi_worker.Worker.requestjob`. + + Parameters + ---------- + workerid : int + The ID of the worker that finished the job (used for logging). + + """ + self._jobsdone += 1 + logger.info("worker #%s finished job #%i", workerid, self._jobsdone) + worker = self.workers[workerid] + worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way) + + def jobsdone(self): + """Wrap :attr:`~gensim.models.lsi_dispatcher.Dispatcher._jobsdone`, needed for remote access through proxies. + + Returns + ------- + int + Number of jobs already completed. 
+ + """ + return self._jobsdone + + @Pyro4.oneway + def exit(self): + """Terminate all registered workers and then the dispatcher.""" + for workerid, worker in self.workers.items(): + logger.info("terminating worker %s", workerid) + worker.exit() + logger.info("terminating dispatcher") + os._exit(0) # exit the whole process (not just this thread ala sys.exit()) + + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO) + parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument( + 'maxsize', type=int, help='Maximum number of jobs to be kept pre-fetched in the queue.', default=MAX_JOBS_QUEUE + ) + args = parser.parse_args() + + logger.info("running %s", " ".join(sys.argv)) + utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=args.maxsize)) + logger.info("finished running %s", parser.prog) diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py index 8c160e02ca..d40690fd70 100644 --- a/gensim/models/lsimodel.py +++ b/gensim/models/lsimodel.py @@ -66,8 +66,6 @@ import scipy.linalg import scipy.sparse from scipy.sparse import sparsetools -from six import iterkeys -from six.moves import range from gensim import interfaces, matutils, utils from gensim.models import basemodel @@ -833,7 +831,7 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None): result.setdefault(topic, []).append((udiff[topic], uvecno)) logger.debug("printing %i+%i salient words", num_words, num_neg) - for topic in sorted(iterkeys(result)): + for topic in sorted(result.keys()): weights = sorted(result[topic], key=lambda x: -abs(x[0])) _, most = weights[0] if u[most, topic] < 0.0: # the most significant word has a negative sign => flip sign of u[most] diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py index 1133c52061..050aa52e9b 100644 --- a/gensim/models/poincare.py +++ b/gensim/models/poincare.py @@ -47,17 +47,11 @@ from numbers import Integral import sys import time +from collections import defaultdict, Counter import numpy as np -from collections import defaultdict, Counter from numpy import random as np_random, float32 as REAL from scipy.stats import spearmanr -from six import string_types -from six.moves import zip, range - -from gensim import utils, matutils -from gensim.models.keyedvectors import KeyedVectors - try: from autograd import grad # Only required for optionally verifying gradients while training from autograd import numpy as grad_np @@ -65,6 +59,10 @@ except ImportError: AUTOGRAD_PRESENT = False +from gensim import utils, matutils +from gensim.models.keyedvectors import KeyedVectors + + logger = logging.getLogger(__name__) @@ -1156,7 +1154,7 @@ def most_similar(self, node_or_vector, topn=10, restrict_vocab=None): nodes_to_use = self.index_to_key[:restrict_vocab] all_distances = self.distances(node_or_vector, nodes_to_use) - if isinstance(node_or_vector, string_types + (int,)): + if isinstance(node_or_vector, (str, int,)): node_index = self.get_index(node_or_vector) else: node_index = None @@ -1214,7 +1212,7 @@ def distances(self, node_or_vector, other_nodes=()): If either `node_or_vector` or any node in `other_nodes` is absent from vocab. 
""" - if isinstance(node_or_vector, string_types): + if isinstance(node_or_vector, str): input_vector = self.get_vector(node_or_vector) else: input_vector = node_or_vector @@ -1259,7 +1257,7 @@ def norm(self, node_or_vector): The position in hierarchy is based on the norm of the vector for the node. """ - if isinstance(node_or_vector, string_types): + if isinstance(node_or_vector, str): input_vector = self.get_vector(node_or_vector) else: input_vector = node_or_vector diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py index a363e73298..d0e3d653ef 100644 --- a/gensim/models/tfidfmodel.py +++ b/gensim/models/tfidfmodel.py @@ -17,11 +17,11 @@ from functools import partial import re +import numpy as np + from gensim import interfaces, matutils, utils from gensim.utils import deprecated -from six import iteritems, iterkeys -import numpy as np logger = logging.getLogger(__name__) @@ -155,7 +155,7 @@ def precompute_idfs(wglobal, dfs, total_docs): """ # not strictly necessary and could be computed on the fly in TfidfModel__getitem__. # this method is here just to speed things up a little. - return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)} + return {termid: wglobal(df, total_docs) for termid, df in dfs.items()} def smartirs_wlocal(tf, local_scheme): @@ -389,7 +389,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz self.cfs = dictionary.cfs.copy() self.dfs = dictionary.dfs.copy() - self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)} + self.term_lens = {termid: len(term) for termid, term in dictionary.items()} self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs) if not id2word: self.id2word = dictionary @@ -415,7 +415,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden self.pivot = 1.0 * self.num_nnz / self.num_docs elif n_n == "b": self.pivot = 1.0 * sum( - self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary) + self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in dictionary.keys() ) / self.num_docs @classmethod diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py index c579f44beb..528103a874 100644 --- a/gensim/parsing/porter.py +++ b/gensim/parsing/porter.py @@ -30,9 +30,6 @@ """ -from six.moves import range - - class PorterStemmer(object): """Class contains implementation of Porter stemming algorithm. diff --git a/gensim/scripts/benchmark.py b/gensim/scripts/benchmark.py index c920b2b32f..d963129a9c 100644 --- a/gensim/scripts/benchmark.py +++ b/gensim/scripts/benchmark.py @@ -6,7 +6,7 @@ """ Help script (template) for benchmarking. 
Run with: - /usr/bin/time --format "%E elapsed\n%Mk max mem" python -m gensim.scripts.benchmark ~/gensim-data/text9/text9.txt + /usr/bin/time --format "%E elapsed\n%Mk peak RAM" python -m gensim.scripts.benchmark ~/gensim-data/text9/text9.txt """ diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index 328d610a0d..14ada07904 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -77,7 +77,6 @@ import scipy.sparse from gensim import interfaces, utils, matutils -from six.moves import map, range, zip logger = logging.getLogger(__name__) diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py index 9a83458f84..3949d77960 100644 --- a/gensim/similarities/termsim.py +++ b/gensim/similarities/termsim.py @@ -14,7 +14,6 @@ from math import sqrt import numpy as np -from six.moves import range from scipy import sparse from gensim.matutils import corpus2csc @@ -22,9 +21,11 @@ logger = logging.getLogger(__name__) -NON_NEGATIVE_NORM_ASSERTION_MESSAGE = u"sparse documents must not contain any explicit " \ - u"zero entries and the similarity matrix S must satisfy x^T * S * x >= 0 for any " \ +NON_NEGATIVE_NORM_ASSERTION_MESSAGE = ( + u"sparse documents must not contain any explicit " + u"zero entries and the similarity matrix S must satisfy x^T * S * x >= 0 for any " u"nonzero bag-of-words vector x." +) class TermSimilarityIndex(SaveLoad): diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index 370897bfdb..9f01f9818b 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -19,8 +19,9 @@ >>> docvecs = model.fit_transform(common_texts) # represent `common_texts` as vectors """ + import numpy as np -from six import string_types + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -195,7 +196,7 @@ def transform(self, docs): ) # The input as array of array - if isinstance(docs[0], string_types): + if isinstance(docs[0], str): docs = [docs] vectors = [self.gensim_model.infer_vector(doc) for doc in docs] return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size)) diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py index 7acd22cfc2..c42f95274e 100644 --- a/gensim/sklearn_api/ftmodel.py +++ b/gensim/sklearn_api/ftmodel.py @@ -42,7 +42,7 @@ """ import numpy as np -import six + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -222,7 +222,7 @@ def transform(self, words): ) # The input as array of array - if isinstance(words, six.string_types): + if isinstance(words, str): words = [words] vectors = [self.gensim_model.wv[word] for word in words] return np.reshape(np.array(vectors), (len(words), self.vector_size)) diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py index 7cae8a609b..8d982bfa9c 100644 --- a/gensim/sklearn_api/text2bow.py +++ b/gensim/sklearn_api/text2bow.py @@ -25,7 +25,7 @@ [[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]] """ -from six import string_types + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -91,7 +91,7 @@ def transform(self, docs): ) # input as python lists - if isinstance(docs, string_types): + if isinstance(docs, str): docs = [docs] tokenized_docs = (list(self.tokenizer(doc)) for doc in docs) return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs] diff --git a/gensim/sklearn_api/w2vmodel.py 
b/gensim/sklearn_api/w2vmodel.py index ae64b56e3e..8c0bd932a1 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -25,8 +25,9 @@ >>> assert wordvecs.shape == (2, 10) """ + import numpy as np -import six + from sklearn.base import TransformerMixin, BaseEstimator from sklearn.exceptions import NotFittedError @@ -173,7 +174,7 @@ def transform(self, words): ) # The input as array of array - if isinstance(words, six.string_types): + if isinstance(words, str): words = [words] vectors = [self.gensim_model.wv[word] for word in words] return np.reshape(np.array(vectors), (len(words), self.vector_size)) diff --git a/gensim/test/basetmtests.py b/gensim/test/basetmtests.py index 78587c1e56..56de810691 100644 --- a/gensim/test/basetmtests.py +++ b/gensim/test/basetmtests.py @@ -9,28 +9,27 @@ """ import numpy as np -import six -class TestBaseTopicModel(object): +class TestBaseTopicModel: def test_print_topic(self): topics = self.model.show_topics(formatted=True) for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 + self.assertTrue(isinstance(topic, str)) def test_print_topics(self): topics = self.model.print_topics() for topic_no, topic in topics: self.assertTrue(isinstance(topic_no, int)) - self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821 + self.assertTrue(isinstance(topic, str)) def test_show_topic(self): topic = self.model.show_topic(1) for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(k, str)) self.assertTrue(isinstance(v, (np.floating, float))) def test_show_topics(self): @@ -40,7 +39,7 @@ def test_show_topics(self): self.assertTrue(isinstance(topic_no, int)) self.assertTrue(isinstance(topic, list)) for k, v in topic: - self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(k, str)) self.assertTrue(isinstance(v, (np.floating, float))) def test_get_topics(self): diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py index 50e6a32ea9..ccb6cce0a7 100644 --- a/gensim/test/test_atmodel.py +++ b/gensim/test/test_atmodel.py @@ -17,7 +17,6 @@ import numbers from os import remove -import six import numpy as np from gensim.corpora import mmcorpus, Dictionary @@ -27,6 +26,7 @@ from gensim.test.utils import (datapath, get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus) from gensim.matutils import jensen_shannon + # TODO: # Test that computing the bound on new unseen documents works as expected (this is somewhat different # in the author-topic model than in LDA). 
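# ---------------------------------------------------------------------------
# Editor's aside (illustrative sketch, not part of this patch): Python 3 has a
# single text type, so every `six.string_types` / `unicode` check above reduces
# to plain `str`.  The sklearn-style transform() methods use it to accept
# either a single document or a batch; a hypothetical helper showing the idiom:
def as_batch(words):
    if isinstance(words, str):       # was: isinstance(words, six.string_types)
        words = [words]
    return words

assert as_batch('human') == ['human']
assert as_batch(['human', 'interface']) == ['human', 'interface']
# ---------------------------------------------------------------------------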
@@ -413,7 +413,7 @@ def testTopTopics(self): self.assertTrue(isinstance(score, float)) for v, k in topic: - self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(k, str)) self.assertTrue(isinstance(v, float)) def testGetTopicTerms(self): diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index 3b518f95c6..f1c17ac0c9 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -14,7 +14,6 @@ import os import unittest -import six import numpy as np from numpy.testing import assert_allclose @@ -203,7 +202,7 @@ def testTopTopics(self): self.assertTrue(isinstance(score, float)) for v, k in topic: - self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(k, str)) self.assertTrue(np.issubdtype(v, np.floating)) def testGetTopicTerms(self): @@ -466,7 +465,7 @@ def testRandomStateBackwardCompatibility(self): for i in model_topics: self.assertTrue(isinstance(i[0], int)) - self.assertTrue(isinstance(i[1], six.string_types)) + self.assertTrue(isinstance(i[1], str)) # save back the loaded model using a post-0.13.2 version of Gensim post_0_13_2_fname = get_tmpfile('gensim_models_lda_post_0_13_2_model.tst') @@ -478,7 +477,7 @@ def testRandomStateBackwardCompatibility(self): for i in model_topics_new: self.assertTrue(isinstance(i[0], int)) - self.assertTrue(isinstance(i[1], six.string_types)) + self.assertTrue(isinstance(i[1], str)) def testDtypeBackwardCompatibility(self): lda_3_0_1_fname = datapath('lda_3_0_1_model') diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py index bedcdebc63..ddec17da23 100644 --- a/gensim/test/test_ldavowpalwabbit_wrapper.py +++ b/gensim/test/test_ldavowpalwabbit_wrapper.py @@ -19,8 +19,6 @@ import tempfile from collections import defaultdict -import six - from gensim.corpora import Dictionary import gensim.models.wrappers.ldavowpalwabbit as ldavowpalwabbit @@ -154,7 +152,7 @@ def test_topic_coherence(self): # get list of original topics that each word actually belongs to ids = [] for word in topic_words: - for src_topic_words, src_topic_id in six.iteritems(topic_map): + for src_topic_words, src_topic_id in topic_map.items(): if word in src_topic_words: ids.append(src_topic_id) @@ -165,7 +163,7 @@ def test_topic_coherence(self): # if at least 6/10 words assigned to same topic, consider it coherent max_count = 0 - for count in six.itervalues(counts): + for count in counts.values(): max_count = max(max_count, count) if max_count >= 6: diff --git a/gensim/test/test_nmf.py b/gensim/test/test_nmf.py index 763f61360c..e0e71433cb 100644 --- a/gensim/test/test_nmf.py +++ b/gensim/test/test_nmf.py @@ -14,7 +14,6 @@ import logging import numbers import numpy as np -import six from gensim import matutils from gensim.models import nmf @@ -108,7 +107,7 @@ def testTopTopics(self): self.assertTrue(isinstance(score, float)) for v, k in topic: - self.assertTrue(isinstance(k, six.string_types)) + self.assertTrue(isinstance(k, str)) self.assertTrue(np.issubdtype(v, float)) def testGetTopicTerms(self): diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py index 626c0de06c..18436bf655 100644 --- a/gensim/test/test_utils.py +++ b/gensim/test/test_utils.py @@ -6,13 +6,11 @@ """ Automated tests for checking various utils functions. 
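# ---------------------------------------------------------------------------
# Editor's aside (illustrative sketch, not part of this patch): `six.itervalues(d)`
# likewise becomes the view `d.values()`; the max-count loop in
# test_ldavowpalwabbit_wrapper.py could even collapse to a single call.  The
# data below is made up.
from collections import Counter

counts = Counter(['topic-0', 'topic-0', 'topic-1'])
max_count = max(counts.values(), default=0)   # was: a loop over six.itervalues(counts)
assert max_count == 2
# ---------------------------------------------------------------------------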
""" -from __future__ import unicode_literals import logging import unittest import numpy as np -from six import iteritems from gensim import utils from gensim.test.utils import datapath, get_tmpfile @@ -112,7 +110,7 @@ class TestSampleDict(unittest.TestCase): def test_sample_dict(self): d = {1: 2, 2: 3, 3: 4, 4: 5} expected_dict = [(1, 2), (2, 3)] - expected_dict_random = [(k, v) for k, v in iteritems(d)] + expected_dict_random = [(k, v) for k, v in d.items()] sampled_dict = utils.sample_dict(d, 2, False) self.assertEqual(sampled_dict, expected_dict) sampled_dict_random = utils.sample_dict(d, 2) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index c867f045e8..a9e57036f0 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -7,21 +7,15 @@ """ Automated tests for checking transformation algorithms (the models package). """ -from __future__ import division import logging import unittest import os import bz2 import sys -import six import numpy as np -from gensim import utils -from gensim.models import word2vec, keyedvectors -from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \ - LeeCorpus, lee_corpus_list from testfixtures import log_capture try: @@ -30,6 +24,13 @@ except (ImportError, ValueError): PYEMD_EXT = False +from gensim import utils +from gensim.models import word2vec, keyedvectors +from gensim.test.utils import ( + datapath, get_tmpfile, temporary_file, common_texts as sentences, + LeeCorpus, lee_corpus_list, +) + new_sentences = [ ['computer', 'artificial', 'intelligence'], @@ -166,7 +167,6 @@ def testOnlineLearningAfterSave(self): model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.epochs) self.assertEqual(len(model_neg.wv), 14) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testOnlineLearningFromFile(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" @@ -192,7 +192,6 @@ def testOnlineLearningFromFile(self): self.assertEqual(len(model_hs.wv), 14) self.assertEqual(len(model_neg.wv), 14) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testOnlineLearningAfterSaveFromFile(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" @@ -273,7 +272,6 @@ def testPersistence(self): self.assertTrue(np.allclose(wv.vectors, loaded_wv.vectors)) self.assertEqual(len(wv), len(loaded_wv)) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testPersistenceFromFile(self): """Test storing/loading the entire model trained with corpus_file argument.""" with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: @@ -498,7 +496,6 @@ def testTraining(self): model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testTrainingFromFile(self): """Test word2vec training with corpus_file argument.""" # build vocabulary, don't train yet @@ -574,7 +571,6 @@ def testEvaluateWordPairs(self): self.assertTrue(0.1 < spearman < 1.0, "spearman {spearman} not between 0.1 and 1.0") self.assertTrue(0.0 <= oov < 90.0, "OOV {oov} not between 0.0 and 90.0") - @unittest.skipIf(os.name == 
'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testEvaluateWordPairsFromFile(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: @@ -621,7 +617,6 @@ def test_sg_hs(self): model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_hs_fromfile(self): model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) @@ -631,7 +626,6 @@ def test_sg_neg(self): model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_neg_fromfile(self): model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) @@ -666,7 +660,6 @@ def test_cbow_hs(self, ranks=None): ) self.model_sanity(model, ranks=ranks) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_cbow_hs_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0, @@ -682,7 +675,6 @@ def test_cbow_neg(self, ranks=None): ) self.model_sanity(model, ranks=ranks) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_cbow_neg_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, @@ -1062,7 +1054,6 @@ def testLineSentenceWorksWithFilename(self): for words in sentences: self.assertEqual(words, utils.to_unicode(orig.readline()).split()) - @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def testCythonLineSentenceWorksWithFilename(self): """Does CythonLineSentence work with a filename argument?""" from gensim.models import word2vec_corpusfile diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 477a9f2bc3..fb4fda99b8 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -15,7 +15,6 @@ import numpy as np import scipy.sparse as sps -from six import iteritems, string_types from gensim import utils from gensim.models.word2vec import Word2Vec @@ -127,7 +126,7 @@ def analyze_text(self, text, doc_num=None): raise NotImplementedError("Base classes should implement analyze_text.") def __getitem__(self, word_or_words): - if isinstance(word_or_words, string_types) or not hasattr(word_or_words, '__iter__'): + if isinstance(word_or_words, str) or not hasattr(word_or_words, '__iter__'): return self.get_occurrences(word_or_words) else: return self.get_co_occurrences(*word_or_words) @@ -250,7 +249,7 @@ def _get_co_occurrences(self, word_id1, word_id2): return len(s1.intersection(s2)) def index_to_dict(self): - contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)} + contiguous2id = {n: word_id for word_id, n in self.id2contiguous.items()} return {contiguous2id[n]: doc_id_set for n, doc_id_set in enumerate(self._inverted_index)} @@ -359,7 +358,7 @@ def partial_accumulate(self, texts, window_size): self._counter.clear() 
super(WordOccurrenceAccumulator, self).accumulate(texts, window_size) - for combo, count in iteritems(self._counter): + for combo, count in self._counter.items(): self._co_occurrences[combo] += count return self diff --git a/setup.py b/setup.py index 426e89912a..6c09eaf6fa 100644 --- a/setup.py +++ b/setup.py @@ -332,7 +332,6 @@ def run(self): install_requires = [ NUMPY_STR, 'scipy >= 0.18.1', - 'six >= 1.5.0', 'smart_open >= 1.8.1', "dataclasses; python_version < '3.7'", # pre-py3.7 needs `dataclasses` backport for use of `dataclass` in doc2vec.py ]
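# ---------------------------------------------------------------------------
# Editor's aside (illustrative sketch, not part of this patch): with
# 'six >= 1.5.0' dropped from install_requires, a crude sanity check that no
# module in a source checkout still imports six might look like this; the
# path and heuristic are assumptions, not part of the change.
import pathlib
import re

pattern = re.compile(r'^\s*(import six\b|from six\b)', re.MULTILINE)
offenders = [
    str(path) for path in pathlib.Path('gensim').rglob('*.py')
    if pattern.search(path.read_text(encoding='utf-8'))
]
print(offenders or 'no remaining six imports')
# ---------------------------------------------------------------------------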