diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb
index 8d51ffc0e3..998115a80e 100644
--- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb
+++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.ipynb
@@ -224,7 +224,7 @@
},
"outputs": [],
"source": [
- "from six import iteritems\n# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)"
+ "# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)"
]
},
{
diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
index 426ecf2407..5a77b4e637 100644
--- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
+++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
@@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:
-from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
@@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
-once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
+once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5 b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5
index 1a6c2797e8..9e8401aae5 100644
--- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5
+++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.py.md5
@@ -1 +1 @@
-e017de81683bfd2f6005a3186bfc1eb3
\ No newline at end of file
+c239d5c523ea2b3af1f6d4c6c51e7925
\ No newline at end of file
diff --git a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst
index a95de20d11..4b55ff959e 100644
--- a/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst
+++ b/docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst
@@ -159,10 +159,10 @@ between the questions and ids is called a dictionary:
.. code-block:: none
- 2020-09-30 12:28:00,819 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
- 2020-09-30 12:28:00,820 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
- 2020-09-30 12:28:00,821 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
- 2020-09-30 12:28:00,822 : INFO : saved /tmp/deerwester.dict
+ 2020-10-19 01:23:37,722 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
+ 2020-10-19 01:23:37,722 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
+ 2020-10-19 01:23:37,722 : INFO : saving Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) under /tmp/deerwester.dict, separately None
+ 2020-10-19 01:23:37,723 : INFO : saved /tmp/deerwester.dict
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
@@ -244,11 +244,11 @@ therefore reads: in the document `"Human computer interaction"`, the words `comp
.. code-block:: none
- 2020-09-30 12:28:01,181 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
- 2020-09-30 12:28:01,182 : INFO : saving sparse matrix to /tmp/deerwester.mm
- 2020-09-30 12:28:01,182 : INFO : PROGRESS: saving document #0
- 2020-09-30 12:28:01,182 : INFO : saved 9x12 matrix, density=25.926% (28/108)
- 2020-09-30 12:28:01,183 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index
+ 2020-10-19 01:23:38,012 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
+ 2020-10-19 01:23:38,013 : INFO : saving sparse matrix to /tmp/deerwester.mm
+ 2020-10-19 01:23:38,013 : INFO : PROGRESS: saving document #0
+ 2020-10-19 01:23:38,016 : INFO : saved 9x12 matrix, density=25.926% (28/108)
+ 2020-10-19 01:23:38,016 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
@@ -334,7 +334,7 @@ then convert the tokens via a dictionary to their ids and yield the resulting sp
.. code-block:: none
- <__main__.MyCorpus object at 0x125b5a128>
+ <__main__.MyCorpus object at 0x117e06828>
@@ -383,7 +383,6 @@ Similarly, to construct the dictionary without loading all texts into memory:
.. code-block:: default
- from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
@@ -392,7 +391,7 @@ Similarly, to construct the dictionary without loading all texts into memory:
for stopword in stoplist
if stopword in dictionary.token2id
]
- once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
+ once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
@@ -407,8 +406,8 @@ Similarly, to construct the dictionary without loading all texts into memory:
.. code-block:: none
- 2020-09-30 12:28:02,652 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
- 2020-09-30 12:28:02,653 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)
+ 2020-10-19 01:23:38,980 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
+ 2020-10-19 01:23:38,981 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
@@ -455,11 +454,11 @@ create a toy corpus of 2 documents, as a plain Python list
.. code-block:: none
- 2020-09-30 12:28:02,781 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
- 2020-09-30 12:28:02,782 : INFO : saving sparse matrix to /tmp/corpus.mm
- 2020-09-30 12:28:02,783 : INFO : PROGRESS: saving document #0
- 2020-09-30 12:28:02,783 : INFO : saved 2x2 matrix, density=25.000% (1/4)
- 2020-09-30 12:28:02,783 : INFO : saving MmCorpus index to /tmp/corpus.mm.index
+ 2020-10-19 01:23:39,099 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
+ 2020-10-19 01:23:39,100 : INFO : saving sparse matrix to /tmp/corpus.mm
+ 2020-10-19 01:23:39,100 : INFO : PROGRESS: saving document #0
+ 2020-10-19 01:23:39,101 : INFO : saved 2x2 matrix, density=25.000% (1/4)
+ 2020-10-19 01:23:39,101 : INFO : saving MmCorpus index to /tmp/corpus.mm.index
@@ -487,16 +486,16 @@ Other formats include `Joachim's SVMlight format
.. code-block:: none
- 2020-09-30 12:28:02,842 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
- 2020-09-30 12:28:02,844 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
- 2020-09-30 12:28:02,844 : INFO : no word id mapping provided; initializing from corpus
- 2020-09-30 12:28:02,844 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
- 2020-09-30 12:28:02,844 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
- 2020-09-30 12:28:02,845 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
- 2020-09-30 12:28:02,904 : INFO : no word id mapping provided; initializing from corpus
- 2020-09-30 12:28:02,905 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
- 2020-09-30 12:28:02,906 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value
- 2020-09-30 12:28:02,906 : INFO : saving LowCorpus index to /tmp/corpus.low.index
+ 2020-10-19 01:23:39,152 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
+ 2020-10-19 01:23:39,153 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
+ 2020-10-19 01:23:39,154 : INFO : no word id mapping provided; initializing from corpus
+ 2020-10-19 01:23:39,154 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
+ 2020-10-19 01:23:39,154 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
+ 2020-10-19 01:23:39,154 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
+ 2020-10-19 01:23:39,206 : INFO : no word id mapping provided; initializing from corpus
+ 2020-10-19 01:23:39,207 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
+ 2020-10-19 01:23:39,207 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value
+ 2020-10-19 01:23:39,207 : INFO : saving LowCorpus index to /tmp/corpus.low.index
@@ -519,9 +518,9 @@ Conversely, to load a corpus iterator from a Matrix Market file:
.. code-block:: none
- 2020-09-30 12:28:02,968 : INFO : loaded corpus index from /tmp/corpus.mm.index
- 2020-09-30 12:28:02,969 : INFO : initializing cython corpus reader from /tmp/corpus.mm
- 2020-09-30 12:28:02,970 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries
+ 2020-10-19 01:23:39,260 : INFO : loaded corpus index from /tmp/corpus.mm.index
+ 2020-10-19 01:23:39,262 : INFO : initializing cython corpus reader from /tmp/corpus.mm
+ 2020-10-19 01:23:39,262 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries
@@ -620,10 +619,10 @@ To save the same Matrix Market document stream in Blei's LDA-C format,
.. code-block:: none
- 2020-09-30 12:28:03,395 : INFO : no word id mapping provided; initializing from corpus
- 2020-09-30 12:28:03,397 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
- 2020-09-30 12:28:03,397 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
- 2020-09-30 12:28:03,398 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
+ 2020-10-19 01:23:39,634 : INFO : no word id mapping provided; initializing from corpus
+ 2020-10-19 01:23:39,636 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
+ 2020-10-19 01:23:39,636 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
+ 2020-10-19 01:23:39,636 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
@@ -711,9 +710,9 @@ Optimize converting between corpora and NumPy/SciPy arrays?), see the :ref:`apir
.. rst-class:: sphx-glr-timing
- **Total running time of the script:** ( 0 minutes 3.219 seconds)
+ **Total running time of the script:** ( 0 minutes 2.979 seconds)
-**Estimated memory usage:** 10 MB
+**Estimated memory usage:** 39 MB
.. _sphx_glr_download_auto_examples_core_run_corpora_and_vector_spaces.py:
diff --git a/docs/src/auto_examples/core/sg_execution_times.rst b/docs/src/auto_examples/core/sg_execution_times.rst
index 419b52b786..d346e546cb 100644
--- a/docs/src/auto_examples/core/sg_execution_times.rst
+++ b/docs/src/auto_examples/core/sg_execution_times.rst
@@ -5,14 +5,14 @@
Computation times
=================
-**00:06.698** total execution time for **auto_examples_core** files:
+**00:02.979** total execution time for **auto_examples_core** files:
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
-| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:03.219 | 9.7 MB |
+| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:02.979 | 38.7 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
-| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:01.675 | 36.8 MB |
+| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
-| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.970 | 7.2 MB |
+| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
-| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.834 | 6.5 MB |
+| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
diff --git a/docs/src/gallery/core/run_corpora_and_vector_spaces.py b/docs/src/gallery/core/run_corpora_and_vector_spaces.py
index 426ecf2407..5a77b4e637 100644
--- a/docs/src/gallery/core/run_corpora_and_vector_spaces.py
+++ b/docs/src/gallery/core/run_corpora_and_vector_spaces.py
@@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:
-from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
@@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
-once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
+once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
diff --git a/gensim/corpora/bleicorpus.py b/gensim/corpora/bleicorpus.py
index 1afde870d2..15d79aeffd 100644
--- a/gensim/corpora/bleicorpus.py
+++ b/gensim/corpora/bleicorpus.py
@@ -14,7 +14,6 @@
from gensim import utils
from gensim.corpora import IndexedCorpus
-from six.moves import range
logger = logging.getLogger(__name__)
diff --git a/gensim/corpora/dictionary.py b/gensim/corpora/dictionary.py
index ba3795062d..e046134250 100644
--- a/gensim/corpora/dictionary.py
+++ b/gensim/corpora/dictionary.py
@@ -6,22 +6,13 @@
"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""
-from __future__ import with_statement
-
from collections import defaultdict
from collections.abc import Mapping
-import sys
import logging
import itertools
from gensim import utils
-from six import PY3, iteritems, iterkeys, itervalues, string_types
-from six.moves import zip, range
-
-if sys.version_info[0] >= 3:
- unicode = str
-
logger = logging.getLogger(__name__)
@@ -116,15 +107,14 @@ def __iter__(self):
"""Iterate over all tokens."""
return iter(self.keys())
- if PY3:
- # restore Py2-style dict API
- iterkeys = __iter__
+ # restore Py2-style dict API
+ iterkeys = __iter__
- def iteritems(self):
- return self.items()
+ def iteritems(self):
+ return self.items()
- def itervalues(self):
- return self.values()
+ def itervalues(self):
+ return self.values()
def keys(self):
"""Get all stored ids.
@@ -149,7 +139,7 @@ def __len__(self):
return len(self.token2id)
def __str__(self):
- some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
+ some_keys = list(itertools.islice(self.token2id.keys(), 5))
return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')
@staticmethod
@@ -245,35 +235,35 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
([(2, 1)], {u'this': 1, u'is': 1})
"""
- if isinstance(document, string_types):
+ if isinstance(document, str):
raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")
# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
- counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
+ counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1
token2id = self.token2id
if allow_update or return_missing:
- missing = sorted(x for x in iteritems(counter) if x[0] not in token2id)
+ missing = sorted(x for x in counter.items() if x[0] not in token2id)
if allow_update:
for w, _ in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
- result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
+ result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}
if allow_update:
self.num_docs += 1
- self.num_pos += sum(itervalues(counter))
+ self.num_pos += sum(counter.values())
self.num_nnz += len(result)
# keep track of document and collection frequencies
- for tokenid, freq in iteritems(result):
+ for tokenid, freq in result.items():
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
- result = sorted(iteritems(result))
+ result = sorted(result.items())
if return_missing:
return result, dict(missing)
else:
@@ -307,10 +297,10 @@ def doc2idx(self, document, unknown_word_index=-1):
[0, 0, 2, -1, 2]
"""
- if isinstance(document, string_types):
+ if isinstance(document, str):
raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")
- document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
+ document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]
def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
@@ -361,13 +351,13 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
if keep_tokens:
keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
good_ids = [
- v for v in itervalues(self.token2id)
+ v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
]
good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
else:
good_ids = [
- v for v in itervalues(self.token2id)
+ v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs
]
good_ids.sort(key=self.dfs.get, reverse=True)
@@ -408,7 +398,7 @@ def filter_n_most_frequent(self, remove_n):
"""
# determine which tokens to keep
- most_frequent_ids = (v for v in itervalues(self.token2id))
+ most_frequent_ids = (v for v in self.token2id.values())
most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
most_frequent_ids = most_frequent_ids[:remove_n]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
@@ -452,14 +442,14 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
"""
if bad_ids is not None:
bad_ids = set(bad_ids)
- self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids}
- self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids}
- self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids}
+ self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
+ self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
+ self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
- self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids}
- self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids}
- self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids}
+ self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
+ self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
+ self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
self.compactify()
def compactify(self):
@@ -467,13 +457,13 @@ def compactify(self):
logger.debug("rebuilding dictionary, shrinking gaps")
# build mapping from old id -> new id
- idmap = dict(zip(sorted(itervalues(self.token2id)), range(len(self.token2id))))
+ idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))
# reassign mappings to new ids
- self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)}
+ self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
self.id2token = {}
- self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
- self.cfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.cfs)}
+ self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
+ self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}
def save_as_text(self, fname, sort_by_word=True):
"""Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
@@ -527,11 +517,11 @@ def save_as_text(self, fname, sort_by_word=True):
numdocs_line = "%d\n" % self.num_docs
fout.write(utils.to_utf8(numdocs_line))
if sort_by_word:
- for token, tokenid in sorted(iteritems(self.token2id)):
+ for token, tokenid in sorted(self.token2id.items()):
line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
fout.write(utils.to_utf8(line))
else:
- for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
+ for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
fout.write(utils.to_utf8(line))
@@ -573,7 +563,7 @@ def merge_with(self, other):
"""
old2new = {}
- for other_id, other_token in iteritems(other):
+ for other_id, other_token in other.items():
if other_token in self.token2id:
new_id = self.token2id[other_token]
else:
@@ -748,11 +738,11 @@ def from_corpus(corpus, id2word=None):
if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
- result.token2id = {unicode(i): i for i in range(max_id + 1)}
+ result.token2id = {str(i): i for i in range(max_id + 1)}
else:
# id=>word mapping given: simply copy it
- result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)}
- for idx in itervalues(result.token2id):
+ result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
+ for idx in result.token2id.values():
# make sure all token ids have a valid `dfs` entry
result.dfs[idx] = result.dfs.get(idx, 0)
diff --git a/gensim/corpora/hashdictionary.py b/gensim/corpora/hashdictionary.py
index cb3f4053ea..8eb6d87dd1 100644
--- a/gensim/corpora/hashdictionary.py
+++ b/gensim/corpora/hashdictionary.py
@@ -27,14 +27,11 @@
"""
-from __future__ import with_statement
-
import logging
import itertools
import zlib
from gensim import utils
-from six import iteritems, iterkeys
logger = logging.getLogger(__name__)
@@ -252,11 +249,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
if self.debug:
# increment document count for each unique tokenid that appeared in the document
# done here, because several words may map to the same tokenid
- for tokenid in iterkeys(result):
+ for tokenid in result.keys():
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1
# return tokenids, in ascending id order
- result = sorted(iteritems(result))
+ result = sorted(result.items())
if return_missing:
return result, missing
else:
@@ -293,16 +290,16 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
- ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
+ ok = [item for item in self.dfs_debug.items() if no_below <= item[1] <= no_above_abs]
ok = frozenset(word for word, freq in sorted(ok, key=lambda x: -x[1])[:keep_n])
- self.dfs_debug = {word: freq for word, freq in iteritems(self.dfs_debug) if word in ok}
- self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug}
+ self.dfs_debug = {word: freq for word, freq in self.dfs_debug.items() if word in ok}
+ self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if token in self.dfs_debug}
self.id2token = {
tokenid: {token for token in tokens if token in self.dfs_debug}
- for tokenid, tokens in iteritems(self.id2token)
+ for tokenid, tokens in self.id2token.items()
}
- self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)}
+ self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if self.id2token.get(tokenid, False)}
# for word->document frequency
logger.info(
diff --git a/gensim/corpora/indexedcorpus.py b/gensim/corpora/indexedcorpus.py
index 1ab8331a0a..8624c54fbf 100644
--- a/gensim/corpora/indexedcorpus.py
+++ b/gensim/corpora/indexedcorpus.py
@@ -8,7 +8,6 @@
"""Base Indexed Corpus class."""
import logging
-import six
import numpy
@@ -182,7 +181,7 @@ def __getitem__(self, docno):
raise RuntimeError("Cannot call corpus[docid] without an index")
if isinstance(docno, (slice, list, numpy.ndarray)):
return utils.SlicedCorpus(self, docno)
- elif isinstance(docno, six.integer_types + (numpy.integer,)):
+ elif isinstance(docno, (int, numpy.integer,)):
return self.docbyoffset(self.index[docno])
# TODO: no `docbyoffset` method, should be defined in this class
else:
diff --git a/gensim/corpora/lowcorpus.py b/gensim/corpora/lowcorpus.py
index d52f190187..80dacf8ec0 100644
--- a/gensim/corpora/lowcorpus.py
+++ b/gensim/corpora/lowcorpus.py
@@ -4,17 +4,13 @@
# Copyright (C) 2010 Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
"""Corpus in `GibbsLda++ format `_."""
-from __future__ import with_statement
-
import logging
from collections import Counter
from gensim import utils
from gensim.corpora import IndexedCorpus
-from six.moves import zip, range
logger = logging.getLogger(__name__)
diff --git a/gensim/corpora/sharded_corpus.py b/gensim/corpora/sharded_corpus.py
index c9ebf1841b..b8858f93c3 100644
--- a/gensim/corpora/sharded_corpus.py
+++ b/gensim/corpora/sharded_corpus.py
@@ -26,7 +26,6 @@
import numpy
import scipy.sparse as sparse
-from six.moves import range
import gensim
from gensim.corpora import IndexedCorpus
@@ -290,7 +289,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp
for i, doc in enumerate(doc_chunk):
doc = dict(doc)
- current_shard[i][list(doc)] = list(gensim.matutils.itervalues(doc))
+ current_shard[i][list(doc)] = list(doc.values())
# Handles the updating as well.
if self.sparse_serialization:
diff --git a/gensim/corpora/ucicorpus.py b/gensim/corpora/ucicorpus.py
index 6f5f2f85f3..fd81f3ef2a 100644
--- a/gensim/corpora/ucicorpus.py
+++ b/gensim/corpora/ucicorpus.py
@@ -7,7 +7,6 @@
"""Corpus in `UCI format `_."""
-from __future__ import with_statement
import logging
from collections import defaultdict
@@ -17,7 +16,6 @@
from gensim.corpora import IndexedCorpus
from gensim.matutils import MmReader
from gensim.matutils import MmWriter
-from six.moves import range
logger = logging.getLogger(__name__)
diff --git a/gensim/corpora/wikicorpus.py b/gensim/corpora/wikicorpus.py
index 78affc1028..8c3c94b5ff 100644
--- a/gensim/corpora/wikicorpus.py
+++ b/gensim/corpora/wikicorpus.py
@@ -34,8 +34,6 @@
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.textcorpus import TextCorpus
-from six import raise_from
-
logger = logging.getLogger(__name__)
@@ -704,14 +702,16 @@ def get_texts(self):
yield tokens
except KeyboardInterrupt:
- logger.warn(
+ logger.warning(
"user terminated iteration over Wikipedia corpus after %i documents with %i positions "
"(total %i articles, %i positions before pruning articles shorter than %i words)",
articles, positions, articles_all, positions_all, self.article_min_tokens
)
except PicklingError as exc:
- raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
- 'make sure the function can be pickled.'.format(self.filter_articles)), exc)
+ raise PicklingError(
+ f'Can not send filtering function {self.filter_articles} to multiprocessing, '
+ 'make sure the function can be pickled.'
+ ) from exc
else:
logger.info(
"finished iterating over Wikipedia corpus of %i documents with %i positions "
diff --git a/gensim/interfaces.py b/gensim/interfaces.py
index 3fd266eb62..3358adaab5 100644
--- a/gensim/interfaces.py
+++ b/gensim/interfaces.py
@@ -14,12 +14,9 @@
"""
-from __future__ import with_statement
-
import logging
from gensim import utils, matutils
-from six.moves import range
logger = logging.getLogger(__name__)
diff --git a/gensim/matutils.py b/gensim/matutils.py
index c9d0e19f59..dbdd3f1439 100644
--- a/gensim/matutils.py
+++ b/gensim/matutils.py
@@ -22,9 +22,6 @@
from scipy.linalg.special_matrices import triu
from scipy.special import psi # gamma function utils
-from six import iteritems, itervalues
-from six.moves import zip, range
-
logger = logging.getLogger(__name__)
@@ -398,7 +395,7 @@ def sparse2full(doc, length):
doc = dict(doc)
# overwrite some of the zeroes with explicit values
- result[list(doc)] = list(itervalues(doc))
+ result[list(doc)] = list(doc.values())
return result
@@ -807,12 +804,12 @@ def cossim(vec1, vec2):
vec1, vec2 = dict(vec1), dict(vec2)
if not vec1 or not vec2:
return 0.0
- vec1len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec1)))
- vec2len = 1.0 * math.sqrt(sum(val * val for val in itervalues(vec2)))
+ vec1len = 1.0 * math.sqrt(sum(val * val for val in vec1.values()))
+ vec2len = 1.0 * math.sqrt(sum(val * val for val in vec2.values()))
assert vec1len > 0.0 and vec2len > 0.0, "sparse documents must not contain any explicit zero entries"
if len(vec2) < len(vec1):
vec1, vec2 = vec2, vec1 # swap references so that we iterate over the shorter vector
- result = sum(value * vec2.get(index, 0.0) for index, value in iteritems(vec1))
+ result = sum(value * vec2.get(index, 0.0) for index, value in vec1.items())
result /= vec1len * vec2len # rescale by vector lengths
return result
@@ -982,7 +979,7 @@ def jaccard(vec1, vec2):
union = sum(weight for id_, weight in vec1) + sum(weight for id_, weight in vec2)
vec1, vec2 = dict(vec1), dict(vec2)
intersection = 0.0
- for feature_id, feature_weight in iteritems(vec1):
+ for feature_id, feature_weight in vec1.items():
intersection += min(feature_weight, vec2.get(feature_id, 0.0))
return 1 - float(intersection) / float(union)
else:
diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 77549b1351..5fbce5d926 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -35,7 +35,6 @@
"""
-import codecs
import collections
import gzip
import io
@@ -43,7 +42,6 @@
import struct
import numpy as np
-import six
_END_OF_WORD_MARKER = b'\x00'
@@ -674,7 +672,3 @@ def save(model, fout, fb_fasttext_parameters, encoding):
_save_to_stream(model, fout_stream, fb_fasttext_parameters, encoding)
else:
_save_to_stream(model, fout, fb_fasttext_parameters, encoding)
-
-
-if six.PY2:
- codecs.register_error('backslashreplace', _backslashreplace_backport)
diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py
index afe529a395..fdc3f3c0dd 100755
--- a/gensim/models/atmodel.py
+++ b/gensim/models/atmodel.py
@@ -51,27 +51,27 @@
>>> author_vecs = [model.get_author_topics(author) for author in model.id2author.values()]
"""
+
# TODO: this class inherits LdaModel and overwrites some methods. There is some code
# duplication still, and a refactor could be made to avoid this. Comments with "TODOs"
# are included in the code where this is the case, for example in the log_perplexity
# and do_estep methods.
import logging
-import numpy as np # for arrays, array broadcasting etc.
+from itertools import chain
from copy import deepcopy
from shutil import copyfile
from os.path import isfile
from os import remove
+import numpy as np # for arrays, array broadcasting etc.
+from scipy.special import gammaln # gamma function utils
+
from gensim import utils
from gensim.models import LdaModel
from gensim.models.ldamodel import LdaState
from gensim.matutils import dirichlet_expectation, mean_absolute_difference
from gensim.corpora import MmCorpus
-from itertools import chain
-from scipy.special import gammaln # gamma function utils
-from six.moves import range
-import six
logger = logging.getLogger(__name__)
@@ -283,7 +283,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
assert self.alpha.shape == (self.num_topics,), \
"Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
- if isinstance(eta, six.string_types):
+ if isinstance(eta, str):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")
@@ -458,7 +458,7 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c
doc_no = d
# Get the IDs and counts of all the words in the current document.
# TODO: this is duplication of code in LdaModel. Refactor.
- if doc and not isinstance(doc[0][0], six.integer_types + (np.integer,)):
+ if doc and not isinstance(doc[0][0], (int, np.integer,)):
# make sure the term IDs are ints, otherwise np will get upset
ids = [int(idx) for idx, _ in doc]
else:
diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index d8d7185487..c29e0a0737 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -49,7 +49,6 @@
>>> hdp.update([[(1, 2)], [(1, 1), (4, 5)]])
"""
-from __future__ import with_statement
import logging
import time
@@ -57,13 +56,13 @@
import numpy as np
from scipy.special import gammaln, psi # gamma function utils
-from six.moves import zip, range
from gensim import interfaces, utils, matutils
from gensim.matutils import dirichlet_expectation, mean_absolute_difference
from gensim.models import basemodel, ldamodel
from gensim.utils import deprecated
+
logger = logging.getLogger(__name__)
meanchangethresh = 0.00001
diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py
index 8ca05ba36d..f81cd806eb 100755
--- a/gensim/models/lda_dispatcher.py
+++ b/gensim/models/lda_dispatcher.py
@@ -56,20 +56,16 @@
"""
-from __future__ import with_statement
import argparse
import os
import sys
import logging
import threading
import time
-from six import iteritems, itervalues
+from queue import Queue
-try:
- from Queue import Queue
-except ImportError:
- from queue import Queue
import Pyro4
+
from gensim import utils
from gensim.models.lda_worker import LDA_WORKER_PREFIX
@@ -143,7 +139,7 @@ def initialize(self, **model_params):
self.workers = {}
with utils.getNS(**self.ns_conf) as ns:
self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX])
- for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)):
+ for name, uri in ns.list(prefix=LDA_WORKER_PREFIX).items():
try:
worker = Pyro4.Proxy(uri)
workerid = len(self.workers)
@@ -168,7 +164,7 @@ def getworkers(self):
The pyro URIs for each worker.
"""
- return [worker._pyroUri for worker in itervalues(self.workers)]
+ return [worker._pyroUri for worker in self.workers.values()]
@Pyro4.expose
def getjob(self, worker_id):
@@ -223,7 +219,7 @@ def getstate(self):
i += 1
if i > count:
i = 0
- for workerid, worker in iteritems(self.workers):
+ for workerid, worker in self.workers.items():
logger.info("checking aliveness for worker %s", workerid)
worker.ping()
@@ -246,7 +242,7 @@ def reset(self, state):
State of :class:`~gensim.models.lda.LdaModel`.
"""
- for workerid, worker in iteritems(self.workers):
+ for workerid, worker in self.workers.items():
logger.info("resetting worker %s", workerid)
worker.reset(state)
worker.requestjob()
@@ -289,7 +285,7 @@ def jobsdone(self):
@Pyro4.oneway
def exit(self):
"""Terminate all registered workers and then the dispatcher."""
- for workerid, worker in iteritems(self.workers):
+ for workerid, worker in self.workers.items():
logger.info("terminating worker %s", workerid)
worker.exit()
logger.info("terminating dispatcher")
diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py
index c4a26e6967..0863bcc578 100755
--- a/gensim/models/ldamodel.py
+++ b/gensim/models/ldamodel.py
@@ -90,10 +90,8 @@
import os
import numpy as np
-import six
from scipy.special import gammaln, psi # gamma function utils
from scipy.special import polygamma
-from six.moves import range
from collections import defaultdict
from gensim import interfaces, utils, matutils
@@ -464,7 +462,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
assert self.alpha.shape == (self.num_topics,), \
"Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), self.num_topics)
- if isinstance(eta, six.string_types):
+ if isinstance(eta, str):
if eta == 'asymmetric':
raise ValueError("The 'asymmetric' option cannot be used for eta")
@@ -557,7 +555,7 @@ def init_dir_prior(self, prior, name):
is_auto = False
- if isinstance(prior, six.string_types):
+ if isinstance(prior, str):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.fromiter(
@@ -674,7 +672,7 @@ def inference(self, chunk, collect_sstats=False):
# Inference code copied from Hoffman's `onlineldavb.py` (esp. the
# Lee&Seung trick which speeds things up by an order of magnitude, compared
# to Blei's original LDA-C code, cool!).
- integer_types = six.integer_types + (np.integer,)
+ integer_types = (int, np.integer,)
epsilon = np.finfo(self.dtype).eps
for d, doc in enumerate(chunk):
if len(doc) > 0 and not isinstance(doc[0][0], integer_types):
@@ -1585,7 +1583,7 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **
# make sure 'state', 'id2word' and 'dispatcher' are ignored from the pickled object, even if
# someone sets the ignore list themselves
if ignore is not None and ignore:
- if isinstance(ignore, six.string_types):
+ if isinstance(ignore, str):
ignore = [ignore]
ignore = [e for e in ignore if e] # make sure None and '' are not in the list
ignore = list({'state', 'dispatcher', 'id2word'} | set(ignore))
@@ -1597,15 +1595,15 @@ def save(self, fname, ignore=('state', 'dispatcher'), separately=None, *args, **
separately_explicit = ['expElogbeta', 'sstats']
# Also add 'alpha' and 'eta' to separately list if they are set 'auto' or some
# array manually.
- if (isinstance(self.alpha, six.string_types) and self.alpha == 'auto') or \
+ if (isinstance(self.alpha, str) and self.alpha == 'auto') or \
(isinstance(self.alpha, np.ndarray) and len(self.alpha.shape) != 1):
separately_explicit.append('alpha')
- if (isinstance(self.eta, six.string_types) and self.eta == 'auto') or \
+ if (isinstance(self.eta, str) and self.eta == 'auto') or \
(isinstance(self.eta, np.ndarray) and len(self.eta.shape) != 1):
separately_explicit.append('eta')
# Merge separately_explicit with separately.
if separately:
- if isinstance(separately, six.string_types):
+ if isinstance(separately, str):
separately = [separately]
separately = [e for e in separately if e] # make sure None and '' are not in the list
separately = list(set(separately_explicit) | set(separately))
diff --git a/gensim/models/ldamulticore.py b/gensim/models/ldamulticore.py
index a4a0c297de..6ef8d5ea82 100644
--- a/gensim/models/ldamulticore.py
+++ b/gensim/models/ldamulticore.py
@@ -84,15 +84,14 @@
"""
import logging
+import queue
+from multiprocessing import Pool, Queue, cpu_count
import numpy as np
from gensim import utils
from gensim.models.ldamodel import LdaModel, LdaState
-import six
-from six.moves import queue, range
-from multiprocessing import Pool, Queue, cpu_count
logger = logging.getLogger(__name__)
@@ -173,7 +172,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
self.workers = max(1, cpu_count() - 1) if workers is None else workers
self.batch = batch
- if isinstance(alpha, six.string_types) and alpha == 'auto':
+ if isinstance(alpha, str) and alpha == 'auto':
raise NotImplementedError("auto-tuning alpha not implemented in multicore LDA; use plain LdaModel.")
super(LdaMulticore, self).__init__(
diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 527530cf51..100aed748f 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -52,22 +52,26 @@
"""
-from gensim import utils, matutils
-from gensim.models import ldamodel
+import logging
+
import numpy as np
from scipy.special import digamma, gammaln
from scipy import optimize
-import logging
-from six.moves import range, zip
+
+from gensim import utils, matutils
+from gensim.models import ldamodel
+
logger = logging.getLogger(__name__)
class LdaSeqModel(utils.SaveLoad):
"""Estimate Dynamic Topic Model parameters based on a training corpus."""
- def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
- initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
- random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100):
+ def __init__(
+ self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
+ initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
+ random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20, chunksize=100,
+ ):
"""
Parameters
diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py
index cb7fd4c053..b593e94cd3 100755
--- a/gensim/models/lsi_dispatcher.py
+++ b/gensim/models/lsi_dispatcher.py
@@ -1,290 +1,287 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-#
-# Copyright (C) 2010 Radim Rehurek
-# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
-
-"""Dispatcher process which orchestrates distributed :class:`~gensim.models.lsimodel.LsiModel` computations.
-Run this script only once, on any node in your cluster.
-
-Notes
------
-The dispatcher expects to find worker scripts already running. Make sure you run as many workers as you like on
-your machines **before** launching the dispatcher.
-
-
-How to use distributed LSI
---------------------------
-
-#. Install needed dependencies (Pyro4) ::
-
- pip install gensim[distributed]
-
-#. Setup serialization (on each machine) ::
-
- export PYRO_SERIALIZERS_ACCEPTED=pickle
- export PYRO_SERIALIZER=pickle
-
-#. Run nameserver ::
-
- python -m Pyro4.naming -n 0.0.0.0 &
-
-#. Run workers (on each machine) ::
-
- python -m gensim.models.lsi_worker &
-
-#. Run dispatcher ::
-
- python -m gensim.models.lsi_dispatcher &
-
-#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode:
-
- .. sourcecode:: pycon
-
- >>> from gensim.test.utils import common_corpus, common_dictionary
- >>> from gensim.models import LsiModel
- >>>
- >>> model = LsiModel(common_corpus, id2word=common_dictionary, distributed=True)
-
-Command line arguments
-----------------------
-
-.. program-output:: python -m gensim.models.lsi_dispatcher --help
- :ellipsis: 0, -5
-
-"""
-
-from __future__ import with_statement
-import os
-import sys
-import logging
-import argparse
-import threading
-import time
-from six import iteritems, itervalues
-
-try:
- from Queue import Queue
-except ImportError:
- from queue import Queue
-import Pyro4
-from gensim import utils
-
-logger = logging.getLogger(__name__)
-
-# How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue?
-# A small number is usually enough, unless iteration over the corpus is very very
-# slow (slower than the actual computation of LSI), in which case you can override
-# this value from command line. ie. run "python ./lsi_dispatcher.py 100"
-MAX_JOBS_QUEUE = 10
-
-# timeout for the Queue object put/get blocking methods.
-# it should really be infinity, but then keyboard interrupts don't work.
-# so this is really just a hack, see http://bugs.python.org/issue1360
-HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year
-
-
-class Dispatcher(object):
- """Dispatcher object that communicates and coordinates individual workers.
-
- Warnings
- --------
- There should never be more than one dispatcher running at any one time.
-
- """
- def __init__(self, maxsize=0):
- """Partly initialize the dispatcher.
-
- A full initialization (including initialization of the workers) requires a call to
- :meth:`~gensim.models.lsi_dispatcher.Dispatcher.initialize`
-
- Parameters
- ----------
- maxsize : int, optional
- Maximum number of jobs to be kept pre-fetched in the queue.
-
- """
- self.maxsize = maxsize
- self.workers = {}
- self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later)
-
- @Pyro4.expose
- def initialize(self, **model_params):
- """Fully initialize the dispatcher and all its workers.
-
- Parameters
- ----------
- **model_params
- Keyword parameters used to initialize individual workers
- (gets handed all the way down to :meth:`gensim.models.lsi_worker.Worker.initialize`).
- See :class:`~gensim.models.lsimodel.LsiModel`.
-
- Raises
- ------
- RuntimeError
- When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand).
-
- """
- self.jobs = Queue(maxsize=self.maxsize)
- self.lock_update = threading.Lock()
- self._jobsdone = 0
- self._jobsreceived = 0
-
- # locate all available workers and store their proxies, for subsequent RMI calls
- self.workers = {}
- with utils.getNS() as ns:
- self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self
- for name, uri in iteritems(ns.list(prefix='gensim.lsi_worker')):
- try:
- worker = Pyro4.Proxy(uri)
- workerid = len(self.workers)
- # make time consuming methods work asynchronously
- logger.info("registering worker #%i from %s", workerid, uri)
- worker.initialize(workerid, dispatcher=self.callback, **model_params)
- self.workers[workerid] = worker
- except Pyro4.errors.PyroError:
- logger.exception("unresponsive worker at %s, deleting it from the name server", uri)
- ns.remove(name)
-
- if not self.workers:
- raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
-
- @Pyro4.expose
- def getworkers(self):
- """Get pyro URIs of all registered workers.
-
- Returns
- -------
- list of URIs
- The pyro URIs for each worker.
-
- """
- return [worker._pyroUri for worker in itervalues(self.workers)]
-
- @Pyro4.expose
- def getjob(self, worker_id):
- """Atomically pop a job from the queue.
-
- Parameters
- ----------
- worker_id : int
- The worker that requested the job.
-
- Returns
- -------
- iterable of iterable of (int, float)
- The corpus in BoW format.
-
- """
- logger.info("worker #%i requesting a new job", worker_id)
- job = self.jobs.get(block=True, timeout=1)
- logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize())
- return job
-
- @Pyro4.expose
- def putjob(self, job):
- """Atomically add a job to the queue.
-
- Parameters
- ----------
- job : iterable of list of (int, float)
- The corpus in BoW format.
-
- """
- self._jobsreceived += 1
- self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
- logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize())
-
- @Pyro4.expose
- def getstate(self):
- """Merge projections from across all workers and get the final projection.
-
- Returns
- -------
- :class:`~gensim.models.lsimodel.Projection`
- The current projection of the total model.
-
- """
- logger.info("end of input, assigning all remaining jobs")
- logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived)
- while self._jobsdone < self._jobsreceived:
- time.sleep(0.5) # check every half a second
-
- # TODO: merge in parallel, so that we're done in `log_2(workers)` merges,
- # and not `workers - 1` merges!
- # but merging only takes place once, after all input data has been processed,
- # so the overall effect would be small... compared to the amount of coding :-)
- logger.info("merging states from %i workers", len(self.workers))
- workers = list(self.workers.items())
- result = workers[0][1].getstate()
- for workerid, worker in workers[1:]:
- logger.info("pulling state from worker %s", workerid)
- result.merge(worker.getstate())
- logger.info("sending out merged projection")
- return result
-
- @Pyro4.expose
- def reset(self):
- """Re-initialize all workers for a new decomposition."""
- for workerid, worker in iteritems(self.workers):
- logger.info("resetting worker %s", workerid)
- worker.reset()
- worker.requestjob()
- self._jobsdone = 0
- self._jobsreceived = 0
-
- @Pyro4.expose
- @Pyro4.oneway
- @utils.synchronous('lock_update')
- def jobdone(self, workerid):
- """A worker has finished its job. Log this event and then asynchronously transfer control back to the worker.
-
- Callback used by workers to notify when their job is done.
-
- The job done event is logged and then control is asynchronously transfered back to the worker
- (who can then request another job). In this way, control flow basically oscillates between
- :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lsi_worker.Worker.requestjob`.
-
- Parameters
- ----------
- workerid : int
- The ID of the worker that finished the job (used for logging).
-
- """
- self._jobsdone += 1
- logger.info("worker #%s finished job #%i", workerid, self._jobsdone)
- worker = self.workers[workerid]
- worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way)
-
- def jobsdone(self):
- """Wrap :attr:`~gensim.models.lsi_dispatcher.Dispatcher._jobsdone`, needed for remote access through proxies.
-
- Returns
- -------
- int
- Number of jobs already completed.
-
- """
- return self._jobsdone
-
- @Pyro4.oneway
- def exit(self):
- """Terminate all registered workers and then the dispatcher."""
- for workerid, worker in iteritems(self.workers):
- logger.info("terminating worker %s", workerid)
- worker.exit()
- logger.info("terminating dispatcher")
- os._exit(0) # exit the whole process (not just this thread ala sys.exit())
-
-
-if __name__ == '__main__':
- logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
- parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawTextHelpFormatter)
- parser.add_argument(
- 'maxsize', type=int, help='Maximum number of jobs to be kept pre-fetched in the queue.', default=MAX_JOBS_QUEUE
- )
- args = parser.parse_args()
-
- logger.info("running %s", " ".join(sys.argv))
- utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=args.maxsize))
- logger.info("finished running %s", parser.prog)
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (C) 2010 Radim Rehurek
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+
+"""Dispatcher process which orchestrates distributed :class:`~gensim.models.lsimodel.LsiModel` computations.
+Run this script only once, on any node in your cluster.
+
+Notes
+-----
+The dispatcher expects to find worker scripts already running. Make sure you run as many workers as you like on
+your machines **before** launching the dispatcher.
+
+
+How to use distributed LSI
+--------------------------
+
+#. Install needed dependencies (Pyro4) ::
+
+ pip install gensim[distributed]
+
+#. Setup serialization (on each machine) ::
+
+ export PYRO_SERIALIZERS_ACCEPTED=pickle
+ export PYRO_SERIALIZER=pickle
+
+#. Run nameserver ::
+
+ python -m Pyro4.naming -n 0.0.0.0 &
+
+#. Run workers (on each machine) ::
+
+ python -m gensim.models.lsi_worker &
+
+#. Run dispatcher ::
+
+ python -m gensim.models.lsi_dispatcher &
+
+#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode:
+
+ .. sourcecode:: pycon
+
+ >>> from gensim.test.utils import common_corpus, common_dictionary
+ >>> from gensim.models import LsiModel
+ >>>
+ >>> model = LsiModel(common_corpus, id2word=common_dictionary, distributed=True)
+
+Command line arguments
+----------------------
+
+.. program-output:: python -m gensim.models.lsi_dispatcher --help
+ :ellipsis: 0, -5
+
+"""
+
+import os
+import sys
+import logging
+import argparse
+import threading
+import time
+from queue import Queue
+
+import Pyro4
+
+from gensim import utils
+
+
+logger = logging.getLogger(__name__)
+
+# How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue?
+# A small number is usually enough, unless iteration over the corpus is very very
+# slow (slower than the actual computation of LSI), in which case you can override
+# this value from command line. ie. run "python ./lsi_dispatcher.py 100"
+MAX_JOBS_QUEUE = 10
+
+# timeout for the Queue object put/get blocking methods.
+# it should really be infinity, but then keyboard interrupts don't work.
+# so this is really just a hack, see http://bugs.python.org/issue1360
+HUGE_TIMEOUT = 365 * 24 * 60 * 60 # one year
+
+
+class Dispatcher:
+ """Dispatcher object that communicates and coordinates individual workers.
+
+ Warnings
+ --------
+ There should never be more than one dispatcher running at any one time.
+
+ """
+ def __init__(self, maxsize=0):
+ """Partly initialize the dispatcher.
+
+ A full initialization (including initialization of the workers) requires a call to
+ :meth:`~gensim.models.lsi_dispatcher.Dispatcher.initialize`
+
+ Parameters
+ ----------
+ maxsize : int, optional
+ Maximum number of jobs to be kept pre-fetched in the queue.
+
+ """
+ self.maxsize = maxsize
+ self.workers = {}
+ self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later)
+
+ @Pyro4.expose
+ def initialize(self, **model_params):
+ """Fully initialize the dispatcher and all its workers.
+
+ Parameters
+ ----------
+ **model_params
+ Keyword parameters used to initialize individual workers
+ (gets handed all the way down to :meth:`gensim.models.lsi_worker.Worker.initialize`).
+ See :class:`~gensim.models.lsimodel.LsiModel`.
+
+ Raises
+ ------
+ RuntimeError
+            When no workers are found (the :mod:`gensim.models.lsi_worker` script must be run beforehand).
+
+ """
+ self.jobs = Queue(maxsize=self.maxsize)
+ self.lock_update = threading.Lock()
+ self._jobsdone = 0
+ self._jobsreceived = 0
+
+ # locate all available workers and store their proxies, for subsequent RMI calls
+ self.workers = {}
+ with utils.getNS() as ns:
+ self.callback = Pyro4.Proxy('PYRONAME:gensim.lsi_dispatcher') # = self
+ for name, uri in ns.list(prefix='gensim.lsi_worker').items():
+ try:
+ worker = Pyro4.Proxy(uri)
+ workerid = len(self.workers)
+ # make time consuming methods work asynchronously
+ logger.info("registering worker #%i from %s", workerid, uri)
+ worker.initialize(workerid, dispatcher=self.callback, **model_params)
+ self.workers[workerid] = worker
+ except Pyro4.errors.PyroError:
+ logger.exception("unresponsive worker at %s, deleting it from the name server", uri)
+ ns.remove(name)
+
+ if not self.workers:
+ raise RuntimeError('no workers found; run some lsi_worker scripts on your machines first!')
+
+ @Pyro4.expose
+ def getworkers(self):
+ """Get pyro URIs of all registered workers.
+
+ Returns
+ -------
+ list of URIs
+ The pyro URIs for each worker.
+
+ """
+ return [worker._pyroUri for worker in self.workers.values()]
+
+ @Pyro4.expose
+ def getjob(self, worker_id):
+ """Atomically pop a job from the queue.
+
+ Parameters
+ ----------
+ worker_id : int
+ The worker that requested the job.
+
+ Returns
+ -------
+ iterable of iterable of (int, float)
+ A chunk of documents from the corpus, in BoW format.
+
+ """
+ logger.info("worker #%i requesting a new job", worker_id)
+ job = self.jobs.get(block=True, timeout=1)
+ logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize())
+ return job
+
+ @Pyro4.expose
+ def putjob(self, job):
+ """Atomically add a job to the queue.
+
+ Parameters
+ ----------
+ job : iterable of list of (int, float)
+ A chunk of documents from the corpus, in BoW format.
+
+ """
+ self._jobsreceived += 1
+ self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT)
+ logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize())
+
+ @Pyro4.expose
+ def getstate(self):
+ """Merge projections from across all workers and get the final projection.
+
+ Returns
+ -------
+ :class:`~gensim.models.lsimodel.Projection`
+ The current projection of the total model.
+
+ """
+ logger.info("end of input, assigning all remaining jobs")
+ logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived)
+ while self._jobsdone < self._jobsreceived:
+ time.sleep(0.5) # check every half a second
+
+ # TODO: merge in parallel, so that we're done in `log_2(workers)` merges,
+ # and not `workers - 1` merges!
+ # but merging only takes place once, after all input data has been processed,
+ # so the overall effect would be small... compared to the amount of coding :-)
+ logger.info("merging states from %i workers", len(self.workers))
+ workers = list(self.workers.items())
+ result = workers[0][1].getstate()
+ for workerid, worker in workers[1:]:
+ logger.info("pulling state from worker %s", workerid)
+ result.merge(worker.getstate())
+ logger.info("sending out merged projection")
+ return result
+
+ @Pyro4.expose
+ def reset(self):
+ """Re-initialize all workers for a new decomposition."""
+ for workerid, worker in self.workers.items():
+ logger.info("resetting worker %s", workerid)
+ worker.reset()
+ worker.requestjob()
+ self._jobsdone = 0
+ self._jobsreceived = 0
+
+ @Pyro4.expose
+ @Pyro4.oneway
+ @utils.synchronous('lock_update')
+ def jobdone(self, workerid):
+ """A worker has finished its job. Log this event and then asynchronously transfer control back to the worker.
+
+ Callback used by workers to notify when their job is done.
+
+ The job done event is logged and then control is asynchronously transferred back to the worker
+ (who can then request another job). In this way, control flow basically oscillates between
+ :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lsi_worker.Worker.requestjob`.
+
+ Parameters
+ ----------
+ workerid : int
+ The ID of the worker that finished the job (used for logging).
+
+ """
+ self._jobsdone += 1
+ logger.info("worker #%s finished job #%i", workerid, self._jobsdone)
+ worker = self.workers[workerid]
+ worker.requestjob() # tell the worker to ask for another job, asynchronously (one-way)
+
+ def jobsdone(self):
+ """Wrap :attr:`~gensim.models.lsi_dispatcher.Dispatcher._jobsdone`, needed for remote access through proxies.
+
+ Returns
+ -------
+ int
+ Number of jobs already completed.
+
+ """
+ return self._jobsdone
+
+ @Pyro4.oneway
+ def exit(self):
+ """Terminate all registered workers and then the dispatcher."""
+ for workerid, worker in self.workers.items():
+ logger.info("terminating worker %s", workerid)
+ worker.exit()
+ logger.info("terminating dispatcher")
+ os._exit(0) # exit the whole process (not just the current thread, as sys.exit() would)
+
+
+if __name__ == '__main__':
+ logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
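+ # the [:-135] slice strips the trailing "Command line arguments" reST section of the docstring from the --help text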
+ parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawTextHelpFormatter)
+ parser.add_argument(
+ 'maxsize', nargs='?', type=int, help='Maximum number of jobs to be kept pre-fetched in the queue.', default=MAX_JOBS_QUEUE
+ )
+ args = parser.parse_args()
+
+ logger.info("running %s", " ".join(sys.argv))
+ utils.pyro_daemon('gensim.lsi_dispatcher', Dispatcher(maxsize=args.maxsize))
+ logger.info("finished running %s", parser.prog)
diff --git a/gensim/models/lsimodel.py b/gensim/models/lsimodel.py
index 8c160e02ca..d40690fd70 100644
--- a/gensim/models/lsimodel.py
+++ b/gensim/models/lsimodel.py
@@ -66,8 +66,6 @@
import scipy.linalg
import scipy.sparse
from scipy.sparse import sparsetools
-from six import iterkeys
-from six.moves import range
from gensim import interfaces, matutils, utils
from gensim.models import basemodel
@@ -833,7 +831,7 @@ def print_debug(id2token, u, s, topics, num_words=10, num_neg=None):
result.setdefault(topic, []).append((udiff[topic], uvecno))
logger.debug("printing %i+%i salient words", num_words, num_neg)
- for topic in sorted(iterkeys(result)):
+ for topic in sorted(result.keys()):
weights = sorted(result[topic], key=lambda x: -abs(x[0]))
_, most = weights[0]
if u[most, topic] < 0.0: # the most significant word has a negative sign => flip sign of u[most]
diff --git a/gensim/models/poincare.py b/gensim/models/poincare.py
index 1133c52061..050aa52e9b 100644
--- a/gensim/models/poincare.py
+++ b/gensim/models/poincare.py
@@ -47,17 +47,11 @@
from numbers import Integral
import sys
import time
+from collections import defaultdict, Counter
import numpy as np
-from collections import defaultdict, Counter
from numpy import random as np_random, float32 as REAL
from scipy.stats import spearmanr
-from six import string_types
-from six.moves import zip, range
-
-from gensim import utils, matutils
-from gensim.models.keyedvectors import KeyedVectors
-
try:
from autograd import grad # Only required for optionally verifying gradients while training
from autograd import numpy as grad_np
@@ -65,6 +59,10 @@
except ImportError:
AUTOGRAD_PRESENT = False
+from gensim import utils, matutils
+from gensim.models.keyedvectors import KeyedVectors
+
+
logger = logging.getLogger(__name__)
@@ -1156,7 +1154,7 @@ def most_similar(self, node_or_vector, topn=10, restrict_vocab=None):
nodes_to_use = self.index_to_key[:restrict_vocab]
all_distances = self.distances(node_or_vector, nodes_to_use)
- if isinstance(node_or_vector, string_types + (int,)):
+ if isinstance(node_or_vector, (str, int)):
node_index = self.get_index(node_or_vector)
else:
node_index = None
@@ -1214,7 +1212,7 @@ def distances(self, node_or_vector, other_nodes=()):
If either `node_or_vector` or any node in `other_nodes` is absent from vocab.
"""
- if isinstance(node_or_vector, string_types):
+ if isinstance(node_or_vector, str):
input_vector = self.get_vector(node_or_vector)
else:
input_vector = node_or_vector
@@ -1259,7 +1257,7 @@ def norm(self, node_or_vector):
The position in hierarchy is based on the norm of the vector for the node.
"""
- if isinstance(node_or_vector, string_types):
+ if isinstance(node_or_vector, str):
input_vector = self.get_vector(node_or_vector)
else:
input_vector = node_or_vector
diff --git a/gensim/models/tfidfmodel.py b/gensim/models/tfidfmodel.py
index a363e73298..d0e3d653ef 100644
--- a/gensim/models/tfidfmodel.py
+++ b/gensim/models/tfidfmodel.py
@@ -17,11 +17,11 @@
from functools import partial
import re
+import numpy as np
+
from gensim import interfaces, matutils, utils
from gensim.utils import deprecated
-from six import iteritems, iterkeys
-import numpy as np
logger = logging.getLogger(__name__)
@@ -155,7 +155,7 @@ def precompute_idfs(wglobal, dfs, total_docs):
"""
# not strictly necessary and could be computed on the fly in TfidfModel__getitem__.
# this method is here just to speed things up a little.
- return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}
+ return {termid: wglobal(df, total_docs) for termid, df in dfs.items()}
def smartirs_wlocal(tf, local_scheme):
@@ -389,7 +389,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
self.num_docs, self.num_nnz = dictionary.num_docs, dictionary.num_nnz
self.cfs = dictionary.cfs.copy()
self.dfs = dictionary.dfs.copy()
- self.term_lens = {termid: len(term) for termid, term in iteritems(dictionary)}
+ self.term_lens = {termid: len(term) for termid, term in dictionary.items()}
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
if not id2word:
self.id2word = dictionary
@@ -415,7 +415,7 @@ def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.iden
self.pivot = 1.0 * self.num_nnz / self.num_docs
elif n_n == "b":
self.pivot = 1.0 * sum(
- self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in iterkeys(dictionary)
+ self.cfs[termid] * (self.term_lens[termid] + 1.0) for termid in dictionary.keys()
) / self.num_docs
@classmethod
diff --git a/gensim/parsing/porter.py b/gensim/parsing/porter.py
index c579f44beb..528103a874 100644
--- a/gensim/parsing/porter.py
+++ b/gensim/parsing/porter.py
@@ -30,9 +30,6 @@
"""
-from six.moves import range
-
-
class PorterStemmer(object):
"""Class contains implementation of Porter stemming algorithm.
diff --git a/gensim/scripts/benchmark.py b/gensim/scripts/benchmark.py
index c920b2b32f..d963129a9c 100644
--- a/gensim/scripts/benchmark.py
+++ b/gensim/scripts/benchmark.py
@@ -6,7 +6,7 @@
"""
Help script (template) for benchmarking. Run with:
- /usr/bin/time --format "%E elapsed\n%Mk max mem" python -m gensim.scripts.benchmark ~/gensim-data/text9/text9.txt
+ /usr/bin/time --format "%E elapsed\n%Mk peak RAM" python -m gensim.scripts.benchmark ~/gensim-data/text9/text9.txt
"""
diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py
index 328d610a0d..14ada07904 100755
--- a/gensim/similarities/docsim.py
+++ b/gensim/similarities/docsim.py
@@ -77,7 +77,6 @@
import scipy.sparse
from gensim import interfaces, utils, matutils
-from six.moves import map, range, zip
logger = logging.getLogger(__name__)
diff --git a/gensim/similarities/termsim.py b/gensim/similarities/termsim.py
index 9a83458f84..3949d77960 100644
--- a/gensim/similarities/termsim.py
+++ b/gensim/similarities/termsim.py
@@ -14,7 +14,6 @@
from math import sqrt
import numpy as np
-from six.moves import range
from scipy import sparse
from gensim.matutils import corpus2csc
@@ -22,9 +21,11 @@
logger = logging.getLogger(__name__)
-NON_NEGATIVE_NORM_ASSERTION_MESSAGE = u"sparse documents must not contain any explicit " \
- u"zero entries and the similarity matrix S must satisfy x^T * S * x >= 0 for any " \
+NON_NEGATIVE_NORM_ASSERTION_MESSAGE = (
+ u"sparse documents must not contain any explicit "
+ u"zero entries and the similarity matrix S must satisfy x^T * S * x >= 0 for any "
u"nonzero bag-of-words vector x."
+)
class TermSimilarityIndex(SaveLoad):
diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py
index 370897bfdb..9f01f9818b 100644
--- a/gensim/sklearn_api/d2vmodel.py
+++ b/gensim/sklearn_api/d2vmodel.py
@@ -19,8 +19,9 @@
>>> docvecs = model.fit_transform(common_texts) # represent `common_texts` as vectors
"""
+
import numpy as np
-from six import string_types
+
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
@@ -195,7 +196,7 @@ def transform(self, docs):
)
# The input as array of array
- if isinstance(docs[0], string_types):
+ if isinstance(docs[0], str):
docs = [docs]
vectors = [self.gensim_model.infer_vector(doc) for doc in docs]
return np.reshape(np.array(vectors), (len(docs), self.gensim_model.vector_size))
diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py
index 7acd22cfc2..c42f95274e 100644
--- a/gensim/sklearn_api/ftmodel.py
+++ b/gensim/sklearn_api/ftmodel.py
@@ -42,7 +42,7 @@
"""
import numpy as np
-import six
+
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
@@ -222,7 +222,7 @@ def transform(self, words):
)
# The input as array of array
- if isinstance(words, six.string_types):
+ if isinstance(words, str):
words = [words]
vectors = [self.gensim_model.wv[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.vector_size))
diff --git a/gensim/sklearn_api/text2bow.py b/gensim/sklearn_api/text2bow.py
index 7cae8a609b..8d982bfa9c 100644
--- a/gensim/sklearn_api/text2bow.py
+++ b/gensim/sklearn_api/text2bow.py
@@ -25,7 +25,7 @@
[[(0, 1), (1, 1), (2, 1)], [(1, 1), (2, 1), (3, 1)]]
"""
-from six import string_types
+
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
@@ -91,7 +91,7 @@ def transform(self, docs):
)
# input as python lists
- if isinstance(docs, string_types):
+ if isinstance(docs, str):
docs = [docs]
tokenized_docs = (list(self.tokenizer(doc)) for doc in docs)
return [self.gensim_model.doc2bow(doc) for doc in tokenized_docs]
diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py
index ae64b56e3e..8c0bd932a1 100644
--- a/gensim/sklearn_api/w2vmodel.py
+++ b/gensim/sklearn_api/w2vmodel.py
@@ -25,8 +25,9 @@
>>> assert wordvecs.shape == (2, 10)
"""
+
import numpy as np
-import six
+
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.exceptions import NotFittedError
@@ -173,7 +174,7 @@ def transform(self, words):
)
# The input as array of array
- if isinstance(words, six.string_types):
+ if isinstance(words, str):
words = [words]
vectors = [self.gensim_model.wv[word] for word in words]
return np.reshape(np.array(vectors), (len(words), self.vector_size))
diff --git a/gensim/test/basetmtests.py b/gensim/test/basetmtests.py
index 78587c1e56..56de810691 100644
--- a/gensim/test/basetmtests.py
+++ b/gensim/test/basetmtests.py
@@ -9,28 +9,27 @@
"""
import numpy as np
-import six
-class TestBaseTopicModel(object):
+class TestBaseTopicModel:
def test_print_topic(self):
topics = self.model.show_topics(formatted=True)
for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
- self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821
+ self.assertTrue(isinstance(topic, str))
def test_print_topics(self):
topics = self.model.print_topics()
for topic_no, topic in topics:
self.assertTrue(isinstance(topic_no, int))
- self.assertTrue(isinstance(topic, str) or isinstance(topic, unicode)) # noqa:F821
+ self.assertTrue(isinstance(topic, str))
def test_show_topic(self):
topic = self.model.show_topic(1)
for k, v in topic:
- self.assertTrue(isinstance(k, six.string_types))
+ self.assertTrue(isinstance(k, str))
self.assertTrue(isinstance(v, (np.floating, float)))
def test_show_topics(self):
@@ -40,7 +39,7 @@ def test_show_topics(self):
self.assertTrue(isinstance(topic_no, int))
self.assertTrue(isinstance(topic, list))
for k, v in topic:
- self.assertTrue(isinstance(k, six.string_types))
+ self.assertTrue(isinstance(k, str))
self.assertTrue(isinstance(v, (np.floating, float)))
def test_get_topics(self):
diff --git a/gensim/test/test_atmodel.py b/gensim/test/test_atmodel.py
index 50e6a32ea9..ccb6cce0a7 100644
--- a/gensim/test/test_atmodel.py
+++ b/gensim/test/test_atmodel.py
@@ -17,7 +17,6 @@
import numbers
from os import remove
-import six
import numpy as np
from gensim.corpora import mmcorpus, Dictionary
@@ -27,6 +26,7 @@
from gensim.test.utils import (datapath,
get_tmpfile, common_texts, common_dictionary as dictionary, common_corpus as corpus)
from gensim.matutils import jensen_shannon
+
# TODO:
# Test that computing the bound on new unseen documents works as expected (this is somewhat different
# in the author-topic model than in LDA).
@@ -413,7 +413,7 @@ def testTopTopics(self):
self.assertTrue(isinstance(score, float))
for v, k in topic:
- self.assertTrue(isinstance(k, six.string_types))
+ self.assertTrue(isinstance(k, str))
self.assertTrue(isinstance(v, float))
def testGetTopicTerms(self):
diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py
index 3b518f95c6..f1c17ac0c9 100644
--- a/gensim/test/test_ldamodel.py
+++ b/gensim/test/test_ldamodel.py
@@ -14,7 +14,6 @@
import os
import unittest
-import six
import numpy as np
from numpy.testing import assert_allclose
@@ -203,7 +202,7 @@ def testTopTopics(self):
self.assertTrue(isinstance(score, float))
for v, k in topic:
- self.assertTrue(isinstance(k, six.string_types))
+ self.assertTrue(isinstance(k, str))
self.assertTrue(np.issubdtype(v, np.floating))
def testGetTopicTerms(self):
@@ -466,7 +465,7 @@ def testRandomStateBackwardCompatibility(self):
for i in model_topics:
self.assertTrue(isinstance(i[0], int))
- self.assertTrue(isinstance(i[1], six.string_types))
+ self.assertTrue(isinstance(i[1], str))
# save back the loaded model using a post-0.13.2 version of Gensim
post_0_13_2_fname = get_tmpfile('gensim_models_lda_post_0_13_2_model.tst')
@@ -478,7 +477,7 @@ def testRandomStateBackwardCompatibility(self):
for i in model_topics_new:
self.assertTrue(isinstance(i[0], int))
- self.assertTrue(isinstance(i[1], six.string_types))
+ self.assertTrue(isinstance(i[1], str))
def testDtypeBackwardCompatibility(self):
lda_3_0_1_fname = datapath('lda_3_0_1_model')
diff --git a/gensim/test/test_ldavowpalwabbit_wrapper.py b/gensim/test/test_ldavowpalwabbit_wrapper.py
index bedcdebc63..ddec17da23 100644
--- a/gensim/test/test_ldavowpalwabbit_wrapper.py
+++ b/gensim/test/test_ldavowpalwabbit_wrapper.py
@@ -19,8 +19,6 @@
import tempfile
from collections import defaultdict
-import six
-
from gensim.corpora import Dictionary
import gensim.models.wrappers.ldavowpalwabbit as ldavowpalwabbit
@@ -154,7 +152,7 @@ def test_topic_coherence(self):
# get list of original topics that each word actually belongs to
ids = []
for word in topic_words:
- for src_topic_words, src_topic_id in six.iteritems(topic_map):
+ for src_topic_words, src_topic_id in topic_map.items():
if word in src_topic_words:
ids.append(src_topic_id)
@@ -165,7 +163,7 @@ def test_topic_coherence(self):
# if at least 6/10 words assigned to same topic, consider it coherent
max_count = 0
- for count in six.itervalues(counts):
+ for count in counts.values():
max_count = max(max_count, count)
if max_count >= 6:
diff --git a/gensim/test/test_nmf.py b/gensim/test/test_nmf.py
index 763f61360c..e0e71433cb 100644
--- a/gensim/test/test_nmf.py
+++ b/gensim/test/test_nmf.py
@@ -14,7 +14,6 @@
import logging
import numbers
import numpy as np
-import six
from gensim import matutils
from gensim.models import nmf
@@ -108,7 +107,7 @@ def testTopTopics(self):
self.assertTrue(isinstance(score, float))
for v, k in topic:
- self.assertTrue(isinstance(k, six.string_types))
+ self.assertTrue(isinstance(k, str))
self.assertTrue(np.issubdtype(v, float))
def testGetTopicTerms(self):
diff --git a/gensim/test/test_utils.py b/gensim/test/test_utils.py
index 626c0de06c..18436bf655 100644
--- a/gensim/test/test_utils.py
+++ b/gensim/test/test_utils.py
@@ -6,13 +6,11 @@
"""
Automated tests for checking various utils functions.
"""
-from __future__ import unicode_literals
import logging
import unittest
import numpy as np
-from six import iteritems
from gensim import utils
from gensim.test.utils import datapath, get_tmpfile
@@ -112,7 +110,7 @@ class TestSampleDict(unittest.TestCase):
def test_sample_dict(self):
d = {1: 2, 2: 3, 3: 4, 4: 5}
expected_dict = [(1, 2), (2, 3)]
- expected_dict_random = [(k, v) for k, v in iteritems(d)]
+ expected_dict_random = [(k, v) for k, v in d.items()]
sampled_dict = utils.sample_dict(d, 2, False)
self.assertEqual(sampled_dict, expected_dict)
sampled_dict_random = utils.sample_dict(d, 2)
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index c867f045e8..a9e57036f0 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -7,21 +7,15 @@
"""
Automated tests for checking transformation algorithms (the models package).
"""
-from __future__ import division
import logging
import unittest
import os
import bz2
import sys
-import six
import numpy as np
-from gensim import utils
-from gensim.models import word2vec, keyedvectors
-from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences, \
- LeeCorpus, lee_corpus_list
from testfixtures import log_capture
try:
@@ -30,6 +24,13 @@
except (ImportError, ValueError):
PYEMD_EXT = False
+from gensim import utils
+from gensim.models import word2vec, keyedvectors
+from gensim.test.utils import (
+ datapath, get_tmpfile, temporary_file, common_texts as sentences,
+ LeeCorpus, lee_corpus_list,
+)
+
new_sentences = [
['computer', 'artificial', 'intelligence'],
@@ -166,7 +167,6 @@ def testOnlineLearningAfterSave(self):
model_neg.train(new_sentences, total_examples=model_neg.corpus_count, epochs=model_neg.epochs)
self.assertEqual(len(model_neg.wv), 14)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testOnlineLearningFromFile(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
@@ -192,7 +192,6 @@ def testOnlineLearningFromFile(self):
self.assertEqual(len(model_hs.wv), 14)
self.assertEqual(len(model_neg.wv), 14)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testOnlineLearningAfterSaveFromFile(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""
@@ -273,7 +272,6 @@ def testPersistence(self):
self.assertTrue(np.allclose(wv.vectors, loaded_wv.vectors))
self.assertEqual(len(wv), len(loaded_wv))
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testPersistenceFromFile(self):
"""Test storing/loading the entire model trained with corpus_file argument."""
with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file:
@@ -498,7 +496,6 @@ def testTraining(self):
model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0)
self.models_equal(model, model2)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testTrainingFromFile(self):
"""Test word2vec training with corpus_file argument."""
# build vocabulary, don't train yet
@@ -574,7 +571,6 @@ def testEvaluateWordPairs(self):
self.assertTrue(0.1 < spearman < 1.0, "spearman {spearman} not between 0.1 and 1.0")
self.assertTrue(0.0 <= oov < 90.0, "OOV {oov} not between 0.0 and 90.0")
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testEvaluateWordPairsFromFile(self):
"""Test Spearman and Pearson correlation coefficients give sane results on similarity datasets"""
with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf:
@@ -621,7 +617,6 @@ def test_sg_hs(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2)
self.model_sanity(model)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def test_sg_hs_fromfile(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2)
self.model_sanity(model, with_corpus_file=True)
@@ -631,7 +626,6 @@ def test_sg_neg(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
self.model_sanity(model)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def test_sg_neg_fromfile(self):
model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2)
self.model_sanity(model, with_corpus_file=True)
@@ -666,7 +660,6 @@ def test_cbow_hs(self, ranks=None):
)
self.model_sanity(model, ranks=ranks)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def test_cbow_hs_fromfile(self):
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.1, window=2, hs=1, negative=0,
@@ -682,7 +675,6 @@ def test_cbow_neg(self, ranks=None):
)
self.model_sanity(model, ranks=ranks)
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def test_cbow_neg_fromfile(self):
model = word2vec.Word2Vec(
sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15,
@@ -1062,7 +1054,6 @@ def testLineSentenceWorksWithFilename(self):
for words in sentences:
self.assertEqual(words, utils.to_unicode(orig.readline()).split())
- @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testCythonLineSentenceWorksWithFilename(self):
"""Does CythonLineSentence work with a filename argument?"""
from gensim.models import word2vec_corpusfile
diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py
index 477a9f2bc3..fb4fda99b8 100644
--- a/gensim/topic_coherence/text_analysis.py
+++ b/gensim/topic_coherence/text_analysis.py
@@ -15,7 +15,6 @@
import numpy as np
import scipy.sparse as sps
-from six import iteritems, string_types
from gensim import utils
from gensim.models.word2vec import Word2Vec
@@ -127,7 +126,7 @@ def analyze_text(self, text, doc_num=None):
raise NotImplementedError("Base classes should implement analyze_text.")
def __getitem__(self, word_or_words):
- if isinstance(word_or_words, string_types) or not hasattr(word_or_words, '__iter__'):
+ if isinstance(word_or_words, str) or not hasattr(word_or_words, '__iter__'):
return self.get_occurrences(word_or_words)
else:
return self.get_co_occurrences(*word_or_words)
@@ -250,7 +249,7 @@ def _get_co_occurrences(self, word_id1, word_id2):
return len(s1.intersection(s2))
def index_to_dict(self):
- contiguous2id = {n: word_id for word_id, n in iteritems(self.id2contiguous)}
+ contiguous2id = {n: word_id for word_id, n in self.id2contiguous.items()}
return {contiguous2id[n]: doc_id_set for n, doc_id_set in enumerate(self._inverted_index)}
@@ -359,7 +358,7 @@ def partial_accumulate(self, texts, window_size):
self._counter.clear()
super(WordOccurrenceAccumulator, self).accumulate(texts, window_size)
- for combo, count in iteritems(self._counter):
+ for combo, count in self._counter.items():
self._co_occurrences[combo] += count
return self
diff --git a/setup.py b/setup.py
index 426e89912a..6c09eaf6fa 100644
--- a/setup.py
+++ b/setup.py
@@ -332,7 +332,6 @@ def run(self):
install_requires = [
NUMPY_STR,
'scipy >= 0.18.1',
- 'six >= 1.5.0',
'smart_open >= 1.8.1',
"dataclasses; python_version < '3.7'", # pre-py3.7 needs `dataclasses` backport for use of `dataclass` in doc2vec.py
]