Skip to content

Commit

Permalink
Merge pull request #2984 from RaRe-Technologies/remove_six
Browse files: browse the repository at this point in the history
[MRG] Remove dependency on `six`
  • Loading branch information
piskvorky authored Oct 19, 2020
2 parents 87ad617 + 86fe8ef commit 94a227b
Show file tree
Hide file tree
Showing 44 changed files with 476 additions and 543 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@
},
"outputs": [],
"source": [
"from six import iteritems\n# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)"
"# collect statistics about all tokens\ndictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))\n# remove stop words and words that appear only once\nstop_ids = [\n dictionary.token2id[stopword]\n for stopword in stoplist\n if stopword in dictionary.token2id\n]\nonce_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]\ndictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once\ndictionary.compactify() # remove gaps in id sequence after words that were removed\nprint(dictionary)"
]
},
{
Expand Down
3 changes: 1 addition & 2 deletions docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:

from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
Expand All @@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1 @@
e017de81683bfd2f6005a3186bfc1eb3
c239d5c523ea2b3af1f6d4c6c51e7925
75 changes: 37 additions & 38 deletions docs/src/auto_examples/core/run_corpora_and_vector_spaces.rst
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ between the questions and ids is called a dictionary:

.. code-block:: none
2020-09-30 12:28:00,819 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-30 12:28:00,820 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-09-30 12:28:00,821 : INFO : saving Dictionary object under /tmp/deerwester.dict, separately None
2020-09-30 12:28:00,822 : INFO : saved /tmp/deerwester.dict
2020-10-19 01:23:37,722 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-19 01:23:37,722 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-10-19 01:23:37,722 : INFO : saving Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) under /tmp/deerwester.dict, separately None
2020-10-19 01:23:37,723 : INFO : saved /tmp/deerwester.dict
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
Expand Down Expand Up @@ -244,11 +244,11 @@ therefore reads: in the document `"Human computer interaction"`, the words `comp

.. code-block:: none
2020-09-30 12:28:01,181 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2020-09-30 12:28:01,182 : INFO : saving sparse matrix to /tmp/deerwester.mm
2020-09-30 12:28:01,182 : INFO : PROGRESS: saving document #0
2020-09-30 12:28:01,182 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2020-09-30 12:28:01,183 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index
2020-10-19 01:23:38,012 : INFO : storing corpus in Matrix Market format to /tmp/deerwester.mm
2020-10-19 01:23:38,013 : INFO : saving sparse matrix to /tmp/deerwester.mm
2020-10-19 01:23:38,013 : INFO : PROGRESS: saving document #0
2020-10-19 01:23:38,016 : INFO : saved 9x12 matrix, density=25.926% (28/108)
2020-10-19 01:23:38,016 : INFO : saving MmCorpus index to /tmp/deerwester.mm.index
[[(0, 1), (1, 1), (2, 1)], [(0, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)], [(2, 1), (5, 1), (7, 1), (8, 1)], [(1, 1), (5, 2), (8, 1)], [(3, 1), (6, 1), (7, 1)], [(9, 1)], [(9, 1), (10, 1)], [(9, 1), (10, 1), (11, 1)], [(4, 1), (10, 1), (11, 1)]]
Expand Down Expand Up @@ -334,7 +334,7 @@ then convert the tokens via a dictionary to their ids and yield the resulting sp

.. code-block:: none
<__main__.MyCorpus object at 0x125b5a128>
<__main__.MyCorpus object at 0x117e06828>
Expand Down Expand Up @@ -383,7 +383,6 @@ Similarly, to construct the dictionary without loading all texts into memory:
.. code-block:: default
from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
Expand All @@ -392,7 +391,7 @@ Similarly, to construct the dictionary without loading all texts into memory:
for stopword in stoplist
if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
Expand All @@ -407,8 +406,8 @@ Similarly, to construct the dictionary without loading all texts into memory:

.. code-block:: none
2020-09-30 12:28:02,652 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-09-30 12:28:02,653 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)
2020-10-19 01:23:38,980 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-10-19 01:23:38,981 : INFO : built Dictionary(42 unique tokens: ['abc', 'applications', 'computer', 'for', 'human']...) from 9 documents (total 69 corpus positions)
Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...)
Expand Down Expand Up @@ -455,11 +454,11 @@ create a toy corpus of 2 documents, as a plain Python list

.. code-block:: none
2020-09-30 12:28:02,781 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
2020-09-30 12:28:02,782 : INFO : saving sparse matrix to /tmp/corpus.mm
2020-09-30 12:28:02,783 : INFO : PROGRESS: saving document #0
2020-09-30 12:28:02,783 : INFO : saved 2x2 matrix, density=25.000% (1/4)
2020-09-30 12:28:02,783 : INFO : saving MmCorpus index to /tmp/corpus.mm.index
2020-10-19 01:23:39,099 : INFO : storing corpus in Matrix Market format to /tmp/corpus.mm
2020-10-19 01:23:39,100 : INFO : saving sparse matrix to /tmp/corpus.mm
2020-10-19 01:23:39,100 : INFO : PROGRESS: saving document #0
2020-10-19 01:23:39,101 : INFO : saved 2x2 matrix, density=25.000% (1/4)
2020-10-19 01:23:39,101 : INFO : saving MmCorpus index to /tmp/corpus.mm.index
Expand Down Expand Up @@ -487,16 +486,16 @@ Other formats include `Joachim's SVMlight format <http://svmlight.joachims.org/>

.. code-block:: none
2020-09-30 12:28:02,842 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
2020-09-30 12:28:02,844 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
2020-09-30 12:28:02,844 : INFO : no word id mapping provided; initializing from corpus
2020-09-30 12:28:02,844 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-09-30 12:28:02,844 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-09-30 12:28:02,845 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
2020-09-30 12:28:02,904 : INFO : no word id mapping provided; initializing from corpus
2020-09-30 12:28:02,905 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
2020-09-30 12:28:02,906 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value
2020-09-30 12:28:02,906 : INFO : saving LowCorpus index to /tmp/corpus.low.index
2020-10-19 01:23:39,152 : INFO : converting corpus to SVMlight format: /tmp/corpus.svmlight
2020-10-19 01:23:39,153 : INFO : saving SvmLightCorpus index to /tmp/corpus.svmlight.index
2020-10-19 01:23:39,154 : INFO : no word id mapping provided; initializing from corpus
2020-10-19 01:23:39,154 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-10-19 01:23:39,154 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-10-19 01:23:39,154 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
2020-10-19 01:23:39,206 : INFO : no word id mapping provided; initializing from corpus
2020-10-19 01:23:39,207 : INFO : storing corpus in List-Of-Words format into /tmp/corpus.low
2020-10-19 01:23:39,207 : WARNING : List-of-words format can only save vectors with integer elements; 1 float entries were truncated to integer value
2020-10-19 01:23:39,207 : INFO : saving LowCorpus index to /tmp/corpus.low.index
Expand All @@ -519,9 +518,9 @@ Conversely, to load a corpus iterator from a Matrix Market file:

.. code-block:: none
2020-09-30 12:28:02,968 : INFO : loaded corpus index from /tmp/corpus.mm.index
2020-09-30 12:28:02,969 : INFO : initializing cython corpus reader from /tmp/corpus.mm
2020-09-30 12:28:02,970 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries
2020-10-19 01:23:39,260 : INFO : loaded corpus index from /tmp/corpus.mm.index
2020-10-19 01:23:39,262 : INFO : initializing cython corpus reader from /tmp/corpus.mm
2020-10-19 01:23:39,262 : INFO : accepted corpus with 2 documents, 2 features, 1 non-zero entries
Expand Down Expand Up @@ -620,10 +619,10 @@ To save the same Matrix Market document stream in Blei's LDA-C format,

.. code-block:: none
2020-09-30 12:28:03,395 : INFO : no word id mapping provided; initializing from corpus
2020-09-30 12:28:03,397 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-09-30 12:28:03,397 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-09-30 12:28:03,398 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
2020-10-19 01:23:39,634 : INFO : no word id mapping provided; initializing from corpus
2020-10-19 01:23:39,636 : INFO : storing corpus in Blei's LDA-C format into /tmp/corpus.lda-c
2020-10-19 01:23:39,636 : INFO : saving vocabulary of 2 words to /tmp/corpus.lda-c.vocab
2020-10-19 01:23:39,636 : INFO : saving BleiCorpus index to /tmp/corpus.lda-c.index
Expand Down Expand Up @@ -711,9 +710,9 @@ Optimize converting between corpora and NumPy/SciPy arrays?), see the :ref:`apir

.. rst-class:: sphx-glr-timing

**Total running time of the script:** ( 0 minutes 3.219 seconds)
**Total running time of the script:** ( 0 minutes 2.979 seconds)

**Estimated memory usage:** 10 MB
**Estimated memory usage:** 39 MB


.. _sphx_glr_download_auto_examples_core_run_corpora_and_vector_spaces.py:
Expand Down
10 changes: 5 additions & 5 deletions docs/src/auto_examples/core/sg_execution_times.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,14 @@

Computation times
=================
**00:06.698** total execution time for **auto_examples_core** files:
**00:02.979** total execution time for **auto_examples_core** files:

+--------------------------------------------------------------------------------------------------------------+-----------+---------+
| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:03.219 | 9.7 MB |
| :ref:`sphx_glr_auto_examples_core_run_corpora_and_vector_spaces.py` (``run_corpora_and_vector_spaces.py``) | 00:02.979 | 38.7 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:01.675 | 36.8 MB |
| :ref:`sphx_glr_auto_examples_core_run_core_concepts.py` (``run_core_concepts.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.970 | 7.2 MB |
| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
| :ref:`sphx_glr_auto_examples_core_run_similarity_queries.py` (``run_similarity_queries.py``) | 00:00.834 | 6.5 MB |
| :ref:`sphx_glr_auto_examples_core_run_topics_and_transformations.py` (``run_topics_and_transformations.py``) | 00:00.000 | 0.0 MB |
+--------------------------------------------------------------------------------------------------------------+-----------+---------+
3 changes: 1 addition & 2 deletions docs/src/gallery/core/run_corpora_and_vector_spaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:

from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
Expand All @@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
Expand Down
1 change: 0 additions & 1 deletion gensim/corpora/bleicorpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@

from gensim import utils
from gensim.corpora import IndexedCorpus
from six.moves import range


logger = logging.getLogger(__name__)
Expand Down
Loading

0 comments on commit 94a227b

Please sign in to comment.