remove dependency on six
piskvorky committed Oct 19, 2020
1 parent 87ad617 · commit 839b1d3
Showing 40 changed files with 432 additions and 498 deletions.
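The port is largely mechanical: each six compatibility helper is replaced by its Python 3 builtin equivalent. As an overview, the recurring substitutions look roughly like the following standalone sketch (hypothetical data and helper names, not code taken verbatim from any one file in this diff):

    # Recurring six -> Python 3 substitutions applied throughout this commit.
    counts = {"human": 2, "interface": 1}

    # six.iteritems / six.iterkeys / six.itervalues  ->  dict.items() / .keys() / .values()
    for token, freq in counts.items():
        print(token, freq)

    # six.string_types -> str, and the Py2 `unicode` type -> str
    def to_tokens(document):
        if isinstance(document, str):
            raise TypeError("expected a list of tokens, not a single string")
        return [w if isinstance(w, str) else str(w, "utf-8") for w in document]

    # six.moves.range / six.moves.zip  ->  the builtins range / zip
    idmap = dict(zip(sorted(counts.values()), range(len(counts))))

    # six.integer_types -> int (Python 3 has no separate `long` type)
    assert isinstance(len(counts), int)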
3 changes: 1 addition & 2 deletions docs/src/auto_examples/core/run_corpora_and_vector_spaces.py
@@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:

from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
@@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
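One caveat about the tutorial snippet above: the builtin open() cannot read an HTTP URL, so when running the example locally you would either download mycorpus.txt first or stream it with a library such as smart_open (which gensim already depends on). A sketch under that assumption:

    # Stream the tutorial corpus over HTTP with smart_open instead of the builtin open(),
    # which only accepts filesystem paths. Assumes the smart_open package is installed.
    from smart_open import open as smart_open_file

    from gensim import corpora

    dictionary = corpora.Dictionary(
        line.lower().split()
        for line in smart_open_file('https://radimrehurek.com/gensim/mycorpus.txt')
    )
    print(dictionary)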
3 changes: 1 addition & 2 deletions docs/src/gallery/core/run_corpora_and_vector_spaces.py
@@ -179,7 +179,6 @@ def __iter__(self):
#
# Similarly, to construct the dictionary without loading all texts into memory:

from six import iteritems
# collect statistics about all tokens
dictionary = corpora.Dictionary(line.lower().split() for line in open('https://radimrehurek.com/gensim/mycorpus.txt'))
# remove stop words and words that appear only once
@@ -188,7 +187,7 @@ def __iter__(self):
for stopword in stoplist
if stopword in dictionary.token2id
]
once_ids = [tokenid for tokenid, docfreq in iteritems(dictionary.dfs) if docfreq == 1]
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq == 1]
dictionary.filter_tokens(stop_ids + once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)
1 change: 0 additions & 1 deletion gensim/corpora/bleicorpus.py
@@ -14,7 +14,6 @@

from gensim import utils
from gensim.corpora import IndexedCorpus
from six.moves import range


logger = logging.getLogger(__name__)
80 changes: 35 additions & 45 deletions gensim/corpora/dictionary.py
@@ -6,22 +6,13 @@

"""This module implements the concept of a Dictionary -- a mapping between words and their integer ids."""

from __future__ import with_statement

from collections import defaultdict
from collections.abc import Mapping
import sys
import logging
import itertools

from gensim import utils

from six import PY3, iteritems, iterkeys, itervalues, string_types
from six.moves import zip, range

if sys.version_info[0] >= 3:
unicode = str


logger = logging.getLogger(__name__)

@@ -116,15 +107,14 @@ def __iter__(self):
"""Iterate over all tokens."""
return iter(self.keys())

if PY3:
# restore Py2-style dict API
iterkeys = __iter__
# restore Py2-style dict API
iterkeys = __iter__

def iteritems(self):
return self.items()
def iteritems(self):
return self.items()

def itervalues(self):
return self.values()
def itervalues(self):
return self.values()

def keys(self):
"""Get all stored ids.
@@ -149,7 +139,7 @@ def __len__(self):
return len(self.token2id)

def __str__(self):
some_keys = list(itertools.islice(iterkeys(self.token2id), 5))
some_keys = list(itertools.islice(self.token2id.keys(), 5))
return "Dictionary(%i unique tokens: %s%s)" % (len(self), some_keys, '...' if len(self) > 5 else '')

@staticmethod
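The dictionary.py changes above drop the if PY3: guard but keep the Py2-style iterkeys/iteritems/itervalues aliases unconditionally, so older calling code continues to work on Python 3. A minimal standalone sketch of the same aliasing pattern, using a hypothetical Mapping subclass rather than gensim's actual Dictionary:

    from collections.abc import Mapping

    class TokenMap(Mapping):
        # Hypothetical mapping that keeps Py2-style iterator aliases for old callers.
        def __init__(self, token2id):
            self._data = dict(token2id)

        def __getitem__(self, key):
            return self._data[key]

        def __len__(self):
            return len(self._data)

        def __iter__(self):
            return iter(self._data)

        # Py2-style dict API, now provided unconditionally on Python 3.
        iterkeys = __iter__

        def iteritems(self):
            return self.items()

        def itervalues(self):
            return self.values()

    token_map = TokenMap({"human": 0, "interface": 1})
    assert list(token_map.iterkeys()) == list(token_map.keys())
    assert dict(token_map.iteritems()) == dict(token_map.items())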
@@ -245,235 +235,235 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
([(2, 1)], {u'this': 1, u'is': 1})
"""
if isinstance(document, string_types):
if isinstance(document, str):
raise TypeError("doc2bow expects an array of unicode tokens on input, not a single string")

# Construct (word, frequency) mapping.
counter = defaultdict(int)
for w in document:
counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
counter[w if isinstance(w, str) else str(w, 'utf-8')] += 1

token2id = self.token2id
if allow_update or return_missing:
missing = sorted(x for x in iteritems(counter) if x[0] not in token2id)
missing = sorted(x for x in counter.items() if x[0] not in token2id)
if allow_update:
for w, _ in missing:
# new id = number of ids made so far;
# NOTE this assumes there are no gaps in the id sequence!
token2id[w] = len(token2id)
result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
result = {token2id[w]: freq for w, freq in counter.items() if w in token2id}

if allow_update:
self.num_docs += 1
self.num_pos += sum(itervalues(counter))
self.num_pos += sum(counter.values())
self.num_nnz += len(result)
# keep track of document and collection frequencies
for tokenid, freq in iteritems(result):
for tokenid, freq in result.items():
self.cfs[tokenid] = self.cfs.get(tokenid, 0) + freq
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

# return tokenids, in ascending id order
result = sorted(iteritems(result))
result = sorted(result.items())
if return_missing:
return result, dict(missing)
else:
@@ -307,10 +297,10 @@ def doc2idx(self, document, unknown_word_index=-1):
[0, 0, 2, -1, 2]
"""
if isinstance(document, string_types):
if isinstance(document, str):
raise TypeError("doc2idx expects an array of unicode tokens on input, not a single string")

document = [word if isinstance(word, unicode) else unicode(word, 'utf-8') for word in document]
document = [word if isinstance(word, str) else str(word, 'utf-8') for word in document]
return [self.token2id.get(word, unknown_word_index) for word in document]

def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=None):
@@ -361,13 +351,13 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000, keep_tokens=N
if keep_tokens:
keep_ids = {self.token2id[v] for v in keep_tokens if v in self.token2id}
good_ids = [
v for v in itervalues(self.token2id)
v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs or v in keep_ids
]
good_ids.sort(key=lambda x: self.num_docs if x in keep_ids else self.dfs.get(x, 0), reverse=True)
else:
good_ids = [
v for v in itervalues(self.token2id)
v for v in self.token2id.values()
if no_below <= self.dfs.get(v, 0) <= no_above_abs
]
good_ids.sort(key=self.dfs.get, reverse=True)
@@ -408,7 +398,7 @@ def filter_n_most_frequent(self, remove_n):
"""
# determine which tokens to keep
most_frequent_ids = (v for v in itervalues(self.token2id))
most_frequent_ids = (v for v in self.token2id.values())
most_frequent_ids = sorted(most_frequent_ids, key=self.dfs.get, reverse=True)
most_frequent_ids = most_frequent_ids[:remove_n]
# do the actual filtering, then rebuild dictionary to remove gaps in ids
@@ -452,28 +442,28 @@ def filter_tokens(self, bad_ids=None, good_ids=None):
"""
if bad_ids is not None:
bad_ids = set(bad_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid not in bad_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid not in bad_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid not in bad_ids}
self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid not in bad_ids}
self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid not in bad_ids}
self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid not in bad_ids}
if good_ids is not None:
good_ids = set(good_ids)
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if tokenid in good_ids}
self.cfs = {tokenid: freq for tokenid, freq in iteritems(self.cfs) if tokenid in good_ids}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if tokenid in good_ids}
self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if tokenid in good_ids}
self.cfs = {tokenid: freq for tokenid, freq in self.cfs.items() if tokenid in good_ids}
self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if tokenid in good_ids}
self.compactify()

def compactify(self):
"""Assign new word ids to all words, shrinking any gaps."""
logger.debug("rebuilding dictionary, shrinking gaps")

# build mapping from old id -> new id
idmap = dict(zip(sorted(itervalues(self.token2id)), range(len(self.token2id))))
idmap = dict(zip(sorted(self.token2id.values()), range(len(self.token2id))))

# reassign mappings to new ids
self.token2id = {token: idmap[tokenid] for token, tokenid in iteritems(self.token2id)}
self.token2id = {token: idmap[tokenid] for token, tokenid in self.token2id.items()}
self.id2token = {}
self.dfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.dfs)}
self.cfs = {idmap[tokenid]: freq for tokenid, freq in iteritems(self.cfs)}
self.dfs = {idmap[tokenid]: freq for tokenid, freq in self.dfs.items()}
self.cfs = {idmap[tokenid]: freq for tokenid, freq in self.cfs.items()}

def save_as_text(self, fname, sort_by_word=True):
"""Save :class:`~gensim.corpora.dictionary.Dictionary` to a text file.
@@ -527,11 +517,11 @@ def save_as_text(self, fname, sort_by_word=True):
numdocs_line = "%d\n" % self.num_docs
fout.write(utils.to_utf8(numdocs_line))
if sort_by_word:
for token, tokenid in sorted(iteritems(self.token2id)):
for token, tokenid in sorted(self.token2id.items()):
line = "%i\t%s\t%i\n" % (tokenid, token, self.dfs.get(tokenid, 0))
fout.write(utils.to_utf8(line))
else:
for tokenid, freq in sorted(iteritems(self.dfs), key=lambda item: -item[1]):
for tokenid, freq in sorted(self.dfs.items(), key=lambda item: -item[1]):
line = "%i\t%s\t%i\n" % (tokenid, self[tokenid], freq)
fout.write(utils.to_utf8(line))

@@ -573,7 +563,7 @@ def merge_with(self, other):
"""
old2new = {}
for other_id, other_token in iteritems(other):
for other_id, other_token in other.items():
if other_token in self.token2id:
new_id = self.token2id[other_token]
else:
@@ -748,11 +738,11 @@ def from_corpus(corpus, id2word=None):

if id2word is None:
# make sure length(result) == get_max_id(corpus) + 1
result.token2id = {unicode(i): i for i in range(max_id + 1)}
result.token2id = {str(i): i for i in range(max_id + 1)}
else:
# id=>word mapping given: simply copy it
result.token2id = {utils.to_unicode(token): idx for idx, token in iteritems(id2word)}
for idx in itervalues(result.token2id):
result.token2id = {utils.to_unicode(token): idx for idx, token in id2word.items()}
for idx in result.token2id.values():
# make sure all token ids have a valid `dfs` entry
result.dfs[idx] = result.dfs.get(idx, 0)

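Taken together, the dictionary.py changes leave doc2bow expressed entirely in terms of the builtin dict protocol. A compressed standalone sketch of that counting pattern, with hypothetical data rather than gensim's actual method:

    from collections import defaultdict

    token2id = {"human": 0, "interface": 1, "computer": 2}
    document = ["human", "computer", "human", "survey"]

    # Count token frequencies, decoding any bytes tokens to str first (as doc2bow does).
    counter = defaultdict(int)
    for w in document:
        counter[w if isinstance(w, str) else str(w, "utf-8")] += 1

    # Keep only tokens known to the dictionary and return (token_id, frequency) pairs
    # in ascending id order, mirroring doc2bow's return value.
    bow = sorted((token2id[w], freq) for w, freq in counter.items() if w in token2id)
    print(bow)  # [(0, 2), (2, 1)]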
17 changes: 7 additions & 10 deletions gensim/corpora/hashdictionary.py
@@ -27,14 +27,11 @@
"""

from __future__ import with_statement

import logging
import itertools
import zlib

from gensim import utils
from six import iteritems, iterkeys


logger = logging.getLogger(__name__)
@@ -252,11 +249,11 @@ def doc2bow(self, document, allow_update=False, return_missing=False):
if self.debug:
# increment document count for each unique tokenid that appeared in the document
# done here, because several words may map to the same tokenid
for tokenid in iterkeys(result):
for tokenid in result.keys():
self.dfs[tokenid] = self.dfs.get(tokenid, 0) + 1

# return tokenids, in ascending id order
result = sorted(iteritems(result))
result = sorted(result.items())
if return_missing:
return result, missing
else:
@@ -293,16 +290,16 @@ def filter_extremes(self, no_below=5, no_above=0.5, keep_n=100000):
"""
no_above_abs = int(no_above * self.num_docs) # convert fractional threshold to absolute threshold
ok = [item for item in iteritems(self.dfs_debug) if no_below <= item[1] <= no_above_abs]
ok = [item for item in self.dfs_debug.items() if no_below <= item[1] <= no_above_abs]
ok = frozenset(word for word, freq in sorted(ok, key=lambda x: -x[1])[:keep_n])

self.dfs_debug = {word: freq for word, freq in iteritems(self.dfs_debug) if word in ok}
self.token2id = {token: tokenid for token, tokenid in iteritems(self.token2id) if token in self.dfs_debug}
self.dfs_debug = {word: freq for word, freq in self.dfs_debug.items() if word in ok}
self.token2id = {token: tokenid for token, tokenid in self.token2id.items() if token in self.dfs_debug}
self.id2token = {
tokenid: {token for token in tokens if token in self.dfs_debug}
for tokenid, tokens in iteritems(self.id2token)
for tokenid, tokens in self.id2token.items()
}
self.dfs = {tokenid: freq for tokenid, freq in iteritems(self.dfs) if self.id2token.get(tokenid, False)}
self.dfs = {tokenid: freq for tokenid, freq in self.dfs.items() if self.id2token.get(tokenid, False)}

# for word->document frequency
logger.info(
3 changes: 1 addition & 2 deletions gensim/corpora/indexedcorpus.py
@@ -8,7 +8,6 @@
"""Base Indexed Corpus class."""

import logging
import six

import numpy

@@ -182,7 +181,7 @@ def __getitem__(self, docno):
raise RuntimeError("Cannot call corpus[docid] without an index")
if isinstance(docno, (slice, list, numpy.ndarray)):
return utils.SlicedCorpus(self, docno)
elif isinstance(docno, six.integer_types + (numpy.integer,)):
elif isinstance(docno, (int, numpy.integer,)):
return self.docbyoffset(self.index[docno])
# TODO: no `docbyoffset` method, should be defined in this class
else:
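The __getitem__ change above swaps six.integer_types for plain int; numpy.integer still has to be listed separately because NumPy scalar integers are not subclasses of Python's int on Python 3. A quick standalone check (hypothetical variable):

    import numpy

    docno = numpy.int64(3)
    print(isinstance(docno, int))                   # False: numpy scalars are not Python ints
    print(isinstance(docno, (int, numpy.integer)))  # True: accepted as a document index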
4 changes: 0 additions & 4 deletions gensim/corpora/lowcorpus.py
@@ -4,17 +4,13 @@
# Copyright (C) 2010 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html


"""Corpus in `GibbsLda++ format <http://gibbslda.sourceforge.net/>`_."""

from __future__ import with_statement

import logging
from collections import Counter

from gensim import utils
from gensim.corpora import IndexedCorpus
from six.moves import zip, range


logger = logging.getLogger(__name__)
3 changes: 1 addition & 2 deletions gensim/corpora/sharded_corpus.py
@@ -26,7 +26,6 @@

import numpy
import scipy.sparse as sparse
from six.moves import range

import gensim
from gensim.corpora import IndexedCorpus
@@ -290,7 +289,7 @@ def init_shards(self, output_prefix, corpus, shardsize=4096, dtype=_default_dtyp

for i, doc in enumerate(doc_chunk):
doc = dict(doc)
current_shard[i][list(doc)] = list(gensim.matutils.itervalues(doc))
current_shard[i][list(doc)] = list(doc.values())

# Handles the updating as well.
if self.sparse_serialization:
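In init_shards above, the old gensim.matutils.itervalues(doc) call becomes plain doc.values(); iterating the keys and values of the same dict keeps them aligned, so the fancy-indexing assignment is unchanged. A small standalone sketch with hypothetical shapes and data:

    import numpy

    num_terms = 5
    current_shard = numpy.zeros((1, num_terms), dtype=numpy.float64)

    doc = {0: 2.0, 3: 1.0}  # one sparse bag-of-words document: {term_id: weight}
    current_shard[0][list(doc)] = list(doc.values())

    print(current_shard)  # [[2. 0. 0. 1. 0.]]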
2 changes: 0 additions & 2 deletions gensim/corpora/ucicorpus.py
@@ -7,7 +7,6 @@

"""Corpus in `UCI format <http://archive.ics.uci.edu/ml/datasets/Bag+of+Words>`_."""

from __future__ import with_statement

import logging
from collections import defaultdict
@@ -17,7 +16,6 @@
from gensim.corpora import IndexedCorpus
from gensim.matutils import MmReader
from gensim.matutils import MmWriter
from six.moves import range


logger = logging.getLogger(__name__)
(The remaining changed files are not shown here.)
