diff --git a/gensim/downloader.py b/gensim/downloader.py
index 6fb362ccad..8a4395440e 100644
--- a/gensim/downloader.py
+++ b/gensim/downloader.py
@@ -50,6 +50,7 @@
By default, this subdirectory is ~/gensim-data.
"""
+
from __future__ import absolute_import
import argparse
import os
diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 5eeb4ca71a..77549b1351 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -549,7 +549,7 @@ def _dict_save(fout, model, encoding):
# prunedidx_size_=-1, -1 value denotes no prunning index (prunning is only supported in supervised mode)
fout.write(np.int64(-1))
- for word in model.wv.index2word:
+ for word in model.wv.index_to_key:
word_count = model.wv.get_vecattr(word, 'count')
fout.write(word.encode(encoding))
fout.write(_END_OF_WORD_MARKER)
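The hunk above follows the gensim 4.0 rename of `wv.index2word` to `wv.index_to_key` (the same rename appears below in word2vec2tensor.py, annoy.py and nmslib.py). A minimal sketch of the 4.x iteration pattern, assuming a trained FastText model saved at a hypothetical path:

    from gensim.models import FastText

    model = FastText.load("my_fasttext.model")        # hypothetical path; any gensim 4.x model works
    for word in model.wv.index_to_key:                # replaces the removed wv.index2word list
        count = model.wv.get_vecattr(word, 'count')   # per-key attributes live in wv, not in Vocab objects
        print(word, count)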
diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py
index 9633a2e62f..70fea79804 100644
--- a/gensim/models/coherencemodel.py
+++ b/gensim/models/coherencemodel.py
@@ -25,6 +25,7 @@
Internal functions for pipelines.
"""
+
import logging
import multiprocessing as mp
from collections import namedtuple
@@ -33,9 +34,11 @@
from gensim import interfaces, matutils
from gensim import utils
-from gensim.topic_coherence import (segmentation, probability_estimation,
- direct_confirmation_measure, indirect_confirmation_measure,
- aggregation)
+from gensim.topic_coherence import (
+ segmentation, probability_estimation,
+ direct_confirmation_measure, indirect_confirmation_measure,
+ aggregation,
+)
from gensim.topic_coherence.probability_estimation import unique_ids_from_segments
logger = logging.getLogger(__name__)
diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py
index f7d43d8e70..1fa7335da6 100644
--- a/gensim/models/doc2vec.py
+++ b/gensim/models/doc2vec.py
@@ -799,7 +799,7 @@ def load(cls, *args, **kwargs):
except AttributeError as ae:
logger.error(
"Model load error. Was model saved using code from an older Gensim Version? "
- "Try loading older model using gensim-3.8.1, then re-saving, to restore "
+ "Try loading older model using gensim-3.8.3, then re-saving, to restore "
"compatibility with current code.")
raise ae
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 42a69b8bc5..3476f7c5dc 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -827,7 +827,6 @@ def save(self, *args, **kwargs):
Load :class:`~gensim.models.fasttext.FastText` model.
"""
- kwargs['ignore'] = kwargs.get('ignore', []) + ['buckets_word', ]
super(FastText, self).save(*args, **kwargs)
@classmethod
@@ -850,25 +849,15 @@ def load(cls, *args, **kwargs):
Save :class:`~gensim.models.fasttext.FastText` model.
"""
- model = super(FastText, cls).load(*args, rethrow=True, **kwargs)
-
- if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'):
- # TODO: try trainables-location
- model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
- if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'):
- # TODO: try trainables-location
- model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
- # fixup mistakenly overdimensioned gensim-3.x lockf arrays
- if len(model.wv.vectors_vocab_lockf.shape) > 1:
- model.wv.vectors_vocab_lockf = ones(1, dtype=REAL)
- if len(model.wv.vectors_ngrams_lockf.shape) > 1:
- model.wv.vectors_ngrams_lockf = ones(1, dtype=REAL)
- if hasattr(model, 'bucket'):
- del model.bucket # should only exist in one place: the wv subcomponent
- if not hasattr(model.wv, 'buckets_word') or not model.wv.buckets_word:
- model.wv.recalc_char_ngram_buckets()
+ return super(FastText, cls).load(*args, rethrow=True, **kwargs)
- return model
+ def _load_specials(self, *args, **kwargs):
+ """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+ super(FastText, self)._load_specials(*args, **kwargs)
+ if hasattr(self, 'bucket'):
+ # should only exist in one place: the wv subcomponent
+ self.wv.bucket = self.bucket
+ del self.bucket
class FastTextVocab(utils.SaveLoad):
@@ -1202,12 +1191,49 @@ def __init__(self, vector_size, min_n, max_n, bucket):
@classmethod
def load(cls, fname_or_handle, **kwargs):
- model = super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
- if isinstance(model, FastTextKeyedVectors):
- if not hasattr(model, 'compatible_hash') or model.compatible_hash is False:
- raise TypeError("Pre-gensim-3.8.x Fasttext models with nonstandard hashing are no longer compatible."
- "Loading into gensim-3.8.3 & re-saving may create a compatible model.")
- return model
+ """Load a previously saved `FastTextKeyedVectors` model.
+
+ Parameters
+ ----------
+ fname : str
+ Path to the saved file.
+
+ Returns
+ -------
+ :class:`~gensim.models.fasttext.FastTextKeyedVectors`
+ Loaded model.
+
+ See Also
+ --------
+ :meth:`~gensim.models.fasttext.FastTextKeyedVectors.save`
+ Save :class:`~gensim.models.fasttext.FastTextKeyedVectors` model.
+
+ """
+ return super(FastTextKeyedVectors, cls).load(fname_or_handle, **kwargs)
+
+ def _load_specials(self, *args, **kwargs):
+ """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+ super(FastTextKeyedVectors, self)._load_specials(*args, **kwargs)
+ if not isinstance(self, FastTextKeyedVectors):
+ raise TypeError("Loaded object of type %s, not expected FastTextKeyedVectors" % type(self))
+ if not hasattr(self, 'compatible_hash') or self.compatible_hash is False:
+ raise TypeError(
+ "Pre-gensim-3.8.x fastText models with nonstandard hashing are no longer compatible. "
+ "Loading your old model into gensim-3.8.3 & re-saving may create a model compatible with gensim 4.x."
+ )
+ if not hasattr(self, 'vectors_vocab_lockf') and hasattr(self, 'vectors_vocab'):
+ self.vectors_vocab_lockf = ones(1, dtype=REAL)
+ if not hasattr(self, 'vectors_ngrams_lockf') and hasattr(self, 'vectors_ngrams'):
+ self.vectors_ngrams_lockf = ones(1, dtype=REAL)
+ # fixup mistakenly overdimensioned gensim-3.x lockf arrays
+ if len(self.vectors_vocab_lockf.shape) > 1:
+ self.vectors_vocab_lockf = ones(1, dtype=REAL)
+ if len(self.vectors_ngrams_lockf.shape) > 1:
+ self.vectors_ngrams_lockf = ones(1, dtype=REAL)
+ if not hasattr(self, 'buckets_word') or not self.buckets_word:
+ self.recalc_char_ngram_buckets()
+ if not hasattr(self, 'vectors') or self.vectors is None:
+ self.adjust_vectors() # recompose full-word vectors
def __contains__(self, word):
"""Check if `word` or any character ngrams in `word` are present in the vocabulary.
@@ -1255,14 +1281,15 @@ def save(self, *args, **kwargs):
Load object.
"""
- # don't bother storing the cached normalized vectors
- ignore_attrs = [
- 'buckets_word',
- 'hash2index',
- ]
- kwargs['ignore'] = kwargs.get('ignore', ignore_attrs)
super(FastTextKeyedVectors, self).save(*args, **kwargs)
+ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
+ """Arrange any special handling for the gensim.utils.SaveLoad protocol"""
+ # don't save properties that are merely calculated from others
+ ignore = set(ignore).union(['buckets_word', 'vectors', ])
+ return super(FastTextKeyedVectors, self)._save_specials(
+ fname, separately, sep_limit, ignore, pickle_protocol, compress, subname)
+
def get_vector(self, word, norm=False):
"""Get `word` representations in vector space, as a 1D numpy array.
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 0846dcb78a..5d5f01e84c 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -101,12 +101,16 @@
>>>
>>> word_vectors = api.load("glove-wiki-gigaword-100") # load pre-trained word-vectors from gensim-data
>>>
+ >>> # Check the "most similar words", using the default "cosine similarity" measure.
>>> result = word_vectors.most_similar(positive=['woman', 'king'], negative=['man'])
- >>> print("{}: {:.4f}".format(*result[0]))
+ >>> most_similar_key, similarity = result[0] # look at the first match
+ >>> print(f"{most_similar_key}: {similarity:.4f}")
queen: 0.7699
>>>
+ >>> # Use a different similarity measure: "cosmul".
>>> result = word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])
- >>> print("{}: {:.4f}".format(*result[0]))
+ >>> most_similar_key, similarity = result[0] # look at the first match
+ >>> print(f"{most_similar_key}: {similarity:.4f}")
queen: 0.8965
>>>
>>> print(word_vectors.doesnt_match("breakfast cereal dinner lunch".split()))
@@ -117,22 +121,23 @@
True
>>>
>>> result = word_vectors.similar_by_word("cat")
- >>> print("{}: {:.4f}".format(*result[0]))
+ >>> most_similar_key, similarity = result[0] # look at the first match
+ >>> print(f"{most_similar_key}: {similarity:.4f}")
dog: 0.8798
>>>
>>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split()
>>> sentence_president = 'The president greets the press in Chicago'.lower().split()
>>>
>>> similarity = word_vectors.wmdistance(sentence_obama, sentence_president)
- >>> print("{:.4f}".format(similarity))
+ >>> print(f"{similarity:.4f}")
3.4893
>>>
>>> distance = word_vectors.distance("media", "media")
- >>> print("{:.1f}".format(distance))
+ >>> print(f"{distance:.1f}")
0.0
>>>
- >>> sim = word_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
- >>> print("{:.4f}".format(sim))
+ >>> similarity = word_vectors.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant'])
+ >>> print(f"{similarity:.4f}")
0.7067
>>>
>>> vector = word_vectors['computer'] # numpy vector of a word
@@ -219,7 +224,7 @@ def _load_specials(self, *args, **kwargs):
self._upconvert_old_d2vkv()
# fixup rename/consolidation into index_to_key of older index2word, index2entity
if not hasattr(self, 'index_to_key'):
- self.index_to_key = self.__dict__.pop('index2word', self.__dict__.pop('index2word', None))
+ self.index_to_key = self.__dict__.pop('index2word', self.__dict__.pop('index2entity', None))
# fixup rename into vectors of older syn0
if not hasattr(self, 'vectors'):
self.vectors = self.__dict__.pop('syn0', None)
@@ -267,22 +272,54 @@ def allocate_vecattrs(self, attrs=None, types=None):
continue
prev_expando = self.expandos[attr]
if not np.issubdtype(t, prev_expando.dtype):
- raise TypeError("can't allocate {0} for existing {1}".format(t, prev_expando.dtype))
+ raise TypeError(f"can't allocate {t} for existing {prev_expando.dtype}")
if len(prev_expando) == target_size:
continue # no resizing necessary
prev_count = len(prev_expando)
self.expandos[attr] = np.zeros(target_size, dtype=prev_expando.dtype)
- self.expandos[attr][0:min(prev_count, target_size), ] = \
- prev_expando[0:min(prev_count, target_size), ]
+ self.expandos[attr][: min(prev_count, target_size), ] = prev_expando[: min(prev_count, target_size), ]
def set_vecattr(self, key, attr, val):
- """Set attribute associated with given key to value. TODO: param docs"""
+ """Set attribute associated with the given key to value.
+
+ Parameters
+ ----------
+
+ key : str
+ Store the attribute for this vector key.
+ attr : str
+ Name of the additional attribute to store for the given key.
+ val : object
+ Value of the additional attribute to store for the given key.
+
+ Returns
+ -------
+
+ None
+
+ """
self.allocate_vecattrs(attrs=[attr], types=[type(val)])
index = self.get_index(key)
self.expandos[attr][index] = val
def get_vecattr(self, key, attr):
- """Get attribute value associate with given key. TODO: param docs"""
+ """Get attribute value associated with given key.
+
+ Parameters
+ ----------
+
+ key : str
+ Vector key for which to fetch the attribute value.
+ attr : str
+ Name of the additional attribute to fetch for the given key.
+
+ Returns
+ -------
+
+ object
+ Value of the additional attribute fetched for the given key.
+
+ """
index = self.get_index(key)
return self.expandos[attr][index]
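The docstrings added above describe the per-key attribute store (`expandos`) that replaces the old per-word `Vocab` objects. A small sketch, assuming gensim 4.x; the key, attribute name and value are illustrative:

    import numpy as np
    from gensim.models import KeyedVectors

    kv = KeyedVectors(vector_size=3)
    kv["apple"] = np.array([0.1, 0.2, 0.3], dtype=np.float32)   # __setitem__ adds the key and its vector
    kv.set_vecattr("apple", "count", 42)                        # allocates the 'count' array on first use
    assert kv.get_vecattr("apple", "count") == 42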
@@ -351,13 +388,14 @@ def get_index(self, key, default=None):
elif default is not None:
return default
else:
- raise KeyError("Key '%s' not present" % key)
+ raise KeyError(f"Key '{key}' not present")
def get_vector(self, key, norm=False):
"""Get the key's vector, as a 1D numpy array.
Parameters
----------
+
key : str
Key for vector to return.
norm : bool, optional
@@ -365,11 +403,13 @@ def get_vector(self, key, norm=False):
Returns
-------
+
numpy.ndarray
Vector for the specified key.
Raises
------
+
KeyError
If the given key doesn't exist.
@@ -386,18 +426,30 @@ def get_vector(self, key, norm=False):
@deprecated("Use get_vector instead")
def word_vec(self, *args, **kwargs):
- """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()"""
+ """Compatibility alias for get_vector(); must exist so subclass calls reach subclass get_vector()."""
return self.get_vector(*args, **kwargs)
- def add_one(self, key, vector):
+ def add_vector(self, key, vector):
"""Add one new vector at the given key, into existing slot if available.
Warning: using this repeatedly is inefficient, requiring a full reallocation & copy,
- if this instance hasn't been preallocated to be ready fro such incremental additions.
+ if this instance hasn't been preallocated to be ready for such incremental additions.
- returns: actual index used TODO: other param docs
- """
+ Parameters
+ ----------
+
+ key: str
+ Key identifier of the added vector.
+ vector: numpy.ndarray
+ 1D numpy array with the vector values.
+
+ Returns
+ -------
+ int
+ Index of the newly added vector, so that ``self.vectors[result] == vector`` and
+ ``self.index_to_key[result] == key``.
+ """
target_index = self.next_index
if target_index >= len(self) or self.index_to_key[target_index] is not None:
# must append at end by expanding existing structures
@@ -406,7 +458,7 @@ def add_one(self, key, vector):
"Adding single vectors to a KeyedVectors which grows by one each time can be costly. "
"Consider adding in batches or preallocating to the required size.",
UserWarning)
- self.add([key], [vector])
+ self.add_vectors([key], [vector])
self.allocate_vecattrs() # grow any adjunct arrays
self.next_index = target_index + 1
else:
@@ -417,14 +469,14 @@ def add_one(self, key, vector):
self.next_index += 1
return target_index
- def add(self, keys, weights, extras=None, replace=False):
+ def add_vectors(self, keys, weights, extras=None, replace=False):
"""Append keys and their vectors in a manual way.
If some key is already in the vocabulary, the old vector is kept unless `replace` flag is True.
Parameters
----------
keys : list of (str or int)
- keys specified by string or int ids.
+ Keys specified by string or int ids.
weights: list of numpy.ndarray or numpy.ndarray
List of 1D np.array vectors or a 2D np.array of vectors.
replace: bool, optional
@@ -484,7 +536,7 @@ def __setitem__(self, keys, weights):
keys = [keys]
weights = weights.reshape(1, -1)
- self.add(keys, weights, replace=True)
+ self.add_vectors(keys, weights, replace=True)
def has_index_for(self, key):
"""Can this model return a single index for this key?
@@ -522,19 +574,29 @@ def rank(self, key1, key2):
@property
def vectors_norm(self):
- raise ValueError(
- "The vectors_norm attribute became a get_normed_vectors() method in Gensim 4.0.0. "
+ raise AttributeError(
+ "The `.vectors_norm` attribute is computed dynamically since Gensim 4.0.0. "
+ "Use `.get_normed_vectors()` instead.\n"
"See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
)
@vectors_norm.setter
def vectors_norm(self, _):
- pass # no-op; shouldn't be set
+ pass # ignored but must remain for backward serialization compatibility
def get_normed_vectors(self):
- # TODO: what's the way for users to get from a matrix index (integer) to the
- # corresponding key (string)?
- # Shouldn't we return this as a mapping (dict), or even a new KeyedVectors instance?
+ """Get all embedding vectors normalized to unit L2 length (euclidean), as a 2D numpy array.
+
+ To see which key corresponds to which vector = which array row, refer
+ to the :attr:`~gensim.models.keyedvectors.KeyedVectors.index_to_key` attribute.
+
+ Returns
+ -------
+ numpy.ndarray:
+ 2D numpy array of shape ``(number_of_keys, embedding dimensionality)``, L2-normalized
+ along the rows (key vectors).
+
+ """
self.fill_norms()
return self.vectors / self.norms[..., np.newaxis]
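As the new docstring notes, rows of the matrix returned by `get_normed_vectors()` line up with `index_to_key`. A short sketch, assuming `kv` is any loaded KeyedVectors instance and numpy is imported as `np`:

    normed = kv.get_normed_vectors()             # unit-length copies; kv.vectors itself is untouched
    for key, row in zip(kv.index_to_key[:5], normed[:5]):
        print(key, float(np.dot(row, row)))      # squared L2 norm of each row is ~1.0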
@@ -543,7 +605,7 @@ def fill_norms(self, force=False):
Ensure per-vector norms are available.
Any code which modifies vectors should ensure the accompanying norms are
- either recalculated or 'None', to trigger a full recalculation later.
+ either recalculated or 'None', to trigger a full recalculation later on-request.
"""
if self.norms is None or force:
@@ -551,27 +613,33 @@ def fill_norms(self, force=False):
@property
def index2entity(self):
- return self.index_to_key
+ raise AttributeError(
+ "The index2entity attribute has been replaced by index_to_key since Gensim 4.0.0.\n"
+ "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
+ )
@index2entity.setter
def index2entity(self, value):
- self.index_to_key = value
+ self.index_to_key = value # must remain for backward serialization compatibility
@property
def index2word(self):
- return self.index_to_key
+ raise AttributeError(
+ "The index2word attribute has been replaced by index_to_key since Gensim 4.0.0.\n"
+ "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
+ )
@index2word.setter
def index2word(self, value):
- self.index_to_key = value
+ self.index_to_key = value # must remain for backward serialization compatibility
@property
def vocab(self):
- raise NotImplementedError(
- "The .vocab dict of 'Vocab' propery objects, one per key, has been removed.\n"
- "See the KeyedVectors .key_to_index dict, .index_to_key list, and methods\n"
- ".get_vecattr(key, attr)/.set_vecattr(key, attr, new_val) for replacement\n"
- "functionality."
+ raise AttributeError(
+ "The vocab attribute was removed from KeyedVector in Gensim 4.0.0.\n"
+ "Use KeyedVector's .key_to_index dict, .index_to_key list, and methods "
+ ".get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.\n"
+ "See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4#init_sims"
)
@vocab.setter
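Taken together, the AttributeError messages above spell out the 4.0 migration path for the removed attributes. A hedged before/after sketch, assuming `kv` is a KeyedVectors instance that contains the key "apple":

    words = kv.index_to_key                      # was: kv.index2word / kv.index2entity
    idx = kv.key_to_index["apple"]               # was: kv.vocab["apple"].index
    freq = kv.get_vecattr("apple", "count")      # was: kv.vocab["apple"].count
    normed = kv.get_normed_vectors()             # was: kv.init_sims(); kv.vectors_norm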
@@ -583,7 +651,7 @@ def sort_by_descending_frequency(self):
if not len(self):
return # noop if empty
count_sorted_indexes = np.argsort(self.expandos['count'])[::-1]
- self.index_to_key = list(np.array(self.index_to_key)[count_sorted_indexes])
+ self.index_to_key = [self.index_to_key[idx] for idx in count_sorted_indexes]
self.allocate_vecattrs()
for k in self.expandos:
# Use numpy's "fancy indexing" to permutate the entire array in one step.
@@ -591,11 +659,10 @@ def sort_by_descending_frequency(self):
if len(self.vectors):
logger.warning("sorting after vectors have been allocated is expensive & error-prone")
self.vectors = self.vectors[count_sorted_indexes]
- for i, word in enumerate(self.index_to_key):
- self.key_to_index[word] = i
+ self.key_to_index = {word: i for i, word in enumerate(self.index_to_key)}
def save(self, *args, **kwargs):
- """Save KeyedVectors.
+ """Save KeyedVectors to a file.
Parameters
----------
@@ -605,13 +672,15 @@ def save(self, *args, **kwargs):
See Also
--------
:meth:`~gensim.models.keyedvectors.KeyedVectors.load`
- Load saved model.
+ Load a previously saved model.
"""
super(KeyedVectors, self).save(*args, **kwargs)
- def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None,
- restrict_vocab=None, indexer=None):
+ def most_similar(
+ self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None,
+ restrict_vocab=None, indexer=None,
+ ):
"""Find the top-N most similar keys.
Positive keys contribute positively towards the similarity, negative keys negatively.
@@ -698,8 +767,10 @@ def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip
return dists
best = matutils.argsort(dists, topn=topn + len(all_keys), reverse=True)
# ignore (don't return) keys from the input
- result = [(self.index_to_key[sim + clip_start], float(dists[sim]))
- for sim in best if (sim + clip_start) not in all_keys]
+ result = [
+ (self.index_to_key[sim + clip_start], float(dists[sim]))
+ for sim in best if (sim + clip_start) not in all_keys
+ ]
return result[:topn]
def similar_by_word(self, word, topn=10, restrict_vocab=None):
@@ -1647,11 +1718,11 @@ def _add_word_to_kv(kv, counts, word, weights, vocab_size):
if kv.has_index_for(word):
logger.warning("duplicate word '%s' in word2vec file, ignoring all but first", word)
return
- word_id = kv.add_one(word, weights)
+ word_id = kv.add_vector(word, weights)
if counts is None:
- # most common scenario: no vocab file given. just make up some bogus counts, in descending order
- # FIXME(someday): make this faking optional, include more realistic (Zipf-based) fake numbers
+ # Most common scenario: no vocab file given. Just make up some bogus counts, in descending order.
+ # TODO (someday): make this faking optional, include more realistic (Zipf-based) fake numbers.
word_count = vocab_size - word_id
elif word in counts:
# use count from the vocab file
@@ -1797,14 +1868,16 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
kv = cls(vector_size, vocab_size, dtype=datatype)
if binary:
- _word2vec_read_binary(fin, kv, counts,
- vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size)
+ _word2vec_read_binary(
+ fin, kv, counts,
+ vocab_size, vector_size, datatype, unicode_errors, binary_chunk_size,
+ )
else:
_word2vec_read_text(fin, kv, counts, vocab_size, vector_size, datatype, unicode_errors, encoding)
if kv.vectors.shape[0] != len(kv):
logger.info(
"duplicate words detected, shrinking matrix size from %i to %i",
- kv.vectors.shape[0], len(kv)
+ kv.vectors.shape[0], len(kv),
)
kv.vectors = ascontiguousarray(kv.vectors[: len(kv)])
assert (len(kv), vector_size) == kv.vectors.shape
@@ -1814,15 +1887,15 @@ def _load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8'
def load_word2vec_format(*args, **kwargs):
- """Alias for `KeyedVectors.load_word2vec_format(...)`"""
+ """Alias for :meth:`~gensim.models.keyedvectors.KeyedVectors.load_word2vec_format`."""
return KeyedVectors.load_word2vec_format(*args, **kwargs)
def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
- """Get a 'random' vector (but deterministically derived from seed_string if supplied).
+ """Get a random vector, derived deterministically from `seed_string` if supplied.
+
+ Useful for initializing KeyedVectors that will be the starting projection/input layers of _2Vec models.
- Useful for initializing KeyedVectors that will be the starting
- projection/input layers of _2Vec models.
"""
if seed_string:
once = np.random.Generator(np.random.SFC64(hashfxn(seed_string) & 0xffffffff))
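A tiny usage sketch for the reworded helper above; the vector size and seed string are arbitrary. Note that the default `hashfxn=hash` is only stable within a single Python process unless PYTHONHASHSEED is fixed:

    from gensim.models.keyedvectors import pseudorandom_weak_vector

    v1 = pseudorandom_weak_vector(100, seed_string="apple")
    v2 = pseudorandom_weak_vector(100, seed_string="apple")
    assert (v1 == v2).all()                      # same seed string -> same vector (within one process)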
diff --git a/gensim/models/phrases.py b/gensim/models/phrases.py
index d7001830ee..9460619db8 100644
--- a/gensim/models/phrases.py
+++ b/gensim/models/phrases.py
@@ -62,20 +62,14 @@
import os
import logging
from collections import defaultdict
-import functools as ft
-import itertools as it
+import functools
+import itertools
from math import log
import pickle
-import six
-
-from six import iteritems, string_types, PY2, next
+from inspect import getfullargspec as getargspec
from gensim import utils, interfaces
-if PY2:
- from inspect import getargspec
-else:
- from inspect import getfullargspec as getargspec
logger = logging.getLogger(__name__)
@@ -101,11 +95,11 @@ def _is_single(obj):
temp_iter = obj_iter
try:
peek = next(obj_iter)
- obj_iter = it.chain([peek], obj_iter)
+ obj_iter = itertools.chain([peek], obj_iter)
except StopIteration:
# An empty object is a single document
return True, obj
- if isinstance(peek, string_types):
+ if isinstance(peek, str):
# It's a document, return the iterator
return True, obj_iter
if temp_iter is obj:
@@ -116,7 +110,7 @@ def _is_single(obj):
return False, obj
-class SentenceAnalyzer(object):
+class SentenceAnalyzer:
"""Base util class for :class:`~gensim.models.phrases.Phrases` and :class:`~gensim.models.phrases.Phraser`."""
def score_item(self, worda, wordb, components, scorer):
"""Get bi-gram score statistics.
@@ -194,7 +188,7 @@ def analyze_sentence(self, sentence, threshold, common_terms, scorer):
in_between = []
else:
# release words individually
- for w in it.chain([last_uncommon], in_between):
+ for w in itertools.chain([last_uncommon], in_between):
yield (w, None)
in_between = []
last_uncommon = word
@@ -242,7 +236,7 @@ def load(cls, *args, **kwargs):
model.scoring = original_scorer
# if there is a scoring parameter, and it's a text value, load the proper scoring function
if hasattr(model, 'scoring'):
- if isinstance(model.scoring, six.string_types):
+ if isinstance(model.scoring, str):
if model.scoring == 'default':
logger.info('older version of %s loaded with "default" scoring parameter', cls.__name__)
logger.info('setting scoring method to original_scorer pluggable scoring method for compatibility')
@@ -290,7 +284,7 @@ def _sentence2token(phrase_class, sentence):
delimiter = phrase_class.delimiter
if hasattr(phrase_class, 'vocab'):
- scorer = ft.partial(
+ scorer = functools.partial(
phrase_class.scoring,
len_vocab=float(len(phrase_class.vocab)),
min_count=float(phrase_class.min_count),
@@ -311,9 +305,11 @@ def _sentence2token(phrase_class, sentence):
class Phrases(SentenceAnalyzer, PhrasesTransformation):
"""Detect phrases based on collocation counts."""
- def __init__(self, sentences=None, min_count=5, threshold=10.0,
- max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
- scoring='default', common_terms=frozenset()):
+ def __init__(
+ self, sentences=None, min_count=5, threshold=10.0,
+ max_vocab_size=40000000, delimiter=b'_', progress_per=10000,
+ scoring='default', common_terms=frozenset(),
+ ):
"""
Parameters
@@ -378,16 +374,16 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
# intentially override the value of the scoring parameter rather than set self.scoring here,
# to still run the check of scoring function parameters in the next code block
- if isinstance(scoring, six.string_types):
+ if isinstance(scoring, str):
if scoring == 'default':
scoring = original_scorer
elif scoring == 'npmi':
scoring = npmi_scorer
else:
- raise ValueError('unknown scoring method string %s specified' % (scoring))
+ raise ValueError(f'unknown scoring method string {scoring} specified')
scoring_parameters = [
- 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count'
+ 'worda_count', 'wordb_count', 'bigram_count', 'len_vocab', 'min_count', 'corpus_word_count',
]
if callable(scoring):
if all(parameter in getargspec(scoring)[0] for parameter in scoring_parameters):
@@ -407,13 +403,9 @@ def __init__(self, sentences=None, min_count=5, threshold=10.0,
# ensure picklability of custom scorer
try:
- test_pickle = pickle.dumps(self.scoring)
- load_pickle = pickle.loads(test_pickle)
+ pickle.loads(pickle.dumps(self.scoring))
except pickle.PickleError:
- raise pickle.PickleError('unable to pickle custom Phrases scoring function')
- finally:
- del(test_pickle)
- del(load_pickle)
+ raise pickle.PickleError('Custom Phrases scoring function must be pickle-able')
if sentences is not None:
self.add_vocab(sentences)
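The tightened check above still requires any custom `scoring` callable to accept the six listed parameters and to survive a pickle round trip (so it must be a module-level function, not a lambda). A hedged sketch with an illustrative PMI-style scorer; the formula is made up for the example, not gensim's default:

    from math import log
    from gensim.models.phrases import Phrases

    def plain_pmi_scorer(
            worda_count, wordb_count, bigram_count, len_vocab, min_count, corpus_word_count):
        # all six parameter names are mandatory, even if some go unused
        return log(bigram_count * corpus_word_count / (worda_count * wordb_count) + 1e-12)

    sentences = [["new", "york"], ["new", "york", "times"], ["new", "jersey"]]
    bigrams = Phrases(sentences, min_count=1, threshold=0.1, scoring=plain_pmi_scorer)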
@@ -442,7 +434,7 @@ def __str__(self):
"""Get short string representation of this phrase detector."""
return "%s<%i vocab, min_count=%s, threshold=%s, max_vocab_size=%s>" % (
self.__class__.__name__, len(self.vocab), self.min_count,
- self.threshold, self.max_vocab_size
+ self.threshold, self.max_vocab_size,
)
@staticmethod
@@ -510,7 +502,7 @@ def learn_vocab(sentences, max_vocab_size, delimiter=b'_', progress_per=10000,
if word not in common_terms:
vocab[word] += 1
if last_uncommon is not None:
- components = it.chain([last_uncommon], in_between, [word])
+ components = itertools.chain([last_uncommon], in_between, [word])
vocab[delimiter.join(components)] += 1
last_uncommon = word
in_between = []
@@ -569,7 +561,7 @@ def add_vocab(self, sentences):
if len(self.vocab) > 0:
logger.info("merging %i counts into %s", len(vocab), self)
self.min_reduce = max(self.min_reduce, min_reduce)
- for word, count in iteritems(vocab):
+ for word, count in vocab.items():
self.vocab[word] += count
if len(self.vocab) > self.max_vocab_size:
utils.prune_vocab(self.vocab, self.min_reduce)
@@ -612,11 +604,11 @@ def export_phrases(self, sentences, out_delimiter=b' ', as_tuples=False):
... pass
"""
- analyze_sentence = ft.partial(
+ analyze_sentence = functools.partial(
self.analyze_sentence,
threshold=self.threshold,
common_terms=self.common_terms,
- scorer=ft.partial(
+ scorer=functools.partial(
self.scoring,
len_vocab=float(len(self.vocab)),
min_count=float(self.min_count),
@@ -780,7 +772,7 @@ def pseudocorpus(source_vocab, sep, common_terms=frozenset()):
for i in range(1, len(unigrams)):
if unigrams[i - 1] not in common_terms:
# do not join common terms
- cterms = list(it.takewhile(lambda w: w in common_terms, unigrams[i:]))
+ cterms = list(itertools.takewhile(lambda w: w in common_terms, unigrams[i:]))
tail = unigrams[i + len(cterms):]
components = [sep.join(unigrams[:i])] + cterms
if tail:
diff --git a/gensim/models/translation_matrix.py b/gensim/models/translation_matrix.py
index 54b21416e3..528e3d6fa2 100644
--- a/gensim/models/translation_matrix.py
+++ b/gensim/models/translation_matrix.py
@@ -1,8 +1,8 @@
#!/usr/bin/env python
# encoding: utf-8
-"""Produce translation matrix to translate the word from one language to another language, using either
-standard nearest neighbour method or globally corrected neighbour retrieval method [1]_.
+"""Produce a translation matrix to translate words from one language to another, using either
+a standard nearest neighbour method or a globally corrected neighbour retrieval method [1]_.
This method can be used to augment the existing phrase tables with more candidate translations, or
filter out errors from the translation tables and known dictionaries [2]_. What's more, It also work
@@ -10,6 +10,7 @@
Examples
--------
+
How to make translation between two set of word-vectors
=======================================================
@@ -97,19 +98,18 @@
"""
import warnings
+from collections import OrderedDict
+
import numpy as np
-from collections import OrderedDict
from gensim import utils
-from six import string_types
-class Space(object):
+class Space:
"""An auxiliary class for storing the the words space."""
def __init__(self, matrix, index2word):
"""
-
Parameters
----------
matrix : iterable of numpy.ndarray
@@ -256,7 +256,7 @@ def train(self, word_pairs):
self.translation_matrix = np.linalg.lstsq(m1, m2, -1)[0]
def save(self, *args, **kwargs):
- """Save the model to file but ignoring the `source_space` and `target_space`"""
+ """Save the model to a file. Ignores (doesn't store) the `source_space` and `target_space` attributes."""
kwargs['ignore'] = kwargs.get('ignore', ['source_space', 'target_space'])
super(TranslationMatrix, self).save(*args, **kwargs)
@@ -266,12 +266,12 @@ def apply_transmat(self, words_space):
Parameters
----------
words_space : :class:`~gensim.models.translation_matrix.Space`
- Object that constructed for those words to be translate.
+ `Space` object constructed for the words to be translated.
Returns
-------
:class:`~gensim.models.translation_matrix.Space`
- Object that constructed for those mapped words.
+ `Space` object constructed for the mapped words.
"""
return Space(np.dot(words_space.mat, self.translation_matrix), words_space.index2word)
@@ -301,8 +301,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec
Ordered dict where each item is `word`: [`translated_word_1`, `translated_word_2`, ...]
"""
-
- if isinstance(source_words, string_types):
+ if isinstance(source_words, str):
# pass only one word to translate
source_words = [source_words]
@@ -329,7 +328,7 @@ def translate(self, source_words, topn=5, gc=0, sample_num=None, source_lang_vec
"When using the globally corrected neighbour retrieval method, "
"the `sample_num` parameter(i.e. the number of words sampled from source space) must be provided."
)
- lexicon = set(source_lang_vec.index2word)
+ lexicon = set(source_lang_vec.index_to_key)
addition = min(sample_num, len(lexicon) - len(source_words))
lexicon = self.random_state.choice(list(lexicon.difference(source_words)), addition)
source_space = Space.build(source_lang_vec, set(source_words).union(set(lexicon)))
diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py
index 1c042ba851..806e087c56 100755
--- a/gensim/models/word2vec.py
+++ b/gensim/models/word2vec.py
@@ -8,6 +8,7 @@
"""
Introduction
============
+
This module implements the word2vec family of algorithms, using highly optimized C routines,
data streaming and Pythonic interfaces.
@@ -21,17 +22,15 @@
There are more ways to train word vectors in Gensim than just Word2Vec.
See also :class:`~gensim.models.doc2vec.Doc2Vec`, :class:`~gensim.models.fasttext.FastText` and
-wrappers for :class:`~gensim.models.wrappers.VarEmbed` and :class:`~gensim.models.wrappers.WordRank`.
+wrappers for :class:`~gensim.models.wrappers.varembed.VarEmbed` and :class:`~gensim.models.wrappers.wordrank.WordRank`.
The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/
-and extended with additional functionality and optimizations over the years.
+and extended with additional functionality and
+`optimizations <https://rare-technologies.com/parallelizing-word2vec-in-python/>`_ over the years.
For a tutorial on Gensim word2vec, with an interactive web app trained on GoogleNews,
visit https://rare-technologies.com/word2vec-tutorial/.
-**Make sure you have a C compiler before installing Gensim, to use the optimized word2vec routines**
-(70x speedup compared to plain NumPy implementation, https://rare-technologies.com/parallelizing-word2vec-in-python/).
-
Usage examples
==============
@@ -42,17 +41,17 @@
>>> from gensim.test.utils import common_texts
>>> from gensim.models import Word2Vec
>>>
- >>> model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
+ >>> model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
>>> model.save("word2vec.model")
-The training is streamed, so ``sentences`` can be an iterable, reading input data
-from disk on-the-fly. This lets you avoid loading the entire corpus into RAM.
-However, note that because the iterable must be re-startable, `sentences` must
-not be a generator. For an example of an appropriate iterator see
-:class:`~gensim.models.word2vec.BrownCorpus`,
-:class:`~gensim.models.word2vec.Text8Corpus` or
-:class:`~gensim.models.word2vec.LineSentence`.
+**The training is streamed, so ``sentences`` can be an iterable**, reading input data
+from the disk or network on-the-fly, without loading your entire corpus into RAM.
+
+Note the ``sentences`` iterable must be *restartable* (not just a generator), to allow the algorithm
+to stream over your dataset multiple times. For some examples of streamed iterables,
+see :class:`~gensim.models.word2vec.BrownCorpus`,
+:class:`~gensim.models.word2vec.Text8Corpus` or :class:`~gensim.models.word2vec.LineSentence`.
If you save the model you can continue training it later:
@@ -62,26 +61,31 @@
>>> model.train([["hello", "world"]], total_examples=1, epochs=1)
(0, 2)
-The trained word vectors are stored in a :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `model.wv`:
+The trained word vectors are stored in a :class:`~gensim.models.keyedvectors.KeyedVectors` instance, as `model.wv`:
.. sourcecode:: pycon
- >>> vector = model.wv['computer'] # numpy vector of a word
+ >>> vector = model.wv['computer'] # get numpy vector of a word
The reason for separating the trained vectors into `KeyedVectors` is that if you don't
-need the full model state any more (don't need to continue training), the state can discarded,
-resulting in a much smaller and faster object that can be mmapped for lightning
+need the full model state any more (don't need to continue training), its state can be discarded,
+keeping just the vectors and their keys proper.
+
+This results in a much smaller and faster object that can be mmapped for lightning
fast loading and sharing the vectors in RAM between processes:
.. sourcecode:: pycon
>>> from gensim.models import KeyedVectors
>>>
- >>> path = get_tmpfile("wordvectors.kv")
+ >>> # Store just the words + their trained embeddings.
+ >>> word_vectors = model.wv
+ >>> word_vectors.save("word2vec.wordvectors")
+ >>>
+ >>> # Load back with memory-mapping = read-only, shared across processes.
+ >>> wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')
>>>
- >>> model.wv.save(path)
- >>> wv = KeyedVectors.load("model.wv", mmap='r')
- >>> vector = wv['computer'] # numpy vector of a word
+ >>> vector = wv['computer'] # Get numpy vector of a word
Gensim can also load word vectors in the "word2vec C format", as a
:class:`~gensim.models.keyedvectors.KeyedVectors` instance:
@@ -90,16 +94,18 @@
>>> from gensim.test.utils import datapath
>>>
- >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format
- >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format
+ >>> # Load a word2vec model stored in the C *text* format.
+ >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)
+ >>> # Load a word2vec model stored in the C *binary* format.
+ >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True)
It is impossible to continue training the vectors loaded from the C format because the hidden weights,
vocabulary frequencies and the binary tree are missing. To continue training, you'll need the
full :class:`~gensim.models.word2vec.Word2Vec` object state, as stored by :meth:`~gensim.models.word2vec.Word2Vec.save`,
not just the :class:`~gensim.models.keyedvectors.KeyedVectors`.
-You can perform various NLP word tasks with a trained model. Some of them
-are already built-in - you can see it in :mod:`gensim.models.keyedvectors`.
+You can perform various NLP tasks with a trained model. Some of the operations
+are already built-in - see :mod:`gensim.models.keyedvectors`.
If you're finished training a model (i.e. no more updates, only querying),
you can switch to the :class:`~gensim.models.keyedvectors.KeyedVectors` instance:
@@ -111,18 +117,65 @@
to trim unneeded model state = use much less RAM and allow fast loading and memory sharing (mmap).
-Note that there is a :mod:`gensim.models.phrases` module which lets you automatically
-detect phrases longer than one word. Using phrases, you can learn a word2vec model
-where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`:
+Embeddings with multiword ngrams
+================================
+
+There is a :mod:`gensim.models.phrases` module which lets you automatically
+detect phrases longer than one word, using collocation statistics.
+Using phrases, you can learn a word2vec model where "words" are actually multiword expressions,
+such as `new_york_times` or `financial_crisis`:
.. sourcecode:: pycon
- >>> from gensim.test.utils import common_texts
>>> from gensim.models import Phrases
>>>
+ >>> # Train a bigram detector.
>>> bigram_transformer = Phrases(common_texts)
+ >>>
+ >>> # Apply the trained MWE detector to a corpus, using the result to train a Word2vec model.
>>> model = Word2Vec(bigram_transformer[common_texts], min_count=1)
+Pretrained models
+=================
+
+Gensim comes with several already pre-trained models, in the
+`Gensim-data repository <https://github.com/RaRe-Technologies/gensim-data>`_:
+
+.. sourcecode:: pycon
+
+ >>> import gensim.downloader
+ >>> # Show all available models in gensim-data
+ >>> print(list(gensim.downloader.info()['models'].keys()))
+ ['fasttext-wiki-news-subwords-300',
+ 'conceptnet-numberbatch-17-06-300',
+ 'word2vec-ruscorpora-300',
+ 'word2vec-google-news-300',
+ 'glove-wiki-gigaword-50',
+ 'glove-wiki-gigaword-100',
+ 'glove-wiki-gigaword-200',
+ 'glove-wiki-gigaword-300',
+ 'glove-twitter-25',
+ 'glove-twitter-50',
+ 'glove-twitter-100',
+ 'glove-twitter-200',
+ '__testing_word2vec-matrix-synopsis']
+ >>>
+ >>> # Download the "glove-twitter-25" embeddings
+ >>> glove_vectors = gensim.downloader.load('glove-twitter-25')
+ >>>
+ >>> # Use the downloaded vectors as usual:
+ >>> glove_vectors.most_similar('twitter')
+ [('facebook', 0.948005199432373),
+ ('tweet', 0.9403423070907593),
+ ('fb', 0.9342358708381653),
+ ('instagram', 0.9104824066162109),
+ ('chat', 0.8964964747428894),
+ ('hashtag', 0.8885937333106995),
+ ('tweets', 0.8878158330917358),
+ ('tl', 0.8778461217880249),
+ ('link', 0.8778210878372192),
+ ('internet', 0.8753897547721863)]
+
"""
from __future__ import division # py3 "true division"
@@ -137,21 +190,15 @@
import threading
import itertools
import copy
-
-from gensim.utils import keep_vocab_item, call_on_class_only, deprecated
-from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector
-
-try:
- from queue import Queue, Empty
-except ImportError:
- from Queue import Queue, Empty
+from queue import Queue, Empty
from numpy import float32 as REAL
import numpy as np
-from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc
-from six import iteritems, itervalues, string_types
-from six.moves import range
+from gensim.utils import keep_vocab_item, call_on_class_only, deprecated
+from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector
+from gensim import utils, matutils
+
logger = logging.getLogger(__name__)
@@ -173,21 +220,27 @@
# file-based word2vec is not supported
CORPUSFILE_VERSION = -1
- def train_epoch_sg(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
- _work, _neu1, compute_loss):
+ def train_epoch_sg(
+ model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
+ _work, _neu1, compute_loss,
+ ):
raise RuntimeError("Training with corpus_file argument is not supported")
- def train_epoch_cbow(model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
- _work, _neu1, compute_loss):
+ def train_epoch_cbow(
+ model, corpus_file, offset, _cython_vocab, _cur_epoch, _expected_examples, _expected_words,
+ _work, _neu1, compute_loss,
+ ):
raise RuntimeError("Training with corpus_file argument is not supported")
class Word2Vec(utils.SaveLoad):
- def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5,
- max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
- sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
- trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
- comment=None, max_final_vocab=None):
+ def __init__(
+ self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5,
+ max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
+ sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0,
+ trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
+ comment=None, max_final_vocab=None,
+ ):
"""Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.
Once you're finished training a model (=no more updates, only querying)
@@ -375,7 +428,7 @@ def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.02
def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rule=None, callbacks=None):
if not (corpus_iterable is None) ^ (corpus_file is None):
raise ValueError("You must provide only one of corpus_iterable or corpus_file arguments.")
- if corpus_file is not None and not isinstance(corpus_file, string_types):
+ if corpus_file is not None and not isinstance(corpus_file, str):
raise TypeError("You must pass string as the corpus_file argument.")
elif isinstance(corpus_iterable, GeneratorType):
raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.")
@@ -386,8 +439,10 @@ def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rul
total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha,
end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks)
- def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000,
- keep_raw_vocab=False, trim_rule=None, **kwargs):
+ def build_vocab(
+ self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000,
+ keep_raw_vocab=False, trim_rule=None, **kwargs,
+ ):
"""Build vocabulary from a sequence of sentences (can be a once-only generator stream).
Parameters
@@ -433,7 +488,9 @@ def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, prog
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.prepare_weights(update=update)
- def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
+ def build_vocab_from_freq(
+ self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False,
+ ):
"""Build vocabulary from a dictionary of word frequencies.
Parameters
@@ -468,7 +525,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
raw_vocab = word_freq
logger.info(
"collected %i different raw word, with total frequency of %i",
- len(raw_vocab), sum(itervalues(raw_vocab))
+ len(raw_vocab), sum(raw_vocab.values()),
)
# Since no sentences are provided, this is to control the corpus_count.
@@ -488,11 +545,11 @@ def _scan_vocab(self, sentences, progress_per, trim_rule):
checked_string_types = 0
for sentence_no, sentence in enumerate(sentences):
if not checked_string_types:
- if isinstance(sentence, string_types):
+ if isinstance(sentence, str):
logger.warning(
"Each 'sentences' item should be a list of words (usually unicode strings). "
"First item here is instead plain %s.",
- type(sentence)
+ type(sentence),
)
checked_string_types += 1
if sentence_no % progress_per == 0:
@@ -528,7 +585,8 @@ def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000,
def prepare_vocab(
self, update=False, keep_raw_vocab=False, trim_rule=None,
- min_count=None, sample=None, dry_run=False):
+ min_count=None, sample=None, dry_run=False,
+ ):
"""Apply vocabulary settings for `min_count` (discarding less-frequent words)
and `sample` (controlling the downsampling of more-frequent words).
@@ -574,7 +632,7 @@ def prepare_vocab(
self.sample = sample
self.wv.key_to_index = {}
- for word, v in iteritems(self.raw_vocab):
+ for word, v in self.raw_vocab.items():
if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule):
retain_words.append(word)
retain_total += v
@@ -604,7 +662,7 @@ def prepare_vocab(
logger.info("Updating model with new vocabulary")
new_total = pre_exist_total = 0
new_words = pre_exist_words = []
- for word, v in iteritems(self.raw_vocab):
+ for word, v in self.raw_vocab.items():
if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule):
if self.wv.has_index_for(word):
pre_exist_words.append(word)
@@ -836,8 +894,10 @@ def init_sims(self, replace=False):
"""
self.wv.init_sims(replace=replace)
- def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
- total_examples=None, total_words=None, **kwargs):
+ def _do_train_epoch(
+ self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch,
+ total_examples=None, total_words=None, **kwargs,
+ ):
work, neu1 = thread_private_mem
if self.sg:
@@ -879,10 +939,12 @@ def _clear_post_train(self):
"""Clear any cached vector lengths from the model."""
self.wv.norms = None
- def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
- epochs=None, start_alpha=None, end_alpha=None, word_count=0,
- queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
- **kwargs):
+ def train(
+ self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None,
+ epochs=None, start_alpha=None, end_alpha=None, word_count=0,
+ queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(),
+ **kwargs,
+ ):
"""Update the model's neural weights from a sequence of sentences.
Notes
@@ -897,7 +959,7 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot
--------
To avoid common mistakes around the model's ability to do multiple training passes itself, an
explicit `epochs` argument **MUST** be provided. In the common and recommended case
- where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.iter`.
+ where :meth:`~gensim.models.word2vec.Word2Vec.train` is only called once, you can set `epochs=self.epochs`.
Parameters
----------
@@ -950,7 +1012,7 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot
>>>
>>> model = Word2Vec(min_count=1)
>>> model.build_vocab(sentences) # prepare the model vocabulary
- >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) # train word vectors
+ >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) # train word vectors
(1, 30)
"""
@@ -1006,8 +1068,10 @@ def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, tot
callback.on_train_end(self)
return trained_word_count, raw_word_count
- def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0,
- total_examples=None, total_words=None, **kwargs):
+ def _worker_loop_corpusfile(
+ self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0,
+ total_examples=None, total_words=None, **kwargs,
+ ):
"""Train the model on a `corpus_file` in LineSentence format.
This function will be called in parallel by multiple workers (threads or processes) to make
@@ -1153,8 +1217,10 @@ def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=No
job_queue.put(None)
logger.debug("job loop exiting, total %i jobs", job_no)
- def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None,
- total_words=None, report_delay=1.0, is_corpus_file_mode=None):
+ def _log_epoch_progress(
+ self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None,
+ total_words=None, report_delay=1.0, is_corpus_file_mode=None,
+ ):
"""Get the progress report for a single training epoch.
Parameters
@@ -1226,7 +1292,8 @@ def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0,
return trained_word_count, raw_word_count, job_tally
def _train_epoch_corpusfile(
- self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs):
+ self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs,
+ ):
"""Train the model for a single epoch.
Parameters
@@ -1289,8 +1356,10 @@ def _train_epoch_corpusfile(
return trained_word_count, raw_word_count, job_tally
- def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None,
- queue_factor=2, report_delay=1.0, callbacks=()):
+ def _train_epoch(
+ self, data_iterable, cur_epoch=0, total_examples=None, total_words=None,
+ queue_factor=2, report_delay=1.0, callbacks=(),
+ ):
"""Train the model for a single epoch.
Parameters
@@ -1455,8 +1524,10 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
self.hs, self.sample, self.negative, self.window
)
- def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
- raw_word_count, total_words, trained_word_count, elapsed):
+ def _log_progress(
+ self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
+ raw_word_count, total_words, trained_word_count, elapsed
+ ):
"""Callback used to log progress for long running jobs.
Parameters
@@ -1506,8 +1577,10 @@ def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, tot
-1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue)
)
- def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
- trained_word_count, elapsed, is_corpus_file_mode):
+ def _log_epoch_end(
+ self, cur_epoch, example_count, total_examples, raw_word_count, total_words,
+ trained_word_count, elapsed, is_corpus_file_mode
+ ):
"""Callback used to log the end of a training epoch.
Parameters
@@ -1801,20 +1874,14 @@ def save(self, *args, **kwargs):
Path to the file.
"""
- # don't bother storing recalculable table
- kwargs['ignore'] = kwargs.get('ignore', []) + ['cum_table', ]
super(Word2Vec, self).save(*args, **kwargs)
- def get_latest_training_loss(self):
- """Get current value of the training loss.
-
- Returns
- -------
- float
- Current training loss.
-
- """
- return self.running_training_loss
+ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname):
+ """Arrange any special handling for the `gensim.utils.SaveLoad` protocol."""
+ # don't save properties that are merely calculated from others
+ ignore = set(ignore).union(['cum_table', ])
+ return super(Word2Vec, self)._save_specials(
+ fname, separately, sep_limit, ignore, pickle_protocol, compress, subname)
@classmethod
def load(cls, *args, rethrow=False, **kwargs):
@@ -1841,49 +1908,65 @@ def load(cls, *args, rethrow=False, **kwargs):
if not isinstance(model, Word2Vec):
rethrow = True
raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls)))
- # for backward compatibility
- if not hasattr(model, 'ns_exponent'):
- model.ns_exponent = 0.75
- if model.negative and hasattr(model.wv, 'index2word'):
- model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ???
- if not hasattr(model, 'corpus_count'):
- model.corpus_count = None
- if not hasattr(model, 'corpus_total_words'):
- model.corpus_total_words = None
- if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
- model.wv.vectors_lockf = getattr(model, 'vectors_lockf', np.ones(1, dtype=REAL))
- if not hasattr(model, 'random'):
- model.random = np.random.RandomState(model.seed)
- if not hasattr(model, 'train_count'):
- model.train_count = 0
- model.total_train_time = 0
- if not hasattr(model, 'epochs'):
- model.epochs = model.iter
- del model.iter
- if not hasattr(model, 'max_final_vocab'):
- model.max_final_vocab = None
- if hasattr(model, 'vocabulary'): # re-integrate state that had been moved
- for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'):
- setattr(model, a, getattr(model.vocabulary, a))
- del model.vocabulary
- if hasattr(model, 'trainables'): # re-integrate state that had been moved
- for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'):
- if hasattr(model.trainables, a):
- setattr(model, a, getattr(model.trainables, a))
- if hasattr(model, 'syn1'):
- model.syn1 = model.syn1
- del model.syn1
- del model.trainables
return model
except AttributeError as ae:
if rethrow:
raise ae
logger.error(
"Model load error. Was model saved using code from an older Gensim Version? "
- "Try loading older model using gensim-3.8.1, then re-saving, to restore "
+ "Try loading older model using gensim-3.8.3, then re-saving, to restore "
"compatibility with current code.")
raise ae
+ def _load_specials(self, *args, **kwargs):
+ """Handle special requirements of `.load()` protocol, usually up-converting older versions."""
+ super(Word2Vec, self)._load_specials(*args, **kwargs)
+ # for backward compatibility, add/rearrange properties from prior versions
+ if not hasattr(self, 'ns_exponent'):
+ self.ns_exponent = 0.75
+ if self.negative and hasattr(self.wv, 'index_to_key'):
+ self.make_cum_table() # rebuild cum_table from vocabulary
+ if not hasattr(self, 'corpus_count'):
+ self.corpus_count = None
+ if not hasattr(self, 'corpus_total_words'):
+ self.corpus_total_words = None
+ if not hasattr(self.wv, 'vectors_lockf') and hasattr(self.wv, 'vectors'):
+ self.wv.vectors_lockf = np.ones(1, dtype=REAL)
+ if not hasattr(self, 'random'):
+ # use new instance of numpy's recommended generator/algorithm
+ self.random = np.random.default_rng(seed=self.seed)
+ if not hasattr(self, 'train_count'):
+ self.train_count = 0
+ self.total_train_time = 0
+ if not hasattr(self, 'epochs'):
+ self.epochs = self.iter
+ del self.iter
+ if not hasattr(self, 'max_final_vocab'):
+ self.max_final_vocab = None
+ if hasattr(self, 'vocabulary'): # re-integrate state that had been moved
+ for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'):
+ setattr(self, a, getattr(self.vocabulary, a))
+ del self.vocabulary
+ if hasattr(self, 'trainables'): # re-integrate state that had been moved
+ for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'):
+ if hasattr(self.trainables, a):
+ setattr(self, a, getattr(self.trainables, a))
+ if hasattr(self, 'syn1'):
+ self.syn1 = self.syn1
+ del self.syn1
+ del self.trainables
+
+ def get_latest_training_loss(self):
+ """Get current value of the training loss.
+
+ Returns
+ -------
+ float
+ Current training loss.
+
+ """
+ return self.running_training_loss
+
class BrownCorpus(object):
def __init__(self, dirname):
@@ -2043,12 +2126,12 @@ def __iter__(self):
class Word2VecVocab(utils.SaveLoad):
- """Obsolete class retained for now as load-compatibility state capture"""
+ """Obsolete class retained for now as load-compatibility state capture."""
pass
class Word2VecTrainables(utils.SaveLoad):
- """Obsolete class retained for now as load-compatibility state capture"""
+ """Obsolete class retained for now as load-compatibility state capture."""
pass
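
A minimal migration sketch for the load-compatibility path added above (file names are illustrative assumptions, not part of this patch): loading an older model runs `Word2Vec._load_specials()`, which back-fills attributes such as `ns_exponent`, `epochs` and `vectors_lockf` and folds the obsolete `vocabulary`/`trainables` sub-objects back into the model; re-saving then writes the current layout, with `cum_table` dropped by `_save_specials()` and rebuilt from the vocabulary on the next load.

from gensim.models import Word2Vec

# Hypothetical paths, for illustration only.
old_path = "old_gensim_model.model"
new_path = "migrated_model.model"

# load() routes through _load_specials(), up-converting older attribute layouts
model = Word2Vec.load(old_path)

# save() routes through _save_specials(), which skips the derived cum_table;
# it is recomputed from the vocabulary when the model is loaded again
model.save(new_path)
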
diff --git a/gensim/scripts/word2vec2tensor.py b/gensim/scripts/word2vec2tensor.py
index 8495cb9862..3e79688490 100644
--- a/gensim/scripts/word2vec2tensor.py
+++ b/gensim/scripts/word2vec2tensor.py
@@ -70,7 +70,7 @@ def word2vec2tensor(word2vec_model_path, tensor_filename, binary=False):
outfiletsvmeta = tensor_filename + '_metadata.tsv'
with utils.open(outfiletsv, 'wb') as file_vector, utils.open(outfiletsvmeta, 'wb') as file_metadata:
- for word in model.index2word:
+ for word in model.index_to_key:
file_metadata.write(gensim.utils.to_utf8(word) + gensim.utils.to_utf8('\n'))
vector_row = '\t'.join(str(x) for x in model[word])
file_vector.write(gensim.utils.to_utf8(vector_row) + gensim.utils.to_utf8('\n'))
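
The export loop now walks `index_to_key` rather than the removed `index2word`. A minimal standalone sketch of the same TSV export pattern (input and output file names are assumptions for illustration):

from gensim.models import KeyedVectors

kv = KeyedVectors.load_word2vec_format("vectors.bin", binary=True)  # assumed input file

with open("tensor.tsv", "w", encoding="utf-8") as vec_file, \
        open("metadata.tsv", "w", encoding="utf-8") as meta_file:
    for word in kv.index_to_key:
        meta_file.write(word + "\n")
        vec_file.write("\t".join(str(x) for x in kv[word]) + "\n")
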
diff --git a/gensim/similarities/annoy.py b/gensim/similarities/annoy.py
index 9f8b8fdbc0..57808f1b3b 100644
--- a/gensim/similarities/annoy.py
+++ b/gensim/similarities/annoy.py
@@ -151,7 +151,7 @@ def load(self, fname):
def build_from_word2vec(self):
"""Build an Annoy index using word vectors from a Word2Vec model."""
return self._build_from_model(
- self.model.wv.get_normed_vectors(), self.model.wv.index2word, self.model.vector_size,
+ self.model.wv.get_normed_vectors(), self.model.wv.index_to_key, self.model.vector_size,
)
def build_from_doc2vec(self):
@@ -163,7 +163,7 @@ def build_from_doc2vec(self):
def build_from_keyedvectors(self):
"""Build an Annoy index using word vectors from a KeyedVectors model."""
return self._build_from_model(
- self.model.get_normed_vectors(), self.model.index2word, self.model.vector_size,
+ self.model.get_normed_vectors(), self.model.index_to_key, self.model.vector_size,
)
def _build_from_model(self, vectors, labels, num_features):
diff --git a/gensim/similarities/nmslib.py b/gensim/similarities/nmslib.py
index b70a9f4e43..7ff78539c1 100644
--- a/gensim/similarities/nmslib.py
+++ b/gensim/similarities/nmslib.py
@@ -187,7 +187,7 @@ def load(cls, fname):
def _build_from_word2vec(self):
"""Build an NMSLIB index using word vectors from a Word2Vec model."""
- self._build_from_model(self.model.wv.get_normed_vectors(), self.model.wv.index2word)
+ self._build_from_model(self.model.wv.get_normed_vectors(), self.model.wv.index_to_key)
def _build_from_doc2vec(self):
"""Build an NMSLIB index using document vectors from a Doc2Vec model."""
@@ -197,7 +197,7 @@ def _build_from_doc2vec(self):
def _build_from_keyedvectors(self):
"""Build an NMSLIB index using word vectors from a KeyedVectors model."""
- self._build_from_model(self.model.get_normed_vectors(), self.model.index2word)
+ self._build_from_model(self.model.get_normed_vectors(), self.model.index_to_key)
def _build_from_model(self, vectors, labels):
index = nmslib.init(method='hnsw', space='cosinesimil')
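
Both indexers now take their labels from `index_to_key` and their vectors from `get_normed_vectors()`. A minimal sketch of the same build pattern against the `annoy` library directly (the `kv` argument and `num_trees` value are assumptions for illustration):

from annoy import AnnoyIndex

def build_annoy_index(kv, num_trees=50):
    # kv is a gensim KeyedVectors instance; row i of the index corresponds to kv.index_to_key[i]
    index = AnnoyIndex(kv.vector_size, 'angular')
    for i, vector in enumerate(kv.get_normed_vectors()):
        index.add_item(i, vector)
    index.build(num_trees)
    return index, list(kv.index_to_key)
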
diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py
index aa958b744d..e402b1355a 100644
--- a/gensim/test/test_doc2vec.py
+++ b/gensim/test/test_doc2vec.py
@@ -109,7 +109,7 @@ def obsolete_testLoadOldModel(self):
model = doc2vec.Doc2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (3955, 100))
self.assertTrue(len(model.wv) == 3955)
- self.assertTrue(len(model.wv.index2word) == 3955)
+ self.assertTrue(len(model.wv.index_to_key) == 3955)
self.assertIsNone(model.corpus_total_words)
self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size))
self.assertTrue(model.wv.vectors_lockf.shape == (3955, ))
@@ -129,7 +129,7 @@ def obsolete_testLoadOldModelSeparates(self):
model = doc2vec.Doc2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (3955, 100))
self.assertTrue(len(model.wv) == 3955)
- self.assertTrue(len(model.wv.index2word) == 3955)
+ self.assertTrue(len(model.wv.index_to_key) == 3955)
self.assertIsNone(model.corpus_total_words)
self.assertTrue(model.syn1neg.shape == (len(model.wv), model.vector_size))
self.assertTrue(model.wv.vectors_lockf.shape == (3955, ))
diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py
index b998ffe308..fd96f9f26f 100644
--- a/gensim/test/test_keyedvectors.py
+++ b/gensim/test/test_keyedvectors.py
@@ -23,8 +23,7 @@
class TestKeyedVectors(unittest.TestCase):
def setUp(self):
- self.vectors = KeyedVectors.load_word2vec_format(
- datapath('euclidean_vectors.bin'), binary=True)
+ self.vectors = KeyedVectors.load_word2vec_format(datapath('euclidean_vectors.bin'), binary=True)
self.model_path = datapath("w2v_keyedvectors_load_test.modeldata")
self.vocab_path = datapath("w2v_keyedvectors_load_test.vocab")
@@ -61,12 +60,9 @@ def test_relative_cosine_similarity(self):
'respectable', 'beneficial', 'just', 'upright', 'adept', 'expert', 'practiced', 'proficient',
'skillful', 'skilful', 'dear', 'near', 'dependable', 'safe', 'secure', 'right', 'ripe', 'well',
'effective', 'in_effect', 'in_force', 'serious', 'sound', 'salutary', 'honest', 'undecomposed',
- 'unspoiled', 'unspoilt', 'thoroughly', 'soundly'
+ 'unspoiled', 'unspoilt', 'thoroughly', 'soundly',
] # synonyms for "good" as per wordnet
- cos_sim = []
- for i in range(len(wordnet_syn)):
- if wordnet_syn[i] in self.vectors:
- cos_sim.append(self.vectors.similarity("good", wordnet_syn[i]))
+ cos_sim = [self.vectors.similarity("good", syn) for syn in wordnet_syn if syn in self.vectors]
cos_sim = sorted(cos_sim, reverse=True) # cosine_similarity of "good" with wordnet_syn in decreasing order
# computing relative_cosine_similarity of two similar words
rcs_wordnet = self.vectors.similarity("good", "nice") / sum(cos_sim[i] for i in range(10))
@@ -84,7 +80,7 @@ def test_most_similar_raises_keyerror(self):
def test_most_similar_restrict_vocab(self):
"""Test most_similar returns handles restrict_vocab correctly."""
- expected = set(self.vectors.index2word[:5])
+ expected = set(self.vectors.index_to_key[:5])
predicted = set(result[0] for result in self.vectors.most_similar('war', topn=5, restrict_vocab=5))
self.assertEqual(expected, predicted)
@@ -113,7 +109,7 @@ def test_similar_by_word(self):
'administration',
'terrorism',
'call',
- 'israel'
+ 'israel',
]
predicted = [result[0] for result in self.vectors.similar_by_word('war', topn=5)]
self.assertEqual(expected, predicted)
@@ -154,12 +150,12 @@ def test_rank(self):
def test_add_single(self):
"""Test that adding entity in a manual way works correctly."""
- entities = ['___some_entity{}_not_present_in_keyed_vectors___'.format(i) for i in range(5)]
+ entities = [f'___some_entity{i}_not_present_in_keyed_vectors___' for i in range(5)]
vectors = [np.random.randn(self.vectors.vector_size) for _ in range(5)]
# Test `add` on already filled kv.
for ent, vector in zip(entities, vectors):
- self.vectors.add(ent, vector)
+ self.vectors.add_vectors(ent, vector)
for ent, vector in zip(entities, vectors):
self.assertTrue(np.allclose(self.vectors[ent], vector))
@@ -167,7 +163,7 @@ def test_add_single(self):
# Test `add` on empty kv.
kv = KeyedVectors(self.vectors.vector_size)
for ent, vector in zip(entities, vectors):
- kv.add(ent, vector)
+ kv.add_vectors(ent, vector)
for ent, vector in zip(entities, vectors):
self.assertTrue(np.allclose(kv[ent], vector))
@@ -179,7 +175,7 @@ def test_add_multiple(self):
# Test `add` on already filled kv.
vocab_size = len(self.vectors)
- self.vectors.add(entities, vectors, replace=False)
+ self.vectors.add_vectors(entities, vectors, replace=False)
self.assertEqual(vocab_size + len(entities), len(self.vectors))
for ent, vector in zip(entities, vectors):
@@ -198,7 +194,7 @@ def test_add_type(self):
assert kv.vectors.dtype == REAL
words, vectors = ["a"], np.array([1., 1.], dtype=np.float64).reshape(1, -1)
- kv.add(words, vectors)
+ kv.add_vectors(words, vectors)
assert kv.vectors.dtype == REAL
@@ -270,7 +266,7 @@ def test_save_reload(self):
count = 20
keys = [str(i) for i in range(count)]
weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
- randkv.add(keys, weights)
+ randkv.add_vectors(keys, weights)
tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
randkv.save_word2vec_format(tmpfiletxt, binary=False)
reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False)
@@ -287,7 +283,7 @@ def test_no_header(self):
count = 20
keys = [str(i) for i in range(count)]
weights = [pseudorandom_weak_vector(randkv.vector_size) for _ in range(count)]
- randkv.add(keys, weights)
+ randkv.add_vectors(keys, weights)
tmpfiletxt = gensim.test.utils.get_tmpfile("tmp_kv.txt")
randkv.save_word2vec_format(tmpfiletxt, binary=False, write_header=False)
reloadtxtkv = KeyedVectors.load_word2vec_format(tmpfiletxt, binary=False, no_header=True)
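
These tests follow the rename of `KeyedVectors.add()` to `add_vectors()`. A minimal sketch of the renamed call, covering the single-pair and parallel-list forms exercised above (key names and dimensions are arbitrary):

import numpy as np
from gensim.models import KeyedVectors

kv = KeyedVectors(vector_size=8)

# single key with a single vector
kv.add_vectors("apple", np.random.rand(8).astype(np.float32))

# parallel lists of keys and vectors
keys = [f"word_{i}" for i in range(3)]
vectors = [np.random.rand(8).astype(np.float32) for _ in keys]
kv.add_vectors(keys, vectors)
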
diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py
index 67b2668e02..98970525a2 100644
--- a/gensim/test/test_poincare.py
+++ b/gensim/test/test_poincare.py
@@ -278,7 +278,7 @@ def test_most_similar_raises_keyerror(self):
def test_most_similar_restrict_vocab(self):
"""Test most_similar returns handles restrict_vocab correctly."""
- expected = set(self.vectors.index2word[:5])
+ expected = set(self.vectors.index_to_key[:5])
predicted = set(result[0] for result in self.vectors.most_similar('dog.n.01', topn=5, restrict_vocab=5))
self.assertEqual(expected, predicted)
diff --git a/gensim/test/test_sharded_corpus.py b/gensim/test/test_sharded_corpus.py
index 3a56f240e2..14eea34f88 100644
--- a/gensim/test/test_sharded_corpus.py
+++ b/gensim/test/test_sharded_corpus.py
@@ -1,19 +1,17 @@
"""
-Testing the test sharded corpus.
+Tests for ShardedCorpus.
"""
-import os
+import os
import unittest
-
import random
-import numpy as np
import shutil
+import numpy as np
from scipy import sparse
-from gensim.utils import is_corpus
+from gensim.utils import is_corpus, mock_data
from gensim.corpora.sharded_corpus import ShardedCorpus
-from gensim.utils import mock_data, range
#############################################################################
diff --git a/gensim/test/test_similarities.py b/gensim/test/test_similarities.py
index 9c91e8926d..6a0321fdbe 100644
--- a/gensim/test/test_similarities.py
+++ b/gensim/test/test_similarities.py
@@ -591,7 +591,7 @@ def testLoadMissingRaisesError(self):
def assertVectorIsSimilarToItself(self, wv, index):
vector = wv.get_normed_vectors()[0]
- label = wv.index2word[0]
+ label = wv.index_to_key[0]
approx_neighbors = index.most_similar(vector, 1)
word, similarity = approx_neighbors[0]
@@ -748,7 +748,7 @@ def test_load_missing_raises_error(self):
def assertVectorIsSimilarToItself(self, wv, index):
vector = wv.get_normed_vectors()[0]
- label = wv.index2word[0]
+ label = wv.index_to_key[0]
approx_neighbors = index.most_similar(vector, 1)
word, similarity = approx_neighbors[0]
diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py
index 8846dc617d..578be26941 100644
--- a/gensim/test/test_translation_matrix.py
+++ b/gensim/test/test_translation_matrix.py
@@ -19,10 +19,11 @@ def setUp(self):
self.source_word_vec_file = datapath("EN.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
self.target_word_vec_file = datapath("IT.1-10.cbow1_wind5_hs0_neg10_size300_smpl1e-05.txt")
- self.word_pairs = [("one", "uno"), ("two", "due"), ("three", "tre"),
+ self.word_pairs = [
+ ("one", "uno"), ("two", "due"), ("three", "tre"),
("four", "quattro"), ("five", "cinque"), ("seven", "sette"), ("eight", "otto"),
("dog", "cane"), ("pig", "maiale"), ("fish", "cavallo"), ("birds", "uccelli"),
- ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana")
+ ("apple", "mela"), ("orange", "arancione"), ("grape", "acino"), ("banana", "banana"),
]
self.test_word_pairs = [("ten", "dieci"), ("cat", "gatto")]
@@ -53,7 +54,7 @@ def test_translate_nn(self):
test_source_word, test_target_word = zip(*self.test_word_pairs)
translated_words = model.translate(
- test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec
+ test_source_word, topn=5, source_lang_vec=self.source_word_vec, target_lang_vec=self.target_word_vec,
)
for idx, item in enumerate(self.test_word_pairs):
@@ -96,7 +97,7 @@ def setUp(self):
def test_translation_matrix(self):
model = translation_matrix.BackMappingTranslationMatrix(
- self.source_doc_vec, self.target_doc_vec, self.train_docs[:5]
+ self.source_doc_vec, self.target_doc_vec, self.train_docs[:5],
)
transmat = model.train(self.train_docs[:5])
self.assertEqual(transmat.shape, (8, 8))
@@ -108,7 +109,7 @@ def test_infer_vector(self):
replaces a nonsensical test.
"""
model = translation_matrix.BackMappingTranslationMatrix(
- self.source_doc_vec, self.target_doc_vec, self.train_docs[:5]
+ self.source_doc_vec, self.target_doc_vec, self.train_docs[:5],
)
model.train(self.train_docs[:5])
backmapped_vec = model.infer_vector(self.target_doc_vec.dv[self.train_docs[5].tags])
diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py
index 001ad4c365..a1d766bdb8 100644
--- a/gensim/test/test_word2vec.py
+++ b/gensim/test/test_word2vec.py
@@ -571,9 +571,9 @@ def testEvaluateWordPairs(self):
pearson = correlation[0][0]
spearman = correlation[1][0]
oov = correlation[2]
- self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson)
- self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman)
- self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov)
+ self.assertTrue(0.1 < pearson < 1.0, f"pearson {pearson} not between 0.1 & 1.0")
+ self.assertTrue(0.1 < spearman < 1.0, f"spearman {spearman} not between 0.1 and 1.0")
+ self.assertTrue(0.0 <= oov < 90.0, f"OOV {oov} not between 0.0 and 90.0")
@unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27")
def testEvaluateWordPairsFromFile(self):
@@ -586,9 +586,9 @@ def testEvaluateWordPairsFromFile(self):
pearson = correlation[0][0]
spearman = correlation[1][0]
oov = correlation[2]
- self.assertTrue(0.1 < pearson < 1.0, "pearson %f not between 0.1 & 1.0" % pearson)
- self.assertTrue(0.1 < spearman < 1.0, "spearman %f not between 0.1 and 1.0" % spearman)
- self.assertTrue(0.0 <= oov < 90.0, "oov %f not between 0.0 and 90.0" % oov)
+ self.assertTrue(0.1 < pearson < 1.0, f"pearson {pearson} not between 0.1 & 1.0")
+ self.assertTrue(0.1 < spearman < 1.0, f"spearman {spearman} not between 0.1 and 1.0")
+ self.assertTrue(0.0 <= oov < 90.0, f"OOV {oov} not between 0.0 and 90.0")
def model_sanity(self, model, train=True, with_corpus_file=False, ranks=None):
"""Even tiny models trained on LeeCorpus should pass these sanity checks"""
@@ -606,7 +606,7 @@ def model_sanity(self, model, train=True, with_corpus_file=False, ranks=None):
self.assertFalse((orig0 == model.wv.vectors[1]).all()) # vector should vary after training
query_word = 'attacks'
expected_word = 'bombings'
- sims = model.wv.most_similar(query_word, topn=len(model.wv.index2word))
+ sims = model.wv.most_similar(query_word, topn=len(model.wv.index_to_key))
t_rank = [word for word, score in sims].index(expected_word)
# in >200 calibration runs w/ calling parameters, 'terrorism' in 50-most_sim for 'war'
if ranks is not None:
@@ -855,7 +855,7 @@ def testLoadOldModel(self):
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (12, 100))
self.assertTrue(len(model.wv) == 12)
- self.assertTrue(len(model.wv.index2word) == 12)
+ self.assertTrue(len(model.wv.index_to_key) == 12)
self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size))
self.assertTrue(len(model.wv.vectors_lockf.shape) > 0)
self.assertTrue(model.cum_table.shape == (12,))
@@ -870,7 +870,7 @@ def testLoadOldModelSeparates(self):
model = word2vec.Word2Vec.load(datapath(model_file))
self.assertTrue(model.wv.vectors.shape == (12, 100))
self.assertTrue(len(model.wv) == 12)
- self.assertTrue(len(model.wv.index2word) == 12)
+ self.assertTrue(len(model.wv.index_to_key) == 12)
self.assertTrue(model.syn1neg.shape == (len(model.wv), model.wv.vector_size))
self.assertTrue(len(model.wv.vectors_lockf.shape) > 0)
self.assertTrue(model.cum_table.shape == (12,))
diff --git a/gensim/utils.py b/gensim/utils.py
index bb9ee2fa02..49cab6c595 100644
--- a/gensim/utils.py
+++ b/gensim/utils.py
@@ -11,16 +11,9 @@
import collections
import logging
import warnings
-
-try:
- from html.entities import name2codepoint as n2cp
-except ImportError:
- from htmlentitydefs import name2codepoint as n2cp
-try:
- import cPickle as _pickle
-except ImportError:
- import pickle as _pickle
-
+import numbers
+from html.entities import name2codepoint as n2cp
+import pickle as _pickle
import re
import unicodedata
import os
@@ -36,18 +29,9 @@
import heapq
import numpy as np
-import numbers
import scipy.sparse
-
-from six import iterkeys, iteritems, itervalues, u, string_types, unichr
-from six.moves import range
-
from smart_open import open
-from multiprocessing import cpu_count
-
-if sys.version_info[0] >= 3:
- unicode = str
logger = logging.getLogger(__name__)
@@ -138,7 +122,7 @@ def file_or_filename(input):
An open file, positioned at the beginning.
"""
- if isinstance(input, string_types):
+ if isinstance(input, str):
# input was a filename: open as file
return open(input, 'rb')
else:
@@ -169,11 +153,11 @@ def open_file(input):
except Exception:
# Handling any unhandled exceptions from the code nested in 'with' statement.
exc = True
- if not isinstance(input, string_types) or not mgr.__exit__(*sys.exc_info()):
+ if not isinstance(input, str) or not mgr.__exit__(*sys.exc_info()):
raise
# Try to introspect and silence errors.
finally:
- if not exc and isinstance(input, string_types):
+ if not exc and isinstance(input, str):
mgr.__exit__(None, None, None)
@@ -199,11 +183,11 @@ def deaccent(text):
u'Sef chomutovskych komunistu dostal postou bily prasek'
"""
- if not isinstance(text, unicode):
+ if not isinstance(text, str):
# assume utf8 for byte strings, use default (strict) error handling
text = text.decode('utf8')
norm = unicodedata.normalize("NFD", text)
- result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
+ result = ''.join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
return unicodedata.normalize("NFC", result)
@@ -339,10 +323,10 @@ def any2utf8(text, errors='strict', encoding='utf8'):
"""
- if isinstance(text, unicode):
+ if isinstance(text, str):
return text.encode('utf8')
# do bytestring -> unicode -> utf8 full circle, to ensure valid utf8
- return unicode(text, encoding, errors=errors).encode('utf8')
+ return str(text, encoding, errors=errors).encode('utf8')
to_utf8 = any2utf8
@@ -366,9 +350,9 @@ def any2unicode(text, encoding='utf8', errors='strict'):
Unicode version of `text`.
"""
- if isinstance(text, unicode):
+ if isinstance(text, str):
return text
- return unicode(text, encoding, errors=errors)
+ return str(text, encoding, errors=errors)
to_unicode = any2unicode
@@ -393,7 +377,7 @@ def call_on_class_only(*args, **kwargs):
raise AttributeError('This method should be called on a class object.')
-class SaveLoad(object):
+class SaveLoad:
"""Serialize/deserialize object from disk, by equipping objects with the save()/load() methods.
Warnings
@@ -562,7 +546,7 @@ def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=fro
finally:
# restore attribs handled specially
for obj, asides in restores:
- for attrib, val in iteritems(asides):
+ for attrib, val in asides.items():
with ignore_deprecation_warning():
setattr(obj, attrib, val)
logger.info("saved %s", fname)
@@ -599,7 +583,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)
if separately is None:
separately = []
- for attrib, val in iteritems(self.__dict__):
+ for attrib, val in self.__dict__.items():
if isinstance(val, np.ndarray) and val.size >= sep_limit:
separately.append(attrib)
elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit:
@@ -614,7 +598,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
recursive_saveloads = []
restores = []
- for attrib, val in iteritems(self.__dict__):
+ for attrib, val in self.__dict__.items():
if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading
recursive_saveloads.append(attrib)
cfname = '.'.join((fname, attrib))
@@ -622,7 +606,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
try:
numpys, scipys, ignoreds = [], [], []
- for attrib, val in iteritems(asides):
+ for attrib, val in asides.items():
if isinstance(val, np.ndarray) and attrib not in ignore:
numpys.append(attrib)
logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib))
@@ -666,7 +650,7 @@ def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol,
self.__dict__['__recursive_saveloads'] = recursive_saveloads
except Exception:
# restore the attributes if exception-interrupted
- for attrib, val in iteritems(asides):
+ for attrib, val in asides.items():
setattr(self, attrib, val)
raise
return restores + [(self, asides)]
@@ -749,7 +733,7 @@ def get_max_id(corpus):
return maxid
-class FakeDict(object):
+class FakeDict:
"""Objects of this class act as dictionaries that map integer->str(integer), for a specified
range of integers <0, num_terms).
@@ -778,7 +762,6 @@ def __getitem__(self, val):
def iteritems(self):
"""Iterate over all keys and values.
-
Yields
------
(int, str)
@@ -1087,9 +1070,9 @@ def safe_unichr(intval):
"""
try:
- return unichr(intval)
+ return chr(intval)
except ValueError:
- # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
+ # ValueError: chr() arg not in range(0x10000) (narrow Python build)
s = "\\U%08x" % intval
# return UTF16 surrogate pair
return s.decode('unicode-escape')
@@ -1396,11 +1379,7 @@ def unpickle(fname):
"""
with open(fname, 'rb') as f:
- # Because of loading from S3 load can't be used (missing readline in smart_open)
- if sys.version_info > (3, 0):
- return _pickle.load(f, encoding='latin1')
- else:
- return _pickle.loads(f.read())
+ return _pickle.load(f, encoding='latin1') # needed because loading from S3 doesn't support readline()
def revdict(d):
@@ -1430,7 +1409,7 @@ def revdict(d):
{2: 1, 4: 3}
"""
- return {v: k for (k, v) in iteritems(dict(d))}
+ return {v: k for (k, v) in dict(d).items()}
def deprecated(reason):
@@ -1450,7 +1429,7 @@ def deprecated(reason):
Decorated function
"""
- if isinstance(reason, string_types):
+ if isinstance(reason, str):
def decorator(func):
fmt = "Call to deprecated `{name}` ({reason})."
@@ -1704,7 +1683,7 @@ def lemmatize(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
# producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
# FIXME this throws away all fancy parsing cues, including sentence structure,
# abbreviations etc.
- content = u(' ').join(tokenize(content, lower=True, errors='ignore'))
+ content = ' '.join(tokenize(content, lower=True, errors='ignore'))
parsed = parse(content, lemmata=True, collapse=False)
result = []
@@ -1814,7 +1793,7 @@ def trim_vocab_by_freq(vocab, topk, trim_rule=None):
if topk >= len(vocab):
return
- min_count = heapq.nlargest(topk, itervalues(vocab))[-1]
+ min_count = heapq.nlargest(topk, vocab.values())[-1]
prune_vocab(vocab, min_count, trim_rule=trim_rule)
@@ -1831,7 +1810,7 @@ def merge_counts(dict1, dict2):
result : dict
Merged dictionary with sum of frequencies as values.
"""
- for word, freq in iteritems(dict2):
+ for word, freq in dict2.items():
if word in dict1:
dict1[word] += freq
else:
@@ -1957,7 +1936,7 @@ def sample_dict(d, n=10, use_random=True):
Selected items from dictionary, as a list.
"""
- selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(iterkeys(d), n)
+ selected_keys = random.sample(list(d), min(len(d), n)) if use_random else itertools.islice(d.keys(), n)
return [(key, d[key]) for key in selected_keys]
@@ -2080,7 +2059,7 @@ def lazy_flatten(nested_list):
"""
for el in nested_list:
- if isinstance(el, collections.Iterable) and not isinstance(el, string_types):
+ if isinstance(el, collections.Iterable) and not isinstance(el, str):
for sub in flatten(el):
yield sub
else:
@@ -2124,5 +2103,5 @@ def effective_n_jobs(n_jobs):
elif n_jobs is None:
return 1
elif n_jobs < 0:
- n_jobs = max(cpu_count() + 1 + n_jobs, 1)
+ n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
return n_jobs
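
For the `effective_n_jobs` change above (spelling out `multiprocessing.cpu_count()` instead of the bare `cpu_count` import), a short worked sketch of the visible branches only, assuming an 8-core machine; other cases are elided here.

import multiprocessing

def effective_n_jobs(n_jobs):
    # sketch of the branches shown in the hunk above; other values elided
    if n_jobs is None:
        return 1
    elif n_jobs < 0:
        # -1 means "use all cores", -2 "all but one", and so on
        n_jobs = max(multiprocessing.cpu_count() + 1 + n_jobs, 1)
    return n_jobs

# On an assumed 8-core machine:
#   effective_n_jobs(-1) -> 8
#   effective_n_jobs(-2) -> 7
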
diff --git a/gensim/viz/poincare.py b/gensim/viz/poincare.py
index f20fd8ab2d..ba91f103dd 100644
--- a/gensim/viz/poincare.py
+++ b/gensim/viz/poincare.py
@@ -51,7 +51,7 @@ def poincare_2d_visualization(model, tree, figure_title, num_nodes=50, show_node
if vectors.shape[1] != 2:
raise ValueError('Can only plot 2-D vectors')
- node_labels = model.kv.index2word
+ node_labels = model.kv.index_to_key
nodes_x = list(vectors[:, 0])
nodes_y = list(vectors[:, 1])
nodes = go.Scatter(