From dda970e47d41feef30077f73b078f38f7ea1c8c6 Mon Sep 17 00:00:00 2001
From: Gordon Mohr
Date: Tue, 6 Oct 2020 00:51:58 -0700
Subject: [PATCH] comment clarity from review

---
 gensim/models/fasttext.py     | 14 +++++++++-----
 gensim/models/keyedvectors.py | 19 ++++++++++++++++---
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 9abd27ffcf..460a1682f5 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -940,6 +940,12 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
             The maximum number of characters in an ngram
         bucket : int
             The number of buckets.
+        count : int, optional
+            If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
 
         Attributes
         ----------
@@ -963,7 +969,7 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
             training-update-dampening factors.
 
         """
-        super(FastTextKeyedVectors, self).__init__(vector_size=vector_size)
+        super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype)
         self.min_n = min_n
         self.max_n = max_n
         self.bucket = bucket  # count of buckets, fka num_ngram_vectors
@@ -1122,12 +1128,10 @@ def get_vector(self, word, norm=False):
             return word_vec
 
     def resize_vectors(self, seed=0):
-        """Make underlying vectors match 'index_to_key' size; random-initialize any new rows.
-
-        Unlike in superclass, the 'vectors_vocab' array is of primary importance, with
-        'vectors' derived from it. And, the ngrams_vectors may need allocation."""
+        """Make underlying vectors match 'index_to_key' size; random-initialize any new rows."""
         vocab_shape = (len(self.index_to_key), self.vector_size)
+        # Unlike in superclass, 'vectors_vocab' array is primary with 'vectors' derived from it & ngrams
         self.vectors_vocab = prep_vectors(vocab_shape, prior_vectors=self.vectors_vocab, seed=seed)
         ngrams_shape = (self.bucket, self.vector_size)
         self.vectors_ngrams = prep_vectors(ngrams_shape, prior_vectors=self.vectors_ngrams, seed=seed + 1)
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
index 76cd845ca0..867ff1dc90 100644
--- a/gensim/models/keyedvectors.py
+++ b/gensim/models/keyedvectors.py
@@ -191,7 +191,7 @@ class KeyedVectors(utils.SaveLoad):
-    def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
+    def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
         """Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec`
         and related models.
@@ -204,6 +204,18 @@ def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
         types, as the type and storage array for such attributes is established by the 1st time such
         `attr` is set.
 
+        Parameters
+        ----------
+        vector_size : int
+            Intended number of dimensions for all contained vectors.
+        count : int, optional
+            If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
+        mapfile_path : string, optional
+            TODO: UNDER CONSTRUCTION / SUBJECT TO CHANGE - pending mmap work
         """
         self.vector_size = vector_size
         # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
@@ -342,7 +354,7 @@ def resize_vectors(self, seed=0):
         target_shape = (len(self.index_to_key), self.vector_size)
         self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed)
-        # TODO: support memmap?
+        # TODO: support memmap & cleanup
         # if hasattr(self, 'mapfile_path') and self.mapfile_path:
         #     self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
@@ -1903,7 +1915,8 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
 
 
 def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
-    """Return a numpy array of the given shape. Reuse prior_vectors values instance or values
+    """TODO: NAME/DOCS CHANGES PENDING MMAP & OTHER INITIALIZATION CLEANUP WORK
+    Return a numpy array of the given shape. Reuse prior_vectors object or values
     to extent possible. Initialize new values randomly if requested."""
     if prior_vectors is None:
         prior_vectors = np.zeros((0, 0))
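
For reviewers, a minimal usage sketch of the `count`/`dtype` parameters documented above. It is not
part of the diff itself: it assumes a gensim 4.x tree where `KeyedVectors` accepts these parameters
as shown, where the `add_vector()` helper exists, and where the module-level `prep_vectors()`
function is importable from `gensim.models.keyedvectors`; any other names are illustrative only.

    import numpy as np
    from gensim.models import KeyedVectors
    from gensim.models.keyedvectors import prep_vectors

    # Sketch assuming the gensim 4.x API shown in the patch above.
    # Pre-allocate room for 2 keys of 4 dimensions; dtype defaults to np.float32 (REAL).
    kv = KeyedVectors(vector_size=4, count=2, dtype=np.float32)

    # Fill the pre-allocated slots one key at a time.
    kv.add_vector("cat", np.array([0.1, 0.2, 0.3, 0.4], dtype=np.float32))
    kv.add_vector("dog", np.array([0.5, 0.6, 0.7, 0.8], dtype=np.float32))
    print(kv.vectors.dtype, kv.vectors.shape)  # float32 (2, 4)

    # prep_vectors() returns an array of the requested shape, reusing prior rows and
    # randomly initializing any new ones; this is the behavior resize_vectors() builds on.
    grown = prep_vectors((5, 4), prior_vectors=kv.vectors, seed=0)
    print(grown.shape)  # (5, 4)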