Skip to content

Commit

Permalink
comment clarity from review
Browse files Browse the repository at this point in the history
  • Loading branch information
gojomo committed Oct 6, 2020
1 parent 8687e7f commit dda970e
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 8 deletions.
14 changes: 9 additions & 5 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -940,6 +940,12 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
The maximum number of characters in an ngram
bucket : int
The number of buckets.
count : int, optional
If provided, storage will be pre-allocated for at least this many vectors. (Otherwise
they can be added later.)
dtype : type, optional
Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
another type is provided here.
Attributes
----------
Expand All @@ -963,7 +969,7 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
training-update-dampening factors.
"""
super(FastTextKeyedVectors, self).__init__(vector_size=vector_size)
super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype)
self.min_n = min_n
self.max_n = max_n
self.bucket = bucket # count of buckets, fka num_ngram_vectors
Expand Down Expand Up @@ -1122,12 +1128,10 @@ def get_vector(self, word, norm=False):
return word_vec

def resize_vectors(self, seed=0):
"""Make underlying vectors match 'index_to_key' size; random-initialize any new rows.
Unlike in superclass, the 'vectors_vocab' array is of primary importance, with
'vectors' derived from it. And, the ngrams_vectors may need allocation."""
"""Make underlying vectors match 'index_to_key' size; random-initialize any new rows."""

vocab_shape = (len(self.index_to_key), self.vector_size)
# Unlike in superclass, 'vectors_vocab' array is primary with 'vectors' derived from it & ngrams
self.vectors_vocab = prep_vectors(vocab_shape, prior_vectors=self.vectors_vocab, seed=seed)
ngrams_shape = (self.bucket, self.vector_size)
self.vectors_ngrams = prep_vectors(ngrams_shape, prior_vectors=self.vectors_ngrams, seed=seed + 1)
Expand Down
19 changes: 16 additions & 3 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@


class KeyedVectors(utils.SaveLoad):
def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
"""Mapping between keys (such as words) and vectors for :class:`~gensim.models.Word2Vec`
and related models.
Expand All @@ -204,6 +204,18 @@ def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
types, as the type and storage array for such attributes is established by the 1st time such
`attr` is set.
Parameters
----------
vector_size : int
Intended number of dimensions for all contained vectors.
count : int, optional
If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
they can be added later.)
dtype : type, optional
Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
another type is provided here.
mapfile_path : string, optional
TODO: UNDER CONSTRUCTION / SUBJECT TO CHANGE - pending mmap work
"""
self.vector_size = vector_size
# pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
Expand Down Expand Up @@ -342,7 +354,7 @@ def resize_vectors(self, seed=0):

target_shape = (len(self.index_to_key), self.vector_size)
self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed)
# TODO: support memmap?
# TODO: support memmap & cleanup
# if hasattr(self, 'mapfile_path') and self.mapfile_path:
# self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)

Expand Down Expand Up @@ -1903,7 +1915,8 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):


def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
"""Return a numpy array of the given shape. Reuse prior_vectors values instance or values
"""TODO: NAME/DOCS CHANGES PENDING MMAP & OTHER INITIALIZATION CLEANUP WORK
Return a numpy array of the given shape. Reuse prior_vectors object or values
to extent possible. Initialize new values randomly if requested."""
if prior_vectors is None:
prior_vectors = np.zeros((0, 0))
Expand Down

0 comments on commit dda970e

Please sign in to comment.