comment clarity from review

piskvorky · Oct 6, 2020 · dda970e · dda970e
1 parent 8687e7f
commit dda970e
Show file tree

Hide file tree

Showing 2 changed files with 25 additions and 8 deletions.
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -940,6 +940,12 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
             The maximum number of characters in an ngram
         bucket : int
             The number of buckets.
+        count : int, optional
+            If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
 
         Attributes
         ----------
@@ -963,7 +969,7 @@ def __init__(self, vector_size, min_n, max_n, bucket, count=0, dtype=REAL):
         training-update-dampening factors.
 
         """
-        super(FastTextKeyedVectors, self).__init__(vector_size=vector_size)
+        super(FastTextKeyedVectors, self).__init__(vector_size=vector_size, count=count, dtype=dtype)
         self.min_n = min_n
         self.max_n = max_n
         self.bucket = bucket  # count of buckets, fka num_ngram_vectors
@@ -1122,12 +1128,10 @@ def get_vector(self, word, norm=False):
                 return word_vec
 
     def resize_vectors(self, seed=0):
-        """Make underlying vectors match 'index_to_key' size; random-initialize any new rows.
-
-        Unlike in superclass, the 'vectors_vocab' array is of primary importance, with
-        'vectors' derived from it. And, the ngrams_vectors may need allocation."""
+        """Make underlying vectors match 'index_to_key' size; random-initialize any new rows."""
 
         vocab_shape = (len(self.index_to_key), self.vector_size)
+        # Unlike in superclass, 'vectors_vocab' array is primary with 'vectors' derived from it & ngrams
         self.vectors_vocab = prep_vectors(vocab_shape, prior_vectors=self.vectors_vocab, seed=seed)
         ngrams_shape = (self.bucket, self.vector_size)
         self.vectors_ngrams = prep_vectors(ngrams_shape, prior_vectors=self.vectors_ngrams, seed=seed + 1)

diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py
@@ -191,7 +191,7 @@
 
 
 class KeyedVectors(utils.SaveLoad):
-    def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
+    def __init__(self, vector_size, count=0, dtype=np.float32, mapfile_path=None):
         """Mapping between keys (such as words)  and vectors for :class:`~gensim.models.Word2Vec`
         and related models.
 
@@ -204,6 +204,18 @@ def __init__(self, vector_size, count=0, dtype=REAL, mapfile_path=None):
         types, as the type and storage array for such attributes is established by the 1st time such
         `attr` is set.
 
+        Parameters
+        ----------
+        vector_size : int
+            Intended number of dimensions for all contained vectors.
+        count : int, optional
+            If provided, vectors will be pre-allocated for at least this many vectors. (Otherwise
+            they can be added later.)
+        dtype : type, optional
+            Vector dimensions will default to `np.float32` (AKA `REAL` in some Gensim code) unless
+            another type is provided here.
+        mapfile_path : string, optional
+            TODO: UNDER CONSTRUCTION / SUBJECT TO CHANGE - pending mmap work
         """
         self.vector_size = vector_size
         # pre-allocating `index_to_key` to full size helps avoid redundant re-allocations, esp for `expandos`
@@ -342,7 +354,7 @@ def resize_vectors(self, seed=0):
 
         target_shape = (len(self.index_to_key), self.vector_size)
         self.vectors = prep_vectors(target_shape, prior_vectors=self.vectors, seed=seed)
-        # TODO: support memmap?
+        # TODO: support memmap & cleanup
 #        if hasattr(self, 'mapfile_path') and self.mapfile_path:
 #            self.vectors = np.memmap(self.mapfile_path, shape=(target_count, self.vector_size), mode='w+', dtype=REAL)
 
@@ -1903,7 +1915,8 @@ def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
 
 
 def prep_vectors(target_shape, prior_vectors=None, seed=0, dtype=REAL):
-    """Return a numpy array of the given shape. Reuse prior_vectors values instance or values
+    """TODO: NAME/DOCS CHANGES PENDING MMAP & OTHER INITIALIZATION CLEANUP WORK
+    Return a numpy array of the given shape. Reuse prior_vectors object or values
     to extent possible. Initialize new values randomly if requested."""
     if prior_vectors is None:
         prior_vectors = np.zeros((0, 0))