Refactor documentation for *2Vec models #1944

Closed · wants to merge 43 commits

Commits (43):
feb3c32
Remove useless methods
steremma Feb 28, 2018
52eb1b3
started working on docstrings
steremma Feb 28, 2018
cb7b71a
more work done
steremma Feb 28, 2018
347cdb0
Finished documentation for the `BaseWordEmbeddingsModel`
steremma Mar 1, 2018
327afc5
PEP-8
steremma Mar 1, 2018
bb8e3a3
Revert "Remove useless methods"
steremma Mar 2, 2018
7e89ca9
added documentation for the class and all its helper methods
steremma Mar 5, 2018
e0fe665
remove duplicated type info
steremma Mar 5, 2018
8aa85bc
Added documentation for `Doc2vec` model and all its helper methods
steremma Mar 5, 2018
7c74a4c
Fixed paper references and added documentation for `Doc2VecVocab`
steremma Mar 6, 2018
e92b9b4
Fixed paper references
steremma Mar 6, 2018
9093eab
minor referencing fixes
steremma Mar 6, 2018
c07afa4
sphinx indentation
steremma Mar 6, 2018
4a14a3e
Added docstrings for the private methods in `BaseAny2Vec`
steremma Mar 8, 2018
a7f3f0e
Applied all code review corrections, example fix still pending
steremma Mar 8, 2018
69d524d
Added missing docstrings
steremma Mar 12, 2018
4707c37
Fixed `int {1, 0}` -> `{1, 0}`
steremma Mar 12, 2018
3a85ac5
Fixed examples and code review corrections
steremma Mar 12, 2018
f041cf1
Fixed examples and applied code review corrections (optional argument…
steremma Mar 14, 2018
8badb81
Applied code review corrections and added top level usage examples
steremma Mar 14, 2018
8a8e1fb
Added high level explanation of the class hierarchy, fixed code revie…
steremma Mar 14, 2018
a5dced2
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Mar 19, 2018
535dc15
Final indentation fixes
steremma Mar 19, 2018
1cc8889
Documentation fixes
steremma Mar 19, 2018
add686e
Fixed all examples
steremma Mar 19, 2018
7cb408c
delete redundant reference to module
steremma Mar 20, 2018
5b6d815
Added explanation for all important class attributes. These include s…
steremma Mar 21, 2018
f58e9a2
documented public cython functions
steremma Mar 29, 2018
6570cef
documented public cython functions in doc2vec
steremma Mar 29, 2018
0e8d299
Applied code review corrections
steremma Mar 30, 2018
86a6d23
added documentation for public cython methods in `fasttext`
steremma Apr 2, 2018
dc2f93e
added documentation for C functions in the word2vec
steremma Apr 3, 2018
f78348f
fix build issues
menshikh-iv Apr 11, 2018
cec8c44
add missing rst
menshikh-iv Apr 11, 2018
585f81f
fix base_any2vec
menshikh-iv Apr 13, 2018
b5d84ff
fix doc2vec[1]
menshikh-iv Apr 13, 2018
6f32e78
fix doc2vec[2]
menshikh-iv Apr 13, 2018
2e3a0b7
fix doc2vec[3]
menshikh-iv Apr 13, 2018
297b48e
Merge branch 'develop' into document-any2vec
menshikh-iv Apr 13, 2018
2d9616c
fix doc2vec[4]
menshikh-iv Apr 18, 2018
2fcd2f1
fix doc2vec_inner + remove unused imports
menshikh-iv Apr 18, 2018
7cbbac9
fix fasttext[1]
menshikh-iv Apr 18, 2018
0e9e6c5
reformat example sections
menshikh-iv Apr 18, 2018
2 changes: 2 additions & 0 deletions docs/src/apiref.rst
@@ -45,6 +45,7 @@ Modules:
models/word2vec
models/keyedvectors
models/doc2vec
models/doc2vec_inner
models/fasttext
models/phrases
models/poincare
@@ -64,6 +65,7 @@ Modules:
models/deprecated/word2vec
models/deprecated/keyedvectors
models/deprecated/fasttext_wrapper
models/base_any2vec
similarities/docsim
similarities/index
sklearn_api/atmodel
10 changes: 10 additions & 0 deletions docs/src/models/base_any2vec.rst
@@ -0,0 +1,10 @@
:mod:`models.base_any2vec` -- Base classes for any2vec models
=============================================================

.. automodule:: gensim.models.base_any2vec
:synopsis: Base classes for any2vec models
:members:
:inherited-members:
:special-members: __getitem__
:undoc-members:
:show-inheritance:
9 changes: 9 additions & 0 deletions docs/src/models/doc2vec_inner.rst
@@ -0,0 +1,9 @@
:mod:`models.doc2vec_inner` -- Cython job for training Doc2Vec model
====================================================================

.. automodule:: gensim.models.doc2vec_inner
:synopsis: Cython job for training Doc2Vec model
:members:
:inherited-members:
:undoc-members:
:show-inheritance:
691 changes: 620 additions & 71 deletions gensim/models/base_any2vec.py

Large diffs are not rendered by default.

839 changes: 616 additions & 223 deletions gensim/models/doc2vec.py

2,298 changes: 1,132 additions & 1,166 deletions gensim/models/doc2vec_inner.c
146 changes: 137 additions & 9 deletions gensim/models/doc2vec_inner.pyx
@@ -2,17 +2,17 @@
# cython: boundscheck=False
# cython: wraparound=False
# cython: cdivision=True
# cython: embedsignature=True
# coding: utf-8
#
# Copyright (C) 2013 Radim Rehurek <me@radimrehurek.com>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Optimized cython functions for training :class:`~gensim.models.doc2vec.Doc2Vec` model."""
import cython
import numpy as np
from numpy import zeros, float32 as REAL
cimport numpy as np

from libc.math cimport exp
from libc.string cimport memset, memcpy

# scipy <= 0.15
@@ -22,13 +22,7 @@ except ImportError:
# in scipy > 0.15, fblas function has been removed
import scipy.linalg.blas as fblas

from word2vec_inner cimport bisect_left, random_int32, \
scopy, saxpy, sdot, dsdot, snrm2, sscal, \
REAL_t, EXP_TABLE, \
our_dot, our_saxpy, \
our_dot_double, our_dot_float, our_dot_noblas, our_saxpy_noblas

from word2vec import FAST_VERSION
from word2vec_inner cimport bisect_left, random_int32, sscal, REAL_t, EXP_TABLE, our_dot, our_saxpy

DEF MAX_DOCUMENT_LEN = 10000

@@ -227,6 +221,50 @@ cdef unsigned long long fast_document_dmc_neg(
def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
"""Update distributed bag of words model ("PV-DBOW") by training on a single document.

Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and
:meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.

Parameters
----------
model : :class:`~gensim.models.doc2vec.Doc2Vec`
The model to train.
doc_words : list of str
The input document as a list of words to be used for training. Each word will be looked up in
the model's vocabulary.
doctag_indexes : list of int
Indices into `doctag_vectors` used to obtain the tags of the document.
alpha : float
Learning rate.
work : list of float, optional
Updates to be performed on each neuron in the hidden layer of the underlying network.
train_words : bool, optional
Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both** `learn_words`
and `train_words` are set to True.
learn_doctags : bool, optional
Whether the tag vectors should be updated.
learn_words : bool, optional
Word vectors will be updated exactly as per Word2Vec skip-gram training only if **both**
`learn_words` and `train_words` are set to True.
learn_hidden : bool, optional
Whether or not the weights of the hidden layer will be updated.
word_vectors : numpy.ndarray, optional
The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
word_locks : numpy.ndarray, optional
    A learning lock factor for each word: a value of 0 completely blocks updates, while a value of 1
    allows the word vectors to be updated freely.
doctag_vectors : numpy.ndarray, optional
Vector representations of the tags. If None, these will be retrieved from the model.
doctag_locks : numpy.ndarray, optional
The lock factors for each tag, same as `word_locks`, but for document-vectors.

Returns
-------
int
Number of words in the input document that were actually used for training.

"""
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.vocabulary.sample != 0)
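As a rough illustration of what `train_document_dbow` computes per (document, word) pair, here is a simplified pure-NumPy sketch of one PV-DBOW negative-sampling step. The function name, the `sigmoid` helper, and the toy shapes are invented for this example; the real implementation is the optimized Cython above.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def dbow_update(doctag_vec, syn1neg, word_idx, neg_idxs, alpha):
    """One simplified PV-DBOW negative-sampling step: push the document
    vector toward the true word's output weights and away from the
    sampled noise words' weights."""
    idxs = [word_idx] + list(neg_idxs)
    labels = np.zeros(len(idxs))
    labels[0] = 1.0                           # the true word is the positive example
    out = syn1neg[idxs]                       # (1 + k, size) output weights
    f = sigmoid(out @ doctag_vec)             # predicted probabilities
    g = (labels - f) * alpha                  # prediction error, scaled by the learning rate
    grad_doc = g @ out                        # gradient w.r.t. the document vector
    syn1neg[idxs] += np.outer(g, doctag_vec)  # `learn_hidden`: update output weights
    return doctag_vec + grad_doc              # `learn_doctags`: updated document vector
```

A single step already moves the model in the right direction: starting from zero output weights, the predicted probability of the true word rises above chance while the noise words' probabilities fall below it.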
@@ -363,6 +401,51 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,
def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
"""Update distributed memory model ("PV-DM") by training on a single document.
This method implements the DM model with a projection (input) layer that is either the sum or mean of the context
vectors, depending on the model's `dm_mean` configuration field.

Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and
:meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.

Parameters
----------
model : :class:`~gensim.models.doc2vec.Doc2Vec`
The model to train.
doc_words : list of str
The input document as a list of words to be used for training. Each word will be looked up in
the model's vocabulary.
doctag_indexes : list of int
Indices into `doctag_vectors` used to obtain the tags of the document.
alpha : float
Learning rate.
work : np.ndarray, optional
Private working memory for each worker.
neu1 : np.ndarray, optional
Private working memory for each worker.
learn_doctags : bool, optional
Whether the tag vectors should be updated.
learn_words : bool, optional
    Whether the word vectors should be updated.
learn_hidden : bool, optional
Whether or not the weights of the hidden layer will be updated.
word_vectors : numpy.ndarray, optional
The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
word_locks : numpy.ndarray, optional
    A learning lock factor for each word: a value of 0 completely blocks updates, while a value of 1
    allows the word vectors to be updated freely.
doctag_vectors : numpy.ndarray, optional
Vector representations of the tags. If None, these will be retrieved from the model.
doctag_locks : numpy.ndarray, optional
The lock factors for each tag, same as `word_locks`, but for document-vectors.

Returns
-------
int
Number of words in the input document that were actually used for training.

"""
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.vocabulary.sample != 0)
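The projection-layer behaviour described in the docstring above (sum or mean of the context word vectors and tag vectors, selected by `dm_mean`) can be sketched as follows; the function and variable names are invented for illustration only.

```python
import numpy as np

def dm_projection(word_vecs, context_idxs, doctag_vecs, tag_idxs, dm_mean=True):
    """Build the PV-DM input layer by combining the context window's word
    vectors with the document-tag vectors, using either their mean or
    their plain sum."""
    rows = np.vstack([word_vecs[context_idxs], doctag_vecs[tag_idxs]])
    combined = rows.sum(axis=0)
    return combined / len(rows) if dm_mean else combined
```

Either way the input layer stays the same size as a single vector, which is what distinguishes PV-DM from the concatenated variant below it in this file.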
@@ -521,6 +604,51 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N
def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None,
learn_doctags=True, learn_words=True, learn_hidden=True,
word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None):
"""Update distributed memory model ("PV-DM") by training on a single document, using a concatenation of the context
window word vectors (rather than a sum or average).
This is typically slower, since the input layer for each training example is significantly larger.

Called internally from :meth:`~gensim.models.doc2vec.Doc2Vec.train` and
:meth:`~gensim.models.doc2vec.Doc2Vec.infer_vector`.

Parameters
----------
model : :class:`~gensim.models.doc2vec.Doc2Vec`
The model to train.
doc_words : list of str
The input document as a list of words to be used for training. Each word will be looked up in
the model's vocabulary.
doctag_indexes : list of int
Indices into `doctag_vectors` used to obtain the tags of the document.
alpha : float
Learning rate.
work : np.ndarray, optional
Private working memory for each worker.
neu1 : np.ndarray, optional
Private working memory for each worker.
learn_doctags : bool, optional
Whether the tag vectors should be updated.
learn_words : bool, optional
    Whether the word vectors should be updated.
learn_hidden : bool, optional
Whether or not the weights of the hidden layer will be updated.
word_vectors : numpy.ndarray, optional
The vector representation for each word in the vocabulary. If None, these will be retrieved from the model.
word_locks : numpy.ndarray, optional
    A learning lock factor for each word: a value of 0 completely blocks updates, while a value of 1
    allows the word vectors to be updated freely.
doctag_vectors : numpy.ndarray, optional
Vector representations of the tags. If None, these will be retrieved from the model.
doctag_locks : numpy.ndarray, optional
The lock factors for each tag, same as `word_locks`, but for document-vectors.

Returns
-------
int
Number of words in the input document that were actually used for training.

"""
cdef int hs = model.hs
cdef int negative = model.negative
cdef int sample = (model.vocabulary.sample != 0)
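The concatenated input layer that makes this variant's input significantly larger can be sketched as below; the names are illustrative, and the real code additionally pads to a fixed-size window.

```python
import numpy as np

def dm_concat_input(word_vecs, window_idxs, doctag_vecs, tag_idxs):
    """PV-DM/concat input layer: tag vectors and the context window's word
    vectors are concatenated (not summed), so the input length grows with
    the window size and tag count."""
    parts = [doctag_vecs[i] for i in tag_idxs] + [word_vecs[i] for i in window_idxs]
    return np.concatenate(parts)
```

For example, with size-100 vectors, one tag and a window of 5 words on each side, the input layer holds 1100 floats rather than 100, which is why this variant trains more slowly.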