Fix docstrings for gensim.sklearn_api. Fix #1667 #1895

Merged on Mar 15, 2018 (52 commits; diff shown from the first 27 commits)

Commits
4cee8fa
fixed docstring for `sklearn_api.lsimodel`
steremma Feb 10, 2018
ab0303c
removed duplicated comment
steremma Feb 10, 2018
4dc001f
Fixed docstring for `sklearn_api.text2bow`
steremma Feb 10, 2018
69faf41
Fixed docstrings for `sklearn_api.phrases`
steremma Feb 10, 2018
5052dfb
Applied code review corrections in sklearn wrappers for:
steremma Feb 12, 2018
c027203
constructor docstrings now only mention the type of each argument. Fo…
steremma Feb 12, 2018
3815605
Brought back parameter explanation in the wrappers for easier lookup
steremma Feb 13, 2018
c1e05df
added examples to __doc__, work still in progress
steremma Feb 15, 2018
4cfbf5c
added simple and executable examples to `__doc__`
steremma Feb 15, 2018
f2615ef
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Feb 19, 2018
3581a46
temp work on some more wrappers
steremma Feb 19, 2018
8ef1105
finished docstrings for LDA wrapper, examples pending
steremma Feb 19, 2018
add7420
finished doc2vec wrapper with example
steremma Feb 20, 2018
38a610f
completed LDA wrapper including example
steremma Feb 20, 2018
5f00f34
finished the tfidf wrapper including example
steremma Feb 20, 2018
1d8c63c
PEP-8 corrections
steremma Feb 20, 2018
f8fffd6
w2v documentation - example result pending
steremma Feb 21, 2018
c866af0
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 21, 2018
3cf28a3
fixed w2v example
steremma Feb 21, 2018
b55a2a2
added documentation for the lda sequential model - examples pending
steremma Feb 22, 2018
6c1aeb8
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
steremma Feb 24, 2018
b0600cd
added documentation for the author topic sklearn wrapper including ex…
steremma Feb 24, 2018
e2ca72f
improved example by presenting a way to get a pipeline score
steremma Feb 24, 2018
f66abbb
improved example using similarities
steremma Feb 24, 2018
e4dc868
added documentation and examples for the rp and hdp models
steremma Feb 24, 2018
8df7ce5
minor example improvements
steremma Feb 25, 2018
dc33b91
fixed reference
steremma Feb 25, 2018
836af6f
removed reference
steremma Feb 25, 2018
4a3ce08
fix doc building
menshikh-iv Feb 27, 2018
ef5d7ab
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 27, 2018
4285741
unidented examples and fixed paper references
steremma Feb 27, 2018
2f02cfe
Merge branch 'sklearn-api-docs' of https://github.com/steremma/gensim…
steremma Feb 28, 2018
0c56ae9
finalized ldaseq wrapper
steremma Feb 28, 2018
64f8d4f
fix __init__
menshikh-iv Mar 13, 2018
9b4c375
Merge remote-tracking branch 'upstream/develop' into sklearn-api-docs
menshikh-iv Mar 13, 2018
7a204e1
resolve merge-conflict with pivot norm
menshikh-iv Mar 13, 2018
39bbe31
fix atmodel
menshikh-iv Mar 15, 2018
20ea33e
fix atmodel[2]
menshikh-iv Mar 15, 2018
31fb94e
fix d2vmodel
menshikh-iv Mar 15, 2018
4432b77
fix hdp + small fixes
menshikh-iv Mar 15, 2018
e729a26
fix ldamodel + small fixes
menshikh-iv Mar 15, 2018
14fcf22
small fixes
menshikh-iv Mar 15, 2018
07a8cba
fix ldaseqmodel
menshikh-iv Mar 15, 2018
5325d05
small fixes (again)
menshikh-iv Mar 15, 2018
b250ca4
fix lsimodel
menshikh-iv Mar 15, 2018
3fc3bef
fix phrases
menshikh-iv Mar 15, 2018
dc9f659
fix rpmodel
menshikh-iv Mar 15, 2018
4ec4619
fix text2bow
menshikh-iv Mar 15, 2018
36a263a
fix tfidf
menshikh-iv Mar 15, 2018
ae4a5b4
fix word2vec
menshikh-iv Mar 15, 2018
0ad6580
cleanup
menshikh-iv Mar 15, 2018
8a45bef
cleanup[2]
menshikh-iv Mar 15, 2018
152 changes: 136 additions & 16 deletions gensim/sklearn_api/atmodel.py
@@ -5,9 +5,28 @@
# Copyright (C) 2017 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""Scikit learn interface for :class:`~gensim.models.atmodel.AuthorTopicModel`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------
>>> from gensim.test.utils import common_texts, common_dictionary, common_corpus
>>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer
>>>
>>> # Pass a mapping from authors to the documents they contributed to.
>>> author2doc = {
... 'john': [0, 1, 2, 3, 4, 5, 6],
... 'jane': [2, 3, 4, 5, 6, 7, 8],
... 'jack': [0, 2, 4, 6, 8]
... }
>>>
>>> # Let's use the model to discover 2 different topics.
>>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2, passes=100)
>>>
>>> # To which of those 2 topics does jack mostly contribute?
>>> topic_dist = model.fit(common_corpus).transform('jack')

"""
import numpy as np
from sklearn.base import TransformerMixin, BaseEstimator
@@ -18,17 +37,79 @@


class AuthorTopicTransformer(TransformerMixin, BaseEstimator):
"""
Base AuthorTopic module
"""
"""Base Author Topic module, wraps :class:`~gensim.models.atmodel.AuthorTopicModel`.

For more information on the inner workings please take a look at the original class. The model is
heavily based on `"The Author-Topic Model for Authors and Documents", Rosen-Zvi et al. 2004
<https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
[Review comment, Contributor] Maybe mention paper only in original class (not here), wdyt?

[Reply, Contributor Author] I will replace references with some text <http://...>_

"""
def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=None,
chunksize=2000, passes=1, iterations=50, decay=0.5, offset=1.0,
alpha='symmetric', eta='symmetric', update_every=1, eval_every=10,
gamma_threshold=0.001, serialized=False, serialization_path=None,
minimum_probability=0.01, random_state=None):
"""
Sklearn wrapper for AuthorTopic model. See gensim.models.AuthorTopicModel for parameter details.

Parameters
----------
num_topics : int, optional
Number of requested latent topics to be extracted from the training corpus.
id2word : dict of (int, str), optional
Mapping from a words' ID to the word itself. Used to determine the vocabulary size,
as well as for debugging and topic printing.
author2doc : dict of (str, list of int), optional
Maps an author's name to a list of IDs of the documents they contributed to.
Either `author2doc` or `doc2author` **MUST** be supplied.
doc2author : dict of (int, list of str), optional
Maps a document (using its ID) to a list of author names that contributed to it.
Either `author2doc` or `doc2author` **MUST** be supplied.
chunksize : int, optional
Number of documents to be processed by the model in each mini-batch.
passes : int, optional
Number of times the model can make a pass over the corpus during training.
iterations : int, optional
Maximum number of times the model loops over each document before convergence during the M step
of the EM algorithm.
decay : float, optional
A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten
when each new document is examined. Corresponds to Kappa from `"The Author-Topic Model for Authors
and Documents", Rosen-Zvi et al. 2004 <https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
offset : float, optional
Hyper-parameter that controls how much we will slow down the first few iterations.
Corresponds to Tau_0 from `"The Author-Topic Model for Authors and Documents", Rosen-Zvi et al. 2004
<https://mimno.infosci.cornell.edu/info6150/readings/398.pdf>`_.
alpha : {np.array, str}, optional
Can be set to a 1D array of length equal to the number of expected topics that expresses
our a-priori belief for each topic's probability.
Alternatively, default prior selecting strategies can be employed by supplying a string:
'asymmetric': Uses a fixed normalized asymmetric prior of `1.0 / topicno`.
'default': Learns an asymmetric prior from the corpus.
eta : {float, np.array, str}, optional
A-priori belief on word probability, this can be:
* scalar for a symmetric prior over topic/word probability,
* vector of length num_words to denote an asymmetric user-defined probability for each word,
* matrix of shape (num_topics, num_words) to assign a probability for each word-topic combination,
* the string 'auto' to learn the asymmetric prior from the data.
update_every : int, optional
Number of mini-batches between each model update.
eval_every : int, optional
Number of updates between two log perplexity estimates.
Set to None to disable perplexity estimation.
gamma_threshold : float, optional
Minimum change in the value of the gamma parameters to continue iterating.
serialized : bool, optional
Indicates whether the input corpora to the model are simple in-memory lists (`serialized = False`)
or saved to the hard-drive (`serialized = True`). Note that this behaviour is quite different from
other Gensim models. If your data is too large to fit into memory, use this functionality.
serialization_path : str, optional
Filepath to be used for storing the serialized object. **Must** be supplied if `serialized = True`.
An existing file *cannot* be overwritten; either delete the old file or choose a different name.
minimum_probability : float, optional
Topics with a probability lower than this threshold will be filtered out.
random_state : {np.random.RandomState, int}, optional
Either a randomState object or a seed to generate one. Useful for reproducibility.

"""
self.gensim_model = None
self.num_topics = num_topics
@@ -51,9 +132,18 @@ def __init__(self, num_topics=100, id2word=None, author2doc=None, doc2author=Non
self.random_state = random_state

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
Calls gensim.models.AuthorTopicModel
"""Fit the model according to the given training data.

Parameters
----------
X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`}
A collection of documents in BOW format used for training the model.

Returns
-------
:class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
The trained model.

"""
self.gensim_model = models.AuthorTopicModel(
corpus=X, num_topics=self.num_topics, id2word=self.id2word,
@@ -66,16 +156,25 @@ def fit(self, X, y=None):
return self

def transform(self, author_names):
"""Find the topic probabilities for each author.

Parameters
----------
author_names : iterable of str
A collection of authors whose topics will be identified.

Returns
-------
iterable of (int, float)
Topic distribution for each input author as a tuple of (topic_id, topic_probability).

"""
Return topic distribution for input authors as a list of
(topic_id, topic_probabiity) 2-tuples.
"""
# The input as array of array
if self.gensim_model is None:
raise NotFittedError(
"This model has not been fitted yet. Call 'fit' with appropriate arguments before using this method."
)

# The input as array of arrays
if not isinstance(author_names, list):
author_names = [author_names]
# returning dense representation for compatibility with sklearn
@@ -84,8 +183,29 @@ def transform(self, author_names):
return np.reshape(np.array(topics), (len(author_names), self.num_topics))

def partial_fit(self, X, author2doc=None, doc2author=None):
"""
Train model over X.
"""Train model over a potentially incomplete set of documents.

This method can be used in two ways:
* On an unfitted model, in which case the model is initialized and trained on `X`.
* On an already fitted model, in which case the model is **updated** by `X`.

Parameters
----------
X : {iterable of iterable of (int, int), :class:`~gensim.corpora.mmcorpus.MmCorpus`}
A collection of documents in BOW format used for training the model.
author2doc : dict of (str, list of int), optional
Maps an author's name to a list of document IDs corresponding to indexes in the input corpus.
Either `author2doc` or `doc2author` **MUST** be supplied.
doc2author : dict of (int, list of str), optional
Maps a document (using its ID) to a list of author names corresponding to indexes in the input corpus.
Either `author2doc` or `doc2author` **MUST** be supplied.

Returns
-------
:class:`~gensim.sklearn_api.atmodel.AuthorTopicTransformer`
The trained model.

"""
if self.gensim_model is None:
self.gensim_model = models.AuthorTopicModel(
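Both wrappers in this diff document the same sklearn contract: `transform` refuses to run before `fit`, a single input is promoted to a list, and the output is a dense array. A minimal sketch of that behaviour for the author-topic wrapper, reusing the fixtures from the module example above (assumed to contain the standard nine test documents):

>>> from gensim.test.utils import common_corpus, common_dictionary
>>> from gensim.sklearn_api.atmodel import AuthorTopicTransformer
>>> from sklearn.exceptions import NotFittedError
>>>
>>> author2doc = {'john': [0, 1, 2, 3, 4, 5, 6], 'jane': [2, 3, 4, 5, 6, 7, 8], 'jack': [0, 2, 4, 6, 8]}
>>> model = AuthorTopicTransformer(id2word=common_dictionary, author2doc=author2doc, num_topics=2)
>>>
>>> # Calling transform before fit raises, as the guard added in this diff ensures.
>>> try:
...     model.transform('jack')
... except NotFittedError:
...     print('call fit first')
call fit first
>>>
>>> # A single author name or a list of names both work; the result is a dense
>>> # array of shape (n_authors, num_topics).
>>> model.fit(common_corpus).transform(['jack', 'jane']).shape
(2, 2)
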
149 changes: 133 additions & 16 deletions gensim/sklearn_api/d2vmodel.py
@@ -4,9 +4,29 @@
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""
Scikit learn interface for gensim for easy use of gensim with scikit-learn
Follows scikit-learn API conventions
"""Scikit learn interface for :class:`~gensim.models.doc2vec.Doc2Vec`.

Follows scikit-learn API conventions to facilitate using gensim along with scikit-learn.

Examples
--------

>>> from gensim.test.utils import common_texts
>>> from gensim.sklearn_api import D2VTransformer
>>> from gensim.similarities import Similarity
>>>
>>> # Let's represent each document using a 5-dimensional vector.
>>> model = D2VTransformer(min_count=1, size=5)
>>> docvecs = model.fit_transform(common_texts)
>>>
>>> # Let's use the vector representations to compute similarities with one of the documents.
>>> index = Similarity(None, docvecs, num_features=5)
>>>
>>> # Which document is most similar to the last one in the corpus? Probably itself!
>>> result = index[docvecs[8]]
>>> result.argmax()
8

"""

import numpy as np
@@ -19,16 +39,97 @@


class D2VTransformer(TransformerMixin, BaseEstimator):
"""
Base Doc2Vec module
"""Base Dov2Vec module.

Wraps :class:`~gensim.models.doc2vec.Doc2Vec`.
For more information on the inner workings please take a look at
the original class.

"""

def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None,
docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1,
hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000):
"""
Sklearn api for Doc2Vec model. See gensim.models.Doc2Vec and gensim.models.Word2Vec for parameter details.
"""Sklearn api for Doc2Vec model.

Parameters
----------

dm_mean : int {1,0}, optional
If 0, use the sum of the context word vectors. If 1, use the mean.
Only applies when `dm` is used in non-concatenative mode.
dm : int {1,0}, optional
Defines the training algorithm. If `dm=1`, 'distributed memory' (PV-DM) is used.
Otherwise, `distributed bag of words` (PV-DBOW) is employed.
dbow_words : int {1,0}, optional
If set to 1, trains word-vectors (in skip-gram fashion) simultaneously with DBOW
doc-vector training; if 0, only trains doc-vectors (faster).
dm_concat : int {1,0}, optional
If 1, use concatenation of context vectors rather than sum/average;
Note concatenation results in a much-larger model, as the input
is no longer the size of one (sampled or arithmetically combined) word vector, but the
size of the tag(s) and all words in the context strung together.
dm_tag_count : int, optional
Expected constant number of document tags per document, when using
dm_concat mode; default is 1.
docvecs : :class:`~gensim.models.keyedvectors.Doc2VecKeyedVectors`
A mapping from a string or int tag to its vector representation.
Either this or `docvecs_mapfile` **MUST** be supplied.
docvecs_mapfile : str, optional
Path to a file containing the docvecs mapping.
If `docvecs` is None, this file will be used to create it.
comment : str, optional
A model descriptive comment, used for logging and debugging purposes.
trim_rule : callable ((str, int, int) -> int), optional
Vocabulary trimming rule that accepts (word, count, min_count).
Specifies whether certain words should remain in the vocabulary (:attr:`gensim.utils.RULE_KEEP`),
be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default
(:attr:`gensim.utils.RULE_DEFAULT`). If None, then :func:`~gensim.utils.keep_vocab_item` will be used.
Note: The rule, if given, is only used to prune vocabulary during build_vocab()
and is not stored as part of the model.
size : int, optional
Dimensionality of the feature vectors.
alpha : float, optional
The initial learning rate.
window : int, optional
The maximum distance between the current and predicted word within a sentence.
min_count : int, optional
Ignores all words with total frequency lower than this.
max_vocab_size : int, optional
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
sample : float, optional
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
seed : int, optional
Seed for the random number generator. Initial vectors for each word are seeded with a hash of
the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run,
you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter
from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires
use of the `PYTHONHASHSEED` environment variable to control hash randomization).
workers : int, optional
Use this many worker threads to train the model. Will yield a speedup when training with multicore machines.
min_alpha : float, optional
Learning rate will linearly drop to `min_alpha` as training progresses.
hs : int {1,0}, optional
If 1, hierarchical softmax will be used for model training.
If set to 0, and `negative` is non-zero, negative sampling will be used.
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20). If set to 0, no negative sampling is used.
cbow_mean : int, optional
Same as `dm_mean`, unused.
hashfxn : callable (object -> int), optional
A hashing function. Used to create an initial random reproducible vector by hashing the random seed.
iter : int, optional
Number of epochs to iterate through the corpus.
sorted_vocab : bool, optional
Whether the vocabulary should be sorted internally.
batch_words : int, optional
Number of words to be handled by each job.

"""
self.gensim_model = None
self.dm_mean = dm_mean
@@ -60,9 +161,19 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1
self.batch_words = batch_words

def fit(self, X, y=None):
"""
Fit the model according to the given training data.
Calls gensim.models.Doc2Vec
"""Fit the model according to the given training data.

Parameters
----------
X : {iterable of :class:`~gensim.models.doc2vec.TaggedDocument`, iterable of iterable of str}
A collection of tagged documents used for training the model.
If these are not tagged, their integer order index will be used to tag them.

Returns
-------
:class:`~gensim.sklearn_api.d2vmodel.D2VTransformer`
The trained model.

"""
if isinstance(X[0], doc2vec.TaggedDocument):
d2v_sentences = X
Expand All @@ -81,12 +192,18 @@ def fit(self, X, y=None):
return self

def transform(self, docs):
"""
Return the vector representations for the input documents.
The input `docs` should be a list of lists like
[['calculus', 'mathematical'],
['geometry', 'operations', 'curves']]
or a single document like : ['calculus', 'mathematical']
"""Get the vector representations for the input documents.

Parameters
----------
docs : iterable of iterable of str
The input corpus.

Returns
-------
np.array of shape (`len(docs)`, `size`)
The vector representation of the input corpus.

"""
if self.gensim_model is None:
raise NotFittedError(
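Since the docstrings now spell out standard `fit`/`transform` semantics, the D2VTransformer drops straight into an sklearn Pipeline. A hedged sketch (the labels below are invented for illustration, and `common_texts` is assumed to hold the usual nine toy documents):

>>> import numpy as np
>>> from sklearn.linear_model import LogisticRegression
>>> from sklearn.pipeline import Pipeline
>>> from gensim.test.utils import common_texts
>>> from gensim.sklearn_api import D2VTransformer
>>>
>>> # Hypothetical binary labels, one per document in common_texts.
>>> labels = np.array([1, 1, 1, 0, 0, 0, 0, 1, 0])
>>>
>>> pipeline = Pipeline([
...     ('vecs', D2VTransformer(min_count=1, size=5, seed=1, workers=1)),
...     ('clf', LogisticRegression()),
... ])
>>> _ = pipeline.fit(common_texts, labels)
>>>
>>> # Mean training accuracy on this toy corpus; a smoke test, not an evaluation.
>>> score = pipeline.score(common_texts, labels)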