diff --git a/continuous_integration/travis/flake8_diff.sh b/continuous_integration/travis/flake8_diff.sh index 36a2a2d493..c297064392 100755 --- a/continuous_integration/travis/flake8_diff.sh +++ b/continuous_integration/travis/flake8_diff.sh @@ -134,6 +134,6 @@ check_files() { if [[ "$MODIFIED_FILES" == "no_match" ]]; then echo "No file has been modified" else - check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*,*.c,*.pyx" + check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.txt,*.csv,*.vec,Dockerfile*,*.c,*.pyx,*.inc" fi echo -e "No problem detected by flake8\n" diff --git a/docs/src/about.rst b/docs/src/about.rst index 294c60d52c..64a65bd333 100644 --- a/docs/src/about.rst +++ b/docs/src/about.rst @@ -1,3 +1,5 @@ +:orphan: + .. _about: ============ diff --git a/docs/src/changes_080.rst b/docs/src/changes_080.rst index be5df9ad15..b038ccb930 100644 --- a/docs/src/changes_080.rst +++ b/docs/src/changes_080.rst @@ -1,3 +1,5 @@ +:orphan: + .. _changes_080: Change Set for 0.8.0 diff --git a/docs/src/conf.py b/docs/src/conf.py index d2417fe5aa..9d21422c50 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -16,7 +16,7 @@ # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -#sys.path.append(os.path.abspath('.')) +sys.path.append(os.path.abspath('.')) # -- General configuration ----------------------------------------------------- @@ -139,7 +139,7 @@ # The name of an image file (within the static path) to use as favicon of the # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 # pixels large. 
-html_favicon = 'favicon.ico' +html_favicon = '_static/favicon.ico' # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, @@ -215,3 +215,5 @@ # If false, no module index is generated. #latex_use_modindex = True + +suppress_warnings = ['image.nonlocal_uri', 'ref.citation', 'ref.footnote'] diff --git a/docs/src/corpora/corpora.rst b/docs/src/corpora/corpora.rst index 3ea5151c96..f92a68af71 100644 --- a/docs/src/corpora/corpora.rst +++ b/docs/src/corpora/corpora.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`corpora` -- Package for corpora I/O ========================================== diff --git a/docs/src/models/models.rst b/docs/src/models/models.rst index f18032b7ee..0ac3b30831 100644 --- a/docs/src/models/models.rst +++ b/docs/src/models/models.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`models` -- Package for transformation models ====================================================== diff --git a/docs/src/models/wrappers/wrappers.rst b/docs/src/models/wrappers/wrappers.rst index e6acac5448..9746202d6d 100644 --- a/docs/src/models/wrappers/wrappers.rst +++ b/docs/src/models/wrappers/wrappers.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`models.wrappers` -- Package for transformation models via external programs ================================================================================= diff --git a/docs/src/similarities/simserver.rst b/docs/src/similarities/simserver.rst index 86a529b1c6..636ba663f4 100644 --- a/docs/src/similarities/simserver.rst +++ b/docs/src/similarities/simserver.rst @@ -1,3 +1,5 @@ +:orphan: + :mod:`simserver` -- Document similarity server ============================================== diff --git a/docs/src/simserver.rst b/docs/src/simserver.rst index f4abed868e..1b0d2b4396 100644 --- a/docs/src/simserver.rst +++ b/docs/src/simserver.rst @@ -1,3 +1,5 @@ +:orphan: + .. 
_simserver: Document Similarity Server diff --git a/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst b/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst index 95c100c4b1..585b8fc3dc 100644 --- a/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst +++ b/docs/src/sklearn_integration/sklearn_wrapper_gensim_ldamodel.rst @@ -1,7 +1,7 @@ -:mod:`sklearn_integration.sklearn_wrapper_gensim_ldamodel.SklearnWrapperLdaModel` -- Scikit learn wrapper for Latent Dirichlet Allocation +:mod:`sklearn_integration.sklearn_wrapper_gensim_ldamodel` -- Scikit learn wrapper for Latent Dirichlet Allocation ========================================================================================================================================= -.. automodule:: gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel.SklearnWrapperLdaModel +.. automodule:: gensim.sklearn_integration.sklearn_wrapper_gensim_ldamodel :synopsis: Scikit learn wrapper for LDA model :members: :inherited-members: diff --git a/docs/src/summarization/textcleaner.rst b/docs/src/summarization/textcleaner.rst index dddaedcbbe..72eda3d779 100644 --- a/docs/src/summarization/textcleaner.rst +++ b/docs/src/summarization/textcleaner.rst @@ -7,4 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: -b + diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 8770284178..335f2af7f0 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -533,10 +533,10 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, corpus (gensim corpus): The corpus with which the author-topic model should be updated. author2doc (dictionary): author to document mapping corresponding to indexes in input - corpus. + corpus. doc2author (dictionary): document to author mapping corresponding to indexes in input - corpus. + corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np array of not. 
np can in some settings turn the term IDs diff --git a/gensim/models/coherencemodel.py b/gensim/models/coherencemodel.py index d35a266a4a..8556db1c45 100644 --- a/gensim/models/coherencemodel.py +++ b/gensim/models/coherencemodel.py @@ -9,13 +9,13 @@ the four stage topic coherence pipeline from the paper [1]_. The four stage pipeline is basically: -Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation. + Segmentation -> Probability Estimation -> Confirmation Measure -> Aggregation. Implementation of this pipeline allows for the user to in essence "make" a coherence measure of his/her choice by choosing a method in each of the pipelines. .. [1] Michael Roeder, Andreas Both and Alexander Hinneburg. Exploring the space of topic -coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf. + coherence measures. http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf. """ import logging @@ -114,40 +114,47 @@ def __init__(self, model=None, topics=None, texts=None, corpus=None, dictionary= window_size=None, coherence='c_v', topn=10, processes=-1): """ Args: - ---- - model : Pre-trained topic model. Should be provided if topics is not provided. + model : Pre-trained topic model. Should be provided if topics is not provided. Currently supports LdaModel, LdaMallet wrapper and LdaVowpalWabbit wrapper. Use 'topics' parameter to plug in an as yet unsupported model. - topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. eg:: - topics = [['human', 'machine', 'computer', 'interface'], + topics : List of tokenized topics. If this is preferred over model, dictionary should be provided. + eg:: + + topics = [['human', 'machine', 'computer', 'interface'], ['graph', 'trees', 'binary', 'widths']] - texts : Tokenized texts. Needed for coherence models that use sliding window based probability estimator, eg:: - texts = [['system', 'human', 'system', 'eps'], + + texts : Tokenized texts. 
Needed for coherence models that use sliding window based probability estimator, + eg:: + + texts = [['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees'], ['graph', 'minors', 'trees'], ['graph', 'minors', 'survey']] - corpus : Gensim document corpus. - dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, - this is not needed. If both are provided, dictionary will be used. - window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their - probability estimator. For 'u_mass' this doesn't matter. - If left 'None' the default window sizes are used which are: - 'c_v' : 110 - 'c_uci' : 10 - 'c_npmi' : 10 - coherence : Coherence measure to be used. Supported values are: - 'u_mass' - 'c_v' - 'c_uci' also popularly known as c_pmi - 'c_npmi' - For 'u_mass' corpus should be provided. If texts is provided, it will be converted - to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. - Corpus is not needed. - topn : Integer corresponding to the number of top words to be extracted from each topic. - processes : number of processes to use for probability estimation phase; any value less than 1 will be - interpreted to mean num_cpus - 1; default is -1. + + corpus : Gensim document corpus. + dictionary : Gensim dictionary mapping of id word to create corpus. If model.id2word is present, + this is not needed. If both are provided, dictionary will be used. + window_size : Is the size of the window to be used for coherence measures using boolean sliding window as their + probability estimator. For 'u_mass' this doesn't matter. + If left 'None' the default window sizes are used which are: + + 'c_v' : 110 + 'c_uci' : 10 + 'c_npmi' : 10 + + coherence : Coherence measure to be used. Supported values are: + 'u_mass' + 'c_v' + 'c_uci' also popularly known as c_pmi + 'c_npmi' + For 'u_mass' corpus should be provided. 
If texts is provided, it will be converted + to corpus using the dictionary. For 'c_v', 'c_uci' and 'c_npmi' texts should be provided. + Corpus is not needed. + topn : Integer corresponding to the number of top words to be extracted from each topic. + processes : number of processes to use for probability estimation phase; any value less than 1 will be + interpreted to mean num_cpus - 1; default is -1. """ if model is None and topics is None: raise ValueError("One of model or topics has to be provided.") diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d0f768b5f6..126a09431d 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -730,7 +730,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case dummy4unknown=False): """ Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where - lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter'. + lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. @@ -748,7 +748,7 @@ def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, case and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first occurrence (also the most frequent if vocabulary is sorted) is taken. - Use `dummy4unknown=True' to produce zero-valued similarities for pairs with out-of-vocabulary words. + Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words. Otherwise (default False), these pairs are skipped entirely. 
""" ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index aafa957eb8..0375eded69 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -981,16 +981,20 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10 `n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation) Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None), - where + where: + annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and `int_k` is word from intersection of `topic_i` and `topic_j` and `diff_l` is word from symmetric difference of `topic_i` and `topic_j` - `normed` is a flag. If `true`, matrix Z will be normalized + `normed` is a flag. If `true`, matrix Z will be normalized + Example: + >>> m1, m2 = LdaMulticore.load(path_1), LdaMulticore.load(path_2) >>> mdiff, annotation = m1.diff(m2) >>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2` >>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2` + """ distances = { diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 8e57489a89..1544aed84e 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -156,14 +156,14 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter, chunksize): """ fit an lda sequence model: + for each time period: + set up lda model with E[log p(w|z)] and \alpha - for each time period - set up lda model with E[log p(w|z)] and \alpha - for each document - perform posterior inference - update sufficient statistics/likelihood + for each document: + perform 
posterior inference + update sufficient statistics/likelihood - maximize topics + maximize topics """ LDASQE_EM_THRESHOLD = 1e-4 @@ -485,11 +485,13 @@ def compute_post_variance(self, word, chain_variance): This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance Computes Var[\beta_{t,w}] for t = 1:T - Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) + .. math:: - Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) - = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) + fwd\_variance[t] \equiv E((beta_{t,w}-mean_{t,w})^2 |beta_{t}\ for\ 1:t) = (obs\_variance / fwd\_variance[t - 1] + chain\_variance + obs\_variance ) * (fwd\_variance[t - 1] + obs\_variance) + + .. math:: + + variance[t] \equiv E((beta_{t,w}-mean\_cap_{t,w})^2 |beta\_cap_{t}\ for\ 1:t) = fwd\_variance[t - 1] + (fwd\_variance[t - 1] / fwd\_variance[t - 1] + obs\_variance)^2 * (variance[t - 1] - (fwd\_variance[t-1] + obs\_variance)) """ INIT_VARIANCE_CONST = 1000 diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index 3c22320d0c..865a32f7c3 100644 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -21,27 +21,25 @@ Initialize a model with e.g.:: ->>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) + >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) Persist a model to disk with:: ->>> model.save(fname) ->>> model = Word2Vec.load(fname) # you can continue training with the loaded model! + >>> model.save(fname) + >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! -The word vectors are stored in a KeyedVectors instance in model.wv.
This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec. +The word vectors are stored in a KeyedVectors instance in model.wv. This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec:: >>> model.wv['computer'] # numpy vector of a word array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) The word vectors can also be instantiated from an existing file on disk in the word2vec C format as a KeyedVectors instance:: + NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing:: -NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, vocabulary frequency and the binary tree is missing. - - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format + >>> from gensim.models.keyedvectors import KeyedVectors + >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format + >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format You can perform various NLP word tasks with the model. Some of them @@ -87,8 +85,8 @@ detect phrases longer than one word. Using phrases, you can learn a word2vec model where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: ->>> bigram_transformer = gensim.models.Phrases(sentences) ->>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) + >>> bigram_transformer = gensim.models.Phrases(sentences) + >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) .. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. 
Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. .. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. Distributed Representations of Words and Phrases and their Compositionality. diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py index 94a2e5eb1a..5eff091417 100644 --- a/gensim/models/wrappers/dtmmodel.py +++ b/gensim/models/wrappers/dtmmodel.py @@ -341,8 +341,9 @@ def dtm_coherence(self, time, num_words=20): """ returns all topics of a particular time-slice without probabilitiy values for it to be used for either "u_mass" or "c_v" coherence. - TODO: because of print format right now can only return for 1st time-slice. - should we fix the coherence printing or make changes to the print statements to mirror DTM python? + TODO: + because of print format right now can only return for 1st time-slice. + should we fix the coherence printing or make changes to the print statements to mirror DTM python? """ coherence_topics = [] for topic_no in range(0, self.num_topics): diff --git a/gensim/models/wrappers/ldamallet.py b/gensim/models/wrappers/ldamallet.py index 640cf11dd8..5276b035f1 100644 --- a/gensim/models/wrappers/ldamallet.py +++ b/gensim/models/wrappers/ldamallet.py @@ -21,8 +21,8 @@ Example: ->>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) ->>> print model[my_vector] # print LDA topics of a document + >>> model = gensim.models.wrappers.LdaMallet('/Users/kofola/mallet-2.0.7/bin/mallet', corpus=my_corpus, num_topics=20, id2word=dictionary) + >>> print model[my_vector] # print LDA topics of a document .. [1] http://mallet.cs.umass.edu/ @@ -359,14 +359,12 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50): gensim model. Args: - ---- - mallet_model : Trained mallet model - gamma_threshold : To be used for inference in the new LdaModel. 
- iterations : number of iterations to be used for inference in the new LdaModel. + mallet_model : Trained mallet model + gamma_threshold : To be used for inference in the new LdaModel. + iterations : number of iterations to be used for inference in the new LdaModel. Returns: - ------- - model_gensim : LdaModel instance; copied gensim LdaModel + model_gensim : LdaModel instance; copied gensim LdaModel """ model_gensim = LdaModel( id2word=mallet_model.id2word, num_topics=mallet_model.num_topics, diff --git a/gensim/models/wrappers/ldavowpalwabbit.py b/gensim/models/wrappers/ldavowpalwabbit.py index 7ae040293c..6d6ae9e275 100644 --- a/gensim/models/wrappers/ldavowpalwabbit.py +++ b/gensim/models/wrappers/ldavowpalwabbit.py @@ -498,9 +498,9 @@ def corpus_to_vw(corpus): character. E.g.: - | 4:7 14:1 22:8 6:3 - | 14:22 22:4 0:1 1:3 - | 7:2 8:2 + | 4:7 14:1 22:8 6:3 + | 14:22 22:4 0:1 1:3 + | 7:2 8:2 """ for entries in corpus: line = ['|'] @@ -568,13 +568,11 @@ def vwmodel2ldamodel(vw_model, iterations=50): vwmodel into the gensim model. Args: - ---- - vw_model : Trained vowpal wabbit model. - iterations : Number of iterations to be used for inference of the new LdaModel. + vw_model : Trained vowpal wabbit model. + iterations : Number of iterations to be used for inference of the new LdaModel. Returns: - ------- - model_gensim : LdaModel instance; copied gensim LdaModel. + model_gensim : LdaModel instance; copied gensim LdaModel. 
""" model_gensim = LdaModel( num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize, diff --git a/gensim/models/wrappers/wordrank.py b/gensim/models/wrappers/wordrank.py index 356be3051c..dbcca9ebb9 100644 --- a/gensim/models/wrappers/wordrank.py +++ b/gensim/models/wrappers/wordrank.py @@ -58,9 +58,11 @@ def train(cls, wr_path, corpus_file, out_name, size=100, window=15, symmetric=1, Expects file to contain space-separated tokens in a single line `out_name` is name of the directory which will be created (in wordrank folder) to save embeddings and training data. It will contain following contents: - Word Embeddings saved after every dump_period and stored in a file model_word_"current iter".txt - Context Embeddings saved after every dump_period and stored in a file model_context_"current iter".txt + + Word Embeddings saved after every dump_period and stored in a file model_word_current\ iter.txt + Context Embeddings saved after every dump_period and stored in a file model_context_current\ iter.txt A meta directory which contain: 'vocab.txt' - vocab words, 'wiki.toy' - word-word coccurence values, 'meta' - vocab and coccurence lengths + `size` is the dimensionality of the feature vectors. `window` is the number of context words to the left (and to the right, if symmetric = 1). `symmetric` if 0, only use left context words, else use left and right both. 
diff --git a/gensim/scripts/glove2word2vec.py b/gensim/scripts/glove2word2vec.py index 7709c48714..8d3d1cb02f 100644 --- a/gensim/scripts/glove2word2vec.py +++ b/gensim/scripts/glove2word2vec.py @@ -6,7 +6,8 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -USAGE: $ python -m gensim.scripts.glove2word2vec --input --output +USAGE: + $ python -m gensim.scripts.glove2word2vec --input --output Where: : Input GloVe .txt file : Desired name of output Word2vec .txt file @@ -38,7 +39,7 @@ def get_glove_info(glove_file_name): def glove2word2vec(glove_input_file, word2vec_output_file): - """Convert `glove_input_file` in GloVe format into `word2vec_output_file in word2vec format.""" + """Convert `glove_input_file` in GloVe format into `word2vec_output_file` in word2vec format.""" num_lines, num_dims = get_glove_info(glove_input_file) logger.info("converting %i vectors from %s to %s", num_lines, glove_input_file, word2vec_output_file) with smart_open(word2vec_output_file, 'wb') as fout: diff --git a/gensim/similarities/docsim.py b/gensim/similarities/docsim.py index d61c64636e..2a1a9512ea 100755 --- a/gensim/similarities/docsim.py +++ b/gensim/similarities/docsim.py @@ -572,7 +572,6 @@ class WmdSimilarity(interfaces.SimilarityABC): >>> # Given a document collection "corpus", train word2vec model. >>> model = word2vec(corpus) >>> instance = WmdSimilarity(corpus, model, num_best=10) - >>> # Make query. >>> query = 'Very good, you should seat outdoor.' >>> sims = instance[query] @@ -582,8 +581,7 @@ def __init__(self, corpus, w2v_model, num_best=None, normalize_w2v_and_replace=T corpus: List of lists of strings, as in gensim.models.word2vec. w2v_model: A trained word2vec model. num_best: Number of results to retrieve. - normalize_w2v_and_replace: Whether or not to normalize the word2vec vectors to - length 1. + normalize_w2v_and_replace: Whether or not to normalize the word2vec vectors to length 1. 
""" self.corpus = corpus self.w2v_model = w2v_model diff --git a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py index 1ad1fabccf..de7b40d825 100644 --- a/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py +++ b/gensim/sklearn_integration/sklearn_wrapper_gensim_ldamodel.py @@ -114,9 +114,11 @@ def partial_fit(self, X): Train model over X. By default, 'online (single-pass)' mode is used for training the LDA model. Configure `passes` and `update_every` params at init to choose the mode among : + - online (single-pass): update_every != None and passes == 1 - online (multi-pass): update_every != None and passes > 1 - batch: update_every == None + """ if sparse.issparse(X): X = matutils.Sparse2Corpus(X) diff --git a/gensim/summarization/summarizer.py b/gensim/summarization/summarizer.py index e749b4cc66..c067c23faf 100644 --- a/gensim/summarization/summarizer.py +++ b/gensim/summarization/summarizer.py @@ -141,7 +141,6 @@ def summarize_corpus(corpus, ratio=0.2): The most important documents are returned as a list sorted by the document score, highest first. - """ % INPUT_MIN_LENGTH hashable_corpus = _build_hasheable_corpus(corpus) @@ -184,11 +183,13 @@ def summarize(text, ratio=0.2, word_count=None, split=False): The length of the output can be specified using the ratio and word_count parameters: + ratio should be a number between 0 and 1 that determines the - percentage of the number of sentences of the original text to be - chosen for the summary (defaults at 0.2). + percentage of the number of sentences of the original text to be + chosen for the summary (defaults at 0.2). word_count determines how many words will the output contain. - If both parameters are provided, the ratio will be ignored. + If both parameters are provided, the ratio will be ignored. + """ # Gets a list of processed sentences. 
sentences = _clean_text_by_sentences(text) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 7c345d8812..341834c92f 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -20,11 +20,9 @@ def arithmetic_mean(confirmed_measures): the confirmation measure module. Args: - ---- - confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. + confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. Returns: - ------- - mean : Arithmetic mean of all the values contained in confirmation measures. + mean : Arithmetic mean of all the values contained in confirmation measures. """ return np.mean(confirmed_measures) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 467d134f29..26e86065fb 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -24,14 +24,12 @@ def log_conditional_probability(segmented_topics, accumulator): This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] Args: - ---- - segmented_topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. + segmented_topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. + accumulator: word occurrence accumulator from probability_estimation. Returns: - ------- - m_lc : List of log conditional probability measure for each topic. + m_lc : List of log conditional probability measure for each topic. 
""" m_lc = [] num_docs = float(accumulator.num_docs) @@ -65,14 +63,12 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False): This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] Args: - ---- - segmented topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. + segmented topics : Output from the segmentation module of the segmented topics. + Is a list of list of tuples. + accumulator: word occurrence accumulator from probability_estimation. Returns: - ------- - m_lr : List of log ratio measures for each topic. + m_lr : List of log ratio measures for each topic. """ m_lr = [] num_docs = float(accumulator.num_docs) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 07f221e941..241b96befc 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -38,29 +38,29 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1): """ This function calculates the indirect cosine measure. Given context vectors - _ _ _ _ u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect - _ _ - cosine measure is computed as the cosine similarity between u and w. The formula used is: + cosine measure is computed as the cosine similarity between u and w. 
- m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + The formula used is: - where each vector \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + + where each vector: + + \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Args: - ---- - segmented_topics : Output from the segmentation module of the segmented topics. - Is a list of list of tuples. - accumulator : Output from the probability_estimation module. - Is an accumulator of word occurrences (see text_analysis module). - topics : Topics obtained from the trained topic model. - measure : String. Direct confirmation measure to be used. - Supported values are "nlr" (normalized log ratio). - gamma : Gamma value for computing W', W* vectors; default is 1. + + segmented_topics : Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator : Output from the probability_estimation module. Is an accumulator of word occurrences (see text_analysis module). + topics : Topics obtained from the trained topic model. + measure : String. Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + gamma : Gamma value for computing W', W* vectors; default is 1. Returns: - ------- - s_cos_sim : list of indirect cosine similarity measure for each topic. + + s_cos_sim : list of indirect cosine similarity measure for each topic. 
+ """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 85e787de18..7832494a5c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -23,14 +23,12 @@ def p_boolean_document(corpus, segmented_topics): of documents in which the word occurs divided by the total number of documents. Args: - ---- - corpus : The corpus of documents. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. + corpus : The corpus of documents. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. Returns: - ------- - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) @@ -44,16 +42,14 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p documents to compute word probabilities. Args: - ---- - texts : List of string sentences. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - dictionary : Gensim dictionary mapping of the tokens and ids. - window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. + texts : List of string sentences. + segmented_topics : Output from the segmentation of topics. Could be simply topics too. + dictionary : Gensim dictionary mapping of the tokens and ids. + window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. 
Returns: - ------- - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. + accumulator : word occurrence accumulator instance that can be used to lookup token + frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -68,11 +64,10 @@ def unique_ids_from_segments(segmented_topics): """Return the set of all unique ids in a list of segmented topics. Args: - ---- - segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set - is either a single integer, or a `numpy.ndarray` of integers. + segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set + is either a single integer, or a `numpy.ndarray` of integers. Returns: - unique_ids : set of unique ids across all topic segments. + unique_ids : set of unique ids across all topic segments. """ unique_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 9a2a58b060..4845a26859 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -16,8 +16,7 @@ def s_one_pre(topics): """ This function performs s_one_pre segmentation on a list of topics. - s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; - W* = {w_j}; w_i, w_j belongs to W; i > j} + s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} Example: >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] @@ -25,12 +24,10 @@ def s_one_pre(topics): [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] Args: - ---- - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. 
Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - ------- - s_one_pre : list of list of (W', W*) tuples for all unique topic ids + s_one_pre : list of list of (W', W*) tuples for all unique topic ids """ s_one_pre = [] @@ -46,8 +43,7 @@ def s_one_pre(topics): def s_one_one(topics): """ This function performs s_one_one segmentation on a list of topics. - s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; - W* = {w_j}; w_i, w_j belongs to W; i != j} + s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} Example: >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] @@ -55,12 +51,10 @@ def s_one_one(topics): [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] Args: - ---- - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - ------- - s_one_one : list of list of (W', W*) tuples for all unique topic ids + s_one_one : list of list of (W', W*) tuples for all unique topic ids """ s_one_one = [] @@ -79,8 +73,7 @@ def s_one_one(topics): def s_one_set(topics): """ This function performs s_one_set segmentation on a list of topics. - s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; - W* = W} + s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} Example: >>> topics = [np.array([9, 10, 7]) >>> s_one_set(topics) @@ -89,12 +82,10 @@ def s_one_set(topics): (7, array([ 9, 10, 7]))]] Args: - ---- - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + topics : list of topics obtained from an algorithm such as LDA. 
Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns: - ------- - s_one_set : list of list of (W', W*) tuples for all unique topic ids. + s_one_set : list of list of (W', W*) tuples for all unique topic ids. """ s_one_set = [] diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 1be0574d7b..20aee84906 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -29,9 +29,8 @@ def _ids_to_words(ids, dictionary): This function abstracts away the differences between the HashDictionary and the standard one. Args: - ---- - ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). - This is the format returned by the topic_coherence.segmentation functions. + ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). + This is the format returned by the topic_coherence.segmentation functions. """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) @@ -169,9 +168,8 @@ class WindowedTextsAnalyzer(UsesDictionary): def __init__(self, relevant_ids, dictionary): """ Args: - ---- - relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: Dictionary instance with mappings for the relevant_ids. + relevant_ids: the set of words that occurrences should be accumulated for. + dictionary: Dictionary instance with mappings for the relevant_ids. """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token @@ -302,11 +300,10 @@ class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): def __init__(self, processes, *args, **kwargs): """ Args: - ---- - processes : number of processes to use; must be at least two. 
- args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 64. + processes : number of processes to use; must be at least two. + args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). + kwargs : can include `batch_size`, which is the number of docs to send to a worker at a + time. If not included, it defaults to 64. """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: diff --git a/gensim/utils.py b/gensim/utils.py index 55da2bffa0..387a9a4193 100644 --- a/gensim/utils.py +++ b/gensim/utils.py @@ -1214,13 +1214,13 @@ def strided_windows(ndarray, window_size): [5, 6, 7, 8, 9]]) Args: - ---- - ndarray: either a numpy.ndarray or something that can be converted into one. - window_size: sliding window size. - :param window_size: - :return: numpy.ndarray of the subsequences produced by sliding a window of the given size over - the `ndarray`. Since this uses striding, the individual arrays are views rather than - copies of `ndarray`. Changes to one view modifies the others and the original. + ndarray: either a numpy.ndarray or something that can be converted into one. + window_size: sliding window size. + + Returns: + numpy.ndarray of the subsequences produced by sliding a window of the given size over + the `ndarray`. Since this uses striding, the individual arrays are views rather than + copies of `ndarray`. Changes to one view modifies the others and the original. """ ndarray = np.asarray(ndarray) if window_size == ndarray.shape[0]: @@ -1240,12 +1240,11 @@ def iter_windows(texts, window_size, copy=False, ignore_below_size=True, include instead, pass `copy=True`. Args: - ---- - texts: List of string sentences. - window_size: Size of sliding window. - copy: False to use views of the texts (default) or True to produce deep copies. 
- ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). - If False, the documents below `window_size` will be yielded as the full document. + texts: List of string sentences. + window_size: Size of sliding window. + copy: False to use views of the texts (default) or True to produce deep copies. + ignore_below_size: ignore documents that are not at least `window_size` in length (default behavior). + If False, the documents below `window_size` will be yielded as the full document. """ for doc_num, document in enumerate(texts):