From 29a8a375da19cc3a5853dde1dbc425007d4705c9 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 14 Nov 2017 02:03:30 +0500 Subject: [PATCH 01/39] Refactored aggregation --- gensim/topic_coherence/aggregation.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 065943a28f..b486dda307 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -4,10 +4,8 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform aggregation on a list of values -obtained from the confirmation measure. -""" +"""This module contains functions to perform aggregation on a list of values +obtained from the confirmation measure.""" import logging import numpy as np @@ -17,13 +15,24 @@ def arithmetic_mean(confirmed_measures): """ - This functoin performs the arithmetic mean aggregation on the output obtained from + Perform the arithmetic mean aggregation on the output obtained from the confirmation measure module. - Args: - confirmed_measures : list of calculated confirmation measure on each set in the segmented topics. + Parameters + ---------- + confirmed_measures : list + List of calculated confirmation measure on each set in the segmented topics. + + Returns + ------- + float + Arithmetic mean of all the values contained in confirmation measures. + + Examples + -------- + >>> from gensim.topic_coherence.aggregation import arithmetic_mean + >>> arithmetic_mean([1.1, 2.2, 3.3, 4.4]) + 2.75 - Returns: - mean : Arithmetic mean of all the values contained in confirmation measures. """ return np.mean(confirmed_measures) From 56eda2314678d83b336812cfb5e37b30d0be7d52 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Wed, 15 Nov 2017 01:50:37 +0500 Subject: [PATCH 02/39] Micro-Fix for aggregation.py, partially refactored direct_confirmation.py --- gensim/topic_coherence/aggregation.py | 2 +- .../direct_confirmation_measure.py | 115 +++++++++++++----- 2 files changed, 83 insertions(+), 34 deletions(-) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index b486dda307..48091085d3 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -25,7 +25,7 @@ def arithmetic_mean(confirmed_measures): Returns ------- - float + numpy.float Arithmetic mean of all the values contained in confirmation measures. Examples diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 0dc9dc30e8..260d7253e0 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -19,22 +19,43 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False): """ - This function calculates the log-conditional-probability measure + Calculate the log-conditional-probability measure which is used by coherence measures such as U_mass. This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] - Args: - segmented_topics (list): Output from the segmentation module of the segmented - topics. Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - with_std (bool): True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. 
- with_support (bool): True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. - - Returns: + Parameters + ---------- + segmented_topics : list + Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator : list + Word occurrence accumulator from probability_estimation. + with_std : bool + True to also include standard deviation across topic segment + sets in addition to the mean coherence for each topic; default is False. + with_support : bool + True to also include support across topic segments. The + support is defined as the number of pairwise similarity comparisons were + used to compute the overall topic coherence. + + Returns + ------- list : of log conditional probability measure for each topic. + + Examples + -------- + >>> from gensim.topic_coherence import direct_confirmation_measure,text_analysis + >>> from collections import namedtuple + >>> id2token = {1: 'test', 2: 'doc'} + >>> token2id = {v: k for k, v in id2token.items()} + >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + >>> segmentation = [[(1, 2)]] + >>> num_docs = 5 + >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + >>> accumulator._num_docs = num_docs + >>> direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] + Answer should be ~ ln(1 / 2) = -0.693147181 + """ topic_coherences = [] num_docs = float(accumulator.num_docs) @@ -59,14 +80,20 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): """Compute various statistics from the segment similarities generated via set pairwise comparisons of top-N word lists for a single topic. - Args: - segment_sims (iterable): floating point similarity values to aggregate. - with_std (bool): Set to True to include standard deviation. - with_support (bool): Set to True to include number of elements in `segment_sims` - as a statistic in the results returned. + Parameters + ---------- + segment_sims : iterable + floating point similarity values to aggregate. + with_std : bool + Set to True to include standard deviation. + with_support : bool + Set to True to include number of elements in `segment_sims` as a statistic in the results returned. + + Returns + ------- + tuple + tuple with (mean[, std[, support]]) - Returns: - tuple: with (mean[, std[, support]]) """ mean = np.mean(segment_sims) stats = [mean] @@ -83,27 +110,49 @@ def log_ratio_measure( """ If normalize=False: Popularly known as PMI. - This function calculates the log-ratio-measure which is used by + Calculate the log-ratio-measure which is used by coherence measures such as c_v. This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] If normalize=True: - This function calculates the normalized-log-ratio-measure, popularly knowns as + Calculate the normalized-log-ratio-measure, popularly knowns as NPMI which is used by coherence measures such as c_v. This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] - Args: - segmented_topics (list): Output from the segmentation module of the segmented - topics. Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - with_std (bool): True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. 
- with_support (bool): True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. - - Returns: - list : of log ratio measure for each topic. + Parameters + ---------- + segmented_topics : list + Output from the segmentation module of the segmented topics. Is a list of list of tuples. + accumulator: list + word occurrence accumulator from probability_estimation. + with_std : bool + True to also include standard deviation across topic segment + sets in addition to the mean coherence for each topic; default is False. + with_support : bool + True to also include support across topic segments. The + support is defined as the number of pairwise similarity comparisons were + used to compute the overall topic coherence. + + Returns + ------- + list + List of log ratio measure for each topic. + + Examples + -------- + >>> from gensim.topic_coherence import direct_confirmation_measure,text_analysis + >>> from collections import namedtuple + >>> id2token = {1: 'test', 2: 'doc'} + >>> token2id = {v: k for k, v in id2token.items()} + >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + >>> segmentation = [[(1, 2)]] + >>> num_docs = 5 + >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + >>> accumulator._num_docs = num_docs + >>> direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0] + Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 + """ topic_coherences = [] num_docs = float(accumulator.num_docs) From edd53d4419f8e15742cbc26b2fceff9e6e264024 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Thu, 16 Nov 2017 02:27:13 +0500 Subject: [PATCH 03/39] Partially refactored indirect_confirmation_measure --- .../direct_confirmation_measure.py | 4 +- .../indirect_confirmation_measure.py | 48 ++++++++++++++----- 2 files changed, 37 insertions(+), 15 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 260d7253e0..55f0d11216 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -121,8 +121,8 @@ def log_ratio_measure( Parameters ---------- - segmented_topics : list - Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented_topics : list of (list of tuples) + Output from the segmentation module of the segmented topics. accumulator: list word occurrence accumulator from probability_estimation. with_std : bool diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 33b42223bb..aae33291de 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -43,18 +43,40 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp """For each topic segmentation, compute average cosine similarity using a WordVectorsAccumulator. - Args: - segmented_topics (list): Output from the segmentation module of the segmented - topics. Is a list of list of tuples. - accumulator: word occurrence accumulator from probability_estimation. - with_std (bool): True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. 
- with_support (bool): True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. + Parameters + ---------- + segmented_topics : list of (list of tuples) + Output from the segmentation module of the segmented topics. + accumulator: list + Word occurrence accumulator from probability_estimation. + with_std : bool + True to also include standard deviation across topic segment + sets in addition to the mean coherence for each topic; default is False. + with_support : bool + True to also include support across topic segments. The + support is defined as the number of pairwise similarity comparisons were + used to compute the overall topic coherence. + + Returns + ------- + list + List of word2vec cosine similarities per topic. + + Examples + -------- + >>> from gensim.corpora.dictionary import Dictionary + >>> import numpy as np + >>> from gensim.topic_coherence import indirect_confirmation_measure + >>> from gensim.topic_coherence import text_analysis + >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} + >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) + >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0] + >>> print mean, std + 0.726752426218 0.00695475919227 - Returns: - list : of word2vec cosine similarities per topic. """ topic_coherences = [] total_oov = 0 @@ -88,8 +110,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp def cosine_similarity( segmented_topics, accumulator, topics, measure='nlr', gamma=1, with_std=False, with_support=False): - r""" - This function calculates the indirect cosine measure. + r"""Calculate the indirect cosine measure. Given context vectors u = V(W') and w = V(W*) for the word sets of a pair S_i = (W', W*) indirect cosine measure @@ -123,6 +144,7 @@ def cosine_similarity( Returns: list: of indirect cosine similarity measure for each topic. + """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) From cfd60507fde74196cae05f6b9ef87bf31887d1d1 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Fri, 17 Nov 2017 01:36:17 +0500 Subject: [PATCH 04/39] Some additions --- .../direct_confirmation_measure.py | 14 ++--- .../indirect_confirmation_measure.py | 63 +++++++++++++------ 2 files changed, 50 insertions(+), 27 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 55f0d11216..f33140a7c7 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -82,12 +82,12 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): Parameters ---------- - segment_sims : iterable - floating point similarity values to aggregate. - with_std : bool - Set to True to include standard deviation. - with_support : bool - Set to True to include number of elements in `segment_sims` as a statistic in the results returned. + segment_sims : iterable + floating point similarity values to aggregate. + with_std : bool + Set to True to include standard deviation. + with_support : bool + Set to True to include number of elements in `segment_sims` as a statistic in the results returned. 
Returns ------- @@ -124,7 +124,7 @@ def log_ratio_measure( segmented_topics : list of (list of tuples) Output from the segmentation module of the segmented topics. accumulator: list - word occurrence accumulator from probability_estimation. + Word occurrence accumulator from probability_estimation. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic; default is False. diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index aae33291de..20e3df3708 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -4,9 +4,10 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -r""" -This module contains functions to compute confirmation on a pair of words or word subsets. +r"""This module contains functions to compute confirmation on a pair of words or word subsets. +Notes +----- The advantage of indirect confirmation measure is that it computes similarity of words in W' and W* with respect to direct confirmations to all words. Eg. Suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are seldom mentioned @@ -25,6 +26,7 @@ \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Here 'm' is the direct confirmation measure used. + """ import itertools @@ -126,24 +128,45 @@ def cosine_similarity( \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} - Args: - segmented_topics: Output from the segmentation module of the - segmented topics. Is a list of list of tuples. - accumulator: Output from the probability_estimation module. Is an - accumulator of word occurrences (see text_analysis module). - topics: Topics obtained from the trained topic model. - measure (str): Direct confirmation measure to be used. Supported - values are "nlr" (normalized log ratio). - gamma: Gamma value for computing W', W* vectors; default is 1. - with_std (bool): True to also include standard deviation across topic - segment sets in addition to the mean coherence for each topic; - default is False. - with_support (bool): True to also include support across topic segments. - The support is defined as the number of pairwise similarity - comparisons were used to compute the overall topic coherence. - - Returns: - list: of indirect cosine similarity measure for each topic. + Parameters + ---------- + segmented_topics: list of (list of tuples) + Output from the segmentation module of the segmented topics. + accumulator: accumulator of word occurrences (see text_analysis module). + Output from the probability_estimation module. Is an topics: Topics obtained from the trained topic model. + measure : str + Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). + gamma: + Gamma value for computing W', W* vectors; default is 1. + with_std : bool + True to also include standard deviation across topic segment sets in addition to the mean coherence + for each topic; default is False. + with_support : bool + True to also include support across topic segments. The support is defined as the number of pairwise similarity + comparisons were used to compute the overall topic coherence. + + Returns + ------- + list + List of indirect cosine similarity measure for each topic. 
+ + Examples + -------- + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import indirect_confirmation_measure,text_analysis + >>> import numpy as np + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} + >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) + >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} + >>> accumulator._num_docs = 5 + >>> topics = [np.array([1, 2])] + >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] + >>> gamma = 1 + >>> measure = 'nlr' + >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, measure, gamma) + >>> print obtained[0] + 0.623018926945 """ context_vectors = ContextVectorComputer(measure, topics, accumulator, gamma) From 390b01eeebda0ab22db4117d515b33760d3ad40f Mon Sep 17 00:00:00 2001 From: CLearERR Date: Mon, 20 Nov 2017 01:13:04 +0500 Subject: [PATCH 05/39] Math attempts --- .../direct_confirmation_measure.py | 12 +++--- .../indirect_confirmation_measure.py | 39 ++++++++++++------- 2 files changed, 29 insertions(+), 22 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index f33140a7c7..6499174171 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -21,7 +21,7 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w """ Calculate the log-conditional-probability measure which is used by coherence measures such as U_mass. - This is defined as: m_lc(S_i) = log[(P(W', W*) + e) / P(W*)] + This is defined as :math:`m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]` Parameters ---------- @@ -49,10 +49,9 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w >>> token2id = {v: k for k, v in id2token.items()} >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) >>> segmentation = [[(1, 2)]] - >>> num_docs = 5 >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} - >>> accumulator._num_docs = num_docs + >>> accumulator._num_docs = 5 >>> direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] Answer should be ~ ln(1 / 2) = -0.693147181 @@ -112,12 +111,12 @@ def log_ratio_measure( Popularly known as PMI. Calculate the log-ratio-measure which is used by coherence measures such as c_v. - This is defined as: m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))] + This is defined as :math:`m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]` If normalize=True: Calculate the normalized-log-ratio-measure, popularly knowns as NPMI which is used by coherence measures such as c_v. 
- This is defined as: m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e] + This is defined as :math:`m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]` Parameters ---------- @@ -146,10 +145,9 @@ def log_ratio_measure( >>> token2id = {v: k for k, v in id2token.items()} >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) >>> segmentation = [[(1, 2)]] - >>> num_docs = 5 >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} - >>> accumulator._num_docs = num_docs + >>> accumulator._num_docs = 5 >>> direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0] Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 20e3df3708..ed7cb83f28 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -15,17 +15,17 @@ or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation measures may capture semantic support that direct measures would miss. -The formula used to compute indirect confirmation measure is +The formula used to compute indirect confirmation measure is .. math:: - m_{sim}_{(m, \gamma)}(W', W*) = - s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) -where s_sim can be cosine, dice or jaccard similarity and +where s_sim can be cosine, dice or jaccard similarity and .. math:: - \vec{V}^{\,}_{m,\gamma}(W') = - \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} + \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} -Here 'm' is the direct confirmation measure used. +Attributes: +----------- +m: direct confirmation measure. """ @@ -114,19 +114,17 @@ def cosine_similarity( with_std=False, with_support=False): r"""Calculate the indirect cosine measure. - Given context vectors u = V(W') and w = V(W*) for the - word sets of a pair S_i = (W', W*) indirect cosine measure + Given context vectors :math:`u = V(W') and w = V(W*)` for the + word sets of a pair :math:`S_i = (W', W*)` indirect cosine measure is computed as the cosine similarity between u and w. - The formula used is + The formula used is ..math:: - m_{sim}_{(m, \gamma)}(W', W*) = - s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + `m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))` - where each vector + where each vector ::math:: - \vec{V}^{\,}_{m,\gamma}(W') = - \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} + `\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}` Parameters ---------- @@ -189,6 +187,17 @@ class ContextVectorComputer(object): """Lazily compute context vectors for topic segments.""" def __init__(self, measure, topics, accumulator, gamma): + """ + Parameters + ---------- + measure: tuple + no idea + topics: list + no idea + accumulator : list + Word occurrence accumulator from probability_estimation. 
+ gamma: + """ if measure == 'nlr': self.similarity = _pair_npmi else: From 8b1a5ca16b639efec60a9e3547088b4b2dcf519a Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 20 Nov 2017 16:43:59 +0500 Subject: [PATCH 06/39] add math extension for sphinx --- .gitignore | 3 ++- docs/src/conf.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index f0ed3e97ff..04ac39346a 100644 --- a/.gitignore +++ b/.gitignore @@ -70,4 +70,5 @@ data *_out.txt *.html *.inv -*.js \ No newline at end of file +*.js +docs/_images diff --git a/docs/src/conf.py b/docs/src/conf.py index cce21dc95e..3a2220ce22 100644 --- a/docs/src/conf.py +++ b/docs/src/conf.py @@ -25,7 +25,7 @@ # Add any Sphinx extension module names here, as strings. They can be extensions # coming with Sphinx (named 'sphinx.ext.*') or your custom ones. -extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon'] +extensions = ['sphinx.ext.autodoc', 'sphinxcontrib.napoleon', 'sphinx.ext.imgmath'] autoclass_content = "both" # Add any paths that contain templates here, relative to this directory. From 8d2c5844211e8dec1ed503bb1f2ade2fc56e506a Mon Sep 17 00:00:00 2001 From: CLearERR Date: Wed, 22 Nov 2017 01:32:06 +0500 Subject: [PATCH 07/39] Minor refactoring --- .../direct_confirmation_measure.py | 3 + .../indirect_confirmation_measure.py | 64 +++++++++++++++---- 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 6499174171..5f08bc5884 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -93,6 +93,9 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): tuple tuple with (mean[, std[, support]]) + Examples: + --------- + in progress """ mean = np.mean(segment_sims) stats = [mean] diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index ed7cb83f28..8849beefad 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -15,11 +15,13 @@ or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation measures may capture semantic support that direct measures would miss. -The formula used to compute indirect confirmation measure is .. math:: +The formula used to compute indirect confirmation measure is +.. math:: m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) -where s_sim can be cosine, dice or jaccard similarity and .. math:: +where s_sim can be cosine, dice or jaccard similarity and +.. math:: \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} @@ -118,13 +120,15 @@ def cosine_similarity( word sets of a pair :math:`S_i = (W', W*)` indirect cosine measure is computed as the cosine similarity between u and w. - The formula used is ..math:: + The formula used is + .. math:: - `m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*))` + m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) - where each vector ::math:: + where each vector + .. 
math:: - `\vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|}` + \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} Parameters ---------- @@ -160,9 +164,7 @@ def cosine_similarity( >>> accumulator._num_docs = 5 >>> topics = [np.array([1, 2])] >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] - >>> gamma = 1 - >>> measure = 'nlr' - >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, measure, gamma) + >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1) >>> print obtained[0] 0.623018926945 @@ -191,12 +193,13 @@ def __init__(self, measure, topics, accumulator, gamma): Parameters ---------- measure: tuple - no idea + in progress topics: list - no idea + in progress accumulator : list Word occurrence accumulator from probability_estimation. gamma: + in progress """ if measure == 'nlr': self.similarity = _pair_npmi @@ -216,8 +219,23 @@ def __getitem__(self, idx): def compute_context_vector(self, segment_word_ids, topic_word_ids): """ - Step 1. Check if (segment_word_ids, topic_word_ids) context vector has been cached. - Step 2. If yes, return corresponding context vector, else compute, cache, and return. + Check if (segment_word_ids, topic_word_ids) context vector has been cached. + If yes, return corresponding context vector, else compute, cache, and return. + + Parameters + ---------- + segment_word_ids: in progress + + topic_word_ids: in progress + + Returns + ------- + in progress + + Examples: + --------- + In progress + """ key = _key_for_segment(segment_word_ids, topic_word_ids) context_vector = self.context_vector_cache.get(key, None) @@ -227,7 +245,25 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): return context_vector def _make_seg(self, segment_word_ids, topic_word_ids): - """Internal helper function to return context vectors for segmentations.""" + """Return context vectors for segmentations (Internal helper function). + + Parameters + ---------- + segment_word_ids : + + topic_word_ids : + + Returns + ------- + csr_matrix :class:`~scipy.sparse.csr` + Matrix in Compressed Sparse Row format + + Examples: + --------- + In progress + + """ + context_vector = sps.lil_matrix((self.vocab_size, 1)) if not hasattr(segment_word_ids, '__iter__'): segment_word_ids = (segment_word_ids,) From 6eb833500da8b75efcd5d19a7b326122ed27b30a Mon Sep 17 00:00:00 2001 From: CLearERR Date: Thu, 23 Nov 2017 01:04:23 +0500 Subject: [PATCH 08/39] Some refactoring for probability_estimation --- .../topic_coherence/probability_estimation.py | 37 +++++++++++++++---- 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index f59692bdcc..4aed7e7ee2 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -19,17 +19,40 @@ def p_boolean_document(corpus, segmented_topics): - """This function performs the boolean document probability estimation. + """Perform the boolean document probability estimation. Boolean document estimates the probability of a single word as the number of documents in which the word occurs divided by the total number of documents. - Args: - corpus : The corpus of documents. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. 
+ Parameters + ---------- + corpus : list + The corpus of documents. + segmented_topics : list of lists + Output from the segmentation of topics. Could be simply topics too. + + Returns + ------- + accumulator : + Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. + + Examples + --------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.corpora.dictionary import Dictionary + >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> dictionary = HashDictionary(texts) + >>> token2id = dictionary.token2id + >>> computer_id = token2id['computer'] + >>> system_id = token2id['system'] + >>> user_id = token2id['user'] + >>> graph_id = token2id['graph'] + >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ + >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> probability_estimation.p_boolean_document(corpus, segmented_topics) - Returns: - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) return CorpusAccumulator(top_ids).accumulate(corpus) From 7a47f058ada6fa00a60566328d4eb06461347640 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Fri, 24 Nov 2017 02:02:39 +0500 Subject: [PATCH 09/39] Beta-strings --- .../indirect_confirmation_measure.py | 6 +- .../topic_coherence/probability_estimation.py | 83 +++++++++++++---- gensim/topic_coherence/segmentation.py | 93 ++++++++++++------- 3 files changed, 127 insertions(+), 55 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 8849beefad..007803ac23 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -111,9 +111,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp return topic_coherences -def cosine_similarity( - segmented_topics, accumulator, topics, measure='nlr', gamma=1, - with_std=False, with_support=False): +def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1,with_std=False, with_support=False): r"""Calculate the indirect cosine measure. Given context vectors :math:`u = V(W') and w = V(W*)` for the @@ -139,7 +137,7 @@ def cosine_similarity( measure : str Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma: - Gamma value for computing W', W* vectors; default is 1. + Gamma value for computing W', W* vectors. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic; default is False. diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 4aed7e7ee2..fb47df36b6 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -27,7 +27,7 @@ def p_boolean_document(corpus, segmented_topics): ---------- corpus : list The corpus of documents. 
- segmented_topics : list of lists + segmented_topics : list of tuples of (word_id_set1, word_id_set2) Output from the segmentation of topics. Could be simply topics too. Returns @@ -59,21 +59,47 @@ def p_boolean_document(corpus, segmented_topics): def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): - """This function performs the boolean sliding window probability estimation. + """Perform the boolean sliding window probability estimation. Boolean sliding window determines word counts using a sliding window. The window moves over the documents one word token per step. Each step defines a new virtual document by copying the window content. Boolean document is applied to these virtual documents to compute word probabilities. - Args: - texts : List of string sentences. - segmented_topics : Output from the segmentation of topics. Could be simply topics too. - dictionary : Gensim dictionary mapping of the tokens and ids. - window_size : Size of the sliding window. 110 found out to be the ideal size for large corpora. + Parameters + ---------- + texts : List of strings. + segmented_topics : list of tuples of (word_id_set1, word_id_set2) + Output from the segmentation of topics. Could be simply topics too. + dictionary : + Gensim dictionary mapping of the tokens and ids. + window_size : + Size of the sliding window. 110 found out to be the ideal size for large corpora. + + Returns + ------- + accumulator + word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. + + Examples + --------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.corpora.dictionary import Dictionary + >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> dictionary = HashDictionary(texts) + >>> token2id = dictionary.token2id + >>> computer_id = token2id['computer'] + >>> system_id = token2id['system'] + >>> user_id = token2id['user'] + >>> graph_id = token2id['graph'] + >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ + >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> accumulator = probability_estimation.p_boolean_sliding_window(texts, segmented_topics, dictionary, 2) + >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] + 1 3 1 4 - Returns: - accumulator : word occurrence accumulator instance that can be used to lookup token - frequencies and co-occurrence frequencies. """ top_ids = unique_ids_from_segments(segmented_topics) if processes <= 1: @@ -86,9 +112,25 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes=1, model=None): """Train word2vec model on `texts` if model is not None. - Returns: - ---- - accumulator: text accumulator with trained context vectors. + + Parameters + ---------- + texts : List of strings. + segmented_topics : list of tuples of (word_id_set1, word_id_set2) + Output from the segmentation of topics. Could be simply topics too. + dictionary : + Gensim dictionary mapping of the tokens and ids. + window_size : + Size of the sliding window. 
+ processes: + no idea + model: + no idea + + Returns + ------- + accumulator + Text accumulator with trained context vectors. """ top_ids = unique_ids_from_segments(segmented_topics) accumulator = WordVectorsAccumulator( @@ -99,12 +141,17 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= def unique_ids_from_segments(segmented_topics): """Return the set of all unique ids in a list of segmented topics. - Args: - segmented_topics: list of tuples of (word_id_set1, word_id_set2). Each word_id_set - is either a single integer, or a `numpy.ndarray` of integers. - Returns: - unique_ids : set of unique ids across all topic segments. + Parameters + ---------- + segmented_topics: list of tuples of (word_id_set1, word_id_set2). + Each word_id_setis either a single integer, or a `numpy.ndarray` of integers. + + Returns + ------- + set + Set of unique ids across all topic segments. """ + unique_ids = set() # is a set of all the unique ids contained in topics. for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 2db0d695d2..1b410a1a53 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -15,19 +15,29 @@ def s_one_pre(topics): """ - This function performs s_one_pre segmentation on a list of topics. - s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} - Example: + Performs s_one_pre segmentation on a list of topics. - >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> s_one_pre(topics) - [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] + s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} - Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + Parameters + ---------- + topics : list of topics + list of topics obtained from an algorithm such as LDA. Is a list such as + [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + + Returns + ------- + list of list of (W', W*) tuples + For all unique topic ids. + + Examples + -------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> segmentation.s_one_pre(topics) + [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] - Returns: - s_one_pre_res : list of list of (W', W*) tuples for all unique topic ids """ s_one_pre_res = [] @@ -43,19 +53,28 @@ def s_one_pre(topics): def s_one_one(topics): """ - This function performs s_one_one segmentation on a list of topics. + Perform s_one_one segmentation on a list of topics. s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} - Example: - - >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> s_one_pre(topics) - [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + Parameters + ---------- + topics : list of topics + List of topics obtained from an algorithm such as LDA. Is a list such as + [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. + + Returns + ------- + list of list of (W', W*) tuples + For all unique topic ids. 
+ + Examples + ------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> segmentation.s_one_pre(topics) + [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - Returns: - s_one_one_res : list of list of (W', W*) tuples for all unique topic ids """ s_one_one_res = [] @@ -74,20 +93,28 @@ def s_one_one(topics): def s_one_set(topics): """ - This function performs s_one_set segmentation on a list of topics. + Perform s_one_set segmentation on a list of topics. s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} - Example: - >>> topics = [np.array([9, 10, 7]) - >>> s_one_set(topics) - [[(9, array([ 9, 10, 7])), - (10, array([ 9, 10, 7])), - (7, array([ 9, 10, 7]))]] - - Args: - topics : list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] - - Returns: - s_one_set_res : list of list of (W', W*) tuples for all unique topic ids. + + Parameters + ---------- + topics : list of topics + List of topics obtained from an algorithm such as LDA. Is a list such as + [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. + + Returns + ------- + list of list of (W', W*) tuples + For all unique topic ids. + + Examples + -------- + >>> import numpy as np + >>> from gensim.topic_coherence import segmentation + >>> topics = [np.array([9, 10, 7])] + >>> segmentation.s_one_set(topics) + [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] + """ s_one_set_res = [] From 667cad23772e43804d898946e6961526f866c5ff Mon Sep 17 00:00:00 2001 From: CLearERR Date: Sun, 26 Nov 2017 01:46:26 +0500 Subject: [PATCH 10/39] Different additions --- .../direct_confirmation_measure.py | 3 +- .../topic_coherence/probability_estimation.py | 5 +- gensim/topic_coherence/segmentation.py | 3 +- gensim/topic_coherence/text_analysis.py | 68 +++++++++++++------ 4 files changed, 51 insertions(+), 28 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 5f08bc5884..0d925e2806 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to compute direct confirmation on a pair of words or word subsets. +"""This module contains functions to compute direct confirmation on a pair of words or word subsets. """ import logging diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index fb47df36b6..6135e9a5c8 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform segmentation on a list of topics. +"""This module contains functions to perform segmentation on a list of topics. """ import itertools @@ -78,7 +77,7 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Returns ------- accumulator - word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. 
+ Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples --------- diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 1b410a1a53..3003f8b5ba 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains functions to perform segmentation on a list of topics. +"""This module contains functions to perform segmentation on a list of topics. """ import logging diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 340286c8d1..ed89ef6cff 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -This module contains classes for analyzing the texts of a corpus to accumulate +"""This module contains classes for analyzing the texts of a corpus to accumulate statistical information about word occurrences. """ @@ -27,11 +26,24 @@ def _ids_to_words(ids, dictionary): """Convert an iterable of ids to their corresponding words using a dictionary. - This function abstracts away the differences between the HashDictionary and the standard one. + Abstract away the differences between the HashDictionary and the standard one. + + Parameters + ---------- + ids: list of list of tuples + Each tuple contains (token_id, iterable of token_ids). + This is the format returned by the :class:`~gensim.topic_coherence` functions. + dictionary: + + Returns + ------- + set + Corresponding words. + + Examples + -------- + in progress - Args: - ids: list of list of tuples, where each tuple contains (token_id, iterable of token_ids). - This is the format returned by the topic_coherence.segmentation functions. """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary setattr(dictionary, 'id2token', {v: k for k, v in dictionary.token2id.items()}) @@ -300,11 +312,16 @@ class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): def __init__(self, processes, *args, **kwargs): """ - Args: - processes : number of processes to use; must be at least two. - args : should include `relevant_ids` and `dictionary` (see `UsesDictionary.__init__`). - kwargs : can include `batch_size`, which is the number of docs to send to a worker at a - time. If not included, it defaults to 64. + Parameters + ---------- + processes : + Number of processes to use; must be at least two. + args : + Should include `relevant_ids` and `dictionary` (see :class:`~UsesDictionary.__init__`). + kwargs : + Can include `batch_size`, which is the number of docs to send to a worker at a time. + If not included, it defaults to 64. + """ super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: @@ -331,10 +348,18 @@ def accumulate(self, texts, window_size): def start_workers(self, window_size): """Set up an input and output queue and start processes for each worker. - The input queue is used to transmit batches of documents to the workers. The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. - Returns: tuple of (list of workers, input queue, output queue). 
+ + Parameters + ---------- + window_size : + in progress + + Returns + ------- + tuple + Tuple of (list of workers, input queue, output queue). """ input_q = mp.Queue(maxsize=self.processes) output_q = mp.Queue() @@ -348,8 +373,7 @@ def start_workers(self, window_size): return workers, input_q, output_q def yield_batches(self, texts): - """Return a generator over the given texts that yields batches of - `batch_size` texts at a time. + """Return a generator over the given texts that yields batches of `batch_size` texts at a time. """ batch = [] for text in self._iter_texts(texts): @@ -473,12 +497,14 @@ class WordVectorsAccumulator(UsesDictionary): def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs): """ - Args: - model: if None, a new Word2Vec model is trained on the given text corpus. - If not None, it should be a pre-trained Word2Vec context vectors - (gensim.models.keyedvectors.KeyedVectors instance). - model_kwargs: if model is None, these keyword arguments will be passed - through to the Word2Vec constructor. + Parameters + ---------- + model: + If None, a new Word2Vec model is trained on the given text corpus. + If not None, it should be a pre-trained Word2Vec context vectors + (:class:`~gensim.models.keyedvectors.KeyedVectors` instance). + model_kwargs: + if model is None, these keyword arguments will be passed through to the Word2Vec constructor. """ super(WordVectorsAccumulator, self).__init__(relevant_ids, dictionary) self.model = model From d41c5a34aaf3858aaecc989fcd60001a0c73611c Mon Sep 17 00:00:00 2001 From: CLearERR Date: Mon, 27 Nov 2017 00:34:42 +0500 Subject: [PATCH 11/39] Minor changes --- .../direct_confirmation_measure.py | 17 ++++++++++------- .../topic_coherence/probability_estimation.py | 11 +++++------ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 0d925e2806..8b30a184c5 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -26,15 +26,14 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w ---------- segmented_topics : list Output from the segmentation module of the segmented topics. Is a list of list of tuples. - accumulator : list + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. with_std : bool - True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. + True to also include standard deviation across topic segment sets in addition to the mean coherence + for each topic. with_support : bool - True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. + True to also include support across topic segments. The support is defined as the number of pairwise + similarity comparisons were used to compute the overall topic coherence. 
Returns ------- @@ -44,13 +43,17 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w -------- >>> from gensim.topic_coherence import direct_confirmation_measure,text_analysis >>> from collections import namedtuple + Now we create dictionary: >>> id2token = {1: 'test', 2: 'doc'} >>> token2id = {v: k for k, v in id2token.items()} >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + Then we will initialize segmented topics: >>> segmentation = [[(1, 2)]] + And accumulator: >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} >>> accumulator._num_docs = 5 + Function call: >>> direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] Answer should be ~ ln(1 / 2) = -0.693147181 @@ -124,7 +127,7 @@ def log_ratio_measure( ---------- segmented_topics : list of (list of tuples) Output from the segmentation module of the segmented topics. - accumulator: list + accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. with_std : bool True to also include standard deviation across topic segment diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 6135e9a5c8..d2a47f62b8 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -18,20 +18,19 @@ def p_boolean_document(corpus, segmented_topics): - """Perform the boolean document probability estimation. - Boolean document estimates the probability of a single word as the number - of documents in which the word occurs divided by the total number of documents. + """Perform the boolean document probability estimation. Boolean document estimates the probability of a single word + as the number of documents in which the word occurs divided by the total number of documents. Parameters ---------- corpus : list The corpus of documents. - segmented_topics : list of tuples of (word_id_set1, word_id_set2) - Output from the segmentation of topics. Could be simply topics too. + segmented_topics : list of tuples + Output from the segmentation of topics. Tuples of (word_id_set1, word_id_set2). Could be simply topics too. Returns ------- - accumulator : + accumulator Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples From 180c1c1a870f6cf98fa2d668b732930424bcb012 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 28 Nov 2017 01:27:08 +0500 Subject: [PATCH 12/39] text_analysis left --- .../direct_confirmation_measure.py | 20 ++++-- .../indirect_confirmation_measure.py | 66 ++++++++++++------- .../topic_coherence/probability_estimation.py | 10 ++- gensim/topic_coherence/text_analysis.py | 21 ++++-- 4 files changed, 84 insertions(+), 33 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 8b30a184c5..c5a405c60b 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -37,11 +37,12 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w Returns ------- - list : of log conditional probability measure for each topic. + list + List of log conditional probability measure for each topic. 
Examples -------- - >>> from gensim.topic_coherence import direct_confirmation_measure,text_analysis + >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis >>> from collections import namedtuple Now we create dictionary: >>> id2token = {1: 'test', 2: 'doc'} @@ -83,8 +84,8 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): Parameters ---------- - segment_sims : iterable - floating point similarity values to aggregate. + segment_sims : iterable array + Floating point similarity values to aggregate. with_std : bool Set to True to include standard deviation. with_support : bool @@ -95,9 +96,16 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): tuple tuple with (mean[, std[, support]]) - Examples: + Examples --------- - in progress + >>> import numpy as np + >>> from gensim.topic_coherence import direct_confirmation_measure + >>> segment_sims = np.array([[1, 2], [3, 4]]) + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims,True,True) + (2.5, 1.1180339887498949, 2) + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims,False,False) + 2.5 + """ mean = np.mean(segment_sims) stats = [mean] diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 007803ac23..0ffe987627 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -51,7 +51,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp ---------- segmented_topics : list of (list of tuples) Output from the segmentation module of the segmented topics. - accumulator: list + accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. with_std : bool True to also include standard deviation across topic segment @@ -132,11 +132,11 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm ---------- segmented_topics: list of (list of tuples) Output from the segmentation module of the segmented topics. - accumulator: accumulator of word occurrences (see text_analysis module). + accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Output from the probability_estimation module. Is an topics: Topics obtained from the trained topic model. measure : str Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma: + gamma: float Gamma value for computing W', W* vectors. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence @@ -153,7 +153,7 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm Examples -------- >>> from gensim.corpora.dictionary import Dictionary - >>> from gensim.topic_coherence import indirect_confirmation_measure,text_analysis + >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis >>> import numpy as np >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} @@ -184,20 +184,30 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamm class ContextVectorComputer(object): - """Lazily compute context vectors for topic segments.""" + """Lazily compute context vectors for topic segments. + + Attributes + ---------- + sim_cache: dict + Cache similarities between tokens (pairs of word ids), e.g. (1, 2). 
+ context_vector_cache: dict + Mapping from (segment, topic_words) --> context_vector. + + """ def __init__(self, measure, topics, accumulator, gamma): """ Parameters ---------- - measure: tuple - in progress + measure: str + Confirmation measure. topics: list - in progress - accumulator : list + Topics. + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. - gamma: - in progress + gamma: float + Value for computing vectors. + """ if measure == 'nlr': self.similarity = _pair_npmi @@ -209,28 +219,28 @@ def __init__(self, measure, topics, accumulator, gamma): self.vocab_size = len(self.mapping) self.accumulator = accumulator self.gamma = gamma - self.sim_cache = {} # Cache similarities between tokens (pairs of word ids), e.g. (1, 2) - self.context_vector_cache = {} # mapping from (segment, topic_words) --> context_vector + self.sim_cache = {} + self.context_vector_cache = {} def __getitem__(self, idx): return self.compute_context_vector(*idx) def compute_context_vector(self, segment_word_ids, topic_word_ids): - """ - Check if (segment_word_ids, topic_word_ids) context vector has been cached. - If yes, return corresponding context vector, else compute, cache, and return. + """Check if (segment_word_ids, topic_word_ids) context vector has been cached. + Parameters ---------- - segment_word_ids: in progress + segment_word_ids: list - topic_word_ids: in progress + topic_word_ids: list Returns ------- - in progress + csr_matrix :class:`~scipy.sparse.csr` + If context vector has been cached, then return corresponding context vector, else compute, cache, and return. - Examples: + Example --------- In progress @@ -256,7 +266,7 @@ def _make_seg(self, segment_word_ids, topic_word_ids): csr_matrix :class:`~scipy.sparse.csr` Matrix in Compressed Sparse Row format - Examples: + Example --------- In progress @@ -279,7 +289,19 @@ def _make_seg(self, segment_word_ids, topic_word_ids): def _pair_npmi(pair, accumulator): """Compute normalized pairwise mutual information (NPMI) between a pair of words. - The pair is an iterable of (word_id1, word_id2). + + Parameters + ---------- + pair : iterable + The pair of words (word_id1, word_id2). + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from probability_estimation. + + Return + ------ + float + NPMI between a pair of words. + """ return log_ratio_measure([[pair]], accumulator, True)[0] diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index d2a47f62b8..5985cef073 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -148,8 +148,16 @@ def unique_ids_from_segments(segmented_topics): ------- set Set of unique ids across all topic segments. - """ + Example + ------- + >>> from gensim.topic_coherence import probability_estimation + >>> segmentation = [[(1, 2)]] + >>> probability_estimation.unique_ids_from_segments(segmentation) + set([1, 2]) + + + """ unique_ids = set() # is a set of all the unique ids contained in topics. 
for s_i in segmented_topics: for word_id in itertools.chain.from_iterable(s_i): diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index ed89ef6cff..e6880c5902 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -60,7 +60,17 @@ def _ids_to_words(ids, dictionary): class BaseAnalyzer(object): - """Base class for corpus and text analyzers.""" + """Base class for corpus and text analyzers. + + Attributes + ---------- + relevant_ids : + _vocab_size : + id2contiguous : + log_every : int + _num_docs : int + + """ def __init__(self, relevant_ids): self.relevant_ids = relevant_ids @@ -180,9 +190,12 @@ class WindowedTextsAnalyzer(UsesDictionary): def __init__(self, relevant_ids, dictionary): """ - Args: - relevant_ids: the set of words that occurrences should be accumulated for. - dictionary: Dictionary instance with mappings for the relevant_ids. + Parameters + ---------- + relevant_ids: set + Set of words that occurrences should be accumulated for. + dictionary: tuple + Dictionary instance with mappings for the relevant_ids. """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token From e3c1e297dd42a4e0d840c7be08fdce2b47cc03f2 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Wed, 29 Nov 2017 01:34:41 +0500 Subject: [PATCH 13/39] Added example for ContextVectorComputer class --- .../indirect_confirmation_measure.py | 30 ++++++++++++++----- .../topic_coherence/probability_estimation.py | 2 +- 2 files changed, 24 insertions(+), 8 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 0ffe987627..5e8efeb429 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -208,6 +208,23 @@ def __init__(self, measure, topics, accumulator, gamma): gamma: float Value for computing vectors. + Example + ------- + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis + >>> import numpy as np + >>> measure = 'nlr' + >>> top = [np.array([1, 2])] + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} + >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) + >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, top, accumulator,1) + >>> cont_vect_comp.mapping + {1: 0, 2: 1} + >>> cont_vect_comp.vocab_size + 2 + """ if measure == 'nlr': self.similarity = _pair_npmi @@ -228,13 +245,12 @@ def __getitem__(self, idx): def compute_context_vector(self, segment_word_ids, topic_word_ids): """Check if (segment_word_ids, topic_word_ids) context vector has been cached. - Parameters ---------- segment_word_ids: list - + Ids of words in segment. topic_word_ids: list - + Ids of words in topic. Returns ------- csr_matrix :class:`~scipy.sparse.csr` @@ -257,10 +273,10 @@ def _make_seg(self, segment_word_ids, topic_word_ids): Parameters ---------- - segment_word_ids : - - topic_word_ids : - + segment_word_ids : list + Ids of words in segment. + topic_word_ids : list + Ids of words in topic. 
Returns ------- csr_matrix :class:`~scipy.sparse.csr` diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 5985cef073..df27b598e7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -122,7 +122,7 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= Size of the sliding window. processes: no idea - model: + model: word2vec module / some preta no idea Returns From da9ca292a1591430e5f6573304587a5c00dcd744 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Thu, 30 Nov 2017 01:59:04 +0500 Subject: [PATCH 14/39] probability_estimation 0.9 --- .../indirect_confirmation_measure.py | 5 --- .../topic_coherence/probability_estimation.py | 35 +++++++++++++++---- gensim/topic_coherence/text_analysis.py | 9 +++-- 3 files changed, 32 insertions(+), 17 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 5e8efeb429..2dfa3d6297 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -282,12 +282,7 @@ def _make_seg(self, segment_word_ids, topic_word_ids): csr_matrix :class:`~scipy.sparse.csr` Matrix in Compressed Sparse Row format - Example - --------- - In progress - """ - context_vector = sps.lil_matrix((self.vocab_size, 1)) if not hasattr(segment_word_ids, '__iter__'): segment_word_ids = (segment_word_ids,) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index df27b598e7..4a727bd9c7 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -30,7 +30,7 @@ def p_boolean_document(corpus, segmented_topics): Returns ------- - accumulator + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples @@ -75,7 +75,7 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Returns ------- - accumulator + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples @@ -120,15 +120,36 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= Gensim dictionary mapping of the tokens and ids. window_size : Size of the sliding window. - processes: - no idea - model: word2vec module / some preta - no idea + processes: int + Number of processes to use. + model: model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + If None, a new Word2Vec model is trained on the given text corpus. Otherwise, + it should be a pre-trained Word2Vec context vectors. Returns ------- - accumulator + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Text accumulator with trained context vectors. 
+ + Examples + -------- + >>> from gensim.topic_coherence import probability_estimation + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.corpora.dictionary import Dictionary + >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> dictionary = HashDictionary(texts) + >>> token2id = dictionary.token2id + >>> computer_id = token2id['computer'] + >>> system_id = token2id['system'] + >>> user_id = token2id['user'] + >>> graph_id = token2id['graph'] + >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ + >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> corpus = [dictionary.doc2bow(text) for text in texts] + >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2) + >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] + 1 3 1 4 """ top_ids = unique_ids_from_segments(segmented_topics) accumulator = WordVectorsAccumulator( diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index e6880c5902..9f8a3834a9 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -327,7 +327,7 @@ def __init__(self, processes, *args, **kwargs): """ Parameters ---------- - processes : + processes : int Number of processes to use; must be at least two. args : Should include `relevant_ids` and `dictionary` (see :class:`~UsesDictionary.__init__`). @@ -512,10 +512,9 @@ def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs): """ Parameters ---------- - model: - If None, a new Word2Vec model is trained on the given text corpus. - If not None, it should be a pre-trained Word2Vec context vectors - (:class:`~gensim.models.keyedvectors.KeyedVectors` instance). + model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + If None, a new Word2Vec model is trained on the given text corpus. Otherwise, + it should be a pre-trained Word2Vec context vectors. model_kwargs: if model is None, these keyword arguments will be passed through to the Word2Vec constructor. 
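A minimal usage sketch for this constructor; the two-token dictionary and tiny corpus mirror the word2vec_similarity doctest used elsewhere in this series and are purely illustrative:

    >>> from gensim.corpora.dictionary import Dictionary
    >>> from gensim.topic_coherence import text_analysis
    >>> dictionary = Dictionary()
    >>> dictionary.id2token = {1: 'fake', 2: 'tokens'}
    >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    >>> accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)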
""" From f54fb0c5bc8c561dc6b0c97f7599d9541eaa1526 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Fri, 1 Dec 2017 01:21:43 +0500 Subject: [PATCH 15/39] beta_version --- gensim/topic_coherence/probability_estimation.py | 7 +++++-- gensim/topic_coherence/text_analysis.py | 10 ++++++++-- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 4a727bd9c7..b3ca90eae1 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -135,6 +135,7 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= -------- >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.models import word2vec >>> from gensim.corpora.dictionary import Dictionary >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] @@ -147,9 +148,11 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] >>> corpus = [dictionary.doc2bow(text) for text in texts] - >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2) + >>> sentences = [['human', 'interface', 'computer'],['survey', 'user', 'computer', 'system', 'response', 'time']] + >>> model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) #TODO Ivan fix this holy shield + >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model) >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] - 1 3 1 4 + 1 3 1 4 # example for model = None """ top_ids = unique_ids_from_segments(segmented_topics) accumulator = WordVectorsAccumulator( diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 9f8a3834a9..6ae05e2ce4 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -33,7 +33,7 @@ def _ids_to_words(ids, dictionary): ids: list of list of tuples Each tuple contains (token_id, iterable of token_ids). This is the format returned by the :class:`~gensim.topic_coherence` functions. 
- dictionary: + dictionary: dict Returns ------- @@ -42,7 +42,13 @@ def _ids_to_words(ids, dictionary): Examples -------- - in progress + >>> from gensim.corpora.hashdictionary import HashDictionary + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import text_analysis + >>> ids = [[('1','a'), ('2','b')],[('3','c')]] + >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'],['graph', 'trees']] + >>> dictionary = HashDictionary(texts) + >>> text_analysis._ids_to_words(ids, dictionary) """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary From 47ee63e7eb7110646ce561a12a03be1fdb2a6300 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Mon, 4 Dec 2017 01:25:27 +0500 Subject: [PATCH 16/39] Added some examples for text_analysis --- gensim/topic_coherence/text_analysis.py | 71 ++++++++++++++++++++----- 1 file changed, 57 insertions(+), 14 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6ae05e2ce4..07cce346c4 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -30,10 +30,9 @@ def _ids_to_words(ids, dictionary): Parameters ---------- - ids: list of list of tuples - Each tuple contains (token_id, iterable of token_ids). - This is the format returned by the :class:`~gensim.topic_coherence` functions. - dictionary: dict + ids: dict + Dictionary of ids and their words. + dictionary: :class:`~gensim.corpora.dictionary` Returns ------- @@ -42,13 +41,13 @@ def _ids_to_words(ids, dictionary): Examples -------- - >>> from gensim.corpora.hashdictionary import HashDictionary >>> from gensim.corpora.dictionary import Dictionary >>> from gensim.topic_coherence import text_analysis - >>> ids = [[('1','a'), ('2','b')],[('3','c')]] - >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'],['graph', 'trees']] - >>> dictionary = HashDictionary(texts) + >>> dictionary = Dictionary() + >>> ids = {1: 'fake', 4: 'cats'} + >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} >>> text_analysis._ids_to_words(ids, dictionary) + set(['cats', 'fake']) """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary @@ -70,14 +69,24 @@ class BaseAnalyzer(object): Attributes ---------- - relevant_ids : - _vocab_size : - id2contiguous : + relevant_ids : dict + _vocab_size : int + Size of vocabulary. + id2contiguous : dict + log_every : int + Interval for logging. _num_docs : int - """ + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> ids = {1: 'fake', 4: 'cats'} + >>> base = text_analysis.BaseAnalyzer(ids) + >>> print base.relevant_ids, base._vocab_size, base.id2contiguous, base.log_every, base._num_docs + {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 + """ def __init__(self, relevant_ids): self.relevant_ids = relevant_ids self._vocab_size = len(self.relevant_ids) @@ -125,6 +134,28 @@ class UsesDictionary(BaseAnalyzer): """A BaseAnalyzer that uses a Dictionary, hence can translate tokens to counts. The standard BaseAnalyzer can only deal with token ids since it doesn't have the token2id mapping. + + Attributes + ---------- + relevant_words : set + Set of words. 
+ dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + token2id : dict + token2id from :class:`~gensim.corpora.dictionary` + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> from gensim.corpora.dictionary import Dictionary + >>> ids = {1: 'fake', 4: 'cats'} + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} + >>> usesdict = text_analysis.UsesDictionary(ids, dictionary) + >>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id + set(['cats', 'fake']) Dictionary(0 unique tokens: []) {} + + #TODO: Looks like we need to use HashDictionary, but it doesn't work (Ivan, help please) + """ def __init__(self, relevant_ids, dictionary): @@ -156,8 +187,18 @@ def get_co_occurrences(self, word1, word2): class InvertedIndexBased(BaseAnalyzer): - """Analyzer that builds up an inverted index to accumulate stats.""" + """Analyzer that builds up an inverted index to accumulate stats. + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> ininb = text_analysis.InvertedIndexBased([1,2]) + >>> print ininb._inverted_index + [set([]) set([])] + + """ + # TODO : *args value have no impact on ininb._inverted_index def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) @@ -176,7 +217,9 @@ def index_to_dict(self): class CorpusAccumulator(InvertedIndexBased): - """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" + """Gather word occurrence stats from a corpus by iterating over its BoW representation. + + """ def analyze_text(self, text, doc_num=None): doc_words = frozenset(x[0] for x in text) From 65211f0036ff2ac9cc5fa6b5f01916a5feb5b35e Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 5 Dec 2017 00:05:44 +0500 Subject: [PATCH 17/39] text_analysis: corrected example for class UsesDictionary --- gensim/topic_coherence/text_analysis.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 07cce346c4..6bd3b38591 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -147,14 +147,11 @@ class UsesDictionary(BaseAnalyzer): -------- >>> from gensim.topic_coherence import text_analysis >>> from gensim.corpora.dictionary import Dictionary - >>> ids = {1: 'fake', 4: 'cats'} - >>> dictionary = Dictionary() - >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} + >>> ids = {1: 'foo', 2: 'bar'} + >>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) >>> usesdict = text_analysis.UsesDictionary(ids, dictionary) >>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id - set(['cats', 'fake']) Dictionary(0 unique tokens: []) {} - - #TODO: Looks like we need to use HashDictionary, but it doesn't work (Ivan, help please) + set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} """ @@ -555,7 +552,13 @@ def reply_to_master(self): class WordVectorsAccumulator(UsesDictionary): - """Accumulate context vectors for words using word vector embeddings.""" + """Accumulate context vectors for words using word vector embeddings. 
+ + Examples + -------- + + + """ def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs): """ From c4849620d957e4a598d54cd88f183dfdc0a6c8ff Mon Sep 17 00:00:00 2001 From: CLearERR Date: Fri, 8 Dec 2017 00:40:29 +0500 Subject: [PATCH 18/39] Final additions for text_analysis.py --- gensim/topic_coherence/text_analysis.py | 103 ++++++++++++++---------- 1 file changed, 60 insertions(+), 43 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 6bd3b38591..d03789f913 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -6,6 +6,17 @@ """This module contains classes for analyzing the texts of a corpus to accumulate statistical information about word occurrences. + +Example for UsesDictionary +-------------------------- +>>> from gensim.topic_coherence import text_analysis +>>> from gensim.corpora.dictionary import Dictionary +>>> ids = {1: 'foo', 2: 'bar'} +>>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) +>>> usesdict = text_analysis.UsesDictionary(ids, dictionary) +>>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id +set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} + """ import itertools @@ -138,7 +149,7 @@ class UsesDictionary(BaseAnalyzer): Attributes ---------- relevant_words : set - Set of words. + Set of words that occurrences should be accumulated for. dictionary : :class:`~gensim.corpora.dictionary.Dictionary` token2id : dict token2id from :class:`~gensim.corpora.dictionary` @@ -189,13 +200,14 @@ class InvertedIndexBased(BaseAnalyzer): Examples -------- >>> from gensim.topic_coherence import text_analysis - >>> ininb = text_analysis.InvertedIndexBased([1,2]) + >>> ids = {1: 'fake', 4: 'cats'} + >>> ininb = text_analysis.InvertedIndexBased(ids) >>> print ininb._inverted_index [set([]) set([])] """ - # TODO : *args value have no impact on ininb._inverted_index + def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) @@ -214,11 +226,24 @@ def index_to_dict(self): class CorpusAccumulator(InvertedIndexBased): - """Gather word occurrence stats from a corpus by iterating over its BoW representation. - - """ + """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text, doc_num=None): + """ + + + Examples + -------- + >> > from gensim.topic_coherence import text_analysis + >> > ids = {1: 'fake', 4: 'cats'} + >> > corac = text_analysis.CorpusAccumulator(ids) + >> > texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system']] + >> > corac.analyze_text(texts) + >> > print + corac._inverted_index + + # Doesn't work + """ doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) for word_id in top_ids_in_doc: @@ -232,17 +257,17 @@ def accumulate(self, corpus): class WindowedTextsAnalyzer(UsesDictionary): - """Gather some stats about relevant terms of a corpus by iterating over windows of texts.""" + """Gather some stats about relevant terms of a corpus by iterating over windows of texts. + + Attributes + ---------- + relevant_words : set + Set of words. + dictionary: tuple + Dictionary instance with mappings for the relevant_ids. 
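Construction normally goes through a concrete subclass; a small sketch using InvertedIndexAccumulator (the subclass that appears in the direct_confirmation_measure doctests), with a namedtuple standing in for a full Dictionary — toy values only:

    >>> from collections import namedtuple
    >>> from gensim.topic_coherence import text_analysis
    >>> id2token = {1: 'test', 2: 'doc'}
    >>> token2id = {v: k for k, v in id2token.items()}
    >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token)
    >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)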
+ """ def __init__(self, relevant_ids, dictionary): - """ - Parameters - ---------- - relevant_ids: set - Set of words that occurrences should be accumulated for. - dictionary: tuple - Dictionary instance with mappings for the relevant_ids. - """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token @@ -367,21 +392,20 @@ def _iter_texts(self, texts): class ParallelWordOccurrenceAccumulator(WindowedTextsAnalyzer): - """Accumulate word occurrences in parallel.""" + """Accumulate word occurrences in parallel. - def __init__(self, processes, *args, **kwargs): - """ - Parameters - ---------- - processes : int - Number of processes to use; must be at least two. - args : - Should include `relevant_ids` and `dictionary` (see :class:`~UsesDictionary.__init__`). - kwargs : - Can include `batch_size`, which is the number of docs to send to a worker at a time. - If not included, it defaults to 64. + Attributes + ---------- + processes : int + Number of processes to use; must be at least two. + args : + Should include `relevant_ids` and `dictionary` (see :class:`~UsesDictionary.__init__`). + kwargs : + Can include `batch_size`, which is the number of docs to send to a worker at a time. + If not included, it defaults to 64. + """ - """ + def __init__(self, processes, *args, **kwargs): super(ParallelWordOccurrenceAccumulator, self).__init__(*args) if processes < 2: raise ValueError( @@ -412,8 +436,7 @@ def start_workers(self, window_size): Parameters ---------- - window_size : - in progress + window_size : int Returns ------- @@ -554,22 +577,16 @@ def reply_to_master(self): class WordVectorsAccumulator(UsesDictionary): """Accumulate context vectors for words using word vector embeddings. - Examples - -------- - - + Attributes + ---------- + model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + If None, a new Word2Vec model is trained on the given text corpus. Otherwise, + it should be a pre-trained Word2Vec context vectors. + model_kwargs: + if model is None, these keyword arguments will be passed through to the Word2Vec constructor. """ def __init__(self, relevant_ids, dictionary, model=None, **model_kwargs): - """ - Parameters - ---------- - model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) - If None, a new Word2Vec model is trained on the given text corpus. Otherwise, - it should be a pre-trained Word2Vec context vectors. - model_kwargs: - if model is None, these keyword arguments will be passed through to the Word2Vec constructor. - """ super(WordVectorsAccumulator, self).__init__(relevant_ids, dictionary) self.model = model self.model_kwargs = model_kwargs From d9237ea4444624b5f3ba14ab544c09927e3522f0 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 18:24:31 +0500 Subject: [PATCH 19/39] fix cross-reference problem --- gensim/models/atmodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gensim/models/atmodel.py b/gensim/models/atmodel.py index 02b18984ac..5463e8a025 100755 --- a/gensim/models/atmodel.py +++ b/gensim/models/atmodel.py @@ -560,10 +560,10 @@ def update(self, corpus=None, author2doc=None, doc2author=None, chunksize=None, Args: corpus (gensim corpus): The corpus with which the author-topic model should be updated. - author2doc (dictionary): author to document mapping corresponding to indexes in input + author2doc (dict): author to document mapping corresponding to indexes in input corpus. 
- doc2author (dictionary): document to author mapping corresponding to indexes in input + doc2author (dict): document to author mapping corresponding to indexes in input corpus. chunks_as_numpy (bool): Whether each chunk passed to `.inference` should be a np From 275edd047811a1014bb2b085774b02f4126a50f3 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 18:27:35 +0500 Subject: [PATCH 20/39] fix pep8 --- gensim/topic_coherence/indirect_confirmation_measure.py | 6 ++++-- gensim/topic_coherence/text_analysis.py | 1 - 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 2dfa3d6297..66cd183d25 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -111,7 +111,8 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp return topic_coherences -def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1,with_std=False, with_support=False): +def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', + gamma=1, with_std=False, with_support=False): r"""Calculate the indirect cosine measure. Given context vectors :math:`u = V(W') and w = V(W*)` for the @@ -254,7 +255,8 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): Returns ------- csr_matrix :class:`~scipy.sparse.csr` - If context vector has been cached, then return corresponding context vector, else compute, cache, and return. + If context vector has been cached, then return corresponding context vector, + else compute, cache, and return. Example --------- diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index d03789f913..87dd6caeee 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -207,7 +207,6 @@ class InvertedIndexBased(BaseAnalyzer): """ - def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) From 94bde33fb6e10f7d751a84e80fb24206cad87bc9 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 18:47:25 +0500 Subject: [PATCH 21/39] fix aggregation --- gensim/topic_coherence/aggregation.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/gensim/topic_coherence/aggregation.py b/gensim/topic_coherence/aggregation.py index 48091085d3..aa27c833f7 100644 --- a/gensim/topic_coherence/aggregation.py +++ b/gensim/topic_coherence/aggregation.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains functions to perform aggregation on a list of values -obtained from the confirmation measure.""" +"""This module contains functions to perform aggregation on a list of values obtained from the confirmation measure.""" import logging import numpy as np @@ -20,12 +19,12 @@ def arithmetic_mean(confirmed_measures): Parameters ---------- - confirmed_measures : list + confirmed_measures : list of float List of calculated confirmation measure on each set in the segmented topics. Returns ------- - numpy.float + `numpy.float` Arithmetic mean of all the values contained in confirmation measures. 
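The value comes back as a NumPy scalar rather than a built-in float; a quick check, assuming NumPy's usual behaviour for `np.mean` (illustrative only):

    >>> import numpy as np
    >>> from gensim.topic_coherence.aggregation import arithmetic_mean
    >>> isinstance(arithmetic_mean([1.1, 2.2, 3.3, 4.4]), np.floating)
    True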
Examples From 782d5cf13ef4c633ce0ab606ed0e2378417d02e0 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 20:46:01 +0500 Subject: [PATCH 22/39] fix direct_confirmation_measure --- .../direct_confirmation_measure.py | 126 ++++++++++-------- 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index c5a405c60b..419573254a 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains functions to compute direct confirmation on a pair of words or word subsets. -""" +"""This module contains functions to compute direct confirmation on a pair of words or word subsets.""" import logging @@ -13,21 +12,22 @@ logger = logging.getLogger(__name__) -EPSILON = 1e-12 # Should be small. Value as suggested in paper. +# Should be small. Value as suggested in paper http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf +EPSILON = 1e-12 def log_conditional_probability(segmented_topics, accumulator, with_std=False, with_support=False): - """ - Calculate the log-conditional-probability measure - which is used by coherence measures such as U_mass. - This is defined as :math:`m_lc(S_i) = log[(P(W', W*) + e) / P(W*)]` + """Calculate the log-conditional-probability measure which is used by coherence measures such as `U_mass`. + This is defined as :math:`m_{lc}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W^{*})}`. Parameters ---------- - segmented_topics : list - Output from the segmentation module of the segmented topics. Is a list of list of tuples. + segmented_topics : list of lists of (int, int) + Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, + :func:`~gensim.topic_coherence.segmentation.s_one_one`, and so on from the + :mod:`gensim.topic_coherence.segmentation` module. accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` - Word occurrence accumulator from probability_estimation. + Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic. @@ -37,26 +37,29 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w Returns ------- - list - List of log conditional probability measure for each topic. + list of float + Log conditional probabilities measurement for each topic. 
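To make the doctest below easier to follow: with that toy accumulator the pair (1, 2) co-occurs in one of the five documents while word 2 occurs in two of them, so the measure reduces to log((1/5) / (2/5)) = log(1/2). A quick check of just that arithmetic (NumPy assumed, numbers illustrative):

    >>> import numpy as np
    >>> check = np.log((1.0 / 5) / (2.0 / 5))   # ~ -0.693147, matching the result below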
Examples -------- >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis >>> from collections import namedtuple - Now we create dictionary: + >>> + >>> # Create dictionary >>> id2token = {1: 'test', 2: 'doc'} >>> token2id = {v: k for k, v in id2token.items()} >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) - Then we will initialize segmented topics: + >>> + >>> # Initialize segmented topics and accumulator >>> segmentation = [[(1, 2)]] - And accumulator: + >>> >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} >>> accumulator._num_docs = 5 - Function call: - >>> direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] - Answer should be ~ ln(1 / 2) = -0.693147181 + >>> + >>> # result should be ~ ln(1 / 2) = -0.693147181 + >>> result = direct_confirmation_measure.log_conditional_probability(segmentation, accumulator)[0] + """ topic_coherences = [] @@ -79,13 +82,13 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w def aggregate_segment_sims(segment_sims, with_std, with_support): - """Compute various statistics from the segment similarities generated via - set pairwise comparisons of top-N word lists for a single topic. + """Compute various statistics from the segment similarities generated via set pairwise comparisons + of top-N word lists for a single topic. Parameters ---------- - segment_sims : iterable array - Floating point similarity values to aggregate. + segment_sims : list of float + Similarity values to aggregate. with_std : bool Set to True to include standard deviation. with_support : bool @@ -93,18 +96,18 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): Returns ------- - tuple - tuple with (mean[, std[, support]]) + (float[, float[, int]]) + Tuple with (mean[, std[, support]]). Examples --------- - >>> import numpy as np >>> from gensim.topic_coherence import direct_confirmation_measure - >>> segment_sims = np.array([[1, 2], [3, 4]]) - >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims,True,True) - (2.5, 1.1180339887498949, 2) - >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims,False,False) - 2.5 + >>> + >>> segment_sims = [0.2, 0.5, 1., 0.05] + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, True, True) + (0.4375, 0.36293077852394939, 4) + >>> direct_confirmation_measure.aggregate_segment_sims(segment_sims, False, False) + 0.4375 """ mean = np.mean(segment_sims) @@ -117,52 +120,61 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): return stats[0] if len(stats) == 1 else tuple(stats) -def log_ratio_measure( - segmented_topics, accumulator, normalize=False, with_std=False, with_support=False): - """ - If normalize=False: - Popularly known as PMI. - Calculate the log-ratio-measure which is used by - coherence measures such as c_v. - This is defined as :math:`m_lr(S_i) = log[(P(W', W*) + e) / (P(W') * P(W*))]` - - If normalize=True: - Calculate the normalized-log-ratio-measure, popularly knowns as - NPMI which is used by coherence measures such as c_v. - This is defined as :math:`m_nlr(S_i) = m_lr(S_i) / -log[P(W', W*) + e]` +def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=False, with_support=False): + """Compute log ratio measure for `segment_topics`. 
Parameters ---------- - segmented_topics : list of (list of tuples) - Output from the segmentation module of the segmented topics. - accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` - Word occurrence accumulator from probability_estimation. + segmented_topics : list of lists of (int, int) + Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, + :func:`~gensim.topic_coherence.segmentation.s_one_one`, and so on from the + :mod:`gensim.topic_coherence.segmentation` module. + accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. + normalize : bool + Details in the "Notes" section. with_std : bool - True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. + True to also include standard deviation across topic segment sets in addition to the mean coherence + for each topic. with_support : bool - True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. + True to also include support across topic segments. The support is defined as the number of pairwise + similarity comparisons were used to compute the overall topic coherence. + + Notes + ----- + If `normalize=False`: + Calculate the log-ratio-measure, popularly known as **PMI** which is used by coherence measures such as `c_v`. + This is defined as :math:`m_{lr}(S_i) = log \\frac{P(W', W^{*}) + \epsilon}{P(W') * P(W^{*})}` + + If `normalize=True`: + Calculate the normalized-log-ratio-measure, popularly knowns as **NPMI** + which is used by coherence measures such as `c_v`. + This is defined as :math:`m_{nlr}(S_i) = \\frac{m_{lr}(S_i)}{-log(P(W', W^{*}) + \epsilon)}` Returns ------- - list - List of log ratio measure for each topic. + list of float + Log ratio measurements for each topic. 
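For the normalized (NPMI) form described in the Notes, the same toy setup can be reused; a sketch only, with `segmentation` and `accumulator` as defined in the example below and the closing figure approximate:

    >>> npmi = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator, normalize=True)[0]
    >>> # ~ -0.182321557 / -log(1/5), i.e. roughly -0.11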
Examples -------- - >>> from gensim.topic_coherence import direct_confirmation_measure,text_analysis + >>> from gensim.topic_coherence import direct_confirmation_measure, text_analysis >>> from collections import namedtuple + >>> + >>> # Create dictionary >>> id2token = {1: 'test', 2: 'doc'} >>> token2id = {v: k for k, v in id2token.items()} >>> dictionary = namedtuple('Dictionary', 'token2id, id2token')(token2id, id2token) + >>> + >>> # Initialize segmented topics and accumulator >>> segmentation = [[(1, 2)]] + >>> >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} >>> accumulator._num_docs = 5 - >>> direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0] - Answer should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 + >>> + >>> # result should be ~ ln{(1 / 5) / [(3 / 5) * (2 / 5)]} = -0.182321557 + >>> result = direct_confirmation_measure.log_ratio_measure(segmentation, accumulator)[0] """ topic_coherences = [] From 81732ef314a6272af3ee440674b2c1e91faa0549 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 21:25:58 +0500 Subject: [PATCH 23/39] fix types in direct_confirmation_measure --- gensim/topic_coherence/direct_confirmation_measure.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 419573254a..15858c91b8 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -24,8 +24,7 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w ---------- segmented_topics : list of lists of (int, int) Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, - :func:`~gensim.topic_coherence.segmentation.s_one_one`, and so on from the - :mod:`gensim.topic_coherence.segmentation` module. + :func:`~gensim.topic_coherence.segmentation.s_one_one`. accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. with_std : bool @@ -127,8 +126,7 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=F ---------- segmented_topics : list of lists of (int, int) Output from the :func:`~gensim.topic_coherence.segmentation.s_one_pre`, - :func:`~gensim.topic_coherence.segmentation.s_one_one`, and so on from the - :mod:`gensim.topic_coherence.segmentation` module. + :func:`~gensim.topic_coherence.segmentation.s_one_one`. accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. normalize : bool From 3c7b40181e84bade7ca1cd714c2cc74fb11f06d9 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 11 Dec 2017 21:46:15 +0500 Subject: [PATCH 24/39] partial fix indirect_confirmation_measure --- .../indirect_confirmation_measure.py | 72 +++++++------------ 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 66cd183d25..3cf0cff8ba 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -6,28 +6,25 @@ r"""This module contains functions to compute confirmation on a pair of words or word subsets. 
-Notes ------ -The advantage of indirect confirmation measure is that it computes similarity of words in W' and -W* with respect to direct confirmations to all words. Eg. Suppose x and z are both competing +The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and +:math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing brands of cars, which semantically support each other. However, both brands are seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation measures may capture semantic support that direct measures would miss. The formula used to compute indirect confirmation measure is + .. math:: - m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) + \widetilde{m}_{sim(m, \gamma)}(W', W^{*}) = s_{sim}(\vec{v}^{\,}_{m,\gamma}(W'), \vec{v}^{\,}_{m,\gamma}(W^{*})) -where s_sim can be cosine, dice or jaccard similarity and -.. math:: - \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} +where :math:`s_{sim}` can be cosine, dice or jaccard similarity and + +.. math:: -Attributes: ------------ -m: direct confirmation measure. + \vec{v}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} """ @@ -37,49 +34,48 @@ import numpy as np import scipy.sparse as sps -from gensim.topic_coherence.direct_confirmation_measure import ( - aggregate_segment_sims, log_ratio_measure) +from gensim.topic_coherence.direct_confirmation_measure import aggregate_segment_sims, log_ratio_measure logger = logging.getLogger(__name__) def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_support=False): """For each topic segmentation, compute average cosine similarity using a - WordVectorsAccumulator. + :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator`. Parameters ---------- - segmented_topics : list of (list of tuples) - Output from the segmentation module of the segmented topics. - accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` - Word occurrence accumulator from probability_estimation. + segmented_topics : list of lists of (int, `numpy.ndarray`) + Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`. + accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` + Word occurrence accumulator. with_std : bool - True to also include standard deviation across topic segment - sets in addition to the mean coherence for each topic; default is False. + True to also include standard deviation across topic segment sets + in addition to the mean coherence for each topic. with_support : bool - True to also include support across topic segments. The - support is defined as the number of pairwise similarity comparisons were - used to compute the overall topic coherence. + True to also include support across topic segments. The support is defined as + the number of pairwise similarity comparisons were used to compute the overall topic coherence. Returns ------- - list - List of word2vec cosine similarities per topic. + list of (float[, float[, int]]) + Сosine word2vec similarities per topic (with std/support if `with_std`, `with_support`). 
Examples -------- - >>> from gensim.corpora.dictionary import Dictionary >>> import numpy as np + >>> from gensim.corpora.dictionary import Dictionary >>> from gensim.topic_coherence import indirect_confirmation_measure >>> from gensim.topic_coherence import text_analysis + >>> >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> + >>> # should be (0.726752426218 0.00695475919227) >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0] - >>> print mean, std - 0.726752426218 0.00695475919227 """ topic_coherences = [] @@ -113,21 +109,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', gamma=1, with_std=False, with_support=False): - r"""Calculate the indirect cosine measure. - - Given context vectors :math:`u = V(W') and w = V(W*)` for the - word sets of a pair :math:`S_i = (W', W*)` indirect cosine measure - is computed as the cosine similarity between u and w. - - The formula used is - .. math:: - - m_{sim}_{(m, \gamma)}(W', W*) = s_{sim}(\vec{V}^{\,}_{m,\gamma}(W'), \vec{V}^{\,}_{m,\gamma}(W*)) - - where each vector - .. math:: - - \vec{V}^{\,}_{m,\gamma}(W') = \Bigg \{{\sum_{w_{i} \in W'}^{ } m(w_{i}, w_{j})^{\gamma}}\Bigg \}_{j = 1,...,|W|} + """Calculate the indirect cosine measure. Parameters ---------- @@ -301,11 +283,11 @@ def _make_seg(self, segment_word_ids, topic_word_ids): def _pair_npmi(pair, accumulator): - """Compute normalized pairwise mutual information (NPMI) between a pair of words. + """Compute normalized pairwise mutual information (**NPMI**) between a pair of words. Parameters ---------- - pair : iterable + pair : iterable of The pair of words (word_id1, word_id2). accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. From 206784dd34666edaec4ffef7f67fb31e84117e34 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 12 Dec 2017 08:03:35 +0500 Subject: [PATCH 25/39] HotFix for probability_estimation and segmentation --- .../topic_coherence/probability_estimation.py | 27 +++++++++------- gensim/topic_coherence/segmentation.py | 31 +++++++++---------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index b3ca90eae1..dbac1e248d 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -25,7 +25,7 @@ def p_boolean_document(corpus, segmented_topics): ---------- corpus : list The corpus of documents. - segmented_topics : list of tuples + segmented_topics : list of list of (str,str) Output from the segmentation of topics. Tuples of (word_id_set1, word_id_set2). Could be simply topics too. Returns @@ -58,21 +58,24 @@ def p_boolean_document(corpus, segmented_topics): def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, processes=1): """Perform the boolean sliding window probability estimation. - Boolean sliding window determines word counts using a sliding window. The window - moves over the documents one word token per step. 
Each step defines a new virtual - document by copying the window content. Boolean document is applied to these virtual - documents to compute word probabilities. Parameters ---------- - texts : List of strings. + texts : List of str segmented_topics : list of tuples of (word_id_set1, word_id_set2) Output from the segmentation of topics. Could be simply topics too. - dictionary : + dictionary : :class:`~gensim.corpora.dictionary` Gensim dictionary mapping of the tokens and ids. - window_size : + window_size : int Size of the sliding window. 110 found out to be the ideal size for large corpora. + Notes + ----- + Boolean sliding window determines word counts using a sliding window. The window + moves over the documents one word token per step. Each step defines a new virtual + document by copying the window content. Boolean document is applied to these virtual + documents to compute word probabilities. + Returns ------- :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` @@ -113,16 +116,16 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= Parameters ---------- - texts : List of strings. + texts : List of str segmented_topics : list of tuples of (word_id_set1, word_id_set2) Output from the segmentation of topics. Could be simply topics too. - dictionary : + dictionary : :class:`~gensim.corpora.dictionary` Gensim dictionary mapping of the tokens and ids. - window_size : + window_size : int Size of the sliding window. processes: int Number of processes to use. - model: model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) If None, a new Word2Vec model is trained on the given text corpus. Otherwise, it should be a pre-trained Word2Vec context vectors. diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 3003f8b5ba..ff1ae5f256 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -13,21 +13,20 @@ def s_one_pre(topics): - """ - Performs s_one_pre segmentation on a list of topics. + """Performs s_one_pre segmentation on a list of topics. s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} Parameters ---------- - topics : list of topics + topics : list of np.array list of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] Returns ------- - list of list of (W', W*) tuples - For all unique topic ids. + list of list of (str, str). + (W', W*) for all unique topic ids. Examples -------- @@ -51,20 +50,19 @@ def s_one_pre(topics): def s_one_one(topics): - """ - Perform s_one_one segmentation on a list of topics. + """Perform s_one_one segmentation on a list of topics. s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} Parameters ---------- - topics : list of topics - List of topics obtained from an algorithm such as LDA. Is a list such as - [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. + topics : list of np.array + List of topics obtained from an algorithm such as LDA. + Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. Returns ------- - list of list of (W', W*) tuples - For all unique topic ids. + list of list of (str, str). + (W', W*) for all unique topic ids. 
Examples ------- @@ -91,20 +89,19 @@ def s_one_one(topics): def s_one_set(topics): - """ - Perform s_one_set segmentation on a list of topics. + """Perform s_one_set segmentation on a list of topics. s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} Parameters ---------- - topics : list of topics + topics : list of np.array List of topics obtained from an algorithm such as LDA. Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. Returns ------- - list of list of (W', W*) tuples - For all unique topic ids. + list of list of (str, str). + (W', W*) for all unique topic ids. Examples -------- From 67962be046a06bf94f7b25efb6ea79df1d538ae5 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Wed, 13 Dec 2017 01:45:35 +0500 Subject: [PATCH 26/39] Refactoring for probability_estimation --- .../topic_coherence/probability_estimation.py | 77 ++++++++++++------- 1 file changed, 48 insertions(+), 29 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index dbac1e248d..2f9971140a 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains functions to perform segmentation on a list of topics. -""" +"""This module contains functions to perform segmentation on a list of topics.""" import itertools import logging @@ -25,8 +24,8 @@ def p_boolean_document(corpus, segmented_topics): ---------- corpus : list The corpus of documents. - segmented_topics : list of list of (str,str) - Output from the segmentation of topics. Tuples of (word_id_set1, word_id_set2). Could be simply topics too. + segmented_topics: list of (int, int). + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. 
Returns ------- @@ -37,19 +36,27 @@ def p_boolean_document(corpus, segmented_topics): --------- >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary - >>> from gensim.corpora.dictionary import Dictionary - >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> + >>> # create dictionary + >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] >>> dictionary = HashDictionary(texts) >>> token2id = dictionary.token2id >>> computer_id = token2id['computer'] >>> system_id = token2id['system'] >>> user_id = token2id['user'] >>> graph_id = token2id['graph'] + >>> + >>> # create segmented_topics >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> + >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] - >>> probability_estimation.p_boolean_document(corpus, segmented_topics) + >>> + >>> # result.index_to_dict() should be {10608: set([0]), 12736: set([1, 3]), 18451: set([5]), 5798: set([1, 2])} + >>> result = probability_estimation.p_boolean_document(corpus, segmented_topics) + >>> result.index_to_dict() """ top_ids = unique_ids_from_segments(segmented_topics) @@ -61,9 +68,9 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Parameters ---------- - texts : List of str - segmented_topics : list of tuples of (word_id_set1, word_id_set2) - Output from the segmentation of topics. Could be simply topics too. + texts : list of str + segmented_topics: list of (int, int). + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. dictionary : :class:`~gensim.corpora.dictionary` Gensim dictionary mapping of the tokens and ids. 
window_size : int @@ -85,21 +92,27 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p --------- >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary - >>> from gensim.corpora.dictionary import Dictionary - >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> + >>> # create dictionary + >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] >>> dictionary = HashDictionary(texts) >>> token2id = dictionary.token2id >>> computer_id = token2id['computer'] >>> system_id = token2id['system'] >>> user_id = token2id['user'] >>> graph_id = token2id['graph'] - >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ - >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> + >>> # create segmented_topics + >>> segmented_topics = [[(system_id, graph_id), (computer_id, graph_id), (computer_id, system_id)], [ + >>> (computer_id, graph_id), (user_id, graph_id), (user_id, computer_id)]] + >>> + >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> accumulator = probability_estimation.p_boolean_sliding_window(texts, segmented_topics, dictionary, 2) + >>> + >>> # should be 1 3 1 4 >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] - 1 3 1 4 """ top_ids = unique_ids_from_segments(segmented_topics) @@ -116,7 +129,7 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= Parameters ---------- - texts : List of str + texts : list of str segmented_topics : list of tuples of (word_id_set1, word_id_set2) Output from the segmentation of topics. Could be simply topics too. 
dictionary : :class:`~gensim.corpora.dictionary` @@ -139,23 +152,30 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary >>> from gensim.models import word2vec - >>> from gensim.corpora.dictionary import Dictionary - >>> texts = [['human', 'interface', 'computer'],['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'],['user', 'response', 'time'],['trees'],['graph', 'trees']] + >>> + >>> # create dictionary + >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], + >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] >>> dictionary = HashDictionary(texts) >>> token2id = dictionary.token2id >>> computer_id = token2id['computer'] >>> system_id = token2id['system'] >>> user_id = token2id['user'] >>> graph_id = token2id['graph'] - >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ - >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> + >>> # create segmented_topics + >>> segmented_topics = [[(system_id, graph_id), (computer_id, graph_id), (computer_id, system_id)], [ + >>> (computer_id, graph_id), (user_id, graph_id), (user_id, computer_id)]] + >>> + >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> sentences = [['human', 'interface', 'computer'],['survey', 'user', 'computer', 'system', 'response', 'time']] - >>> model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) #TODO Ivan fix this holy shield + >>> model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) #TODO Ivan please fix this holy shield >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model) + >>> + >>> # next string should return 1 3 1 4 example for model = None >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] - 1 3 1 4 # example for model = None + """ top_ids = unique_ids_from_segments(segmented_topics) accumulator = WordVectorsAccumulator( @@ -168,8 +188,8 @@ def unique_ids_from_segments(segmented_topics): Parameters ---------- - segmented_topics: list of tuples of (word_id_set1, word_id_set2). - Each word_id_setis either a single integer, or a `numpy.ndarray` of integers. + segmented_topics: list of (int, int). + Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. Returns ------- @@ -180,9 +200,8 @@ def unique_ids_from_segments(segmented_topics): ------- >>> from gensim.topic_coherence import probability_estimation >>> segmentation = [[(1, 2)]] + >>> # should be set([1, 2]) >>> probability_estimation.unique_ids_from_segments(segmentation) - set([1, 2]) - """ unique_ids = set() # is a set of all the unique ids contained in topics. 
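
For orientation, the helpers documented in this patch are meant to be composed: a segmentation of topic word ids feeds `p_boolean_document`, and the resulting accumulator feeds a confirmation measure. Below is a minimal, self-contained sketch of that U_mass-style chain. The toy `texts` and the two `topics` arrays are illustration data only (not part of the patch), and the final aggregation is simply `numpy.mean` over the per-topic confirmations.

    import numpy as np

    from gensim.corpora.hashdictionary import HashDictionary
    from gensim.topic_coherence import (
        direct_confirmation_measure, probability_estimation, segmentation)

    # Toy corpus, reusing the texts from the doctests above.
    texts = [
        ['human', 'interface', 'computer'],
        ['eps', 'user', 'interface', 'system'],
        ['system', 'human', 'system', 'eps'],
        ['user', 'response', 'time'],
        ['trees'],
        ['graph', 'trees'],
    ]
    dictionary = HashDictionary(texts)
    w2id = dictionary.token2id

    # Two made-up topics, each given as an array of word ids.
    topics = [
        np.array([w2id['system'], w2id['computer'], w2id['graph']]),
        np.array([w2id['user'], w2id['graph'], w2id['computer']]),
    ]

    # segmentation -> boolean-document probability estimation -> confirmation -> aggregation
    segmented_topics = segmentation.s_one_pre(topics)
    corpus = [dictionary.doc2bow(text) for text in texts]
    accumulator = probability_estimation.p_boolean_document(corpus, segmented_topics)
    confirmations = direct_confirmation_measure.log_conditional_probability(
        segmented_topics, accumulator)
    print(np.mean(confirmations))  # u_mass-style coherence of the two toy topics
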
From 74c5c8697f7c36e7b0752fd485dcef6f4ae2ee95 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Fri, 15 Dec 2017 01:13:23 +0500 Subject: [PATCH 27/39] Changes for indirect_confirmation_measure --- .../indirect_confirmation_measure.py | 80 +++++++++++-------- 1 file changed, 47 insertions(+), 33 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 3cf0cff8ba..31349c0716 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -6,6 +6,8 @@ r"""This module contains functions to compute confirmation on a pair of words or word subsets. +Notes +----- The advantage of indirect confirmation measure is that it computes similarity of words in :math:`W'` and :math:`W^{*}` with respect to direct confirmations to all words. Eg. Suppose `x` and `z` are both competing brands of cars, which semantically support each other. However, both brands are seldom mentioned @@ -68,7 +70,10 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp >>> from gensim.topic_coherence import indirect_confirmation_measure >>> from gensim.topic_coherence import text_analysis >>> + >>> # create segmentation >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] + >>> + >>> # create accumulator >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) @@ -113,14 +118,14 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', Parameters ---------- - segmented_topics: list of (list of tuples) + segmented_topics: list of lists of (int, `numpy.ndarray`) Output from the segmentation module of the segmented topics. accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Output from the probability_estimation module. Is an topics: Topics obtained from the trained topic model. measure : str Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). gamma: float - Gamma value for computing W', W* vectors. + Gamma value for computing :math:`W'` and :math:`W^{*}` vectors. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic; default is False. @@ -138,12 +143,18 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', >>> from gensim.corpora.dictionary import Dictionary >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis >>> import numpy as np + >>> + >>> # create accumulator >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} >>> accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary) >>> accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}} >>> accumulator._num_docs = 5 + >>> + >>> # create topics >>> topics = [np.array([1, 2])] + >>> + >>> # create segmentation >>> segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]] >>> obtained = indirect_confirmation_measure.cosine_similarity(segmentation, accumulator, topics, 'nlr', 1) >>> print obtained[0] @@ -169,6 +180,17 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', class ContextVectorComputer(object): """Lazily compute context vectors for topic segments. + Parameters + ---------- + measure: str + Confirmation measure. + topics: list + Topics. 
+ accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + Word occurrence accumulator from probability_estimation. + gamma: float + Value for computing vectors. + Attributes ---------- sim_cache: dict @@ -176,39 +198,31 @@ class ContextVectorComputer(object): context_vector_cache: dict Mapping from (segment, topic_words) --> context_vector. + Example + ------- + >>> from gensim.corpora.dictionary import Dictionary + >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis + >>> import numpy as np + >>> + >>> # create measure, topics + >>> measure = 'nlr' + >>> topics = [np.array([1, 2])] + >>> + >>> # create accumulator + >>> dictionary = Dictionary() + >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} + >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) + >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator,1) + >>> # should be {1: 0, 2: 1} + >>> cont_vect_comp.mapping + >>> # should be 2 + >>> cont_vect_comp.vocab_size + """ def __init__(self, measure, topics, accumulator, gamma): - """ - Parameters - ---------- - measure: str - Confirmation measure. - topics: list - Topics. - accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` - Word occurrence accumulator from probability_estimation. - gamma: float - Value for computing vectors. - Example - ------- - >>> from gensim.corpora.dictionary import Dictionary - >>> from gensim.topic_coherence import indirect_confirmation_measure, text_analysis - >>> import numpy as np - >>> measure = 'nlr' - >>> top = [np.array([1, 2])] - >>> dictionary = Dictionary() - >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} - >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) - >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) - >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, top, accumulator,1) - >>> cont_vect_comp.mapping - {1: 0, 2: 1} - >>> cont_vect_comp.vocab_size - 2 - - """ if measure == 'nlr': self.similarity = _pair_npmi else: @@ -242,7 +256,7 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): Example --------- - In progress + #TODO Need help with understanding parameters' types. """ key = _key_for_segment(segment_word_ids, topic_word_ids) @@ -287,7 +301,7 @@ def _pair_npmi(pair, accumulator): Parameters ---------- - pair : iterable of + pair : (str, str) The pair of words (word_id1, word_id2). accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. 
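
Putting the pieces from this patch together, the sketch below mirrors the doctests being added: the inverted-index accumulator is filled in by hand (assigning `_inverted_index` and `_num_docs` directly is test scaffolding, not normal usage — in practice the accumulator comes from the probability_estimation module), and the same s_one_set-style segmentation is scored both with NPMI context vectors and with word2vec similarity. The id-to-token mapping and the single toy topic are illustration data only.

    import numpy as np

    from gensim.corpora.dictionary import Dictionary
    from gensim.topic_coherence import indirect_confirmation_measure, text_analysis

    # Toy vocabulary and one topic made of the two word ids.
    dictionary = Dictionary()
    dictionary.id2token = {1: 'fake', 2: 'tokens'}
    topics = [np.array([1, 2])]

    # s_one_set-style segmentation: every topic word paired with the full topic.
    segmentation = [[(1, np.array([1, 2])), (2, np.array([1, 2]))]]

    # Cosine similarity over NPMI context vectors, using a hand-filled inverted index.
    accumulator = text_analysis.InvertedIndexAccumulator({1, 2}, dictionary)
    accumulator._inverted_index = {0: {2, 3, 4}, 1: {3, 5}}
    accumulator._num_docs = 5
    print(indirect_confirmation_measure.cosine_similarity(
        segmentation, accumulator, topics, measure='nlr', gamma=1)[0])

    # Word2vec-based similarity, with vectors trained on two tiny toy texts.
    wv_accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary)
    wv_accumulator.accumulate([['fake', 'tokens'], ['tokens', 'fake']], 5)
    print(indirect_confirmation_measure.word2vec_similarity(segmentation, wv_accumulator)[0])
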
From ef058df15642efc94d62a251e9d5350dba7942bf Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 19 Dec 2017 01:34:32 +0500 Subject: [PATCH 28/39] Fixed segmentation, partly fixed text_analysis --- gensim/topic_coherence/segmentation.py | 26 +++++++++++++------------ gensim/topic_coherence/text_analysis.py | 24 ++++++++++------------- 2 files changed, 24 insertions(+), 26 deletions(-) diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index ff1ae5f256..239d729834 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -4,8 +4,7 @@ # Copyright (C) 2013 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""This module contains functions to perform segmentation on a list of topics. -""" +"""This module contains functions to perform segmentation on a list of topics.""" import logging @@ -15,7 +14,10 @@ def s_one_pre(topics): """Performs s_one_pre segmentation on a list of topics. - s_one_pre segmentation is defined as: s_one_pre = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i > j} + Notes + ----- + s_one_pre segmentation is defined as + :math:`s_{pre} = {(W', W^{*}) | W' = w_{i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i > j}` Parameters ---------- @@ -26,15 +28,15 @@ def s_one_pre(topics): Returns ------- list of list of (str, str). - (W', W*) for all unique topic ids. + :math:`(W', W^{*})` for all unique topic ids. Examples -------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> # should be [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] >>> segmentation.s_one_pre(topics) - [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] """ s_one_pre_res = [] @@ -51,7 +53,8 @@ def s_one_pre(topics): def s_one_one(topics): """Perform s_one_one segmentation on a list of topics. - s_one_one segmentation is defined as: s_one_one = {(W', W*) | W' = {w_i}; W* = {w_j}; w_i, w_j belongs to W; i != j} + s_one_one segmentation is defined as + :math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i != j}` #TODO: neq - doesn't work Parameters ---------- @@ -62,16 +65,15 @@ def s_one_one(topics): Returns ------- list of list of (str, str). - (W', W*) for all unique topic ids. + :math:`(W', W^{*})` for all unique topic ids. Examples ------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] + >>> # should be [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] >>> segmentation.s_one_pre(topics) - [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - """ s_one_one_res = [] @@ -90,7 +92,7 @@ def s_one_one(topics): def s_one_set(topics): """Perform s_one_set segmentation on a list of topics. - s_one_set segmentation is defined as: s_one_set = {(W', W*) | W' = {w_i}; w_i belongs to W; W* = W} + s_one_set segmentation is defined as :math:`s_{set} = {(W', W^{*}) | W' = {w_i}; w_{i} \in W; W^{*} = W}` Parameters ---------- @@ -101,15 +103,15 @@ def s_one_set(topics): Returns ------- list of list of (str, str). - (W', W*) for all unique topic ids. + :math:`(W', W^{*})` for all unique topic ids. 
Examples -------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation >>> topics = [np.array([9, 10, 7])] + >>> # should be [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] >>> segmentation.s_one_set(topics) - [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] """ s_one_set_res = [] diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 87dd6caeee..db8c9e113e 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -57,8 +57,8 @@ def _ids_to_words(ids, dictionary): >>> dictionary = Dictionary() >>> ids = {1: 'fake', 4: 'cats'} >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} + >>> # should be set(['cats', 'fake']) >>> text_analysis._ids_to_words(ids, dictionary) - set(['cats', 'fake']) """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary @@ -94,8 +94,8 @@ class BaseAnalyzer(object): >>> from gensim.topic_coherence import text_analysis >>> ids = {1: 'fake', 4: 'cats'} >>> base = text_analysis.BaseAnalyzer(ids) + >>> # should return {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 >>> print base.relevant_ids, base._vocab_size, base.id2contiguous, base.log_every, base._num_docs - {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 """ def __init__(self, relevant_ids): @@ -161,11 +161,10 @@ class UsesDictionary(BaseAnalyzer): >>> ids = {1: 'foo', 2: 'bar'} >>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) >>> usesdict = text_analysis.UsesDictionary(ids, dictionary) + >>> # should be set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} >>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id - set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} """ - def __init__(self, relevant_ids, dictionary): super(UsesDictionary, self).__init__(relevant_ids) self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) @@ -202,11 +201,10 @@ class InvertedIndexBased(BaseAnalyzer): >>> from gensim.topic_coherence import text_analysis >>> ids = {1: 'fake', 4: 'cats'} >>> ininb = text_analysis.InvertedIndexBased(ids) + >>> # should be [set([]) set([])] >>> print ininb._inverted_index - [set([]) set([])] """ - def __init__(self, *args): super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) @@ -228,8 +226,7 @@ class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text, doc_num=None): - """ - + """Build an inverted index from a sequence of corpus texts. Examples -------- @@ -241,7 +238,7 @@ def analyze_text(self, text, doc_num=None): >> > print corac._inverted_index - # Doesn't work + #TODO: Doesn't work """ doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) @@ -262,7 +259,7 @@ class WindowedTextsAnalyzer(UsesDictionary): ---------- relevant_words : set Set of words. - dictionary: tuple + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` Dictionary instance with mappings for the relevant_ids. 
""" @@ -290,7 +287,7 @@ def _iter_texts(self, texts): for w in text], dtype=dtype) def text_is_relevant(self, text): - """Return True if the text has any relevant words, else False.""" + """Check if the text has any relevant words.""" for word in text: if word in self.relevant_words: return True @@ -439,7 +436,7 @@ def start_workers(self, window_size): Returns ------- - tuple + (list of lists) Tuple of (list of workers, input queue, output queue). """ input_q = mp.Queue(maxsize=self.processes) @@ -454,8 +451,7 @@ def start_workers(self, window_size): return workers, input_q, output_q def yield_batches(self, texts): - """Return a generator over the given texts that yields batches of `batch_size` texts at a time. - """ + """Return a generator over the given texts that yields batches of `batch_size` texts at a time.""" batch = [] for text in self._iter_texts(texts): batch.append(text) From 0b0646834531780603773ff5921759280f9a8a72 Mon Sep 17 00:00:00 2001 From: CLearERR Date: Tue, 19 Dec 2017 01:53:10 +0500 Subject: [PATCH 29/39] Add Notes for text_analysis --- gensim/topic_coherence/text_analysis.py | 27 ++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index db8c9e113e..8c940be293 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -324,10 +324,14 @@ def accumulate(self, texts, window_size): return self def partial_accumulate(self, texts, window_size): - """Meant to be called several times to accumulate partial results. The final - accumulation should be performed with the `accumulate` method as opposed to this one. + """Meant to be called several times to accumulate partial results. + + Notes + ----- + The final accumulation should be performed with the `accumulate` method as opposed to this one. This method does not ensure the co-occurrence matrix is in lil format and does not symmetrize it after accumulation. + """ self._current_doc_num = -1 self._token_at_edge = None @@ -359,8 +363,12 @@ def _slide_window(self, window, doc_num): def _symmetrize(self): """Word pairs may have been encountered in (i, j) and (j, i) order. + + Notes + ----- Rather than enforcing a particular ordering during the update process, we choose to symmetrize the co-occurrence matrix after accumulation has completed. + """ co_occ = self._co_occurrences co_occ.setdiag(self._occurrences) # diagonal should be equal to occurrence counts @@ -380,9 +388,7 @@ def merge(self, other): class PatchedWordOccurrenceAccumulator(WordOccurrenceAccumulator): - """Monkey patched for multiprocessing worker usage, - to move some of the logic to the master process. - """ + """Monkey patched for multiprocessing worker usage, to move some of the logic to the master process.""" def _iter_texts(self, texts): return texts # master process will handle this @@ -427,6 +433,9 @@ def accumulate(self, texts, window_size): def start_workers(self, window_size): """Set up an input and output queue and start processes for each worker. + + Notes + ----- The input queue is used to transmit batches of documents to the workers. The output queue is used by workers to transmit the WordOccurrenceAccumulator instances. @@ -477,16 +486,20 @@ def queue_all_texts(self, q, texts, window_size): def terminate_workers(self, input_q, output_q, workers, interrupted=False): """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, - then terminate each. 
We do not use join here because it has been shown to have some issues + then terminate each. + + Notes + ----- + We do not use join here because it has been shown to have some issues in Python 2.7 (and even in later versions). This method also closes both the input and output queue. - If `interrupted` is False (normal execution), a None value is placed on the input queue for each worker. The workers are looking for this sentinel value and interpret it as a signal to terminate themselves. If `interrupted` is True, a KeyboardInterrupt occurred. The workers are programmed to recover from this and continue on to transmit their results before terminating. So in this instance, the sentinel values are not queued, but the rest of the execution continues as usual. + """ if not interrupted: for _ in workers: From e3779d4bc20b59bb3c72bf3ed0f1c810d83f965d Mon Sep 17 00:00:00 2001 From: ivan Date: Tue, 19 Dec 2017 16:56:35 +0500 Subject: [PATCH 30/39] fix di/ind --- .../direct_confirmation_measure.py | 2 +- .../indirect_confirmation_measure.py | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index 15858c91b8..6caf505020 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -86,7 +86,7 @@ def aggregate_segment_sims(segment_sims, with_std, with_support): Parameters ---------- - segment_sims : list of float + segment_sims : iterable of float Similarity values to aggregate. with_std : bool Set to True to include standard deviation. diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 31349c0716..7ed368bc4d 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -49,7 +49,8 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp ---------- segmented_topics : list of lists of (int, `numpy.ndarray`) Output from the :func:`~gensim.topic_coherence.segmentation.s_one_set`. - accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` + accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator. with_std : bool True to also include standard deviation across topic segment sets @@ -77,7 +78,7 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) - >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> _ = accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) >>> >>> # should be (0.726752426218 0.00695475919227) >>> mean, std = indirect_confirmation_measure.word2vec_similarity(segmentation, accumulator, with_std=True)[0] @@ -186,7 +187,8 @@ class ContextVectorComputer(object): Confirmation measure. topics: list Topics. - accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or + :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. 
gamma: float Value for computing vectors. @@ -212,12 +214,12 @@ class ContextVectorComputer(object): >>> dictionary = Dictionary() >>> dictionary.id2token = {1: 'fake', 2: 'tokens'} >>> accumulator = text_analysis.WordVectorsAccumulator({1, 2}, dictionary) - >>> accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) - >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator,1) - >>> # should be {1: 0, 2: 1} + >>> _ = accumulator.accumulate([['fake', 'tokens'],['tokens', 'fake']], 5) + >>> cont_vect_comp = indirect_confirmation_measure.ContextVectorComputer(measure, topics, accumulator, 1) >>> cont_vect_comp.mapping - >>> # should be 2 + {1: 0, 2: 1} >>> cont_vect_comp.vocab_size + 2 """ @@ -267,11 +269,11 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): return context_vector def _make_seg(self, segment_word_ids, topic_word_ids): - """Return context vectors for segmentations (Internal helper function). + """Return context vectors for segmentation (Internal helper function). Parameters ---------- - segment_word_ids : list + segment_word_ids : iterable or int Ids of words in segment. topic_word_ids : list Ids of words in topic. @@ -301,7 +303,7 @@ def _pair_npmi(pair, accumulator): Parameters ---------- - pair : (str, str) + pair : (int, int) The pair of words (word_id1, word_id2). accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from probability_estimation. From 482377beef74cd293d2476653f134103a71b1d60 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 04:58:03 +0500 Subject: [PATCH 31/39] fix doc examples in probability_estimation --- .../topic_coherence/probability_estimation.py | 86 +++++++++++-------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 2f9971140a..4e453df609 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -37,26 +37,31 @@ def p_boolean_document(corpus, segmented_topics): >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary >>> - >>> # create dictionary - >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] + >>> + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] >>> dictionary = HashDictionary(texts) - >>> token2id = dictionary.token2id - >>> computer_id = token2id['computer'] - >>> system_id = token2id['system'] - >>> user_id = token2id['user'] - >>> graph_id = token2id['graph'] + >>> w2id = dictionary.token2id + >>> >>> # create segmented_topics - >>> segmented_topics = [[(system_id, graph_id),(computer_id, graph_id),(computer_id, system_id)], [ - >>> (computer_id, graph_id),(user_id, graph_id),(user_id, computer_id)]] + >>> segmented_topics = [ + ... [(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... 
] >>> >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> - >>> # result.index_to_dict() should be {10608: set([0]), 12736: set([1, 3]), 18451: set([5]), 5798: set([1, 2])} >>> result = probability_estimation.p_boolean_document(corpus, segmented_topics) >>> result.index_to_dict() + {10608: set([0]), 12736: set([1, 3]), 18451: set([5]), 5798: set([1, 2])} """ top_ids = unique_ids_from_segments(segmented_topics) @@ -93,26 +98,31 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p >>> from gensim.topic_coherence import probability_estimation >>> from gensim.corpora.hashdictionary import HashDictionary >>> - >>> # create dictionary - >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] + >>> + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] >>> dictionary = HashDictionary(texts) - >>> token2id = dictionary.token2id - >>> computer_id = token2id['computer'] - >>> system_id = token2id['system'] - >>> user_id = token2id['user'] - >>> graph_id = token2id['graph'] + >>> w2id = dictionary.token2id + >>> >>> # create segmented_topics - >>> segmented_topics = [[(system_id, graph_id), (computer_id, graph_id), (computer_id, system_id)], [ - >>> (computer_id, graph_id), (user_id, graph_id), (user_id, computer_id)]] + >>> segmented_topics = [ + ... [(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... ] >>> >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> accumulator = probability_estimation.p_boolean_sliding_window(texts, segmented_topics, dictionary, 2) >>> - >>> # should be 1 3 1 4 - >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] + >>> (accumulator[w2id['computer']], accumulator[w2id['user']], accumulator[w2id['system']]) + (1, 3, 4) """ top_ids = unique_ids_from_segments(segmented_topics) @@ -153,28 +163,29 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= >>> from gensim.corpora.hashdictionary import HashDictionary >>> from gensim.models import word2vec >>> - >>> # create dictionary - >>> texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system'], - >>> ['system', 'human', 'system', 'eps'], ['user', 'response', 'time'], ['trees'], ['graph', 'trees']] + >>> texts = [ + ... ['human', 'interface', 'computer'], + ... ['eps', 'user', 'interface', 'system'], + ... ['system', 'human', 'system', 'eps'], + ... ['user', 'response', 'time'], + ... ['trees'], + ... ['graph', 'trees'] + ... ] >>> dictionary = HashDictionary(texts) - >>> token2id = dictionary.token2id - >>> computer_id = token2id['computer'] - >>> system_id = token2id['system'] - >>> user_id = token2id['user'] - >>> graph_id = token2id['graph'] + >>> w2id = dictionary.token2id + >>> >>> # create segmented_topics - >>> segmented_topics = [[(system_id, graph_id), (computer_id, graph_id), (computer_id, system_id)], [ - >>> (computer_id, graph_id), (user_id, graph_id), (user_id, computer_id)]] + >>> segmented_topics = [ + ... 
[(w2id['system'], w2id['graph']),(w2id['computer'], w2id['graph']),(w2id['computer'], w2id['system'])], + ... [(w2id['computer'], w2id['graph']),(w2id['user'], w2id['graph']),(w2id['user'], w2id['computer'])] + ... ] >>> >>> # create corpus >>> corpus = [dictionary.doc2bow(text) for text in texts] >>> sentences = [['human', 'interface', 'computer'],['survey', 'user', 'computer', 'system', 'response', 'time']] - >>> model = word2vec.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) #TODO Ivan please fix this holy shield + >>> model = word2vec.Word2Vec(sentences, size=100,min_count=1) >>> accumulator = probability_estimation.p_word2vec(texts, segmented_topics, dictionary, 2, 1, model) - >>> - >>> # next string should return 1 3 1 4 example for model = None - >>> print accumulator[computer_id], accumulator[user_id], accumulator[graph_id], accumulator[system_id] """ top_ids = unique_ids_from_segments(segmented_topics) @@ -202,6 +213,7 @@ def unique_ids_from_segments(segmented_topics): >>> segmentation = [[(1, 2)]] >>> # should be set([1, 2]) >>> probability_estimation.unique_ids_from_segments(segmentation) + set([1, 2]) """ unique_ids = set() # is a set of all the unique ids contained in topics. From acdebb19ea7e5047e852e9e3ba7a1eac8abf279e Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 05:41:06 +0500 Subject: [PATCH 32/39] fix probability_estimation --- .../topic_coherence/probability_estimation.py | 36 +++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 4e453df609..905cf4e5ef 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -22,14 +22,14 @@ def p_boolean_document(corpus, segmented_topics): Parameters ---------- - corpus : list + corpus : iterable of list of (int, int) The corpus of documents. segmented_topics: list of (int, int). Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. Returns ------- - :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + :class:`~gensim.topic_coherence.text_analysis.CorpusAccumulator` Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. Examples @@ -73,13 +73,17 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Parameters ---------- - texts : list of str - segmented_topics: list of (int, int). + texts : iterable of iterable of str + Input text + segmented_topics: list of (int, int) Each tuple (word_id_set1, word_id_set2) is either a single integer, or a `numpy.ndarray` of integers. - dictionary : :class:`~gensim.corpora.dictionary` + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` Gensim dictionary mapping of the tokens and ids. window_size : int - Size of the sliding window. 110 found out to be the ideal size for large corpora. + Size of the sliding window, 110 found out to be the ideal size for large corpora. 
+ processes : int, optional + Number of process that will be used for + :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator` Notes ----- @@ -90,8 +94,11 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p Returns ------- - :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` - Word occurrence accumulator instance that can be used to lookup token frequencies and co-occurrence frequencies. + :class:`~gensim.topic_coherence.text_analysis.WordOccurrenceAccumulator` + if `processes` = 1 OR + :class:`~gensim.topic_coherence.text_analysis.ParallelWordOccurrenceAccumulator` + otherwise. This is word occurrence accumulator instance that can be used to lookup + token frequencies and co-occurrence frequencies. Examples --------- @@ -135,26 +142,27 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size, p def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes=1, model=None): - """Train word2vec model on `texts` if model is not None. + """Train word2vec model on `texts` if `model` is not None. Parameters ---------- - texts : list of str - segmented_topics : list of tuples of (word_id_set1, word_id_set2) + texts : iterable of iterable of str + Input text + segmented_topics : iterable of iterable of str Output from the segmentation of topics. Could be simply topics too. dictionary : :class:`~gensim.corpora.dictionary` Gensim dictionary mapping of the tokens and ids. window_size : int Size of the sliding window. - processes: int + processes : int Number of processes to use. - model: Word2Vec (:class:`~gensim.models.keyedvectors.KeyedVectors`) + model : :class:`~gensim.models.word2vec.Word2Vec` or :class:`~gensim.models.keyedvectors.KeyedVectors`, optional If None, a new Word2Vec model is trained on the given text corpus. Otherwise, it should be a pre-trained Word2Vec context vectors. Returns ------- - :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` + :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` Text accumulator with trained context vectors. Examples From 8a07deeb8bb2626ca6b8048b7363a55c693bddb5 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 05:52:14 +0500 Subject: [PATCH 33/39] fix segmentation --- gensim/topic_coherence/segmentation.py | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 239d729834..9629369b42 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -12,31 +12,31 @@ def s_one_pre(topics): - """Performs s_one_pre segmentation on a list of topics. + """Performs segmentation on a list of topics. Notes ----- - s_one_pre segmentation is defined as - :math:`s_{pre} = {(W', W^{*}) | W' = w_{i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i > j}` + Segmentation is defined as + :math:`s_{pre} = {(W', W^{*}) | W' = w_{i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i > j}`. Parameters ---------- topics : list of np.array - list of topics obtained from an algorithm such as LDA. Is a list such as - [array([ 9, 10, 11]), array([ 9, 10, 7]), ...] + list of topics obtained from an algorithm such as LDA. Returns ------- - list of list of (str, str). + list of list of (int, int) :math:`(W', W^{*})` for all unique topic ids. 
Examples -------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation + >>> >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> # should be [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] >>> segmentation.s_one_pre(topics) + [[(2, 1), (3, 1), (3, 2)], [(5, 4), (6, 4), (6, 5)]] """ s_one_pre_res = [] @@ -52,28 +52,29 @@ def s_one_pre(topics): def s_one_one(topics): - """Perform s_one_one segmentation on a list of topics. - s_one_one segmentation is defined as - :math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i != j}` #TODO: neq - doesn't work + """Perform segmentation on a list of topics. + Segmentation is defined as + :math:`s_{one} = {(W', W^{*}) | W' = {w_i}; W^{*} = {w_j}; w_{i}, w_{j} \in W; i \\neq j}`. Parameters ---------- - topics : list of np.array + topics : list of `numpy.ndarray` List of topics obtained from an algorithm such as LDA. - Is a list such as [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. Returns ------- - list of list of (str, str). + list of list of (int, int). :math:`(W', W^{*})` for all unique topic ids. Examples ------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation + >>> >>> topics = [np.array([1, 2, 3]), np.array([4, 5, 6])] - >>> # should be [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] - >>> segmentation.s_one_pre(topics) + >>> segmentation.s_one_one(topics) + [[(1, 2), (1, 3), (2, 1), (2, 3), (3, 1), (3, 2)], [(4, 5), (4, 6), (5, 4), (5, 6), (6, 4), (6, 5)]] + """ s_one_one_res = [] @@ -92,26 +93,27 @@ def s_one_one(topics): def s_one_set(topics): """Perform s_one_set segmentation on a list of topics. - s_one_set segmentation is defined as :math:`s_{set} = {(W', W^{*}) | W' = {w_i}; w_{i} \in W; W^{*} = W}` + Segmentation is defined as + :math:`s_{set} = {(W', W^{*}) | W' = {w_i}; w_{i} \in W; W^{*} = W}` Parameters ---------- - topics : list of np.array - List of topics obtained from an algorithm such as LDA. Is a list such as - [array([ 9, 10, 11]), array([ 9, 10, 7]), ...]. + topics : list of `numpy.ndarray` + List of topics obtained from an algorithm such as LDA. Returns ------- - list of list of (str, str). + list of list of (int, int). :math:`(W', W^{*})` for all unique topic ids. 
Examples -------- >>> import numpy as np >>> from gensim.topic_coherence import segmentation + >>> >>> topics = [np.array([9, 10, 7])] - >>> # should be [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] >>> segmentation.s_one_set(topics) + [[(9, array([ 9, 10, 7])), (10, array([ 9, 10, 7])), (7, array([ 9, 10, 7]))]] """ s_one_set_res = [] From 63c35c2a47428c0a1e38042bcdf4d51cc97a5b68 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 05:57:07 +0500 Subject: [PATCH 34/39] fix docstring in probability_estimation --- gensim/topic_coherence/probability_estimation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index 905cf4e5ef..b9c7e17050 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -218,8 +218,8 @@ def unique_ids_from_segments(segmented_topics): Example ------- >>> from gensim.topic_coherence import probability_estimation + >>> >>> segmentation = [[(1, 2)]] - >>> # should be set([1, 2]) >>> probability_estimation.unique_ids_from_segments(segmentation) set([1, 2]) From 4b63f6cc8fa168ecb8dce2482d873689ae006f43 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 12:26:08 +0500 Subject: [PATCH 35/39] partial fix test_analysis --- docs/src/topic_coherence/text_analysis.rst | 1 + gensim/topic_coherence/text_analysis.py | 148 ++++++++++++--------- 2 files changed, 86 insertions(+), 63 deletions(-) diff --git a/docs/src/topic_coherence/text_analysis.rst b/docs/src/topic_coherence/text_analysis.rst index f4e3f7254e..ec9e14a795 100644 --- a/docs/src/topic_coherence/text_analysis.rst +++ b/docs/src/topic_coherence/text_analysis.rst @@ -7,3 +7,4 @@ :inherited-members: :undoc-members: :show-inheritance: + :special-members: __getitem__ diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index 8c940be293..e53633eefd 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -5,19 +5,7 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """This module contains classes for analyzing the texts of a corpus to accumulate -statistical information about word occurrences. - -Example for UsesDictionary --------------------------- ->>> from gensim.topic_coherence import text_analysis ->>> from gensim.corpora.dictionary import Dictionary ->>> ids = {1: 'foo', 2: 'bar'} ->>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) ->>> usesdict = text_analysis.UsesDictionary(ids, dictionary) ->>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id -set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} - -""" +statistical information about word occurrences.""" import itertools import logging @@ -43,7 +31,8 @@ def _ids_to_words(ids, dictionary): ---------- ids: dict Dictionary of ids and their words. 
- dictionary: :class:`~gensim.corpora.dictionary` + dictionary: :class:`~gensim.corpora.dictionary.Dictionary` + Input gensim dictionary Returns ------- @@ -54,11 +43,13 @@ def _ids_to_words(ids, dictionary): -------- >>> from gensim.corpora.dictionary import Dictionary >>> from gensim.topic_coherence import text_analysis + >>> >>> dictionary = Dictionary() >>> ids = {1: 'fake', 4: 'cats'} >>> dictionary.id2token = {1: 'fake', 2: 'tokens', 3: 'rabbids', 4: 'cats'} - >>> # should be set(['cats', 'fake']) + >>> >>> text_analysis._ids_to_words(ids, dictionary) + set(['cats', 'fake']) """ if not dictionary.id2token: # may not be initialized in the standard gensim.corpora.Dictionary @@ -81,24 +72,35 @@ class BaseAnalyzer(object): Attributes ---------- relevant_ids : dict + Mapping _vocab_size : int Size of vocabulary. id2contiguous : dict - + Mapping word_id -> number. log_every : int Interval for logging. _num_docs : int - - Examples - -------- - >>> from gensim.topic_coherence import text_analysis - >>> ids = {1: 'fake', 4: 'cats'} - >>> base = text_analysis.BaseAnalyzer(ids) - >>> # should return {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 - >>> print base.relevant_ids, base._vocab_size, base.id2contiguous, base.log_every, base._num_docs + Number of documents. """ def __init__(self, relevant_ids): + """ + + Parameters + ---------- + relevant_ids : dict + Mapping + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> ids = {1: 'fake', 4: 'cats'} + >>> base = text_analysis.BaseAnalyzer(ids) + >>> # should return {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 + >>> print base.relevant_ids, base._vocab_size, base.id2contiguous, base.log_every, base._num_docs + {1: 'fake', 4: 'cats'} 2 {1: 0, 4: 1} 1000 0 + + """ self.relevant_ids = relevant_ids self._vocab_size = len(self.relevant_ids) self.id2contiguous = {word_id: n for n, word_id in enumerate(self.relevant_ids)} @@ -151,21 +153,34 @@ class UsesDictionary(BaseAnalyzer): relevant_words : set Set of words that occurrences should be accumulated for. 
dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary based on text token2id : dict - token2id from :class:`~gensim.corpora.dictionary` - - Examples - -------- - >>> from gensim.topic_coherence import text_analysis - >>> from gensim.corpora.dictionary import Dictionary - >>> ids = {1: 'foo', 2: 'bar'} - >>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) - >>> usesdict = text_analysis.UsesDictionary(ids, dictionary) - >>> # should be set([u'foo', u'baz']) Dictionary(3 unique tokens: [u'baz', u'foo', u'bar']) {u'baz': 2, u'foo': 1, u'bar': 0} - >>> print usesdict.relevant_words, usesdict.dictionary, usesdict.token2id + Mapping from :class:`~gensim.corpora.dictionary.Dictionary` """ def __init__(self, relevant_ids, dictionary): + """ + + Parameters + ---------- + relevant_ids : dict + Mapping + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary based on text + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> from gensim.corpora.dictionary import Dictionary + >>> + >>> ids = {1: 'foo', 2: 'bar'} + >>> dictionary = Dictionary([['foo','bar','baz'], ['foo','bar','bar','baz']]) + >>> udict = text_analysis.UsesDictionary(ids, dictionary) + >>> + >>> print udict.relevant_words + set([u'foo', u'baz']) + + """ super(UsesDictionary, self).__init__(relevant_ids) self.relevant_words = _ids_to_words(self.relevant_ids, dictionary) self.dictionary = dictionary @@ -196,16 +211,26 @@ def get_co_occurrences(self, word1, word2): class InvertedIndexBased(BaseAnalyzer): """Analyzer that builds up an inverted index to accumulate stats. - Examples - -------- - >>> from gensim.topic_coherence import text_analysis - >>> ids = {1: 'fake', 4: 'cats'} - >>> ininb = text_analysis.InvertedIndexBased(ids) - >>> # should be [set([]) set([])] - >>> print ininb._inverted_index - """ def __init__(self, *args): + """ + + Parameters + ---------- + args : dict + Look at :class:`~gensim.topic_coherence.text_analysis.BaseAnalyzer` + + Examples + -------- + >>> from gensim.topic_coherence import text_analysis + >>> + >>> ids = {1: 'fake', 4: 'cats'} + >>> ininb = text_analysis.InvertedIndexBased(ids) + >>> + >>> print ininb._inverted_index + [set([]) set([])] + + """ super(InvertedIndexBased, self).__init__(*args) self._inverted_index = np.array([set() for _ in range(self._vocab_size)]) @@ -226,20 +251,7 @@ class CorpusAccumulator(InvertedIndexBased): """Gather word occurrence stats from a corpus by iterating over its BoW representation.""" def analyze_text(self, text, doc_num=None): - """Build an inverted index from a sequence of corpus texts. - - Examples - -------- - >> > from gensim.topic_coherence import text_analysis - >> > ids = {1: 'fake', 4: 'cats'} - >> > corac = text_analysis.CorpusAccumulator(ids) - >> > texts = [['human', 'interface', 'computer'], ['eps', 'user', 'interface', 'system']] - >> > corac.analyze_text(texts) - >> > print - corac._inverted_index - - #TODO: Doesn't work - """ + """Build an inverted index from a sequence of corpus texts.""" doc_words = frozenset(x[0] for x in text) top_ids_in_doc = self.relevant_ids.intersection(doc_words) for word_id in top_ids_in_doc: @@ -261,9 +273,20 @@ class WindowedTextsAnalyzer(UsesDictionary): Set of words. dictionary : :class:`~gensim.corpora.dictionary.Dictionary` Dictionary instance with mappings for the relevant_ids. 
+ """ def __init__(self, relevant_ids, dictionary): + """ + + Parameters + ---------- + relevant_ids : set of int + Relevant id + dictionary : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary instance with mappings for the relevant_ids. + + """ super(WindowedTextsAnalyzer, self).__init__(relevant_ids, dictionary) self._none_token = self._vocab_size # see _iter_texts for use of none token @@ -485,14 +508,12 @@ def queue_all_texts(self, q, texts, window_size): (batch_num + 1), (batch_num + 1) * self.batch_size, self._num_docs) def terminate_workers(self, input_q, output_q, workers, interrupted=False): - """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, - then terminate each. + """Wait until all workers have transmitted their WordOccurrenceAccumulator instances, then terminate each. - Notes - ----- + Warnings + -------- We do not use join here because it has been shown to have some issues - in Python 2.7 (and even in later versions). This method also closes both the input and output - queue. + in Python 2.7 (and even in later versions). This method also closes both the input and output queue. If `interrupted` is False (normal execution), a None value is placed on the input queue for each worker. The workers are looking for this sentinel value and interpret it as a signal to terminate themselves. If `interrupted` is True, a KeyboardInterrupt occurred. The workers are @@ -522,6 +543,7 @@ def merge_accumulators(self, accumulators): """Merge the list of accumulators into a single `WordOccurrenceAccumulator` with all occurrence and co-occurrence counts, and a `num_docs` that reflects the total observed by all the individual accumulators. + """ accumulator = WordOccurrenceAccumulator(self.relevant_ids, self.dictionary) for other_accumulator in accumulators: From 540021c44496f216e824f998e01a25a6e99336de Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 20 Dec 2017 13:12:56 +0500 Subject: [PATCH 36/39] add latex stuff for docs build --- .travis.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.travis.yml b/.travis.yml index f97bac263f..5bde814e7c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,5 +24,13 @@ matrix: - python: '3.6' env: TOXENV="py36-linux" +before_install: + - if [[ ${TOXENV} == *"docs"* ]]; then + apt-get -yq update; + apt-get -yq remove texlive-binaries --purge; + apt-get -yq --no-install-suggests --no-install-recommends --force-yes install \ + dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended \ + texlive-latex-extra texlive-fonts-recommended latexmk; + fi install: pip install tox script: tox -vv From 965587b24082668fbbb4a87f08aba80079a5a8ac Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 10 Jan 2018 20:40:43 +0500 Subject: [PATCH 37/39] doc fix[1] --- .../topic_coherence/direct_confirmation_measure.py | 10 +++++----- .../indirect_confirmation_measure.py | 14 +++++--------- gensim/topic_coherence/probability_estimation.py | 5 ++--- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/gensim/topic_coherence/direct_confirmation_measure.py b/gensim/topic_coherence/direct_confirmation_measure.py index d9d88dd32b..6482191d9c 100644 --- a/gensim/topic_coherence/direct_confirmation_measure.py +++ b/gensim/topic_coherence/direct_confirmation_measure.py @@ -27,10 +27,10 @@ def log_conditional_probability(segmented_topics, accumulator, with_std=False, w :func:`~gensim.topic_coherence.segmentation.s_one_one`. 
accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. - with_std : bool + with_std : bool, optional True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic. - with_support : bool + with_support : bool, optional True to also include support across topic segments. The support is defined as the number of pairwise similarity comparisons were used to compute the overall topic coherence. @@ -128,12 +128,12 @@ def log_ratio_measure(segmented_topics, accumulator, normalize=False, with_std=F :func:`~gensim.topic_coherence.segmentation.s_one_one`. accumulator : :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator from :mod:`gensim.topic_coherence.probability_estimation`. - normalize : bool + normalize : bool, optional Details in the "Notes" section. - with_std : bool + with_std : bool, optional True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic. - with_support : bool + with_support : bool, optional True to also include support across topic segments. The support is defined as the number of pairwise similarity comparisons were used to compute the overall topic coherence. diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index 7ed368bc4d..fdcbd1565f 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -52,10 +52,10 @@ def word2vec_similarity(segmented_topics, accumulator, with_std=False, with_supp accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Word occurrence accumulator. - with_std : bool + with_std : bool, optional True to also include standard deviation across topic segment sets in addition to the mean coherence for each topic. - with_support : bool + with_support : bool, optional True to also include support across topic segments. The support is defined as the number of pairwise similarity comparisons were used to compute the overall topic coherence. @@ -123,9 +123,9 @@ def cosine_similarity(segmented_topics, accumulator, topics, measure='nlr', Output from the segmentation module of the segmented topics. accumulator: :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` Output from the probability_estimation module. Is an topics: Topics obtained from the trained topic model. - measure : str + measure : str, optional Direct confirmation measure to be used. Supported values are "nlr" (normalized log ratio). - gamma: float + gamma: float, optional Gamma value for computing :math:`W'` and :math:`W^{*}` vectors. with_std : bool True to also include standard deviation across topic segment sets in addition to the mean coherence @@ -185,7 +185,7 @@ class ContextVectorComputer(object): ---------- measure: str Confirmation measure. - topics: list + topics: list of numpy.array Topics. accumulator : :class:`~gensim.topic_coherence.text_analysis.WordVectorsAccumulator` or :class:`~gensim.topic_coherence.text_analysis.InvertedIndexAccumulator` @@ -256,10 +256,6 @@ def compute_context_vector(self, segment_word_ids, topic_word_ids): If context vector has been cached, then return corresponding context vector, else compute, cache, and return. 
- Example - --------- - #TODO Need help with understanding parameters' types. - """ key = _key_for_segment(segment_word_ids, topic_word_ids) context_vector = self.context_vector_cache.get(key, None) diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index b9c7e17050..404310a36c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -48,7 +48,6 @@ def p_boolean_document(corpus, segmented_topics): ... ] >>> dictionary = HashDictionary(texts) >>> w2id = dictionary.token2id - >>> >>> # create segmented_topics >>> segmented_topics = [ @@ -152,9 +151,9 @@ def p_word2vec(texts, segmented_topics, dictionary, window_size=None, processes= Output from the segmentation of topics. Could be simply topics too. dictionary : :class:`~gensim.corpora.dictionary` Gensim dictionary mapping of the tokens and ids. - window_size : int + window_size : int, optional Size of the sliding window. - processes : int + processes : int, optional Number of processes to use. model : :class:`~gensim.models.word2vec.Word2Vec` or :class:`~gensim.models.keyedvectors.KeyedVectors`, optional If None, a new Word2Vec model is trained on the given text corpus. Otherwise, From f8f25cb110a267df698d348c05e27c47648cc4c0 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 10 Jan 2018 20:56:20 +0500 Subject: [PATCH 38/39] doc fix[2] --- gensim/topic_coherence/text_analysis.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/gensim/topic_coherence/text_analysis.py b/gensim/topic_coherence/text_analysis.py index e53633eefd..b759e0a13a 100644 --- a/gensim/topic_coherence/text_analysis.py +++ b/gensim/topic_coherence/text_analysis.py @@ -209,9 +209,8 @@ def get_co_occurrences(self, word1, word2): class InvertedIndexBased(BaseAnalyzer): - """Analyzer that builds up an inverted index to accumulate stats. + """Analyzer that builds up an inverted index to accumulate stats.""" - """ def __init__(self, *args): """ @@ -265,16 +264,7 @@ def accumulate(self, corpus): class WindowedTextsAnalyzer(UsesDictionary): - """Gather some stats about relevant terms of a corpus by iterating over windows of texts. - - Attributes - ---------- - relevant_words : set - Set of words. - dictionary : :class:`~gensim.corpora.dictionary.Dictionary` - Dictionary instance with mappings for the relevant_ids. - - """ + """Gather some stats about relevant terms of a corpus by iterating over windows of texts.""" def __init__(self, relevant_ids, dictionary): """ From f42ad8f6cdb68b6b23b6be16a1c0846a7b172796 Mon Sep 17 00:00:00 2001 From: ivan Date: Wed, 10 Jan 2018 20:57:49 +0500 Subject: [PATCH 39/39] remove apt install from travis (now doc build in circle) --- .travis.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6a5cedad9b..3cbccc0b0a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,13 +24,5 @@ matrix: - python: '3.6' env: TOXENV="py36-linux" -before_install: - - if [[ ${TOXENV} == *"docs"* ]]; then - apt-get -yq update; - apt-get -yq remove texlive-binaries --purge; - apt-get -yq --no-install-suggests --no-install-recommends --force-yes install \ - dvipng texlive-latex-base texlive-latex-extra texlive-latex-recommended \ - texlive-latex-extra texlive-fonts-recommended latexmk; - fi install: pip install tox script: tox -vv