From a15e5c37ecb4e483afcb0270b544e7b97d5e896e Mon Sep 17 00:00:00 2001 From: dsquareindia Date: Sun, 7 Aug 2016 19:41:59 +0530 Subject: [PATCH] Added ind_conf_m explanation, refactoring in prob_est, seg modules --- .../indirect_confirmation_measure.py | 7 ++++++- .../topic_coherence/probability_estimation.py | 17 +++++++++-------- gensim/topic_coherence/segmentation.py | 11 ++++------- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/gensim/topic_coherence/indirect_confirmation_measure.py b/gensim/topic_coherence/indirect_confirmation_measure.py index a9daa246c3..c68206a372 100644 --- a/gensim/topic_coherence/indirect_confirmation_measure.py +++ b/gensim/topic_coherence/indirect_confirmation_measure.py @@ -5,7 +5,12 @@ # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """ -This module contains functions to compute confirmation on a pair of words or word subsets. +This module contains functions to compute confirmation on a pair of words or word subsets. The advantage of an indirect +confirmation measure is that it computes the similarity of words in W' and W* with respect to direct confirmations to all words. +E.g., suppose x and z are both competing brands of cars, which semantically support each other. However, both brands are +seldom mentioned together in documents in the reference corpus. But their confirmations to other words like “road” +or “speed” do strongly correlate. This would be reflected by an indirect confirmation measure. Thus, indirect confirmation +measures may capture semantic support that direct measures would miss. 
The formula used to compute indirect confirmation measure is: diff --git a/gensim/topic_coherence/probability_estimation.py b/gensim/topic_coherence/probability_estimation.py index f66b1409d7..a76f40db4c 100644 --- a/gensim/topic_coherence/probability_estimation.py +++ b/gensim/topic_coherence/probability_estimation.py @@ -78,25 +78,26 @@ def p_boolean_sliding_window(texts, segmented_topics, dictionary, window_size): window_id[0] : Total no of windows """ top_ids = _ret_top_ids(segmented_topics) - window_id = [0] # Each window assigned a window id. + window_id = 0 # Each window assigned a window id. per_topic_postings = {} token2id_dict = dictionary.token2id - def add_topic_posting(): + def add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict): for word in window: word_id = token2id_dict[word] if word_id in top_ids: if word_id in per_topic_postings: - per_topic_postings[word_id].add(window_id[0]) + per_topic_postings[word_id].add(window_id) else: - per_topic_postings[word_id] = set([window_id[0]]) - window_id[0] += 1 + per_topic_postings[word_id] = set([window_id]) + window_id += 1 + return (window_id, per_topic_postings) # Apply boolean sliding window to each document in texts. 
for document in texts: it = iter(document) window = tuple(islice(it, window_size)) - add_topic_posting() + window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) for elem in it: window = window[1:] + (elem,) - add_topic_posting() + window_id, per_topic_postings = add_topic_posting(top_ids, window, per_topic_postings, window_id, token2id_dict) - return (per_topic_postings, window_id[0]) + return per_topic_postings, window_id diff --git a/gensim/topic_coherence/segmentation.py b/gensim/topic_coherence/segmentation.py index 5f571561df..8b8b1a858c 100644 --- a/gensim/topic_coherence/segmentation.py +++ b/gensim/topic_coherence/segmentation.py @@ -36,9 +36,8 @@ def s_one_pre(topics): for top_words in topics: s_one_pre_t = [] - for w_prime in top_words[1:]: - w_prime_index = int(np.where(top_words == w_prime)[0]) # To get index of w_prime in top_words - for w_star in top_words[:w_prime_index]: + for w_prime_index, w_prime in enumerate(top_words[1:]): + for w_star in top_words[:w_prime_index+1]: s_one_pre_t.append((w_prime, w_star)) s_one_pre.append(s_one_pre_t) @@ -67,10 +66,8 @@ def s_one_one(topics): for top_words in topics: s_one_one_t = [] - for w_prime in top_words: - w_prime_index = int(np.where(top_words == int(w_prime))[0]) # To get index of w_prime in top_words - for w_star in top_words: - w_star_index = int(np.where(top_words == int(w_star))[0]) + for w_prime_index, w_prime in enumerate(top_words): + for w_star_index, w_star in enumerate(top_words): if w_prime_index == w_star_index: continue else: