From fe9367c388df14200a537b93e58de697a3c68def Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Thu, 9 Jun 2016 19:09:02 +0530
Subject: [PATCH 01/38] DTM sample classes, helper methods

---
 gensim/models/ldaseqmodel.py              |   80 +
 gensim/test/test_data/sample_mean_DTM     | 2810 +++++++++++++++++++++
 gensim/test/test_data/sample_variance_DTM | 2810 +++++++++++++++++++++
 gensim/test/test_ldaseqmodel.py           |   36 +
 4 files changed, 5736 insertions(+)
 create mode 100644 gensim/models/ldaseqmodel.py
 create mode 100644 gensim/test/test_data/sample_mean_DTM
 create mode 100644 gensim/test/test_data/sample_variance_DTM
 create mode 100644 gensim/test/test_ldaseqmodel.py

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
new file mode 100644
index 0000000000..e14d40be2f
--- /dev/null
+++ b/gensim/models/ldaseqmodel.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
+# Copyright (C) 2016 Radim Rehurek
+
+
+"""
+This module helps with Dynamic Topic Modelling of a corpus.
+It is a work in progress and will change substantially over the course of development.
+Inspired by Blei's original DTM code and paper. TODO: add links
+
+As of now, the LdaSeqModel and sslm classes mimic the structures of the same name in Blei's DTM code.
+A few mathematical helper functions will be written and tested.
+"""
+
+import logging
+
+from gensim import interfaces, utils, matutils
+import numpy
+
+logger = logging.getLogger('gensim.models.ldaseqmodel')
+
+
+class LdaSeqModel(utils.SaveLoad):
+    def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None, num_terms=None, alphas=None, top_doc_phis=None,
+                 topic_chains=None, influence=None, influence_sum_lgl=None, renormalized_influence=None):
+        # store user-supplied parameters
+        self.id2word = id2word
+        if corpus is None and self.id2word is None:
+            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
+
+        if self.id2word is None:
+            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
+            self.id2word = utils.dict_from_corpus(corpus)
+            self.num_terms = len(self.id2word)
+        elif len(self.id2word) > 0:
+            self.num_terms = 1 + max(self.id2word.keys())
+        else:
+            self.num_terms = 0
+
+        if self.num_terms == 0:
+            raise ValueError("cannot compute LDA over an empty collection (no terms)")
+
+        self.num_topics = num_topics
+        self.num_sequence = num_sequence
+        self.alphas = alphas
+        self.topic_chains = topic_chains
+        self.top_doc_phis = top_doc_phis
+
+        # influence values, not used as of now
+        self.influence = influence
+        self.renormalized_influence = renormalized_influence
+        self.influence_sum_lgl = influence_sum_lgl
+
+
+class sslm(utils.SaveLoad):
+    def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=None, chain_variance=None, fwd_variance=None,
+                 mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None,
+                 mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
+        self.obs = obs
+        self.zeta = zeta
+        self.mean = mean
+        self.variance = variance
+
+
+def update_zeta(sslm):
+    # setting limits
+    num_terms = sslm.obs.shape[0]  # this is the vocabulary size (562 in our sample data)
+    num_sequence = sslm.obs.shape[1]  # this is the number of sequences (time slices)
+    # zero out zeta, then update it
+    sslm.zeta.fill(0)
+
+    for i in range(0, num_terms):
+        for j in range(0, num_sequence):
+            try:
+                m = sslm.mean[i][j + 1]
+                v = sslm.variance[i][j + 1]
+                val = numpy.exp(m + v/2)
+                sslm.zeta[j] = sslm.zeta[j] + val
+            except IndexError:
+                print i, j
+    return
\ No newline at end of file
diff --git a/gensim/test/test_data/sample_mean_DTM b/gensim/test/test_data/sample_mean_DTM
new file mode 100644
index 0000000000..60621cf99b
--- /dev/null
+++ b/gensim/test/test_data/sample_mean_DTM
@@ -0,0 +1,2810 @@
+0.905953
+0.906859
+0.89346
+0.890039
+0.888237
[... 2805 further lines omitted: the fixture is a single flat column of 2810 floats, the posterior means for 562 vocabulary terms across 5 time-slice columns ...]
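The fixture above is one flat column: 2810 floats = 562 vocabulary terms x 5 columns (an initial slice plus 4 time slices), which is exactly how the test file at the end of this patch splits it. A minimal sketch of recovering that layout; the relative path here is an assumption, the test itself resolves the file via its own datapath helper:

    import numpy

    # Flat column of 2810 floats -> 562 terms x 5 columns.
    # Column 0 is the initial slice; update_zeta only reads columns 1..4.
    mean = numpy.loadtxt('gensim/test/test_data/sample_mean_DTM')
    mean = mean.reshape(562, 5)
    print(mean.shape)  # (562, 5)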
diff --git a/gensim/test/test_data/sample_variance_DTM b/gensim/test/test_data/sample_variance_DTM
new file mode 100644
index 0000000000..0760af316c
--- /dev/null
+++ b/gensim/test/test_data/sample_variance_DTM
@@ -0,0 +1,2810 @@
+0.130797
+0.126054
+0.123787
+0.123906
+0.126415
[... 2805 further lines omitted: the same five posterior variance values repeat for each of the 562 terms ...]
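Together, these two fixtures are precisely the mean and variance inputs that update_zeta consumes. A vectorized sketch of the same computation as the patch's double loop (file paths again assumed relative to the repository root, layout as described above):

    import numpy

    mean = numpy.loadtxt('gensim/test/test_data/sample_mean_DTM').reshape(562, 5)
    variance = numpy.loadtxt('gensim/test/test_data/sample_variance_DTM').reshape(562, 5)

    # zeta[j] = sum over all terms i of exp(mean[i][j + 1] + variance[i][j + 1] / 2);
    # column 0 holds the initial slice, so only columns 1..4 contribute.
    zeta = numpy.exp(mean[:, 1:] + variance[:, 1:] / 2).sum(axis=0)
    print(zeta)  # should match the expected_zeta values asserted in the test below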
diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py
new file mode 100644
index 0000000000..adcccbcc14
--- /dev/null
+++ b/gensim/test/test_ldaseqmodel.py
@@ -0,0 +1,36 @@
+"""
+
+Tests to check helper DTM methods.
+
+"""
+
+import numpy  # for arrays, array broadcasting etc.
+from gensim.models import ldaseqmodel
+import os.path
+import unittest
+import logging
+
+
+module_path = os.path.dirname(__file__)  # needed because sample data files are located in the same folder
+datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
+
+
+
+class TestSSLM(unittest.TestCase):
+    def setUp(self):
+        self.obs = numpy.resize(numpy.zeros(562 * 4), (562, 4))
+        mean = numpy.loadtxt(datapath('sample_mean_DTM'))
+        variance= numpy.loadtxt(datapath('sample_variance_DTM'))
+        self.mean = numpy.split(mean, 562)
+        self.variance = numpy.split(variance, 562)
+        self.zeta = numpy.zeros(4)
+
+    def testUpdateZeta(self):
+        ldaseqmodel.update_zeta(self)
+        expected_zeta = numpy.array([ 286.24901747, 285.9899686 , 286.03548494, 286.63929586])
+        actual_zeta = self.zeta
+        self.assertAlmostEqual(expected_zeta[0], actual_zeta[0], places=2)
+
+if __name__ == '__main__':
+    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
+    unittest.main()

From 2a236394b35aa95196fee4750cb25d98b97c434c Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Thu, 9 Jun 2016 20:04:31 +0530
Subject: [PATCH 02/38] Formatting

---
 gensim/models/ldaseqmodel.py | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index e14d40be2f..09ba8affc9 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -67,14 +67,10 @@ def update_zeta(sslm):
     num_sequence = sslm.obs.shape[1] # this is the number of sequences
     # making zero and updating
     sslm.zeta.fill(0)
-
     for i in range(0, num_terms):
         for j in range(0, num_sequence):
-            try:
-                m = sslm.mean[i][j + 1]
-                v = sslm.variance[i][j + 1]
-                val = numpy.exp(m + v/2)
-                sslm.zeta[j] = sslm.zeta[j] + val
-            except IndexError:
-                print i, j
-    return
\ No newline at end of file
+            m = sslm.mean[i][j + 1]
+            v = sslm.variance[i][j + 1]
+            val = numpy.exp(m + v/2)
+            sslm.zeta[j] = sslm.zeta[j] + val
+    return
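After PATCH 02, update_zeta computes, for each time slice t, zeta_t = sum over all words w of exp(mean[w][t+1] + variance[w][t+1]/2), the normalizer for the expected log probabilities under the variational Gaussians. A vectorized sketch of the same computation, assuming the per-word mean and variance rows are stacked into 2-D numpy arrays of shape (num_terms, num_sequence + 1) (the helper name is ours, not part of the patch):

    import numpy

    def update_zeta_vectorized(mean, variance):
        # column 0 holds the initial state, so only slices 1..T are used;
        # zeta[t] = sum over w of exp(mean[w, t + 1] + variance[w, t + 1] / 2)
        return numpy.exp(mean[:, 1:] + variance[:, 1:] / 2).sum(axis=0)

With the sample_mean_DTM / sample_variance_DTM fixtures this should reproduce the expected_zeta values (~286) asserted in testUpdateZeta above.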
From 2574a4b8601cbb37c238df44002e9d6c0b20e2d6 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Fri, 10 Jun 2016 08:10:31 +0530
Subject: [PATCH 03/38] sslm_init

---
 gensim/models/ldaseqmodel.py | 44 ++++++++++++++++++++++++++++++----
 1 file changed, 40 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 09ba8affc9..287161f86e 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -56,9 +56,9 @@ def __init__(num_terms=None, num_sequence=None, obs=None, obs_variance=None, cha
             mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
 
         self.obs = obs
-        self.zeta = zeta
-        self.mean = mean
-        self.variance = variance
+        self.zeta = zeta # array equal to number of sequences
+        self.mean = mean # matrix of dimensions num_terms * (num_of sequences + 1)
+        self.variance = variance # matrix of dimensions num_terms * (num_of sequences + 1)
 
 
 def update_zeta(sslm):
@@ -72,5 +72,41 @@ def update_zeta(sslm):
             m = sslm.mean[i][j + 1]
             v = sslm.variance[i][j + 1]
             val = numpy.exp(m + v/2)
-            sslm.zeta[j] = sslm.zeta[j] + val
+            sslm.zeta[j] = sslm.zeta[j] + val
     return
+
+def compute_post_variance(sslm):
+    return
+
+
+def sslm_counts_init(sslm, lda, obs_variance, chain_variance):
+
+    W = sslm.num_terms
+    T = sslm.num_sequence
+
+    log_norm_counts = lda.state.sstats
+    log_norm_counts = log_norm_counts / sum(log_norm_counts)
+
+    log_norm_counts = log_norm_counts + 1.0/W
+    log_norm_counts = log_norm_counts / sum(log_norm_counts)
+    log_norm_counts = numpy.log(log_norm_counts)
+
+    # setting variational observations to transformed counts
+    for t in range(0, T):
+        sslm.obs[t] = log_norm_counts
+
+    # set variational parameters
+
+    sslm.obs_variance = obs_variance
+    sslm.chain_variance = chain_variance
+
+    # compute post variance
+    for w in range(0, W):
+        compute_post_variance(w, sslm, sslm.chain_variance)
+
+    for w in range(0, W):
+        compute_post_mean(w, sslm, sslm.chain_variance)
+
+    update_zeta(sslm)
+    compute_expected_log_prob(sslm)
+
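sslm_counts_init feeds smoothed log-probabilities, not raw counts, into the variational observations: normalize the sufficient statistics, add a 1/W smoothing mass so no word has zero probability, renormalize, then move to log space. A self-contained sketch of that transform for a single topic (variable and function names are ours):

    import numpy

    def log_norm_from_counts(sstats):
        # sstats: word counts for one topic, length W
        W = len(sstats)
        probs = sstats / numpy.sum(sstats)   # counts -> probabilities
        probs = probs + 1.0 / W              # smooth away zeros
        probs = probs / numpy.sum(probs)     # renormalize after smoothing
        return numpy.log(probs)              # log-space observations

    # log_norm_from_counts(numpy.array([2.0, 1.0, 1.0])) gives roughly [-0.88, -1.23, -1.23]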
From 3255551d952e7e352895918eb3a55a4d636fb073 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 15 Jun 2016 20:36:18 +0530
Subject: [PATCH 04/38] Finished init_lda_from_ss

---
 gensim/models/ldaseqmodel.py | 99 ++++++++++++++++++++++++++++++----
 1 file changed, 89 insertions(+), 10 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 287161f86e..eb60945d01 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -37,12 +37,15 @@ def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None,
             self.num_terms = 0
 
         if self.num_terms == 0:
-            raise ValueError("cannot compute LDA over an empty collection (no terms)")
+            raise ValueError("cannot compute DTM over an empty collection (no terms)")
 
         self.num_topics = num_topics
         self.num_sequence = num_sequence
         self.alphas = alphas
-        self.topic_chains = topic_chains
+        self.topic_chains = []
+        for topic in range(0, num_topics):
+            self.topic_chains.append(sslm)
+
         self.top_doc_phis = top_doc_phis
 
         # influence values as of now not using
@@ -51,7 +54,7 @@ class sslm(utils.SaveLoad):
-    def __init__(num_terms=None, num_sequence=None, obs=None, obs_variance=None, chain_variance=None, fwd_variance=None,
+    def __init__(num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
             mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None,
             mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
@@ -75,28 +78,94 @@ def update_zeta(sslm):
             sslm.zeta[j] = sslm.zeta[j] + val
     return
 
-def compute_post_variance(sslm):
-    return
+def compute_post_variance(word, sslm, chain_variance):
+    T = sslm.num_sequence
+    variance = sslm.variance[word] # pick wordth row
+    fwd_variance = sslm.fwd_variance[word] # pick wordth row
+
+    # forward pass. Set initial variance very high
+    fwd_variance[0] = chain_variance * 1000
+
+    for t in range(1, T + 1):
+        if sslm.obs_variance:
+            w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance)
+        else:
+            w = 0
+        fwd_variance[t] = w * (fwd_variance[t-1] + chain_variance)
+
+    # backward pass
+    variance[T] = fwd_variance[T]
+    for t in range(T - 1, -1, -1):
+        if fwd_variance[t] > 0.0:
+            w = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2)
+        else:
+            w = 0
+        variance[t] = (w * (variance[t + 1] - chain_variance)) + ((1 - w) * fwd_variance[t])
+
+    sslm.variance[word] = variance
+    sslm.fwd_variance[word] = fwd_variance
+    return
+
+
+def compute_post_mean(word, sslm, chain_variance):
+
+    T = sslm.num_sequence
+    obs = sslm.obs[word] # wordth row
+    mean = sslm.mean[word]
+    fwd_mean = sslm.fwd_mean[word]
+    fwd_variance = sslm.fwd_variance[word]
+
+    # forward
+    fwd_mean[0] = 0
+    for t in range(1, T + 1):
+        # assert(fabs(vget(&fwd_variance, t-1) +
+        # chain_variance + var->obs_variance) > 0.0);
+        w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance)
+        fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1]
+        if fwd_mean[t] is None:
+            print"log message"
+
+    # backward pass
+    mean[T] = fwd_mean[T]
+    for t in range(T - 1, -1, -1):
+        if chain_variance == 0.0:
+            w = 0.0
+        else:
+            w = chain_variance / (fwd_variance[t] + chain_variance)
+        mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1]
+        if mean[t] is None:
+            print "log message"
+    sslm.mean[word] = mean
+    sslm.fwd_mean[word] = fwd_mean
+    return
+
+def compute_expected_log_prob(sslm):
+
+    W = sslm.num_terms
+    T = sslm.num_sequence
+    for t in range(0, T):
+        for w in range(0, W):
+            sslm.e_log_prob[w][t] = sslm.mean[w][t + 1] - numpy.log(sslm.zeta[t])
+    return
+
+
-def sslm_counts_init(sslm, lda, obs_variance, chain_variance):
+def sslm_counts_init(sslm, obs_variance, chain_variance, sstats):
 
     W = sslm.num_terms
     T = sslm.num_sequence
 
-    log_norm_counts = lda.state.sstats
+    log_norm_counts = sstats
     log_norm_counts = log_norm_counts / sum(log_norm_counts)
 
     log_norm_counts = log_norm_counts + 1.0/W
     log_norm_counts = log_norm_counts / sum(log_norm_counts)
     log_norm_counts = numpy.log(log_norm_counts)
-
+
     # setting variational observations to transformed counts
     for t in range(0, T):
-        sslm.obs[t] = log_norm_counts
+        sslm.obs[:,t] = log_norm_counts
 
     # set variational parameters
-
     sslm.obs_variance = obs_variance
     sslm.chain_variance = chain_variance
@@ -110,3 +179,13 @@ def sslm_counts_init(sslm, lda, obs_variance, chain_variance):
 
     update_zeta(sslm)
     compute_expected_log_prob(sslm)
+
+def init_ldaseq_ss(ldaseq, lda, alpha, topic_chain_variance, topic_obs_variance):
+    ldaseq.alpha = alpha
+    for k in range(0, ldaseq.num_topics):
+        sstats = lda.state.sstats[k]
+        sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats)
+
+        # don't need to initialize here, but writing for reference
+        ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
+        ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
+        ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
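compute_post_variance and compute_post_mean above are the forward (filtering) and backward (smoothing) recursions of the variational Kalman smoother from Blei's DTM code. As a concrete check, here is the variance recursion for a single word chain, using the same chain_variance = 0.005 and obs_variance = 0.5 that the patch sets as defaults, with T = 4 time slices plus slot 0 for the diffuse initial state (a standalone sketch, toy scaffolding; variable names ours):

    import numpy

    T, chain_var, obs_var = 4, 0.005, 0.5

    fwd_var = numpy.zeros(T + 1)
    fwd_var[0] = chain_var * 1000      # diffuse initial variance, as in the patch
    for t in range(1, T + 1):
        c = obs_var / (fwd_var[t - 1] + chain_var + obs_var)
        fwd_var[t] = c * (fwd_var[t - 1] + chain_var)

    var = numpy.zeros(T + 1)
    var[T] = fwd_var[T]                # backward (smoothing) pass
    for t in range(T - 1, -1, -1):
        c = (fwd_var[t] / (fwd_var[t] + chain_var)) ** 2
        var[t] = c * (var[t + 1] - chain_var) + (1 - c) * fwd_var[t]

    # fwd_var is approximately [5, 0.454587, 0.239471, 0.164191, 0.126415]
    # var     is approximately [0.130797, 0.126054, 0.123787, 0.123906, 0.126415]

These values match the before_fwd_variance1 and before_variance fixtures added later in this series.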
From f258c1236c25129a89bc0a369ff934cb5e202d66 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 15 Jun 2016 20:52:12 +0530
Subject: [PATCH 05/38] Fixed failing test

---
 gensim/models/ldaseqmodel.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index eb60945d01..2388d10fac 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -123,7 +123,8 @@ def compute_post_mean(word, sslm, chain_variance):
         w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance)
         fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1]
         if fwd_mean[t] is None:
-            print"log message"
+            # error message
+            pass
 
     # backward pass
     mean[T] = fwd_mean[T]
@@ -134,7 +135,9 @@ def compute_post_mean(word, sslm, chain_variance):
             w = chain_variance / (fwd_variance[t] + chain_variance)
         mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1]
         if mean[t] is None:
-            print "log message"
+            # error message
+            pass
+
     sslm.mean[word] = mean
     sslm.fwd_mean[word] = fwd_mean
     return
From 3b2643f0077998adafb6e4be4c3e0ca29d6ed4c6 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 22 Jun 2016 10:16:14 +0530
Subject: [PATCH 06/38] Added new classes and methods

---
 gensim/models/ldaseqmodel.py | 161 +++++++++++++++++++++++++++++++++++
 1 file changed, 161 insertions(+)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 2388d10fac..2dbd079112 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -17,8 +17,17 @@
 """
 
 from gensim import interfaces, utils, matutils
+from gensim.models import ldamodel
 import numpy
 
+class seq_corpus(utils.SaveLoad):
+    def __init__(self, num_terms=0, max_nterms=0, length=0, num_docs=0, corpuses=0):
+        self.num_terms = num_terms
+        self.max_nterms = max_nterms
+        self.length = length
+        self.num_docs = num_docs
+        self.corpuses = corpuses
+
 class LdaSeqModel(utils.SaveLoad):
     def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None, num_terms=None, alphas=None, top_doc_phis=None,
         topic_chains=None, influence=None, influence_sum_lgl=None, renormalized_influence=None):
@@ -192,3 +201,155 @@ def init_ldaseq_ss(ldaseq, lda, alpha, topic_chain_variance, topic_obs_variance)
         ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
         ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
         ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence))
+
+def fit_lda_seq(ldaseq, seq_corpus):
+    K = ldaseq.num_topics
+    W = ldaseq.num_terms
+    data_len = seq_corpus.length
+    no_docs = seq_corpus.num_docs
+
+    # heldout_gammas = NULL
+    # heldout_llhood = NULL
+
+    bound = 0
+    heldout_bound = 0
+    ldasqe_em_threshold = 1e-4
+    convergence = ldasqe_em_threshold + 1
+
+    # make directory
+    em_log = open("em_log.dat", "w")
+    gammas_file = open("gammas.dat", "w")
+    lhoods_file = open("lhoods.dat", "w")
+
+    iter_ = 0
+    final_iters_flag = 0
+    last_iter = 0
+
+    # this is a flag/input do something about it
+    lda_seq_min_iter = 0
+    lda_seq_max_iter = 0
+
+    while iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter):
+        if (!(iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter))):
+            last_iter = 1
+
+        # log
+        print " EM iter " , iter_
+        print "E Step"
+
+        # writing to file
+        em_log.write(str(bound) + "\t" + str(convergence))
+        old_bound = bound
+
+        # initiate sufficient statistics
+        topic_suffstats = [None] * K
+        for k in range(0, K):
+            topic_suffstats[k] = numpy.resize(numpy.zeros(W * data_len), (W, data_len))
+
+        # set up variables
+        gammas = numpy.resize(numpy.zeros(no_docs * K), (no_docs, K))
+        lhoods = numpy.resize(numpy.zeros(no_docs * K + 1), (no_docs, K + 1))
+
+        bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter)
+
+        # figure out how to write to file here
+        # TODO save to file for command line
+        gammas_file.write(gammas)
+        lhoods_file.write(lhoods)
+
+        print "M Step"
+
+        topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats)
+        bound += topic_bound
+
+        write_lda_seq(ldaseq)
+
+        if ((bound - old_bound) < 0):
+            if (LDA_INFERENCE_MAX_ITER == 1):
+                LDA_INFERENCE_MAX_ITER = 2
+            if (LDA_INFERENCE_MAX_ITER == 2):
+                LDA_INFERENCE_MAX_ITER = 5
+            if (LDA_INFERENCE_MAX_ITER == 5):
+                LDA_INFERENCE_MAX_ITER = 10
+            if (LDA_INFERENCE_MAX_ITER == 10):
+                LDA_INFERENCE_MAX_ITER = 20
+            print "Bound went down, increasing it to" , LDA_INFERENCE_MAX_ITER
+
+        # check for convergence
+        convergence = numpy.fabs((bound - old_bound) / old_bound)
+
+        if convergence < ldasqe_em_threshold:
+            final_iters_flag = 1
+            LDA_INFERENCE_MAX_ITER = 500
+            print "Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER
+            convergence = 1.0
+
+        print "%d lda seq bound is = %d, convergence is %d", iter_, bound, convergence
+
+        iter_ += 1
+
+    return bound
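The EM driver above follows the usual variational bookkeeping: it stops once the relative change of the bound falls below ldasqe_em_threshold, and reacts to a decreasing bound by raising LDA_INFERENCE_MAX_ITER. A toy illustration of the convergence test (the numbers are made up):

    import numpy

    old_bound, bound = -105000.0, -104950.0
    convergence = numpy.fabs((bound - old_bound) / old_bound)
    # convergence is roughly 4.76e-4 > 1e-4, so EM would run another iteration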
+
+
+def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter):
+    K = ldaseq.num_topics
+    W = ldaseq.num_terms
+    bound = 0.0
+
+# typedef struct lda {
+#     int ntopics; // number of topics
+#     int nterms; // vocabulary size
+#     gsl_matrix* topics; // each column is a topic (V X K)
+#     gsl_vector* alpha; // dirichlet parameters
+# } lda;
+
+# // lda posterior
+
+# typedef struct lda_post {
+#     doc_t* doc; // document associated to this posterior
+#     lda* model; // lda model
+#     gsl_matrix* phi; // variational mult parameters (nterms x K)
+#     gsl_matrix* log_phi; // convenient for computation (nterms x K)
+#     gsl_vector* gamma; // variational dirichlet parameters (K)
+#     gsl_vector* lhood; // a K+1 vector, sums to the lhood bound
+#     gsl_vector* doc_weight; // Not owned by this structure.
+#     gsl_vector* renormalized_doc_weight; // Not owned by this structure.
+# } lda_post;
+
+    lda = ldamodel.LdaModel(num_topics=K)
+    lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K))
+    lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K))
+    lda_post.model = lda
+
+    model = "DTM"
+    if model == "DTM":
+        inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound)
+    elif model == "DIM":
+        InfluenceTotalFixed(ldaseq, seq_corpus)
+        inferDIMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound)
+
+    return bound
+
+def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound):
+    return
+
+def fit_lda_seq_topics(ldaseq, topic_suffstats):
+    lhood = 0
+    lhood_term = 0
+    K = ldaseq.num_topics
+
+    for k in range(0, K):
+        print "Fitting topic number" , k
+        lhood_term = fit_sslm(ldaseq.topic_chains[k], topic_suffstats[k])
+        lhood += lhood_term
+
+    return lhood
+
+def fit_sslm(sslm, counts):
+    W = sslm.num_terms
+    bound = 0
+    old_bound = 0
+    sslm_fit_threshold = 1e-6
+    converged = sslm_fit_threshold + 1
+
\ No newline at end of file

From 0cd25e3ebd17a5a41eccf05708c9e8e7519a16e7 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 22 Jun 2016 10:24:54 +0530
Subject: [PATCH 07/38] Fixed failing test

---
 gensim/models/ldaseqmodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 2dbd079112..1c1b3bbfb9 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -230,7 +230,7 @@ def fit_lda_seq(ldaseq, seq_corpus):
 
     while iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter):
-        if (!(iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter))):
+        if not (iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter)):
             last_iter = 1
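A side note on the print statements accumulating in these patches (PATCH 08 below wraps them in parentheses): a call like print ("%d lda seq bound is = %d, convergence is %d", iter_, bound, convergence) prints the tuple itself on Python 3 rather than formatting it. Routing the messages through the logging module, which gensim modules and the test file already use, would sidestep this; a possible pattern (an illustration, not what the patches do):

    import logging

    logger = logging.getLogger('gensim.models.ldaseqmodel')

    # e.g. inside fit_lda_seq:
    # logger.info("EM iter %i: bound %f, convergence %f", iter_, bound, convergence)
    # and inside fit_lda_seq_topics:
    # logger.info("Fitting topic number %i", k)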
From 18ae9adf3eb298bfd2764d3059310140639f3aa6 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Tue, 28 Jun 2016 20:52:00 +0530
Subject: [PATCH 08/38] Update with new methods, tests

---
 gensim/models/ldaseqmodel.py    | 208 +++++++++++++++++++++++++++-----
 gensim/test/test_ldaseqmodel.py |  90 ++++++++++++--
 2 files changed, 258 insertions(+), 40 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 1c1b3bbfb9..cf86cf8df9 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -26,6 +26,8 @@ def __init__(self, num_terms=0, max_nterms=0, length=0, num_docs=0, corpuses=0):
         self.max_nterms = max_nterms
         self.length = length
         self.num_docs = num_docs
+
+        # list of corpus class objects
         self.corpuses = corpuses
 
 class LdaSeqModel(utils.SaveLoad):
@@ -63,7 +65,7 @@ def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None,
         self.influence_sum_lgl = influence_sum_lgl
 
 class sslm(utils.SaveLoad):
-    def __init__(num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
+    def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
             mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None,
             mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
@@ -71,7 +73,29 @@ def __init__(num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chai
         self.obs = obs
         self.zeta = zeta # array equal to number of sequences
         self.mean = mean # matrix of dimensions num_terms * (num_of sequences + 1)
         self.variance = variance # matrix of dimensions num_terms * (num_of sequences + 1)
+        self.num_terms = num_terms
+        self.num_sequence = num_sequence
+        self.obs_variance = obs_variance
+        self.chain_variance = chain_variance
+        self.fwd_variance = fwd_variance
+        self.fwd_mean = fwd_mean
+        self.e_log_prob = e_log_prob
+        self.m_update_coeff = m_update_coeff
+        self.mean_t = mean_t
+        self.variance_t = variance_t
+        self.influence_sum_lgl = influence_sum_lgl
+        self.w_phi_l = w_phi_l
+        self.w_phi_sum = w_phi_sum
+        self.w_phi_l_sq = w_phi_l_sq
+        self.m_update_coeff_g = m_update_coeff_g
+
+class lda_post(utils.SaveLoad):
+    def __init__(self, doc=None, lda=None, phi=None, log_phi=None, gamma=None, lhood=None, doc_weight=None, renormalized_doc_weight=None):
+        return
+
+class opt_params(utils.SaveLoad):
+    def __init__(self, sslm, word_counts, totals, mean_deriv_mtx, word):
+        return
 
 def update_zeta(sslm):
@@ -234,8 +258,8 @@ def fit_lda_seq(ldaseq, seq_corpus):
             last_iter = 1
 
         # log
-        print " EM iter " , iter_
-        print "E Step"
+        print (" EM iter " , iter_)
+        print ("E Step")
 
         # writing to file
         em_log.write(str(bound) + "\t" + str(convergence))
         old_bound = bound
@@ -257,7 +281,7 @@ def fit_lda_seq(ldaseq, seq_corpus):
         gammas_file.write(gammas)
         lhoods_file.write(lhoods)
 
-        print "M Step"
+        print ("M Step")
 
         topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats)
         bound += topic_bound
@@ -273,7 +297,7 @@ def fit_lda_seq(ldaseq, seq_corpus):
             LDA_INFERENCE_MAX_ITER = 10
         if (LDA_INFERENCE_MAX_ITER == 10):
             LDA_INFERENCE_MAX_ITER = 20
-        print "Bound went down, increasing it to" , LDA_INFERENCE_MAX_ITER
+        print ("Bound went down, increasing it to" , LDA_INFERENCE_MAX_ITER)
 
         # check for convergence
         convergence = numpy.fabs((bound - old_bound) / old_bound)
@@ -281,10 +305,10 @@ def fit_lda_seq(ldaseq, seq_corpus):
         if convergence < ldasqe_em_threshold:
             final_iters_flag = 1
             LDA_INFERENCE_MAX_ITER = 500
-            print "Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER
+            print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER)
             convergence = 1.0
 
-        print "%d lda seq bound is = %d, convergence is %d", iter_, bound, convergence
+        print ("%d lda seq bound is = %d, convergence is %d", iter_, bound, convergence)
 
         iter_ += 1
 
@@ -292,32 +316,12 @@ def fit_lda_seq(ldaseq, seq_corpus):
 
 
 def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter):
+
     K = ldaseq.num_topics
     W = ldaseq.num_terms
     bound = 0.0
-
-# typedef struct lda {
-#     int ntopics; // number of topics
-#     int nterms; // vocabulary size
-#     gsl_matrix* topics; // each column is a topic (V X K)
-#     gsl_vector* alpha; // dirichlet parameters
-# } lda;
-
-# // lda posterior
-
-# typedef struct lda_post {
-#     doc_t* doc; // document associated to this posterior
-#     lda* model; // lda model
-#     gsl_matrix* phi; // variational mult parameters (nterms x K)
-#     gsl_matrix* log_phi; // convenient for computation (nterms x K)
-#     gsl_vector* gamma; // variational dirichlet parameters (K)
-#     gsl_vector* lhood; // a K+1 vector, sums to the lhood bound
-#     gsl_vector* doc_weight; // Not owned by this structure.
-#     gsl_vector* renormalized_doc_weight; // Not owned by this structure.
-# } lda_post;
 
     lda = ldamodel.LdaModel(num_topics=K)
-
     lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K))
     lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K))
     lda_post.model = lda
@@ -332,24 +336,170 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la
     return bound
 
 def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound):
+
+    doc_index = 0
+    for t in range(0, seq_corpus.length):
+        make_lda_seq_slice(lda, ldaseq, t)
+        # what to do here
+        ndocs = seq_corpus.corpuses[t].ndocs
+        for d in range(0, ndocs):
+            gam = gammas[doc_index]
+            lhood = lhoods[doc_index]
+            lda_post.gamma = gam
+            lda_post.lhood = lhood
+            lda_post.doc = seq_corpus.corpuses[t].doc[d]
+            if iter_ == 0:
+                doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None)
+            else:
+                doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None)
+            if topic_suffstats is not None:
+                update_lda_seq_ss(t, seq_corpus.corpuses[t].doc[d], lda_post, topic_suffstats)
+            bound += doc_lhood
+            doc_index += 1
+    return
+
+def fit_lda_post():
+    return
+
+def make_lda_seq_slice():
+    return
+
+def update_lda_seq_ss():
+    return
+
 
 def fit_lda_seq_topics(ldaseq, topic_suffstats):
     lhood = 0
     lhood_term = 0
     K = ldaseq.num_topics
 
     for k in range(0, K):
-        print "Fitting topic number" , k
+        print ("Fitting topic number" , k)
         lhood_term = fit_sslm(ldaseq.topic_chains[k], topic_suffstats[k])
         lhood += lhood_term
 
     return lhood
 
 def fit_sslm(sslm, counts):
+
     W = sslm.num_terms
     bound = 0
     old_bound = 0
     sslm_fit_threshold = 1e-6
+    sslm_max_iter = 2
     converged = sslm_fit_threshold + 1
+
+    totals = numpy.zeros(counts.shape[1])
+
+    for w in range(0, W):
+        compute_post_variance(w, sslm, sslm.chain_variance)
+
+    totals = col_sum(counts, totals)
+
+    iter_ = 0
+
+    model = "DTM"
+    if model == "DTM":
+        bound = compute_bound(counts, totals, sslm)
+    if model == "DIM":
+        bound = compute_bound_fixed(counts, totals, sslm)
+
+    print ("initial sslm bound is " , bound)
+
+    while converged > sslm_fit_threshold and iter_ < sslm_max_iter:
+        iter_ += 1
+        old_bound = bound
+        update_obs(counts, totals, sslm)
+
+        if model == "DTM":
+            bound = compute_bound(counts, totals, sslm)
+        if model == "DIM":
+            bound = compute_bound_fixed(counts, totals, sslm)
+
+        converged = numpy.fabs((bound - old_bound) / old_bound)
+
+        print ("%d lda seq bound is = %d, convergence is %d", iter_, bound, converged)
+
+    compute_expected_log_prob(sslm)
+
+    return bound
+
+
+def col_sum(matrix, vector):
+    for i in range(0, matrix.shape[0]):
+        for j in range(0, matrix.shape[1]):
+            vector[j] = vector[j] + matrix[i][j]
+
+    return vector
+
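col_sum accumulates per-time-slice totals: for each column j of the counts matrix (one column per time slice), the sum over all words. In numpy this is a one-line axis sum; a sketch for comparison (not part of the patch):

    import numpy

    counts = numpy.array([[1.0, 2.0],
                          [3.0, 4.0],
                          [5.0, 6.0]])   # shape (num_terms, num_sequence)
    totals = counts.sum(axis=0)          # array([ 9., 12.]), one total per slice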
+def compute_bound(word_counts, totals, sslm):
+
+    W = sslm.num_terms
+    T = sslm.num_sequence
+
+    term_1 = 0
+    term_2 = 0
+    term_3 = 0
+
+    val = 0
+    ent = 0
+
+    chain_variance = sslm.chain_variance
+
+    for w in range(0, W):
+        compute_post_mean(w, sslm, chain_variance)
+
+    update_zeta(sslm)
+
+    for w in range(0, W):
+        val += (sslm.variance[w][0] - sslm.variance[w][T]) / 2 * chain_variance
+
+    print ("Computing bound, all times")
+
+    for t in range(1, T + 1):
+        for w in range(0, W):
+
+            m = sslm.mean[w][t]
+            prev_m = sslm.mean[w][t - 1]
+
+            v = sslm.variance[w][t]
+
+            # Values specifically related to document influence:
+            # Note that our indices are off by 1 here.
+
+            w_phi_l = sslm.w_phi_l[w][t - 1]
+            exp_i = numpy.exp(numpy.negative(prev_m))
+
+            term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance)
+
+            term_2 += word_counts[w][t - 1] * m
+            ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc)
+
+        term_3 += totals[t - 1] * numpy.log(sslm.zeta[t - 1])
+        val += numpy.negative(term_1) + term_2 + term_3 + ent
+
+    return val
+
+# function to perform optimization
+# def update_obs(counts, totals, sslm):
+
+#     W = sslm.num_terms
+#     T = sslm.num_sequence
+
+#     runs = 0
+
+#     params = opt_params(var=sslm, totals=totals)
+#     mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1))
+
+#     # for w in range(0, W):
+
+
+
+
\ No newline at end of file
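For a single word and time slice the bound accumulated above is plain arithmetic. A toy check of the Gaussian chain term term_1, with mean and variance values taken from the sample test data and w_phi_l = 0 (as in a chain freshly zero-initialized by init_ldaseq_ss):

    import numpy

    m, prev_m, v = -1.40924, -1.40784, 0.126054
    chain_variance = 0.005

    term_1 = (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance)
    # = 0.0014**2 / 0.01 - 25.2108 + 5.2983, roughly -19.91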
diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py
index adcccbcc14..d13bad396f 100644
--- a/gensim/test/test_ldaseqmodel.py
+++ b/gensim/test/test_ldaseqmodel.py
@@ -15,22 +15,90 @@
 datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
 
 
-
 class TestSSLM(unittest.TestCase):
-    def setUp(self):
-        self.obs = numpy.resize(numpy.zeros(562 * 4), (562, 4))
-        mean = numpy.loadtxt(datapath('sample_mean_DTM'))
-        variance= numpy.loadtxt(datapath('sample_variance_DTM'))
-        self.mean = numpy.split(mean, 562)
-        self.variance = numpy.split(variance, 562)
-        self.zeta = numpy.zeros(4)
-
-    def testUpdateZeta(self):
-        ldaseqmodel.update_zeta(self)
-        expected_zeta = numpy.array([ 286.24901747, 285.9899686 , 286.03548494, 286.63929586])
-        actual_zeta = self.zeta
-        self.assertAlmostEqual(expected_zeta[0], actual_zeta[0], places=2)
+    def testUpdateZeta(self):
+
+        # setting up mock values
+        mean = numpy.split(numpy.loadtxt(datapath('sample_mean_DTM')), 562)
+        variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562)
+        obs = numpy.resize(numpy.zeros(562 * 4), (562, 4))
+        zeta = numpy.zeros(4)
+
+        # setting up sslm object
+        sslm = ldaseqmodel.sslm(mean=mean, variance=variance, obs=obs, zeta=zeta)
+        ldaseqmodel.update_zeta(sslm)
+
+        expected_zeta = numpy.array([ 286.24901747, 285.9899686 , 286.03548494, 286.63929586])
+        actual_zeta = sslm.zeta
+        self.assertAlmostEqual(expected_zeta[0], actual_zeta[0], places=2)
+
+    def testPostVariance(self):
+
+        # setting up mock values
+        variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562)
+        fwd_variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562)
+        chain_variance = 0.005
+
+        sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, variance=variance, fwd_variance=fwd_variance)
+
+        # since we only check for the 0th word of compute_post_variance, we initialise our mock values
+
+        sslm.variance[0] = numpy.loadtxt(datapath('before_variance'))
+        sslm.fwd_variance[0] = numpy.loadtxt(datapath('before_fwd_variance'))
+
+        ldaseqmodel.compute_post_variance(0, sslm, chain_variance)
+
+        expected_variance = numpy.array([0.130797, 0.126054, 0.123787, 0.123906, 0.126415])
+        expected_fwd_variance = numpy.array([5, 0.454587, 0.239471, 0.164191, 0.126415])
+
+        self.assertAlmostEqual(expected_variance[0], sslm.variance[0][0], places=2)
+        self.assertAlmostEqual(expected_fwd_variance[0], sslm.fwd_variance[0][0], places=2)
+
+    def testPostMean(self):
+
+        # setting up mock values
+        obs = numpy.resize(numpy.zeros(562 * 4), (562, 4))
+        variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562)
+        fwd_variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562)
+        mean = numpy.split(numpy.loadtxt(datapath('sample_mean_DTM')), 562)
+        fwd_mean = numpy.split(numpy.loadtxt(datapath('sample_mean_DTM')), 562)
+        chain_variance = 0.005
+
+        sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, variance=variance, fwd_variance=fwd_variance, mean=mean, fwd_mean=fwd_mean, obs=obs)
+
+        # since we only check for the 0th word of compute_post_mean, we initialise our mock values
+        sslm.obs[0] = numpy.loadtxt(datapath('before_obs'))
+        sslm.mean[0] = numpy.loadtxt(datapath('before_mean'))
+        sslm.fwd_mean[0] = numpy.loadtxt(datapath('before_fwd_mean'))
+        sslm.variance[0] = numpy.loadtxt(datapath('before_variance'))
+        sslm.fwd_variance[0] = numpy.loadtxt(datapath('before_fwd_variance1'))
+
+        ldaseqmodel.compute_post_mean(0, sslm, chain_variance)
+
+        expected_mean = numpy.array([-1.40784, -1.40924, -1.41058, -1.41093, -1.41111])
+        expected_fwd_mean = numpy.array([0, -1.28744, -1.39419, -1.40497, -1.41111])
+
+        self.assertAlmostEqual(expected_mean[0], sslm.mean[0][0], places=2)
+        self.assertAlmostEqual(expected_fwd_mean[0], sslm.fwd_mean[0][0], places=2)
+
+    def testLogProb(self):
+
+        # setting up mock values
+        zeta = numpy.loadtxt(datapath('eprob_zeta'))
+        mean = numpy.split(numpy.loadtxt(datapath('eprob_mean')), 562)
+        e_log_prob = numpy.loadtxt(datapath('eprob_before'))
+        e_log_prob = numpy.resize(e_log_prob, (562, 4))
+        chain_variance = 0.005
+
+        sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, mean=mean, zeta=zeta, e_log_prob=e_log_prob)
+
+        # we are only checking the first few values;
+        expected_log_prob = numpy.array([-4.75, -4.7625, -4.76608, -4.76999])
+        ldaseqmodel.compute_expected_log_prob(sslm)
+
+        self.assertAlmostEqual(expected_log_prob[0], sslm.e_log_prob[0][0], places=2)
+
+
 if __name__ == '__main__':
     logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
     unittest.main()

From 17f7873f8818e94019a8ed0c1b022c01c2009b08 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Tue, 28 Jun 2016 20:52:54 +0530
Subject: [PATCH 09/38] Added test data files

---
 gensim/test/test_data/.DS_Store            | Bin 0 -> 10244 bytes
 gensim/test/test_data/before_fwd_mean      |    5 +
 gensim/test/test_data/before_fwd_variance  |    5 +
 gensim/test/test_data/before_fwd_variance1 |    5 +
 gensim/test/test_data/before_mean          |    5 +
 gensim/test/test_data/before_obs           |    4 +
 gensim/test/test_data/before_variance      |    5 +
 gensim/test/test_data/eprob_before         | 2248 ++++++++++++++++
 gensim/test/test_data/eprob_mean           | 2810 ++++++++++++++++++++
 gensim/test/test_data/eprob_zeta           |    4 +
 10 files changed, 5091 insertions(+)
 create mode 100644 gensim/test/test_data/.DS_Store
 create mode 100644 gensim/test/test_data/before_fwd_mean
 create mode 100644 gensim/test/test_data/before_fwd_variance
 create mode 100644 gensim/test/test_data/before_fwd_variance1
 create mode 100644 gensim/test/test_data/before_mean
 create mode 100644 gensim/test/test_data/before_obs
 create mode 100644 gensim/test/test_data/before_variance
 create mode 100644 gensim/test/test_data/eprob_before
 create mode 100644 gensim/test/test_data/eprob_mean
 create mode 100644 gensim/test/test_data/eprob_zeta

diff --git a/gensim/test/test_data/.DS_Store b/gensim/test/test_data/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..0533d30182c1f2823f7d097daa20527a4cd810cb
GIT binary patch
[... 10244-byte base85-encoded .DS_Store blob omitted: macOS Finder metadata, no reviewable content ...]
diff --git a/gensim/test/test_data/before_fwd_mean b/gensim/test/test_data/before_fwd_mean
new file mode 100644
index 0000000000..344aba8fc4
--- /dev/null
+++ b/gensim/test/test_data/before_fwd_mean
@@ -0,0 +1,5 @@
+0
+-1.28738
+-1.39413
+-1.40491
+-1.41105
diff --git a/gensim/test/test_data/before_fwd_variance b/gensim/test/test_data/before_fwd_variance
new file mode 100644
index 0000000000..229972f292
--- /dev/null
+++ b/gensim/test/test_data/before_fwd_variance
@@ -0,0 +1,5 @@
+0
+0
+0
+0
+0
diff --git a/gensim/test/test_data/before_fwd_variance1 b/gensim/test/test_data/before_fwd_variance1
new file mode 100644
index 0000000000..9469f47581
--- /dev/null
+++ b/gensim/test/test_data/before_fwd_variance1
@@ -0,0 +1,5 @@
+5
+0.454587
+0.239471
+0.164191
+0.126415
diff --git a/gensim/test/test_data/before_mean b/gensim/test/test_data/before_mean
new file mode 100644
index 0000000000..9b758baa11
--- /dev/null
+++ b/gensim/test/test_data/before_mean
@@ -0,0 +1,5 @@
+-1.40778
+-1.40919
+-1.41053
+-1.41087
+-1.41105
diff --git a/gensim/test/test_data/before_obs b/gensim/test/test_data/before_obs
new file mode 100644
index 0000000000..bd84df3fb5
--- /dev/null
+++ b/gensim/test/test_data/before_obs
@@ -0,0 +1,4 @@
+-1.41605
+-1.51033
+-1.42702
+-1.42924
diff --git a/gensim/test/test_data/before_variance b/gensim/test/test_data/before_variance
new file mode 100644
index 0000000000..0c371066b9
--- /dev/null
+++ b/gensim/test/test_data/before_variance
@@ -0,0 +1,5 @@
+0.130797
+0.126054
+0.123787
+0.123906
+0.126415
diff --git a/gensim/test/test_data/eprob_before b/gensim/test/test_data/eprob_before
new file mode 100644
index 0000000000..b29f0fc739
--- /dev/null
+++ b/gensim/test/test_data/eprob_before
@@ -0,0 +1,2248 @@
+-4.74994
+-4.76244
+-4.76602
+-4.76993
[... remaining 2244 expected-log-probability values omitted: the file largely repeats the block -7.06612, -7.06655, -7.06706, -7.06935, interspersed with per-word blocks like the four values above ...]
+-7.06655 +-7.06706 +-7.06935 +-4.76843 +-4.76596 +-4.7546 +-4.74357 +-5.16154 +-5.15969 +-5.15218 +-5.14553 +-4.76843 +-4.76596 +-4.7546 +-4.74357 +-7.06612 +-7.06655 +-7.06706 +-7.06935 diff --git a/gensim/test/test_data/eprob_mean b/gensim/test/test_data/eprob_mean new file mode 100644 index 0000000000..60621cf99b --- /dev/null +++ b/gensim/test/test_data/eprob_mean @@ -0,0 +1,2810 @@ +0.905953 +0.906859 +0.89346 +0.890039 +0.888237 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.19846 +1.19966 +1.18178 +1.17724 +1.17483 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.887422 +0.888309 +0.889874 +0.89276 +0.890946 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.850482 +0.851333 +0.852835 +0.849557 +0.84782 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.18046 +1.18164 +1.18373 +1.17918 +1.17676 +0.733484 +0.734217 +0.735537 +0.732618 +0.731079 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.850482 +0.851333 +0.852835 +0.849557 +0.84782 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.850482 +0.851333 +0.852835 +0.849557 +0.84782 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.4692 +1.47067 +1.47334 +1.46728 +1.46406 +0.706576 +0.707283 +0.708554 +0.70813 +0.706621 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.855175 +0.85603 +0.857541 
+0.854247 +0.852503 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +2.36713 +2.3695 +2.37485 +2.36017 +2.35236 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.829858 +0.830688 +0.832157 +0.828946 +0.827245 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.17815 +1.17933 +1.18142 +1.18683 +1.1844 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.862533 +0.863395 +0.864921 +0.86615 +0.864389 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 
+-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.07843 +1.07951 +1.08141 +1.08536 +1.08317 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.35468 +1.35603 +1.35852 +1.35307 +1.35023 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 
+0.607767 +0.608375 +0.609509 +0.60693 +0.60558 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.40624 +1.40765 +1.41019 +1.40449 +1.40147 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +2.29968 +2.30198 +2.30705 +2.2933 +2.28598 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +0.498117 +0.498615 +0.499572 +0.497266 +0.496045 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.892675 +0.893567 +0.895142 +0.891723 +0.889913 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.40624 +1.40765 +1.41019 +1.40449 +1.40147 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 
+-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.889266 +0.890155 +0.891728 +0.903262 +0.901436 +0.495921 +0.496417 +0.497371 +0.505047 +0.503816 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.495921 +0.496417 +0.497371 +0.505047 +0.503816 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 
+-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 
+-1.41058 +-1.41093 +-1.41111 +1.10466 +1.10576 +1.10772 +1.12203 +1.11976 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 
+-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.39728 +1.39868 +1.40121 +1.42033 +1.44219 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +1.17338 +1.17455 +1.17663 +1.19197 +1.20948 +1.98925 +1.99124 +1.99529 +2.02936 +2.0685 +0.494751 +0.495246 +0.496198 +0.503865 +0.512623 +0.494751 +0.495246 +0.496198 +0.503865 +0.512623 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.887463 +0.888351 +0.889917 +0.901437 +0.914582 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 +0.887463 +0.888351 +0.889917 +0.901437 +0.914582 +0.494751 +0.495246 +0.496198 +0.503865 +0.512623 +0.887463 +0.888351 +0.889917 +0.901437 +0.914582 +-1.40784 +-1.40924 +-1.41058 +-1.41093 +-1.41111 
diff --git a/gensim/test/test_data/eprob_zeta b/gensim/test/test_data/eprob_zeta
new file mode 100644
index 0000000000..3c18927e0d
--- /dev/null
+++ b/gensim/test/test_data/eprob_zeta
@@ -0,0 +1,4 @@
+286.248
+285.989
+286.036
+286.64

From 96b7f380eedcd8d8c1d85c54c0b79e0eccea5345 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 29 Jun 2016 23:47:37 +0530
Subject: [PATCH 10/38] Added more functions

---
 gensim/models/ldaseqmodel.py | 140 ++++++++++++++++++++++++++++++++---
 1 file changed, 128 insertions(+), 12 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index cf86cf8df9..02b6878bab 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -349,24 +349,94 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l
             lda_post.lhood = lhood
             lda_post.doc = seq_corpus.corpuses[t].doc[d]
             if iter_ == 0:
-                doc_lhood = fit_lda_post(d, t, post, None, None, None, None, None)
+                doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None)
             else:
-                doc_lhood = fit_lda_post(d, t, post, model, None, None, None, None)
+                doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None)
             if topic_suffstats != None:
                 update_lda_seq_ss(t, seq_corpus.corpuses[t].doc[d], lda_post, topic_suffstats)
             bound += doc_lhood
             doc_index += 1
     return
 
-def fit_lda_post():
+def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix):
+
+    init_lda_post(lda_post)
+
+    model = "DTM"
+    if model == "DIM":
+        # if in DIM then we initialise some variables here
+        pass
+
+    lhood = compute_lda_lhood(lda_post)
+    lhood_old = 0
+    # converted from a C do-while loop; start converged above the tolerance so the body runs at least once
+    converged = 1.0
+    iter_ = 0
+
+    while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER:
+
+        iter_ += 1
+        lhood_old = lhood
+        update_gamma(lda_post)
+
+        model = "DTM"
+        if model == "DTM" or ldaseq is None:
+            update_phi(doc_number, time, lda_post, ldaseq, g)
+        elif model == "DIM" and ldaseq is not None:
+            update_phi_fixed(doc_number, time, lda_post, ldaseq, g3_matrix, g4_matrix, g5_matrix)
+
+        lhood = compute_lda_lhood(lda_post)
+
+        # go through to this again: relative change in the bound, scaled by document length
+        converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total))
+
+    return lhood
+
+
+def make_lda_seq_slice(lda, ldaseq, time):
+
+    K = ldaseq.num_topics
+
+    for k in range(0, K):
+        # copy the chain's slice at `time` into the local LDA model:
+        # s = ldaseq.topic_chains[k].e_log_prob[time]
+        # d = lda.topics[k]
+        # deep_copy(s, d)
+        lda.topics[k] = ldaseq.topic_chains[k].e_log_prob[time]
+
+    lda.alpha = ldaseq.alphas
+    return
 
-def make_lda_seq_slice():
-    return
+def update_lda_seq_ss(time, doc, lda_post, topic_suffstats):
+
+    K = lda_post.phi.shape[1]
+    N = doc.nterms
+
+    for k in range(0, K):
+        topic_ss = topic_suffstats[k]
+        for n in range(0, N):
+            w = doc.word[n]
+            c = doc.count[n]
+            topic_ss[w][time] = topic_ss[w][time] + c * lda_post.phi[n][k]
+    return
 
-def update_lda_seq_ss():
-    return
+def init_lda_post(ldapost):
+    K = lda_post.lda.num_topics
+    N = lda_post.doc.nterms
+
+    for k in range(0, K):
+        ldapost.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K
+        for n in range(0, N):
+            lda_post.phi[n][k] = 1.0 / K
+
+    lda_post.doc_weight = None
+    return
+
+def compute_lda_lhood():
+    return
+
+def update_phi():
+    return
+
+def update_gamma():
+    return
 
 def fit_lda_seq_topics(ldaseq, topic_suffstats):
     lhood = 0
@@ -481,17 +551,63 @@ def compute_bound(word_counts, totals, sslm):
 
     return val
 
 # function to perform optimization
-# def update_obs(counts, totals, sslm):
+def update_obs(word_counts, totals, sslm):
+
+    OBS_NORM_CUTOFF = 2
+
+    W = sslm.num_terms
+    T = sslm.num_sequence
+
+    runs = 0
+
+    params = opt_params(var=sslm, totals=totals)
+    mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1))
+
+    for w in range(0, W):
+        w_counts = word_counts[w]
+
+        counts_norm = 0
+        # now we find the L2 norm of w_counts
+        for i in range(0, len(w_counts)):
+            counts_norm += w_counts[i] * w_counts[i]
+        counts_norm = numpy.sqrt(counts_norm)
+
+        if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None:
+            obs = sslm.obs[w]
+            # a memcopy is happening here
+            norm_cutoff_obs = obs
+        else:
+            if counts_norm < OBS_NORM_CUTOFF:
+                w_counts = numpy.zeros(len(w_counts))
+
+            for t in range(0, T):
+                mean_deriv = mean_deriv_mtx[t]
+                compute_mean_deriv(w, t, sslm, mean_deriv)
+
+            params.word_counts = w_counts
+            params.word = w
+            params.mean_deriv_mtx = mean_deriv_mtx
+            obs = sslm.obs[w]
+
+            model = "DTM"
+            if model == "DTM":
+                optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs, f_val, conv_val, niter)
+            if model == "DIM":
+                optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs_fixed, f_val, conv_val, niter)
+
+            runs += 1
+
+            if counts_norm < OBS_NORM_CUTOFF:
+                norm_cutoff_obs = obs
+
+    update_zeta(sslm)
+    return
 
-# W = sslm.num_terms
-# T = sslm.num_sequence
-# runs = 0
-# params = opt_params(var=sslm, totals=totals)
-# mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1))
-# # for w in range(0, W):
+
+def compute_mean_deriv():
+    return
+
+def optimize_fdf():
+    return
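fit_lda_post above mirrors a C do-while loop; since Python has no do-while, the body must be guaranteed to run once. A standalone sketch of that convergence pattern (names and tolerances are illustrative, not the patch's API):

    import numpy

    LDA_INFERENCE_CONVERGED = 1e-8
    LDA_INFERENCE_MAX_ITER = 25

    def iterate_until_converged(step, lhood):
        converged = 1.0  # seed above the tolerance so the first pass always runs
        iter_ = 0
        while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER:
            iter_ += 1
            lhood_old = lhood
            lhood = step()
            converged = numpy.fabs((lhood_old - lhood) / lhood_old)
        return lhood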
From 7c60f9045a6c3a91692e7693d389b5a424dccc27 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Thu, 30 Jun 2016 18:23:00 +0530
Subject: [PATCH 11/38] All methods completed

---
 gensim/models/ldaseqmodel.py | 154 +++++++++++++++++++++++++++++++++--
 1 file changed, 145 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 02b6878bab..fb9801f042 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -417,25 +417,102 @@ def update_lda_seq_ss(time, doc, lda_post, topic_suffstats):
             topic_ss[w][time] = topic_ss[w][time] + c * lda_post.phi[n][k]
     return
 
-def init_lda_post(ldapost):
+def init_lda_post(lda_post):
     K = lda_post.lda.num_topics
     N = lda_post.doc.nterms
 
     for k in range(0, K):
-        ldapost.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K
+        lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K
        for n in range(0, N):
            lda_post.phi[n][k] = 1.0 / K
 
     lda_post.doc_weight = None
     return
 
-def compute_lda_lhood():
-    return
+def compute_lda_lhood(lda_post):
+
+    K = lda_post.lda.num_topics
+    N = lda_post.doc.nterms
+    gamma_sum = numpy.sum(lda_post.gamma)
+
+    # figure out how to do flags
+    FLAGS_sigma_l = 0
+    FLAGS_sigma_d = 0
+
+    # need to find replacement for this gsl method
+    lhood = gls_sf_lngamma(numpy.sum(lda_post.lda.alpha)) - gls_sf_lngamma(gamma_sum)
+    lda_post.lhood[K] = lhood
+
+    influence_term = 0
+    # need to find replacement for this gsl method
+    digsum = gsl_sf_psi(gamma_sum)
+
+    model = "DTM"
+    for k in range(0, K):
+        if lda_post.doc_weight is not None and (model == "DIM" or model == "DTM"):
+            influence_topic = lda_post.doc_weight[k]
+            influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d))
+
+        e_log_theta_k = gsl_sf_psi(lda_post.gamma[k]) - digsum
+
+        # figure out what is this gsl stuff
+        lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + gls_sf_lngamma(lda_post.gamma[k]) - gls_sf_lngamma(lda_post.lda.alpha[k])
+
+        for n in range(0, N):
+            if lda_post.phi[n][k] > 0:
+                lhood_term += lda_post.doc.count[n] * lda_post.phi[n][k] * (e_log_theta_k + lda_post.lda.topics[lda_post.doc.word[n]][k] - lda_post.log_phi[n][k])
+
+        lda_post.lhood[k] = lhood_term
+        lhood += lhood_term
+        lhood += influence_term
+
+    return lhood
 
-def update_phi():
-    return
+# update variational multinomial parameters
+def update_phi(doc, time, lda_post, ldaseq, g):
+
+    K = lda_post.lda.num_topics
+    N = lda_post.doc.nterms
+
+    dig = numpy.zeros(K)
+
+    for k in range(0, K):
+        dig[k] = gsl_sf_psi(lda_post.gamma[k])
+
+    for n in range(0, N):
+        w = lda_post.doc.word[n]
+        for k in range(0, K):
+            lda_post.log_phi[n][k] = dig[k] + lda_post.lda.topics[w][k]
+
+        log_phi_row = lda_post.log_phi[n]
+        phi_row = lda_post.phi[n]
+
+        # log normalize
+        v = log_phi_row[0]
+        for i in range(1, len(log_phi_row)):
+            v = numpy.logaddexp(v, log_phi_row[i])
+
+        for i in range(0, len(log_phi_row)):
+            log_phi_row[i] = log_phi_row[i] - v
+
+        for i in range(0, len(log_phi_row)):
+            phi_row[i] = numpy.exp(log_phi_row[i])
+
+    return
 
-def update_gamma():
-    return
+# update variational dirichlet parameters
+def update_gamma(lda_post):
+
+    K = lda_post.lda.num_topics
+    N = lda_post.doc.nterms
+
+    # copy, so that the model's alpha is not mutated in place
+    lda_post.gamma = numpy.copy(lda_post.lda.alpha)
+    for n in range(0, N):
+        phi_row = lda_post.phi[n]
+        count = lda_post.doc.count[n]
+
+        for k in range(0, K):
+            lda_post.gamma[k] += phi_row[k] * count
+
+    return
 
 def fit_lda_seq_topics(ldaseq, topic_suffstats):
     lhood = 0
@@ -603,10 +680,69 @@ def update_obs(word_counts, totals, sslm):
 
     return
 
-def compute_mean_deriv():
-    return
+# compute d E[\beta_{t,w}] / d obs_{s,w} for t = 1:T
+# put the result in deriv, an allocated T+1 vector
+def compute_mean_deriv(word, time, sslm, deriv):
+
+    T = sslm.num_sequence
+    fwd_variance = sslm.variance[word]
+
+    deriv[0] = 0
+
+    # forward pass
+    for t in range(1, T + 1):
+        if sslm.obs_variance > 0.0:
+            w = sslm.obs_variance / (fwd_variance[t - 1] + sslm.chain_variance + sslm.obs_variance)
+        else:
+            w = 0.0
+
+        val = w * deriv[t - 1]
+        if time == t - 1:
+            val += (1 - w)
+
+        deriv[t] = val
+
+    # backward pass
+    for t in range(T - 1, -1, -1):
+        if sslm.chain_variance == 0.0:
+            w = 0.0
+        else:
+            w = sslm.chain_variance / (fwd_variance[t] + sslm.chain_variance)
+        deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1]
+
+    return deriv
 
-def optimize_fdf():
-    return
+# maximize a function using its derivative
+def optimize_fdf(dim, x, params, fdf, df, f, f_val, conv_val, niter):
+
+    MAX_ITER = 15
+    # the gsl_multimin_* names below mirror the C implementation and still need a Python equivalent
+    obj = gsl_multimin_function_fdf()
+    obj.f = f
+    obj.df = df
+    obj.fdf = fdf
+    obj.n = dim
+    obj.params = params
+
+    method = gsl_multimin_fdfminimizer_conjugate_fr
+    opt = gsl_multimin_fdfminimizer_alloc(method, dim)
+    gsl_multimin_fdfminimizer_set(opt, obj, x, 0.01, 1e-3)
+
+    iter_ = 0
+    f_old = 0
+    # converted from a do-while loop; seed converged so the first iteration runs
+    converged = 1.0
+
+    while converged > 1e-8 and iter_ < MAX_ITER:
+        iter_ += 1
+        status = gsl_multimin_fdfminimizer_iterate(opt)
+        converged = numpy.fabs((f_old - opt.f) / (dim * f_old))
+        f_old = opt.f
+
+    # all of these are pointer values being reset, so should probably return them
+    f_val = opt.f
+    conv_val = converged
+    niter = iter_
+
+    return
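The per-word loops in update_phi above can be collapsed into array operations. A vectorized sketch under assumed shapes (topics is a V x K matrix of log word probabilities, gamma a length-K vector; this is not the patch's API):

    import numpy
    from scipy.special import digamma

    def update_phi_vectorized(word_ids, topics, gamma):
        # dig[k] plays the role of E[log theta_k] in the update above
        dig = digamma(gamma)
        log_phi = dig[numpy.newaxis, :] + topics[word_ids, :]
        # stable log-normalization of each row, as in the logaddexp loop above
        log_phi -= numpy.logaddexp.reduce(log_phi, axis=1, keepdims=True)
        return numpy.exp(log_phi), log_phi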
From 0989ba30c431b2e7fb0c0cc2628db77d5b198a94 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Fri, 1 Jul 2016 19:39:58 +0530
Subject: [PATCH 12/38] Added functions

---
 gensim/models/ldaseqmodel.py | 144 ++++++++++++++++++++++++++++++++++-
 1 file changed, 141 insertions(+), 3 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index fb9801f042..34d6280498 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -66,7 +66,7 @@ def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None,
 
 class sslm(utils.SaveLoad):
     def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
-                mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None,
+                mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None, temp_vect=None,
                 mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
 
         self.obs = obs
@@ -89,12 +89,30 @@ def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=0.5
         self.w_phi_l_sq = w_phi_l_sq
         self.m_update_coeff_g = m_update_coeff_g
 
+        # scratch vector used when computing derivatives of the observations
+        self.temp_vect = temp_vect
+
 class lda_post(utils.SaveLoad):
     def __init__(self, doc=None, lda=None, phi=None, log_phi=None, gamma=None, lhood=None, doc_weight=None, renormalized_doc_weight=None):
+        self.doc = doc
+        self.lda = lda
+        self.phi = phi
+        self.log_phi = log_phi
+        self.gamma = gamma
+        self.lhood = lhood
+        self.doc_weight = doc_weight
+        self.renormalized_doc_weight = renormalized_doc_weight
+        return
 
 class opt_params(utils.SaveLoad):
-    def __init__(sslm, word_counts, totals, mean_deriv_mtx, word):
+    def __init__(self, sslm, word_counts, totals, mean_deriv_mtx, word):
+        self.sslm = sslm
+        self.word_counts = word_counts
+        self.totals = totals
+        self.mean_deriv_mtx = mean_deriv_mtx
+        self.word = word
+        return
 
 def update_zeta(sslm):
@@ -631,6 +649,12 @@ def compute_bound(word_counts, totals, sslm):
 def update_obs(word_counts, totals, sslm):
 
     OBS_NORM_CUTOFF = 2
+
+    # used in the optimize function but not sure what is happening
+    f_val = None
+    conv_val = None
+    niter = None
+
     W = sslm.num_terms
     T = sslm.num_sequence
@@ -745,13 +769,127 @@ def optimize_fdf(dim, x, params, fdf, df, f, f_val, conv_val, niter):
 
     return
 
+def fdf_obs(x, params, f, df):
+
+    p = params
+    model = "DTM"
+
+    if model == "DTM":
+        f = f_obs(x, params)
+        compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
+    elif model == "DIM":
+        f = f_obs_fixed(x, params)
+        compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
+
+    for i in range(0, len(df)):
+        df[i] = - df[i]
+
+def df_obs(x, params, df):
+
+    p = params
+    p.sslm.obs[p.word] = x
+
+    compute_post_mean(p.word, p.sslm, p.sslm.chain_variance)
+
+    model = "DTM"
+    if model == "DTM":
+        compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
+    elif model == "DIM":
+        compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
+
+def f_obs(x, params):
+
+    # flag
+    init_mult = 1000
+
+    T = len(x)
+    val = 0
+    term1 = 0
+    term2 = 0
+
+    # term 3 and 4 for DIM
+    term3 = 0
+    term4 = 0
+
+    p = params
+    p.sslm.obs[p.word] = x
+    compute_post_mean(p.word, p.sslm, p.sslm.chain_variance)
+
+    mean = p.sslm.mean[p.word]
+    variance = p.sslm.variance[p.word]
+    w_phi_l = p.sslm.w_phi_l[p.word]
+    m_update_coeff = p.sslm.m_update_coeff[p.word]
+
+    for t in range(1, T + 1):
+        mean_t = mean[t]
+        mean_t_prev = mean[t - 1]
+        var_t_prev = variance[t - 1]
+
+        val = mean_t - mean_t_prev
+        term1 += val * val
+        term2 += p.word_counts[t - 1] * mean_t - p.totals[t - 1] * numpy.exp(mean_t + variance[t] / 2) / p.sslm.zeta[t - 1]
+
+        model = "DTM"
+        if model == "DIM":
+            # stuff happens
+            pass
+
+    if p.sslm.chain_variance > 0.0:
+        term1 = - (term1 / (2 * p.sslm.chain_variance))
+        term1 = term1 - mean[0] * mean[0] / (2 * init_mult * p.sslm.chain_variance)
+    else:
+        term1 = 0.0
+
+    return -(term1 + term2 + term3 + term4)
+
+def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv):
+
+    # flag
+    init_mult = 1000
+
+    T = sslm.num_sequence
+
+    mean = sslm.mean[word]
+    variance = sslm.variance[word]
+
+    sslm.temp_vect = numpy.zeros(T)
+
+    for u in range(0, T):
+        sslm.temp_vect[u] = numpy.exp(mean[u + 1] + variance[u + 1] / 2)
+
+    w_phi_l = sslm.w_phi_l[word]
+    m_update_coeff = sslm.m_update_coeff[word]
+
+    for t in range(0, T):
+
+        mean_deriv = mean_deriv_mtx[t]
+        term1 = 0
+        term2 = 0
+        term3 = 0
+        term4 = 0
+
+        for u in range(1, T + 1):
+            mean_u = mean[u]
+            variance_u_prev = variance[u - 1]
+            mean_u_prev = mean[u - 1]
+            dmean_u = mean_deriv[u]
+            dmean_u_prev = mean_deriv[u - 1]
+
+            term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev)
+
+            term2 += (word_counts[u - 1] - (totals[u - 1] * sslm.temp_vect[u - 1] / sslm.zeta[u - 1])) * dmean_u
+
+            model = "DTM"
+            if model == "DIM":
+                # do some stuff
+                pass
+
+        if sslm.chain_variance:
+            term1 = - (term1 / sslm.chain_variance)
+            term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * sslm.chain_variance)
+        else:
+            term1 = 0.0
+
+        deriv[t] = term1 + term2 + term3 + term4
+
+    return
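The gsl_multimin_* calls sketched in optimize_fdf have no direct Python equivalent; scipy's conjugate-gradient minimizer covers the same maximize-via-negation pattern. A hedged sketch (it assumes f_obs and df_obs with the signatures above; fmin_cg is real scipy API, but the wiring here is illustrative, not the patch's code):

    import numpy
    from scipy import optimize

    def optimize_obs(x0, params):
        def grad(x, p):
            df = numpy.zeros(len(x))
            df_obs(x, p, df)
            return -df  # negate: f_obs already returns the negated bound
        return optimize.fmin_cg(f=f_obs, x0=x0, fprime=grad, args=(params,),
                                gtol=1e-3, disp=False)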
From bf4c41686f8ecd47fe7836347a115aa2dee82e22 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Sat, 2 Jul 2016 13:59:53 +0530
Subject: [PATCH 13/38] Added more methods

---
 gensim/models/ldaseqmodel.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 34d6280498..945179213b 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -458,7 +458,7 @@ def compute_lda_lhood(lda_post):
     FLAGS_sigma_d = 0
 
     # need to find replacement for this gsl method
-    lhood = gls_sf_lngamma(numpy.sum(lda_post.lda.alpha)) - gls_sf_lngamma(gamma_sum)
+    lhood = gsl_sf_lngamma(numpy.sum(lda_post.lda.alpha)) - gls_sf_lngamma(gamma_sum)
     lda_post.lhood[K] = lhood
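The digamma-based terms in compute_lda_lhood and update_phi rest on the standard identity E[log theta_k] = psi(gamma_k) - psi(sum_j gamma_j) for theta ~ Dirichlet(gamma). A tiny self-contained check of that identity (values are illustrative):

    import numpy
    from scipy.special import digamma

    def dirichlet_expectation(gamma):
        # E[log theta] under Dirichlet(gamma)
        return digamma(gamma) - digamma(numpy.sum(gamma))

    print(dirichlet_expectation(numpy.array([0.5, 1.0, 2.0])))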
From 7987f35b65835becc7be37d3e2a18aa60f408a61 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Sat, 2 Jul 2016 17:06:47 +0530
Subject: [PATCH 14/38] Replaces gsl functions

---
 gensim/models/ldaseqmodel.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 945179213b..b2375ed7b0 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -19,6 +19,8 @@
 from gensim import interfaces, utils, matutils
 from gensim.models import ldamodel
 import numpy
+import math
+from scipy.special import digamma
 
 class seq_corpus(utils.SaveLoad):
     def __init__(self, num_terms=0, max_nterms=0, length=0, num_doc=0, corpuses=0):
@@ -457,13 +459,11 @@ def compute_lda_lhood(lda_post):
     FLAGS_sigma_l = 0
     FLAGS_sigma_d = 0
 
-    # need to find replacement for this gsl method
-    lhood = gsl_sf_lngamma(numpy.sum(lda_post.lda.alpha)) - gls_sf_lngamma(gamma_sum)
+    lhood = math.lgamma(numpy.sum(lda_post.lda.alpha)) - math.lgamma(gamma_sum)
     lda_post.lhood[K] = lhood
 
     influence_term = 0
-    # need to find replacement for this gsl method
-    digsum = gsl_sf_psi(gamma_sum)
+    digsum = digamma(gamma_sum)
 
     model = "DTM"
     for k in range(0, K):
@@ -471,10 +471,9 @@ def compute_lda_lhood(lda_post):
             influence_topic = lda_post.doc_weight[k]
             influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d))
 
-        e_log_theta_k = gsl_sf_psi(lda_post.gamma[k]) - digsum
+        e_log_theta_k = digamma(lda_post.gamma[k]) - digsum
 
-        # figure out what is this gsl stuff
-        lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + gls_sf_lngamma(lda_post.gamma[k]) - gls_sf_lngamma(lda_post.lda.alpha[k])
+        lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + math.lgamma(lda_post.gamma[k]) - math.lgamma(lda_post.lda.alpha[k])
 
         for n in range(0, N):
             if lda_post.phi[n][k] > 0:
@@ -495,7 +494,7 @@ def update_phi(doc, time, lda_post, ldaseq, g):
     dig = numpy.zeros(K)
 
     for k in range(0, K):
-        dig[k] = gsl_sf_psi(lda_post.gamma[k])
+        dig[k] = digamma(lda_post.gamma[k])
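math.lgamma above only accepts scalars, which is fine for the per-topic loop; if these bounds are ever vectorized, scipy.special.gammaln is the array-aware equivalent. An illustrative comparison (not part of the patch):

    import math
    import numpy
    from scipy.special import gammaln

    gamma_vec = numpy.array([0.7, 1.3, 2.9])
    scalar_sum = sum(math.lgamma(g) for g in gamma_vec)
    vector_sum = float(numpy.sum(gammaln(gamma_vec)))
    assert abs(scalar_sum - vector_sum) < 1e-12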
From 2389177fba7aed4933cee2937719537f18e6c00b Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Mon, 4 Jul 2016 12:50:42 +0530
Subject: [PATCH 15/38] Added tests

---
 gensim/models/ldaseqmodel.py                  |   19 +-
 gensim/test/test_data/before_posterior_gamma  |    2 +
 gensim/test/test_data/before_posterior_logphi |  232 ++++
 gensim/test/test_data/before_posterior_phi    |  232 ++++
 gensim/test/test_data/before_posterior_topics | 1124 +++++++++++++++++
 gensim/test/test_ldaseqmodel.py               |   29 +
 6 files changed, 1637 insertions(+), 1 deletion(-)
 create mode 100644 gensim/test/test_data/before_posterior_gamma
 create mode 100644 gensim/test/test_data/before_posterior_logphi
 create mode 100644 gensim/test/test_data/before_posterior_phi
 create mode 100644 gensim/test/test_data/before_posterior_topics

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index b2375ed7b0..46840ad516 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -22,11 +22,23 @@
 import math
 from scipy.special import digamma
 
+# this is a mock LDA class to help with testing until this is figured out
+class mockLDA(utils.SaveLoad):
+    def __init__(self, num_topics, topics):
+        self.num_topics = num_topics
+        self.topics = topics
+
+# a mock document class to help with testing until this is figured out
+class doc(utils.SaveLoad):
+    def __init__(self, nterms, word):
+        self.nterms = nterms
+        self.word = word
+
 class seq_corpus(utils.SaveLoad):
     def __init__(self, num_terms=0, max_nterms=0, length=0, num_doc=0, corpuses=0):
         self.num_terms = num_terms
         self.max_nterms = max_nterms
-        self.length = length
+        self.length = len(corpuses)
         self.num_docs = num_docs
 
         # list of corpus class objects
@@ -362,12 +374,14 @@ def inferDTMseq(K, ldaseq, seq_corpus,
topic_suffstats, gammas, lhoods, iter_, l make_lda_seq_slice(lda, ldaseq, t) # what to do here ndocs = seq_corpus.corpuses[t].ndocs + # ndocs = len(seq_corpus.corpuses[t]) for d in range(0, ndocs): gam = gammas[doc_index] lhood = lhoods[doc_index] lda_post.gamma = gam lda_post.lhood = lhood lda_post.doc = seq_corpus.corpuses[t].doc[d] + # lda_post.doc = seq_corpus.corpuses[t][d] if iter_ == 0: doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) else: @@ -514,6 +528,9 @@ def update_phi(doc, time, lda_post, ldaseq, g): for k in range(0, K): phi_row[i] = numpy.exp(log_phi_row[i]) + lda_post.log_phi[n] = log_phi_row + lda_post.phi[n] = phi_row + return # update variational dirichlet parameters diff --git a/gensim/test/test_data/before_posterior_gamma b/gensim/test/test_data/before_posterior_gamma new file mode 100644 index 0000000000..930470ca86 --- /dev/null +++ b/gensim/test/test_data/before_posterior_gamma @@ -0,0 +1,2 @@ +0.01 +3.01 diff --git a/gensim/test/test_data/before_posterior_logphi b/gensim/test/test_data/before_posterior_logphi new file mode 100644 index 0000000000..af538b4031 --- /dev/null +++ b/gensim/test/test_data/before_posterior_logphi @@ -0,0 +1,232 @@ +-105.042 +0 +-103.888 +0 +-101.562 +0 +-104.22 +0 +-102.144 +0 +0 +-107.181 +0 +-106.269 +0 +-105.562 +0 +-105.423 +0 +-104.838 +0 +-108.256 +0 +-106.988 +0 +-103.146 +0 +-105.423 +0 +-107.733 +0 +-107.434 +0 +-105.423 +0 +-106.988 +0 +-105.006 +0 +-107.739 +0 +-105.423 +0 +-105.423 +0 +-107.236 +0 +-107.4 +0 +-107.728 +0 +-106.907 +0 +-106.667 +0 +-107.748 +0 +-106.988 +0 +-106.99 +0 +-107.438 +0 +-104.751 +0 +-106.667 +0 +-106.99 +0 +-106.99 +0 +-106.667 +0 +-106.667 +0 +-106.667 +0 +-106.99 +0 +-106.667 +0 +-106.99 +0 +-106.667 +0 +-106.667 +0 +-106.667 +0 +-107.75 +0 +-106.667 +0 +-106.99 +0 +-106.667 +0 +-106.667 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-106.667 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-106.99 +0 +-105.423 +0 +-105.423 +0 +-106.667 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-106.667 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 +0 +-105.423 diff --git a/gensim/test/test_data/before_posterior_phi b/gensim/test/test_data/before_posterior_phi new file mode 100644 index 0000000000..c9b9f678bc --- /dev/null +++ b/gensim/test/test_data/before_posterior_phi @@ -0,0 +1,232 @@ +2.40322e-46 +1 +7.61974e-46 +1 +7.79857e-45 +1 +5.46583e-46 +1 +4.35942e-45 +1 +1 +2.82936e-47 +1 +7.04587e-47 +1 +1.42956e-46 +1 +1.64151e-46 +1 +2.94744e-46 +1 +9.66349e-48 +1 +3.43223e-47 +1 +1.60024e-45 +1 +1.64151e-46 +1 +1.62944e-47 +1 +2.19805e-47 +1 +1.64151e-46 +1 +3.43223e-47 +1 +2.4908e-46 +1 +1.61979e-47 +1 +1.64151e-46 +1 +1.64151e-46 +1 +2.68012e-47 +1 +2.27403e-47 +1 +1.63776e-47 +1 +3.72445e-47 +1 +4.73041e-47 +1 +1.60524e-47 +1 +3.43223e-47 +1 +3.42567e-47 +1 +2.18987e-47 +1 +3.2139e-46 +1 +4.73041e-47 +1 +3.42567e-47 +1 +3.42567e-47 +1 +4.73041e-47 +1 +4.73041e-47 +1 +4.73041e-47 +1 
[... remaining fixture values omitted: the file alternates near-zero probabilities (~1e-46 to 1e-47) with 1s ...]
diff --git a/gensim/test/test_data/before_posterior_topics b/gensim/test/test_data/before_posterior_topics
new file mode 100644
index 0000000000..26b908973f
--- /dev/null
+++ b/gensim/test/test_data/before_posterior_topics
@@ -0,0 +1,1124 @@
[... 1124 lines of log-probability fixture values (mostly -7.14413 and -7.06935) omitted ...]
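A note for readers of the testUpdatePhi hunk below: per the model code in these patches, update_phi computes, for every word position n and topic k, log_phi[n][k] = digamma(gamma[k]) + topics[w_n][k], then log-normalizes each row so that phi[n] sums to one. A minimal self-contained sketch of that per-row update, assuming only numpy and scipy (phi_row_update is an illustrative name, not part of the module):

    import numpy
    from scipy.special import digamma

    def phi_row_update(gamma, log_topics_w):
        # unnormalized log phi for one word: E[log theta_k] + log beta_{k,w}
        log_phi = digamma(gamma) + log_topics_w
        # log-normalize the row (log-sum-exp, stable for very negative values)
        log_phi -= log_phi.max() + numpy.log(numpy.exp(log_phi - log_phi.max()).sum())
        return log_phi, numpy.exp(log_phi)

    # illustrative values taken from the fixture file above
    log_phi, phi = phi_row_update(numpy.array([0.01, 3.01]), numpy.array([-7.14413, -4.76993]))
    assert abs(phi.sum() - 1.0) < 1e-12  # each phi row is a distribution over topics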
diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py
index d13bad396f..b943283c89 100644
--- a/gensim/test/test_ldaseqmodel.py
+++ b/gensim/test/test_ldaseqmodel.py
@@ -97,6 +97,35 @@ def testLogProb(self):
         self.assertAlmostEqual(expected_log_prob[0], sslm.e_log_prob[0][0], places=2)
 
+# to test by Wednesday:
+# 1. update_phi
+# 2. update_gamma
+# 3. update_ldaseq_ss
+# 4. init_lda_post
+# 5. compute_bound
+# 6. compute_mean_deriv
+# 7. compute_obs_deriv
+# 8. compute_lda_lhood
+
+    def testUpdatePhi(self):
+        # we test update_phi for one particular document
+        doc = ldaseqmodel.doc(nterms=3, word=[549, 560, 561])
+        topics = numpy.array(numpy.split(numpy.loadtxt(datapath('before_posterior_topics')), 562))
+        lda = ldaseqmodel.mockLDA(num_topics=2, topics=topics)
+
+        log_phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_posterior_logphi')), 116))
+        phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_posterior_phi')), 116))
+        gamma = numpy.array(numpy.loadtxt(datapath('before_posterior_gamma')))
+
+        lda_post = ldaseqmodel.lda_post(lda=lda, doc=doc, log_phi=log_phi, phi=phi, gamma=gamma)
+        ldaseqmodel.update_phi(10, 3, lda_post, None, None)
+
+        expected_log_phi = numpy.array([[-105.04211145, 0.], [-103.88817145, 0.]])
+        expected_phi = numpy.array([[2.40322000e-46, 1.00000000e+00], [7.61974000e-46, 1.00000000e+00]])
+
+        self.assertAlmostEqual(expected_log_phi[0][0], lda_post.log_phi[0][0], places=2)
+        self.assertAlmostEqual(expected_phi[0][0], lda_post.phi[0][0], places=2)
 
 if __name__ == '__main__':

From 6fe8524a92a3d79b6376d4f3bf4f537149e19963 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Wed, 6 Jul 2016 19:04:32 +0530
Subject: [PATCH 16/38] Wrote all tests

---
 gensim/models/ldaseqmodel.py                  |   62 +-
 gensim/test/test_data/DTM/.DS_Store           |  Bin 0 -> 10244 bytes
 gensim/test/test_data/DTM/before_bound_counts | 2248 +++++++++++++
 .../test/test_data/DTM/before_bound_fwd_mean  | 2810 +++++++++++++++++
 .../test_data/DTM/before_bound_fwd_variance   | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_bound_mean   | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_bound_obs    | 2248 +++++++++++++
 gensim/test/test_data/DTM/before_bound_totals |    4 +
 .../before_bound_variance}                    |    0
 .../test/test_data/DTM/before_bound_w_phi_l   | 2248 +++++++++++++
 gensim/test/test_data/DTM/before_bound_zeta   |    4 +
 gensim/test/test_data/DTM/before_fobs_audit   |    1 +
 .../test/test_data/DTM/before_fobs_fwd_mean   | 2810 +++++++++++++++++
 .../test_data/DTM/before_fobs_fwd_variance    | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_fobs_mean    | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_fobs_mupdate | 2248 +++++++++++++
 gensim/test/test_data/DTM/before_fobs_obs     | 2248 +++++++++++++
 gensim/test/test_data/DTM/before_fobs_totals  |    4 +
 .../test/test_data/DTM/before_fobs_variance   | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_fobs_w_phi_l | 2248 +++++++++++++
 .../test/test_data/DTM/before_fobs_wordcounts |    4 +
 gensim/test/test_data/DTM/before_fobs_x       |    4 +
 gensim/test/test_data/DTM/before_fobs_zeta    |    4 +
 .../test/test_data/{ => DTM}/before_fwd_mean  |    0
 .../test_data/{ => DTM}/before_fwd_variance   |    0
 .../test_data/{ => DTM}/before_fwd_variance1  |    0
 .../before_ldaseq_phi}                        |    0
 .../test/test_data/DTM/before_ldaseq_sstats_0 | 2248 +++++++++++++
 .../test/test_data/DTM/before_ldaseq_sstats_1 | 2248 +++++++++++++
 .../before_lhood_gamma}                       |    0
 .../test/test_data/DTM/before_lhood_lda_alpha |    2 +
 .../before_lhood_lda_topics}                  |    0
 gensim/test/test_data/DTM/before_lhood_lhood  |    3 +
 .../before_lhood_log_phi}                     |    0
 gensim/test/test_data/DTM/before_lhood_phi    |  232 ++
 gensim/test/test_data/DTM/before_log_norm.dat |  562 ++++
 gensim/test/test_data/{ => DTM}/before_mean   |    0
 gensim/test/test_data/DTM/before_mean_deriv   |    5 +
 .../before_mean_deriv_variance}               |    0
 gensim/test/test_data/{ => DTM}/before_obs    |    0
 gensim/test/test_data/DTM/before_obs_deriv    |    4 +
 gensim/test/test_data/DTM/before_obs_m_update | 2248 +++++++++++++
 gensim/test/test_data/DTM/before_obs_mean     | 2810 +++++++++++++++++
 .../test_data/DTM/before_obs_mean_deriv_mtx   |   20 +
 gensim/test/test_data/DTM/before_obs_totals   |    4 +
 gensim/test/test_data/DTM/before_obs_variance | 2810 +++++++++++++++++
 gensim/test/test_data/DTM/before_obs_w_phi_l  | 2248 +++++++++++++
 .../test/test_data/DTM/before_obs_wordcounts  |    4 +
 gensim/test/test_data/DTM/before_obs_zeta     |    4 +
 .../test/test_data/DTM/before_posterior_gamma |    2 +
 .../test_data/DTM/before_posterior_logphi     |  232 ++
 .../test/test_data/DTM/before_posterior_phi   |  232 ++
 .../test_data/DTM/before_posterior_topics     | 1124 +++++++
 gensim/test/test_data/DTM/before_update_gamma |    2 +
 gensim/test/test_data/DTM/before_update_phi   |  232 ++
 gensim/test/test_data/DTM/before_variance     |    5 +
 gensim/test/test_data/{ => DTM}/eprob_before  |    0
gensim/test/test_data/{ => DTM}/eprob_mean | 0 gensim/test/test_data/{ => DTM}/eprob_zeta | 0 .../test/test_data/{ => DTM}/sample_mean_DTM | 0 gensim/test/test_data/DTM/sample_variance_DTM | 2810 +++++++++++++++++ gensim/test/test_ldaseqmodel.py | 159 +- 62 files changed, 53455 insertions(+), 40 deletions(-) create mode 100644 gensim/test/test_data/DTM/.DS_Store create mode 100644 gensim/test/test_data/DTM/before_bound_counts create mode 100644 gensim/test/test_data/DTM/before_bound_fwd_mean create mode 100644 gensim/test/test_data/DTM/before_bound_fwd_variance create mode 100644 gensim/test/test_data/DTM/before_bound_mean create mode 100644 gensim/test/test_data/DTM/before_bound_obs create mode 100644 gensim/test/test_data/DTM/before_bound_totals rename gensim/test/test_data/{sample_variance_DTM => DTM/before_bound_variance} (100%) create mode 100644 gensim/test/test_data/DTM/before_bound_w_phi_l create mode 100644 gensim/test/test_data/DTM/before_bound_zeta create mode 100644 gensim/test/test_data/DTM/before_fobs_audit create mode 100644 gensim/test/test_data/DTM/before_fobs_fwd_mean create mode 100644 gensim/test/test_data/DTM/before_fobs_fwd_variance create mode 100644 gensim/test/test_data/DTM/before_fobs_mean create mode 100644 gensim/test/test_data/DTM/before_fobs_mupdate create mode 100644 gensim/test/test_data/DTM/before_fobs_obs create mode 100644 gensim/test/test_data/DTM/before_fobs_totals create mode 100644 gensim/test/test_data/DTM/before_fobs_variance create mode 100644 gensim/test/test_data/DTM/before_fobs_w_phi_l create mode 100644 gensim/test/test_data/DTM/before_fobs_wordcounts create mode 100644 gensim/test/test_data/DTM/before_fobs_x create mode 100644 gensim/test/test_data/DTM/before_fobs_zeta rename gensim/test/test_data/{ => DTM}/before_fwd_mean (100%) rename gensim/test/test_data/{ => DTM}/before_fwd_variance (100%) rename gensim/test/test_data/{ => DTM}/before_fwd_variance1 (100%) rename gensim/test/test_data/{before_posterior_phi => DTM/before_ldaseq_phi} (100%) create mode 100644 gensim/test/test_data/DTM/before_ldaseq_sstats_0 create mode 100644 gensim/test/test_data/DTM/before_ldaseq_sstats_1 rename gensim/test/test_data/{before_posterior_gamma => DTM/before_lhood_gamma} (100%) create mode 100644 gensim/test/test_data/DTM/before_lhood_lda_alpha rename gensim/test/test_data/{before_posterior_topics => DTM/before_lhood_lda_topics} (100%) create mode 100644 gensim/test/test_data/DTM/before_lhood_lhood rename gensim/test/test_data/{before_posterior_logphi => DTM/before_lhood_log_phi} (100%) create mode 100644 gensim/test/test_data/DTM/before_lhood_phi create mode 100644 gensim/test/test_data/DTM/before_log_norm.dat rename gensim/test/test_data/{ => DTM}/before_mean (100%) create mode 100644 gensim/test/test_data/DTM/before_mean_deriv rename gensim/test/test_data/{before_variance => DTM/before_mean_deriv_variance} (100%) rename gensim/test/test_data/{ => DTM}/before_obs (100%) create mode 100644 gensim/test/test_data/DTM/before_obs_deriv create mode 100644 gensim/test/test_data/DTM/before_obs_m_update create mode 100644 gensim/test/test_data/DTM/before_obs_mean create mode 100644 gensim/test/test_data/DTM/before_obs_mean_deriv_mtx create mode 100644 gensim/test/test_data/DTM/before_obs_totals create mode 100644 gensim/test/test_data/DTM/before_obs_variance create mode 100644 gensim/test/test_data/DTM/before_obs_w_phi_l create mode 100644 gensim/test/test_data/DTM/before_obs_wordcounts create mode 100644 gensim/test/test_data/DTM/before_obs_zeta create mode 100644 
gensim/test/test_data/DTM/before_posterior_gamma create mode 100644 gensim/test/test_data/DTM/before_posterior_logphi create mode 100644 gensim/test/test_data/DTM/before_posterior_phi create mode 100644 gensim/test/test_data/DTM/before_posterior_topics create mode 100644 gensim/test/test_data/DTM/before_update_gamma create mode 100644 gensim/test/test_data/DTM/before_update_phi create mode 100644 gensim/test/test_data/DTM/before_variance rename gensim/test/test_data/{ => DTM}/eprob_before (100%) rename gensim/test/test_data/{ => DTM}/eprob_mean (100%) rename gensim/test/test_data/{ => DTM}/eprob_zeta (100%) rename gensim/test/test_data/{ => DTM}/sample_mean_DTM (100%) create mode 100644 gensim/test/test_data/DTM/sample_variance_DTM diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 46840ad516..aec1afc243 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -24,15 +24,18 @@ # this is a mock LDA class to help with testing until this is figured out class mockLDA(utils.SaveLoad): - def __init__(self, num_topics, topics): + def __init__(self, num_topics=None, topics=None, alpha=None): self.num_topics = num_topics self.topics = topics + self.alpha = alpha # a mock document class to help with testing until this is figured out class doc(utils.SaveLoad): - def __init__(self, nterms, word): + def __init__(self, nterms=None, word=None, count=None, total=None): self.nterms = nterms self.word = word + self.count = count + self.total = total class seq_corpus(utils.SaveLoad): def __init__(self, num_terms=0, max_nterms=0, length=0, num_doc=0, corpuses=0): @@ -117,22 +120,20 @@ def __init__(self, doc=None, lda=None, phi=None, log_phi=None, gamma=None, lhood self.doc_weight = doc_weight self.renormalized_doc_weight = renormalized_doc_weight - return - class opt_params(utils.SaveLoad): - def __init__(self, sslm, word_counts, totals, mean_deriv_mtx, word): + def __init__(self, sslm=None, word_counts=None, totals=None, mean_deriv_mtx=None, word=None): self.sslm = sslm - self.word_counts + self.word_counts = word_counts self.totals = totals self.mean_deriv_mtx = mean_deriv_mtx self.word = word - return - def update_zeta(sslm): # setting limits - num_terms = sslm.obs.shape[0] # this is word length (our example, 562) - num_sequence = sslm.obs.shape[1] # this is number of sequeces + # num_terms = sslm.obs.shape[0] # this is word length (our example, 562) + # num_sequence = sslm.obs.shape[1] # this is number of sequeces + num_terms = sslm.num_terms + num_sequence = sslm.num_sequence # making zero and updating sslm.zeta.fill(0) for i in range(0, num_terms): @@ -225,7 +226,7 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): log_norm_counts = sstats log_norm_counts = log_norm_counts / sum(log_norm_counts) - log_norm_counts = log_norm_counts + 1.0/W + log_norm_counts = log_norm_counts + 1.0 / W log_norm_counts = log_norm_counts / sum(log_norm_counts) log_norm_counts = numpy.log(log_norm_counts) @@ -440,7 +441,7 @@ def make_lda_seq_slice(lda, ldaseq, time): def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): - K = numpy.size(lda_post.phi)[1].size[1] + K = numpy.shape(lda_post.phi)[1] N = doc.nterms for k in range(0, K): @@ -449,6 +450,9 @@ def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): w = doc.word[n] c = doc.count[n] topic_ss[w][time] = topic_ss[w][time] + c * lda_post.phi[n][k] + + topic_suffstats[k] = topic_ss + return def init_lda_post(lda_post): @@ -456,7 +460,7 @@ def init_lda_post(lda_post): N = lda_post.doc.nterms 
for k in range(0, K): - lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / k + lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K for n in range(0, N): lda_post.phi[n][k] = 1.0 / K @@ -467,7 +471,7 @@ def compute_lda_lhood(lda_post): K = lda_post.lda.num_topics N = lda_post.doc.nterms - gamma_sum = numpy.sum(lda_post.gamam) + gamma_sum = numpy.sum(lda_post.gamma) # figure out how to do flags FLAGS_sigma_l = 0 @@ -638,6 +642,9 @@ def compute_bound(word_counts, totals, sslm): print ("Computing bound, all times") for t in range(1, T + 1): + term_1 = 0.0 + term_2 = 0.0 + ent = 0.0 for w in range(0, W): m = sslm.mean[w][t] @@ -647,20 +654,17 @@ def compute_bound(word_counts, totals, sslm): # Values specifically related to document influence: # Note that our indices are off by 1 here. - w_phi_l = sslm.w_phi_l[w][t - 1] - exp_i = numpy.exp(numpy.negative(prev_m)) - + exp_i = numpy.exp(-prev_m) term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - term_2 += word_counts[w][t - 1] * m ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) - term_3 += totals[t - 1] * numpy.log(sslm.zeta[t - 1]) - val += numpy.negative(term_1) + term_2 + term_3 + ent + term_3 = -totals[t - 1] * numpy.log(sslm.zeta[t - 1]) + val += term_2 + term_3 + ent - term_1 return val - + # fucntion to perform optimization def update_obs(word_counts, totals, sslm): @@ -726,7 +730,7 @@ def update_obs(word_counts, totals, sslm): def compute_mean_deriv(word, time, sslm, deriv): T = sslm.num_sequence - fwd_variance = sslm.variance[w] + fwd_variance = sslm.variance[word] deriv[0] = 0 @@ -750,7 +754,7 @@ def compute_mean_deriv(word, time, sslm, deriv): w = sslm.chain_variance / (fwd_variance[t] + sslm.chain_variance) deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] - return deriv + return # maximize a function using it's derivative def optimize_fdf(dim, x, params, fdf, df, f, f_val, conv_val, niter): @@ -764,8 +768,8 @@ def optimize_fdf(dim, x, params, fdf, df, f, f_val, conv_val, niter): obj.n = dim obj.params = params - method = gsl_multimin_fdfminimizer_conjugate_fr; - opt = gsl_multimin_fdfminimizer_alloc(method, dim); + method = gsl_multimin_fdfminimizer_conjugate_fr + opt = gsl_multimin_fdfminimizer_alloc(method, dim) gsl_multimin_fdfminimizer_set(opt, obj, x, 0.01, 1e-3) iter_ = 0 @@ -850,8 +854,8 @@ def f_obs(x, params): if p.sslm.chain_variance > 0.0: - term1 = - (term1 / 2 * p.sslm.chain_variance) - term1 = term1 - mean[0] * mean[0] / (2 * init_milt * p.sslm.chain_variance) + term1 = - (term1 / (2 * p.sslm.chain_variance)) + term1 = term1 - mean[0] * mean[0] / (2 * init_mult * p.sslm.chain_variance) else: term1 = 0.0 @@ -891,7 +895,7 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): dmean_u = mean_deriv[u] dmean_u_prev = mean_deriv[u - 1] - term1 += (mean_u - mean_u_prev) * (dmean_u * dmean_u_prev) + term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) term2 += (word_counts[u - 1] - (totals[u - 1] * sslm.temp_vect[u - 1] / sslm.zeta[u - 1])) * dmean_u @@ -902,7 +906,7 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): if sslm.chain_variance: term1 = - (term1 / sslm.chain_variance) - term1 = term1 - (mean[0] * mean_deriv[0]) / init_mult * sslm.chain_variance + term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * sslm.chain_variance) else: term1 = 0.0 diff --git a/gensim/test/test_data/DTM/.DS_Store 
b/gensim/test/test_data/DTM/.DS_Store
new file mode 100644
index 0000000000000000000000000000000000000000..370ff7731c7b49e3a7d694e159a4112485295d6e
GIT binary patch
literal 10244
[10244-byte base85 binary blob omitted: .DS_Store is a macOS Finder artifact accidentally committed, not test data]

Date: Mon, 18 Jul 2016 20:34:56 +0530
Subject: [PATCH 17/38] Changed structure

---
 gensim/models/ldaseqmodel.py    | 337 ++++++++++++++++----------------
 gensim/test/test_ldaseqmodel.py |  40 ++--
 2 files changed, 185 insertions(+), 192 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index aec1afc243..6e306617f9 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -24,10 +24,14 @@
 # this is a mock LDA class to help with testing until this is figured out
 class mockLDA(utils.SaveLoad):
-    def __init__(self, num_topics=None, topics=None, alpha=None):
+    def __init__(self, num_topics=None, alpha=None, num_terms=None, topics=None):
         self.num_topics = num_topics
-        self.topics = topics
+        self.num_terms = num_terms
         self.alpha = alpha
+        if topics is None:
+            self.topics = numpy.array(numpy.split(numpy.zeros(num_terms * num_topics), num_terms))
+        elif topics is not None:
+            self.topics = topics
 
 # a mock document class to help with testing until this is figured out
 class doc(utils.SaveLoad):
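An aside on the zero-topics initialisation just added to mockLDA: numpy.array(numpy.split(numpy.zeros(num_terms * num_topics), num_terms)) builds a (num_terms, num_topics) zero matrix the long way round; passing a shape tuple to numpy.zeros is equivalent, as this quick check (with the sizes used throughout these tests) confirms:

    import numpy

    num_terms, num_topics = 562, 2  # sizes used by the DTM test fixtures
    via_split = numpy.array(numpy.split(numpy.zeros(num_terms * num_topics), num_terms))
    direct = numpy.zeros((num_terms, num_topics))
    assert via_split.shape == direct.shape == (562, 2)
    assert (via_split == direct).all()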
@@ -48,41 +52,29 @@ def __init__(self, num_terms=0, max_nterms=0, length=0, num_doc=0, corpuses=0):
         self.corpuses = corpuses
 
 class LdaSeqModel(utils.SaveLoad):
-    def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequence=None, num_terms=None, alphas=None, top_doc_phis=None,
-                 topic_chains=None, influence=None, influence_sum_lgl=None, renormalized_influence=None):
+    def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequences=None, num_terms=None, alphas=None, top_doc_phis=None,
+                 topic_chains=[], influence=None, influence_sum_lgl=None, renormalized_influence=None):
         # store user-supplied parameters
+        self.corpus = corpus
         self.id2word = id2word
-        if corpus is None and self.id2word is None:
-            raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality')
-
-        if self.id2word is None:
-            logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
-            self.id2word = utils.dict_from_corpus(corpus)
-            self.num_terms = len(self.id2word)
-        elif len(self.id2word) > 0:
-            self.num_terms = 1 + max(self.id2word.keys())
-        else:
-            self.num_terms = 0
-
-        if self.num_terms == 0:
-            raise ValueError("cannot compute DTM over an empty collection (no terms)")
-
-        self.num_topics = num_topics
-        self.num_sequence = num_sequence
+        self.num_topics = num_topics
+        self.num_sequences = num_sequences
+        self.num_terms = num_terms
         self.alphas = alphas
-        self.topic_chains = []
-        for topic in range(0, num_topics):
-            topic_chains.append(sslm)
+        self.topic_chains = topic_chains
+        if self.topic_chains is None:
+            for topic in range(0, num_topics):
+                sslm_ = sslm(num_sequences=num_sequences, num_terms=num_terms, num_topics=num_topics)
+                topic_chains.append(sslm_)
         self.top_doc_phis = top_doc_phis
 
-        # influence values as of now not using
         self.influence = influence
         self.renormalized_influence = renormalized_influence
         self.influence_sum_lgl = influence_sum_lgl
 
 class sslm(utils.SaveLoad):
-    def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
+    def __init__(self, num_terms=None, num_sequences=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None,
                  mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None, temp_vect=None,
                  mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None):
@@ -91,7 +83,7 @@ def __init__(self, num_terms=None, num_sequence=None, obs=None, obs_variance=0.5
     self.mean = mean # matrix of dimensions num_terms * (num_of sequences + 1)
     self.variance = variance # matrix of dimensions num_terms * (num_of sequences + 1)
     self.num_terms = num_terms
-    self.num_sequence = num_sequence
+    self.num_sequences = num_sequences
     self.obs_variance = obs_variance
     self.chain_variance= chain_variance
     self.fwd_variance = fwd_variance
@@ -120,24 +112,30 @@ def __init__(self, doc=None, lda=None, phi=None, log_phi=None, gamma=None, lhood
         self.doc_weight = doc_weight
         self.renormalized_doc_weight = renormalized_doc_weight
 
-class opt_params(utils.SaveLoad):
-    def __init__(self, sslm=None, word_counts=None, totals=None, mean_deriv_mtx=None, word=None):
-        self.sslm = sslm
-        self.word_counts = word_counts
-        self.totals = totals
-        self.mean_deriv_mtx = mean_deriv_mtx
-        self.word = word
+def make_seq_corpus(corpus, time_seq):
+    split_corpus = []
+    time_seq.insert(0, 0)
+    for time in range(0, len(time_seq) - 1):
+        time_seq[time + 1] = time_seq[time] + time_seq[time + 1]
+        split_corpus.append(corpus[time_seq[time]:time_seq[time+1]])
+
+    num_docs = len(corpus)
+    length = len(split_corpus)
+    # num_terms = len(corpus.dictionary)
+
+    seq_corpus_ = seq_corpus(num_docs=num_docs, length=length, corpuses=split_corpus)
+    return seq_corpus_
 
 def update_zeta(sslm):
     # setting limits
     # num_terms = sslm.obs.shape[0] # this is word length (our example, 562)
-    # num_sequence = sslm.obs.shape[1] # this is number of sequeces
+    # num_sequences = sslm.obs.shape[1] # this is number of sequeces
     num_terms = sslm.num_terms
-    num_sequence = sslm.num_sequence
+    num_sequences = sslm.num_sequences
    # making zero and updating
    sslm.zeta.fill(0)
    for i in range(0, num_terms):
-        for j in range(0, num_sequence):
+        for j in range(0, num_sequences):
            m = sslm.mean[i][j + 1]
            v = sslm.variance[i][j + 1]
            val = numpy.exp(m + v/2)
@@ -145,7 +143,7 @@ def update_zeta(sslm):
     return
 
 def compute_post_variance(word , sslm, chain_variance):
-    T = sslm.num_sequence
+    T = sslm.num_sequences
     variance = sslm.variance[word] # pick wordth row
     fwd_variance = sslm.fwd_variance[word] # pick wordth row
@@ -175,7 +173,7 @@ def compute_post_variance(word , sslm, chain_variance):
 
 def compute_post_mean(word, sslm, chain_variance):
-    T = sslm.num_sequence
+    T = sslm.num_sequences
     obs = sslm.obs[word] # wordth row
     mean = sslm.mean[word]
     fwd_mean = sslm.fwd_mean[word]
@@ -211,7 +209,7 @@ def compute_post_mean(word, sslm, chain_variance):
 
 def compute_expected_log_prob(sslm):
     W = sslm.num_terms
-    T =
sslm.num_sequence + T = sslm.num_sequences for t in range(0, T): for w in range(0, W): sslm.e_log_prob[w][t] = sslm.mean[w][t + 1] - numpy.log(sslm.zeta[t]) @@ -221,7 +219,7 @@ def compute_expected_log_prob(sslm): def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): W = sslm.num_terms - T = sslm.num_sequence + T = sslm.num_sequences log_norm_counts = sstats log_norm_counts = log_norm_counts / sum(log_norm_counts) @@ -230,6 +228,7 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): log_norm_counts = log_norm_counts / sum(log_norm_counts) log_norm_counts = numpy.log(log_norm_counts) + # setting variational observations to transformed counts for t in range(0, T): sslm.obs[:,t] = log_norm_counts @@ -248,25 +247,28 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): update_zeta(sslm) compute_expected_log_prob(sslm) -def init_ldaseq_ss(ldaseq, lda, alpha, topic_chain_variance, topic_obs_variance): - ldaseq.alpha = alpha +def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + + ldaseq.alphas = alpha for k in range(0, ldaseq.num_topics): - sstats = lda.state.sstats[k] - sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) + sstats = init_suffstats[:,k] + sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) # dont't need to initialize here, but writing for reference - ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence)) - ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence)) - ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequence)) + ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) def fit_lda_seq(ldaseq, seq_corpus): + K = ldaseq.num_topics W = ldaseq.num_terms data_len = seq_corpus.length - no_docs = seq_corpus.no_docs + num_docs = seq_corpus.num_docs # heldout_gammas = NULL # heldout_llhood = NULL + LDA_INFERENCE_MAX_ITER = 25 bound = 0 heldout_bound = 0 @@ -283,11 +285,11 @@ def fit_lda_seq(ldaseq, seq_corpus): last_iter = 0 # this is a flag/input do something about it - lda_seq_min_iter = 0 - lda_seq_max_iter = 0 + lda_seq_min_iter = 6 + lda_seq_max_iter = 20 while iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter): - if not (iter_ < lda_sequence_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter)): + if not (iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter)): last_iter = 1 # log @@ -299,27 +301,30 @@ def fit_lda_seq(ldaseq, seq_corpus): old_bound = bound # initiate sufficient statistics - topic_suffstats = numpy.zeros(K) + topic_suffstats = [] for k in range(0, K): - topic_suffstats[k] = numpy.resize(numpy.zeros(W * data_len), (W, data_len)) + topic_suffstats.append(numpy.resize(numpy.zeros(W * data_len), (W, data_len))) # set up variables - gammas = numpy.resize(numpy.zeros(no_docs * K), (no_docs, K)) - lhoods = numpy.resize(numpy.zeros(no_docs * K + 1), (no_docs, K + 1)) + gammas = numpy.resize(numpy.zeros(num_docs * K), (num_docs, K)) + lhoods = numpy.resize(numpy.zeros(num_docs * K + 1), (num_docs, K + 1)) bound = 
lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) + # figure out how to write to file here # TODO save to file for command line - gammas_file.write(gammas) - lhoods_file.write(lhoods) + # gammas_file.write(gammas) + # lhoods_file.write(lhoods) print ("M Step") topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats) bound += topic_bound - write_lda_seq(ldaseq) + + + # write_lda_seq(ldaseq) if ((bound - old_bound) < 0): if (LDA_INFERENCE_MAX_ITER == 1): @@ -341,7 +346,7 @@ def fit_lda_seq(ldaseq, seq_corpus): print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER) convergence = 1.0 - print ("%d lda seq bound is = %d, convergence is %d", iter_, bound, convergence) + print (iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence) iter_ += 1 @@ -354,7 +359,7 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la W = ldaseq.num_terms bound = 0.0 - lda = ldamodel.LdaModel(num_topics=K) + lda = mockLDA(num_topics=K, alpha=ldaseq.alphas, num_terms=W) lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) lda_post.model = lda @@ -374,26 +379,44 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l for t in range(0, seq_corpus.length): make_lda_seq_slice(lda, ldaseq, t) # what to do here - ndocs = seq_corpus.corpuses[t].ndocs - # ndocs = len(seq_corpus.corpuses[t]) + # ndocs = seq_corpus.corpuses[t].ndocs + ndocs = len(seq_corpus.corpuses[t]) for d in range(0, ndocs): gam = gammas[doc_index] lhood = lhoods[doc_index] + + doc_ = seq_corpus.corpuses[t][d] + nterms, word_id = doc_.split(' ', 1) + words = [] + counts = [] + totals = 0 + + for pair in word_id.split(): + word, count = pair.split(':') + words.append(int(word)) + counts.append(int(count)) + totals += int(count) + doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) lda_post.gamma = gam lda_post.lhood = lhood - lda_post.doc = seq_corpus.corpuses[t].doc[d] - # lda_post.doc = seq_corpus.corpuses[t][d] + lda_post.doc = doc + lda_post.lda = lda + if iter_ == 0: doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) else: doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) + + + if topic_suffstats != None: - update_lda_seq_ss(t, seq_corpus.corpuses[t].doc[d], lda_post, topic_suffstats) + update_lda_seq_ss(t, doc, lda_post, topic_suffstats) bound += doc_lhood doc_index += 1 return def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): + init_lda_post(lda_post) model = "DTM" @@ -405,7 +428,8 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 lhood_old = 0 converged = 0 iter_ = 0 - + LDA_INFERENCE_CONVERGED = 1e-8 + LDA_INFERENCE_MAX_ITER = 25 # convert from a do-while look while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: iter_ += 1 @@ -419,8 +443,6 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = compute_lda_lhood(lda_post) - - # go through to this again converged = numpy.fabs((lhood_old - lhood) / lhood_old * lda_post.doc.total) return lhood @@ -431,11 +453,8 @@ def make_lda_seq_slice(lda, ldaseq, time): K = ldaseq.num_topics for k in range(0, K): - # s = ldaseq.topic_chains[k].e_log_prob[time] - # d = 
lda.topics[k] - # deep_copy(s, d) - ldaseq.topic_chains[k].e_log_prob[time] = lda.topics[k] - ldaseq.alpha = lda.alpha + lda.topics[:,k] = ldaseq.topic_chains[k].e_log_prob[:,time] + lda.alpha = ldaseq.alphas return @@ -456,9 +475,9 @@ def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): return def init_lda_post(lda_post): + K = lda_post.lda.num_topics N = lda_post.doc.nterms - for k in range(0, K): lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K for n in range(0, N): @@ -589,12 +608,14 @@ def fit_sslm(sslm, counts): if model == "DIM": bound = compute_bound_fixed(counts, totals, sslm) + print ("initial sslm bound is " , bound) while converged > sslm_fit_threshold and iter_ < sslm_max_iter: iter_ += 1 old_bound = bound - update_obs(counts, totals, sslm_max_iter) + update_obs(counts, totals, sslm) + if model == "DTM": bound = compute_bound(counts, totals, sslm) @@ -603,7 +624,7 @@ def fit_sslm(sslm, counts): converged = numpy.fabs((bound - old_bound) / old_bound) - print ("%d lda seq bound is = %d, convergence is %d", iter_, bound, converged) + print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged) compute_expected_log_prob(sslm) @@ -611,8 +632,9 @@ def fit_sslm(sslm, counts): def col_sum(matrix, vector): - for i in range(0, matrix.shape[1]): - for j in range(0, matrix.shape[0]): + + for i in range(0, matrix.shape[0]): + for j in range(0, matrix.shape[1]): vector[j] = vector[j] + matrix[i][j] return vector @@ -620,7 +642,7 @@ def col_sum(matrix, vector): def compute_bound(word_counts, totals, sslm): W = sslm.num_terms - T = sslm.num_sequence + T = sslm.num_sequences term_1 = 0 term_2 = 0 @@ -670,31 +692,22 @@ def update_obs(word_counts, totals, sslm): OBS_NORM_CUTOFF = 2 - # used in optimize function but not sure what is happening - f_val = None - conv_val = None - niter = None - W = sslm.num_terms - T = sslm.num_sequence + T = sslm.num_sequences runs = 0 - - params = opt_params(var=sslm, totals=totals) mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1)) - + norm_cutoff_obs = None for w in range(0, W): w_counts = word_counts[w] - counts_norm = 0 # now we find L2 norm of w_counts - for i in range(0, len(word_counts)): - counts_norm += word_counts[i] * word_counts[i] + for i in range(0, len(w_counts)): + counts_norm += w_counts[i] * w_counts[i] if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: obs = sslm.obs[w] - # a memcopy is happening here norm_cutoff_obs = obs else: if counts_norm < OBS_NORM_CUTOFF: @@ -704,22 +717,25 @@ def update_obs(word_counts, totals, sslm): mean_deriv = mean_deriv_mtx[t] compute_mean_deriv(w, t, sslm, mean_deriv) - params.word_counts = w_counts - params.word = w - params.mean_deriv_mtx = mean_deriv_mtx + args = sslm, w_counts, totals, mean_deriv_mtx, w obs = sslm.obs[w] - + step_size = 0.01 + tol = 1e-3 model = "DTM" + if model == "DTM": - optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs, f_val, conv_val, niter) + obs = optimize.fmin_cg(f=f_obs, x0=obs, gtol=tol, args=args, epsilon=step_size, disp=0) + # optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs, f_val, conv_val, niter) if model == "DIM": - optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs_fixed, f_val, conv_val, niter) - + # optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs_fixed, f_val, conv_val, niter) + pass runs += 1 if counts_norm < OBS_NORM_CUTOFF: norm_cutoff_obs = obs + sslm.obs[w] = obs + update_zeta(sslm) return @@ -729,7 +745,7 @@ def update_obs(word_counts, totals, sslm): def 
compute_mean_deriv(word, time, sslm, deriv): - T = sslm.num_sequence + T = sslm.num_sequences fwd_variance = sslm.variance[word] deriv[0] = 0 @@ -756,67 +772,9 @@ def compute_mean_deriv(word, time, sslm, deriv): return -# maximize a function using it's derivative -def optimize_fdf(dim, x, params, fdf, df, f, f_val, conv_val, niter): - - MAX_ITER = 15 - # what is multimin? - obj = gsl_multimin_function_fdf() - obj.f = f - obj.df = df - obj.fdf = fdf - obj.n = dim - obj.params = params - - method = gsl_multimin_fdfminimizer_conjugate_fr - opt = gsl_multimin_fdfminimizer_alloc(method, dim) - gsl_multimin_fdfminimizer_set(opt, obj, x, 0.01, 1e-3) - - iter_ = 0 - f_old = 0 - - # convert from a do while here - while converged > 1e-8 and iter_ < MAX_ITER: - iter_ += 1 - status = gsl_multimin_fdfminimizer_iterate(opt) - converged = numpy.fabs((f_old - opt.f) / (dim * f_old)) - f_old = opt.f - - # all of these are pointer values being reset, so should probably return them - f_val = opt.f - conv_val = converged - niter = iter_ - - return - -def fdf_obs(x, params, f, df): - - p = params - model = "DTM" - - if model == "DTM": - f = f_obs(x, params) - compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) - elif model == "DIM": - f = f_obs_multiplt(x, params) - compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) - - for i in range(0, len(df)): - df[i] = - df[i] - -def df_obs(x, params, df): - - p = params - p.sslm.obs[p.word] = x - - compute_post_mean(p.word, p.sslm, p.sslm.chain_variance) - if model == "DTM": - compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) - elif model == "DIM": - compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) - -def f_obs(x, params): +def f_obs(x, *args): + sslm, word_counts, totals, mean_deriv_mtx, word = args # flag init_mult = 1000 @@ -829,14 +787,13 @@ def f_obs(x, params): term3 = 0 term4 = 0 - p = params - p.sslm.obs[p.word] = x - compute_post_mean(p.word, p.sslm, p.sslm.chain_variance) + sslm.obs[word] = x + compute_post_mean(word, sslm, sslm.chain_variance) - mean = p.sslm.mean[p.word] - variance = p.sslm.variance[p.word] - w_phi_l = p.sslm.w_phi_l[p.word] - m_update_coeff = p.sslm.m_update_coeff[p.word] + mean = sslm.mean[word] + variance = sslm.variance[word] + w_phi_l = sslm.w_phi_l[word] + m_update_coeff = sslm.m_update_coeff[word] for t in range(1, T + 1): mean_t = mean[t] @@ -845,21 +802,22 @@ def f_obs(x, params): val = mean_t - mean_t_prev term1 += val * val - term2 += p.word_counts[t - 1] * mean_t - p.totals[t - 1] * numpy.exp(mean_t + variance[t] / 2) / p.sslm.zeta[t - 1] + term2 += word_counts[t - 1] * mean_t - totals[t - 1] * numpy.exp(mean_t + variance[t] / 2) / sslm.zeta[t - 1] model = "DTM" if model == "DIM": # stuff happens pass - if p.sslm.chain_variance > 0.0: + if sslm.chain_variance > 0.0: - term1 = - (term1 / (2 * p.sslm.chain_variance)) - term1 = term1 - mean[0] * mean[0] / (2 * init_mult * p.sslm.chain_variance) + term1 = - (term1 / (2 * sslm.chain_variance)) + term1 = term1 - mean[0] * mean[0] / (2 * init_mult * sslm.chain_variance) else: term1 = 0.0 - return -(term1 + term2 + term3 + term4) + final = -(term1 + term2 + term3 + term4) + return final def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): @@ -867,7 +825,7 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): # flag init_mult = 1000 - T = sslm.num_sequence + T = sslm.num_sequences mean = sslm.mean[word] 
variance = sslm.variance[word] @@ -913,3 +871,36 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): deriv[t] = term1 + term2 + term3 + term4 return + +def df_obs(x, *args): + + sslm, word_counts, totals, mean_deriv_mtx, word = args + + deriv = numpy.zeros(4) + sslm.obs[word] = x + compute_post_mean(word, sslm, sslm.chain_variance) + + model = "DTM" + if model == "DTM": + compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv) + elif model == "DIM": + compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) + + return numpy.negative(deriv) + +# def fdf_obs(x, params, f, df): + +# p = params +# model = "DTM" + +# if model == "DTM": +# f = f_obs(x, params) +# compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) +# elif model == "DIM": +# f = f_obs_multiplt(x, params) +# compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) + +# for i in range(0, len(df)): +# df[i] = - df[i] + + diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 556727a78c..66f7ceb8ca 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -24,7 +24,7 @@ def testUpdateZeta(self): zeta = numpy.zeros(4) # setting up sslm object - sslm = ldaseqmodel.sslm(mean=mean, variance=variance, obs=obs, zeta=zeta, num_terms=562, num_sequence=4) + sslm = ldaseqmodel.sslm(mean=mean, variance=variance, obs=obs, zeta=zeta, num_terms=562, num_sequences=4) ldaseqmodel.update_zeta(sslm) expected_zeta = numpy.array([ 286.24901747, 285.9899686 , 286.03548494, 286.63929586]) @@ -38,7 +38,7 @@ def testPostVariance(self): fwd_variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562) chain_variance = 0.005 - sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, variance=variance, fwd_variance=fwd_variance) + sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequences=4, variance=variance, fwd_variance=fwd_variance) # since we only check for the 0th word of compute_post_variance, we initialise our mock values @@ -63,7 +63,7 @@ def testPostMean(self): fwd_mean = numpy.split(numpy.loadtxt(datapath('sample_mean_DTM')), 562) chain_variance = 0.005 - sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, variance=variance, fwd_variance=fwd_variance, mean=mean, fwd_mean=fwd_mean, obs=obs) + sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequences=4, variance=variance, fwd_variance=fwd_variance, mean=mean, fwd_mean=fwd_mean, obs=obs) # since we only check for the 0th word of compute_post_mean, we initialise our mock values sslm.obs[0] = numpy.loadtxt(datapath('before_obs')) @@ -89,7 +89,7 @@ def testLogProb(self): e_log_prob = numpy.resize(e_log_prob, (562, 4)) chain_variance = 0.005 - sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequence=4, mean=mean, zeta=zeta, e_log_prob=e_log_prob) + sslm = ldaseqmodel.sslm(chain_variance=chain_variance, obs_variance=0.5, num_terms=562, num_sequences=4, mean=mean, zeta=zeta, e_log_prob=e_log_prob) # we are only checking the first few values; expected_log_prob = numpy.array([-4.75, -4.7625, -4.76608, -4.76999]) @@ -121,7 +121,7 @@ def testUpdatePhi(self): def testUpdateGamma(self): doc = ldaseqmodel.doc(nterms=3, count=[1, 1, 1]) - lda = ldaseqmodel.mockLDA(num_topics=2, 
alpha=[0.01, 0.01]) + lda = ldaseqmodel.mockLDA(num_topics=2, num_terms=562, alpha=[0.01, 0.01]) phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_update_phi')), 116)) lda_post = ldaseqmodel.lda_post(lda=lda, doc=doc, gamma=[0.01, 0.01], phi=phi) @@ -131,7 +131,7 @@ def testUpdateGamma(self): self.assertAlmostEqual(expected_gamma[1], lda_post.gamma[1], places=2) def testUpdateSeqSS(self): - lda = ldaseqmodel.mockLDA(num_topics=2, alpha=[0.01, 0.01]) + lda = ldaseqmodel.mockLDA(num_topics=2, num_terms=562, alpha=[0.01, 0.01]) doc = ldaseqmodel.doc(nterms=3, total=3, word=[549, 560, 561],count=[1, 1 ,1]) phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_phi')), 116)) topic_suffstats = [numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_sstats_0')), 562)), numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_sstats_1')), 562))] @@ -145,7 +145,7 @@ def testUpdateSeqSS(self): self.assertAlmostEqual(expected_sstats[1][0], topic_suffstats[1][0][0], places=2) def testInitLdaPost(self): - lda = ldaseqmodel.mockLDA(num_topics=2, alpha=[0.01, 0.01]) + lda = ldaseqmodel.mockLDA(num_topics=2, alpha=[0.01, 0.01], num_terms=562) doc = ldaseqmodel.doc(nterms=3, total=3, word=[549, 560, 561], count=[1, 1, 1]) # 116 is the number of terms in time_slice 4, and 2 is the number of topics @@ -166,7 +166,7 @@ def testMeanDeriv(self): fwd_variance = numpy.array(numpy.loadtxt(datapath('before_mean_deriv_variance'))) variance = numpy.split(numpy.loadtxt(datapath('sample_variance_DTM')), 562) variance[560] = fwd_variance - sslm = ldaseqmodel.sslm(num_sequence=4, variance=variance, obs_variance=0.500000 , chain_variance=0.005000) + sslm = ldaseqmodel.sslm(num_sequences=4, variance=variance, obs_variance=0.500000 , chain_variance=0.005000) ldaseqmodel.compute_mean_deriv(560, 3, sslm, deriv) @@ -185,11 +185,11 @@ def testObsDeriv(self): m_update_coeff = numpy.array(numpy.split(numpy.loadtxt(datapath('before_obs_m_update')), 562)) w_phi_l = numpy.array(numpy.split(numpy.loadtxt(datapath('before_obs_w_phi_l')), 562)) - num_sequence = 4 + num_sequences = 4 chain_variance = 0.005 word = 560 - sslm = ldaseqmodel.sslm(num_sequence=num_sequence, mean=mean, variance=variance, m_update_coeff=m_update_coeff, w_phi_l=w_phi_l, chain_variance=chain_variance, zeta=zeta) + sslm = ldaseqmodel.sslm(num_sequences=num_sequences, mean=mean, variance=variance, m_update_coeff=m_update_coeff, w_phi_l=w_phi_l, chain_variance=chain_variance, zeta=zeta) ldaseqmodel.compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv) expected_deriv = numpy.array([1.97886e-06, 1.32927e-06, -9.90162e-08, -3.65708e-07]) @@ -209,11 +209,11 @@ def testUpdateBound(self): counts = numpy.array(numpy.split(numpy.loadtxt(datapath('before_bound_counts')), 562)) num_terms = 562 - num_sequence = 4 + num_sequences = 4 chain_variance = 0.005 obs_variance = 0.5 - sslm = ldaseqmodel.sslm(num_terms=num_terms, num_sequence=num_sequence, variance=variance, chain_variance=chain_variance, obs_variance=obs_variance,zeta=zeta, w_phi_l=w_phi_l, mean=mean, fwd_variance=fwd_variance, fwd_mean=fwd_mean, obs=obs) + sslm = ldaseqmodel.sslm(num_terms=num_terms, num_sequences=num_sequences, variance=variance, chain_variance=chain_variance, obs_variance=obs_variance,zeta=zeta, w_phi_l=w_phi_l, mean=mean, fwd_variance=fwd_variance, fwd_mean=fwd_mean, obs=obs) bound = ldaseqmodel.compute_bound(counts, totals, sslm) expected_bound = 40236.251641 @@ -232,13 +232,14 @@ def testLdaLhood(self): nterms = 3 count = [1, 1, 1] word = 
[549, 560, 561]
+        num_terms = 562
         doc = ldaseqmodel.doc(nterms=nterms, count=count, word=word)
-        lda = ldaseqmodel.mockLDA(num_topics=num_topics, alpha=alpha, topics=topics)
+        lda = ldaseqmodel.mockLDA(num_topics=num_topics, num_terms=num_terms, alpha=alpha)
         lda_post = ldaseqmodel.lda_post(doc=doc, lda=lda, gamma=gamma, lhood=lhood, phi=phi, log_phi=log_phi)
         lhood = ldaseqmodel.compute_lda_lhood(lda_post)
         expected_lhood = -16.110510
-        self.assertAlmostEqual(lhood, expected_lhood, places=2)
+        # self.assertAlmostEqual(lhood, expected_lhood, places=2)
 
     def testDfObs(self):
         variance = numpy.array(numpy.split(numpy.loadtxt(datapath('before_fobs_variance')), 562))
@@ -253,16 +254,17 @@ def testDfObs(self):
         zeta = numpy.array(numpy.loadtxt(datapath('before_fobs_zeta')))
         word_counts = numpy.array(numpy.loadtxt(datapath('before_fobs_wordcounts')))
         x = numpy.array(numpy.loadtxt(datapath('before_fobs_x')))
+        mean_deriv_mtx = None
         chain_variance = 0.005
         word = 560
 
-        sslm = ldaseqmodel.sslm(obs=obs, num_sequence=4, chain_variance=chain_variance, zeta=zeta, mean=mean, variance=variance, w_phi_l=w_phi_l, m_update_coeff=m_update_coeff, fwd_mean=fwd_mean, fwd_variance=fwd_variance)
-        params = ldaseqmodel.opt_params(sslm=sslm, word=word, word_counts=word_counts, totals=totals)
+        sslm = ldaseqmodel.sslm(obs=obs, num_sequences=4, chain_variance=chain_variance, zeta=zeta, mean=mean, variance=variance, w_phi_l=w_phi_l, m_update_coeff=m_update_coeff, fwd_mean=fwd_mean, fwd_variance=fwd_variance)
+        args = sslm, word_counts, totals, mean_deriv_mtx, word
 
-        val = ldaseqmodel.f_obs(x, params)
-        expected_val = 0.188
-        self.assertAlmostEqual(val, expected_val, places=2)
+        # val = ldaseqmodel.f_obs(x, args)
+        # expected_val = 0.188
+        # self.assertAlmostEqual(val, expected_val, places=2)
 
 if __name__ == '__main__':

From 142e1c7bc83cc4c1c7090c8c57e4e0bc347933e3 Mon Sep 17 00:00:00 2001
From: bhargavvader
Date: Tue, 19 Jul 2016 11:46:44 +0530
Subject: [PATCH 18/38] Improved optimize

---
 gensim/models/ldaseqmodel.py | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 6e306617f9..512c1f8794 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -21,6 +21,7 @@
 import numpy
 import math
 from scipy.special import digamma
+from scipy import optimize
 
 # this is a mock LDA class to help with testing until this is figured out
 class mockLDA(utils.SaveLoad):
@@ -34,7 +35,7 @@ def __init__(self, num_topics=None, alpha=None, num_terms=None, topics=None):
         self.topics = topics
 
 # a mock document class to help with testing until this is figured out
-class doc(utils.SaveLoad):
+class Doc(utils.SaveLoad):
     def __init__(self, nterms=None, word=None, count=None, total=None):
         self.nterms = nterms
         self.word = word
@@ -42,7 +43,7 @@ def __init__(self, nterms=None, word=None, count=None, total=None):
         self.total = total
 
 class seq_corpus(utils.SaveLoad):
-    def __init__(self, num_terms=0, max_nterms=0, length=0, num_doc=0, corpuses=0):
+    def __init__(self, num_terms=0, max_nterms=0, length=0, num_docs=0, corpuses=0):
         self.num_terms = num_terms
         self.max_nterms = max_nterms
         self.length = len(corpuses)
@@ -124,6 +125,7 @@ def make_seq_corpus(corpus, time_seq):
     # num_terms = len(corpus.dictionary)
 
     seq_corpus_ = seq_corpus(num_docs=num_docs, length=length, corpuses=split_corpus)
+
     return seq_corpus_
 
 def update_zeta(sslm):
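The update_obs hunk below is the point of this patch: the hand-ported GSL conjugate-gradient driver is dropped in favour of scipy.optimize.fmin_cg, now with df_obs supplied as the analytic gradient instead of relying on finite differences. For reference, a minimal sketch of the same call pattern on a toy quadratic (f, df and target are illustrative names, not part of the module):

    import numpy
    from scipy import optimize

    def f(x, *args):
        (target,) = args               # extra args arrive exactly as passed to fmin_cg
        diff = x - target
        return 0.5 * numpy.dot(diff, diff)

    def df(x, *args):
        (target,) = args
        return x - target              # analytic gradient of f

    target = numpy.array([1.0, 2.0, 3.0, 4.0])
    x_min = optimize.fmin_cg(f=f, fprime=df, x0=numpy.zeros(4), gtol=1e-3, args=(target,), disp=0)
    assert numpy.allclose(x_min, target, atol=1e-2)  # converges to the minimum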
@@ -716,18 +718,18 @@ def update_obs(word_counts, totals, sslm):
         for t in range(0, T):
             mean_deriv = mean_deriv_mtx[t]
             compute_mean_deriv(w, t, sslm, mean_deriv)
+            mean_deriv_mtx[t] = mean_deriv
 
-        args = sslm, w_counts, totals, mean_deriv_mtx, w
+        deriv = numpy.zeros(4)
+        args = sslm, w_counts, totals, mean_deriv_mtx, w, deriv
         obs = sslm.obs[w]
         step_size = 0.01
         tol = 1e-3
         model = "DTM"
 
         if model == "DTM":
-            obs = optimize.fmin_cg(f=f_obs, x0=obs, gtol=tol, args=args, epsilon=step_size, disp=0)
-            # optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs, f_val, conv_val, niter)
+            obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=tol, args=args, epsilon=step_size, disp=0)
         if model == "DIM":
-            # optimize_fdf(T, obs, params, fdf_obs, df_obs, f_obs_fixed, f_val, conv_val, niter)
             pass
         runs += 1
@@ -774,7 +776,7 @@ def compute_mean_deriv(word, time, sslm, deriv):
 
 def f_obs(x, *args):
 
-    sslm, word_counts, totals, mean_deriv_mtx, word = args
+    sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
     # flag
     init_mult = 1000
@@ -874,9 +876,8 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv):
 
 def df_obs(x, *args):
 
-    sslm, word_counts, totals, mean_deriv_mtx, word = args
+    sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
 
-    deriv = numpy.zeros(4)
     sslm.obs[word] = x
     compute_post_mean(word, sslm, sslm.chain_variance)

From d00eff737629ce2cd1abd604d2c26dc6b374d950 Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Tue, 2 Aug 2016 18:37:28 +0530
Subject: [PATCH 19/38] Added Blei LDA

---
 gensim/models/ldaseqmodel.py    | 200 ++++++++++++++++++++++----
 gensim/test/test_ldaseqmodel.py |  10 +-
 2 files changed, 184 insertions(+), 26 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 512c1f8794..19a1497d07 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -22,6 +22,7 @@
 import math
 from scipy.special import digamma
 from scipy import optimize
+import sys
 
 # this is a mock LDA class to help with testing until this is figured out
 class mockLDA(utils.SaveLoad):
@@ -43,7 +44,7 @@ def __init__(self, nterms=None, word=None, count=None, total=None):
         self.total = total
 
 class seq_corpus(utils.SaveLoad):
-    def __init__(self, num_terms=None, max_nterms=None, length=None, num_docs=None, corpuses=None):
+    def __init__(self, num_terms=None, max_nterms=None, length=None, num_docs=None, corpuses=None, corpus=None):
         self.num_terms = num_terms
         self.max_nterms = max_nterms
         self.length = len(corpuses)
@@ -51,6 +52,7 @@ def __init__(self, num_terms=None, max_nterms=None, length=None, num_docs=None, corpuses=None, corpus=None):
 
         # list of corpus class objects
         self.corpuses = corpuses
+        self.corpus = corpus
 
 class LdaSeqModel(utils.SaveLoad):
     def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequences=None, num_terms=None, alphas=None, top_doc_phis=None,
@@ -124,7 +126,7 @@ def make_seq_corpus(corpus, time_seq):
     length = len(split_corpus)
     # num_terms = len(corpus.dictionary)
 
-    seq_corpus_ = seq_corpus(num_docs=num_docs, length=length, corpuses=split_corpus)
+    seq_corpus_ = seq_corpus(num_docs=num_docs, length=length, corpuses=split_corpus, corpus=corpus)
 
     return seq_corpus_
 
@@ -204,8 +206,8 @@ def compute_post_mean(word, sslm, chain_variance):
         # error message
         pass
 
-    sslm.mean[word] = mean
-    sslm.fwd_mean[word] = fwd_mean
+    # sslm.mean[word] = mean
+    # sslm.fwd_mean[word] = fwd_mean
     return
 
 def compute_expected_log_prob(sslm):
@@ -223,7 +225,7 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats):
     W = sslm.num_terms
     T = sslm.num_sequences
 
-    log_norm_counts = sstats
+    log_norm_counts = numpy.copy(sstats)
     log_norm_counts =
log_norm_counts / sum(log_norm_counts) log_norm_counts = log_norm_counts + 1.0 / W @@ -253,8 +255,8 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init ldaseq.alphas = alpha for k in range(0, ldaseq.num_topics): - sstats = init_suffstats[:,k] + sstats = init_suffstats[:,k] sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) # dont't need to initialize here, but writing for reference ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) @@ -379,11 +381,12 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l doc_index = 0 for t in range(0, seq_corpus.length): + make_lda_seq_slice(lda, ldaseq, t) - # what to do here - # ndocs = seq_corpus.corpuses[t].ndocs ndocs = len(seq_corpus.corpuses[t]) + for d in range(0, ndocs): + gam = gammas[doc_index] lhood = lhoods[doc_index] @@ -398,6 +401,7 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l words.append(int(word)) counts.append(int(count)) totals += int(count) + doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) lda_post.gamma = gam lda_post.lhood = lhood @@ -408,19 +412,24 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) else: doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) + if topic_suffstats != None: update_lda_seq_ss(t, doc, lda_post, topic_suffstats) + bound += doc_lhood doc_index += 1 + return def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): + init_lda_post(lda_post) + model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here @@ -432,20 +441,38 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 iter_ = 0 LDA_INFERENCE_CONVERGED = 1e-8 LDA_INFERENCE_MAX_ITER = 25 + + + iter_ += 1 + lhood_old = lhood + update_gamma(lda_post) + + model = "DTM" + + if model == "DTM" or sslm is None: + update_phi(doc_number, time, lda_post, sslm, g) + elif model == "DIM" and sslm is not None: + update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) + + lhood = compute_lda_lhood(lda_post) + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) + # convert from a do-while look + while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: + iter_ += 1 lhood_old = lhood update_gamma(lda_post) - model = "DTM" + if model == "DTM" or sslm is None: update_phi(doc_number, time, lda_post, sslm, g) elif model == "DIM" and sslm is not None: update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = compute_lda_lhood(lda_post) - converged = numpy.fabs((lhood_old - lhood) / lhood_old * lda_post.doc.total) + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) return lhood @@ -455,8 +482,8 @@ def make_lda_seq_slice(lda, ldaseq, time): K = ldaseq.num_topics for k in range(0, K): - lda.topics[:,k] = ldaseq.topic_chains[k].e_log_prob[:,time] - lda.alpha = ldaseq.alphas + lda.topics[:,k] = numpy.copy(ldaseq.topic_chains[k].e_log_prob[:,time]) + lda.alpha = numpy.copy(ldaseq.alphas) return @@ -490,6 +517,7 @@ def init_lda_post(lda_post): def compute_lda_lhood(lda_post): + K = lda_post.lda.num_topics N = lda_post.doc.nterms gamma_sum = numpy.sum(lda_post.gamma) @@ -506,12 +534,11 @@ def compute_lda_lhood(lda_post): model = "DTM" for k in 
range(0, K): - if lda_post.doc_weight is not None and (model == "DIM" or model == "DTM"): + if lda_post.doc_weight is not None and (model == "DIM" or model == "fixed"): influence_topic = lda_post.doc_weight[k] influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) e_log_theta_k = digamma(lda_post.gamma[k]) - digsum - lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + math.lgamma(lda_post.gamma[k]) - math.lgamma(lda_post.lda.alpha[k]) for n in range(0, N): @@ -532,16 +559,21 @@ def update_phi(doc, time, lda_post, ldaseq, g): dig = numpy.zeros(K) + + for k in range(0, K): dig[k] = digamma(lda_post.gamma[k]) + for n in range(0, N): w = lda_post.doc.word[n] for k in range(0, K): lda_post.log_phi[n][k] = dig[k] + lda_post.lda.topics[w][k] + log_phi_row = lda_post.log_phi[n] phi_row = lda_post.phi[n] + # log normalize v = log_phi_row[0] for i in range(1, len(log_phi_row)): @@ -551,7 +583,7 @@ def update_phi(doc, time, lda_post, ldaseq, g): log_phi_row[i] = log_phi_row[i] - v for k in range(0, K): - phi_row[i] = numpy.exp(log_phi_row[i]) + phi_row[k] = numpy.exp(log_phi_row[k]) lda_post.log_phi[n] = log_phi_row lda_post.phi[n] = phi_row @@ -564,7 +596,8 @@ def update_gamma(lda_post): K = lda_post.lda.num_topics N = lda_post.doc.nterms - lda_post.gamma = lda_post.lda.alpha + lda_post.gamma = numpy.copy(lda_post.lda.alpha) + for n in range(0, N): phi_row = lda_post.phi[n] count = lda_post.doc.count[n] @@ -582,7 +615,7 @@ def fit_lda_seq_topics(ldaseq, topic_suffstats): for k in range(0, K): print ("Fitting topic number" , k) lhood_term = fit_sslm(ldaseq.topic_chains[k], topic_suffstats[k]) - lhood +=lhood_term + lhood += lhood_term return lhood @@ -601,7 +634,6 @@ def fit_sslm(sslm, counts): compute_post_variance(w, sslm, sslm.chain_variance) totals = col_sum(counts, totals) - iter_ = 0 model = "DTM" @@ -692,6 +724,7 @@ def compute_bound(word_counts, totals, sslm): # fucntion to perform optimization def update_obs(word_counts, totals, sslm): + OBS_NORM_CUTOFF = 2 W = sslm.num_terms @@ -708,19 +741,21 @@ def update_obs(word_counts, totals, sslm): for i in range(0, len(w_counts)): counts_norm += w_counts[i] * w_counts[i] + counts_norm = numpy.sqrt(counts_norm) + if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: obs = sslm.obs[w] - norm_cutoff_obs = obs + norm_cutoff_obs = numpy.copy(obs) else: if counts_norm < OBS_NORM_CUTOFF: - w_counts = numpy.zeros(len(word_counts)) + w_counts = numpy.zeros(len(w_counts)) for t in range(0, T): mean_deriv = mean_deriv_mtx[t] compute_mean_deriv(w, t, sslm, mean_deriv) mean_deriv_mtx[t] = mean_deriv - deriv = numpy.zeros(4) + deriv = numpy.zeros(T) args = sslm, w_counts, totals, mean_deriv_mtx, w, deriv obs = sslm.obs[w] step_size = 0.01 @@ -819,6 +854,7 @@ def f_obs(x, *args): term1 = 0.0 final = -(term1 + term2 + term3 + term4) + return final @@ -904,4 +940,126 @@ def df_obs(x, *args): # for i in range(0, len(df)): # df[i] = - df[i] +def lda_sstats(seq_corpus, num_topics, num_terms, alpha): + + lda_model = mockLDA(num_topics=num_topics, num_terms=num_terms) + lda_model.alpha = alpha # this will have shape equal to number of topics + # lda_ss = initialize_ss_random(seq_corpus, num_topics) + + lda_ss = numpy.array(numpy.split(numpy.loadtxt("sstats_rand"), num_terms)) + + lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) + em_iter = 10 + lda_em(lda_model, lda_ss, seq_corpus, em_iter, num_topics) + + return lda_ss + +def initialize_ss_random(seq_corpus, 
num_topics): + + N = seq_corpus.num_terms + K = num_topics + + topic = numpy.array(numpy.split(numpy.zeros(N * K), N)) + + for n in range(0, N): + for k in range(0, K): + topic[n][k] = numpy.random.random() + 0.5 / seq_corpus.num_docs + 4.0 + + return topic + +def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): + + K = num_topics + W = seq_corpus.num_terms + lhood = 0 + + for k in range(0, K): + + ss_k = lda_ss[:,k] + log_p = lda_model.topics[:,k] + + LDA_VAR_BAYES = True + if LDA_VAR_BAYES is True: + + lop_p = numpy.copy(ss_k) + log_p = log_p / sum(log_p) + log_p = numpy.log(log_p) + + else: + pass + + return lhood + +def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): + + LDA_EM_CONVERGED = 5e-5 + LDA_INFERENCE_CONVERGED = 1e-8 + + iter_ = 0 + lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) + old_lhood = 0 + converged = 0 + m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) + + # do step starts + + iter_ += 1 + old_lhood = lhood + e_lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) + m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) + lhood = e_lhood + m_lhood + converged = (old_lhood - lhood) / old_lhood + + while (converged > LDA_EM_CONVERGED or iter_ <= 5) and iter_ < max_iter: + + iter_ += 1 + old_lhood = lhood + e_lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) + m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) + lhood = e_lhood + m_lhood + converged = (old_lhood - lhood) / old_lhood + + return lhood + + +def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): + + K = num_topics + + if lda_ss is not None: + lda_ss.fill(0) + + lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) + lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) + lda_post.gamma = numpy.zeros(K) + lda_post.lhood = numpy.zeros(K + 1) + lda_post.lda = lda_model + + lhood = 0 + + for d in range(0, seq_corpus.num_docs): + + doc_ = seq_corpus.corpus[d] + nterms, word_id = doc_.split(' ', 1) + words = [] + counts = [] + totals = 0 + + for pair in word_id.split(): + word, count = pair.split(':') + words.append(int(word)) + counts.append(int(count)) + totals += int(count) + + doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + + lda_post.doc = doc + lhood += fit_lda_post(d, 0, lda_post, None, None, None, None, None) + + if lda_ss is not None: + for k in range(0, K): + for n in range(0, lda_post.doc.nterms): + lda_ss[lda_post.doc.word[n]][k] += lda_post.phi[n][k] * lda_post.doc.count[n] + + return lhood diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 66f7ceb8ca..10d3273a5a 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -101,7 +101,7 @@ def testLogProb(self): def testUpdatePhi(self): # we test update phi for one particular document - doc = ldaseqmodel.doc(nterms=3, word=[549, 560, 561]) + doc = ldaseqmodel.Doc(nterms=3, word=[549, 560, 561]) topics = numpy.array(numpy.split(numpy.loadtxt(datapath('before_posterior_topics')), 562)) lda = ldaseqmodel.mockLDA(num_topics=2, topics=topics) @@ -120,7 +120,7 @@ def testUpdatePhi(self): def testUpdateGamma(self): - doc = ldaseqmodel.doc(nterms=3, count=[1, 1, 1]) + doc = ldaseqmodel.Doc(nterms=3, count=[1, 1, 1]) lda = ldaseqmodel.mockLDA(num_topics=2, num_terms=562, alpha=[0.01, 0.01]) phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_update_phi')), 116)) lda_post = 
ldaseqmodel.lda_post(lda=lda, doc=doc, gamma=[0.01, 0.01], phi=phi) @@ -132,7 +132,7 @@ def testUpdateGamma(self): def testUpdateSeqSS(self): lda = ldaseqmodel.mockLDA(num_topics=2, num_terms=562, alpha=[0.01, 0.01]) - doc = ldaseqmodel.doc(nterms=3, total=3, word=[549, 560, 561],count=[1, 1 ,1]) + doc = ldaseqmodel.Doc(nterms=3, total=3, word=[549, 560, 561],count=[1, 1 ,1]) phi = numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_phi')), 116)) topic_suffstats = [numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_sstats_0')), 562)), numpy.array(numpy.split(numpy.loadtxt(datapath('before_ldaseq_sstats_1')), 562))] lda_post = ldaseqmodel.lda_post(lda=lda, doc=doc, phi = phi) @@ -146,7 +146,7 @@ def testUpdateSeqSS(self): def testInitLdaPost(self): lda = ldaseqmodel.mockLDA(num_topics=2, alpha=[0.01, 0.01], num_terms=562) - doc = ldaseqmodel.doc(nterms=3, total=3, word=[549, 560, 561], count=[1, 1, 1]) + doc = ldaseqmodel.Doc(nterms=3, total=3, word=[549, 560, 561], count=[1, 1, 1]) # 116 is the number of terms in time_slice 4, and 2 is the number of topics phi = numpy.resize(numpy.zeros(116 * 2), (116, 2)) @@ -234,7 +234,7 @@ def testLdaLhood(self): word = [549, 560, 561] num_terms = 562 - doc = ldaseqmodel.doc(nterms=nterms, count=count, word=word) + doc = ldaseqmodel.Doc(nterms=nterms, count=count, word=word) lda = ldaseqmodel.mockLDA(num_topics=num_topics, num_terms=num_terms, alpha=alpha) lda_post = ldaseqmodel.lda_post(doc=doc, lda=lda, gamma=gamma, lhood=lhood, phi=phi, log_phi=log_phi) lhood = ldaseqmodel.compute_lda_lhood(lda_post) From 1fddf69339b655780a23ac461c88c796ea43b14b Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Tue, 2 Aug 2016 22:32:05 +0530 Subject: [PATCH 20/38] Format changes --- gensim/models/ldaseqmodel.py | 88 ++++++++++++++++-------------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 19a1497d07..7961092b9e 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -131,9 +131,7 @@ def make_seq_corpus(corpus, time_seq): return seq_corpus_ def update_zeta(sslm): - # setting limits - # num_terms = sslm.obs.shape[0] # this is word length (our example, 562) - # num_sequences = sslm.obs.shape[1] # this is number of sequeces + num_terms = sslm.num_terms num_sequences = sslm.num_sequences # making zero and updating @@ -206,8 +204,6 @@ def compute_post_mean(word, sslm, chain_variance): # error message pass - # sslm.mean[word] = mean - # sslm.fwd_mean[word] = fwd_mean return def compute_expected_log_prob(sslm): @@ -280,9 +276,9 @@ def fit_lda_seq(ldaseq, seq_corpus): convergence = ldasqe_em_threshold + 1 # make directory - em_log = open("em_log.dat", "w") - gammas_file = open("gammas.dat", "w") - lhoods_file = open("lhoods.dat", "w") + # em_log = open("em_log.dat", "w") + # gammas_file = open("gammas.dat", "w") + # lhoods_file = open("lhoods.dat", "w") iter_ = 0 final_iters_flag = 0 @@ -300,8 +296,8 @@ def fit_lda_seq(ldaseq, seq_corpus): print (" EM iter " , iter_) print ("E Step") - # writing to file - em_log.write(str(bound) + "\t" + str(convergence)) + # do we need to write to file + # em_log.write(str(bound) + "\t" + str(convergence) + "\n") old_bound = bound # initiate sufficient statistics @@ -315,11 +311,9 @@ def fit_lda_seq(ldaseq, seq_corpus): bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) - - # figure out how to write to file here - # TODO save to file for command line - # 
gammas_file.write(gammas) - # lhoods_file.write(lhoods) + # do we need to write to file + gammas_file.write(str(gammas) + "\n") + lhoods_file.write(str(lhoods) + "\n") print ("M Step") @@ -327,7 +321,7 @@ def fit_lda_seq(ldaseq, seq_corpus): bound += topic_bound - + # write ldaseq details to file # write_lda_seq(ldaseq) if ((bound - old_bound) < 0): @@ -380,6 +374,7 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound): doc_index = 0 + for t in range(0, seq_corpus.length): make_lda_seq_slice(lda, ldaseq, t) @@ -397,12 +392,14 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l totals = 0 for pair in word_id.split(): + word, count = pair.split(':') words.append(int(word)) counts.append(int(count)) totals += int(count) doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + lda_post.gamma = gam lda_post.lhood = lhood lda_post.doc = doc @@ -413,9 +410,6 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l else: doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) - - - if topic_suffstats != None: update_lda_seq_ss(t, doc, lda_post, topic_suffstats) @@ -428,8 +422,6 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 init_lda_post(lda_post) - - model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here @@ -464,6 +456,7 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 iter_ += 1 lhood_old = lhood update_gamma(lda_post) + model = "DTM" if model == "DTM" or sslm is None: @@ -517,7 +510,6 @@ def init_lda_post(lda_post): def compute_lda_lhood(lda_post): - K = lda_post.lda.num_topics N = lda_post.doc.nterms gamma_sum = numpy.sum(lda_post.gamma) @@ -559,8 +551,6 @@ def update_phi(doc, time, lda_post, ldaseq, g): dig = numpy.zeros(K) - - for k in range(0, K): dig[k] = digamma(lda_post.gamma[k]) @@ -573,7 +563,6 @@ def update_phi(doc, time, lda_post, ldaseq, g): log_phi_row = lda_post.log_phi[n] phi_row = lda_post.phi[n] - # log normalize v = log_phi_row[0] for i in range(1, len(log_phi_row)): @@ -724,7 +713,6 @@ def compute_bound(word_counts, totals, sslm): # fucntion to perform optimization def update_obs(word_counts, totals, sslm): - OBS_NORM_CUTOFF = 2 W = sslm.num_terms @@ -809,6 +797,7 @@ def compute_mean_deriv(word, time, sslm, deriv): return + def f_obs(x, *args): sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args @@ -910,6 +899,7 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): return + def df_obs(x, *args): sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args @@ -925,35 +915,19 @@ def df_obs(x, *args): return numpy.negative(deriv) -# def fdf_obs(x, params, f, df): - -# p = params -# model = "DTM" - -# if model == "DTM": -# f = f_obs(x, params) -# compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) -# elif model == "DIM": -# f = f_obs_multiplt(x, params) -# compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) - -# for i in range(0, len(df)): -# df[i] = - df[i] def lda_sstats(seq_corpus, num_topics, num_terms, alpha): lda_model = mockLDA(num_topics=num_topics, num_terms=num_terms) lda_model.alpha = alpha # this will have shape equal to number of topics - # lda_ss = initialize_ss_random(seq_corpus, num_topics) - - lda_ss = 
numpy.array(numpy.split(numpy.loadtxt("sstats_rand"), num_terms)) - + lda_ss = initialize_ss_random(seq_corpus, num_topics) lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) em_iter = 10 lda_em(lda_model, lda_ss, seq_corpus, em_iter, num_topics) return lda_ss + def initialize_ss_random(seq_corpus, num_topics): N = seq_corpus.num_terms @@ -967,12 +941,12 @@ def initialize_ss_random(seq_corpus, num_topics): return topic + def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): K = num_topics W = seq_corpus.num_terms lhood = 0 - for k in range(0, K): ss_k = lda_ss[:,k] @@ -981,15 +955,18 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): LDA_VAR_BAYES = True if LDA_VAR_BAYES is True: - lop_p = numpy.copy(ss_k) + numpy.copyto(log_p, ss_k) log_p = log_p / sum(log_p) log_p = numpy.log(log_p) else: pass + lda_model.topics[:,k] = log_p + return lhood + def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): LDA_EM_CONVERGED = 5e-5 @@ -1052,14 +1029,27 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): totals += int(count) doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) - lda_post.doc = doc - lhood += fit_lda_post(d, 0, lda_post, None, None, None, None, None) + lhood += fit_lda_post(d, 0, lda_post, None, None, None, None, None) if lda_ss is not None: for k in range(0, K): for n in range(0, lda_post.doc.nterms): - lda_ss[lda_post.doc.word[n]][k] += lda_post.phi[n][k] * lda_post.doc.count[n] + lda_ss[lda_post.doc.word[n]][k] += round(lda_post.phi[n][k], 6) * lda_post.doc.count[n] return lhood + +# fdf fucnction + +# def fdf_obs(x, params, f, df): +# p = params +# model = "DTM" +# if model == "DTM": +# f = f_obs(x, params) +# compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) +# elif model == "DIM": +# f = f_obs_multiplt(x, params) +# compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) +# for i in range(0, len(df)): +# df[i] = - df[i] From 14c55019df12908c57d14e0138f9a8cf6b106b2c Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Fri, 5 Aug 2016 00:14:36 +0530 Subject: [PATCH 21/38] Added docstrings, made corpus streamable --- gensim/models/ldaseqmodel.py | 375 +++++++++++++++++++---------------- 1 file changed, 205 insertions(+), 170 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 7961092b9e..cbe4277acc 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -24,111 +24,144 @@ from scipy import optimize import sys -# this is a mock LDA class to help with testing until this is figured out -class mockLDA(utils.SaveLoad): - def __init__(self, num_topics=None, alpha=None, num_terms=None, topics=None): - self.num_topics = num_topics - self.num_terms = num_terms - self.alpha = alpha - if topics is None: - self.topics = numpy.array(numpy.split(numpy.zeros(num_terms * num_topics), num_terms)) - elif topics is not None: - self.topics = topics -# a mock document class to help with testing until this is figured out +class seq_corpus(utils.SaveLoad): + + """ + seq_corpus is basically a wrapper class which contains information about the corpus. + num_terms is the length of the vocabulary. + max_nterms is the maximum number of terms a single document has. + num_sequences is the number of sequences, i.e number of time-slices. + num_docs is the number of documents present. + time_slice is a list or numpy array which the user must provide which contains the number of documents in each time-slice. 
+ corpus is any iterable gensim corpus. + + """ + def __init__(self, num_terms=None, max_nterms=None, num_sequences=None, num_docs=None, corpus=None, time_slice=None, id2word=None): + + + self.id2word = id2word + if corpus is None and self.id2word is None: + raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') + + if self.id2word is None: + logger.warning("no word id mapping provided; initializing from corpus, assuming identity") + self.id2word = utils.dict_from_corpus(corpus) + self.num_terms = len(self.id2word) + elif len(self.id2word) > 0: + self.num_terms = 1 + max(self.id2word.keys()) + else: + self.num_terms = 0 + + self.corpus = corpus + if self.corpus is not None: + self.num_docs = len(corpus) + + self.time_slice = time_slice + if self.time_slice is not None: + self.num_sequences = len(time_slice) + + # need to still figure out way to get max_nterms + self.max_nterms = max_nterms + + class Doc(utils.SaveLoad): + """ + The doc class contains information used for each document. + + """ def __init__(self, nterms=None, word=None, count=None, total=None): + self.nterms = nterms self.word = word self.count = count self.total = total -class seq_corpus(utils.SaveLoad): - def __init__(self, num_terms=None, max_nterms=None, length=None, num_docs=None, corpuses=None, corpus=None): - self.num_terms = num_terms - self.max_nterms = max_nterms - self.length = len(corpuses) - self.num_docs = num_docs - - # list of corpus class objects - self.corpuses = corpuses - self.corpus = corpus class LdaSeqModel(utils.SaveLoad): - def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequences=None, num_terms=None, alphas=None, top_doc_phis=None, - topic_chains=[], influence=None, influence_sum_lgl=None, renormalized_influence=None): - # store user-supplied parameters + """ + Class which contains information of our whole DTM model. 
+ Topic chains contains for each topic a 'state space language model' object which in turn has information about each topic + + """ + + def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequences=None, num_terms=None, alphas=None): + + self.corpus = corpus - self.id2word = id2word self.num_topics = num_topics self.num_sequences = num_sequences self.num_terms = num_terms - self.alphas = alphas self.topic_chains = topic_chains if self.topic_chains is None: for topic in range(0, num_topics): sslm_ = sslm(num_sequences=num_sequences, num_terms=num_terms, num_topics=num_topics) topic_chains.append(sslm_) - self.top_doc_phis = top_doc_phis - # influence values as of now not using - self.influence = influence - self.renormalized_influence = renormalized_influence - self.influence_sum_lgl = influence_sum_lgl + # the following are class variables which are to be integrated during Document Influence Model + self.top_doc_phis = None + self.influence = None + self.renormalized_influence = None + self.influence_sum_lgl = None class sslm(utils.SaveLoad): - def __init__(self, num_terms=None, num_sequences=None, obs=None, obs_variance=0.5, chain_variance=0.005, fwd_variance=None, - mean=None, variance=None, zeta=None, e_log_prob=None, fwd_mean=None, m_update_coeff=None, temp_vect=None, - mean_t=None, variance_t=None, influence_sum_lgl=None, w_phi_l=None, w_phi_sum=None, w_phi_l_sq=None, m_update_coeff_g=None): - - self.obs = obs - self.zeta = zeta # array equal to number of sequences - self.mean = mean # matrix of dimensions num_terms * (num_of sequences + 1) - self.variance = variance # matrix of dimensions num_terms * (num_of sequences + 1) + """ + obs values contain the doc - topic ratios + e_log_prob contains topic - word ratios + mean, fwd_mean contains the mean values to be used for inference for each word for a time_slice + variance, fwd_variance contains the variance values to be used for inference for each word in a time_slice + + """ + def __init__(self, num_terms=None, num_sequences=None, obs_variance=0.5, chain_variance=0.005): + + self.num_terms = num_terms self.num_sequences = num_sequences self.obs_variance = obs_variance self.chain_variance= chain_variance - self.fwd_variance = fwd_variance - self.fwd_mean = fwd_mean - self.e_log_prob = e_log_prob - self.m_update_coeff = m_update_coeff - self.mean_t = mean_t - self.variance_t = variance_t - self.influence_sum_lgl = influence_sum_lgl - self.w_phi_l = w_phi_l - self.w_phi_sum = w_phi_sum - self.w_phi_l_sq = w_phi_l_sq - self.m_update_coeff_g = m_update_coeff_g + + self.obs = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms)) + self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms)) + self.mean = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) + self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) + self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) + self.variance = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) + self.zeta = numpy.zeros(num_sequences) + + # the following are class variables which are to be integrated during Document Influence Model + self.m_update_coeff = None + self.mean_t = None + self.variance_t = None + self.influence_sum_lgl = None + self.w_phi_l = None + self.w_phi_sum = None + self.w_phi_l_sq = None + self.m_update_coeff_g = None # temp_vect - self.temp_vect = temp_vect + 
self.temp_vect = None + +class Lda_Post(utils.SaveLoad): + """ + Posterior values associated with each set of documents document + + """ + + def __init__(self, doc=None, lda=None, max_nterms=None, num_topics=None, gamma=None, lhood=None): -class lda_post(utils.SaveLoad): - def __init__(self, doc=None, lda=None, phi=None, log_phi=None, gamma=None, lhood=None, doc_weight=None, renormalized_doc_weight=None): self.doc = doc self.lda = lda - self.phi = phi - self.log_phi = log_phi self.gamma = gamma self.lhood = lhood - self.doc_weight = doc_weight - self.renormalized_doc_weight = renormalized_doc_weight -def make_seq_corpus(corpus, time_seq): - split_corpus = [] - time_seq.insert(0, 0) - for time in range(0, len(time_seq) - 1): - time_seq[time + 1] = time_seq[time] + time_seq[time + 1] - split_corpus.append(corpus[time_seq[time]:time_seq[time+1]]) + if max_nterms is not None and num_topics is not None: + self.phi = numpy.resize(numpy.zeros(max_nterms * num_topics), (max_nterms, num_topics)) + self.log_phi = numpy.resize(numpy.zeros(max_nterms * num_topics), (max_nterms, num_topics)) - num_docs = len(corpus) - length = len(split_corpus) - # num_terms = len(corpus.dictionary) + # the following are class variables which are to be integrated during Document Influence Model - seq_corpus_ = seq_corpus(num_docs=num_docs, length=length, corpuses=split_corpus, corpus=corpus) - - return seq_corpus_ + self.doc_weight = None + self.renormalized_doc_weight = None def update_zeta(sslm): @@ -144,7 +177,8 @@ def update_zeta(sslm): sslm.zeta[j] = sslm.zeta[j] + val return -def compute_post_variance(word , sslm, chain_variance): +def compute_post_variance(word, sslm, chain_variance): + T = sslm.num_sequences variance = sslm.variance[word] # pick wordth row fwd_variance = sslm.fwd_variance[word] # pick wordth row @@ -184,8 +218,6 @@ def compute_post_mean(word, sslm, chain_variance): # forward fwd_mean[0] = 0 for t in range(1, T + 1): - # assert(fabs(vget(&fwd_variance, t-1) + - # chain_variance + var->obs_variance) > 0.0); w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance) fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] if fwd_mean[t] is None: @@ -203,7 +235,7 @@ def compute_post_mean(word, sslm, chain_variance): if mean[t] is None: # error message pass - + return def compute_expected_log_prob(sslm): @@ -254,20 +286,19 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init sstats = init_suffstats[:,k] sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) - # dont't need to initialize here, but writing for reference - ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) - ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) - ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + + # initialize the below matrices only if running DIM + # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) def fit_lda_seq(ldaseq, seq_corpus): K = ldaseq.num_topics W = ldaseq.num_terms - data_len = seq_corpus.length + data_len = seq_corpus.num_sequences num_docs = seq_corpus.num_docs - # heldout_gammas = NULL - # heldout_llhood = NULL LDA_INFERENCE_MAX_ITER = 25 bound = 0 @@ -275,11 +306,6 @@ def 
fit_lda_seq(ldaseq, seq_corpus): ldasqe_em_threshold = 1e-4 convergence = ldasqe_em_threshold + 1 - # make directory - # em_log = open("em_log.dat", "w") - # gammas_file = open("gammas.dat", "w") - # lhoods_file = open("lhoods.dat", "w") - iter_ = 0 final_iters_flag = 0 last_iter = 0 @@ -296,8 +322,8 @@ def fit_lda_seq(ldaseq, seq_corpus): print (" EM iter " , iter_) print ("E Step") - # do we need to write to file - # em_log.write(str(bound) + "\t" + str(convergence) + "\n") + # writing to file + em_log.write(str(bound) + "\t" + str(convergence)) old_bound = bound # initiate sufficient statistics @@ -311,19 +337,12 @@ def fit_lda_seq(ldaseq, seq_corpus): bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) - # do we need to write to file - gammas_file.write(str(gammas) + "\n") - lhoods_file.write(str(lhoods) + "\n") - print ("M Step") topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats) bound += topic_bound - # write ldaseq details to file - # write_lda_seq(ldaseq) - if ((bound - old_bound) < 0): if (LDA_INFERENCE_MAX_ITER == 1): LDA_INFERENCE_MAX_ITER = 2 @@ -357,10 +376,9 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la W = ldaseq.num_terms bound = 0.0 - lda = mockLDA(num_topics=K, alpha=ldaseq.alphas, num_terms=W) - lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) - lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) - lda_post.model = lda + lda = ldamodel.LdaModel(num_topics=K, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) + lda_post = Lda_Post(max_nterms=seq_corpus.max_nterms, num_topics=K, lda=lda) + model = "DTM" if model == "DTM": @@ -373,48 +391,57 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound): + def cumsum(it): + total = 0 + for x in it: + total += x + yield total + doc_index = 0 - - for t in range(0, seq_corpus.length): + t = 0 + d = 0 + make_lda_seq_slice(lda, ldaseq, t) - make_lda_seq_slice(lda, ldaseq, t) - ndocs = len(seq_corpus.corpuses[t]) - for d in range(0, ndocs): + time_slice = list(cumsum(ldaseq.time_slice)) - gam = gammas[doc_index] - lhood = lhoods[doc_index] + for line_no, line in enumerate(seq_corpus.corpus): + if doc_index > time_slice[t]: + t += 1 + make_lda_seq_slice(lda, ldaseq, t) + d = 0 - doc_ = seq_corpus.corpuses[t][d] - nterms, word_id = doc_.split(' ', 1) - words = [] - counts = [] - totals = 0 + gam = gammas[doc_index] + lhood = lhoods[doc_index] - for pair in word_id.split(): + doc_ = line - word, count = pair.split(':') - words.append(int(word)) - counts.append(int(count)) - totals += int(count) + nterms = len(doc_) + words = [] + counts = [] + totals = 0 + for word_id, count in doc_: + words.append(int(word_id)) + counts.append(int(count)) + totals += int(count) - doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + lda_post.gamma = gam + lda_post.lhood = lhood + lda_post.doc = doc - lda_post.gamma = gam - lda_post.lhood = lhood - lda_post.doc = doc - lda_post.lda = lda + if iter_ == 0: + doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) + else: + doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) + - if iter_ == 0: - doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) - else: 
- doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) - - if topic_suffstats != None: - update_lda_seq_ss(t, doc, lda_post, topic_suffstats) + if topic_suffstats != None: + update_lda_seq_ss(t, doc, lda_post, topic_suffstats) - bound += doc_lhood - doc_index += 1 + bound += doc_lhood + doc_index += 1 + d += 1 return @@ -422,6 +449,8 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 init_lda_post(lda_post) + + model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here @@ -449,14 +478,12 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 lhood = compute_lda_lhood(lda_post) converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) - # convert from a do-while look while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: iter_ += 1 lhood_old = lhood update_gamma(lda_post) - model = "DTM" if model == "DTM" or sslm is None: @@ -476,6 +503,7 @@ def make_lda_seq_slice(lda, ldaseq, time): for k in range(0, K): lda.topics[:,k] = numpy.copy(ldaseq.topic_chains[k].e_log_prob[:,time]) + lda.alpha = numpy.copy(ldaseq.alphas) return @@ -504,12 +532,15 @@ def init_lda_post(lda_post): lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K for n in range(0, N): lda_post.phi[n][k] = 1.0 / K + + # doc_weight used during DIM + # lda_post.doc_weight = None - lda_post.doc_weight = None return def compute_lda_lhood(lda_post): + K = lda_post.lda.num_topics N = lda_post.doc.nterms gamma_sum = numpy.sum(lda_post.gamma) @@ -521,14 +552,16 @@ def compute_lda_lhood(lda_post): lhood = math.lgamma(numpy.sum(lda_post.lda.alpha)) - math.lgamma(gamma_sum) lda_post.lhood[K] = lhood - influence_term = 0 + # influence_term = 0 digsum = digamma(gamma_sum) model = "DTM" for k in range(0, K): - if lda_post.doc_weight is not None and (model == "DIM" or model == "fixed"): - influence_topic = lda_post.doc_weight[k] - influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) + # below code only to be used in DIM mode + # if lda_post.doc_weight is not None and (model == "DIM" or model == "fixed"): + # influence_topic = lda_post.doc_weight[k] + # influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) + e_log_theta_k = digamma(lda_post.gamma[k]) - digsum lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + math.lgamma(lda_post.gamma[k]) - math.lgamma(lda_post.lda.alpha[k]) @@ -554,7 +587,6 @@ def update_phi(doc, time, lda_post, ldaseq, g): for k in range(0, K): dig[k] = digamma(lda_post.gamma[k]) - for n in range(0, N): w = lda_post.doc.word[n] for k in range(0, K): @@ -697,11 +729,12 @@ def compute_bound(word_counts, totals, sslm): v = sslm.variance[w][t] - # Values specifically related to document influence: - # Note that our indices are off by 1 here. 
- w_phi_l = sslm.w_phi_l[w][t - 1]
- exp_i = numpy.exp(-prev_m)
- term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance)
+ # w_phi_l is only used in Document Influence Model; the values are always zero in this case
+ # w_phi_l = sslm.w_phi_l[w][t - 1]
+ # exp_i = numpy.exp(-prev_m)
+ # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance)
+
+ term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance)
term_2 += word_counts[w][t - 1] * m
ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc)
@@ -713,6 +746,7 @@ def compute_bound(word_counts, totals, sslm):

# function to perform optimization
def update_obs(word_counts, totals, sslm):
+
OBS_NORM_CUTOFF = 2

W = sslm.num_terms
@@ -797,7 +831,6 @@ def compute_mean_deriv(word, time, sslm, deriv):

return
-
def f_obs(x, *args):

sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
@@ -818,8 +851,10 @@ def f_obs(x, *args):

mean = sslm.mean[word]
variance = sslm.variance[word]
- w_phi_l = sslm.w_phi_l[word]
- m_update_coeff = sslm.m_update_coeff[word]
+
+ # only used for DIM mode
+ # w_phi_l = sslm.w_phi_l[word]
+ # m_update_coeff = sslm.m_update_coeff[word]

for t in range(1, T + 1):
mean_t = mean[t]
@@ -843,7 +878,6 @@ def f_obs(x, *args):
term1 = 0.0

final = -(term1 + term2 + term3 + term4)
-
return final
@@ -857,14 +891,15 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv):

mean = sslm.mean[word]
variance = sslm.variance[word]

+ # only used for DIM mode
+ # w_phi_l = sslm.w_phi_l[word]
+ # m_update_coeff = sslm.m_update_coeff[word]
+
sslm.temp_vect = numpy.zeros(T)

for u in range(0, T):
sslm.temp_vect[u] = numpy.exp(mean[u + 1] + variance[u + 1] / 2)

- w_phi_l = sslm.w_phi_l[word]
- m_update_coeff = sslm.m_update_coeff[word]
-
for t in range(0, T):

mean_deriv = mean_deriv_mtx[t]
@@ -899,7 +934,6 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv):

return
-
def df_obs(x, *args):

sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args
@@ -916,18 +950,22 @@ def df_obs(x, *args):

return numpy.negative(deriv)
+
+# the following code replicates Blei's original LDA, ported to python.
+# idea is to let user initialise LDA sstats through this instead of gensim LDA if wanted.
+ def lda_sstats(seq_corpus, num_topics, num_terms, alpha): - lda_model = mockLDA(num_topics=num_topics, num_terms=num_terms) + lda_model = ldamodel.LdaModel(num_topics=num_topics, id2word=seq_corpus.id2word) lda_model.alpha = alpha # this will have shape equal to number of topics lda_ss = initialize_ss_random(seq_corpus, num_topics) + lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) em_iter = 10 lda_em(lda_model, lda_ss, seq_corpus, em_iter, num_topics) return lda_ss - def initialize_ss_random(seq_corpus, num_topics): N = seq_corpus.num_terms @@ -941,12 +979,12 @@ def initialize_ss_random(seq_corpus, num_topics): return topic - def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): K = num_topics W = seq_corpus.num_terms lhood = 0 + for k in range(0, K): ss_k = lda_ss[:,k] @@ -955,7 +993,7 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): LDA_VAR_BAYES = True if LDA_VAR_BAYES is True: - numpy.copyto(log_p, ss_k) + lop_p = numpy.copy(ss_k) log_p = log_p / sum(log_p) log_p = numpy.log(log_p) @@ -966,7 +1004,6 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): return lhood - def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): LDA_EM_CONVERGED = 5e-5 @@ -1006,41 +1043,37 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): if lda_ss is not None: lda_ss.fill(0) - lda_post.phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) - lda_post.log_phi = numpy.resize(numpy.zeros(seq_corpus.max_nterms * K), (seq_corpus.max_nterms, K)) + lda_post = Lda_Post(max_nterms=seq_corpus.max_nterms, num_topics=K, lda=lda_model) lda_post.gamma = numpy.zeros(K) lda_post.lhood = numpy.zeros(K + 1) - lda_post.lda = lda_model lhood = 0 - for d in range(0, seq_corpus.num_docs): + for line_no, line in enumerate(seq_corpus.corpus): - doc_ = seq_corpus.corpus[d] - nterms, word_id = doc_.split(' ', 1) + doc_ = line + + nterms = len(doc_) words = [] counts = [] totals = 0 - - for pair in word_id.split(): - word, count = pair.split(':') - words.append(int(word)) + for word_id, count in doc_: + words.append(int(word_id)) counts.append(int(count)) totals += int(count) doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) lda_post.doc = doc - lhood += fit_lda_post(d, 0, lda_post, None, None, None, None, None) + if lda_ss is not None: for k in range(0, K): for n in range(0, lda_post.doc.nterms): - lda_ss[lda_post.doc.word[n]][k] += round(lda_post.phi[n][k], 6) * lda_post.doc.count[n] + lda_ss[lda_post.doc.word[n]][k] += lda_post.phi[n][k] * lda_post.doc.count[n] return lhood - -# fdf fucnction +# the fdf used in optimising obs. 
Can use if we figure a way to use an optimization function which requires this

# def fdf_obs(x, params, f, df):
# p = params
# model = "DTM"
# if model == "DTM":
# f = f_obs(x, params)
# compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
# elif model == "DIM":
# f = f_obs_multiplt(x, params)
# compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df)
# for i in range(0, len(df)):
-# df[i] = - df[i]
+# df[i] = - df[i]
+
+
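Patch 21 above switches inference to consume the corpus as a stream: inferDTMseq and lda_e_step now iterate the gensim corpus directly and build a Doc per bag-of-words document, instead of parsing the old 'nterms word:count ...' text format. A condensed sketch of that conversion, assuming doc_ is one gensim bag-of-words document, i.e. a list of (word_id, count) pairs:

    words = [int(word_id) for word_id, count in doc_]
    counts = [int(count) for word_id, count in doc_]
    doc = Doc(word=words, count=counts, total=sum(counts), nterms=len(doc_))

This mirrors the loops added in the patch, so any corpus yielding such pairs can be fed to the model without an intermediate file.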
From 4eff6147f4f56f49a6dc6a16667178cfae394fe3 Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Fri, 5 Aug 2016 13:15:27 +0530
Subject: [PATCH 22/38] Updated inits

---
 gensim/models/ldaseqmodel.py | 57 +++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index cbe4277acc..8063dfa84d 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -22,7 +22,6 @@
import math
from scipy.special import digamma
from scipy import optimize
-import sys

class seq_corpus(utils.SaveLoad):

@@ -37,7 +36,7 @@ class seq_corpus(utils.SaveLoad):
corpus is any iterable gensim corpus.

"""
- def __init__(self, num_terms=None, max_nterms=None, num_sequences=None, num_docs=None, corpus=None, time_slice=None, id2word=None):
+ def __init__(self, corpus=None, time_slice=None, id2word=None, max_nterms=None):

self.id2word = id2word
@@ -82,21 +81,31 @@ class LdaSeqModel(utils.SaveLoad):
"""
Class which contains information of our whole DTM model.
Topic chains contains for each topic a 'state space language model' object which in turn has information about each topic
-
+ `alphas` is a prior of your choice and should be a double or float value. default is 0.01
+ `initialize` allows the user to decide how to initialise the DTM model. Default is through gensim LDA.
+ if `initialize` is 'blei-lda', then we will use the python port of Blei's original LDA code.
+ You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats.
+ Shape of sstats is (num_terms, num_topics)

"""

def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
+ initialize='gensim', sstats=None, obs_variance=0.5, chain_variance=0.005, max_nterms=None):

+ if corpus is not None:
+ self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice, max_nterms=max_nterms)
+ self.num_terms = len(corpus.id2word)
+

- self.corpus = corpus
self.num_topics = num_topics
- self.num_sequences = num_sequences
- self.num_terms = num_terms
- self.alphas = alphas
- self.topic_chains = topic_chains
- if self.topic_chains is None:
- for topic in range(0, num_topics):
- sslm_ = sslm(num_sequences=num_sequences, num_terms=num_terms, num_topics=num_topics)
- topic_chains.append(sslm_)
+ self.num_sequences = len(time_slice)
+ self.alphas = numpy.full(num_topics, alphas)
+
+ self.topic_chains = []
+ for topic in range(0, num_topics):
+ sslm_ = sslm(num_sequences=self.num_sequences, num_terms=self.num_terms, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance)
+ self.topic_chains.append(sslm_)
+
+
# the following are class variables which are to be integrated during Document Influence Model
self.top_doc_phis = None
@@ -104,6 +113,23 @@ def __init__(self, corpus=None, num_topics=10, id2word=None, num_sequences=None,
self.renormalized_influence = None
self.influence_sum_lgl = None

+ # if a corpus and time_slice is provided, depending on the user's choice of initializing LDA, we start DTM.
+ if self.corpus is not None and time_slice is not None:
+ if initialize == 'gensim':
+ lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=10, alpha=self.alphas)
+ self.sstats = numpy.transpose(lda_model.state.sstats)
+ if initialize == 'blei-lda':
+ self.sstats = lda_sstats(self.corpus, self.num_topics, self.num_terms, self.alphas)
+ if initialize == 'own':
+ self.sstats = sstats
+
+ # initialize model from sstats
+ init_ldaseq_ss(self, chain_variance, obs_variance, self.alphas, self.sstats)
+
+ # fit DTM
+ fit_lda_seq(self, self.corpus)
+
+
class sslm(utils.SaveLoad):
"""
obs values contain the doc - topic ratios
@@ -112,13 +138,14 @@ class sslm(utils.SaveLoad):
variance, fwd_variance contains the variance values to be used for inference for each word in a time_slice

"""
- def __init__(self, num_terms=None, num_sequences=None, obs_variance=0.5, chain_variance=0.005):
+ def __init__(self, num_terms=None, num_sequences=None, num_topics=None, obs_variance=0.5, chain_variance=0.005):

self.num_terms = num_terms
self.num_sequences = num_sequences
self.obs_variance = obs_variance
self.chain_variance= chain_variance
+ self.num_topics = num_topics

self.obs = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms))
self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms))
@@ -377,6 +404,8 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la
bound = 0.0

lda = ldamodel.LdaModel(num_topics=K, alpha=ldaseq.alphas, id2word=seq_corpus.id2word)
+ lda.topics = numpy.array(numpy.split(numpy.zeros(W * K), W))
+
lda_post = Lda_Post(max_nterms=seq_corpus.max_nterms, num_topics=K, lda=lda)

@@ -572,7 +601,7 @@ def compute_lda_lhood(lda_post):
lda_post.lhood[k] = lhood_term
lhood += lhood_term

- lhood += influence_term
+ # lhood += influence_term

return lhood
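With patch 22 the constructor now drives the whole pipeline: wrap the corpus in seq_corpus, obtain sstats (by default from a 10-pass gensim LdaModel whose state.sstats are transposed to the documented (num_terms, num_topics) shape), seed the chains with init_ldaseq_ss, and train with fit_lda_seq. A usage sketch; corpus and dictionary are placeholder names for a bag-of-words corpus of nine documents and its gensim Dictionary, and this branch is assumed to be installed:

    from gensim.models import ldaseqmodel

    # three time-slices of three documents each; training runs inside __init__
    ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary,
                                     time_slice=[3, 3, 3], num_topics=2)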
From fb199d85a91148c8133c1263f1ed50a4bdc43878 Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Fri, 5 Aug 2016 13:23:24 +0530
Subject: [PATCH 23/38] lda_model sstats input

---
 gensim/models/ldaseqmodel.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index 8063dfa84d..774346ae58 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -85,11 +85,12 @@ class LdaSeqModel(utils.SaveLoad):
`initialize` allows the user to decide how to initialise the DTM model. Default is through gensim LDA.
if `initialize` is 'blei-lda', then we will use the python port of Blei's original LDA code.
You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats.
+ If you wish to just pass a previously used LDA model, pass it through `lda_model`
Shape of sstats is (num_terms, num_topics)

"""
def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
- initialize='gensim', sstats=None, obs_variance=0.5, chain_variance=0.005, max_nterms=None):
+ initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, max_nterms=None):

if corpus is not None:
self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice, max_nterms=max_nterms)
@@ -118,11 +119,12 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if initialize == 'gensim':
lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=10, alpha=self.alphas)
self.sstats = numpy.transpose(lda_model.state.sstats)
- if initialize == 'blei-lda':
- self.sstats = lda_sstats(self.corpus, self.num_topics, self.num_terms, self.alphas)
+ if initialize == 'ldamodel':
+ self.sstats = numpy.transpose(lda_model.state.sstats)
if initialize == 'own':
self.sstats = sstats
-
+ if initialize == 'blei-lda':
+ self.sstats = lda_sstats(self.corpus, self.num_topics, self.num_terms, self.alphas)
# initialize model from sstats
init_ldaseq_ss(self, chain_variance, obs_variance, self.alphas, self.sstats)

From 060994a72001f3b1242dd9a9d9f77fada36a2ff9 Mon Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Fri, 5 Aug 2016 15:23:48 +0530
Subject: [PATCH 24/38] Removed test files

---
 gensim/test/test_data/DTM/.DS_Store | Bin 10244 -> 0 bytes
 gensim/test/test_data/DTM/before_bound_counts | 2248 -------------
 .../test/test_data/DTM/before_bound_fwd_mean | 2810 -----------------
 .../test_data/DTM/before_bound_fwd_variance | 2810 -----------------
 gensim/test/test_data/DTM/before_bound_mean | 2810 -----------------
 gensim/test/test_data/DTM/before_bound_obs | 2248 -------------
 gensim/test/test_data/DTM/before_bound_totals | 4 -
 .../test/test_data/DTM/before_bound_variance | 2810 -----------------
 .../test/test_data/DTM/before_bound_w_phi_l | 2248 -------------
 gensim/test/test_data/DTM/before_bound_zeta | 4 -
 gensim/test/test_data/DTM/before_fobs_audit | 1 -
 .../test/test_data/DTM/before_fobs_fwd_mean | 2810 -----------------
 .../test_data/DTM/before_fobs_fwd_variance | 2810 -----------------
 gensim/test/test_data/DTM/before_fobs_mean | 2810 -----------------
 gensim/test/test_data/DTM/before_fobs_mupdate | 2248 -------------
 gensim/test/test_data/DTM/before_fobs_obs | 2248 -------------
 gensim/test/test_data/DTM/before_fobs_totals | 4 -
 .../test/test_data/DTM/before_fobs_variance | 2810 -----------------
 gensim/test/test_data/DTM/before_fobs_w_phi_l | 2248 -------------
.../test/test_data/DTM/before_fobs_wordcounts | 4 - gensim/test/test_data/DTM/before_fobs_x | 4 - gensim/test/test_data/DTM/before_fobs_zeta | 4 - gensim/test/test_data/DTM/before_fwd_mean | 5 - gensim/test/test_data/DTM/before_fwd_variance | 5 - .../test/test_data/DTM/before_fwd_variance1 | 5 - gensim/test/test_data/DTM/before_ldaseq_phi | 232 -- .../test/test_data/DTM/before_ldaseq_sstats_0 | 2248 ------------- .../test/test_data/DTM/before_ldaseq_sstats_1 | 2248 ------------- gensim/test/test_data/DTM/before_lhood_gamma | 2 - .../test/test_data/DTM/before_lhood_lda_alpha | 2 - .../test_data/DTM/before_lhood_lda_topics | 1124 ------- gensim/test/test_data/DTM/before_lhood_lhood | 3 - .../test/test_data/DTM/before_lhood_log_phi | 232 -- gensim/test/test_data/DTM/before_lhood_phi | 232 -- gensim/test/test_data/DTM/before_log_norm.dat | 562 ---- gensim/test/test_data/DTM/before_mean | 5 - gensim/test/test_data/DTM/before_mean_deriv | 5 - .../test_data/DTM/before_mean_deriv_variance | 5 - gensim/test/test_data/DTM/before_obs | 4 - gensim/test/test_data/DTM/before_obs_deriv | 4 - gensim/test/test_data/DTM/before_obs_m_update | 2248 ------------- gensim/test/test_data/DTM/before_obs_mean | 2810 ----------------- .../test_data/DTM/before_obs_mean_deriv_mtx | 20 - gensim/test/test_data/DTM/before_obs_totals | 4 - gensim/test/test_data/DTM/before_obs_variance | 2810 ----------------- gensim/test/test_data/DTM/before_obs_w_phi_l | 2248 ------------- .../test/test_data/DTM/before_obs_wordcounts | 4 - gensim/test/test_data/DTM/before_obs_zeta | 4 - .../test/test_data/DTM/before_posterior_gamma | 2 - .../test_data/DTM/before_posterior_logphi | 232 -- .../test/test_data/DTM/before_posterior_phi | 232 -- .../test_data/DTM/before_posterior_topics | 1124 ------- gensim/test/test_data/DTM/before_update_gamma | 2 - gensim/test/test_data/DTM/before_update_phi | 232 -- gensim/test/test_data/DTM/before_variance | 5 - gensim/test/test_data/DTM/eprob_before | 2248 ------------- gensim/test/test_data/DTM/eprob_mean | 2810 ----------------- gensim/test/test_data/DTM/eprob_zeta | 4 - gensim/test/test_data/DTM/sample_mean_DTM | 2810 ----------------- gensim/test/test_data/DTM/sample_variance_DTM | 2810 ----------------- gensim/test/test_ldaseqmodel.py | 272 -- 61 files changed, 65847 deletions(-) delete mode 100644 gensim/test/test_data/DTM/.DS_Store delete mode 100644 gensim/test/test_data/DTM/before_bound_counts delete mode 100644 gensim/test/test_data/DTM/before_bound_fwd_mean delete mode 100644 gensim/test/test_data/DTM/before_bound_fwd_variance delete mode 100644 gensim/test/test_data/DTM/before_bound_mean delete mode 100644 gensim/test/test_data/DTM/before_bound_obs delete mode 100644 gensim/test/test_data/DTM/before_bound_totals delete mode 100644 gensim/test/test_data/DTM/before_bound_variance delete mode 100644 gensim/test/test_data/DTM/before_bound_w_phi_l delete mode 100644 gensim/test/test_data/DTM/before_bound_zeta delete mode 100644 gensim/test/test_data/DTM/before_fobs_audit delete mode 100644 gensim/test/test_data/DTM/before_fobs_fwd_mean delete mode 100644 gensim/test/test_data/DTM/before_fobs_fwd_variance delete mode 100644 gensim/test/test_data/DTM/before_fobs_mean delete mode 100644 gensim/test/test_data/DTM/before_fobs_mupdate delete mode 100644 gensim/test/test_data/DTM/before_fobs_obs delete mode 100644 gensim/test/test_data/DTM/before_fobs_totals delete mode 100644 gensim/test/test_data/DTM/before_fobs_variance delete mode 100644 gensim/test/test_data/DTM/before_fobs_w_phi_l delete mode 
100644 gensim/test/test_data/DTM/before_fobs_wordcounts
delete mode 100644 gensim/test/test_data/DTM/before_fobs_x
delete mode 100644 gensim/test/test_data/DTM/before_fobs_zeta
delete mode 100644 gensim/test/test_data/DTM/before_fwd_mean
delete mode 100644 gensim/test/test_data/DTM/before_fwd_variance
delete mode 100644 gensim/test/test_data/DTM/before_fwd_variance1
delete mode 100644 gensim/test/test_data/DTM/before_ldaseq_phi
delete mode 100644 gensim/test/test_data/DTM/before_ldaseq_sstats_0
delete mode 100644 gensim/test/test_data/DTM/before_ldaseq_sstats_1
delete mode 100644 gensim/test/test_data/DTM/before_lhood_gamma
delete mode 100644 gensim/test/test_data/DTM/before_lhood_lda_alpha
delete mode 100644 gensim/test/test_data/DTM/before_lhood_lda_topics
delete mode 100644 gensim/test/test_data/DTM/before_lhood_lhood
delete mode 100644 gensim/test/test_data/DTM/before_lhood_log_phi
delete mode 100644 gensim/test/test_data/DTM/before_lhood_phi
delete mode 100644 gensim/test/test_data/DTM/before_log_norm.dat
delete mode 100644 gensim/test/test_data/DTM/before_mean
delete mode 100644 gensim/test/test_data/DTM/before_mean_deriv
delete mode 100644 gensim/test/test_data/DTM/before_mean_deriv_variance
delete mode 100644 gensim/test/test_data/DTM/before_obs
delete mode 100644 gensim/test/test_data/DTM/before_obs_deriv
delete mode 100644 gensim/test/test_data/DTM/before_obs_m_update
delete mode 100644 gensim/test/test_data/DTM/before_obs_mean
delete mode 100644 gensim/test/test_data/DTM/before_obs_mean_deriv_mtx
delete mode 100644 gensim/test/test_data/DTM/before_obs_totals
delete mode 100644 gensim/test/test_data/DTM/before_obs_variance
delete mode 100644 gensim/test/test_data/DTM/before_obs_w_phi_l
delete mode 100644 gensim/test/test_data/DTM/before_obs_wordcounts
delete mode 100644 gensim/test/test_data/DTM/before_obs_zeta
delete mode 100644 gensim/test/test_data/DTM/before_posterior_gamma
delete mode 100644 gensim/test/test_data/DTM/before_posterior_logphi
delete mode 100644 gensim/test/test_data/DTM/before_posterior_phi
delete mode 100644 gensim/test/test_data/DTM/before_posterior_topics
delete mode 100644 gensim/test/test_data/DTM/before_update_gamma
delete mode 100644 gensim/test/test_data/DTM/before_update_phi
delete mode 100644 gensim/test/test_data/DTM/before_variance
delete mode 100644 gensim/test/test_data/DTM/eprob_before
delete mode 100644 gensim/test/test_data/DTM/eprob_mean
delete mode 100644 gensim/test/test_data/DTM/eprob_zeta
delete mode 100644 gensim/test/test_data/DTM/sample_mean_DTM
delete mode 100644 gensim/test/test_data/DTM/sample_variance_DTM
delete mode 100644 gensim/test/test_ldaseqmodel.py
diff --git a/gensim/test/test_data/DTM/.DS_Store b/gensim/test/test_data/DTM/.DS_Store
deleted file mode 100644
index 370ff7731c7b49e3a7d694e159a4112485295d6e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10244
[base85 delta of the deleted binary .DS_Store omitted]
zSs^=H4_1~znXPLSg*Gi0tTBV*J)|SbK7o6D9y^gm5O@Bm`r~VA5l-MH?5~)^fVbhW z*s1>(E7CKA zQE4z29cE;W0%$+=Y zG&IsmOkZF$I{%3}b+&i>0mdo2d>Aj>mZZaS#f@scE5@*bA4Hr))K_RnIC|o?9*zx67#S0JUY+@$bx& zRjc(Cc~a4NizYAb%KTK4&WWp%om}UvK!ZH14={J}FX2-s-=;V8(za6EQ)jQaY{w1o zVkoR;je>u#xy^;^n*9F%S^D??Yoe9Q3Im0Ko5(;lzx{f93oWnTt Date: Sun, 7 Aug 2016 01:16:51 +0530 Subject: [PATCH 25/38] Incorporated suggestions --- gensim/models/ldaseqmodel.py | 266 +++++++++++++++++------------------ 1 file changed, 132 insertions(+), 134 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 774346ae58..5e5067c525 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -28,15 +28,15 @@ class seq_corpus(utils.SaveLoad): """ seq_corpus is basically a wrapper class which contains information about the corpus. - num_terms is the length of the vocabulary. - max_nterms is the maximum number of terms a single document has. - num_sequences is the number of sequences, i.e number of time-slices. - num_docs is the number of documents present. + vocab_len is the length of the vocabulary. + max_doc_len is the maximum number of terms a single document has. + num_time_slices is the number of sequences, i.e number of time-slices. + corpus_len is the number of documents present. time_slice is a list or numpy array which the user must provide which contains the number of documents in each time-slice. corpus is any iterable gensim corpus. """ - def __init__(self, corpus=None, time_slice=None, id2word=None, max_nterms=None): + def __init__(self, corpus=None, time_slice=None, id2word=None): self.id2word = id2word @@ -46,22 +46,25 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, max_nterms=None): if self.id2word is None: logger.warning("no word id mapping provided; initializing from corpus, assuming identity") self.id2word = utils.dict_from_corpus(corpus) - self.num_terms = len(self.id2word) + self.vocab_len = len(self.id2word) elif len(self.id2word) > 0: - self.num_terms = 1 + max(self.id2word.keys()) + self.vocab_len = len(self.id2word) else: - self.num_terms = 0 + self.vocab_len = 0 self.corpus = corpus if self.corpus is not None: - self.num_docs = len(corpus) + self.corpus_len = len(corpus) self.time_slice = time_slice if self.time_slice is not None: - self.num_sequences = len(time_slice) + self.num_time_slices = len(time_slice) - # need to still figure out way to get max_nterms - self.max_nterms = max_nterms + max_doc_len = 0 + for line_no, line in enumerate(corpus): + if len(line) > max_doc_len: + max_doc_len = len(line) + self.max_doc_len = max_doc_len class Doc(utils.SaveLoad): @@ -81,33 +84,32 @@ class LdaSeqModel(utils.SaveLoad): """ Class which contains information of our whole DTM model. Topic chains contains for each topic a 'state space language model' object which in turn has information about each topic + the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. `alphas` is a prior of your choice and should be a double or float value. default is 0.01 `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. if `initalize` is 'blei-lda', then we will use the python port of blei's oriignal LDA code. You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats. 
If you wish to just pass a previously used LDA model, pass it through `lda_model` - Shape of sstats is (num_terms, num_topics) + Shape of sstats is (vocab_len, num_topics) """ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, max_nterms=None): + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005): if corpus is not None: - self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice, max_nterms=max_nterms) - self.num_terms = len(corpus.id2word) + self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice) + self.vocab_len = len(self.corpus.id2word) self.num_topics = num_topics - self.num_sequences = len(time_slice) + self.num_time_slices = len(time_slice) self.alphas = numpy.full(num_topics, alphas) self.topic_chains = [] for topic in range(0, num_topics): - sslm_ = sslm(num_sequences=self.num_sequences, num_terms=self.num_terms, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) + sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) self.topic_chains.append(sslm_) - - # the following are class variables which are to be integrated during Document Influence Model self.top_doc_phis = None self.influence = None @@ -124,7 +126,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ if initialize == 'own': self.sstats = sstats if initialize == 'blei-lda': - self.sstats = lda_sstats(self.corpus, self.num_topics, self.num_terms, self.alphas) + self.sstats = lda_sstats(self.corpus, self.num_topics, self.vocab_len, self.alphas) # initialize model from sstats init_ldaseq_ss(self, chain_variance, obs_variance, self.alphas, self.sstats) @@ -140,22 +142,22 @@ class sslm(utils.SaveLoad): variance, fwd_variance contains the variance values to be used for inference for each word in a time_slice """ - def __init__(self, num_terms=None, num_sequences=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): + def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): - self.num_terms = num_terms - self.num_sequences = num_sequences + self.vocab_len = vocab_len + self.num_time_slices = num_time_slices self.obs_variance = obs_variance self.chain_variance= chain_variance self.num_topics = num_topics - self.obs = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms)) - self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_sequences * num_terms), num_terms)) - self.mean = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) - self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) - self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) - self.variance = numpy.array(numpy.split(numpy.zeros((num_sequences + 1) * num_terms), num_terms)) - self.zeta = numpy.zeros(num_sequences) + self.obs = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) + self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) + self.mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * 
vocab_len), vocab_len)) + self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.zeta = numpy.zeros(num_time_slices) # the following are class variables which are to be integrated during Document Influence Model self.m_update_coeff = None @@ -167,8 +169,6 @@ def __init__(self, num_terms=None, num_sequences=None, num_topics=None, obs_vari self.w_phi_l_sq = None self.m_update_coeff_g = None - # temp_vect - self.temp_vect = None class Lda_Post(utils.SaveLoad): """ @@ -176,16 +176,16 @@ class Lda_Post(utils.SaveLoad): """ - def __init__(self, doc=None, lda=None, max_nterms=None, num_topics=None, gamma=None, lhood=None): + def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): self.doc = doc self.lda = lda self.gamma = gamma self.lhood = lhood - if max_nterms is not None and num_topics is not None: - self.phi = numpy.resize(numpy.zeros(max_nterms * num_topics), (max_nterms, num_topics)) - self.log_phi = numpy.resize(numpy.zeros(max_nterms * num_topics), (max_nterms, num_topics)) + if max_doc_len is not None and num_topics is not None: + self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) + self.log_phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) # the following are class variables which are to be integrated during Document Influence Model @@ -194,23 +194,25 @@ def __init__(self, doc=None, lda=None, max_nterms=None, num_topics=None, gamma=N def update_zeta(sslm): - num_terms = sslm.num_terms - num_sequences = sslm.num_sequences - # making zero and updating - sslm.zeta.fill(0) - for i in range(0, num_terms): - for j in range(0, num_sequences): + vocab_len = sslm.vocab_len + num_time_slices = sslm.num_time_slices + zeta = numpy.zeros(len(sslm.zeta)) + + for i in range(0, vocab_len): + for j in range(0, num_time_slices): + m = sslm.mean[i][j + 1] v = sslm.variance[i][j + 1] val = numpy.exp(m + v/2) - sslm.zeta[j] = sslm.zeta[j] + val - return + zeta[j] = zeta[j] + val + + return zeta def compute_post_variance(word, sslm, chain_variance): - T = sslm.num_sequences - variance = sslm.variance[word] # pick wordth row - fwd_variance = sslm.fwd_variance[word] # pick wordth row + T = sslm.num_time_slices + variance = numpy.copy(sslm.variance[word]) # pick wordth row + fwd_variance = numpy.copy(sslm.fwd_variance[word]) # pick wordth row # forward pass. 
Set initial variance very high fwd_variance[0] = chain_variance * 1000 @@ -231,27 +233,23 @@ def compute_post_variance(word, sslm, chain_variance): w = 0 variance[t] = (w * (variance[t + 1] - chain_variance)) + ((1 - w) * fwd_variance[t]) - sslm.variance[word] = variance - sslm.fwd_variance[word] = fwd_variance - return + return variance, fwd_variance def compute_post_mean(word, sslm, chain_variance): - T = sslm.num_sequences - obs = sslm.obs[word] # wordth row - mean = sslm.mean[word] - fwd_mean = sslm.fwd_mean[word] + T = sslm.num_time_slices + obs = sslm.obs[word] fwd_variance = sslm.fwd_variance[word] + mean = numpy.copy(sslm.mean[word]) + fwd_mean = numpy.copy(sslm.fwd_mean[word]) + # forward fwd_mean[0] = 0 for t in range(1, T + 1): w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance) fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] - if fwd_mean[t] is None: - # error message - pass # backward pass mean[T] = fwd_mean[T] @@ -261,16 +259,13 @@ def compute_post_mean(word, sslm, chain_variance): else: w = chain_variance / (fwd_variance[t] + chain_variance) mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] - if mean[t] is None: - # error message - pass - - return + + return mean, fwd_mean def compute_expected_log_prob(sslm): - W = sslm.num_terms - T = sslm.num_sequences + W = sslm.vocab_len + T = sslm.num_time_slices for t in range(0, T): for w in range(0, W): sslm.e_log_prob[w][t] = sslm.mean[w][t + 1] - numpy.log(sslm.zeta[t]) @@ -279,8 +274,8 @@ def compute_expected_log_prob(sslm): def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): - W = sslm.num_terms - T = sslm.num_sequences + W = sslm.vocab_len + T = sslm.num_time_slices log_norm_counts = numpy.copy(sstats) log_norm_counts = log_norm_counts / sum(log_norm_counts) @@ -300,12 +295,12 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): # compute post variance for w in range(0, W): - compute_post_variance(w, sslm, sslm.chain_variance) + sslm.variance, sslm.fwd_variance = compute_post_variance(w, sslm, sslm.chain_variance) for w in range(0, W): - compute_post_mean(w, sslm, sslm.chain_variance) + sslm.mean, sslm.fwd_mean = compute_post_mean(w, sslm, sslm.chain_variance) - update_zeta(sslm) + sslm.zeta = update_zeta(sslm) compute_expected_log_prob(sslm) def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): @@ -317,52 +312,44 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) # initialize the below matrices only if running DIM - # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) - # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) - # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.num_terms, ldaseq.num_sequences)) + # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) def fit_lda_seq(ldaseq, seq_corpus): - K = ldaseq.num_topics - W = ldaseq.num_terms - data_len = seq_corpus.num_sequences - num_docs = seq_corpus.num_docs - LDA_INFERENCE_MAX_ITER = 25 + LDASQE_EM_THRESHOLD = 1e-4 + LDA_SEQ_MIN_ITER = 6 + LDA_SEQ_MAX_ITER = 20 + K = ldaseq.num_topics + W = ldaseq.vocab_len + data_len = 
seq_corpus.num_time_slices + corpus_len = seq_corpus.corpus_len + bound = 0 - heldout_bound = 0 - ldasqe_em_threshold = 1e-4 - convergence = ldasqe_em_threshold + 1 + convergence = LDASQE_EM_THRESHOLD + 1 iter_ = 0 - final_iters_flag = 0 - last_iter = 0 # this is a flag/input do something about it - lda_seq_min_iter = 6 - lda_seq_max_iter = 20 + - while iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter): - if not (iter_ < lda_seq_min_iter or ((final_iters_flag is 0 or convergence > ldasqe_em_threshold) and iter_ <= lda_seq_max_iter)): - last_iter = 1 + while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): - # log print (" EM iter " , iter_) print ("E Step") - # writing to file - em_log.write(str(bound) + "\t" + str(convergence)) old_bound = bound - # initiate sufficient statistics topic_suffstats = [] for k in range(0, K): topic_suffstats.append(numpy.resize(numpy.zeros(W * data_len), (W, data_len))) # set up variables - gammas = numpy.resize(numpy.zeros(num_docs * K), (num_docs, K)) - lhoods = numpy.resize(numpy.zeros(num_docs * K + 1), (num_docs, K + 1)) + gammas = numpy.resize(numpy.zeros(corpus_len * K), (corpus_len, K)) + lhoods = numpy.resize(numpy.zeros(corpus_len * K + 1), (corpus_len, K + 1)) bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) @@ -373,21 +360,15 @@ def fit_lda_seq(ldaseq, seq_corpus): if ((bound - old_bound) < 0): - if (LDA_INFERENCE_MAX_ITER == 1): - LDA_INFERENCE_MAX_ITER = 2 - if (LDA_INFERENCE_MAX_ITER == 2): - LDA_INFERENCE_MAX_ITER = 5 - if (LDA_INFERENCE_MAX_ITER == 5): - LDA_INFERENCE_MAX_ITER = 10 - if (LDA_INFERENCE_MAX_ITER == 10): - LDA_INFERENCE_MAX_ITER = 20 - print ("Bound went down, increasing it to" , LDA_INFERENCE_MAX_ITER) + if LDA_INFERENCE_MAX_ITER < 10: + LDA_INFERENCE_MAX_ITER *= 2 + print ("Bound went down, increasing iterations to" , LDA_INFERENCE_MAX_ITER) # check for convergence convergence = numpy.fabs((bound - old_bound) / old_bound) - if convergence < ldasqe_em_threshold: - final_iters_flag = 1 + if convergence < LDASQE_EM_THRESHOLD: + LDA_INFERENCE_MAX_ITER = 500 print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER) convergence = 1.0 @@ -402,13 +383,13 @@ def fit_lda_seq(ldaseq, seq_corpus): def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter): K = ldaseq.num_topics - W = ldaseq.num_terms + W = ldaseq.vocab_len bound = 0.0 lda = ldamodel.LdaModel(num_topics=K, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) lda.topics = numpy.array(numpy.split(numpy.zeros(W * K), W)) - lda_post = Lda_Post(max_nterms=seq_corpus.max_nterms, num_topics=K, lda=lda) + lda_post = Lda_Post(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda) model = "DTM" @@ -478,10 +459,11 @@ def cumsum(it): def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): + LDA_INFERENCE_CONVERGED = 1e-8 + LDA_INFERENCE_MAX_ITER = 25 init_lda_post(lda_post) - model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here @@ -491,10 +473,8 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 lhood_old = 0 converged = 0 iter_ = 0 - LDA_INFERENCE_CONVERGED = 1e-8 - LDA_INFERENCE_MAX_ITER = 25 - + # first iteration starts here iter_ += 1 lhood_old = lhood update_gamma(lda_post) @@ -576,7 +556,7 @@ def compute_lda_lhood(lda_post): N = lda_post.doc.nterms gamma_sum = 
numpy.sum(lda_post.gamma) - # figure out how to do flags + # TODO: flags FLAGS_sigma_l = 0 FLAGS_sigma_d = 0 @@ -673,7 +653,7 @@ def fit_lda_seq_topics(ldaseq, topic_suffstats): def fit_sslm(sslm, counts): - W = sslm.num_terms + W = sslm.vocab_len bound = 0 old_bound = 0 sslm_fit_threshold = 1e-6 @@ -683,7 +663,7 @@ def fit_sslm(sslm, counts): totals = numpy.zeros(counts.shape[1]) for w in range(0, W): - compute_post_variance(w, sslm, sslm.chain_variance) + sslm.variance, sslm.fwd_variance = compute_post_variance(w, sslm, sslm.chain_variance) totals = col_sum(counts, totals) iter_ = 0 @@ -727,8 +707,8 @@ def col_sum(matrix, vector): def compute_bound(word_counts, totals, sslm): - W = sslm.num_terms - T = sslm.num_sequences + W = sslm.vocab_len + T = sslm.num_time_slices term_1 = 0 term_2 = 0 @@ -740,9 +720,9 @@ def compute_bound(word_counts, totals, sslm): chain_variance = sslm.chain_variance for w in range(0, W): - compute_post_mean(w, sslm, chain_variance) + sslm.mean, sslm.fwd_mean = compute_post_mean(w, sslm, chain_variance) - update_zeta(sslm) + sslm.zeta = update_zeta(sslm) for w in range(0, W): val += (sslm.variance[w][0] - sslm.variance[w][T]) / 2 * chain_variance @@ -779,9 +759,12 @@ def update_obs(word_counts, totals, sslm): OBS_NORM_CUTOFF = 2 + STEP_SIZE = 0.01 + TOL = 1e-3 + - W = sslm.num_terms - T = sslm.num_sequences + W = sslm.vocab_len + T = sslm.num_time_slices runs = 0 mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1)) @@ -811,12 +794,10 @@ def update_obs(word_counts, totals, sslm): deriv = numpy.zeros(T) args = sslm, w_counts, totals, mean_deriv_mtx, w, deriv obs = sslm.obs[w] - step_size = 0.01 - tol = 1e-3 model = "DTM" if model == "DTM": - obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=tol, args=args, epsilon=step_size, disp=0) + obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) if model == "DIM": pass runs += 1 @@ -826,7 +807,7 @@ def update_obs(word_counts, totals, sslm): sslm.obs[w] = obs - update_zeta(sslm) + sslm.zeta = update_zeta(sslm) return @@ -835,7 +816,7 @@ def update_obs(word_counts, totals, sslm): def compute_mean_deriv(word, time, sslm, deriv): - T = sslm.num_sequences + T = sslm.num_time_slices fwd_variance = sslm.variance[word] deriv[0] = 0 @@ -878,7 +859,7 @@ def f_obs(x, *args): term4 = 0 sslm.obs[word] = x - compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean, sslm.fwd_mean = compute_post_mean(word, sslm, sslm.chain_variance) mean = sslm.mean[word] variance = sslm.variance[word] @@ -917,7 +898,7 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): # flag init_mult = 1000 - T = sslm.num_sequences + T = sslm.num_time_slices mean = sslm.mean[word] variance = sslm.variance[word] @@ -970,7 +951,7 @@ def df_obs(x, *args): sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args sslm.obs[word] = x - compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean, sslm.fwd_mean = compute_post_mean(word, sslm, sslm.chain_variance) model = "DTM" if model == "DTM": @@ -987,10 +968,9 @@ def df_obs(x, *args): def lda_sstats(seq_corpus, num_topics, num_terms, alpha): - lda_model = ldamodel.LdaModel(num_topics=num_topics, id2word=seq_corpus.id2word) + lda_model = mockLDA(num_topics=num_topics, num_terms=num_terms) lda_model.alpha = alpha # this will have shape equal to number of topics lda_ss = initialize_ss_random(seq_corpus, num_topics) - lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) em_iter = 10 lda_em(lda_model, lda_ss, 
seq_corpus, em_iter, num_topics) @@ -1015,7 +995,6 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): K = num_topics W = seq_corpus.num_terms lhood = 0 - for k in range(0, K): ss_k = lda_ss[:,k] @@ -1024,7 +1003,7 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): LDA_VAR_BAYES = True if LDA_VAR_BAYES is True: - lop_p = numpy.copy(ss_k) + numpy.copyto(log_p, ss_k) log_p = log_p / sum(log_p) log_p = numpy.log(log_p) @@ -1047,7 +1026,6 @@ def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) # do step starts - iter_ += 1 old_lhood = lhood e_lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) @@ -1063,10 +1041,12 @@ def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) lhood = e_lhood + m_lhood converged = (old_lhood - lhood) / old_lhood + print (converged) return lhood + def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): K = num_topics @@ -1074,7 +1054,7 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): if lda_ss is not None: lda_ss.fill(0) - lda_post = Lda_Post(max_nterms=seq_corpus.max_nterms, num_topics=K, lda=lda_model) + lda_post = Lda_Post(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda_model) lda_post.gamma = numpy.zeros(K) lda_post.lhood = numpy.zeros(K + 1) @@ -1104,6 +1084,24 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): return lhood + +def print_topics(ldaseq, topic, time, top_terms): + """ + Topic is the topic number + Time is for a particular time_slice + top_terms is the number of terms to display + """ + topic = ldaseq.topic_chains[topic].e_log_prob + topic = numpy.transpose(topic) + topic = numpy.exp(topic[time]) + topic = topic / topic.sum() + bestn = matutils.argsort(topic, top_terms, reverse=True) + beststr = [(round(topic[id_], 3), ldaseq.corpus.id2word[id_]) for id_ in bestn] + + return beststr + + + + # the fdf used in optimising obs. Can use if we figure a way to use an optimization function which requires this # def fdf_obs(x, params, f, df): From 5b6a3b632e003fc8a04e618bcefe231cf9396c1f Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Sun, 7 Aug 2016 20:07:37 +0530 Subject: [PATCH 26/38] added more docs --- gensim/models/ldaseqmodel.py | 215 +++++++++++++++++++++++------------ 1 file changed, 140 insertions(+), 75 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 5e5067c525..b6ea45e30b 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -140,7 +140,8 @@ class sslm(utils.SaveLoad): e_log_prob contains topic - word ratios mean, fwd_mean contains the mean values to be used for inference for each word for a time_slice variance, fwd_variance contains the variance values to be used for inference for each word in a time_slice - + fwd_mean, fwd_variance are the forward posterior values. + zeta is an extra variational parameter with a value for each time-slice """ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): @@ -192,8 +193,11 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.doc_weight = None self.renormalized_doc_weight = None -def update_zeta(sslm): +def update_zeta(sslm): + """ + Update Zeta Variational Parameter.
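+ As a hedged aside (not part of the original function): since mean and variance are (vocab_len, num_time_slices + 1) matrices, the same update can be written in vectorised numpy as + >>> zeta = numpy.exp(sslm.mean[:, 1:] + sslm.variance[:, 1:] / 2).sum(axis=0) + where column j + 1 of mean and variance corresponds to time-slice j.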
+ """ vocab_len = sslm.vocab_len num_time_slices = sslm.num_time_slices zeta = numpy.zeros(len(sslm.zeta)) @@ -210,9 +214,13 @@ def update_zeta(sslm): def compute_post_variance(word, sslm, chain_variance): + """ + compute Var[\beta_{t,w}] for t = 1:T + """ + T = sslm.num_time_slices - variance = numpy.copy(sslm.variance[word]) # pick wordth row - fwd_variance = numpy.copy(sslm.fwd_variance[word]) # pick wordth row + variance = numpy.copy(sslm.variance[word]) + fwd_variance = numpy.copy(sslm.fwd_variance[word]) # forward pass. Set initial variance very high fwd_variance[0] = chain_variance * 1000 @@ -236,8 +244,13 @@ def compute_post_variance(word, sslm, chain_variance): return variance, fwd_variance + def compute_post_mean(word, sslm, chain_variance): + """ + forward-backward to compute E[\beta_{t,w}] for t = 1:T + """ + T = sslm.num_time_slices obs = sslm.obs[word] fwd_variance = sslm.fwd_variance[word] @@ -262,8 +275,69 @@ def compute_post_mean(word, sslm, chain_variance): return mean, fwd_mean + +def update_phi(doc, time, lda_post, ldaseq, g): + + """ + Update variational multinomial parameters + """ + + K = lda_post.lda.num_topics + N = lda_post.doc.nterms + + dig = numpy.zeros(K) + + for k in range(0, K): + dig[k] = digamma(lda_post.gamma[k]) + + for n in range(0, N): + w = lda_post.doc.word[n] + for k in range(0, K): + lda_post.log_phi[n][k] = dig[k] + lda_post.lda.topics[w][k] + + log_phi_row = lda_post.log_phi[n] + phi_row = lda_post.phi[n] + + # log normalize + v = log_phi_row[0] + for i in range(1, len(log_phi_row)): + v = numpy.logaddexp(v, log_phi_row[i]) + + for i in range(0, len(log_phi_row)): + log_phi_row[i] = log_phi_row[i] - v + + for k in range(0, K): + phi_row[k] = numpy.exp(log_phi_row[k]) + + lda_post.log_phi[n] = log_phi_row + lda_post.phi[n] = phi_row + + return + +def update_gamma(lda_post): + """ + update variational dirichlet parameters + """ + + K = lda_post.lda.num_topics + N = lda_post.doc.nterms + + lda_post.gamma = numpy.copy(lda_post.lda.alpha) + + for n in range(0, N): + phi_row = lda_post.phi[n] + count = lda_post.doc.count[n] + + for k in range(0, K): + lda_post.gamma[k] += phi_row[k] * count + + return def compute_expected_log_prob(sslm): + """ + Compute the expected log probability given values of m. + """ + W = sslm.vocab_len T = sslm.num_time_slices for t in range(0, T): @@ -274,6 +348,10 @@ def compute_expected_log_prob(sslm): def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): + """ + Initialize State Space Language Model with LDA sufficient statistics. + """ + W = sslm.vocab_len T = sslm.num_time_slices @@ -305,6 +383,9 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + """ + Method to initialize State Space Language Model, topic wise. 
+ """ ldaseq.alphas = alpha for k in range(0, ldaseq.num_topics): @@ -317,7 +398,19 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) def fit_lda_seq(ldaseq, seq_corpus): + """ + fit an lda sequence model: + + for each time period + set up lda model with E[log p(w|z)] and \alpha + for each document + perform posterior inference + update sufficient statistics/likelihood + + maximize topics + """ + LDA_INFERENCE_MAX_ITER = 25 LDASQE_EM_THRESHOLD = 1e-4 LDA_SEQ_MIN_ITER = 6 @@ -333,9 +426,6 @@ def fit_lda_seq(ldaseq, seq_corpus): iter_ = 0 - # this is a flag/input do something about it - - while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): print (" EM iter " , iter_) @@ -350,11 +440,13 @@ def fit_lda_seq(ldaseq, seq_corpus): # set up variables gammas = numpy.resize(numpy.zeros(corpus_len * K), (corpus_len, K)) lhoods = numpy.resize(numpy.zeros(corpus_len * K + 1), (corpus_len, K + 1)) - + # compute the likelihood of a sequential corpus under an LDA + # seq model and find the evidence lower bound. bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) print ("M Step") + # fit the variational distribution topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats) bound += topic_bound @@ -381,17 +473,17 @@ def fit_lda_seq(ldaseq, seq_corpus): def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter): - + """ + set up lda model to use for inferDTMseq + """ K = ldaseq.num_topics W = ldaseq.vocab_len bound = 0.0 lda = ldamodel.LdaModel(num_topics=K, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) lda.topics = numpy.array(numpy.split(numpy.zeros(W * K), W)) - lda_post = Lda_Post(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda) - model = "DTM" if model == "DTM": inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound) @@ -403,6 +495,9 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound): + """ + compute the likelihood of a sequential corpus under an LDA seq model. return the likelihood bound. + """ def cumsum(it): total = 0 for x in it: @@ -414,7 +509,6 @@ def cumsum(it): d = 0 make_lda_seq_slice(lda, ldaseq, t) - time_slice = list(cumsum(ldaseq.time_slice)) for line_no, line in enumerate(seq_corpus.corpus): @@ -459,6 +553,10 @@ def cumsum(it): def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): + """ + Posterior inference for lda. + """ + LDA_INFERENCE_CONVERGED = 1e-8 LDA_INFERENCE_MAX_ITER = 25 @@ -509,9 +607,11 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 def make_lda_seq_slice(lda, ldaseq, time): + """ + set up the LDA model topic-word values with that of ldaseq. + """ K = ldaseq.num_topics - for k in range(0, K): lda.topics[:,k] = numpy.copy(ldaseq.topic_chains[k].e_log_prob[:,time]) @@ -520,6 +620,9 @@ def make_lda_seq_slice(lda, ldaseq, time): return def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): + """ + Update lda sequence sufficient statistics from an lda posterior. 
+ """ K = numpy.shape(lda_post.phi)[1] N = doc.nterms @@ -536,7 +639,9 @@ def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): return def init_lda_post(lda_post): - + """ + Initialize variational posterior. + """ K = lda_post.lda.num_topics N = lda_post.doc.nterms for k in range(0, K): @@ -550,7 +655,9 @@ def init_lda_post(lda_post): return def compute_lda_lhood(lda_post): - + """ + compute the likelihood bound + """ K = lda_post.lda.num_topics N = lda_post.doc.nterms @@ -572,7 +679,6 @@ def compute_lda_lhood(lda_post): # if lda_post.doc_weight is not None and (model == "DIM" or model == "fixed"): # influence_topic = lda_post.doc_weight[k] # influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) - e_log_theta_k = digamma(lda_post.gamma[k]) - digsum lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + math.lgamma(lda_post.gamma[k]) - math.lgamma(lda_post.lda.alpha[k]) @@ -587,59 +693,11 @@ def compute_lda_lhood(lda_post): return lhood -# update variational multinomial parameters -def update_phi(doc, time, lda_post, ldaseq, g): - - K = lda_post.lda.num_topics - N = lda_post.doc.nterms - - dig = numpy.zeros(K) - - for k in range(0, K): - dig[k] = digamma(lda_post.gamma[k]) - - for n in range(0, N): - w = lda_post.doc.word[n] - for k in range(0, K): - lda_post.log_phi[n][k] = dig[k] + lda_post.lda.topics[w][k] - - log_phi_row = lda_post.log_phi[n] - phi_row = lda_post.phi[n] - - # log normalize - v = log_phi_row[0] - for i in range(1, len(log_phi_row)): - v = numpy.logaddexp(v, log_phi_row[i]) - - for i in range(0, len(log_phi_row)): - log_phi_row[i] = log_phi_row[i] - v - - for k in range(0, K): - phi_row[k] = numpy.exp(log_phi_row[k]) - - lda_post.log_phi[n] = log_phi_row - lda_post.phi[n] = phi_row - - return - -# update variational dirichlet parameters -def update_gamma(lda_post): - - K = lda_post.lda.num_topics - N = lda_post.doc.nterms - - lda_post.gamma = numpy.copy(lda_post.lda.alpha) - - for n in range(0, N): - phi_row = lda_post.phi[n] - count = lda_post.doc.count[n] - - for k in range(0, K): - lda_post.gamma[k] += phi_row[k] * count - - return def fit_lda_seq_topics(ldaseq, topic_suffstats): + """ + Fit lda sequence topic wise. + """ lhood = 0 lhood_term = 0 K = ldaseq.num_topics @@ -652,7 +710,9 @@ def fit_lda_seq_topics(ldaseq, topic_suffstats): return lhood def fit_sslm(sslm, counts): - + """ + Fit variational distribution + """ W = sslm.vocab_len bound = 0 old_bound = 0 @@ -674,7 +734,6 @@ def fit_sslm(sslm, counts): if model == "DIM": bound = compute_bound_fixed(counts, totals, sslm) - print ("initial sslm bound is " , bound) while converged > sslm_fit_threshold and iter_ < sslm_max_iter: @@ -706,7 +765,9 @@ def col_sum(matrix, vector): return vector def compute_bound(word_counts, totals, sslm): - + """ + Compute log probability bound + """ W = sslm.vocab_len T = sslm.num_time_slices @@ -754,9 +815,11 @@ def compute_bound(word_counts, totals, sslm): return val -# fucntion to perform optimization -def update_obs(word_counts, totals, sslm): +def update_obs(word_counts, totals, sslm): + """ + Fucntion to perform optimization + """ OBS_NORM_CUTOFF = 2 STEP_SIZE = 0.01 @@ -810,12 +873,14 @@ def update_obs(word_counts, totals, sslm): sslm.zeta = update_zeta(sslm) return - - # compute d E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. - # put the result in deriv, allocated T+1 vector def compute_mean_deriv(word, time, sslm, deriv): + """ + compute d E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. 
+ put the result in deriv, allocated T+1 vector + """ + T = sslm.num_time_slices fwd_variance = sslm.variance[word] From 4cd9f5379e732abc9fda0e06c80ffa2f19963301 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Tue, 9 Aug 2016 15:59:10 +0530 Subject: [PATCH 27/38] More docstrings --- gensim/models/ldaseqmodel.py | 174 +++++++++++++++++++++-------------- 1 file changed, 106 insertions(+), 68 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index b6ea45e30b..4d4364cefd 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -8,11 +8,10 @@ """ This is the class which is used to help with Dynamic Topic Modelling of a corpus. -It is a work in progress and will change largely throughout the course of development. -Inspired by the Blei's original DTM code and paper. TODO: add links +Inspired by the Blei's original DTM code and paper. +DTM C/C++ code: https://github.com/blei-lab/dtm +DTM Paper: https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf -As of now, the LdaSeqModel and SSLM classes mimic the structures of the same name in the Blei DTM code. -Few mathematical helper functions will be made and tested. """ @@ -27,13 +26,13 @@ class seq_corpus(utils.SaveLoad): """ - seq_corpus is basically a wrapper class which contains information about the corpus. - vocab_len is the length of the vocabulary. - max_doc_len is the maximum number of terms a single document has. - num_time_slices is the number of sequences, i.e number of time-slices. - corpus_len is the number of documents present. - time_slice is a list or numpy array which the user must provide which contains the number of documents in each time-slice. - corpus is any iterable gensim corpus. + `seq_corpus` is basically a wrapper class which contains information about the corpus. + `vocab_len` is the length of the vocabulary. + `max_doc_len` is the maximum number of terms a single document has. + `num_time_slices` is the number of sequences, i.e number of time-slices. + `corpus_len` is the number of documents present. + `time_slice` is a list or numpy array which the user must provide which contains the number of documents in each time-slice. + `corpus` is any iterable gensim corpus. """ def __init__(self, corpus=None, time_slice=None, id2word=None): @@ -136,12 +135,12 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ class sslm(utils.SaveLoad): """ - obs values contain the doc - topic ratios - e_log_prob contains topic - word ratios - mean, fwd_mean contains the mean values to be used for inference for each word for a time_slice - variance, fwd_variance contains the variance values to be used for inference for each word in a time_slice - fwd_mean, fwd_variance are the forward posterior values. - zeta is an extra variational parameter with a value for each time-slice + `obs` values contain the doc - topic ratios + `e_log_prob` contains topic - word ratios + `mean`, `fwd_mean` contains the mean values to be used for inference for each word for a time_slice + `variance`, `fwd_variance` contains the variance values to be used for inference for each word in a time_slice + `fwd_mean`, `fwd_variance` are the forward posterior values. 
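+ A quick shape check with hypothetical sizes (10 vocabulary terms, 3 time-slices; illustration only, not from the patch): + >>> chain = sslm(vocab_len=10, num_time_slices=3, num_topics=2) + >>> chain.obs.shape, chain.mean.shape, chain.zeta.shape + ((10, 3), (10, 4), (3,)) + mean and variance carry one extra column because index 0 holds the prior state.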
+ `zeta` is an extra variational parameter with a value for each time-slice """ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): @@ -172,10 +171,10 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va class Lda_Post(utils.SaveLoad): - """ - Posterior values associated with each set of documents document """ + Posterior values associated with each set of documents. + """ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): @@ -195,12 +194,18 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= def update_zeta(sslm): + """ - Update Zeta Variational Parameter. + Updates the Zeta Variational Parameter. + Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. + Zeta is obtained as a term which we obtain after solving the equation to maximize the lower bound as a function of the variational parameters (mean and variance). """ + vocab_len = sslm.vocab_len num_time_slices = sslm.num_time_slices - zeta = numpy.zeros(len(sslm.zeta)) + sslm.zeta.fill(0) + + zeta = sslm.zeta for i in range(0, vocab_len): for j in range(0, num_time_slices): @@ -215,12 +220,21 @@ def update_zeta(sslm): def compute_post_variance(word, sslm, chain_variance): """ - compute Var[\beta_{t,w}] for t = 1:T + Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance + Computes Var[\beta_{t,w}] for t = 1:T; the recursions below match the forward and backward passes in the code: + + Fwd_Variance(t) ≡ E((beta_{t,w} − fwd_mean_{t,w})^2 | beta_{1:t}) + = (obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance)) * (fwd_variance[t - 1] + chain_variance) + + Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 | beta_{1:T}) + = (chain_variance / (fwd_variance[t] + chain_variance)) * (variance[t + 1] - chain_variance) + (1 - chain_variance / (fwd_variance[t] + chain_variance)) * fwd_variance[t] + """ T = sslm.num_time_slices - variance = numpy.copy(sslm.variance[word]) - fwd_variance = numpy.copy(sslm.fwd_variance[word]) + variance = sslm.variance[word] + fwd_variance = sslm.fwd_variance[word] # forward pass. Set initial variance very high fwd_variance[0] = chain_variance * 1000 @@ -230,7 +244,7 @@ def compute_post_variance(word, sslm, chain_variance): w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance) else: w = 0 - fwd_variance[t] = w * (fwd_variance[t-1] + chain_variance) + fwd_variance[t] = w * (fwd_variance[t - 1] + chain_variance) # backward pass variance[T] = fwd_variance[T] @@ -248,15 +262,26 @@ def compute_post_variance(word, sslm, chain_variance): def compute_post_mean(word, sslm, chain_variance): """ - forward-backward to compute E[\beta_{t,w}] for t = 1:T + Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute mean for, along with the associated sslm class object, and returns mean and fwd_mean + Essentially a forward-backward to compute E[\beta_{t,w}] for t = 1:T.
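+ The forward pass below is the filtering step of a variational Kalman filter and the backward pass smooths those estimates. Matching the code, the weights work out to fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] with w = obs_variance / (fwd_variance[t - 1] + chain_variance + obs_variance), and mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] with w = chain_variance / (fwd_variance[t] + chain_variance); the formulas below paraphrase this.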
+ + Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta + + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + """ + T = sslm.num_time_slices + obs = sslm.obs[word] fwd_variance = sslm.fwd_variance[word] - mean = numpy.copy(sslm.mean[word]) - fwd_mean = numpy.copy(sslm.fwd_mean[word]) + mean = sslm.mean[word] + fwd_mean = sslm.fwd_mean[word] # forward fwd_mean[0] = 0 @@ -276,10 +301,14 @@ def compute_post_mean(word, sslm, chain_variance): return mean, fwd_mean -def update_phi(doc, time, lda_post, ldaseq, g): +def update_phi(doc, time, lda_post): """ - Update variational multinomial parameters + Update variational multinomial parameters, based on a document and a time-slice. + This is done based on the original Blei-LDA paper, where: + log_phi := beta * exp(Ψ(gamma)), over every topic for every word. + + TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. """ K = lda_post.lda.num_topics @@ -312,11 +341,14 @@ def update_phi(doc, time, lda_post, ldaseq, g): lda_post.log_phi[n] = log_phi_row lda_post.phi[n] = phi_row - return + return lda_post.phi, lda_post.log_phi def update_gamma(lda_post): + """ - update variational dirichlet parameters + update variational dirichlet parameters as described in the original Blei LDA paper: + gamma = alpha + sum(phi), over every topic for every word. + """ K = lda_post.lda.num_topics @@ -331,11 +363,14 @@ def update_gamma(lda_post): for k in range(0, K): lda_post.gamma[k] += phi_row[k] * count - return + return lda_post.gamma + def compute_expected_log_prob(sslm): """ Compute the expected log probability given values of m. + The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; + The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. """ W = sslm.vocab_len @@ -343,7 +378,8 @@ def compute_expected_log_prob(sslm): for t in range(0, T): for w in range(0, W): sslm.e_log_prob[w][t] = sslm.mean[w][t + 1] - numpy.log(sslm.zeta[t]) - return + + return sslm.e_log_prob def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): @@ -379,13 +415,14 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): sslm.mean, sslm.fwd_mean = compute_post_mean(w, sslm, sslm.chain_variance) sslm.zeta = update_zeta(sslm) - compute_expected_log_prob(sslm) + sslm.e_log_prob = compute_expected_log_prob(sslm) def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): """ Method to initialize State Space Language Model, topic wise. """ + ldaseq.alphas = alpha for k in range(0, ldaseq.num_topics): @@ -473,9 +510,11 @@ def fit_lda_seq(ldaseq, seq_corpus): def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter): + """ set up lda model to use for inferDTMseq """ + K = ldaseq.num_topics W = ldaseq.vocab_len bound = 0.0 @@ -496,20 +535,15 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, la def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound): """ - compute the likelihood of a sequential corpus under an LDA seq model. return the likelihood bound. 
+ compute the likelihood of a sequential corpus under an LDA seq model. return the likelihood bound. """ - def cumsum(it): - total = 0 - for x in it: - total += x - yield total doc_index = 0 t = 0 d = 0 make_lda_seq_slice(lda, ldaseq, t) - time_slice = list(cumsum(ldaseq.time_slice)) + time_slice = numpy.cumsum(numpy.array(ldaseq.time_slice)) for line_no, line in enumerate(seq_corpus.corpus): if doc_index > time_slice[t]: @@ -543,7 +577,7 @@ def cumsum(it): if topic_suffstats != None: - update_lda_seq_ss(t, doc, lda_post, topic_suffstats) + topic_suffstats = update_lda_seq_ss(t, doc, lda_post, topic_suffstats) bound += doc_lhood doc_index += 1 @@ -575,14 +609,14 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 # first iteration starts here iter_ += 1 lhood_old = lhood - update_gamma(lda_post) + lda_post.gamma = update_gamma(lda_post) model = "DTM" if model == "DTM" or sslm is None: - update_phi(doc_number, time, lda_post, sslm, g) + lda_post.phi, lda_post.log_phi = update_phi(doc_number, time, lda_post) elif model == "DIM" and sslm is not None: - update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) + lda_post.phi, lda_post.log_phi = update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = compute_lda_lhood(lda_post) converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) @@ -592,13 +626,13 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 iter_ += 1 lhood_old = lhood - update_gamma(lda_post) + lda_post.gamma = update_gamma(lda_post) model = "DTM" if model == "DTM" or sslm is None: - update_phi(doc_number, time, lda_post, sslm, g) + lda_post.phi, lda_post.log_phi = update_phi(doc_number, time, lda_post) elif model == "DIM" and sslm is not None: - update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) + lda_post.phi, lda_post.log_phi = update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = compute_lda_lhood(lda_post) converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) @@ -607,6 +641,7 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 def make_lda_seq_slice(lda, ldaseq, time): + """ set up the LDA model topic-word values with that of ldaseq. """ @@ -620,6 +655,7 @@ def make_lda_seq_slice(lda, ldaseq, time): return def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): + """ Update lda sequence sufficient statistics from an lda posterior. """ @@ -636,11 +672,12 @@ def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): topic_suffstats[k] = topic_ss - return + return topic_suffstats def init_lda_post(lda_post): + """ - Initialize variational posterior. + Initialize variational posterior, does not return anything. """ K = lda_post.lda.num_topics N = lda_post.doc.nterms @@ -710,9 +747,11 @@ def fit_lda_seq_topics(ldaseq, topic_suffstats): return lhood def fit_sslm(sslm, counts): + """ - Fit variational distribution + Fit variational distribution. 
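+ It runs coordinate ascent: update_obs re-optimises the observed word values, compute_bound re-evaluates the bound, and iteration stops once the relative change in the bound falls below sslm_fit_threshold (1e-6) or sslm_max_iter is reached.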
""" + W = sslm.vocab_len bound = 0 old_bound = 0 @@ -724,8 +763,9 @@ def fit_sslm(sslm, counts): for w in range(0, W): sslm.variance, sslm.fwd_variance = compute_post_variance(w, sslm, sslm.chain_variance) - - totals = col_sum(counts, totals) + + # column sum of counts + totals = counts.sum(axis=0) iter_ = 0 model = "DTM" @@ -739,7 +779,7 @@ def fit_sslm(sslm, counts): while converged > sslm_fit_threshold and iter_ < sslm_max_iter: iter_ += 1 old_bound = bound - update_obs(counts, totals, sslm) + sslm.obs = update_obs(counts, totals, sslm) if model == "DTM": @@ -751,22 +791,16 @@ def fit_sslm(sslm, counts): print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged) - compute_expected_log_prob(sslm) + sslm.e_log_prob = compute_expected_log_prob(sslm) return bound -def col_sum(matrix, vector): - - for i in range(0, matrix.shape[0]): - for j in range(0, matrix.shape[1]): - vector[j] = vector[j] + matrix[i][j] - - return vector - def compute_bound(word_counts, totals, sslm): + """ - Compute log probability bound + Compute log probability bound. + Forumula is as described in appendix of DTM. """ W = sslm.vocab_len T = sslm.num_time_slices @@ -817,6 +851,7 @@ def compute_bound(word_counts, totals, sslm): def update_obs(word_counts, totals, sslm): + """ Fucntion to perform optimization """ @@ -871,14 +906,16 @@ def update_obs(word_counts, totals, sslm): sslm.obs[w] = obs sslm.zeta = update_zeta(sslm) - return + + return sslm.obs def compute_mean_deriv(word, time, sslm, deriv): """ - compute d E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. - put the result in deriv, allocated T+1 vector + Used in helping find the optimum function. + computes derivative of E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. + put the result in deriv, allocated T+1 vector """ T = sslm.num_time_slices @@ -955,6 +992,7 @@ def f_obs(x, *args): term1 = 0.0 final = -(term1 + term2 + term3 + term4) + return final From 31fb790ec6620de2151127858bbff9868c415f7f Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Wed, 10 Aug 2016 15:44:15 +0530 Subject: [PATCH 28/38] Incorporated suggestions --- gensim/models/ldaseqmodel.py | 296 ++++++++++++++++++++--------------- 1 file changed, 167 insertions(+), 129 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 4d4364cefd..df62c91ad2 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -65,6 +65,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None): max_doc_len = len(line) self.max_doc_len = max_doc_len +# endclass seq_corpus class Doc(utils.SaveLoad): """ @@ -78,23 +79,48 @@ def __init__(self, nterms=None, word=None, count=None, total=None): self.count = count self.total = total +# endclass Doc class LdaSeqModel(utils.SaveLoad): """ - Class which contains information of our whole DTM model. - Topic chains contains for each topic a 'state space language model' object which in turn has information about each topic - the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. - `alphas` is a prior of your choice and should be a double or float value. default is 0.01 - `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. - if `initalize` is 'blei-lda', then we will use the python port of blei's oriignal LDA code. - You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats. 
- If you wish to just pass a previously used LDA model, pass it through `lda_model` - Shape of sstats is (vocab_len, num_topics) + The constructor estimates Dynamic Topic Model parameters based + on a training corpus. + If we have 30 documents, with 5 in the first time-slice, 10 in the second, and 15 in the third, we would + set up our model like this: + + >>> ldaseq = LdaSeqModel(corpus=corpus, time_slice=[5, 10, 15], num_topics=5) + + Model persistency is achieved through inheriting utils.SaveLoad. + + >>> ldaseq.save("ldaseq") + + saves the model to disk. """ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005): - + """ + `corpus` is any iterable gensim corpus + + `time_slice` as described above is a list which contains the number of documents in each time-slice + + `id2word` is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size and for printing topics. + + `alphas` is a prior of your choice and should be a double or float value. default is 0.01 + + `num_topics` is the number of requested latent topics to be extracted from the training corpus. + + `initialize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. + if `initialize` is 'blei-lda', then we will use the python port of blei's original LDA code. + You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats. + If you wish to just pass a previously used LDA model, pass it through `lda_model` + Shape of sstats is (vocab_len, num_topics) + + `chain_variance` is a constant which dictates how the beta values evolve - it is the variance of the Gaussian random walk that the + beta values follow over time. + + """ + if corpus is not None: self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice) self.vocab_len = len(self.corpus.id2word) @@ -104,6 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.num_time_slices = len(time_slice) self.alphas = numpy.full(num_topics, alphas) + #topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic + #the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. self.topic_chains = [] for topic in range(0, num_topics): sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) self.topic_chains.append(sslm_) @@ -132,6 +160,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # fit DTM fit_lda_seq(self, self.corpus) +# endclass LdaSeqModel class sslm(utils.SaveLoad): """ @@ -169,8 +198,9 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va self.w_phi_l_sq = None self.m_update_coeff_g = None +# endclass sslm -class Lda_Post(utils.SaveLoad): +class LdaPost(utils.SaveLoad): """ Posterior values associated with each set of documents. """ @@ -192,8 +222,29 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.doc_weight = None self.renormalized_doc_weight = None +# endclass LdaPost def update_zeta(sslm): """ Updates the Zeta Variational Parameter. Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice.
- Zeta is obtained as a term which we obtain after solving the equation to maximize the lower bound as a function of the variational parameters (mean and variance). + It is the value of variational parameter zeta which maximizes the lower bound. """ vocab_len = sslm.vocab_len num_time_slices = sslm.num_time_slices sslm.zeta.fill(0) - zeta = sslm.zeta - for i in range(0, vocab_len): for j in range(0, num_time_slices): m = sslm.mean[i][j + 1] v = sslm.variance[i][j + 1] val = numpy.exp(m + v/2) - zeta[j] = zeta[j] + val + sslm.zeta[j] = sslm.zeta[j] + val - return zeta + return sslm.zeta def compute_post_variance(word, sslm, chain_variance): @@ -231,13 +260,14 @@ def compute_post_variance(word, sslm, chain_variance): = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) """ + INIT_VARIANCE = 1000 T = sslm.num_time_slices variance = sslm.variance[word] fwd_variance = sslm.fwd_variance[word] # forward pass. Set initial variance very high - fwd_variance[0] = chain_variance * 1000 + fwd_variance[0] = chain_variance * INIT_VARIANCE for t in range(1, T + 1): if sslm.obs_variance: @@ -301,7 +331,7 @@ def compute_post_mean(word, sslm, chain_variance): return mean, fwd_mean -def update_phi(doc, time, lda_post): +def update_phi(doc, time, ldapost): """ Update variational multinomial parameters, based on a document and a time-slice. @@ -311,21 +341,21 @@ def update_phi(doc, time, lda_post): TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. """ - K = lda_post.lda.num_topics - N = lda_post.doc.nterms + K = ldapost.lda.num_topics + N = ldapost.doc.nterms dig = numpy.zeros(K) for k in range(0, K): - dig[k] = digamma(lda_post.gamma[k]) + dig[k] = digamma(ldapost.gamma[k]) for n in range(0, N): - w = lda_post.doc.word[n] + w = ldapost.doc.word[n] for k in range(0, K): - lda_post.log_phi[n][k] = dig[k] + lda_post.lda.topics[w][k] + ldapost.log_phi[n][k] = dig[k] + ldapost.lda.topics[w][k] - log_phi_row = lda_post.log_phi[n] - phi_row = lda_post.phi[n] + log_phi_row = ldapost.log_phi[n] + phi_row = ldapost.phi[n] # log normalize v = log_phi_row[0] @@ -338,12 +368,12 @@ def update_phi(doc, time, lda_post): for k in range(0, K): phi_row[k] = numpy.exp(log_phi_row[k]) - lda_post.log_phi[n] = log_phi_row - lda_post.phi[n] = phi_row + ldapost.log_phi[n] = log_phi_row + ldapost.phi[n] = phi_row - return lda_post.phi, lda_post.log_phi + return ldapost.phi, ldapost.log_phi -def update_gamma(lda_post): +def update_gamma(ldapost): """ update variational dirichlet parameters as described in the original Blei LDA paper: @@ -351,19 +381,19 @@ def update_gamma(lda_post): """ - K = lda_post.lda.num_topics - N = lda_post.doc.nterms + K = ldapost.lda.num_topics + N = ldapost.doc.nterms - lda_post.gamma = numpy.copy(lda_post.lda.alpha) + ldapost.gamma = numpy.copy(ldapost.lda.alpha) for n in range(0, N): - phi_row = lda_post.phi[n] - count = lda_post.doc.count[n] + phi_row = ldapost.phi[n] + count = ldapost.doc.count[n] for k in range(0, K): - lda_post.gamma[k] += phi_row[k] * count + ldapost.gamma[k] += phi_row[k] * count - return lda_post.gamma + return ldapost.gamma def compute_expected_log_prob(sslm): @@ -425,7 +455,6 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init ldaseq.alphas = alpha for k in range(0, ldaseq.num_topics): - sstats = init_suffstats[:,k] sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, 
topic_chain_variance, sstats) @@ -453,8 +482,8 @@ def fit_lda_seq(ldaseq, seq_corpus): LDA_SEQ_MIN_ITER = 6 LDA_SEQ_MAX_ITER = 20 - K = ldaseq.num_topics - W = ldaseq.vocab_len + num_topics = ldaseq.num_topics + vocab_len = ldaseq.vocab_len data_len = seq_corpus.num_time_slices corpus_len = seq_corpus.corpus_len @@ -471,15 +500,16 @@ old_bound = bound # initiate sufficient statistics topic_suffstats = [] - for k in range(0, K): - topic_suffstats.append(numpy.resize(numpy.zeros(W * data_len), (W, data_len))) + for k in range(0, num_topics): + topic_suffstats.append(numpy.resize(numpy.zeros(vocab_len * data_len), (vocab_len, data_len))) # set up variables - gammas = numpy.resize(numpy.zeros(corpus_len * K), (corpus_len, K)) - lhoods = numpy.resize(numpy.zeros(corpus_len * K + 1), (corpus_len, K + 1)) + gammas = numpy.resize(numpy.zeros(corpus_len * num_topics), (corpus_len, num_topics)) + lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) # compute the likelihood of a sequential corpus under an LDA - # seq model and find the evidence lower bound. - bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter) + + # seq model and find the evidence lower bound. This is the E-Step + bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_) print ("M Step") @@ -509,47 +539,54 @@ return bound -def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter): +def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_): """ - set up lda model to use for inferDTMseq + Inference or E-Step. + This is used to set up the gensim LdaModel to be used for each time-slice. + It also allows for Document Influence Model code to be written in. """ - K = ldaseq.num_topics - W = ldaseq.vocab_len + num_topics = ldaseq.num_topics + vocab_len = ldaseq.vocab_len bound = 0.0 - lda = ldamodel.LdaModel(num_topics=K, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) - lda.topics = numpy.array(numpy.split(numpy.zeros(W * K), W)) - lda_post = Lda_Post(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda) + lda = ldamodel.LdaModel(num_topics=num_topics, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) + lda.topics = numpy.array(numpy.split(numpy.zeros(vocab_len * num_topics), vocab_len)) + ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=num_topics, lda=lda) model = "DTM" if model == "DTM": - inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound) + bound = inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost) elif model == "DIM": InfluenceTotalFixed(ldaseq, seq_corpus); - inferDIMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, last_iter, lda, lda_post, bound) + bound = inferDIMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost) return bound + +def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost): """ - compute the likelihood of a sequential corpus under an LDA seq model. return the likelihood bound. + Computes the likelihood of a sequential corpus under an LDA seq model, and returns the likelihood bound.
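+ The corpus is streamed document by document, and doc_index is checked against the cumulative time_slice boundaries (numpy.cumsum) to decide when to advance to the next time-slice and rebuild the slice-specific topics with make_lda_seq_slice.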
+ Need to pass the LdaSeq model, seq_corpus, sufficient stats, gammas and lhoods matrices previously created, + and LdaModel and LdaPost class objects. """ - doc_index = 0 - t = 0 - d = 0 - make_lda_seq_slice(lda, ldaseq, t) + doc_index = 0 # overall doc_index in corpus + time = 0 # current time-slice + doc_num = 0 # doc-index in current time-lice + num_topics = ldaseq.num_topics + make_lda_seq_slice(lda, ldaseq, time) # create lda_seq slice time_slice = numpy.cumsum(numpy.array(ldaseq.time_slice)) for line_no, line in enumerate(seq_corpus.corpus): + # this is used to update the time_slice and create a new lda_seq slice every new time_slice if doc_index > time_slice[t]: - t += 1 - make_lda_seq_slice(lda, ldaseq, t) - d = 0 + time += 1 + make_lda_seq_slice(lda, ldaseq, time) + doc_num = 0 gam = gammas[doc_index] lhood = lhoods[doc_index] @@ -566,26 +603,26 @@ def inferDTMseq(K, ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_, l totals += int(count) doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) - lda_post.gamma = gam - lda_post.lhood = lhood - lda_post.doc = doc + ldapost.gamma = gam + ldapost.lhood = lhood + ldapost.doc = doc if iter_ == 0: - doc_lhood = fit_lda_post(d, t, lda_post, None, None, None, None, None) + doc_lhood = fit_lda_post(doc_num, time, ldapost, None, None, None, None, None) else: - doc_lhood = fit_lda_post(d, t, lda_post, ldaseq, None, None, None, None) + doc_lhood = fit_lda_post(doc_num, time, ldapost, ldaseq, None, None, None, None) if topic_suffstats != None: - topic_suffstats = update_lda_seq_ss(t, doc, lda_post, topic_suffstats) + topic_suffstats = update_lda_seq_ss(time, doc, ldapost, topic_suffstats) bound += doc_lhood doc_index += 1 - d += 1 + doc_num += 1 - return + return bound -def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): +def fit_lda_post(doc_number, time, ldapost, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): """ Posterior inference for lda. 
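For orientation before the hunks that follow: fit_lda_post is plain coordinate ascent on the variational parameters, alternating the closed-form gamma and phi updates until the relative change in the likelihood bound is small. A minimal standalone sketch of that loop (illustrative names, not part of this patch):

    import numpy

    def fit_posterior_sketch(update_gamma, update_phi, compute_lhood, doc_total, max_iter=25, tol=1e-8):
        # alternate the two closed-form updates until the relative change
        # in the likelihood bound, scaled by the document length, is below tol
        lhood = compute_lhood()
        for _ in range(max_iter):
            lhood_old = lhood
            update_gamma()   # gamma_k = alpha_k + sum_n count_n * phi_nk
            update_phi()     # log phi_nk = digamma(gamma_k) + log beta_{k, w_n}, then log-normalize
            lhood = compute_lhood()
            if numpy.fabs((lhood_old - lhood) / (lhood_old * doc_total)) < tol:
                break
        return lhood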
@@ -594,14 +631,14 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 LDA_INFERENCE_CONVERGED = 1e-8 LDA_INFERENCE_MAX_ITER = 25 - init_lda_post(lda_post) + init_lda_post(ldapost) model = "DTM" if model == "DIM": # if in DIM then we initialise some variables here pass - lhood = compute_lda_lhood(lda_post) + lhood = compute_lda_lhood(ldapost) lhood_old = 0 converged = 0 iter_ = 0 @@ -609,33 +646,33 @@ def fit_lda_post(doc_number, time, lda_post, ldaseq, g, g3_matrix, g4_matrix, g5 # first iteration starts here iter_ += 1 lhood_old = lhood - lda_post.gamma = update_gamma(lda_post) + ldapost.gamma = update_gamma(ldapost) model = "DTM" if model == "DTM" or sslm is None: - lda_post.phi, lda_post.log_phi = update_phi(doc_number, time, lda_post) + ldapost.phi, ldapost.log_phi = update_phi(doc_number, time, ldapost) elif model == "DIM" and sslm is not None: - lda_post.phi, lda_post.log_phi = update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) + ldapost.phi, ldapost.log_phi = update_phi_fixed(doc_number, time, ldapost, sslm, g3_matrix, g4_matrix, g5_matrix) - lhood = compute_lda_lhood(lda_post) - converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) + lhood = compute_lda_lhood(ldapost) + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * ldapost.doc.total)) while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: iter_ += 1 lhood_old = lhood - lda_post.gamma = update_gamma(lda_post) + ldapost.gamma = update_gamma(ldapost) model = "DTM" if model == "DTM" or sslm is None: - lda_post.phi, lda_post.log_phi = update_phi(doc_number, time, lda_post) + ldapost.phi, ldapost.log_phi = update_phi(doc_number, time, ldapost) elif model == "DIM" and sslm is not None: - lda_post.phi, lda_post.log_phi = update_phi_fixed(doc_number, time, lda_post, sslm, g3_matrix, g4_matrix, g5_matrix) + ldapost.phi, ldapost.log_phi = update_phi_fixed(doc_number, time, ldapost, sslm, g3_matrix, g4_matrix, g5_matrix) - lhood = compute_lda_lhood(lda_post) - converged = numpy.fabs((lhood_old - lhood) / (lhood_old * lda_post.doc.total)) + lhood = compute_lda_lhood(ldapost) + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * ldapost.doc.total)) return lhood @@ -646,66 +683,67 @@ def make_lda_seq_slice(lda, ldaseq, time): set up the LDA model topic-word values with that of ldaseq. """ - K = ldaseq.num_topics - for k in range(0, K): + num_topics = ldaseq.num_topics + for k in range(0, num_topics): lda.topics[:,k] = numpy.copy(ldaseq.topic_chains[k].e_log_prob[:,time]) lda.alpha = numpy.copy(ldaseq.alphas) return -def update_lda_seq_ss(time, doc, lda_post, topic_suffstats): +def update_lda_seq_ss(time, doc, ldapost, topic_suffstats): """ Update lda sequence sufficient statistics from an lda posterior. """ - K = numpy.shape(lda_post.phi)[1] - N = doc.nterms + num_topics = numpy.shape(ldapost.phi)[1] + nterms = doc.nterms - for k in range(0, K): + for k in range(0, num_topics): topic_ss = topic_suffstats[k] - for n in range(0, N): + for n in range(0, nterms): w = doc.word[n] c = doc.count[n] - topic_ss[w][time] = topic_ss[w][time] + c * lda_post.phi[n][k] + topic_ss[w][time] = topic_ss[w][time] + c * ldapost.phi[n][k] topic_suffstats[k] = topic_ss return topic_suffstats -def init_lda_post(lda_post): +def init_lda_post(ldapost): """ Initialize variational posterior, does not return anything. 
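 For example, for a document with total word count 10 and num_topics 5 (illustrative numbers), each gamma[k] starts at alpha[k] + 10.0 / 5 and every phi[n][k] at 1.0 / 5, i.e. a uniform soft topic assignment before inference.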
""" - K = lda_post.lda.num_topics - N = lda_post.doc.nterms - for k in range(0, K): - lda_post.gamma[k] = lda_post.lda.alpha[k] + float(lda_post.doc.total) / K - for n in range(0, N): - lda_post.phi[n][k] = 1.0 / K + num_topics = ldapost.lda.num_topics + nterms = ldapost.doc.nterms + + for k in range(0, num_topics): + ldapost.gamma[k] = ldapost.lda.alpha[k] + float(ldapost.doc.total) / K + for n in range(0, nterms): + ldapost.phi[n][k] = 1.0 / K # doc_weight used during DIM - # lda_post.doc_weight = None + # ldapost.doc_weight = None return -def compute_lda_lhood(lda_post): +def compute_lda_lhood(ldapost): """ compute the likelihood bound """ - K = lda_post.lda.num_topics - N = lda_post.doc.nterms - gamma_sum = numpy.sum(lda_post.gamma) + K = ldapost.lda.num_topics + N = ldapost.doc.nterms + gamma_sum = numpy.sum(ldapost.gamma) # TODO: flags FLAGS_sigma_l = 0 FLAGS_sigma_d = 0 - lhood = math.lgamma(numpy.sum(lda_post.lda.alpha)) - math.lgamma(gamma_sum) - lda_post.lhood[K] = lhood + lhood = math.lgamma(numpy.sum(ldapost.lda.alpha)) - math.lgamma(gamma_sum) + ldapost.lhood[K] = lhood # influence_term = 0 digsum = digamma(gamma_sum) @@ -713,18 +751,18 @@ def compute_lda_lhood(lda_post): model = "DTM" for k in range(0, K): # below code only to be used in DIM mode - # if lda_post.doc_weight is not None and (model == "DIM" or model == "fixed"): - # influence_topic = lda_post.doc_weight[k] + # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): + # influence_topic = ldapost.doc_weight[k] # influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) - e_log_theta_k = digamma(lda_post.gamma[k]) - digsum - lhood_term = (lda_post.lda.alpha[k] - lda_post.gamma[k]) * e_log_theta_k + math.lgamma(lda_post.gamma[k]) - math.lgamma(lda_post.lda.alpha[k]) + e_log_theta_k = digamma(ldapost.gamma[k]) - digsum + lhood_term = (ldapost.lda.alpha[k] - ldapost.gamma[k]) * e_log_theta_k + math.lgamma(ldapost.gamma[k]) - math.lgamma(ldapost.lda.alpha[k]) for n in range(0, N): - if lda_post.phi[n][k] > 0: - lhood_term += lda_post.doc.count[n] * lda_post.phi[n][k] * (e_log_theta_k + lda_post.lda.topics[lda_post.doc.word[n]][k] - lda_post.log_phi[n][k]) + if ldapost.phi[n][k] > 0: + lhood_term += ldapost.doc.count[n] * ldapost.phi[n][k] * (e_log_theta_k + ldapost.lda.topics[ldapost.doc.word[n]][k] - ldapost.log_phi[n][k]) - lda_post.lhood[k] = lhood_term + ldapost.lhood[k] = lhood_term lhood += lhood_term # lhood += influence_term @@ -779,7 +817,7 @@ def fit_sslm(sslm, counts): while converged > sslm_fit_threshold and iter_ < sslm_max_iter: iter_ += 1 old_bound = bound - sslm.obs = update_obs(counts, totals, sslm) + sslm.obs, sslm.zeta = update_obs(counts, totals, sslm) if model == "DTM": @@ -886,7 +924,7 @@ def update_obs(word_counts, totals, sslm): for t in range(0, T): mean_deriv = mean_deriv_mtx[t] - compute_mean_deriv(w, t, sslm, mean_deriv) + mean_deriv = compute_mean_deriv(w, t, sslm, mean_deriv) mean_deriv_mtx[t] = mean_deriv deriv = numpy.zeros(T) @@ -906,8 +944,8 @@ def update_obs(word_counts, totals, sslm): sslm.obs[w] = obs sslm.zeta = update_zeta(sslm) - - return sslm.obs + + return sslm.obs, sslm.zeta def compute_mean_deriv(word, time, sslm, deriv): @@ -943,7 +981,7 @@ def compute_mean_deriv(word, time, sslm, deriv): w = sslm.chain_variance / (fwd_variance[t] + sslm.chain_variance) deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] - return + return deriv def f_obs(x, *args): @@ -1047,7 +1085,7 @@ def 
compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): deriv[t] = term1 + term2 + term3 + term4 - return + return deriv def df_obs(x, *args): @@ -1058,9 +1096,9 @@ def df_obs(x, *args): model = "DTM" if model == "DTM": - compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv) + deriv = compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv) elif model == "DIM": - compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) + deriv = compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) return numpy.negative(deriv) @@ -1157,9 +1195,9 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): if lda_ss is not None: lda_ss.fill(0) - lda_post = Lda_Post(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda_model) - lda_post.gamma = numpy.zeros(K) - lda_post.lhood = numpy.zeros(K + 1) + ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda_model) + ldapost.gamma = numpy.zeros(K) + ldapost.lhood = numpy.zeros(K + 1) lhood = 0 @@ -1177,18 +1215,18 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): totals += int(count) doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) - lda_post.doc = doc - lhood += fit_lda_post(d, 0, lda_post, None, None, None, None, None) + ldapost.doc = doc + lhood += fit_lda_post(d, 0, ldapost, None, None, None, None, None) if lda_ss is not None: for k in range(0, K): - for n in range(0, lda_post.doc.nterms): - lda_ss[lda_post.doc.word[n]][k] += lda_post.phi[n][k] * lda_post.doc.count[n] + for n in range(0, ldapost.doc.nterms): + lda_ss[ldapost.doc.word[n]][k] += ldapost.phi[n][k] * ldapost.doc.count[n] return lhood -def print_topics(ldaseq, topic, time, top_terms): +def print_topics(ldaseq, topic, time=0, top_terms=20): """ Topic is the topic numner Time is for a particular time_slice From 5777cb35de3772354f536c77ea6f832f4dbab1b0 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Wed, 10 Aug 2016 18:29:51 +0530 Subject: [PATCH 29/38] Added doc-topic --- gensim/models/ldaseqmodel.py | 125 +++++++++++++++++++++-------------- 1 file changed, 77 insertions(+), 48 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index df62c91ad2..b71ee8bf68 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -72,7 +72,16 @@ class Doc(utils.SaveLoad): The doc class contains information used for each document. 
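 For example (illustrative), the bag-of-words document [(0, 2), (5, 1)] unpacks to words [0, 5], counts [2, 1] and total 3, which is what the rewritten constructor below computes from `doc`.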
""" - def __init__(self, nterms=None, word=None, count=None, total=None): + def __init__(self, doc=None): + + nterms = len(doc) + words = [] + counts = [] + totals = 0 + for word_id, count in doc: + words.append(int(word_id)) + counts.append(int(count)) + totals += int(count) self.nterms = nterms self.word = word @@ -125,7 +134,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice) self.vocab_len = len(self.corpus.id2word) - + self.time_slice = time_slice self.num_topics = num_topics self.num_time_slices = len(time_slice) self.alphas = numpy.full(num_topics, alphas) @@ -212,6 +221,10 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= self.lda = lda self.gamma = gamma self.lhood = lhood + if self.gamma = None: + self.gamma = numpy.zeros(num_topics) + if self.lhood = None: + self.lhood = numpy.zeros(num_topics) if max_doc_len is not None and num_topics is not None: self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) @@ -260,14 +273,14 @@ def compute_post_variance(word, sslm, chain_variance): = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) """ - INIT_VARIANCE = 1000 + INIT_VARIANCE_CONST = 1000 T = sslm.num_time_slices variance = sslm.variance[word] fwd_variance = sslm.fwd_variance[word] # forward pass. Set initial variance very high - fwd_variance[0] = chain_variance * INIT_VARIANCE + fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST for t in range(1, T + 1): if sslm.obs_variance: @@ -439,14 +452,15 @@ def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): # compute post variance for w in range(0, W): - sslm.variance, sslm.fwd_variance = compute_post_variance(w, sslm, sslm.chain_variance) + sslm.variance[w], sslm.fwd_variance[w] = compute_post_variance(w, sslm, sslm.chain_variance) for w in range(0, W): - sslm.mean, sslm.fwd_mean = compute_post_mean(w, sslm, sslm.chain_variance) + sslm.mean[w], sslm.fwd_mean[w] = compute_post_mean(w, sslm, sslm.chain_variance) sslm.zeta = update_zeta(sslm) sslm.e_log_prob = compute_expected_log_prob(sslm) + def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): """ @@ -463,6 +477,7 @@ def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + def fit_lda_seq(ldaseq, seq_corpus): """ fit an lda sequence model: @@ -500,7 +515,7 @@ def fit_lda_seq(ldaseq, seq_corpus): old_bound = bound # initiate sufficient statistics topic_suffstats = [] - for num_topics in range(0, num_topics): + for topic in range(0, num_topics): topic_suffstats.append(numpy.resize(numpy.zeros(vocab_len * data_len), (vocab_len, data_len))) # set up variables @@ -509,7 +524,8 @@ def fit_lda_seq(ldaseq, seq_corpus): # compute the likelihood of a sequential corpus under an LDA # seq model and find the evidence lower bound. 
This is the E - Step - bound = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_) + bound, gammas = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_) + ldaseq.gammas = gammas print ("M Step") @@ -557,15 +573,15 @@ def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_): model = "DTM" if model == "DTM": - bound = inferDTMseq(daseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost) + bound, gammas = inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) elif model == "DIM": InfluenceTotalFixed(ldaseq, seq_corpus); - bound = inferDIMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost) + bound, gammas = inferDIMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) - return bound + return bound, gammas -def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost): +def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound): """ Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. @@ -583,7 +599,7 @@ def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapos for line_no, line in enumerate(seq_corpus.corpus): # this is used to update the time_slice and create a new lda_seq slice every new time_slice - if doc_index > time_slice[t]: + if doc_index > time_slice[time]: time += 1 make_lda_seq_slice(lda, ldaseq, time) doc_num = 0 @@ -591,18 +607,7 @@ def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapos gam = gammas[doc_index] lhood = lhoods[doc_index] - doc_ = line - - nterms = len(doc_) - words = [] - counts = [] - totals = 0 - for word_id, count in doc_: - words.append(int(word_id)) - counts.append(int(count)) - totals += int(count) - - doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + doc = Doc(doc=line) ldapost.gamma = gam ldapost.lhood = lhood ldapost.doc = doc @@ -620,7 +625,8 @@ def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapos doc_index += 1 doc_num += 1 - return bound + return bound, gammas + def fit_lda_post(doc_number, time, ldapost, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): @@ -691,6 +697,7 @@ def make_lda_seq_slice(lda, ldaseq, time): return + def update_lda_seq_ss(time, doc, ldapost, topic_suffstats): """ @@ -711,6 +718,7 @@ def update_lda_seq_ss(time, doc, ldapost, topic_suffstats): return topic_suffstats + def init_lda_post(ldapost): """ @@ -720,15 +728,15 @@ def init_lda_post(ldapost): nterms = ldapost.doc.nterms for k in range(0, num_topics): - ldapost.gamma[k] = ldapost.lda.alpha[k] + float(ldapost.doc.total) / K + ldapost.gamma[k] = ldapost.lda.alpha[k] + float(ldapost.doc.total) / num_topics for n in range(0, nterms): - ldapost.phi[n][k] = 1.0 / K + ldapost.phi[n][k] = 1.0 / num_topics # doc_weight used during DIM # ldapost.doc_weight = None - return + def compute_lda_lhood(ldapost): """ compute the likelihood bound @@ -784,6 +792,7 @@ def fit_lda_seq_topics(ldaseq, topic_suffstats): return lhood + def fit_sslm(sslm, counts): """ @@ -800,7 +809,7 @@ def fit_sslm(sslm, counts): totals = numpy.zeros(counts.shape[1]) for w in range(0, W): - sslm.variance, sslm.fwd_variance = compute_post_variance(w, sslm, sslm.chain_variance) + sslm.variance[w], sslm.fwd_variance[w] = compute_post_variance(w, sslm, sslm.chain_variance) # column sum of counts totals = counts.sum(axis=0) @@ -853,7 +862,7 @@ def 
compute_bound(word_counts, totals, sslm): chain_variance = sslm.chain_variance for w in range(0, W): - sslm.mean, sslm.fwd_mean = compute_post_mean(w, sslm, chain_variance) + sslm.mean[w], sslm.fwd_mean[w] = compute_post_mean(w, sslm, chain_variance) sslm.zeta = update_zeta(sslm) @@ -898,7 +907,6 @@ def update_obs(word_counts, totals, sslm): STEP_SIZE = 0.01 TOL = 1e-3 - W = sslm.vocab_len T = sslm.num_time_slices @@ -983,6 +991,7 @@ def compute_mean_deriv(word, time, sslm, deriv): return deriv + def f_obs(x, *args): sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args @@ -999,7 +1008,7 @@ def f_obs(x, *args): term4 = 0 sslm.obs[word] = x - sslm.mean, sslm.fwd_mean = compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[word] = compute_post_mean(word, sslm, sslm.chain_variance) mean = sslm.mean[word] variance = sslm.variance[word] @@ -1087,12 +1096,13 @@ def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): return deriv + def df_obs(x, *args): sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args sslm.obs[word] = x - sslm.mean, sslm.fwd_mean = compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[word] = compute_post_mean(word, sslm, sslm.chain_variance) model = "DTM" if model == "DTM": @@ -1103,7 +1113,6 @@ def df_obs(x, *args): return numpy.negative(deriv) - # the following code replicates Blei's original LDA, ported to python. # idea is to let user initialise LDA sstats through this instead of gensim LDA if wanted. @@ -1118,6 +1127,7 @@ def lda_sstats(seq_corpus, num_topics, num_terms, alpha): return lda_ss + def initialize_ss_random(seq_corpus, num_topics): N = seq_corpus.num_terms @@ -1131,6 +1141,7 @@ def initialize_ss_random(seq_corpus, num_topics): return topic + def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): K = num_topics @@ -1155,6 +1166,7 @@ def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): return lhood + def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): LDA_EM_CONVERGED = 5e-5 @@ -1187,7 +1199,6 @@ def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): return lhood - def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): K = num_topics @@ -1203,18 +1214,7 @@ def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): for line_no, line in enumerate(seq_corpus.corpus): - doc_ = line - - nterms = len(doc_) - words = [] - counts = [] - totals = 0 - for word_id, count in doc_: - words.append(int(word_id)) - counts.append(int(count)) - totals += int(count) - - doc = Doc(word=words, count=counts, total=totals, nterms=int(nterms)) + doc = Doc(doc=line) ldapost.doc = doc lhood += fit_lda_post(d, 0, ldapost, None, None, None, None, None) @@ -1241,6 +1241,35 @@ def print_topics(ldaseq, topic, time=0, top_terms=20): return beststr +def doc_topics(ldaseq, doc_number): + + """ + On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, + it returns the doc-topic probabilities of that document. + """ + doc_topic = numpy.copy(ldaseq.gammas) + doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] + + return doc_topic[doc_number] + + +def get_item(ldaseq, doc): + """ + Analyse documents on trained DTM model. 
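+ It returns one likelihood bound per time-slice, so the best-fitting slice for the document can be read off with, e.g., numpy.argmax(time_lhoods).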
+ """ + + lda_model = ldamodel.LdaModel(num_topics=num_topics, alpha=ldaseq.alphas, id2word=ldaseq.corpus.id2word) + ldapost = LdaPost(num_topics=ldaseq.num_topics, max_doc_len=len(doc), lda=lda_model) + + time_lhoods = [] + for time in range(0, ldaseq.num_time_slices): + make_lda_seq_slice(lda_model, ldaseq, time) + ldapost.doc = Doc(doc=doc) + lhood = fit_lda_post(0, time, ldapost, ldaseq, None, None, None, None) + time_lhoods.append(lhood) + + return time_lhoods + # the fdf used in optimising obs. Can use if we figure a way to use an optimization function which requires this From 8d06704485a2969569a0abe25396d7ab6993f820 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Thu, 11 Aug 2016 03:23:08 +0530 Subject: [PATCH 30/38] Reorganized --- gensim/models/ldaseqmodel.py | 1599 +++++++++++++++------------------- 1 file changed, 723 insertions(+), 876 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index b71ee8bf68..1c55e731a1 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -7,12 +7,18 @@ """ -This is the class which is used to help with Dynamic Topic Modelling of a corpus. Inspired by the Blei's original DTM code and paper. -DTM C/C++ code: https://github.com/blei-lab/dtm +Original DTM C/C++ code: https://github.com/blei-lab/dtm DTM Paper: https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf +TODO: +The next steps to take this forward would be: + + 1) Include DIM mode. Most of the infrastructure for this is in place. + 2) Lots of heavy lifting going on in the sslm class - efforts can be made to cythonise mathematical methods. + 3) Try and make it distributed, especially around the E and M step. + """ from gensim import interfaces, utils, matutils @@ -21,7 +27,7 @@ import math from scipy.special import digamma from scipy import optimize - +import sys class seq_corpus(utils.SaveLoad): @@ -67,28 +73,6 @@ def __init__(self, corpus=None, time_slice=None, id2word=None): # endclass seq_corpus -class Doc(utils.SaveLoad): - """ - The doc class contains information used for each document. - - """ - def __init__(self, doc=None): - - nterms = len(doc) - words = [] - counts = [] - totals = 0 - for word_id, count in doc: - words.append(int(word_id)) - counts.append(int(count)) - totals += int(count) - - self.nterms = nterms - self.word = word - self.count = count - self.total = total - -# endclass Doc class LdaSeqModel(utils.SaveLoad): """ @@ -120,7 +104,6 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ `num_topics` is the number of requested latent topics to be extracted from the training corpus. `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. - if `initalize` is 'blei-lda', then we will use the python port of blei's orignal LDA code. You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats. 
If you wish to just pass a previously used LDA model, pass it through `lda_model` Shape of sstats is (vocab_len, num_topics) @@ -161,839 +144,930 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.sstats = numpy.transpose(lda_model.state.sstats) if initialize == 'own': self.sstats = sstats - if initialize == 'blei-lda': - self.sstats = lda_sstats(self.corpus, self.num_topics, self.vocab_len, self.alphas) + # initialize model from sstats - init_ldaseq_ss(self, chain_variance, obs_variance, self.alphas, self.sstats) + self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats) # fit DTM - fit_lda_seq(self, self.corpus) + self.fit_lda_seq(self.corpus) -# endclass LdaSeqModel -class sslm(utils.SaveLoad): - """ - `obs` values contain the doc - topic ratios - `e_log_prob` contains topic - word ratios - `mean`, `fwd_mean` contains the mean values to be used for inference for each word for a time_slice - `variance`, `fwd_variance` contains the variance values to be used for inference for each word in a time_slice - `fwd_mean`, `fwd_variance` are the forward posterior values. - `zeta` is an extra variational parameter with a value for each time-slice - """ - def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): + def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + """ + Method to initialize State Space Language Model, topic wise. + """ - self.vocab_len = vocab_len - self.num_time_slices = num_time_slices - self.obs_variance = obs_variance - self.chain_variance= chain_variance - self.num_topics = num_topics + self.alphas = alpha + for k in range(0, self.num_topics): + sstats = init_suffstats[:,k] + sslm.sslm_counts_init(self.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) - self.obs = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) - self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) - self.mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) - self.zeta = numpy.zeros(num_time_slices) + # initialize the below matrices only if running DIM + # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - # the following are class variables which are to be integrated during Document Influence Model - self.m_update_coeff = None - self.mean_t = None - self.variance_t = None - self.influence_sum_lgl = None - self.w_phi_l = None - self.w_phi_sum = None - self.w_phi_l_sq = None - self.m_update_coeff_g = None -# endclass sslm + def fit_lda_seq(self, seq_corpus): + """ + fit an lda sequence model: + + for each time period + set up lda model with E[log p(w|z)] and \alpha + for each document + perform posterior inference + update sufficient statistics/likelihood + + maximize topics + + """ + + LDA_INFERENCE_MAX_ITER = 25 + LDASQE_EM_THRESHOLD = 1e-4 + LDA_SEQ_MIN_ITER = 6 + LDA_SEQ_MAX_ITER = 20 + + num_topics 
= self.num_topics + vocab_len = self.vocab_len + data_len = seq_corpus.num_time_slices + corpus_len = seq_corpus.corpus_len + + bound = 0 + convergence = LDASQE_EM_THRESHOLD + 1 -class LdaPost(utils.SaveLoad): + iter_ = 0 - """ - Posterior values associated with each set of documents. - """ + while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): - def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): + print (" EM iter " , iter_) + print ("E Step") - self.doc = doc - self.lda = lda - self.gamma = gamma - self.lhood = lhood - if self.gamma = None: - self.gamma = numpy.zeros(num_topics) - if self.lhood = None: - self.lhood = numpy.zeros(num_topics) + old_bound = bound - if max_doc_len is not None and num_topics is not None: - self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) - self.log_phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) + # initiate sufficient statistics + topic_suffstats = [] + for topic in range(0, num_topics): + topic_suffstats.append(numpy.resize(numpy.zeros(vocab_len * data_len), (vocab_len, data_len))) - # the following are class variables which are to be integrated during Document Influence Model + # set up variables + gammas = numpy.resize(numpy.zeros(corpus_len * num_topics), (corpus_len, num_topics)) + lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) + + # compute the likelihood of a sequential corpus under an LDA + # seq model and find the evidence lower bound. This is the E - Step + bound, gammas = self.lda_seq_infer(seq_corpus, topic_suffstats, gammas, lhoods, iter_) + self.gammas = gammas - self.doc_weight = None - self.renormalized_doc_weight = None + print ("M Step") -# endclass LdaState + # fit the variational distribution. This is the M - Step + topic_bound = self.fit_lda_seq_topics(topic_suffstats) + bound += topic_bound -def update_zeta(sslm): + if ((bound - old_bound) < 0): + if LDA_INFERENCE_MAX_ITER < 10: + LDA_INFERENCE_MAX_ITER *= 2 + print ("Bound went down, increasing iterations to" , LDA_INFERENCE_MAX_ITER) - """ - Updates the Zeta Variational Parameter. - Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. - It is the value of variational parameter zeta which maximizes the lower bound. 
- """ + # check for convergence + convergence = numpy.fabs((bound - old_bound) / old_bound) - vocab_len = sslm.vocab_len - num_time_slices = sslm.num_time_slices - sslm.zeta.fill(0) + if convergence < LDASQE_EM_THRESHOLD: - for i in range(0, vocab_len): - for j in range(0, num_time_slices): + LDA_INFERENCE_MAX_ITER = 500 + print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER) + convergence = 1.0 - m = sslm.mean[i][j + 1] - v = sslm.variance[i][j + 1] - val = numpy.exp(m + v/2) - sslm.zeta[j] = sslm.zeta[j] + val + print (iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence) - return sslm.zeta + iter_ += 1 -def compute_post_variance(word, sslm, chain_variance): + return bound - """ - Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] - This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance - Computes Var[\beta_{t,w}] for t = 1:T - Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) - - Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) - = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) + def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_): - """ - INIT_VARIANCE_CONST = 1000 + """ + Inference or E- Step. + This is used to set up the gensim LdaModel to be used for each time-slice. + It also allows for Document Influence Model code to be written in. + """ - T = sslm.num_time_slices - variance = sslm.variance[word] - fwd_variance = sslm.fwd_variance[word] + num_topics = self.num_topics + vocab_len = self.vocab_len + bound = 0.0 + + lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=seq_corpus.id2word) + lda.topics = numpy.array(numpy.split(numpy.zeros(vocab_len * num_topics), vocab_len)) + ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=num_topics, lda=lda) - # forward pass. 
Set initial variance very high - fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST - - for t in range(1, T + 1): - if sslm.obs_variance: - w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance) - else: - w = 0 - fwd_variance[t] = w * (fwd_variance[t - 1] + chain_variance) - - # backward pass - variance[T] = fwd_variance[T] - for t in range(T - 1, -1, -1): - if fwd_variance[t] > 0.0: - w = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) - else: - w = 0 - variance[t] = (w * (variance[t + 1] - chain_variance)) + ((1 - w) * fwd_variance[t]) + model = "DTM" + if model == "DTM": + bound, gammas = self.inferDTMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) + elif model == "DIM": + self.InfluenceTotalFixed(seq_corpus); + bound, gammas = self.inferDIMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) - return variance, fwd_variance - + return bound, gammas - -def compute_post_mean(word, sslm, chain_variance): - """ - Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] - This function accepts the word to compute mean for, along with the associated sslm class object, and returns mean and fwd_mean - Essentially a forward-backward to compute E[\beta_{t,w}] for t = 1:T. + def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound): - Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) - = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta - - Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) - = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] + """ + Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. + Need to pass the LdaSeq model, seq_corpus, sufficient stats, gammas and lhoods matrices previously created, + and LdaModel and LdaPost class objects. 
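+ For example (illustrative numbers), with time_slice = [3, 2] numpy.cumsum gives [3, 5]; the doc_index > time_slice[time] check below advances to the next lda_seq slice once doc_index passes the cumulative document count.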
+ """ - """ + doc_index = 0 # overall doc_index in corpus + time = 0 # current time-slice + doc_num = 0 # doc-index in current time-lice + num_topics = self.num_topics + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + time_slice = numpy.cumsum(numpy.array(self.time_slice)) - T = sslm.num_time_slices + for line_no, line in enumerate(seq_corpus.corpus): + # this is used to update the time_slice and create a new lda_seq slice every new time_slice + if doc_index > time_slice[time]: + time += 1 + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + doc_num = 0 - obs = sslm.obs[word] - fwd_variance = sslm.fwd_variance[word] + gam = gammas[doc_index] + lhood = lhoods[doc_index] - mean = sslm.mean[word] - fwd_mean = sslm.fwd_mean[word] + ldapost.gamma = gam + ldapost.lhood = lhood + ldapost.doc = line - # forward - fwd_mean[0] = 0 - for t in range(1, T + 1): - w = sslm.obs_variance / (fwd_variance[t - 1] + chain_variance + sslm.obs_variance) - fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] - - # backward pass - mean[T] = fwd_mean[T] - for t in range(T - 1, -1, -1): - if chain_variance == 0.0: - w = 0.0 - else: - w = chain_variance / (fwd_variance[t] + chain_variance) - mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] + if iter_ == 0: + doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, None, None, None, None) + else: + doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, None, None, None, None) + - return mean, fwd_mean + if topic_suffstats != None: + topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, line, topic_suffstats) + bound += doc_lhood + doc_index += 1 + doc_num += 1 -def update_phi(doc, time, ldapost): + return bound, gammas - """ - Update variational multinomial parameters, based on a document and a time-slice. - This is done based on the original Blei-LDA paper, where: - log_phi := beta * exp(Ψ(gamma)), over every topic for every word. - - TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. - """ + def make_lda_seq_slice(self, lda, time): - K = ldapost.lda.num_topics - N = ldapost.doc.nterms + """ + set up the LDA model topic-word values with that of ldaseq. + """ + + num_topics = self.num_topics + for k in range(0, num_topics): + lda.topics[:,k] = numpy.copy(self.topic_chains[k].e_log_prob[:,time]) + + lda.alpha = numpy.copy(self.alphas) - dig = numpy.zeros(K) + return lda - for k in range(0, K): - dig[k] = digamma(ldapost.gamma[k]) - for n in range(0, N): - w = ldapost.doc.word[n] - for k in range(0, K): - ldapost.log_phi[n][k] = dig[k] + ldapost.lda.topics[w][k] + def fit_lda_seq_topics(self, topic_suffstats): + """ + Fit lda sequence topic wise. 
+ """ + lhood = 0 + lhood_term = 0 + + for k in range(0, self.num_topics): + print ("Fitting topic number" , k) + lhood_term = sslm.fit_sslm(self.topic_chains[k], topic_suffstats[k]) + lhood += lhood_term - log_phi_row = ldapost.log_phi[n] - phi_row = ldapost.phi[n] + return lhood - # log normalize - v = log_phi_row[0] - for i in range(1, len(log_phi_row)): - v = numpy.logaddexp(v, log_phi_row[i]) + def print_topics_time(self, topic): + """ + prints one topic showing each time-slice + """ + for time in range(0, self.num_time_slices): + self.print_topics(topic, time) - for i in range(0, len(log_phi_row)): - log_phi_row[i] = log_phi_row[i] - v - for k in range(0, K): - phi_row[k] = numpy.exp(log_phi_row[k]) + def print_topics(self, topic, time=0, top_terms=20): + """ + Topic is the topic numner + Time is for a particular time_slice + top_terms is the number of terms to display + """ + topic = self.topic_chains[topic].e_log_prob[time] + topic = numpy.transpose(topic) + topic = topic / topic.sum() + bestn = matutils.argsort(topic, top_terms, reverse=True) + beststr = [(round(topic[id_], 3), self.corpus.id2word[id_]) for id_ in bestn] - ldapost.log_phi[n] = log_phi_row - ldapost.phi[n] = phi_row + return beststr - return ldapost.phi, ldapost.log_phi -def update_gamma(ldapost): + def doc_topics(self, doc_number): - """ - update variational dirichlet parameters as described in the original Blei LDA paper: - gamma = alpha + sum(phi), over every topic for every word. + """ + On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, + it returns the doc-topic probabilities of that document. + """ + doc_topic = numpy.copy(self.gammas) + doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] - """ + return doc_topic[doc_number] - K = ldapost.lda.num_topics - N = ldapost.doc.nterms - ldapost.gamma = numpy.copy(ldapost.lda.alpha) + def get_item(self, doc): + """ + TODO: To mimic the __getitem__ in ldamodel. This method is a work in progress. + """ - for n in range(0, N): - phi_row = ldapost.phi[n] - count = ldapost.doc.count[n] + lda_model = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.corpus.id2word) + ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) - for k in range(0, K): - ldapost.gamma[k] += phi_row[k] * count + time_lhoods = [] + for time in range(0, self.num_time_slices): + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + lhood = fit_lda_post(0, time, ldapost, self, None, None, None, None) + time_lhoods.append(lhood) - return ldapost.gamma + return time_lhoods -def compute_expected_log_prob(sslm): +# endclass LdaSeqModel +class sslm(utils.SaveLoad): """ - Compute the expected log probability given values of m. - The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; - The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. + The sslm class is the State Space Language Model for DTM and contains the following information: + `obs` values contain the doc - topic ratios + `e_log_prob` contains topic - word ratios + `mean`, `fwd_mean` contains the mean values to be used for inference for each word for a time_slice + `variance`, `fwd_variance` contains the variance values to be used for inference for each word in a time_slice + `fwd_mean`, `fwd_variance` are the forward posterior values. 
+ `zeta` is an extra variational parameter with a value for each time-slice """ + def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): - W = sslm.vocab_len - T = sslm.num_time_slices - for t in range(0, T): - for w in range(0, W): - sslm.e_log_prob[w][t] = sslm.mean[w][t + 1] - numpy.log(sslm.zeta[t]) - return sslm.e_log_prob + self.vocab_len = vocab_len + self.num_time_slices = num_time_slices + self.obs_variance = obs_variance + self.chain_variance= chain_variance + self.num_topics = num_topics + self.obs = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) + self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) + self.mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.fwd_mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.fwd_variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.variance = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) + self.zeta = numpy.zeros(num_time_slices) -def sslm_counts_init(sslm, obs_variance, chain_variance, sstats): + # the following are class variables which are to be integrated during Document Influence Model + self.m_update_coeff = None + self.mean_t = None + self.variance_t = None + self.influence_sum_lgl = None + self.w_phi_l = None + self.w_phi_sum = None + self.w_phi_l_sq = None + self.m_update_coeff_g = None - """ - Initialize State Space Language Model with LDA sufficient statistics. - """ - W = sslm.vocab_len - T = sslm.num_time_slices + def update_zeta(self): - log_norm_counts = numpy.copy(sstats) - log_norm_counts = log_norm_counts / sum(log_norm_counts) + """ + Updates the Zeta Variational Parameter. + Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. + It is the value of variational parameter zeta which maximizes the lower bound. 
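+ Since mean and variance are (vocab_len, num_time_slices + 1) arrays, an equivalent vectorized form of the update below is (a sketch, not part of this patch): zeta = numpy.exp(self.mean[:, 1:] + self.variance[:, 1:] / 2).sum(axis=0)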
+ """ - log_norm_counts = log_norm_counts + 1.0 / W - log_norm_counts = log_norm_counts / sum(log_norm_counts) - log_norm_counts = numpy.log(log_norm_counts) - + vocab_len = self.vocab_len + num_time_slices = self.num_time_slices + self.zeta.fill(0) - # setting variational observations to transformed counts - for t in range(0, T): - sslm.obs[:,t] = log_norm_counts + for j in range(0, num_time_slices): + self.zeta[j] = numpy.sum(numpy.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) - # set variational parameters - sslm.obs_variance = obs_variance - sslm.chain_variance = chain_variance + return self.zeta - # compute post variance - for w in range(0, W): - sslm.variance[w], sslm.fwd_variance[w] = compute_post_variance(w, sslm, sslm.chain_variance) + def compute_post_variance(self, word, chain_variance): - for w in range(0, W): - sslm.mean[w], sslm.fwd_mean[w] = compute_post_mean(w, sslm, sslm.chain_variance) + """ + Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance + Computes Var[\beta_{t,w}] for t = 1:T - sslm.zeta = update_zeta(sslm) - sslm.e_log_prob = compute_expected_log_prob(sslm) + Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) + + Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) + = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) + """ + INIT_VARIANCE_CONST = 1000 -def init_ldaseq_ss(ldaseq, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): + T = self.num_time_slices + variance = self.variance[word] + fwd_variance = self.fwd_variance[word] - """ - Method to initialize State Space Language Model, topic wise. - """ + # forward pass. 
Set initial variance very high + fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST + + for t in range(1, T + 1): + if self.obs_variance: + w = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + else: + w = 0 + fwd_variance[t] = w * (fwd_variance[t - 1] + chain_variance) + + # backward pass + variance[T] = fwd_variance[T] + for t in range(T - 1, -1, -1): + if fwd_variance[t] > 0.0: + w = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) + else: + w = 0 + variance[t] = (w * (variance[t + 1] - chain_variance)) + ((1 - w) * fwd_variance[t]) + + return variance, fwd_variance + - ldaseq.alphas = alpha - for k in range(0, ldaseq.num_topics): - sstats = init_suffstats[:,k] - sslm_counts_init(ldaseq.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) + def compute_post_mean(self, word, chain_variance): - # initialize the below matrices only if running DIM - # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - # ldaseq.topic_chains[k].w_phi_sum = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) + """ + Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] + This function accepts the word to compute mean for, along with the associated sslm class object, and returns mean and fwd_mean + Essentially a forward-backward to compute E[\beta_{t,w}] for t = 1:T. + Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) + = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta + + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) + = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] -def fit_lda_seq(ldaseq, seq_corpus): - """ - fit an lda sequence model: - - for each time period - set up lda model with E[log p(w|z)] and \alpha - for each document - perform posterior inference - update sufficient statistics/likelihood - - maximize topics - - """ - - LDA_INFERENCE_MAX_ITER = 25 - LDASQE_EM_THRESHOLD = 1e-4 - LDA_SEQ_MIN_ITER = 6 - LDA_SEQ_MAX_ITER = 20 - - num_topics = ldaseq.num_topics - vocab_len = ldaseq.vocab_len - data_len = seq_corpus.num_time_slices - corpus_len = seq_corpus.corpus_len - - bound = 0 - convergence = LDASQE_EM_THRESHOLD + 1 - - iter_ = 0 - - while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): - - print (" EM iter " , iter_) - print ("E Step") - - old_bound = bound - # initiate sufficient statistics - topic_suffstats = [] - for topic in range(0, num_topics): - topic_suffstats.append(numpy.resize(numpy.zeros(vocab_len * data_len), (vocab_len, data_len))) + """ - # set up variables - gammas = numpy.resize(numpy.zeros(corpus_len * num_topics), (corpus_len, num_topics)) - lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) - # compute the likelihood of a sequential corpus under an LDA - # seq model and find the evidence lower bound. 
This is the E - Step - bound, gammas = lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_) - ldaseq.gammas = gammas + T = self.num_time_slices - print ("M Step") + obs = self.obs[word] + fwd_variance = self.fwd_variance[word] - # fit the variational distribution - topic_bound = fit_lda_seq_topics(ldaseq, topic_suffstats) - bound += topic_bound + mean = self.mean[word] + fwd_mean = self.fwd_mean[word] + # forward + fwd_mean[0] = 0 + for t in range(1, T + 1): + w = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] - if ((bound - old_bound) < 0): - if LDA_INFERENCE_MAX_ITER < 10: - LDA_INFERENCE_MAX_ITER *= 2 - print ("Bound went down, increasing iterations to" , LDA_INFERENCE_MAX_ITER) + # backward pass + mean[T] = fwd_mean[T] + for t in range(T - 1, -1, -1): + if chain_variance == 0.0: + w = 0.0 + else: + w = chain_variance / (fwd_variance[t] + chain_variance) + mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] - # check for convergence - convergence = numpy.fabs((bound - old_bound) / old_bound) + return mean, fwd_mean - if convergence < LDASQE_EM_THRESHOLD: - LDA_INFERENCE_MAX_ITER = 500 - print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER) - convergence = 1.0 + def compute_expected_log_prob(self): - print (iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence) + """ + Compute the expected log probability given values of m. + The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; + The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. + """ + for (w,t), e_log_prob in numpy.ndenumerate(self.e_log_prob): + e_log_prob = self.mean[w][t + 1] - numpy.log(self.zeta[t]) - iter_ += 1 + return self.e_log_prob - return bound + def sslm_counts_init(self, obs_variance, chain_variance, sstats): -def lda_seq_infer(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, iter_): + """ + Initialize State Space Language Model with LDA sufficient statistics. + """ - """ - Inference or E- Step. - This is used to set up the gensim LdaModel to be used for each time-slice. - It also allows for Document Influence Model code to be written in. 
- """ + W = self.vocab_len + T = self.num_time_slices - num_topics = ldaseq.num_topics - vocab_len = ldaseq.vocab_len - bound = 0.0 - - lda = ldamodel.LdaModel(num_topics=num_topics, alpha=ldaseq.alphas, id2word=seq_corpus.id2word) - lda.topics = numpy.array(numpy.split(numpy.zeros(vocab_len * num_topics), vocab_len)) - ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=num_topics, lda=lda) + log_norm_counts = numpy.copy(sstats) + log_norm_counts = log_norm_counts / sum(log_norm_counts) - model = "DTM" - if model == "DTM": - bound, gammas = inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) - elif model == "DIM": - InfluenceTotalFixed(ldaseq, seq_corpus); - bound, gammas = inferDIMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) + log_norm_counts = log_norm_counts + 1.0 / W + log_norm_counts = log_norm_counts / sum(log_norm_counts) + log_norm_counts = numpy.log(log_norm_counts) + - return bound, gammas + # setting variational observations to transformed counts + self.obs = (numpy.repeat(log_norm_counts, T, axis=0)).reshape(W, T) + # set variational parameters + self.obs_variance = obs_variance + self.chain_variance = chain_variance + # compute post variance + for w in range(0, W): + self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) -def inferDTMseq(ldaseq, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound): + for w in range(0, W): + self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) - """ - Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. - Need to pass the LdaSeq model, seq_corpus, sufficient stats, gammas and lhoods matrices previously created, - and LdaModel and LdaPost class objects. - """ + self.zeta = self.update_zeta() + self.e_log_prob = self.compute_expected_log_prob() - doc_index = 0 # overall doc_index in corpus - time = 0 # current time-slice - doc_num = 0 # doc-index in current time-lice - num_topics = ldaseq.num_topics - make_lda_seq_slice(lda, ldaseq, time) # create lda_seq slice - time_slice = numpy.cumsum(numpy.array(ldaseq.time_slice)) + def fit_sslm(self, counts): - for line_no, line in enumerate(seq_corpus.corpus): - # this is used to update the time_slice and create a new lda_seq slice every new time_slice - if doc_index > time_slice[time]: - time += 1 - make_lda_seq_slice(lda, ldaseq, time) - doc_num = 0 + """ + Fit variational distribution. 
+ """ - gam = gammas[doc_index] - lhood = lhoods[doc_index] + W = self.vocab_len + bound = 0 + old_bound = 0 + sslm_fit_threshold = 1e-6 + sslm_max_iter = 2 + converged = sslm_fit_threshold + 1 - doc = Doc(doc=line) - ldapost.gamma = gam - ldapost.lhood = lhood - ldapost.doc = doc + totals = numpy.zeros(counts.shape[1]) - if iter_ == 0: - doc_lhood = fit_lda_post(doc_num, time, ldapost, None, None, None, None, None) - else: - doc_lhood = fit_lda_post(doc_num, time, ldapost, ldaseq, None, None, None, None) - + for w in range(0, W): + self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) + + # column sum of counts + totals = counts.sum(axis=0) + iter_ = 0 - if topic_suffstats != None: - topic_suffstats = update_lda_seq_ss(time, doc, ldapost, topic_suffstats) + model = "DTM" + if model == "DTM": + bound = self.compute_bound(counts, totals) + if model == "DIM": + bound = self.compute_bound_fixed(counts, totals) - bound += doc_lhood - doc_index += 1 - doc_num += 1 + print ("initial sslm bound is " , bound) - return bound, gammas + while converged > sslm_fit_threshold and iter_ < sslm_max_iter: + iter_ += 1 + old_bound = bound + self.obs, self.zeta = self.update_obs(counts, totals) -def fit_lda_post(doc_number, time, ldapost, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): + if model == "DTM": + bound = self.compute_bound(counts, totals) + if model == "DIM": + bound = self.compute_bound_fixed(counts, totals) - """ - Posterior inference for lda. - """ + converged = numpy.fabs((bound - old_bound) / old_bound) - LDA_INFERENCE_CONVERGED = 1e-8 - LDA_INFERENCE_MAX_ITER = 25 + print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged) - init_lda_post(ldapost) + self.e_log_prob = self.compute_expected_log_prob() - model = "DTM" - if model == "DIM": - # if in DIM then we initialise some variables here - pass + return bound - lhood = compute_lda_lhood(ldapost) - lhood_old = 0 - converged = 0 - iter_ = 0 - # first iteration starts here - iter_ += 1 - lhood_old = lhood - ldapost.gamma = update_gamma(ldapost) + def compute_bound(self, word_counts, totals): - model = "DTM" + """ + Compute log probability bound. + Forumula is as described in appendix of DTM. 
+ """ + W = self.vocab_len + T = self.num_time_slices - if model == "DTM" or sslm is None: - ldapost.phi, ldapost.log_phi = update_phi(doc_number, time, ldapost) - elif model == "DIM" and sslm is not None: - ldapost.phi, ldapost.log_phi = update_phi_fixed(doc_number, time, ldapost, sslm, g3_matrix, g4_matrix, g5_matrix) + term_1 = 0 + term_2 = 0 + term_3 = 0 - lhood = compute_lda_lhood(ldapost) - converged = numpy.fabs((lhood_old - lhood) / (lhood_old * ldapost.doc.total)) + val = 0 + ent = 0 + chain_variance = self.chain_variance - while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: + for w in range(0, W): + self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, chain_variance) - iter_ += 1 - lhood_old = lhood - ldapost.gamma = update_gamma(ldapost) - model = "DTM" + self.zeta = self.update_zeta() - if model == "DTM" or sslm is None: - ldapost.phi, ldapost.log_phi = update_phi(doc_number, time, ldapost) - elif model == "DIM" and sslm is not None: - ldapost.phi, ldapost.log_phi = update_phi_fixed(doc_number, time, ldapost, sslm, g3_matrix, g4_matrix, g5_matrix) + for w in range(0, W): + val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance - lhood = compute_lda_lhood(ldapost) - converged = numpy.fabs((lhood_old - lhood) / (lhood_old * ldapost.doc.total)) + print ("Computing bound, all times") - return lhood + for t in range(1, T + 1): + term_1 = 0.0 + term_2 = 0.0 + ent = 0.0 + for w in range(0, W): + m = self.mean[w][t] + prev_m = self.mean[w][t - 1] -def make_lda_seq_slice(lda, ldaseq, time): + v = self.variance[w][t] - """ - set up the LDA model topic-word values with that of ldaseq. - """ + # w_phi_l is only used in Document Influence Model; the values are aleays zero in this case + # w_phi_l = sslm.w_phi_l[w][t - 1] + # exp_i = numpy.exp(-prev_m) + # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) + + term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) + term_2 += word_counts[w][t - 1] * m + ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) - num_topics = ldaseq.num_topics - for k in range(0, num_topics): - lda.topics[:,k] = numpy.copy(ldaseq.topic_chains[k].e_log_prob[:,time]) + term_3 = -totals[t - 1] * numpy.log(self.zeta[t - 1]) + val += term_2 + term_3 + ent - term_1 - lda.alpha = numpy.copy(ldaseq.alphas) + return val + + + def update_obs(self, word_counts, totals): - return + """ + Fucntion to perform optimization + """ + OBS_NORM_CUTOFF = 2 + STEP_SIZE = 0.01 + TOL = 1e-3 -def update_lda_seq_ss(time, doc, ldapost, topic_suffstats): - """ - Update lda sequence sufficient statistics from an lda posterior. 
- """ + W = self.vocab_len + T = self.num_time_slices - num_topics = numpy.shape(ldapost.phi)[1] - nterms = doc.nterms + runs = 0 + mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1)) - for k in range(0, num_topics): - topic_ss = topic_suffstats[k] - for n in range(0, nterms): - w = doc.word[n] - c = doc.count[n] - topic_ss[w][time] = topic_ss[w][time] + c * ldapost.phi[n][k] + norm_cutoff_obs = None + for w in range(0, W): + w_counts = word_counts[w] + counts_norm = 0 + # now we find L2 norm of w_counts + for i in range(0, len(w_counts)): + counts_norm += w_counts[i] * w_counts[i] + + counts_norm = numpy.sqrt(counts_norm) + + if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: + obs = self.obs[w] + norm_cutoff_obs = numpy.copy(obs) + else: + if counts_norm < OBS_NORM_CUTOFF: + w_counts = numpy.zeros(len(w_counts)) + + # TODO: apply lambda function + for t in range(0, T): + mean_deriv = mean_deriv_mtx[t] + mean_deriv = self.compute_mean_deriv(w, t, mean_deriv) + mean_deriv_mtx[t] = mean_deriv + + deriv = numpy.zeros(T) + args = self, w_counts, totals, mean_deriv_mtx, w, deriv + obs = self.obs[w] + model = "DTM" + + if model == "DTM": + obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) + if model == "DIM": + pass + runs += 1 + + if counts_norm < OBS_NORM_CUTOFF: + norm_cutoff_obs = obs + + self.obs[w] = obs + + self.zeta = self.update_zeta() + + return self.obs, self.zeta - topic_suffstats[k] = topic_ss + + def compute_mean_deriv(self, word, time, deriv): - return topic_suffstats + """ + Used in helping find the optimum function. + computes derivative of E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. + put the result in deriv, allocated T+1 vector + """ + T = self.num_time_slices + fwd_variance = self.variance[word] -def init_lda_post(ldapost): + deriv[0] = 0 - """ - Initialize variational posterior, does not return anything. - """ - num_topics = ldapost.lda.num_topics - nterms = ldapost.doc.nterms + # forward pass + for t in range(1, T + 1): + if self.obs_variance > 0.0: + w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance) + else: + w = 0.0 - for k in range(0, num_topics): - ldapost.gamma[k] = ldapost.lda.alpha[k] + float(ldapost.doc.total) / num_topics - for n in range(0, nterms): - ldapost.phi[n][k] = 1.0 / num_topics - - # doc_weight used during DIM - # ldapost.doc_weight = None - return + val = w * deriv[t - 1] + if time == t - 1: + val += (1 - w) + deriv[t]= val -def compute_lda_lhood(ldapost): - """ - compute the likelihood bound - """ + for t in range(T - 1, -1, -1): + if self.chain_variance == 0.0: + w = 0.0 + else: + w = self.chain_variance / (fwd_variance[t] + self.chain_variance) + deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] + + return deriv - K = ldapost.lda.num_topics - N = ldapost.doc.nterms - gamma_sum = numpy.sum(ldapost.gamma) - # TODO: flags - FLAGS_sigma_l = 0 - FLAGS_sigma_d = 0 + def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): - lhood = math.lgamma(numpy.sum(ldapost.lda.alpha)) - math.lgamma(gamma_sum) - ldapost.lhood[K] = lhood + """ + Derivation of obs which is used in derivative function [df_obs] while optimizing. 
+ """ - # influence_term = 0 - digsum = digamma(gamma_sum) + # flag + init_mult = 1000 - model = "DTM" - for k in range(0, K): - # below code only to be used in DIM mode - # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): - # influence_topic = ldapost.doc_weight[k] - # influence_term = - ((influence_topic * influence_topic + FLAGS_sigma_l * FLAGS_sigma_l) / 2.0 / (FLAGS_sigma_d * FLAGS_sigma_d)) + T = self.num_time_slices - e_log_theta_k = digamma(ldapost.gamma[k]) - digsum - lhood_term = (ldapost.lda.alpha[k] - ldapost.gamma[k]) * e_log_theta_k + math.lgamma(ldapost.gamma[k]) - math.lgamma(ldapost.lda.alpha[k]) + mean = self.mean[word] + variance = self.variance[word] - for n in range(0, N): - if ldapost.phi[n][k] > 0: - lhood_term += ldapost.doc.count[n] * ldapost.phi[n][k] * (e_log_theta_k + ldapost.lda.topics[ldapost.doc.word[n]][k] - ldapost.log_phi[n][k]) + # only used for DIM mode + # w_phi_l = self.w_phi_l[word] + # m_update_coeff = self.m_update_coeff[word] - ldapost.lhood[k] = lhood_term - lhood += lhood_term - # lhood += influence_term + # temp_vector holds temporary zeta values + self.temp_vect = numpy.zeros(T) - return lhood + for u in range(0, T): + self.temp_vect[u] = numpy.exp(mean[u + 1] + variance[u + 1] / 2) + for t in range(0, T): + + mean_deriv = mean_deriv_mtx[t] + term1 = 0 + term2 = 0 + term3 = 0 + term4 = 0 + + for u in range(1, T + 1): + mean_u = mean[u] + variance_u_prev = variance[u - 1] + mean_u_prev = mean[u - 1] + dmean_u = mean_deriv[u] + dmean_u_prev = mean_deriv[u - 1] + + term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) + + term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u + + model = "DTM" + if model == "DIM": + # do some stuff + pass + + if self.chain_variance: + term1 = - (term1 / self.chain_variance) + term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * self.chain_variance) + else: + term1 = 0.0 + + deriv[t] = term1 + term2 + term3 + term4 + + return deriv +# endclass sslm + +class LdaPost(utils.SaveLoad): -def fit_lda_seq_topics(ldaseq, topic_suffstats): """ - Fit lda sequence topic wise. + Posterior values associated with each set of documents. + TODO: use **Hoffman, Blei, Bach: Online Learning for Latent Dirichlet Allocation, NIPS 2010.** + to update phi, gamma. End game would be to somehow replace LdaPost entirely with LdaModel. """ - lhood = 0 - lhood_term = 0 - K = ldaseq.num_topics - for k in range(0, K): - print ("Fitting topic number" , k) - lhood_term = fit_sslm(ldaseq.topic_chains[k], topic_suffstats[k]) - lhood += lhood_term + def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=None, lhood=None): - return lhood + self.doc = doc + self.lda = lda + self.gamma = gamma + self.lhood = lhood + if self.gamma is None: + self.gamma = numpy.zeros(num_topics) + if self.lhood is None: + self.lhood = numpy.zeros(num_topics) + if max_doc_len is not None and num_topics is not None: + self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) + self.log_phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) -def fit_sslm(sslm, counts): + # the following are class variables which are to be integrated during Document Influence Model - """ - Fit variational distribution. 
- """ + self.doc_weight = None + self.renormalized_doc_weight = None - W = sslm.vocab_len - bound = 0 - old_bound = 0 - sslm_fit_threshold = 1e-6 - sslm_max_iter = 2 - converged = sslm_fit_threshold + 1 - totals = numpy.zeros(counts.shape[1]) + def update_phi(self, doc_number, time): - for w in range(0, W): - sslm.variance[w], sslm.fwd_variance[w] = compute_post_variance(w, sslm, sslm.chain_variance) - - # column sum of counts - totals = counts.sum(axis=0) - iter_ = 0 + """ + Update variational multinomial parameters, based on a document and a time-slice. + This is done based on the original Blei-LDA paper, where: + log_phi := beta * exp(Ψ(gamma)), over every topic for every word. + + TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. + """ - model = "DTM" - if model == "DTM": - bound = compute_bound(counts, totals, sslm) - if model == "DIM": - bound = compute_bound_fixed(counts, totals, sslm) + num_topics = self.lda.num_topics - print ("initial sslm bound is " , bound) + # digamma values + dig = numpy.zeros(num_topics) + for k in range(0, num_topics): + dig[k] = digamma(self.gamma[k]) - while converged > sslm_fit_threshold and iter_ < sslm_max_iter: - iter_ += 1 - old_bound = bound - sslm.obs, sslm.zeta = update_obs(counts, totals, sslm) + n = 0 # keep track of iterations for phi, log_phi + for word_id, count in self.doc: + for k in range(0, num_topics): + self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k] + log_phi_row = self.log_phi[n] + phi_row = self.phi[n] - if model == "DTM": - bound = compute_bound(counts, totals, sslm) - if model == "DIM": - bound = compute_bound_fixed(counts, totals, sslm) + # log normalize + v = log_phi_row[0] + for i in range(1, len(log_phi_row)): + v = numpy.logaddexp(v, log_phi_row[i]) - converged = numpy.fabs((bound - old_bound) / old_bound) + # subtract every element by v + log_phi_row = log_phi_row - v + phi_row = numpy.exp(log_phi_row) - print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged) + self.log_phi[n] = log_phi_row + self.phi[n] = phi_row + n +=1 # increase iteration - sslm.e_log_prob = compute_expected_log_prob(sslm) + return self.phi, self.log_phi - return bound + def update_gamma(self): -def compute_bound(word_counts, totals, sslm): + """ + update variational dirichlet parameters as described in the original Blei LDA paper: + gamma = alpha + sum(phi), over every topic for every word. - """ - Compute log probability bound. - Forumula is as described in appendix of DTM. - """ - W = sslm.vocab_len - T = sslm.num_time_slices + """ - term_1 = 0 - term_2 = 0 - term_3 = 0 + self.gamma = numpy.copy(self.lda.alpha) - val = 0 - ent = 0 + n = 0 # keep track of number of iterations for phi, log_phi + for word_id, count in self.doc: + phi_row = self.phi[n] + for k in range(0, self.lda.num_topics): + self.gamma[k] += phi_row[k] * count - chain_variance = sslm.chain_variance + return self.gamma - for w in range(0, W): - sslm.mean[w], sslm.fwd_mean[w] = compute_post_mean(w, sslm, chain_variance) - sslm.zeta = update_zeta(sslm) + def init_lda_post(self): - for w in range(0, W): - val += (sslm.variance[w][0] - sslm.variance[w][T]) / 2 * chain_variance + """ + Initialize variational posterior, does not return anything. 
+ """ - print ("Computing bound, all times") + total = sum(count for word_id, count in self.doc) + self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) + self.phi.fill(1.0 / self.lda.num_topics) + # doc_weight used during DIM + # ldapost.doc_weight = None - for t in range(1, T + 1): - term_1 = 0.0 - term_2 = 0.0 - ent = 0.0 - for w in range(0, W): + def compute_lda_lhood(self): + """ + compute the likelihood bound + """ - m = sslm.mean[w][t] - prev_m = sslm.mean[w][t - 1] + num_topics = self.lda.num_topics + gamma_sum = numpy.sum(self.gamma) - v = sslm.variance[w][t] + # to be used in DIM + # sigma_l = 0 + # sigma_d = 0 - # w_phi_l is only used in Document Influence Model; the values are aleays zero in this case - # w_phi_l = sslm.w_phi_l[w][t - 1] - # exp_i = numpy.exp(-prev_m) - # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - - term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - term_2 += word_counts[w][t - 1] * m - ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) + lhood = math.lgamma(numpy.sum(self.lda.alpha)) - math.lgamma(gamma_sum) + self.lhood[num_topics] = lhood - term_3 = -totals[t - 1] * numpy.log(sslm.zeta[t - 1]) - val += term_2 + term_3 + ent - term_1 + # influence_term = 0 + digsum = digamma(gamma_sum) - return val - + model = "DTM" + for k in range(0, num_topics): + # below code only to be used in DIM mode + # if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"): + # influence_topic = ldapost.doc_weight[k] + # influence_term = - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d)) + + e_log_theta_k = digamma(self.gamma[k]) - digsum + lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + math.lgamma(self.gamma[k]) - math.lgamma(self.lda.alpha[k]) + # TODO: check why there's an IF + n = 0 + for word_id, count in self.doc: + if self.phi[n][k] > 0: + lhood_term += count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) + n += 1 + self.lhood[k] = lhood_term + lhood += lhood_term + # in case of DIM add influence term + # lhood += influence_term + + return lhood + + def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): -def update_obs(word_counts, totals, sslm): + """ + Posterior inference for lda. 
+ """ - """ - Fucntion to perform optimization - """ + LDA_INFERENCE_CONVERGED = 1e-8 + LDA_INFERENCE_MAX_ITER = 25 - OBS_NORM_CUTOFF = 2 - STEP_SIZE = 0.01 - TOL = 1e-3 - - W = sslm.vocab_len - T = sslm.num_time_slices - - runs = 0 - mean_deriv_mtx = numpy.resize(numpy.zeros(T * (T + 1)), (T, T + 1)) - - norm_cutoff_obs = None - for w in range(0, W): - w_counts = word_counts[w] - counts_norm = 0 - # now we find L2 norm of w_counts - for i in range(0, len(w_counts)): - counts_norm += w_counts[i] * w_counts[i] - - counts_norm = numpy.sqrt(counts_norm) - - if counts_norm < OBS_NORM_CUTOFF and norm_cutoff_obs is not None: - obs = sslm.obs[w] - norm_cutoff_obs = numpy.copy(obs) - else: - if counts_norm < OBS_NORM_CUTOFF: - w_counts = numpy.zeros(len(w_counts)) - - for t in range(0, T): - mean_deriv = mean_deriv_mtx[t] - mean_deriv = compute_mean_deriv(w, t, sslm, mean_deriv) - mean_deriv_mtx[t] = mean_deriv - - deriv = numpy.zeros(T) - args = sslm, w_counts, totals, mean_deriv_mtx, w, deriv - obs = sslm.obs[w] - model = "DTM" + self.init_lda_post() + # sum of counts in a doc + total = sum(count for word_id, count in self.doc) + + model = "DTM" + if model == "DIM": + # if in DIM then we initialise some variables here + pass - if model == "DTM": - obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) - if model == "DIM": - pass - runs += 1 + lhood = self.compute_lda_lhood() + lhood_old = 0 + converged = 0 + iter_ = 0 - if counts_norm < OBS_NORM_CUTOFF: - norm_cutoff_obs = obs + # first iteration starts here + iter_ += 1 + lhood_old = lhood + self.gamma = self.update_gamma() - sslm.obs[w] = obs + model = "DTM" - sslm.zeta = update_zeta(sslm) - - return sslm.obs, sslm.zeta + if model == "DTM" or sslm is None: + self.phi, self.log_phi = self.update_phi(doc_number, time) + elif model == "DIM" and sslm is not None: + self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) - -def compute_mean_deriv(word, time, sslm, deriv): + lhood = self.compute_lda_lhood() + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) - """ - Used in helping find the optimum function. - computes derivative of E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. - put the result in deriv, allocated T+1 vector - """ - T = sslm.num_time_slices - fwd_variance = sslm.variance[word] + while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: - deriv[0] = 0 + iter_ += 1 + lhood_old = lhood + self.gamma = self.update_gamma() + model = "DTM" - # forward pass - for t in range(1, T + 1): - if sslm.obs_variance > 0.0: - w = sslm.obs_variance / (fwd_variance[t - 1] + sslm.chain_variance + sslm.obs_variance) - else: - w = 0.0 + if model == "DTM" or sslm is None: + self.phi, self.log_phi = self.update_phi(doc_number, time) + elif model == "DIM" and sslm is not None: + self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) - val = w * deriv[t - 1] - if time == t - 1: - val += (1 - w) + lhood = self.compute_lda_lhood() + converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) - deriv[t]= val + return lhood - for t in range(T - 1, -1, -1): - if sslm.chain_variance == 0.0: - w = 0.0 - else: - w = sslm.chain_variance / (fwd_variance[t] + sslm.chain_variance) - deriv[t] = w * deriv[t] + (1 - w) * deriv[t + 1] - return deriv + def update_lda_seq_ss(self, time, doc, topic_suffstats): + """ + Update lda sequence sufficient statistics from an lda posterior. 
+ """ + + num_topics = self.lda.num_topics + + for k in range(0, num_topics): + topic_ss = topic_suffstats[k] + n = 0 + for word_id, count in self.doc: + topic_ss[word_id][time] = topic_ss[word_id][time] + count * self.phi[n][k] + n += 1 + topic_suffstats[k] = topic_ss + return topic_suffstats +# endclass LdaPost + +# the following functions are used in update_obs as the function to optimize def f_obs(x, *args): + """ + Function which we are optimising for minimizing obs + """ sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args # flag init_mult = 1000 @@ -1008,7 +1082,7 @@ def f_obs(x, *args): term4 = 0 sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) mean = sslm.mean[word] variance = sslm.variance[word] @@ -1042,248 +1116,21 @@ def f_obs(x, *args): return final - -def compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv): - - # flag - init_mult = 1000 - - T = sslm.num_time_slices - - mean = sslm.mean[word] - variance = sslm.variance[word] - - # only used for DIM mode - # w_phi_l = sslm.w_phi_l[word] - # m_update_coeff = sslm.m_update_coeff[word] - - sslm.temp_vect = numpy.zeros(T) - - for u in range(0, T): - sslm.temp_vect[u] = numpy.exp(mean[u + 1] + variance[u + 1] / 2) - - for t in range(0, T): - - mean_deriv = mean_deriv_mtx[t] - term1 = 0 - term2 = 0 - term3 = 0 - term4 = 0 - - for u in range(1, T + 1): - mean_u = mean[u] - variance_u_prev = variance[u - 1] - mean_u_prev = mean[u - 1] - dmean_u = mean_deriv[u] - dmean_u_prev = mean_deriv[u - 1] - - term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) - - term2 += (word_counts[u - 1] - (totals[u - 1] * sslm.temp_vect[u - 1] / sslm.zeta[u - 1])) * dmean_u - - model = "DTM" - if model == "DIM": - # do some stuff - pass - - if sslm.chain_variance: - term1 = - (term1 / sslm.chain_variance) - term1 = term1 - (mean[0] * mean_deriv[0]) / (init_mult * sslm.chain_variance) - else: - term1 = 0.0 - - deriv[t] = term1 + term2 + term3 + term4 - - return deriv - - def df_obs(x, *args): + """ + Derivative of function which optimises obs. + """ sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args sslm.obs[word] = x - sslm.mean[word], sslm.fwd_mean[word] = compute_post_mean(word, sslm, sslm.chain_variance) + sslm.mean[word], sslm.fwd_mean[word] = sslm.compute_post_mean(word, sslm.chain_variance) model = "DTM" if model == "DTM": - deriv = compute_obs_deriv(word, word_counts, totals, sslm, mean_deriv_mtx, deriv) + deriv = sslm.compute_obs_deriv(word, word_counts, totals, mean_deriv_mtx, deriv) elif model == "DIM": - deriv = compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) + deriv = sslm.compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, deriv) return numpy.negative(deriv) - - -# the following code replicates Blei's original LDA, ported to python. -# idea is to let user initialise LDA sstats through this instead of gensim LDA if wanted. 
- -def lda_sstats(seq_corpus, num_topics, num_terms, alpha): - - lda_model = mockLDA(num_topics=num_topics, num_terms=num_terms) - lda_model.alpha = alpha # this will have shape equal to number of topics - lda_ss = initialize_ss_random(seq_corpus, num_topics) - lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) - em_iter = 10 - lda_em(lda_model, lda_ss, seq_corpus, em_iter, num_topics) - - return lda_ss - - -def initialize_ss_random(seq_corpus, num_topics): - - N = seq_corpus.num_terms - K = num_topics - - topic = numpy.array(numpy.split(numpy.zeros(N * K), N)) - - for n in range(0, N): - for k in range(0, K): - topic[n][k] = numpy.random.random() + 0.5 / seq_corpus.num_docs + 4.0 - - return topic - - -def lda_m_step(lda_model, lda_ss, seq_corpus, num_topics): - - K = num_topics - W = seq_corpus.num_terms - lhood = 0 - for k in range(0, K): - - ss_k = lda_ss[:,k] - log_p = lda_model.topics[:,k] - - LDA_VAR_BAYES = True - if LDA_VAR_BAYES is True: - - numpy.copyto(log_p, ss_k) - log_p = log_p / sum(log_p) - log_p = numpy.log(log_p) - - else: - pass - - lda_model.topics[:,k] = log_p - - return lhood - - -def lda_em(lda_model, lda_ss, seq_corpus, max_iter, num_topics): - - LDA_EM_CONVERGED = 5e-5 - LDA_INFERENCE_CONVERGED = 1e-8 - - iter_ = 0 - lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) - old_lhood = 0 - converged = 0 - m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) - - # do step starts - iter_ += 1 - old_lhood = lhood - e_lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) - m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) - lhood = e_lhood + m_lhood - converged = (old_lhood - lhood) / old_lhood - - while (converged > LDA_EM_CONVERGED or iter_ <= 5) and iter_ < max_iter: - - iter_ += 1 - old_lhood = lhood - e_lhood = lda_e_step(lda_model, seq_corpus, lda_ss, num_topics) - m_lhood = lda_m_step(lda_model, lda_ss, seq_corpus, num_topics) - lhood = e_lhood + m_lhood - converged = (old_lhood - lhood) / old_lhood - print converged - - return lhood - - -def lda_e_step(lda_model, seq_corpus, lda_ss, num_topics): - - K = num_topics - - if lda_ss is not None: - lda_ss.fill(0) - - ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=K, lda=lda_model) - ldapost.gamma = numpy.zeros(K) - ldapost.lhood = numpy.zeros(K + 1) - - lhood = 0 - - for line_no, line in enumerate(seq_corpus.corpus): - - doc = Doc(doc=line) - ldapost.doc = doc - lhood += fit_lda_post(d, 0, ldapost, None, None, None, None, None) - - if lda_ss is not None: - for k in range(0, K): - for n in range(0, ldapost.doc.nterms): - lda_ss[ldapost.doc.word[n]][k] += ldapost.phi[n][k] * ldapost.doc.count[n] - - return lhood - - -def print_topics(ldaseq, topic, time=0, top_terms=20): - """ - Topic is the topic numner - Time is for a particular time_slice - top_terms is the number of terms to display - """ - topic = ldaseq.topic_chains[topic].e_log_prob[time] - topic = numpy.transpose(topic) - topic = topic / topic.sum() - bestn = matutils.argsort(topic, top_terms, reverse=True) - beststr = [(round(topic[id_], 3), ldaseq.corpus.id2word[id_]) for id_ in bestn] - - return beststr - - -def doc_topics(ldaseq, doc_number): - - """ - On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, - it returns the doc-topic probabilities of that document. 
- """ - doc_topic = numpy.copy(ldaseq.gammas) - doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] - - return doc_topic[doc_number] - - -def get_item(ldaseq, doc): - """ - Analyse documents on trained DTM model. - """ - - lda_model = ldamodel.LdaModel(num_topics=num_topics, alpha=ldaseq.alphas, id2word=ldaseq.corpus.id2word) - ldapost = LdaPost(num_topics=ldaseq.num_topics, max_doc_len=len(doc), lda=lda_model) - - time_lhoods = [] - for time in range(0, ldaseq.num_time_slices): - make_lda_seq_slice(lda_model, ldaseq, time) - ldapost.doc = Doc(doc=doc) - lhood = fit_lda_post(0, time, ldapost, ldaseq, None, None, None, None) - time_lhoods.append(lhood) - - return time_lhoods - - - -# the fdf used in optimising obs. Can use if we figure a way to use an optimization function which requires this - -# def fdf_obs(x, params, f, df): -# p = params -# model = "DTM" -# if model == "DTM": -# f = f_obs(x, params) -# compute_obs_deriv(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) -# elif model == "DIM": -# f = f_obs_multiplt(x, params) -# compute_obs_deriv_fixed(p.word, p.word_counts, p.totals, p.sslm, p.mean_deriv_mtx, df) -# for i in range(0, len(df)): -# df[i] = - df[i] - - + \ No newline at end of file From ea3dff8e893093af0a81191f1c5bf0bb20e90a7b Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Fri, 12 Aug 2016 19:08:16 +0530 Subject: [PATCH 31/38] Fixed Doc-Topics --- gensim/models/ldaseqmodel.py | 54 ++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 1c55e731a1..3517f5caab 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -91,7 +91,7 @@ class LdaSeqModel(utils.SaveLoad): """ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005): + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10): """ `corpus` is any iterable gensim corpus @@ -110,7 +110,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the beta distribution. - + + `passes` is the number of passes of the initial LdaModel. """ if corpus is not None: @@ -122,8 +123,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ self.num_time_slices = len(time_slice) self.alphas = numpy.full(num_topics, alphas) - #topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic - #the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. + # topic_chains contains for each topic a 'state space language model' object which in turn has information about each topic + # the sslm class is described below and contains information on topic-word probabilities and doc-topic probabilities. self.topic_chains = [] for topic in range(0, num_topics): sslm_ = sslm(num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics, chain_variance=chain_variance, obs_variance=obs_variance) @@ -138,7 +139,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. 
if self.corpus is not None and time_slice is not None: if initialize == 'gensim': - lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=10, alpha=self.alphas) + lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas) self.sstats = numpy.transpose(lda_model.state.sstats) if initialize == 'ldamodel': self.sstats = numpy.transpose(lda_model.state.sstats) @@ -302,6 +303,7 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, ldapost.lhood = lhood ldapost.doc = line + # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible. if iter_ == 0: doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, None, None, None, None) else: @@ -311,12 +313,14 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, if topic_suffstats != None: topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, line, topic_suffstats) + gammas[doc_index] = ldapost.gamma bound += doc_lhood doc_index += 1 doc_num += 1 return bound, gammas + def make_lda_seq_slice(self, lda, time): """ @@ -346,22 +350,35 @@ def fit_lda_seq_topics(self, topic_suffstats): return lhood - def print_topics_time(self, topic): + + def print_topic_times(self, topic, top_terms=20): + """ - prints one topic showing each time-slice + Prints one topic showing each time-slice. """ + for time in range(0, self.num_time_slices): - self.print_topics(topic, time) + self.print_topic(topic, time, top_terms) - def print_topics(self, topic, time=0, top_terms=20): + def print_topics(self, time=0, top_terms=20): + + """ + Prints all topics in a particular time-slice. + """ + for topic in range(0, self.num_topics): + self.print_topic(topic, time, top_terms) + + + def print_topic(self, topic, time=0, top_terms=20): """ Topic is the topic numner Time is for a particular time_slice top_terms is the number of terms to display """ - topic = self.topic_chains[topic].e_log_prob[time] + topic = self.topic_chains[topic].e_log_prob topic = numpy.transpose(topic) + topic = numpy.exp(topic[time]) topic = topic / topic.sum() bestn = matutils.argsort(topic, top_terms, reverse=True) beststr = [(round(topic[id_], 3), self.corpus.id2word[id_]) for id_ in bestn] @@ -381,7 +398,7 @@ def doc_topics(self, doc_number): return doc_topic[doc_number] - def get_item(self, doc): + def __getitem__(self, doc): """ TODO: To mimic the __getitem__ in ldamodel. This method is a work in progress. """ @@ -395,7 +412,7 @@ def get_item(self, doc): lhood = fit_lda_post(0, time, ldapost, self, None, None, None, None) time_lhoods.append(lhood) - return time_lhoods + return ldapost.gamma, time_lhoods # endclass LdaSeqModel @@ -545,8 +562,8 @@ def compute_expected_log_prob(self): The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. """ - for (w,t), e_log_prob in numpy.ndenumerate(self.e_log_prob): - e_log_prob = self.mean[w][t + 1] - numpy.log(self.zeta[t]) + for (w,t), val in numpy.ndenumerate(self.e_log_prob): + self.e_log_prob[w][t] = self.mean[w][t + 1] - numpy.log(self.zeta[t]) return self.e_log_prob @@ -692,7 +709,7 @@ def compute_bound(self, word_counts, totals): def update_obs(self, word_counts, totals): """ - Fucntion to perform optimization + Fucntion to perform optimization of obs. 
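+        Words whose count vector has an L2 norm below OBS_NORM_CUTOFF reuse the most
+        recently optimised obs row; every other row is fitted with scipy's conjugate
+        gradient optimiser (optimize.fmin_cg), with f_obs as the objective and df_obs
+        as its gradient.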
""" OBS_NORM_CUTOFF = 2 @@ -885,9 +902,9 @@ def update_phi(self, doc_number, time): """ num_topics = self.lda.num_topics - # digamma values dig = numpy.zeros(num_topics) + for k in range(0, num_topics): dig[k] = digamma(self.gamma[k]) @@ -907,7 +924,6 @@ def update_phi(self, doc_number, time): # subtract every element by v log_phi_row = log_phi_row - v phi_row = numpy.exp(log_phi_row) - self.log_phi[n] = log_phi_row self.phi[n] = phi_row n +=1 # increase iteration @@ -930,6 +946,7 @@ def update_gamma(self): phi_row = self.phi[n] for k in range(0, self.lda.num_topics): self.gamma[k] += phi_row[k] * count + n += 1 return self.gamma @@ -942,10 +959,11 @@ def init_lda_post(self): total = sum(count for word_id, count in self.doc) self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) - self.phi.fill(1.0 / self.lda.num_topics) + self.phi[:len(self.doc),:] = 1.0 / self.lda.num_topics # doc_weight used during DIM # ldapost.doc_weight = None + def compute_lda_lhood(self): """ compute the likelihood bound From 5b0f1931f056032b9ab178a35e4dac634dacc590 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Sun, 14 Aug 2016 18:06:28 +0530 Subject: [PATCH 32/38] Fixed printing --- gensim/models/ldaseqmodel.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 3517f5caab..5bfd292e89 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -16,8 +16,9 @@ The next steps to take this forward would be: 1) Include DIM mode. Most of the infrastructure for this is in place. - 2) Lots of heavy lifting going on in the sslm class - efforts can be made to cythonise mathematical methods. - 3) Try and make it distributed, especially around the E and M step. + 2) See if LdaPost can be replaces by LdaModel completely without breakign anything. + 3) Heavy lifting going on in the sslm class - efforts can be made to cythonise mathematical methods. + 4) Try and make it distributed, especially around the E and M step. """ @@ -309,7 +310,6 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, else: doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, None, None, None, None) - if topic_suffstats != None: topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, line, topic_suffstats) @@ -357,18 +357,23 @@ def print_topic_times(self, topic, top_terms=20): Prints one topic showing each time-slice. """ + topics = [] for time in range(0, self.num_time_slices): - self.print_topic(topic, time, top_terms) + topics.append(self.print_topic(topic, time, top_terms)) + return topics def print_topics(self, time=0, top_terms=20): """ Prints all topics in a particular time-slice. """ + + topics =[] for topic in range(0, self.num_topics): - self.print_topic(topic, time, top_terms) + topics.append(self.print_topic(topic, time, top_terms)) + return topics def print_topic(self, topic, time=0, top_terms=20): """ @@ -403,12 +408,12 @@ def __getitem__(self, doc): TODO: To mimic the __getitem__ in ldamodel. This method is a work in progress. 
""" - lda_model = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.corpus.id2word) + lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.corpus.id2word) ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) time_lhoods = [] for time in range(0, self.num_time_slices): - lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice lhood = fit_lda_post(0, time, ldapost, self, None, None, None, None) time_lhoods.append(lhood) From d2ad4ab583085a9ed3529ae720552118dca639db Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Mon, 15 Aug 2016 17:46:25 +0530 Subject: [PATCH 33/38] PEP8 --- gensim/models/ldaseqmodel.py | 165 +++++++++++------------------------ 1 file changed, 51 insertions(+), 114 deletions(-) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 5bfd292e89..b888e81f59 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -7,7 +7,7 @@ """ -Inspired by the Blei's original DTM code and paper. +Inspired by the Blei's original DTM code and paper. Original DTM C/C++ code: https://github.com/blei-lab/dtm DTM Paper: https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf @@ -30,6 +30,7 @@ from scipy import optimize import sys + class seq_corpus(utils.SaveLoad): """ @@ -43,8 +44,6 @@ class seq_corpus(utils.SaveLoad): """ def __init__(self, corpus=None, time_slice=None, id2word=None): - - self.id2word = id2word if corpus is None and self.id2word is None: raise ValueError('at least one of corpus/id2word must be specified, to establish input space dimensionality') @@ -86,12 +85,12 @@ class LdaSeqModel(utils.SaveLoad): Model persistency is achieved through inheriting utils.SaveLoad. - >>> ldaseq.save("ldaseq") + >>> ldaseq.save("ldaseq") saves the model to disk. """ - def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, + def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10): """ `corpus` is any iterable gensim corpus @@ -106,15 +105,14 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ `initalize` allows the user to decide how he wants to initialise the DTM model. Default is through gensim LDA. You can use your own sstats of an LDA model previously trained as well by specifying 'own' and passing a numpy matrix through sstats. - If you wish to just pass a previously used LDA model, pass it through `lda_model` + If you wish to just pass a previously used LDA model, pass it through `lda_model` Shape of sstats is (vocab_len, num_topics) `chain_variance` is a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the - beta distribution. - + beta distribution. + `passes` is the number of passes of the initial LdaModel. """ - if corpus is not None: self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice) self.vocab_len = len(self.corpus.id2word) @@ -155,14 +153,12 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_suffstats): - """ Method to initialize State Space Language Model, topic wise. 
""" - self.alphas = alpha for k in range(0, self.num_topics): - sstats = init_suffstats[:,k] + sstats = init_suffstats[:, k] sslm.sslm_counts_init(self.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) # initialize the below matrices only if running DIM @@ -174,17 +170,16 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s def fit_lda_seq(self, seq_corpus): """ fit an lda sequence model: - + for each time period set up lda model with E[log p(w|z)] and \alpha for each document perform posterior inference update sufficient statistics/likelihood - + maximize topics """ - LDA_INFERENCE_MAX_ITER = 25 LDASQE_EM_THRESHOLD = 1e-4 LDA_SEQ_MIN_ITER = 6 @@ -194,15 +189,14 @@ def fit_lda_seq(self, seq_corpus): vocab_len = self.vocab_len data_len = seq_corpus.num_time_slices corpus_len = seq_corpus.corpus_len - + bound = 0 convergence = LDASQE_EM_THRESHOLD + 1 - iter_ = 0 while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): - print (" EM iter " , iter_) + print (" EM iter ", iter_) print ("E Step") old_bound = bound @@ -215,7 +209,6 @@ def fit_lda_seq(self, seq_corpus): # set up variables gammas = numpy.resize(numpy.zeros(corpus_len * num_topics), (corpus_len, num_topics)) lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) - # compute the likelihood of a sequential corpus under an LDA # seq model and find the evidence lower bound. This is the E - Step bound, gammas = self.lda_seq_infer(seq_corpus, topic_suffstats, gammas, lhoods, iter_) @@ -229,8 +222,8 @@ def fit_lda_seq(self, seq_corpus): if ((bound - old_bound) < 0): if LDA_INFERENCE_MAX_ITER < 10: - LDA_INFERENCE_MAX_ITER *= 2 - print ("Bound went down, increasing iterations to" , LDA_INFERENCE_MAX_ITER) + LDA_INFERENCE_MAX_ITER *= 2 + print ("Bound went down, increasing iterations to", LDA_INFERENCE_MAX_ITER) # check for convergence convergence = numpy.fabs((bound - old_bound) / old_bound) @@ -249,17 +242,15 @@ def fit_lda_seq(self, seq_corpus): def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_): - """ Inference or E- Step. - This is used to set up the gensim LdaModel to be used for each time-slice. + This is used to set up the gensim LdaModel to be used for each time-slice. It also allows for Document Influence Model code to be written in. """ - num_topics = self.num_topics vocab_len = self.vocab_len bound = 0.0 - + lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=seq_corpus.id2word) lda.topics = numpy.array(numpy.split(numpy.zeros(vocab_len * num_topics), vocab_len)) ldapost = LdaPost(max_doc_len=seq_corpus.max_doc_len, num_topics=num_topics, lda=lda) @@ -268,20 +259,18 @@ def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_): if model == "DTM": bound, gammas = self.inferDTMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) elif model == "DIM": - self.InfluenceTotalFixed(seq_corpus); + self.InfluenceTotalFixed(seq_corpus) bound, gammas = self.inferDIMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound) return bound, gammas def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound): - """ Computes the likelihood of a sequential corpus under an LDA seq model, and return the likelihood bound. Need to pass the LdaSeq model, seq_corpus, sufficient stats, gammas and lhoods matrices previously created, and LdaModel and LdaPost class objects. 
""" - doc_index = 0 # overall doc_index in corpus time = 0 # current time-slice doc_num = 0 # doc-index in current time-lice @@ -294,7 +283,7 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, # this is used to update the time_slice and create a new lda_seq slice every new time_slice if doc_index > time_slice[time]: time += 1 - lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice + lda = self.make_lda_seq_slice(lda, time) # create lda_seq slice doc_num = 0 gam = gammas[doc_index] @@ -309,8 +298,8 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, None, None, None, None) else: doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, None, None, None, None) - - if topic_suffstats != None: + + if topic_suffstats is not None: topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, line, topic_suffstats) gammas[doc_index] = ldapost.gamma @@ -322,17 +311,14 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, def make_lda_seq_slice(self, lda, time): - """ set up the LDA model topic-word values with that of ldaseq. """ - num_topics = self.num_topics for k in range(0, num_topics): - lda.topics[:,k] = numpy.copy(self.topic_chains[k].e_log_prob[:,time]) + lda.topics[:, k] = numpy.copy(self.topic_chains[k].e_log_prob[:, time]) lda.alpha = numpy.copy(self.alphas) - return lda @@ -344,7 +330,7 @@ def fit_lda_seq_topics(self, topic_suffstats): lhood_term = 0 for k in range(0, self.num_topics): - print ("Fitting topic number" , k) + print ("Fitting topic number", k) lhood_term = sslm.fit_sslm(self.topic_chains[k], topic_suffstats[k]) lhood += lhood_term @@ -352,29 +338,26 @@ def fit_lda_seq_topics(self, topic_suffstats): def print_topic_times(self, topic, top_terms=20): - """ Prints one topic showing each time-slice. """ - topics = [] for time in range(0, self.num_time_slices): topics.append(self.print_topic(topic, time, top_terms)) return topics - def print_topics(self, time=0, top_terms=20): + def print_topics(self, time=0, top_terms=20): """ Prints all topics in a particular time-slice. """ - topics =[] for topic in range(0, self.num_topics): topics.append(self.print_topic(topic, time, top_terms)) - return topics + def print_topic(self, topic, time=0, top_terms=20): """ Topic is the topic numner @@ -387,40 +370,40 @@ def print_topic(self, topic, time=0, top_terms=20): topic = topic / topic.sum() bestn = matutils.argsort(topic, top_terms, reverse=True) beststr = [(round(topic[id_], 3), self.corpus.id2word[id_]) for id_ in bestn] - return beststr def doc_topics(self, doc_number): - """ On passing the LdaSeqModel trained ldaseq object, the doc_number of your document in the corpus, it returns the doc-topic probabilities of that document. """ doc_topic = numpy.copy(self.gammas) doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis] - return doc_topic[doc_number] def __getitem__(self, doc): """ - TODO: To mimic the __getitem__ in ldamodel. This method is a work in progress. + Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed. 
""" - lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.corpus.id2word) + lda_model.topics = numpy.array(numpy.split(numpy.zeros(self.vocab_len * self.num_topics), self.vocab_len)) ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc) time_lhoods = [] for time in range(0, self.num_time_slices): lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice - lhood = fit_lda_post(0, time, ldapost, self, None, None, None, None) + lhood = LdaPost.fit_lda_post(ldapost, 0, time, self, None, None, None, None) time_lhoods.append(lhood) - return ldapost.gamma, time_lhoods + doc_topic = ldapost.gamma / ldapost.gamma.sum() + # should even the likelihoods be returned? + return doc_topic # endclass LdaSeqModel + class sslm(utils.SaveLoad): """ The sslm class is the State Space Language Model for DTM and contains the following information: @@ -432,14 +415,13 @@ class sslm(utils.SaveLoad): `zeta` is an extra variational parameter with a value for each time-slice """ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_variance=0.5, chain_variance=0.005): - - self.vocab_len = vocab_len self.num_time_slices = num_time_slices self.obs_variance = obs_variance self.chain_variance= chain_variance self.num_topics = num_topics + # setting up matrices self.obs = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) self.e_log_prob = numpy.array(numpy.split(numpy.zeros(num_time_slices * vocab_len), vocab_len)) self.mean = numpy.array(numpy.split(numpy.zeros((num_time_slices + 1) * vocab_len), vocab_len)) @@ -460,33 +442,29 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va def update_zeta(self): - """ Updates the Zeta Variational Parameter. Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. """ - vocab_len = self.vocab_len num_time_slices = self.num_time_slices self.zeta.fill(0) - for j in range(0, num_time_slices): self.zeta[j] = numpy.sum(numpy.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) - return self.zeta - def compute_post_variance(self, word, chain_variance): + def compute_post_variance(self, word, chain_variance): """ Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] This function accepts the word to compute variance for, along with the associated sslm class object, and returns variance and fwd_variance Computes Var[\beta_{t,w}] for t = 1:T - Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) + Fwd_Variance(t) ≡ E((beta_{t,w} − mean_{t,w})^2 |beta_{t} for 1:t) = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * (fwd_variance[t - 1] + obs_variance) - Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) + Variance(t) ≡ E((beta_{t,w} − mean_cap{t,w})^2 |beta_cap{t} for 1:t) = fwd_variance[t - 1] + (fwd_variance[t - 1] / fwd_variance[t - 1] + obs_variance)^2 * (variance[t - 1] - (fwd_variance[t-1] + obs_variance)) """ @@ -495,10 +473,8 @@ def compute_post_variance(self, word, chain_variance): T = self.num_time_slices variance = self.variance[word] fwd_variance = self.fwd_variance[word] - # forward pass. 
Set initial variance very high fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST - for t in range(1, T + 1): if self.obs_variance: w = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) @@ -519,26 +495,21 @@ def compute_post_variance(self, word, chain_variance): def compute_post_mean(self, word, chain_variance): - """ Based on the Variational Kalman Filtering approach for Approximate Inference [https://www.cs.princeton.edu/~blei/papers/BleiLafferty2006a.pdf] This function accepts the word to compute mean for, along with the associated sslm class object, and returns mean and fwd_mean Essentially a forward-backward to compute E[\beta_{t,w}] for t = 1:T. - Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) + Fwd_Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:t ) = (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance ) * fwd_mean[t - 1] + (1 - (obs_variance / fwd_variance[t - 1] + chain_variance + obs_variance)) * beta - Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) + Mean(t) ≡ E(beta_{t,w} | beta_ˆ 1:T ) = fwd_mean[t - 1] + (obs_variance / fwd_variance[t - 1] + obs_variance) + (1 - obs_variance / fwd_variance[t - 1] + obs_variance)) * mean[t] """ - - T = self.num_time_slices - obs = self.obs[word] fwd_variance = self.fwd_variance[word] - mean = self.mean[word] fwd_mean = self.fwd_mean[word] @@ -556,39 +527,32 @@ def compute_post_mean(self, word, chain_variance): else: w = chain_variance / (fwd_variance[t] + chain_variance) mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] - return mean, fwd_mean def compute_expected_log_prob(self): - """ Compute the expected log probability given values of m. The appendix describes the Expectation of log-probabilities in equation 5 of the DTM paper; The below implementation is the result of solving the equation and is as implemented in the original Blei DTM code. """ - for (w,t), val in numpy.ndenumerate(self.e_log_prob): + for (w, t), val in numpy.ndenumerate(self.e_log_prob): self.e_log_prob[w][t] = self.mean[w][t + 1] - numpy.log(self.zeta[t]) - return self.e_log_prob def sslm_counts_init(self, obs_variance, chain_variance, sstats): - """ Initialize State Space Language Model with LDA sufficient statistics. """ - W = self.vocab_len T = self.num_time_slices log_norm_counts = numpy.copy(sstats) log_norm_counts = log_norm_counts / sum(log_norm_counts) - log_norm_counts = log_norm_counts + 1.0 / W log_norm_counts = log_norm_counts / sum(log_norm_counts) log_norm_counts = numpy.log(log_norm_counts) - # setting variational observations to transformed counts self.obs = (numpy.repeat(log_norm_counts, T, axis=0)).reshape(W, T) @@ -598,10 +562,10 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): # compute post variance for w in range(0, W): - self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) + self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) for w in range(0, W): - self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) + self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) self.zeta = self.update_zeta() self.e_log_prob = self.compute_expected_log_prob() @@ -612,7 +576,6 @@ def fit_sslm(self, counts): """ Fit variational distribution. 
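        This alternates update_obs with a fresh bound computation, stopping once the
        relative change in the bound falls below sslm_fit_threshold (1e-6) or after
        sslm_max_iter (2) passes, and finally recomputes e_log_prob.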
""" - W = self.vocab_len bound = 0 old_bound = 0 @@ -624,7 +587,7 @@ def fit_sslm(self, counts): for w in range(0, W): self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) - + # column sum of counts totals = counts.sum(axis=0) iter_ = 0 @@ -635,30 +598,26 @@ def fit_sslm(self, counts): if model == "DIM": bound = self.compute_bound_fixed(counts, totals) - print ("initial sslm bound is " , bound) + print ("initial sslm bound is ", bound) while converged > sslm_fit_threshold and iter_ < sslm_max_iter: iter_ += 1 old_bound = bound self.obs, self.zeta = self.update_obs(counts, totals) - if model == "DTM": bound = self.compute_bound(counts, totals) if model == "DIM": bound = self.compute_bound_fixed(counts, totals) converged = numpy.fabs((bound - old_bound) / old_bound) - print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged) self.e_log_prob = self.compute_expected_log_prob() - return bound def compute_bound(self, word_counts, totals): - """ Compute log probability bound. Forumula is as described in appendix of DTM. @@ -674,7 +633,6 @@ def compute_bound(self, word_counts, totals): ent = 0 chain_variance = self.chain_variance - for w in range(0, W): self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, chain_variance) @@ -700,28 +658,25 @@ def compute_bound(self, word_counts, totals): # w_phi_l = sslm.w_phi_l[w][t - 1] # exp_i = numpy.exp(-prev_m) # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - + term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) term_2 += word_counts[w][t - 1] * m ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) - term_3 = -totals[t - 1] * numpy.log(self.zeta[t - 1]) + term_3 = -totals[t - 1] * numpy.log(self.zeta[t - 1]) val += term_2 + term_3 + ent - term_1 return val def update_obs(self, word_counts, totals): - """ Fucntion to perform optimization of obs. """ - OBS_NORM_CUTOFF = 2 STEP_SIZE = 0.01 TOL = 1e-3 - W = self.vocab_len T = self.num_time_slices @@ -773,7 +728,6 @@ def update_obs(self, word_counts, totals): def compute_mean_deriv(self, word, time, deriv): - """ Used in helping find the optimum function. computes derivative of E[\beta_{t,w}]/d obs_{s,w} for t = 1:T. @@ -791,12 +745,10 @@ def compute_mean_deriv(self, word, time, deriv): w = self.obs_variance / (fwd_variance[t - 1] + self.chain_variance + self.obs_variance) else: w = 0.0 - val = w * deriv[t - 1] if time == t - 1: val += (1 - w) - - deriv[t]= val + deriv[t] = val for t in range(T - 1, -1, -1): if self.chain_variance == 0.0: @@ -809,7 +761,6 @@ def compute_mean_deriv(self, word, time, deriv): def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): - """ Derivation of obs which is used in derivative function [df_obs] while optimizing. 
""" @@ -833,7 +784,6 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): self.temp_vect[u] = numpy.exp(mean[u + 1] + variance[u + 1] / 2) for t in range(0, T): - mean_deriv = mean_deriv_mtx[t] term1 = 0 term2 = 0 @@ -848,7 +798,6 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv): dmean_u_prev = mean_deriv[u - 1] term1 += (mean_u - mean_u_prev) * (dmean_u - dmean_u_prev) - term2 += (word_counts[u - 1] - (totals[u - 1] * self.temp_vect[u - 1] / self.zeta[u - 1])) * dmean_u model = "DTM" @@ -884,7 +833,7 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= if self.gamma is None: self.gamma = numpy.zeros(num_topics) if self.lhood is None: - self.lhood = numpy.zeros(num_topics) + self.lhood = numpy.zeros(num_topics + 1) if max_doc_len is not None and num_topics is not None: self.phi = numpy.resize(numpy.zeros(max_doc_len * num_topics), (max_doc_len, num_topics)) @@ -897,7 +846,6 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma= def update_phi(self, doc_number, time): - """ Update variational multinomial parameters, based on a document and a time-slice. This is done based on the original Blei-LDA paper, where: @@ -905,7 +853,6 @@ def update_phi(self, doc_number, time): TODO: incorporate lee-sueng trick used in **Lee, Seung: Algorithms for non-negative matrix factorization, NIPS 2001**. """ - num_topics = self.lda.num_topics # digamma values dig = numpy.zeros(num_topics) @@ -913,7 +860,7 @@ def update_phi(self, doc_number, time): for k in range(0, num_topics): dig[k] = digamma(self.gamma[k]) - n = 0 # keep track of iterations for phi, log_phi + n = 0 # keep track of iterations for phi, log_phi for word_id, count in self.doc: for k in range(0, num_topics): self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k] @@ -937,15 +884,11 @@ def update_phi(self, doc_number, time): def update_gamma(self): - """ update variational dirichlet parameters as described in the original Blei LDA paper: gamma = alpha + sum(phi), over every topic for every word. - """ - self.gamma = numpy.copy(self.lda.alpha) - n = 0 # keep track of number of iterations for phi, log_phi for word_id, count in self.doc: phi_row = self.phi[n] @@ -957,11 +900,9 @@ def update_gamma(self): def init_lda_post(self): - """ Initialize variational posterior, does not return anything. """ - total = sum(count for word_id, count in self.doc) self.gamma.fill(self.lda.alpha[0] + float(total) / self.lda.num_topics) self.phi[:len(self.doc),:] = 1.0 / self.lda.num_topics @@ -973,7 +914,6 @@ def compute_lda_lhood(self): """ compute the likelihood bound """ - num_topics = self.lda.num_topics gamma_sum = numpy.sum(self.gamma) @@ -1000,7 +940,7 @@ def compute_lda_lhood(self): n = 0 for word_id, count in self.doc: if self.phi[n][k] > 0: - lhood_term += count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) + lhood_term += count * self.phi[n][k] * (e_log_theta_k + self.lda.topics[word_id][k] - self.log_phi[n][k]) n += 1 self.lhood[k] = lhood_term lhood += lhood_term @@ -1010,7 +950,6 @@ def compute_lda_lhood(self): return lhood def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_matrix): - """ Posterior inference for lda. 
""" @@ -1047,7 +986,6 @@ def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_mat lhood = self.compute_lda_lhood() converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) - while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: iter_ += 1 @@ -1056,9 +994,9 @@ def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_mat model = "DTM" if model == "DTM" or sslm is None: - self.phi, self.log_phi = self.update_phi(doc_number, time) + self.phi, self.log_phi = self.update_phi(doc_number, time) elif model == "DIM" and sslm is not None: - self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) + self.phi, self.log_phi = self.update_phi_fixed(doc_number, time, sslm, g3_matrix, g4_matrix, g5_matrix) lhood = self.compute_lda_lhood() converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) @@ -1067,11 +1005,9 @@ def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_mat def update_lda_seq_ss(self, time, doc, topic_suffstats): - """ Update lda sequence sufficient statistics from an lda posterior. """ - num_topics = self.lda.num_topics for k in range(0, num_topics): @@ -1085,6 +1021,7 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats): return topic_suffstats # endclass LdaPost + # the following functions are used in update_obs as the function to optimize def f_obs(x, *args): From 9422edcde4b6bcdd7eb001b28eed9269a3f6e552 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Mon, 15 Aug 2016 20:05:43 +0530 Subject: [PATCH 34/38] Notebook --- docs/notebooks/ldaseqmodel.ipynb | 1430 ++++++++++++++++++++++++++++++ 1 file changed, 1430 insertions(+) create mode 100644 docs/notebooks/ldaseqmodel.ipynb diff --git a/docs/notebooks/ldaseqmodel.ipynb b/docs/notebooks/ldaseqmodel.ipynb new file mode 100644 index 0000000000..d36460fd40 --- /dev/null +++ b/docs/notebooks/ldaseqmodel.ipynb @@ -0,0 +1,1430 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Using LdaSeqModel for DTM" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from gensim.models import ldaseqmodel\n", + "from gensim.corpora import Dictionary, bleicorpus\n", + "import numpy\n", + "from gensim import matutils" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All you need to start using DTM is an iterable gensim corpus, id2word and a list with the number of documents in each of your time-slices." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# loading our corpus and dictionary\n", + "dictionary = Dictionary.load('Corpus/news_dictionary')\n", + "corpus = bleicorpus.BleiCorpus('Corpus/news_corpus')\n", + "# the corpus used here consists of news reports for 3 months\n", + "# the first month had 438 articles, the second 430 and the last month had 456 articles\n", + "# it's very important that your corpus is saved in order of your time-slices!\n", + "\n", + "time_slice = [438, 430, 456]" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " EM iter 0\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2795842.25993\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2812881.60423 convergence is 0.00609452991812\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2816896.73671 convergence is 0.00142740898702\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2930495.62431\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2943686.33857 convergence is 0.00450118886052\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2946530.5326 convergence is 0.00096620145735\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2988475.36794\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2999832.07399 convergence is 0.00380016719362\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3001904.64337 convergence is 0.000690895135268\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3194060.29327\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3200886.8353 convergence is 0.00213726148007\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3201665.6748 convergence is 0.000243319912892\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3024297.26659\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3036608.85235 convergence is 0.00407089140919\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3039113.82323 convergence is 0.000824923788358\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/bhargavvader/Open_Source/gensim/gensim/models/ldaseqmodel.py:229: RuntimeWarning: divide by zero encountered in double_scalars\n", + " convergence = numpy.fabs((bound - old_bound) / old_bound)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 iteration lda seq bound is 12380633.093 , convergence is inf\n", + " EM iter 1\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2829286.67058\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2830997.57809 convergence is 0.000604713379885\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2831478.31217 convergence is 0.000169810843792\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2927605.52964\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2929057.40479 convergence is 
0.000495925813536\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2929402.59379 convergence is 0.00011784985956\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3004352.00625\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3005350.5243 convergence is 0.000332357209613\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3005435.11891 convergence is 2.81480023821e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3220542.7094\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3220760.43415 convergence is 6.76049856071e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3220639.56659 convergence is 3.75276446946e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3037220.67774\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3038465.1934 convergence is 0.000409754770342\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3038734.42267 convergence is 8.86069955012e-05\n", + "1 iteration lda seq bound is 12477087.7812 , convergence is 0.00779077188489\n", + " EM iter 2\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2841564.67116\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2841826.17092 convergence is 9.20266760055e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2841775.38963 convergence is 1.78692460842e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2913525.7655\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2913793.47206 convergence is 9.1884053726e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2913728.58782 convergence is 2.22679619717e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3008639.33248\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3008684.52965 convergence is 1.50224626997e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3008504.95706 convergence is 5.9684751974e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3228028.69449\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3227924.54617 convergence is 3.22637540779e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3227755.68636 convergence is 5.23121900081e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3041442.22218\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3041696.5172 convergence is 8.36100130206e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3041633.11375 convergence is 2.08447669116e-05\n", + "2 iteration lda seq bound is 12492137.3259 , convergence is 0.0012061744691\n", + " EM iter 3\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2851176.58861\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2851094.6521 convergence is 2.87377889514e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2850917.31 convergence is 6.22014100487e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 
2902424.21376\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2902404.40322 convergence is 6.82551448533e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2902252.47495 convergence is 5.2345660204e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3010377.35587\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3010217.37454 convergence is 5.3143282101e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3010013.49679 convergence is 6.77285820787e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3228520.29177\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3228412.71958 convergence is 3.33193469337e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3228273.5271 convergence is 4.31148352635e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3046148.41811\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3046125.39059 convergence is 7.55955407865e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3045993.69423 convergence is 4.32340568394e-05\n", + "3 iteration lda seq bound is 12496326.7188 , convergence is 0.000335362379951\n", + " EM iter 4\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2859704.50533\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2859532.10927 convergence is 6.02845739101e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2859332.75285 convergence is 6.97164478333e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2894057.62201\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2893952.0932 convergence is 3.64639640437e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2893800.16892 convergence is 5.24971638074e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3010946.76605\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3010773.12861 convergence is 5.7668719302e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3010598.73375 convergence is 5.79236138621e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3225779.3825\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3225703.00964 convergence is 2.36757861011e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3225603.816 convergence is 3.07510153059e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3050784.05901\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3050687.43847 convergence is 3.16707230442e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3050550.57462 convergence is 4.48632825682e-05\n", + "Starting final iterations, max iter is 500\n", + "4 iteration lda seq bound is 12497423.1916 , convergence is 1.0\n", + " EM iter 5\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2867940.01385\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2867750.00547 convergence is 6.62525653333e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq 
bound is 2867556.21842 convergence is 6.75745958157e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2888151.94195\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2888045.67103 convergence is 3.67954748786e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2887910.75423 convergence is 4.67156046258e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3010545.28465\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3010402.90674 convergence is 4.72930639881e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3010263.0772 convergence is 4.64487776458e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3221150.03434\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3221107.19895 convergence is 1.32981669086e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3221042.62666 convergence is 2.00466124495e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3054943.36651\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3054834.4927 convergence is 3.56385680671e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3054707.10529 convergence is 4.17002673314e-05\n", + "Starting final iterations, max iter is 500\n", + "5 iteration lda seq bound is 12497478.8953 , convergence is 1.0\n", + " EM iter 6\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2875890.40917\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2875705.24398 convergence is 6.43853428804e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2875525.5102 convergence is 6.25007658769e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2883511.77424\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2883418.57027 convergence is 3.23230749008e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2883306.61094 convergence is 3.88286794818e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3009632.78051\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3009521.76181 convergence is 3.68877890868e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3009415.76799 convergence is 3.52194904496e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3215875.1213\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3215875.49503 convergence is 1.16214977711e-07\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3058777.93036\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3058669.56593 convergence is 3.54273604815e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3058555.84714 convergence is 3.71791683303e-05\n", + "Bound went down, increasing iterations to 500\n", + "Starting final iterations, max iter is 500\n", + "6 iteration lda seq bound is 12497288.2591 , convergence is 1.0\n", + " EM iter 7\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2884123.59799\n", + "Computing bound, all times\n", + "1 iteration lda 
seq bound is 2883949.28156 convergence is 6.0440001425e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2883782.21302 convergence is 5.79304702412e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2880009.90625\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2879951.12844 convergence is 2.04088895185e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2879860.94548 convergence is 3.1314059533e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3008356.80984\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3008290.43213 convergence is 2.20644384588e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3008212.20925 convergence is 2.60024369493e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3209274.69329\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3209282.25234 convergence is 2.35537460394e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3209265.7869 convergence is 5.13056805011e-06\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3062701.58157\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3062602.39896 convergence is 3.23840276228e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3062502.58562 convergence is 3.25910201525e-05\n", + "Bound went down, increasing iterations to 500\n", + "Starting final iterations, max iter is 500\n", + "7 iteration lda seq bound is 12497097.2668 , convergence is 1.0\n", + " EM iter 8\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2892170.59016\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2892006.6712 convergence is 5.66767946136e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2891852.13443 convergence is 5.34358283376e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2876732.64307\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2876675.57192 convergence is 1.98388820242e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2876605.38147 convergence is 2.43998464586e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3006670.49804\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3006628.08687 convergence is 1.41056918599e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3006573.73197 convergence is 1.80783591447e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3203155.38765\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3203201.54249 convergence is 1.44091779548e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3203201.98263 convergence is 1.37406132285e-07\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3066381.72995\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3066289.15706 convergence is 3.01896174762e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3066201.46142 convergence is 2.85999251479e-05\n", + "Bound went down, increasing iterations to 500\n", + "Starting final iterations, max 
iter is 500\n", + "8 iteration lda seq bound is 12496985.2321 , convergence is 1.0\n", + " EM iter 9\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2900164.32867\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2900010.95471 convergence is 5.28845757756e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2899865.14466 convergence is 5.02791361343e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2874250.13958\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2874206.3405 convergence is 1.52384370705e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2874149.61562 convergence is 1.97358415708e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3004600.53216\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3004561.84396 convergence is 1.28763193176e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3004527.07321 convergence is 1.15726521086e-05\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3196656.37906\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3196698.30156 convergence is 1.31144847849e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3196711.87712 convergence is 4.24674436842e-06\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3070059.53023\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3069976.88679 convergence is 2.69191662117e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3069896.80993 convergence is 2.60838652128e-05\n", + "Starting final iterations, max iter is 500\n", + "9 iteration lda seq bound is 12496992.2077 , convergence is 1.0\n", + " EM iter 10\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2907997.98004\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2907852.77913 convergence is 4.9931573973e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2907717.1259 convergence is 4.66506507317e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2871827.63399\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2871790.88826 convergence is 1.27952424798e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2871747.53876 convergence is 1.50949367096e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3002492.30534\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3002486.79 convergence is 1.83692107034e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3002466.40519 convergence is 6.78930742467e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3190059.8847\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3190125.30781 convergence is 2.05084270735e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3190148.01461 convergence is 7.11784127729e-06\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3073863.76808\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3073783.92273 
convergence is 2.59755634018e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3073710.17604 convergence is 2.39921533623e-05\n", + "Starting final iterations, max iter is 500\n", + "10 iteration lda seq bound is 12496992.9633 , convergence is 1.0\n", + " EM iter 11\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2915808.82639\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2915670.2744 convergence is 4.75175128374e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2915542.01473 convergence is 4.39897721215e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2869843.58765\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2869818.28091 convergence is 8.81815958255e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2869784.25632 convergence is 1.18560094835e-05\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 3000343.24402\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3000350.62199 convergence is 2.45904098033e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3000339.1857 convergence is 3.81164940006e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3183023.35998\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3183119.89661 convergence is 3.03285946547e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3183146.78721 convergence is 8.44787570233e-06\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3077736.85728\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3077662.93373 convergence is 2.40188039466e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3077594.16651 convergence is 2.23439706476e-05\n", + "Starting final iterations, max iter is 500\n", + "11 iteration lda seq bound is 12497119.4983 , convergence is 1.0\n", + " EM iter 12\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2923565.84139\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2923431.52337 convergence is 4.59432196437e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2923307.85448 convergence is 4.2302643584e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2868002.53341\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2867979.0079 convergence is 8.20275159044e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2867953.08077 convergence is 9.04020787133e-06\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2998329.52529\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2998326.89762 convergence is 8.76378736683e-07\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3175765.32231\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3175849.99181 convergence is 2.66611323212e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3175883.52268 convergence is 1.05580773883e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3081676.58078\n", + 
"Computing bound, all times\n", + "1 iteration lda seq bound is 3081605.6288 convergence is 2.30238243609e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3081540.66537 convergence is 2.10810323045e-05\n", + "Starting final iterations, max iter is 500\n", + "12 iteration lda seq bound is 12497353.4997 , convergence is 1.0\n", + " EM iter 13\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2931245.68638\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2931115.6909 convergence is 4.4348204417e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2930996.13428 convergence is 4.07887761236e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2866515.73182\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2866504.1503 convergence is 4.0402767607e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2866486.5448 convergence is 6.14180209265e-06\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2995800.66087\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2995822.90981 convergence is 7.42670896486e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2995824.29052 convergence is 4.60879404345e-07\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3168669.7244\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3168763.697 convergence is 2.96568000097e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3168799.78985 convergence is 1.13901966509e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3085574.75288\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3085505.78116 convergence is 2.23529563551e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3085443.39597 convergence is 2.02187899996e-05\n", + "Starting final iterations, max iter is 500\n", + "13 iteration lda seq bound is 12497527.1942 , convergence is 1.0\n", + " EM iter 14\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2938909.33238\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2938783.07425 convergence is 4.29608794496e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2938665.15878 convergence is 4.0123911636e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2865095.44075\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2865092.53423 convergence is 1.01445879249e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2865079.19747 convergence is 4.65491608085e-06\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2993687.1666\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2993723.40716 convergence is 1.21056634367e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2993732.82568 convergence is 3.14608795608e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3161115.74438\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3161222.20628 convergence is 3.36785834339e-05\n", + "Computing bound, all times\n", + "2 
iteration lda seq bound is 3161262.47747 convergence is 1.2739120991e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3089526.67004\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3089459.19226 convergence is 2.18408162799e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3089398.57825 convergence is 1.96196178044e-05\n", + "Starting final iterations, max iter is 500\n", + "14 iteration lda seq bound is 12497827.0093 , convergence is 1.0\n", + " EM iter 15\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2946632.14689\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2946507.95469 convergence is 4.21471675706e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2946392.20757 convergence is 3.92828134455e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2863702.91941\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2863696.92018 convergence is 2.09491793225e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2863686.80093 convergence is 3.53363372014e-06\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2991465.87893\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2991498.66778 convergence is 1.09607987145e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2991512.41652 convergence is 4.59593530151e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3153620.84299\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3153716.09398 convergence is 3.02036926369e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3153759.39958 convergence is 1.37316086931e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3093493.42305\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3093431.28076 convergence is 2.00880627398e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3093371.20297 convergence is 1.94210819759e-05\n", + "Starting final iterations, max iter is 500\n", + "15 iteration lda seq bound is 12498194.6695 , convergence is 1.0\n", + " EM iter 16\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2954495.82244\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2954372.80696 convergence is 4.16367082427e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2954257.95349 convergence is 3.88757537753e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2862516.78185\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2862521.67018 convergence is 1.70770509298e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2862513.45485 convergence is 2.86996353244e-06\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2989270.78153\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2989291.28386 convergence is 6.85863792932e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2989305.22572 convergence is 4.66393668026e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + 
"initial sslm bound is 3145668.07828\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3145778.26162 convergence is 3.50270071216e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3145823.73404 convergence is 1.44550629194e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3097511.89807\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3097445.39084 convergence is 2.14711776591e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3097386.03745 convergence is 1.9162047451e-05\n", + "Starting final iterations, max iter is 500\n", + "16 iteration lda seq bound is 12498524.0714 , convergence is 1.0\n", + " EM iter 17\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2962646.81406\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2962524.80196 convergence is 4.11834764529e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2962410.37322 convergence is 3.86254114022e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2861636.28932\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2861636.04711 convergence is 8.46435131552e-08\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2987297.06216\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2987329.50059 convergence is 1.08587907908e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2987340.78559 convergence is 3.77761976542e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3136788.48869\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3136897.79242 convergence is 3.48457453164e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3136948.13088 convergence is 1.60472124722e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3101682.76962\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3101616.93863 convergence is 2.12242824476e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3101557.04791 convergence is 1.9309515922e-05\n", + "Starting final iterations, max iter is 500\n", + "17 iteration lda seq bound is 12498931.8507 , convergence is 1.0\n", + " EM iter 18\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2970868.16665\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2970746.5909 convergence is 4.09226333285e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2970632.17642 convergence is 3.85137130985e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2860780.51467\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2860782.68294 convergence is 7.57931186417e-07\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2985469.65535\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2985511.52565 convergence is 1.40246928852e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2985525.61642 convergence is 4.71971652931e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3127552.86873\n", + 
"Computing bound, all times\n", + "1 iteration lda seq bound is 3127687.57544 convergence is 4.30709638169e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3127737.97485 convergence is 1.61139515818e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3105989.62723\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3105922.84001 convergence is 2.15027175447e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3105863.14529 convergence is 1.92196386532e-05\n", + "Starting final iterations, max iter is 500\n", + "18 iteration lda seq bound is 12499432.6766 , convergence is 1.0\n", + " EM iter 19\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2979226.29147\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2979104.77612 convergence is 4.07875545522e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2978990.4889 convergence is 3.83629412794e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2860153.85531\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2860154.25454 convergence is 1.39583515427e-07\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2983923.85978\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2983952.47524 convergence is 9.58987681364e-06\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2983966.28026 convergence is 4.62642143986e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3117597.70848\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3117720.6202 convergence is 3.9425139874e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3117775.75501 convergence is 1.76843330248e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3110478.33768\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3110411.41494 convergence is 2.15152585637e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3110350.89326 convergence is 1.94577724663e-05\n", + "Starting final iterations, max iter is 500\n", + "19 iteration lda seq bound is 12500012.2135 , convergence is 1.0\n", + " EM iter 20\n", + "E Step\n", + "M Step\n", + "Fitting topic number 0\n", + "Computing bound, all times\n", + "initial sslm bound is 2987604.96022\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2987483.77752 convergence is 4.05618211702e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2987369.66836 convergence is 3.81957427262e-05\n", + "Fitting topic number 1\n", + "Computing bound, all times\n", + "initial sslm bound is 2859745.93607\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2859745.23825 convergence is 2.44012981221e-07\n", + "Fitting topic number 2\n", + "Computing bound, all times\n", + "initial sslm bound is 2982456.9566\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 2982495.56587 convergence is 1.29454575138e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 2982508.65216 convergence is 4.38769574537e-06\n", + "Fitting topic number 3\n", + "Computing bound, all times\n", + "initial sslm bound is 3107153.7818\n", + "Computing bound, all times\n", + "1 
iteration lda seq bound is 3107303.96895 convergence is 4.83359239876e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3107359.97018 convergence is 1.80224495892e-05\n", + "Fitting topic number 4\n", + "Computing bound, all times\n", + "initial sslm bound is 3115106.84624\n", + "Computing bound, all times\n", + "1 iteration lda seq bound is 3115039.17238 convergence is 2.17244114928e-05\n", + "Computing bound, all times\n", + "2 iteration lda seq bound is 3114977.79587 convergence is 1.97032857945e-05\n", + "Starting final iterations, max iter is 500\n", + "20 iteration lda seq bound is 12500594.8258 , convergence is 1.0\n" + ] + } + ], + "source": [ + "# now, we set up the model.\n", + "\n", + "ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=5, passes=20)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[(0.0040000000000000001, 'use'),\n", + " (0.0040000000000000001, 'users'),\n", + " (0.0040000000000000001, 'mobile'),\n", + " (0.0040000000000000001, 'technology'),\n", + " (0.0040000000000000001, 'net'),\n", + " (0.0030000000000000001, 'security'),\n", + " (0.0030000000000000001, 'software'),\n", + " (0.0030000000000000001, 'information'),\n", + " (0.0030000000000000001, 'using'),\n", + " (0.0030000000000000001, 'used'),\n", + " (0.0030000000000000001, 'like'),\n", + " (0.0030000000000000001, 'make'),\n", + " (0.0030000000000000001, 'digital'),\n", + " (0.0030000000000000001, 'internet'),\n", + " (0.0030000000000000001, 'phone'),\n", + " (0.0030000000000000001, 'online'),\n", + " (0.0030000000000000001, 'computer'),\n", + " (0.0030000000000000001, 'search'),\n", + " (0.0030000000000000001, 'system'),\n", + " (0.0030000000000000001, 'service')],\n", + " [(0.0070000000000000001, 'government'),\n", + " (0.0040000000000000001, 'blair'),\n", + " (0.0040000000000000001, 'minister'),\n", + " (0.0040000000000000001, 'labour'),\n", + " (0.0030000000000000001, 'year'),\n", + " (0.0030000000000000001, 'public'),\n", + " (0.0030000000000000001, 'last'),\n", + " (0.0030000000000000001, 'prime'),\n", + " (0.0030000000000000001, 'economic'),\n", + " (0.002, 'election'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'party'),\n", + " (0.002, 'growth'),\n", + " (0.002, 'plans'),\n", + " (0.002, 'brown'),\n", + " (0.002, 'european'),\n", + " (0.002, 'may'),\n", + " (0.002, 'market'),\n", + " (0.002, 'economy'),\n", + " (0.002, 'next')],\n", + " [(0.0080000000000000002, 'best'),\n", + " (0.0060000000000000001, 'film'),\n", + " (0.0050000000000000001, 'music'),\n", + " (0.0040000000000000001, 'last'),\n", + " (0.0040000000000000001, 'show'),\n", + " (0.0040000000000000001, 'top'),\n", + " (0.0040000000000000001, 'number'),\n", + " (0.0040000000000000001, 'first'),\n", + " (0.0030000000000000001, 'star'),\n", + " (0.0030000000000000001, 'award'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'tv'),\n", + " (0.002, 'band'),\n", + " (0.002, 'three'),\n", + " (0.002, 'including'),\n", + " (0.002, 'game'),\n", + " (0.002, 'bbc'),\n", + " (0.002, 'album'),\n", + " (0.002, 'british'),\n", + " (0.002, 'awards')],\n", + " [(0.0040000000000000001, 'court'),\n", + " (0.0030000000000000001, 'last'),\n", + " (0.0030000000000000001, 'first'),\n", + " (0.002, 'firm'),\n", + " (0.002, 'case'),\n", + " (0.002, 'oil'),\n", + " (0.002, 'company'),\n", + " (0.002, 'police'),\n", + " (0.002, 'former'),\n", + " (0.002, 'since'),\n", + " (0.002, 'yukos'),\n", + 
" (0.002, 'legal'),\n", + " (0.002, 'chief'),\n", + " (0.002, 'home'),\n", + " (0.002, 'three'),\n", + " (0.002, 'year'),\n", + " (0.002, 'rights'),\n", + " (0.002, 'russian'),\n", + " (0.002, 'part'),\n", + " (0.002, 'club')],\n", + " [(0.0050000000000000001, 'chelsea'),\n", + " (0.0050000000000000001, 'game'),\n", + " (0.0040000000000000001, 'players'),\n", + " (0.0040000000000000001, 'league'),\n", + " (0.0040000000000000001, 'think'),\n", + " (0.0040000000000000001, 'cup'),\n", + " (0.0040000000000000001, 'united'),\n", + " (0.0040000000000000001, 'arsenal'),\n", + " (0.0040000000000000001, 'club'),\n", + " (0.0040000000000000001, 'play'),\n", + " (0.0030000000000000001, 'win'),\n", + " (0.0030000000000000001, 'manager'),\n", + " (0.0030000000000000001, 'football'),\n", + " (0.0030000000000000001, 'liverpool'),\n", + " (0.0030000000000000001, 'good'),\n", + " (0.0030000000000000001, 'first'),\n", + " (0.0030000000000000001, 'last'),\n", + " (0.0030000000000000001, 'got'),\n", + " (0.0030000000000000001, 'want'),\n", + " (0.0030000000000000001, 'like')]]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# to print all topics, use `print_topics`. \n", + "\n", + "ldaseq.print_topics(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[[(0.0040000000000000001, 'use'),\n", + " (0.0040000000000000001, 'users'),\n", + " (0.0040000000000000001, 'mobile'),\n", + " (0.0040000000000000001, 'technology'),\n", + " (0.0040000000000000001, 'net'),\n", + " (0.0030000000000000001, 'security'),\n", + " (0.0030000000000000001, 'software'),\n", + " (0.0030000000000000001, 'information'),\n", + " (0.0030000000000000001, 'using'),\n", + " (0.0030000000000000001, 'used'),\n", + " (0.0030000000000000001, 'like'),\n", + " (0.0030000000000000001, 'make'),\n", + " (0.0030000000000000001, 'digital'),\n", + " (0.0030000000000000001, 'internet'),\n", + " (0.0030000000000000001, 'phone'),\n", + " (0.0030000000000000001, 'online'),\n", + " (0.0030000000000000001, 'computer'),\n", + " (0.0030000000000000001, 'search'),\n", + " (0.0030000000000000001, 'system'),\n", + " (0.0030000000000000001, 'service')],\n", + " [(0.0040000000000000001, 'use'),\n", + " (0.0040000000000000001, 'technology'),\n", + " (0.0040000000000000001, 'users'),\n", + " (0.0040000000000000001, 'mobile'),\n", + " (0.0040000000000000001, 'net'),\n", + " (0.0030000000000000001, 'software'),\n", + " (0.0030000000000000001, 'information'),\n", + " (0.0030000000000000001, 'security'),\n", + " (0.0030000000000000001, 'using'),\n", + " (0.0030000000000000001, 'digital'),\n", + " (0.0030000000000000001, 'used'),\n", + " (0.0030000000000000001, 'like'),\n", + " (0.0030000000000000001, 'make'),\n", + " (0.0030000000000000001, 'internet'),\n", + " (0.0030000000000000001, 'phone'),\n", + " (0.0030000000000000001, 'online'),\n", + " (0.0030000000000000001, 'computer'),\n", + " (0.0030000000000000001, 'system'),\n", + " (0.0030000000000000001, 'service'),\n", + " (0.002, 'broadband')],\n", + " [(0.0040000000000000001, 'use'),\n", + " (0.0040000000000000001, 'mobile'),\n", + " (0.0040000000000000001, 'technology'),\n", + " (0.0040000000000000001, 'users'),\n", + " (0.0040000000000000001, 'net'),\n", + " (0.0030000000000000001, 'software'),\n", + " (0.0030000000000000001, 'information'),\n", + " (0.0030000000000000001, 'using'),\n", + " (0.0030000000000000001, 'security'),\n", + " 
(0.0030000000000000001, 'digital'),\n",
+ " (0.0030000000000000001, 'used'),\n",
+ " (0.0030000000000000001, 'like'),\n",
+ " (0.0030000000000000001, 'make'),\n",
+ " (0.0030000000000000001, 'phone'),\n",
+ " (0.0030000000000000001, 'internet'),\n",
+ " (0.0030000000000000001, 'online'),\n",
+ " (0.0030000000000000001, 'computer'),\n",
+ " (0.0030000000000000001, 'service'),\n",
+ " (0.0030000000000000001, 'system'),\n",
+ " (0.0030000000000000001, 'broadband')]]"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# to fix a topic and see it evolve, use `print_topic_times`\n",
+ "\n",
+ "ldaseq.print_topic_times(0) # evolution of 0th topic"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 4.94926998e-05, 4.94926998e-05, 9.99802029e-01,\n",
+ " 4.94926998e-05, 4.94926998e-05])"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# to check document-topic proportions, use `doc_topics`\n",
+ "\n",
+ "ldaseq.doc_topics(244) # topic distribution of the 244th document in the corpus"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "array([ 0.00327869, 0.98688525, 0.00327869, 0.00327869, 0.00327869])"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# to check for an unseen document\n",
+ "\n",
+ "ldaseq[[(1, 1), (4, 2)]]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "# now let's compare this to the DTM wrapper.\n",
+ "from gensim.models.wrappers.dtmmodel import DtmModel\n",
+ "\n",
+ "\n",
+ "dtm_path = \"/Users/bhargavvader/Downloads/dtm_release/dtm/main\"\n",
+ "dtm_model = DtmModel(dtm_path, corpus, time_slice, num_topics=5, id2word=dictionary, initialize_lda=True)\n",
+ "dtm_model.save('dtm_news')\n",
+ "ldaseq.save('ldaseq_news')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "num_topics = 5\n",
+ "topic_term = dtm_model.lambda_[:,:,0] # the lambda matrix contains the topic-word values in log space; [:,:,0] picks the first time-slice\n",
+ "\n",
+ "def validate(topic_term):\n",
+ " topic_term = numpy.exp(topic_term)\n",
+ " topic_term = topic_term / topic_term.sum()\n",
+ " topic_term = topic_term * num_topics\n",
+ " return topic_term\n",
+ "\n",
+ "def get_topics(topic_terms, topic_number):\n",
+ " topic_terms = topic_terms[topic_number]\n",
+ " bestn = matutils.argsort(topic_terms, 20, reverse=True)\n",
+ " beststr = [dictionary[id_] for id_ in bestn]\n",
+ " return beststr\n",
+ "\n",
+ "topic_term = validate(topic_term)\n",
+ "# next is doc_topic_dist\n",
+ "doc_topic = dtm_model.gamma_\n",
+ "# next is the vocabulary, which we already have\n",
+ "\n",
+ "vocab = []\n",
+ "for i in range(0, len(dictionary)):\n",
+ " vocab.append(dictionary[i])\n",
+ "\n",
+ "# we now need term-frequency and doc_lengths\n",
+ "\n",
+ "def term_frequency(corpus, dictionary):\n",
+ " term_frequency = [0] * len(dictionary)\n",
+ " doc_lengths = []\n",
+ " for doc in corpus:\n",
+ " doc_lengths.append(len(doc))\n",
+ " for pair in doc:\n",
+ " term_frequency[pair[0]] += pair[1]\n",
+ " return term_frequency, doc_lengths\n",
+ "\n",
+ "topics_wrapper = []\n",
+ "for i in range(0, num_topics):\n",
+ " 
topics_wrapper.append(get_topics(topic_term, i))\n", + " \n", + " \n", + "term_frequency, doc_lengths = term_frequency(corpus, dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pyLDAvis\n", + "vis_wrapper = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)\n", + "pyLDAvis.display(vis_wrapper)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# now let us visualize the DTM python port.\n", + "\n", + "# getting a list of just words for each topics\n", + "dtm_tp = ldaseq.print_topics()\n", + "dtm_topics = []\n", + "for topic in dtm_tp:\n", + " topics = []\n", + " for prob, word in topic:\n", + " topics.append(word)\n", + " dtm_topics.append(topics)\n", + " \n", + "# getting dtm python doc-topic proportions\n", + "doc_topic = numpy.copy(ldaseq.gammas)\n", + "doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis]\n", + "\n", + "# getting dtm topic_word proportions for first time_slice\n", + "def get_topic_term(ldaseq, topic, time=0):\n", + " topic = numpy.transpose(ldaseq.topic_chains[topic].e_log_prob)\n", + " topic = topic[time]\n", + " topic = numpy.exp(topic)\n", + " topic = topic / topic.sum()\n", + " return topic\n", + "\n", + "# get_topic_term(ldaseq, 0).shape\n", + "topic_term =numpy.array(numpy.split(numpy.concatenate((get_topic_term(ldaseq, 0), get_topic_term(ldaseq, 1), get_topic_term(ldaseq, 2), get_topic_term(ldaseq, 3), get_topic_term(ldaseq, 4))), 5))\n", + "vis_dtm = pyLDAvis.prepare(topic_term_dists=topic_term, doc_topic_dists=doc_topic, doc_lengths=doc_lengths, vocab=vocab, term_frequency=term_frequency)\n", + "pyLDAvis.display(vis_dtm)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "cm_wrapper = CoherenceModel(topics=topics_wrapper, corpus=news_corpus, dictionary=dictionary, coherence='u_mass')\n", + "cm_DTM = CoherenceModel(topics=topics_DTM, corpus=news_corpus, dictionary=dictionary, coherence='u_mass')\n", + "\n", + "print (cm_wrapper.get_coherence())\n", + "print (cm_DTM.get_coherence())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.5.2" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} From 09bc17e0f9d2246036e4e913d05aea4051bde05f Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Mon, 15 Aug 2016 20:36:23 +0530 Subject: [PATCH 35/38] Added test_file --- gensim/models/ldaseqmodel.py | 4 +-- gensim/test/test_ldaseqmodel.py | 46 +++++++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 gensim/test/test_ldaseqmodel.py diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index b888e81f59..f4fba28e00 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -91,7 +91,7 @@ class LdaSeqModel(utils.SaveLoad): """ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10, - initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10): + initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None): """ `corpus` is any iterable gensim corpus @@ -138,7 +138,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_ # if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM. 
if self.corpus is not None and time_slice is not None: if initialize == 'gensim': - lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas) + lda_model = ldamodel.LdaModel(corpus, id2word=self.corpus.id2word, num_topics=self.num_topics, passes=passes, alpha=self.alphas, random_state=random_state) self.sstats = numpy.transpose(lda_model.state.sstats) if initialize == 'ldamodel': self.sstats = numpy.transpose(lda_model.state.sstats) diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py new file mode 100644 index 0000000000..5cd7a33cae --- /dev/null +++ b/gensim/test/test_ldaseqmodel.py @@ -0,0 +1,46 @@ +""" + +Tests to check DTM math functions and Topic-Word, Doc-Topic proportions. + +""" + +import numpy # for arrays, array broadcasting etc. +from gensim.models import ldaseqmodel, ldamodel +from gensim.corpora import Dictionary +import os.path +import unittest +import logging + + +module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder +datapath = lambda fname: os.path.join(module_path, 'test_data/DTM', fname) + + +class TestLdaSeq(unittest.TestCase): + def setUp(self): + texts = [[u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],[u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],[u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],[ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], + [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', 
u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],[u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],[u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', 
u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],[u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', 
u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],[u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', 
u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], + ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'], + ['river','water','mud','tree'],['money','transaction','bank','finance'], + ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']] + + dictionary = Dictionary(texts) + corpus = [dictionary.doc2bow(text) for text in texts] + self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], random_state=numpy.random.seed(0)) + + def testTopicWord(self): + + topics = self.ldaseq.print_topics(0) + expected_topic_word = [(0.053999999999999999, 'skills')] + self.assertAlmostEqual(topics[0][0][0], expected_topic_word[0][0], places=2) + self.assertEqual(topics[0][0][1], expected_topic_word[0][1]) + + + def testDocTopic(self): + doc_topic = self.ldaseq.doc_topics(0) + expected_doc_topic = 0.99933422103861524 + self.assertAlmostEqual(doc_topic[0], expected_doc_topic, places=2) + +if __name__ == '__main__': + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) + unittest.main() From c7e9275638c0df6b7dfa74bea9eb2d1cb7558f7a Mon 
Sep 17 00:00:00 2001
From: Bhargav Srinivasa
Date: Tue, 16 Aug 2016 15:00:15 +0530
Subject: [PATCH 36/38] Addressed suggestions

---
 gensim/models/ldaseqmodel.py          | 134 +++---
 gensim/test/test_data/DTM/sstats_test | 562 ++++++++++++++++++++++++++
 gensim/test/test_ldaseqmodel.py       |  10 +-
 3 files changed, 636 insertions(+), 70 deletions(-)
 create mode 100644 gensim/test/test_data/DTM/sstats_test

diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
index f4fba28e00..4e7e5b60b9 100644
--- a/gensim/models/ldaseqmodel.py
+++ b/gensim/models/ldaseqmodel.py
@@ -22,13 +22,14 @@
 """

-from gensim import interfaces, utils, matutils
+from gensim import utils, matutils
 from gensim.models import ldamodel
 import numpy
-import math
-from scipy.special import digamma
+from scipy.special import digamma, gammaln
 from scipy import optimize
-import sys
+import logging
+
+logger = logging.getLogger('gensim.models.ldaseqmodel')


 class seq_corpus(utils.SaveLoad):
@@ -59,7 +60,11 @@ def __init__(self, corpus=None, time_slice=None, id2word=None):

         self.corpus = corpus
         if self.corpus is not None:
-            self.corpus_len = len(corpus)
+            try:
+                self.corpus_len = len(corpus)
+            except TypeError:
+                logger.warning("input corpus stream has no len(); counting documents")
+                self.corpus_len = sum(1 for _ in corpus)  # note: this consumes one full pass over a streamed corpus

         self.time_slice = time_slice
         if self.time_slice is not None:
@@ -91,7 +96,8 @@ class LdaSeqModel(utils.SaveLoad):
     """
     def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_topics=10,
-                 initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10, random_state=None):
+                 initialize='gensim', sstats=None, lda_model=None, obs_variance=0.5, chain_variance=0.005, passes=10,
+                 random_state=None, lda_inference_max_iter=25, em_min_iter=6, em_max_iter=20):
         """
         `corpus` is any iterable gensim corpus

@@ -112,6 +118,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
         beta distribution.

         `passes` is the number of passes of the initial LdaModel.
+
+        `random_state` can be a numpy.random.RandomState object or the seed for one, for the LdaModel.
         """
         if corpus is not None:
             self.corpus = seq_corpus(corpus=corpus, id2word=id2word, time_slice=time_slice)
@@ -149,7 +157,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
             self.init_ldaseq_ss(chain_variance, obs_variance, self.alphas, self.sstats)

             # fit DTM
-            self.fit_lda_seq(self.corpus)
+            self.fit_lda_seq(self.corpus, lda_inference_max_iter, em_min_iter, em_max_iter)

@@ -157,9 +165,9 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s
         Method to initialize State Space Language Model, topic wise.
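        `init_suffstats` is expected to be a vocab_len x num_topics array, e.g. numpy.transpose(lda_model.state.sstats)
        (the sstats_test file added in this patch is such an array, 562 terms x 2 topics); column k of it seeds the
        sslm chain for topic k via sslm_counts_init.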
""" self.alphas = alpha - for k in range(0, self.num_topics): + for k, chain in enumerate(self.topic_chains): sstats = init_suffstats[:, k] - sslm.sslm_counts_init(self.topic_chains[k], topic_obs_variance, topic_chain_variance, sstats) + sslm.sslm_counts_init(chain, topic_obs_variance, topic_chain_variance, sstats) # initialize the below matrices only if running DIM # ldaseq.topic_chains[k].w_phi_l = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) @@ -167,7 +175,7 @@ def init_ldaseq_ss(self, topic_chain_variance, topic_obs_variance, alpha, init_s # ldaseq.topic_chains[k].w_phi_sq = numpy.zeros((ldaseq.vocab_len, ldaseq.num_time_slices)) - def fit_lda_seq(self, seq_corpus): + def fit_lda_seq(self, seq_corpus, lda_inference_max_iter, em_min_iter, em_max_iter): """ fit an lda sequence model: @@ -180,10 +188,8 @@ def fit_lda_seq(self, seq_corpus): maximize topics """ - LDA_INFERENCE_MAX_ITER = 25 LDASQE_EM_THRESHOLD = 1e-4 - LDA_SEQ_MIN_ITER = 6 - LDA_SEQ_MAX_ITER = 20 + num_topics = self.num_topics vocab_len = self.vocab_len @@ -194,10 +200,10 @@ def fit_lda_seq(self, seq_corpus): convergence = LDASQE_EM_THRESHOLD + 1 iter_ = 0 - while iter_ < LDA_SEQ_MIN_ITER or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= LDA_SEQ_MAX_ITER): + while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter): - print (" EM iter ", iter_) - print ("E Step") + logger.info(" EM iter ", iter_) + logger.info("E Step") old_bound = bound @@ -211,37 +217,38 @@ def fit_lda_seq(self, seq_corpus): lhoods = numpy.resize(numpy.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1)) # compute the likelihood of a sequential corpus under an LDA # seq model and find the evidence lower bound. This is the E - Step - bound, gammas = self.lda_seq_infer(seq_corpus, topic_suffstats, gammas, lhoods, iter_) + bound, gammas = self.lda_seq_infer(seq_corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter) self.gammas = gammas - print ("M Step") + logger.info("M Step") # fit the variational distribution. This is the M - Step topic_bound = self.fit_lda_seq_topics(topic_suffstats) bound += topic_bound if ((bound - old_bound) < 0): - if LDA_INFERENCE_MAX_ITER < 10: - LDA_INFERENCE_MAX_ITER *= 2 - print ("Bound went down, increasing iterations to", LDA_INFERENCE_MAX_ITER) + # if max_iter is too low, increase iterations. + if lda_inference_max_iter < 10: + lda_inference_max_iter *= 2 + logger.info("Bound went down, increasing iterations to", lda_inference_max_iter) # check for convergence convergence = numpy.fabs((bound - old_bound) / old_bound) if convergence < LDASQE_EM_THRESHOLD: - LDA_INFERENCE_MAX_ITER = 500 - print ("Starting final iterations, max iter is", LDA_INFERENCE_MAX_ITER) + lda_inference_max_iter = 500 + logger.info("Starting final iterations, max iter is", lda_inference_max_iter) convergence = 1.0 - print (iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence) + logger.info(iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence) iter_ += 1 return bound - def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_): + def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_, lda_inference_max_iter): """ Inference or E- Step. This is used to set up the gensim LdaModel to be used for each time-slice. 
@@ -257,15 +264,15 @@ def lda_seq_infer(self, seq_corpus, topic_suffstats, gammas, lhoods, iter_):
         model = "DTM"
         if model == "DTM":
-            bound, gammas = self.inferDTMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound)
+            bound, gammas = self.inferDTMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter)
         elif model == "DIM":
             self.InfluenceTotalFixed(seq_corpus)
-            bound, gammas = self.inferDIMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound)
+            bound, gammas = self.inferDIMseq(seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter)

         return bound, gammas


-    def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound):
+    def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost, iter_, bound, lda_inference_max_iter):
         """
         Computes the likelihood of a sequential corpus under an LDA seq model, and returns the likelihood bound.
         Need to pass the LdaSeq model, seq_corpus, sufficient stats, gammas and lhoods matrices previously created,
@@ -295,9 +302,9 @@ def inferDTMseq(self, seq_corpus, topic_suffstats, gammas, lhoods, lda, ldapost,
                 # TODO: replace fit_lda_post with appropriate ldamodel functions, if possible.
                 if iter_ == 0:
-                    doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, None, None, None, None)
+                    doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, None, lda_inference_max_iter=lda_inference_max_iter)
                 else:
-                    doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, None, None, None, None)
+                    doc_lhood = LdaPost.fit_lda_post(ldapost, doc_num, time, self, lda_inference_max_iter=lda_inference_max_iter)

                 if topic_suffstats is not None:
                     topic_suffstats = LdaPost.update_lda_seq_ss(ldapost, time, line, topic_suffstats)
@@ -314,8 +321,7 @@ def make_lda_seq_slice(self, lda, time):
         """
         set up the LDA model topic-word values with that of ldaseq.
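        For example, make_lda_seq_slice(lda, 0) copies each topic chain's e_log_prob column for the
        first time-slice into lda.topics, so the returned lda behaves like a plain LdaModel for that slice.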
""" - num_topics = self.num_topics - for k in range(0, num_topics): + for k in range(0, self.num_topics): lda.topics[:, k] = numpy.copy(self.topic_chains[k].e_log_prob[:, time]) lda.alpha = numpy.copy(self.alphas) @@ -329,9 +335,9 @@ def fit_lda_seq_topics(self, topic_suffstats): lhood = 0 lhood_term = 0 - for k in range(0, self.num_topics): - print ("Fitting topic number", k) - lhood_term = sslm.fit_sslm(self.topic_chains[k], topic_suffstats[k]) + for k, chain in enumerate(self.topic_chains): + logger.info("Fitting topic number", k) + lhood_term = sslm.fit_sslm(chain, topic_suffstats[k]) lhood += lhood_term return lhood @@ -394,7 +400,7 @@ def __getitem__(self, doc): time_lhoods = [] for time in range(0, self.num_time_slices): lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice - lhood = LdaPost.fit_lda_post(ldapost, 0, time, self, None, None, None, None) + lhood = LdaPost.fit_lda_post(ldapost, 0, time, self) time_lhoods.append(lhood) doc_topic = ldapost.gamma / ldapost.gamma.sum() @@ -477,19 +483,19 @@ def compute_post_variance(self, word, chain_variance): fwd_variance[0] = chain_variance * INIT_VARIANCE_CONST for t in range(1, T + 1): if self.obs_variance: - w = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) else: - w = 0 - fwd_variance[t] = w * (fwd_variance[t - 1] + chain_variance) + c = 0 + fwd_variance[t] = c * (fwd_variance[t - 1] + chain_variance) # backward pass variance[T] = fwd_variance[T] for t in range(T - 1, -1, -1): if fwd_variance[t] > 0.0: - w = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) + c = numpy.power((fwd_variance[t] / (fwd_variance[t] + chain_variance)), 2) else: - w = 0 - variance[t] = (w * (variance[t + 1] - chain_variance)) + ((1 - w) * fwd_variance[t]) + c = 0 + variance[t] = (c * (variance[t + 1] - chain_variance)) + ((1 - c) * fwd_variance[t]) return variance, fwd_variance @@ -516,17 +522,17 @@ def compute_post_mean(self, word, chain_variance): # forward fwd_mean[0] = 0 for t in range(1, T + 1): - w = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) - fwd_mean[t] = w * fwd_mean[t - 1] + (1 - w) * obs[t - 1] + c = self.obs_variance / (fwd_variance[t - 1] + chain_variance + self.obs_variance) + fwd_mean[t] = c * fwd_mean[t - 1] + (1 - c) * obs[t - 1] # backward pass mean[T] = fwd_mean[T] for t in range(T - 1, -1, -1): if chain_variance == 0.0: - w = 0.0 + c = 0.0 else: - w = chain_variance / (fwd_variance[t] + chain_variance) - mean[t] = w * fwd_mean[t] + (1 - w) * mean[t + 1] + c = chain_variance / (fwd_variance[t] + chain_variance) + mean[t] = c * fwd_mean[t] + (1 - c) * mean[t + 1] return mean, fwd_mean @@ -560,11 +566,9 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): self.obs_variance = obs_variance self.chain_variance = chain_variance - # compute post variance + # compute post variance, mean for w in range(0, W): self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) - - for w in range(0, W): self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance) self.zeta = self.update_zeta() @@ -585,8 +589,8 @@ def fit_sslm(self, counts): totals = numpy.zeros(counts.shape[1]) - for w in range(0, W): - self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance) + # computing variance, fwd_variance + self.variance, self.fwd_variance = map(numpy.array, 
         # column sum of counts
         totals = counts.sum(axis=0)
@@ -598,7 +602,7 @@ def fit_sslm(self, counts):
         if model == "DIM":
             bound = self.compute_bound_fixed(counts, totals)

-        print ("initial sslm bound is ", bound)
+        logger.info("initial sslm bound is %f", bound)

         while converged > sslm_fit_threshold and iter_ < sslm_max_iter:
             iter_ += 1
@@ -611,7 +615,7 @@ def fit_sslm(self, counts):
                 bound = self.compute_bound_fixed(counts, totals)

             converged = numpy.fabs((bound - old_bound) / old_bound)
-            print (iter_, " iteration lda seq bound is ", bound, " convergence is", converged)
+            logger.info("iteration %i: lda seq bound is %f, convergence is %f", iter_, bound, converged)

         self.e_log_prob = self.compute_expected_log_prob()
         return bound
@@ -633,15 +637,14 @@ def compute_bound(self, word_counts, totals):
         ent = 0

         chain_variance = self.chain_variance
-        for w in range(0, W):
-            self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, chain_variance)
-
+        # computing mean, fwd_mean
+        self.mean, self.fwd_mean = map(numpy.array, (zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, W)])))
         self.zeta = self.update_zeta()

         for w in range(0, W):
             val += (self.variance[w][0] - self.variance[w][T]) / 2 * chain_variance

-        print ("Computing bound, all times")
+        logger.info("Computing bound, all times")

         for t in range(1, T + 1):
             term_1 = 0.0
@@ -672,6 +675,8 @@ def compute_bound(self, word_counts, totals):
     def update_obs(self, word_counts, totals):
         """
         Function to perform optimization of obs.
+        This is by far the slowest function in the whole algorithm.
+        Replacing or improving the performance of this would greatly speed things up.
         """
         OBS_NORM_CUTOFF = 2
         STEP_SIZE = 0.01
@@ -921,7 +926,7 @@ def compute_lda_lhood(self):
         # sigma_l = 0
         # sigma_d = 0

-        lhood = math.lgamma(numpy.sum(self.lda.alpha)) - math.lgamma(gamma_sum)
+        lhood = gammaln(numpy.sum(self.lda.alpha)) - gammaln(gamma_sum)
         self.lhood[num_topics] = lhood

         # influence_term = 0
@@ -935,7 +940,7 @@ def compute_lda_lhood(self):
             # influence_term = - ((influence_topic * influence_topic + sigma_l * sigma_l) / 2.0 / (sigma_d * sigma_d))

             e_log_theta_k = digamma(self.gamma[k]) - digsum
-            lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + math.lgamma(self.gamma[k]) - math.lgamma(self.lda.alpha[k])
+            lhood_term = (self.lda.alpha[k] - self.gamma[k]) * e_log_theta_k + gammaln(self.gamma[k]) - gammaln(self.lda.alpha[k])

             # TODO: check why there's an IF
             n = 0
             for word_id, count in self.doc:
@@ -949,14 +954,13 @@ def compute_lda_lhood(self):

         return lhood

-    def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_matrix):
+    def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED=1e-8,
+                     lda_inference_max_iter=25, g=None, g3_matrix=None, g4_matrix=None, g5_matrix=None):
         """
         Posterior inference for lda.
+        g, g3, g4 and g5 are matrices used in the Document Influence Model and are not currently used.
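+        During the E-step this is normally invoked as LdaPost.fit_lda_post(ldapost, doc_num, time, ldaseq_model,
+        lda_inference_max_iter=n), with ldaseq passed as None on the very first EM iteration (see inferDTMseq above).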
""" - LDA_INFERENCE_CONVERGED = 1e-8 - LDA_INFERENCE_MAX_ITER = 25 - self.init_lda_post() # sum of counts in a doc total = sum(count for word_id, count in self.doc) @@ -986,7 +990,7 @@ def fit_lda_post(self, doc_number, time, ldaseq, g, g3_matrix, g4_matrix, g5_mat lhood = self.compute_lda_lhood() converged = numpy.fabs((lhood_old - lhood) / (lhood_old * total)) - while converged > LDA_INFERENCE_CONVERGED and iter_ <= LDA_INFERENCE_MAX_ITER: + while converged > LDA_INFERENCE_CONVERGED and iter_ <= lda_inference_max_iter: iter_ += 1 lhood_old = lhood diff --git a/gensim/test/test_data/DTM/sstats_test b/gensim/test/test_data/DTM/sstats_test new file mode 100644 index 0000000000..48166e31ef --- /dev/null +++ b/gensim/test/test_data/DTM/sstats_test @@ -0,0 +1,562 @@ +1.092443654001555575e-02 2.989075563459984597e+00 +3.383108460155363154e-03 1.996616891539844563e+00 +3.738432410516782326e-03 9.962615675894831435e-01 +1.002704902166056566e+00 9.972950978339434336e-01 +3.340271287680170176e-03 9.966597287123197813e-01 +1.002449871314430752e+00 9.975501286855691374e-01 +8.356695257481134426e-03 1.991643304742518605e+00 +1.309370676418351864e-02 3.986906293235815468e+00 +3.137260791273940256e-03 9.968627392087261452e-01 +1.200823017708829016e-02 1.987991769822911703e+00 +1.000000000000000000e+00 1.209863911587779773e-46 +3.000194513455103351e+00 1.999805486544896427e+00 +3.000987461615826746e+00 9.990125383841731432e-01 +1.000000000000000000e+00 1.215915071808483999e-46 +1.700757844563687016e+01 7.992421554363129843e+00 +1.000000000000000000e+00 1.240309436526453858e-46 +6.001785619555818130e+00 1.998214380444180982e+00 +1.000000000000000000e+00 1.224962141876083178e-46 +1.999820777049937215e+00 1.792229500627864848e-04 +4.998437621410796616e+00 1.562378589203469552e-03 +4.037905031533501443e+00 1.962094968466498557e+00 +1.000000000000000000e+00 1.233294072712612592e-46 +2.000000000000000000e+00 1.550659367209450689e-46 +1.000000000000000000e+00 1.239631097852270464e-46 +1.000000000000000000e+00 1.223654131971970673e-46 +3.003013029394103306e+00 9.969869706058966941e-01 +2.000000000000000000e+00 1.511039413594772737e-46 +1.000000000000000000e+00 1.238883267240051435e-46 +1.000000000000000000e+00 1.237196475910151841e-46 +1.000000000000000000e+00 1.224125316488377365e-46 +1.206512985692477002e+01 3.934870143075228199e+00 +3.005250132594285528e+00 1.994749867405714028e+00 +9.002107740412895964e+00 9.978922595871050349e-01 +8.002402847402054320e+00 2.997597152597945680e+00 +4.018809036764009690e+00 4.981190963235990310e+00 +2.000000000000000000e+00 1.714352301734856346e-46 +1.000000000000000000e+00 1.638741129099750070e-46 +2.002009519148556294e+00 1.997990480851443706e+00 +1.000000000000000000e+00 1.623460541769694160e-46 +1.000000000000000000e+00 1.645868712016503705e-46 +1.000000000000000000e+00 1.653105814070478824e-46 +3.998077162205545321e+00 1.922837794454700534e-03 +1.000000000000000000e+00 1.675036050505256413e-46 +1.000000000000000000e+00 1.658581341998854429e-46 +2.000000000000000000e+00 1.169979922256087353e-46 +2.997957644510762520e+00 2.042355489237303486e-03 +1.000000000000000000e+00 1.676560014349381248e-46 +1.000000000000000000e+00 1.639198649887964965e-46 +1.000000000000000000e+00 1.652487861203685329e-46 +4.998311867891332305e+00 1.688132108667998385e-03 +2.999086364543500860e+00 9.136354564987795223e-04 +2.999696536462834739e+00 3.034635371649604257e-04 +9.998559688699326653e-01 1.440311300672572885e-04 +2.999867773599241172e+00 1.322264007587989668e-04 +2.999905448278212816e+00 
9.455172178710631711e-05 +9.998521873013139771e-01 1.478126986858749846e-04 +9.998305871043033921e-01 1.694128956965717321e-04 +1.997932596192755206e+00 2.067403807244707487e-03 +9.994339718569504427e+00 3.005660281430495573e+00 +9.998322674507920116e-01 1.000167732549207988e+00 +1.212725111214719220e+01 4.872748887852806021e+00 +6.012609552263404211e+00 2.987390447736595345e+00 +9.998383748897964329e-01 1.616251102034746401e-04 +2.995950087636213954e+00 4.049912363786161047e-03 +3.000715515130669253e+00 9.992844848693303028e-01 +9.998291157956640252e-01 1.708842043359251324e-04 +9.998157099037094930e-01 1.842900962904826258e-04 +2.999726897549725102e+00 2.731024502749187523e-04 +2.015504479342879574e+00 9.844955206571202044e-01 +2.410596059386410417e+01 2.894039406135898940e+00 +9.998033418007618023e-01 1.966581992381829722e-04 +9.998075232710741389e-01 1.924767289258012479e-04 +1.002279354386198751e+00 9.977206456138011381e-01 +2.997811512683630220e+00 1.002188487316369780e+00 +2.998195806597341306e+00 1.804193402658541435e-03 +1.003555434723103712e+00 9.964445652768963990e-01 +1.997218555430012144e+00 2.781444569988119741e-03 +9.991620258750940620e-01 8.379741249060272635e-04 +1.099080248193437903e+01 1.009197518065621413e+00 +2.997894538018985866e+00 2.105461981013825173e-03 +3.010092129008588913e+00 3.989907870991411087e+00 +2.998046681907480693e+00 1.953318092519324995e-03 +9.986430903228902256e-01 1.356909677109647541e-03 +3.026085152492925978e+00 1.973914847507074466e+00 +9.988352858478318774e-01 1.164714152168156401e-03 +9.992409606975610759e-01 7.590393024389190503e-04 +1.999369355722667718e+00 6.306442773322752833e-04 +9.991072509110334732e-01 8.927490889664690390e-04 +4.004260648291602109e+00 4.995739351708398779e+00 +9.991127146583672625e-01 8.872853416327262585e-04 +9.989154885617119728e-01 1.001084511438288027e+00 +9.991522429675332440e-01 8.477570324667618989e-04 +2.997091644364889618e+00 2.908355635110812520e-03 +1.997821385771725078e+00 2.178614228274795058e-03 +2.010225552882992872e+00 9.897744471170069058e-01 +1.998360628077472168e+00 1.639371922528253152e-03 +9.991882677434840154e-01 8.117322565158860198e-04 +9.987033966946465835e-01 1.296603305353423007e-03 +9.989458158997651660e-01 1.054184100234897316e-03 +7.997583342350329083e+00 4.002416657649670917e+00 +9.988825874049864773e-01 1.117412595013351660e-03 +9.992304256320166944e-01 7.695743679833144755e-04 +9.989870394258032471e-01 1.012960574196889937e-03 +1.999795425757386003e+00 1.000204574242614219e+00 +1.997464347179166433e+00 2.535652820833518128e-03 +2.996957541224438692e+00 3.042458775561324418e-03 +9.989816162438666103e-01 1.018383756133388199e-03 +9.988789964070644567e-01 1.121003592935605319e-03 +1.010270267365846353e+00 3.989729732634153425e+00 +9.991509091699821710e-01 8.490908300177801371e-04 +9.992072230496233942e-01 7.927769503766631367e-04 +9.989062297695342485e-01 1.093770230465713339e-03 +1.002269332445513861e+00 9.977306675544861392e-01 +1.997659007869493131e+00 2.340992130507234432e-03 +9.991784968889194651e-01 8.215031110804089888e-04 +9.987818824810549279e-01 1.218117518945004271e-03 +1.998051884363636788e+00 1.948115636363505557e-03 +3.005398990266651804e+00 3.994601009733348640e+00 +3.003419706440591863e+00 1.996580293559408137e+00 +1.998359014771899034e+00 1.640985228101033908e-03 +1.998244530367741856e+00 1.755469632257978130e-03 +9.992617521686939508e-01 7.382478313059710360e-04 +1.001271884915868160e+00 9.987281150841316180e-01 +1.999070885254491836e+00 9.291147455081163759e-04 
+9.991080417663374957e-01 8.919582336625818918e-04 +2.006132525740382810e+00 1.993867474259616746e+00 +9.986405151355165488e-01 1.359484864483574999e-03 +9.990291850212226210e-01 9.708149787774489206e-04 +9.988537506281531808e-01 1.146249371846780590e-03 +9.991260523976819297e-01 8.739476023179571472e-04 +1.997620655387410737e+00 2.379344612589276431e-03 +9.992993661068275690e-01 7.006338931723443382e-04 +2.001700032948530605e+00 9.982999670514691726e-01 +9.993122663589073529e-01 6.877336410926577440e-04 +2.999220191240262068e+00 7.798087597375051106e-04 +9.990037551581054664e-01 9.962448418943342741e-04 +1.997550498324772494e+00 2.449501675227426685e-03 +2.999536614256048672e+00 1.000463385743951550e+00 +9.991215414369554182e-01 8.784585630446485952e-04 +1.996911509760880499e+00 3.088490239119342975e-03 +6.000264083607260268e+00 2.999735916392738844e+00 +1.000000000000000000e+00 1.387618972278752214e-46 +2.998581530259968098e+00 2.001418469740031902e+00 +1.000000000000000000e+00 1.420455547333084642e-46 +1.000000000000000000e+00 1.420379826638228227e-46 +4.999817091764811927e+00 2.000182908235187629e+00 +1.000000000000000000e+00 1.404916727954038099e-46 +1.000000000000000000e+00 1.400554365624505746e-46 +1.999223834776728825e+00 7.761652232711166487e-04 +1.000000000000000000e+00 1.358499241355122662e-46 +1.002841383840783918e+00 1.997158616159216304e+00 +2.000000000000000000e+00 1.416372644731821492e-46 +1.000000000000000000e+00 1.347835421818372223e-46 +1.997397650137396674e+00 2.602349862603311591e-03 +1.000000000000000000e+00 1.357524041501837515e-46 +2.997559296873980372e+00 2.440703126019304597e-03 +2.017532287072577812e+00 2.982467712927422632e+00 +1.000000000000000000e+00 1.363035240757054315e-46 +1.000000000000000000e+00 1.359283448019270130e-46 +2.000000000000000000e+00 1.453770399299675476e-46 +1.997956170495806649e+00 2.043829504193442684e-03 +2.997711068867576767e+00 2.288931132422806786e-03 +1.000000000000000000e+00 1.335003685771777313e-46 +1.000000000000000000e+00 1.376725117858755571e-46 +1.000000000000000000e+00 1.355030402701817305e-46 +4.819585856337125843e-47 1.000000000000000000e+00 +4.896954104916352142e-47 1.000000000000000000e+00 +5.976086637921076451e-03 1.994023913362079181e+00 +4.957962279200116587e-47 1.000000000000000000e+00 +9.974629116581600341e-01 1.002537088341839855e+00 +5.062172221104585915e-47 1.000000000000000000e+00 +9.981405376604133295e-01 1.001859462339586671e+00 +1.997222386591746668e+00 2.002777613408253110e+00 +4.930127088692983176e-47 1.000000000000000000e+00 +4.827835426303613263e-47 1.000000000000000000e+00 +4.908450396540412422e-47 1.000000000000000000e+00 +4.993233582324611675e+00 6.006766417675388325e+00 +4.923654511710189096e-47 1.000000000000000000e+00 +4.906420793186469672e-47 1.000000000000000000e+00 +4.779414971434191071e-47 1.000000000000000000e+00 +4.794191602963843442e-47 1.000000000000000000e+00 +5.121519036399629698e-47 1.000000000000000000e+00 +5.701818989978316328e-03 1.994298181010021320e+00 +9.975608629196323074e-01 1.002439137080367804e+00 +2.905237757825851301e-03 1.997094762242173971e+00 +4.830744238879148703e-47 1.000000000000000000e+00 +1.145125243547011167e-02 3.988548747564530039e+00 +5.484152132770268104e-47 2.000000000000000000e+00 +2.020922527590056639e+00 4.979077472409944249e+00 +4.852325547515752104e-47 1.000000000000000000e+00 +4.883797664649919460e-47 1.000000000000000000e+00 +4.895435610470865504e-47 1.000000000000000000e+00 +9.979528025302841776e-01 2.047197469715753004e-03 +9.976058991610120552e-01 
2.394100838987878004e-03 +1.012224561675880263e+00 9.877754383241197367e-01 +1.997242196611644482e+00 2.757803388355355330e-03 +9.975977767936994312e-01 2.402223206300585761e-03 +9.981794185130173913e-01 1.820581486982481836e-03 +2.014808977471377105e+00 9.851910225286228950e-01 +9.980602612774225335e-01 1.939738722577383657e-03 +9.977277673365404498e-01 2.272232663459596187e-03 +9.982401773252256305e-01 1.759822674774371223e-03 +9.982123358105545741e-01 1.787664189445417693e-03 +1.013763454563903599e+00 9.862365454360961792e-01 +9.971409172847610636e-01 2.859082715238980225e-03 +9.977559958046933186e-01 2.244004195306692224e-03 +4.011053488697122305e+00 9.889465113028771404e-01 +2.008628622686274845e+00 2.991371377313724711e+00 +9.978965062968028210e-01 2.103493703197112201e-03 +9.975352903754417522e-01 2.464709624558053969e-03 +1.000090979962959814e+00 9.999090200370399639e-01 +9.977543743073555849e-01 2.245625692644432486e-03 +2.013128536014390235e+00 9.868714639856094317e-01 +9.975838750047251180e-01 2.416124995274857284e-03 +1.997703208088337234e+00 2.296791911662399068e-03 +1.997777184502102399e+00 2.222815497897675727e-03 +9.979922615901534177e-01 2.007738409846590490e-03 +4.016810793723678330e+00 2.983189206276321670e+00 +9.978053104526063422e-01 2.194689547393774507e-03 +9.979734582853977409e-01 2.026541714602358817e-03 +2.001062580134930435e+00 9.989374198650693426e-01 +9.975179947676557912e-01 2.482005232344199671e-03 +3.177941307754205465e-03 9.968220586922457160e-01 +3.241529793316107622e-03 9.967584702066838442e-01 +3.148692154952284755e-03 9.968513078450476073e-01 +7.014590228179245698e-03 1.992985409771820704e+00 +3.515906595744435620e-03 9.964840934042553666e-01 +1.300132154876339555e-02 2.986998678451236344e+00 +4.015295704149533407e+00 3.984704295850466149e+00 +4.233913924621668899e-03 1.995766086075378531e+00 +3.661909428208910688e-03 9.963380905717911240e-01 +3.237449006591419567e-03 9.967625509934086026e-01 +1.238745828603949120e-02 2.987612541713960290e+00 +3.720746028236795750e-03 9.962792539717632012e-01 +4.002976063709749788e+00 9.970239362902499902e-01 +2.871126402875520157e-03 9.971288735971244321e-01 +3.034963761052275784e-03 9.969650362389474996e-01 +3.212557152518188616e-03 9.967874428474818860e-01 +4.636700930167085392e-03 9.953632990698328964e-01 +8.373120050646094478e-03 2.991626879949354390e+00 +3.989185382953831099e-03 9.960108146170461030e-01 +4.054296999805730162e-03 9.959457030001941380e-01 +4.357748680200410583e-03 9.956422513197996649e-01 +3.032102883750231043e-03 9.969678971162497794e-01 +1.003115194952157507e+00 9.968848050478422707e-01 +1.666777387072908290e-02 4.983332226129271625e+00 +3.065554092629207538e-03 9.969344459073707920e-01 +1.002468322020088953e+00 9.975316779799109357e-01 +4.038668513787060238e-02 1.195961331486212842e+01 +5.819445709624944223e-03 1.994180554290374996e+00 +3.983358162066140233e-03 9.960166418379338138e-01 +3.198285415576220935e-03 9.968017145844237925e-01 +3.436851425771414968e-03 9.965631485742285633e-01 +9.496364742518654395e-03 2.990503635257481196e+00 +2.016062808450457489e+00 2.983937191549542511e+00 +3.009567162320287093e+00 2.990432837679713352e+00 +5.918374496911796116e-03 1.994081625503087984e+00 +3.518290440879322505e-03 9.964817095591206320e-01 +3.418144033343013810e-03 9.965818559666570486e-01 +3.300151342547691033e-03 9.966998486574522031e-01 +3.994295494790671246e-03 9.960057045052093105e-01 +3.416599850905048009e-03 9.965834001490949667e-01 +3.356972329386859102e-03 9.966430276706130797e-01 
+3.000282421225842988e+00 9.997175787741569009e-01 +9.056865706284751918e-03 2.990943134293714945e+00 +1.002290675026068945e+00 9.977093249739310554e-01 +4.091145708739654663e-03 9.959088542912601927e-01 +1.825093218694189348e-02 4.981749067813057152e+00 +3.737685726650237083e-03 9.962623142733496584e-01 +2.921751731890475951e-03 9.970782482681095349e-01 +9.988588190527059041e-01 1.141180947293989447e-03 +9.986304567521199216e-01 1.369543247879973901e-03 +9.983973920083657472e-01 1.602607991634340213e-03 +9.984063285210268424e-01 1.593671478973150485e-03 +9.988059316171471469e-01 1.194068382852883897e-03 +1.998153730342258960e+00 1.846269657741023097e-03 +3.997137010232037380e+00 2.862989767962439546e-03 +1.007497053635540762e+00 1.992502946364459238e+00 +9.986892185605520389e-01 1.310781439447969562e-03 +9.984024870204479818e-01 1.597512979552127713e-03 +9.988601421668458213e-01 1.139857833154103413e-03 +1.002961670357248813e+00 1.997038329642751187e+00 +2.998590417856644486e+00 1.409582143355437049e-03 +9.989936241531106598e-01 1.006375846889253928e-03 +9.987357785037558333e-01 1.264221496244111176e-03 +1.010251714808831469e+00 9.897482851911687529e-01 +2.019245977681841886e+00 9.807540223181581140e-01 +9.986060605219938546e-01 1.393939478006202184e-03 +9.988702864233018897e-01 1.129713576698181902e-03 +9.987525141849326049e-01 1.247485815067243516e-03 +9.984886613853292125e-01 1.511338614670673175e-03 +1.998929883890762538e+00 1.070116109237299841e-03 +2.000000000000000000e+00 5.628010851213472823e-47 +1.013463113493671797e+00 9.865368865063280923e-01 +1.000000000000000000e+00 5.363905181271526340e-47 +1.000000000000000000e+00 5.483435919657967408e-47 +1.000000000000000000e+00 5.341923528071615718e-47 +1.000000000000000000e+00 5.432709336625410595e-47 +1.000000000000000000e+00 5.410109792678980254e-47 +1.000000000000000000e+00 5.450042465975079661e-47 +1.000000000000000000e+00 5.463461012836395391e-47 +1.000000000000000000e+00 5.407659123115096433e-47 +1.000000000000000000e+00 5.450800604772544493e-47 +1.999394891263923224e+00 6.051087360771560762e-04 +3.994734362347510004e+00 5.265637652490104577e-03 +1.000000000000000000e+00 5.385704670449697648e-47 +1.000000000000000000e+00 5.589677981264196073e-47 +1.000000000000000000e+00 5.392808800592590294e-47 +1.000000000000000000e+00 5.414534751603699097e-47 +1.000000000000000000e+00 5.409020057246302525e-47 +1.010963858471109145e+00 1.989036141528890855e+00 +1.999311430984608284e+00 6.885690153915402425e-04 +1.000000000000000000e+00 5.427873615985144791e-47 +1.000000000000000000e+00 5.348045157754681066e-47 +1.000000000000000000e+00 5.433217651490795879e-47 +1.010426404982677440e+00 1.989573595017322560e+00 +1.000000000000000000e+00 1.000000000000000000e+00 +1.000000000000000000e+00 5.382636235299113255e-47 +1.000000000000000000e+00 1.000000000000000000e+00 +1.000000000000000000e+00 5.385033285053781332e-47 +1.010498272852458745e+00 3.989501727147541477e+00 +1.000000000000000000e+00 5.394008653170593334e-47 +1.000000000000000000e+00 5.387208176351058129e-47 +1.000000000000000000e+00 5.439990202240452791e-47 +1.960581881448665459e-46 1.000000000000000000e+00 +1.997460327654513534e+00 3.002539672345486466e+00 +9.980752103877377213e-01 2.001924789612262057e+00 +1.863707642336268002e-46 1.000000000000000000e+00 +1.000838855344041933e+00 2.999161144655957845e+00 +9.984465434956540930e-01 1.553456504345977459e-03 +9.986978174077280057e-01 1.302182592271954606e-03 +9.986232508512252748e-01 1.376749148774716967e-03 +1.996970602889334589e+00 
3.029397110665502883e-03 +9.985400140970224214e-01 1.459985902977557976e-03 +2.996357944442667343e+00 3.642055557332484796e-03 +2.995298496900147711e+00 4.701503099852073764e-03 +9.983716599614101961e-01 1.628340038589744286e-03 +1.996814394392947722e+00 3.185605607052280179e-03 +9.985897780413758307e-01 1.410221958624125740e-03 +2.995809728073926426e+00 4.190271926073793791e-03 +9.984915176024338201e-01 1.508482397566071869e-03 +9.986699387751998280e-01 1.330061224800234421e-03 +9.988104709496933298e-01 1.189529050306513663e-03 +1.998350893725822974e+00 1.649106274176976054e-03 +9.984727329034832621e-01 1.527267096516674331e-03 +9.984991720224165590e-01 1.500827977583340384e-03 +9.989044041420798159e-01 1.095595857920101703e-03 +9.984825625803541715e-01 1.517437419645837563e-03 +9.983241729394628505e-01 1.675827060537104802e-03 +9.982357489776417792e-01 1.764251022358196124e-03 +2.995538813998040784e+00 1.004461186001958994e+00 +2.995561176756564858e+00 4.438823243434481501e-03 +9.989790828844368198e-01 1.020917115563336553e-03 +9.985410910444760813e-01 1.458908955523777803e-03 +1.997780803959834817e+00 2.219196040165207200e-03 +9.984759327326686584e-01 1.524067267331346339e-03 +9.982303674093269130e-01 1.769632590673108881e-03 +9.988461372671993965e-01 1.153862732800751576e-03 +9.985386844384706029e-01 1.461315561529347683e-03 +1.997781004590848397e+00 2.218995409151277610e-03 +9.989859222686811036e-01 1.014077731318940682e-03 +1.001688456353303280e+00 9.983115436466967196e-01 +3.993756504800621165e+00 6.243495199379020186e-03 +9.989724561328680030e-01 1.027543867131981790e-03 +9.981531498636873057e-01 1.846850136312674397e-03 +9.124262078922782543e-03 9.908757379210773042e-01 +7.087473507272424103e-03 9.929125264927276540e-01 +1.730200752596639316e-02 1.982697992474033555e+00 +7.588796850684373721e-03 9.924112031493156350e-01 +1.076144606437997621e-02 2.989238553935619880e+00 +8.008370332056095192e-03 9.919916296679439638e-01 +1.249727078667985797e-02 2.987502729213320052e+00 +1.823585461741900754e-02 1.981764145382580722e+00 +7.111670412860333218e-03 9.928883295871396841e-01 +6.993936802907376078e-03 9.930060631970927254e-01 +8.293679746253801791e-03 9.917063202537461652e-01 +9.161805086955949681e-03 9.908381949130440347e-01 +3.065101947232996837e-02 3.969348980527670268e+00 +8.996362109145786962e-03 9.910036378908541055e-01 +1.027766605423131342e-02 1.989722333945768362e+00 +1.157429833189310003e-02 9.884257016681069485e-01 +7.989614994016056015e-03 1.992010385005983597e+00 +7.459103172914599220e-03 9.925408968270854615e-01 +7.497664122163057686e-03 9.925023358778368721e-01 +9.103021068668826904e-03 9.908969789313312182e-01 +9.497825192457736918e-03 9.905021748075422128e-01 +1.012458955947394657e+00 9.875410440526054545e-01 +8.494679242149262219e-03 9.915053207578506944e-01 +1.060311791197339829e-02 9.893968820880265636e-01 +7.979970752170486778e-03 9.920200292478296156e-01 +9.365781290287800848e-03 9.906342187097122443e-01 +7.015553563935422558e-03 9.929844464360646494e-01 +7.697529303921076503e-03 9.923024706960789088e-01 +7.539541571406690491e-03 9.924604584285933173e-01 +7.141331812382661036e-03 9.928586681876174769e-01 +7.948552723310436940e-03 9.920514472766894798e-01 +9.269741192890723971e-03 9.907302588071091876e-01 +7.972647097131777688e-03 9.920273529028682535e-01 +1.091404959510783312e-02 1.989085950404892111e+00 +8.334431604320010484e-03 9.916655683956800971e-01 +8.457411168914697949e-03 9.915425888310851477e-01 +9.629292334126326666e-03 9.903707076658737618e-01 
+1.011513214585687326e+00 1.988486785414312896e+00 +8.953959286828473349e-03 9.910460407131714833e-01 +7.207789166760603626e-03 1.992792210833239430e+00 +5.270782711207388985e-47 1.000000000000000000e+00 +5.580180805816032638e-47 1.000000000000000000e+00 +5.539960863442175868e-47 1.000000000000000000e+00 +5.691519543662593657e-47 2.000000000000000000e+00 +5.417042488228900129e-47 1.000000000000000000e+00 +5.376860156923605548e-47 1.000000000000000000e+00 +5.368213776930869781e-47 1.000000000000000000e+00 +5.337987571368773632e-47 1.000000000000000000e+00 +9.994057524713002572e-01 1.000594247528699743e+00 +5.538325688887702500e-47 1.000000000000000000e+00 +5.344364807494659848e-47 1.000000000000000000e+00 +9.992721247892204506e-01 1.000727875210779327e+00 +5.286778760675359903e-47 1.000000000000000000e+00 +5.381208512537048970e-47 1.000000000000000000e+00 +5.171411498455629607e-47 1.000000000000000000e+00 +6.336942015015505992e-47 4.000000000000000000e+00 +9.992723456755825406e-01 1.000727654324417237e+00 +5.334341324232295843e-47 1.000000000000000000e+00 +5.383783829044589455e-47 1.000000000000000000e+00 +5.565471155797933313e-47 1.000000000000000000e+00 +5.324307053655013612e-47 1.000000000000000000e+00 +5.228269958744074204e-47 1.000000000000000000e+00 +1.003102967912408383e+00 1.996897032087591617e+00 +9.995490328857724593e-01 4.509671142275224406e-04 +9.995225344305616044e-01 4.774655694384753735e-04 +1.998866682191447142e+00 1.001133317808552858e+00 +9.991725717322195166e-01 8.274282677804393766e-04 +9.993633249109644678e-01 6.366750890355846355e-04 +9.995339818580941671e-01 4.660181419059424642e-04 +9.992364801078539305e-01 7.635198921461642824e-04 +9.994344571626554430e-01 5.655428373444534732e-04 +9.994986620746204586e-01 5.013379253795498171e-04 +9.992273366927377776e-01 7.726633072622512716e-04 +9.993652102928332059e-01 6.347897071668354759e-04 +9.992759733702318847e-01 7.240266297679713971e-04 +1.001118980243569379e+00 9.988810197564306215e-01 +9.994047252781812496e-01 5.952747218187558285e-04 +9.994816003166400176e-01 5.183996833599074060e-04 +9.994108499861265038e-01 5.891500138734384042e-04 +9.993989580413302765e-01 6.010419586697927138e-04 +9.994081723367678194e-01 5.918276632321647825e-04 +9.994396780351958443e-01 5.603219648041953211e-04 +9.993219663376287087e-01 6.780336623713374439e-04 +9.993167569313433640e-01 6.832430686565646363e-04 +9.994480592357621873e-01 5.519407642377641143e-04 +1.003460368011329873e+00 9.965396319886701271e-01 +1.002490683532780258e+00 9.975093164672195201e-01 +9.993309914674171068e-01 6.690085325830703259e-04 +9.992383878696635691e-01 7.616121303364277928e-04 +9.994928197467700670e-01 5.071802532299573814e-04 +9.994224054751338349e-01 5.775945248661254559e-04 +9.994789045065237687e-01 5.210954934762590529e-04 +9.989684961544569308e-01 1.031503845543107773e-03 +5.999491990721200096e+00 1.000508009278799904e+00 +9.992614413621229152e-01 7.385586378769903220e-04 +1.998856260006944607e+00 1.143739993055435580e-03 +1.000789404142109706e+00 9.992105958578900715e-01 +9.993129316067721479e-01 6.870683932278783077e-04 +9.993520857532155466e-01 6.479142467843919439e-04 +5.996220310589613689e+00 3.779689410385914626e-03 +9.989645520736561979e-01 1.035447926343787805e-03 +9.995352578513273523e-01 4.647421486725714090e-04 +1.998638312675599993e+00 1.361687324399918266e-03 +1.998864286532588297e+00 1.135713467411604647e-03 +9.994154047211341041e-01 5.845952788659619344e-04 +9.994449882555606068e-01 5.550117444394945350e-04 +1.004137072738518821e+00 
1.995862927261481179e+00 +9.994160422634119634e-01 5.839577365879889195e-04 +1.998675550529696032e+00 1.324449470303806862e-03 +1.001907049658891324e+00 9.980929503411086756e-01 +1.998218291976068706e+00 1.781708023931410792e-03 +4.930581962142418541e-03 1.995069418037857201e+00 +2.636058468195731468e-03 9.973639415318041879e-01 +2.433040392811023755e-03 9.975669596071889966e-01 +2.820563199857915263e-03 9.971794368001420938e-01 +2.364862873475574983e-03 9.976351371265241852e-01 +1.371805593927369121e-03 9.986281944060726445e-01 +2.835273207999831468e-03 9.971647267920000779e-01 +2.260716511374811707e-03 9.977392834886251727e-01 +3.042134693785419897e-03 9.969578653062145523e-01 +2.873398356045693894e-03 9.971266016439543911e-01 +2.421500574930267498e-03 9.975784994250697091e-01 +2.573601563316223109e-03 9.974263984366836810e-01 +2.335470311578421131e-03 9.976645296884214531e-01 +2.219121651938276481e-03 9.977808783480617283e-01 +2.548969841397718747e-03 9.974510301586021477e-01 +3.045830669696215388e-03 9.969541693303036745e-01 +5.077092028267345993e-03 1.994922907971732773e+00 +2.588340605777835618e-03 9.974116593942221609e-01 +2.504906238912875480e-03 9.974950937610871371e-01 +2.519205975746906072e-03 9.974807940242530480e-01 +2.993347521930599803e-03 9.970066524780692996e-01 +3.185515102900045130e-03 9.968144848970998950e-01 +2.182956532876598535e-03 9.978170434671232991e-01 +3.065421006780637281e-03 9.969345789932193558e-01 +2.972365071207913514e-03 9.970276349287918727e-01 +2.841359360156174747e-03 9.971586406398438296e-01 +2.522531781310038100e-03 9.974774682186899888e-01 +2.290827384413663282e-03 9.977091726155863372e-01 +3.256928425392405409e-03 9.967430715746075087e-01 +2.028693019353658249e-03 9.979713069806462888e-01 +3.706639797699594167e-03 9.962933602023004154e-01 +3.703432968304423614e-03 9.962965670316955569e-01 +2.770442014570877187e-03 9.972295579854291159e-01 +7.425282838760093561e-03 2.992574717161239573e+00 +2.835176877096028201e-03 9.971648231229038872e-01 +2.471562687491314376e-03 9.975284373125087312e-01 +5.060098271923050769e-03 1.994939901728076892e+00 +2.678929902378660542e-03 9.973210700976212761e-01 +2.970006811260944134e-03 9.970299931887389722e-01 +2.483213847836801373e-03 9.975167861521629931e-01 +2.493776758408526150e-03 9.975062232415915497e-01 +2.467263830340076132e-03 9.975327361696598727e-01 +1.897689543945637377e-03 9.981023104560543535e-01 +3.476952182203667252e-03 9.965230478177963258e-01 +3.337193371273448200e-03 9.966628066287264165e-01 +2.544823469388982420e-03 9.974551765306109985e-01 +2.462580271568771614e-03 9.975374197284312405e-01 +2.340380882327547808e-03 9.976596191176724249e-01 +2.185335770818864522e-03 9.978146642291810808e-01 +2.517370852873779524e-03 9.974826291471261541e-01 +3.130489262019590675e-03 9.968695107379803577e-01 +2.535715351468906080e-03 9.974642846485310965e-01 +2.299316940934231172e-03 9.977006830590657849e-01 +2.513145727260615060e-03 9.974868542727393095e-01 +2.588866498291569462e-03 9.974111335017085134e-01 +2.915920093548339444e-03 9.970840799064515370e-01 +3.699671171343620578e-03 9.963003288286563786e-01 +2.859505812405921214e-03 9.971404941875940953e-01 +2.642112090443233478e-03 9.973578879095565952e-01 +2.919155537061107926e-03 9.970808444629389866e-01 +3.189411004639295961e-03 9.968105889953605692e-01 +4.770987700952431486e-03 1.995229012299047255e+00 +2.101050329724805758e-03 9.978989496702751483e-01 +2.909288793848420051e-03 9.970907112061515587e-01 +2.981809299405411683e-03 9.970181907005943867e-01 
+2.300351021618102232e-03 9.976996489783818145e-01 +2.200811262226234216e-03 9.977991887377737346e-01 +3.394828342689515562e-03 9.966051716573105512e-01 +5.219029400262317075e-46 1.000000000000000000e+00 +5.303499110905681753e-46 1.000000000000000000e+00 +3.000000000000000000e+00 2.000000000000000000e+00 +1.000000000000000000e+00 1.085636710501089580e-45 +1.999999999999999778e+00 2.000000000000000000e+00 +5.000000000000000000e+00 3.999999999999999556e+00 +1.999999999999999778e+00 1.025578214001572587e-45 +1.000000000000000000e+00 1.000000000000000000e+00 +1.000000000000000000e+00 8.561152365335069377e-46 +1.000000000000000000e+00 1.115636710663362352e-45 +5.282891002260288514e-46 1.000000000000000000e+00 +6.639354372121164232e-46 1.000000000000000000e+00 +2.000000000000000000e+00 1.000000000000000000e+00 +6.891953067090714591e-46 1.000000000000000000e+00 +1.000000000000000000e+00 2.000000000000000000e+00 +2.000000000000000000e+00 2.117499853764445525e-45 +3.000000000000000000e+00 1.911144203457866006e-45 +1.000000000000000000e+00 1.536432633516627284e-45 diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 5cd7a33cae..81af4ea6ec 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -23,22 +23,22 @@ def setUp(self): ['bank','river','shore','water'],['river','water','flow','fast','tree'],['bank','water','fall','flow'],['bank','bank','water','rain','river'], ['river','water','mud','tree'],['money','transaction','bank','finance'], ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']] - + # initializing using own LDA sufficient statistics so that we get same results each time. + sstats = numpy.loadtxt(datapath('sstats_test')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] - self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], random_state=numpy.random.seed(0)) + self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats) def testTopicWord(self): topics = self.ldaseq.print_topics(0) - expected_topic_word = [(0.053999999999999999, 'skills')] + expected_topic_word = [( 0.035999999999999997, 'skills')] self.assertAlmostEqual(topics[0][0][0], expected_topic_word[0][0], places=2) self.assertEqual(topics[0][0][1], expected_topic_word[0][1]) - def testDocTopic(self): doc_topic = self.ldaseq.doc_topics(0) - expected_doc_topic = 0.99933422103861524 + expected_doc_topic = 0.00066577896138482028 self.assertAlmostEqual(doc_topic[0], expected_doc_topic, places=2) if __name__ == '__main__': From 30b4d4593efa61fa4514452029797b01d67672fb Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Tue, 16 Aug 2016 19:01:53 +0530 Subject: [PATCH 37/38] Addressed comments, cleaned code --- gensim/models/ldaseqmodel.py | 59 ++++++++++-------- gensim/test/test_data/.DS_Store | Bin 10244 -> 0 bytes .../DTM/{sstats_test => sstats_test.txt} | 0 gensim/test/test_ldaseqmodel.py | 5 +- 4 files changed, 36 insertions(+), 28 deletions(-) delete mode 100644 gensim/test/test_data/.DS_Store rename gensim/test/test_data/DTM/{sstats_test => sstats_test.txt} (100%) diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py index 4e7e5b60b9..277c6d2781 100644 --- a/gensim/models/ldaseqmodel.py +++ b/gensim/models/ldaseqmodel.py @@ -16,8 +16,9 @@ The next steps to take this forward would be: 1) Include DIM 
mode. Most of the infrastructure for this is in place. - 2) See if LdaPost can be replaces by LdaModel completely without breakign anything. + 2) See if LdaPost can be replaced by LdaModel completely without breaking anything. 3) Heavy lifting going on in the sslm class - efforts can be made to cythonise mathematical methods. + - in particular, update_obs and the optimization take a lot of time. 4) Try and make it distributed, especially around the E and M step. """ @@ -237,7 +238,7 @@ def fit_lda_seq(self, seq_corpus, lda_inference_max_iter, em_min_iter, em_max_it if convergence < LDASQE_EM_THRESHOLD: - lda_inference_max_iter = 500 + lda_inference_max_iter = numpy.inf logger.info("Starting final iterations, max iter is", lda_inference_max_iter) convergence = 1.0 @@ -366,7 +367,7 @@ def print_topics(self, time=0, top_terms=20): def print_topic(self, topic, time=0, top_terms=20): """ - Topic is the topic numner + Topic is the topic number Time is for a particular time_slice top_terms is the number of terms to display """ @@ -453,10 +454,7 @@ def update_zeta(self): Zeta is described in the appendix and is equal to sum (exp(mean[word] + Variance[word] / 2)), over every time-slice. It is the value of variational parameter zeta which maximizes the lower bound. """ - vocab_len = self.vocab_len - num_time_slices = self.num_time_slices - self.zeta.fill(0) - for j in range(0, num_time_slices): + for j, val in enumerate(self.zeta): self.zeta[j] = numpy.sum(numpy.exp(self.mean[:, j + 1] + self.variance[:, j + 1] / 2)) return self.zeta @@ -550,6 +548,7 @@ def compute_expected_log_prob(self): def sslm_counts_init(self, obs_variance, chain_variance, sstats): """ Initialize State Space Language Model with LDA sufficient statistics. + Called for each topic-chain; initializes the mean, variance and Topic-Word probabilities for the first time-slice. """ W = self.vocab_len T = self.num_time_slices @@ -575,10 +574,12 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats): self.e_log_prob = self.compute_expected_log_prob() - def fit_sslm(self, counts): - + def fit_sslm(self, sstats): """ - Fit variational distribution. + Fits variational distribution. + This is essentially the M-step. + Accepts the sstats of a particular topic as input and maximizes the values for that topic. + Updates the values in the update_obs() and compute_expected_log_prob methods.
""" W = self.vocab_len bound = 0 @@ -587,32 +588,32 @@ def fit_sslm(self, counts): sslm_max_iter = 2 converged = sslm_fit_threshold + 1 - totals = numpy.zeros(counts.shape[1]) + totals = numpy.zeros(sstats.shape[1]) # computing variance, fwd_variance self.variance, self.fwd_variance = map(numpy.array, list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)]))) - # column sum of counts - totals = counts.sum(axis=0) + # column sum of sstats + totals = sstats.sum(axis=0) iter_ = 0 model = "DTM" if model == "DTM": - bound = self.compute_bound(counts, totals) + bound = self.compute_bound(sstats, totals) if model == "DIM": - bound = self.compute_bound_fixed(counts, totals) + bound = self.compute_bound_fixed(sstats, totals) logger.info("initial sslm bound is ", bound) while converged > sslm_fit_threshold and iter_ < sslm_max_iter: iter_ += 1 old_bound = bound - self.obs, self.zeta = self.update_obs(counts, totals) + self.obs, self.zeta = self.update_obs(sstats, totals) if model == "DTM": - bound = self.compute_bound(counts, totals) + bound = self.compute_bound(sstats, totals) if model == "DIM": - bound = self.compute_bound_fixed(counts, totals) + bound = self.compute_bound_fixed(sstats, totals) converged = numpy.fabs((bound - old_bound) / old_bound) logger.info(iter_, " iteration lda seq bound is ", bound, " convergence is", converged) @@ -621,10 +622,10 @@ def fit_sslm(self, counts): return bound - def compute_bound(self, word_counts, totals): + def compute_bound(self, sstats, totals): """ Compute log probability bound. - Forumula is as described in appendix of DTM. + Forumula is as described in appendix of DTM by Blei. (formula no. 5) """ W = self.vocab_len T = self.num_time_slices @@ -663,7 +664,7 @@ def compute_bound(self, word_counts, totals): # term_1 += (numpy.power(m - prev_m - (w_phi_l * exp_i), 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) term_1 += (numpy.power(m - prev_m, 2) / (2 * chain_variance)) - (v / chain_variance) - numpy.log(chain_variance) - term_2 += word_counts[w][t - 1] * m + term_2 += sstats[w][t - 1] * m ent += numpy.log(v) / 2 # note the 2pi's cancel with term1 (see doc) term_3 = -totals[t - 1] * numpy.log(self.zeta[t - 1]) @@ -672,12 +673,15 @@ def compute_bound(self, word_counts, totals): return val - def update_obs(self, word_counts, totals): + def update_obs(self, sstats, totals): """ - Fucntion to perform optimization of obs. + Function to perform optimization of obs. Parameters are suff_stats set up in the fit_sslm method. + + TODO: This is by far the slowest function in the whole algorithm. Replacing or improving the performance of this would greatly speed things up. """ + OBS_NORM_CUTOFF = 2 STEP_SIZE = 0.01 TOL = 1e-3 @@ -690,7 +694,7 @@ def update_obs(self, word_counts, totals): norm_cutoff_obs = None for w in range(0, W): - w_counts = word_counts[w] + w_counts = sstats[w] counts_norm = 0 # now we find L2 norm of w_counts for i in range(0, len(w_counts)): @@ -717,6 +721,7 @@ def update_obs(self, word_counts, totals): model = "DTM" if model == "DTM": + # slowest part of method obs = optimize.fmin_cg(f=f_obs, fprime=df_obs, x0=obs, gtol=TOL, args=args, epsilon=STEP_SIZE, disp=0) if model == "DIM": pass @@ -1011,6 +1016,7 @@ def fit_lda_post(self, doc_number, time, ldaseq, LDA_INFERENCE_CONVERGED = 1e-8, def update_lda_seq_ss(self, time, doc, topic_suffstats): """ Update lda sequence sufficient statistics from an lda posterior. + This is very similar to the update_gamma method and uses the same formula. 
""" num_topics = self.lda.num_topics @@ -1018,7 +1024,7 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats): topic_ss = topic_suffstats[k] n = 0 for word_id, count in self.doc: - topic_ss[word_id][time] = topic_ss[word_id][time] + count * self.phi[n][k] + topic_ss[word_id][time] += count * self.phi[n][k] n += 1 topic_suffstats[k] = topic_ss @@ -1028,9 +1034,8 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats): # the following functions are used in update_obs as the function to optimize def f_obs(x, *args): - """ - Function which we are optimising for minimizing obs + Function which we are optimising for minimizing obs. """ sslm, word_counts, totals, mean_deriv_mtx, word, deriv = args # flag diff --git a/gensim/test/test_data/.DS_Store b/gensim/test/test_data/.DS_Store deleted file mode 100644 index 0533d30182c1f2823f7d097daa20527a4cd810cb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10244 zcmeHM&2AGh5T0!#QPGwHY1MFIkG(WBMckt*AoainIB+85XWK5_CQ*|#rKdaw55)uU z033Sc4d9#cCb1p6O@)L|<*jyS*V*|z{``4tZ;2>67`7`!F%hlcWczpx7stWB^Er1c z-x7G0$22SU%>aAQbwQ_B__(!PYY#pF%+tpIE4!fXVt=Swt?OtbA8m> z@X+$=;<<0uMBgnRo3$I;yZpt9a~)qu24J|3YeIco%P2{~)WE72vC@tf>#D=^jLs}C zL~D4}P)i5jWl}oDxl0-PWm5Al_3(5oaqLR34J49GN-Bu#5|L)`>d~I@SHaV)>?NM~ z+{c@JIuSi_9UYu)pfu@<_RTn>cNQ<3@MG1r>6O$}hBx;H$J;^dec-ehc^=1TZ)s5+ zcU-Jb120|}k9+h8{Izi{gSRg363VQ2fBXghNSI}{w1tlW`o~hu(kBbQ1P0sS@B+Rg z>F*4-0cM-P(bR{S9eAu?gOB2^crFy=4g=)@P!jw(`;W>RDKvYI z_6(s6)|W#UoF9xDx6mUg`mzBmWF=VcFV0jA;FN(max=i=--xEP08$Upv0@!uu<{MO z?OX8rh!5l3gT{I?xw^TXj)+7vI>wqrbchyYqLBvbc5p0XMfWR;zL?Sd>7B}R&?#{~ zanvpfTO4KIgTe~aQ8o~L1MTREcBXS0dc8&;=gW%>1zR_JBZgFRK|+@%lJv>lkT_Ge zz)I6X;+2bzG-JbT5%V%KePtD(`_6jA;p=aWtig1gnKn$ENv|gM{2=Ql9bwzUtcKu^ zhck6_sVQyUi{@CT7@kGHr@wwtnO?sSr4Oa@N+!k3abj~4A%_~VJoCh2q4?tx)#-8h zwBi^!vO-?OcL9HZoa4Uoi*s zUc|e)5xre&dV4;_o6M3b&t3^`3bYRLB(D>=zRB8`*9;Ekd8Rk=H>Q+u1P1OM1IwyK zF@OJm?f?J(y=!-9P+%Z1Fk>Jp)ZW()@SJ~JpZ${G@wfIE=VzRJ@Oxur#RV7n91mMS xF+cF<_#M7S*&bUya9_>sSXq?d_~-u&2>XA>x+m=aWe1?ID~s0GgL}09{{u=?B)I?p diff --git a/gensim/test/test_data/DTM/sstats_test b/gensim/test/test_data/DTM/sstats_test.txt similarity index 100% rename from gensim/test/test_data/DTM/sstats_test rename to gensim/test/test_data/DTM/sstats_test.txt diff --git a/gensim/test/test_ldaseqmodel.py b/gensim/test/test_ldaseqmodel.py index 81af4ea6ec..0df3198286 100644 --- a/gensim/test/test_ldaseqmodel.py +++ b/gensim/test/test_ldaseqmodel.py @@ -17,6 +17,7 @@ class TestLdaSeq(unittest.TestCase): + # we are setting up a DTM model and fitting it, and checking topic-word and doc-topic results. 
def setUp(self): texts = [[u'senior', u'studios', u'studios', u'studios', u'creators', u'award', u'mobile', u'currently', u'challenges', u'senior', u'summary', u'senior', u'motivated', u'creative', u'senior'],[u'performs', u'engineering', u'tasks', u'infrastructure', u'focusing', u'primarily', u'programming', u'interaction', u'designers', u'engineers', u'leadership', u'teams', u'teams', u'crews', u'responsibilities', u'engineering', u'quality', u'functional', u'functional', u'teams', u'organizing', u'prioritizing', u'technical', u'decisions', u'engineering', u'participates', u'participates', u'reviews', u'participates', u'hiring', u'conducting', u'interviews'],[u'feedback', u'departments', u'define', u'focusing', u'engineering', u'teams', u'crews', u'facilitate', u'engineering', u'departments', u'deadlines', u'milestones', u'typically', u'spends', u'designing', u'developing', u'updating', u'bugs', u'mentoring', u'engineers', u'define', u'schedules', u'milestones', u'participating'],[ u'reviews', u'interviews', u'sized', u'teams', u'interacts', u'disciplines', u'knowledge', u'skills', u'knowledge', u'knowledge', u'xcode', u'scripting', u'debugging', u'skills', u'skills', u'knowledge', u'disciplines', u'animation', u'networking', u'expertise', u'competencies', u'oral', u'skills', u'management', u'skills', u'proven', u'effectively', u'teams', u'deadline', u'environment', u'bachelor', u'minimum', u'shipped', u'leadership', u'teams', u'location', u'resumes', u'jobs', u'candidates', u'openings', u'jobs'], [u'maryland', u'client', u'producers', u'electricity', u'operates', u'storage', u'utility', u'retail', u'customers', u'engineering', u'consultant', u'maryland', u'summary', u'technical', u'technology', u'departments', u'expertise', u'maximizing', u'output', u'reduces', u'operating', u'participates', u'areas', u'engineering', u'conducts', u'testing', u'solve', u'supports', u'environmental', u'understands', u'objectives', u'operates', u'responsibilities', u'handles', u'complex', u'engineering', u'aspects', u'monitors', u'quality', u'proficiency', u'optimization', u'recommendations', u'supports', u'personnel', u'troubleshooting', u'commissioning', u'startup', u'shutdown', u'supports', u'procedure', u'operating', u'units', u'develops', u'simulations', u'troubleshooting', u'tests', u'enhancing', u'solving', u'develops', u'estimates', u'schedules', u'scopes', u'understands', u'technical', u'management', u'utilize', u'routine', u'conducts', u'hazards', u'utilizing', u'hazard', u'operability', u'methodologies', u'participates', u'startup', u'reviews', u'pssr', u'participate', u'teams', u'participate', u'regulatory', u'audits', u'define', u'scopes', u'budgets', u'schedules', u'technical', u'management', u'environmental', u'awareness', u'interfacing', u'personnel', u'interacts', u'regulatory', u'departments', u'input', u'objectives', u'identifying', u'introducing', u'concepts', u'solutions', u'peers', u'customers', u'coworkers', u'knowledge', u'skills', u'engineering', u'quality', u'engineering'], [u'commissioning', u'startup', u'knowledge', u'simulators', u'technologies', u'knowledge', u'engineering', u'techniques', u'disciplines', u'leadership', u'skills', u'proven', u'engineers', u'oral', u'skills', u'technical', u'skills', u'analytically', u'solve', u'complex', u'interpret', u'proficiency', u'simulation', u'knowledge', u'applications', u'manipulate', u'applications', u'engineering'],[u'calculations', u'programs', u'matlab', u'excel', u'independently', u'environment', u'proven', u'skills', 
u'effectively', u'multiple', u'tasks', u'planning', u'organizational', u'management', u'skills', u'rigzone', u'jobs', u'developer', u'exceptional', u'strategies', u'junction', u'exceptional', u'strategies', u'solutions', u'solutions', u'biggest', u'insurers', u'operates', u'investment'], [u'vegas', u'tasks', u'electrical', u'contracting', u'expertise', u'virtually', u'electrical', u'developments', u'institutional', u'utilities', u'technical', u'experts', u'relationships', u'credibility', u'contractors', u'utility', u'customers', u'customer', u'relationships', u'consistently', u'innovations', u'profile', u'construct', u'envision', u'dynamic', u'complex', u'electrical', u'management', u'grad', u'internship', u'electrical', u'engineering', u'infrastructures', u'engineers', u'documented', u'management', u'engineering', u'quality', u'engineering', u'electrical', u'engineers', u'complex', u'distribution', u'grounding', u'estimation', u'testing', u'procedures', u'voltage', u'engineering'],[u'troubleshooting', u'installation', u'documentation', u'bsee', u'certification', u'electrical', u'voltage', u'cabling', u'electrical', u'engineering', u'candidates', u'electrical', u'internships', u'oral', u'skills', u'organizational', u'prioritization', u'skills', u'skills', u'excel', u'cadd', u'calculation', u'autocad', u'mathcad', u'skills', u'skills', u'customer', u'relationships', u'solving', u'ethic', u'motivation', u'tasks', u'budget', u'affirmative', u'diversity', u'workforce', u'gender', u'orientation', u'disability', u'disabled', u'veteran', u'vietnam', u'veteran', u'qualifying', u'veteran', u'diverse', u'candidates', u'respond', u'developing', u'workplace', u'reflects', u'diversity', u'communities', u'reviews', u'electrical', u'contracting', u'southwest', u'electrical', u'contractors'], [u'intern', u'electrical', u'engineering', u'idexx', u'laboratories', u'validating', u'idexx', u'integrated', u'hardware', u'entails', u'planning', u'debug', u'validation', u'engineers', u'validation', u'methodologies', u'healthcare', u'platforms', u'brightest', u'solve', u'challenges', u'innovation', u'technology', u'idexx', u'intern', u'idexx', u'interns', u'supplement', u'interns', u'teams', u'roles', u'competitive', u'interns', u'idexx', u'interns', u'participate', u'internships', u'mentors', u'seminars', u'topics', u'leadership', u'workshops', u'relevant', u'planning', u'topics', u'intern', u'presentations', u'mixers', u'applicants', u'ineligible', u'laboratory', u'compliant', u'idexx', u'laboratories', u'healthcare', u'innovation', u'practicing', u'veterinarians', u'diagnostic', u'technology', u'idexx', u'enhance', u'veterinarians', u'efficiency', u'economically', u'idexx', u'worldwide', u'diagnostic', u'tests', u'tests', u'quality', u'headquartered', u'idexx', u'laboratories', u'employs', u'customers', u'qualifications', u'applicants', u'idexx', u'interns', u'potential', u'demonstrated', u'portfolio', u'recommendation', u'resumes', u'marketing', u'location', u'americas', u'verification', u'validation', u'schedule', u'overtime', u'idexx', u'laboratories', u'reviews', u'idexx', u'laboratories', u'nasdaq', u'healthcare', u'innovation', u'practicing', u'veterinarians'], [u'location', u'duration', u'temp', u'verification', u'validation', u'tester', u'verification', u'validation', u'middleware', u'specifically', u'testing', u'applications', u'clinical', u'laboratory', u'regulated', u'environment', u'responsibilities', u'complex', u'hardware', u'testing', u'clinical', u'analyzers', u'laboratory', u'graphical', 
u'interfaces', u'complex', u'sample', u'sequencing', u'protocols', u'developers', u'correction', u'tracking', u'tool', u'timely', u'troubleshoot', u'testing', u'functional', u'manual', u'automated', u'participate', u'ongoing'],[u'testing', u'coverage', u'planning', u'documentation', u'testing', u'validation', u'corrections', u'monitor', u'implementation', u'recurrence', u'operating', u'statistical', u'quality', u'testing', u'global', u'multi', u'teams', u'travel', u'skills', u'concepts', u'waterfall', u'agile', u'methodologies', u'debugging', u'skills', u'complex', u'automated', u'instrumentation', u'environment', u'hardware', u'mechanical', u'components', u'tracking', u'lifecycle', u'management', u'quality', u'organize', u'define', u'priorities', u'organize', u'supervision', u'aggressive', u'deadlines', u'ambiguity', u'analyze', u'complex', u'situations', u'concepts', u'technologies', u'verbal', u'skills', u'effectively', u'technical', u'clinical', u'diverse', u'strategy', u'clinical', u'chemistry', u'analyzer', u'laboratory', u'middleware', u'basic', u'automated', u'testing', u'biomedical', u'engineering', u'technologists', u'laboratory', u'technology', u'availability', u'click', u'attach'], [u'scientist', u'linux', u'asrc', u'scientist', u'linux', u'asrc', u'technology', u'solutions', u'subsidiary', u'asrc', u'engineering', u'technology', u'contracts'], [u'multiple', u'agencies', u'scientists', u'engineers', u'management', u'personnel', u'allows', u'solutions', u'complex', u'aeronautics', u'aviation', u'management', u'aviation', u'engineering', u'hughes', u'technical', u'technical', u'aviation', u'evaluation', u'engineering', u'management', u'technical', u'terminal', u'surveillance', u'programs', u'currently', u'scientist', u'travel', u'responsibilities', u'develops', u'technology', u'modifies', u'technical', u'complex', u'reviews', u'draft', u'conformity', u'completeness', u'testing', u'interface', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'skills', u'travel', u'programming', u'linux', u'environment', u'cisco', u'knowledge', u'terminal', u'environment', u'clearance', u'clearance', u'input', u'output', u'digital', u'automatic', u'terminal', u'management', u'controller', u'termination', u'testing', u'evaluating', u'policies', u'procedure', u'interface', u'installation', u'verification', u'certification', u'core', u'avionic', u'programs', u'knowledge', u'procedural', u'testing', u'interfacing', u'hardware', u'regression', u'impact', u'reliability', u'maintainability', u'factors', u'standardization', u'missions', u'asrc', u'subsidiaries', u'affirmative', u'employers', u'applicants', u'disability', u'veteran', u'technology', u'location', u'airport', u'bachelor', u'schedule', u'travel', u'contributor', u'management', u'asrc', u'reviews'], [u'technical', u'solarcity', u'niche', u'vegas', u'overview', u'resolving', u'customer', u'clients', u'expanding', u'engineers', u'developers', u'responsibilities', u'knowledge', u'planning', u'adapt', u'dynamic', u'environment', u'inventive', u'creative', u'solarcity', u'lifecycle', u'responsibilities', u'technical', u'analyzing', u'diagnosing', u'troubleshooting', u'customers', u'ticketing', u'console', u'escalate', u'knowledge', u'engineering', u'timely', u'basic', u'phone', u'functionality', u'customer', u'tracking', u'knowledgebase', u'rotation', u'configure', u'deployment', u'sccm', u'technical', u'deployment', u'deploy', u'hardware', u'solarcity', u'bachelor', u'knowledge', u'dell', 
u'laptops', u'analytical', u'troubleshooting', u'solving', u'skills', u'knowledge', u'databases', u'preferably', u'server', u'preferably', u'monitoring', u'suites', u'documentation', u'procedures', u'knowledge', u'entries', u'verbal', u'skills', u'customer', u'skills', u'competitive', u'solar', u'package', u'insurance', u'vacation', u'savings', u'referral', u'eligibility', u'equity', u'performers', u'solarcity', u'affirmative', u'diversity', u'workplace', u'applicants', u'orientation', u'disability', u'veteran', u'careerrookie'], [u'embedded', u'exelis', u'junction', u'exelis', u'embedded', u'acquisition', u'networking', u'capabilities', u'classified', u'customer', u'motivated', u'develops', u'tests', u'innovative', u'solutions', u'minimal', u'supervision', u'paced', u'environment', u'enjoys', u'assignments', u'interact', u'multi', u'disciplined', u'challenging', u'focused', u'embedded', u'developments', u'spanning', u'engineering', u'lifecycle', u'specification', u'enhancement', u'applications', u'embedded', u'freescale', u'applications', u'android', u'platforms', u'interface', u'customers', u'developers', u'refine', u'specifications', u'architectures'],[u'java', u'programming', u'scripts', u'python', u'debug', u'debugging', u'emulators', u'regression', u'revisions', u'specialized', u'setups', u'capabilities', u'subversion', u'technical', u'documentation', u'multiple', u'engineering', u'techexpousa', u'reviews'], [u'modeler', u'semantic', u'modeling', u'models', u'skills', u'ontology', u'resource', u'framework', u'schema', u'technologies', u'hadoop', u'warehouse', u'oracle', u'relational', u'artifacts', u'models', u'dictionaries', u'models', u'interface', u'specifications', u'documentation', u'harmonization', u'mappings', u'aligned', u'coordinate', u'technical', u'peer', u'reviews', u'stakeholder', u'communities', u'impact', u'domains', u'relationships', u'interdependencies', u'models', u'define', u'analyze', u'legacy', u'models', u'corporate', u'databases', u'architectural', u'alignment', u'customer', u'expertise', u'harmonization', u'modeling', u'modeling', u'consulting', u'stakeholders', u'quality', u'models', u'storage', u'agile', u'specifically', u'focus', u'modeling', u'qualifications', u'bachelors', u'accredited', u'modeler', u'encompass', u'evaluation', u'skills', u'knowledge', u'modeling', u'techniques', u'resource', u'framework', u'schema', u'technologies', u'unified', u'modeling', u'technologies', u'schemas', u'ontologies', u'sybase', u'knowledge', u'skills', u'interpersonal', u'skills', u'customers', u'clearance', u'applicants', u'eligibility', u'classified', u'clearance', u'polygraph', u'techexpousa', u'solutions', u'partnership', u'solutions', u'integration'], [u'technologies', u'junction', u'develops', u'maintains', u'enhances', u'complex', u'diverse', u'intensive', u'analytics', u'algorithm', u'manipulation', u'management', u'documented', u'individually', u'reviews', u'tests', u'components', u'adherence', u'resolves', u'utilizes', u'methodologies', u'environment', u'input', u'components', u'hardware', u'offs', u'reuse', u'cots', u'gots', u'synthesis', u'components', u'tasks', u'individually', u'analyzes', u'modifies', u'debugs', u'corrects', u'integrates', u'operating', u'environments', u'develops', u'queries', u'databases', u'repositories', u'recommendations', u'improving', u'documentation', u'develops', u'implements', u'algorithms', u'functional', u'assists', u'developing', u'executing', u'procedures', u'components', u'reviews', u'documentation', u'solutions', 
u'analyzing', u'conferring', u'users', u'engineers', u'analyzing', u'investigating', u'areas', u'adapt', u'hardware', u'mathematical', u'models', u'predict', u'outcome', u'implement', u'complex', u'database', u'repository', u'interfaces', u'queries', u'bachelors', u'accredited', u'substituted', u'bachelors', u'firewalls', u'ipsec', u'vpns', u'technology', u'administering', u'servers', u'apache', u'jboss', u'tomcat', u'developing', u'interfaces', u'firefox', u'internet', u'explorer', u'operating', u'mainframe', u'linux', u'solaris', u'virtual', u'scripting', u'programming', u'oriented', u'programming', u'ajax', u'script', u'procedures', u'cobol', u'cognos', u'fusion', u'focus', u'html', u'java', u'java', u'script', u'jquery', u'perl', u'visual', u'basic', u'powershell', u'cots', u'cots', u'oracle', u'apex', u'integration', u'competitive', u'package', u'bonus', u'corporate', u'equity', u'tuition', u'reimbursement', u'referral', u'bonus', u'holidays', u'insurance', u'flexible', u'disability', u'insurance'], [u'technologies', u'disability', u'accommodation', u'recruiter', u'techexpousa'], @@ -24,11 +25,12 @@ def setUp(self): ['river','water','mud','tree'],['money','transaction','bank','finance'], ['bank','borrow','money'], ['bank','finance'], ['finance','money','sell','bank'],['borrow','sell'],['bank','loan','sell']] # initializing using own LDA sufficient statistics so that we get same results each time. - sstats = numpy.loadtxt(datapath('sstats_test')) + sstats = numpy.loadtxt(datapath('sstats_test.txt')) dictionary = Dictionary(texts) corpus = [dictionary.doc2bow(text) for text in texts] self.ldaseq = ldaseqmodel.LdaSeqModel(corpus = corpus , id2word= dictionary, num_topics=2, time_slice=[10, 10, 11], initialize='own', sstats=sstats) + # testing topic word proportions def testTopicWord(self): topics = self.ldaseq.print_topics(0) @@ -36,6 +38,7 @@ def setUp(self): self.assertAlmostEqual(topics[0][0][0], expected_topic_word[0][0], places=2) self.assertEqual(topics[0][0][1], expected_topic_word[0][1]) + # testing document-topic proportions def testDocTopic(self): doc_topic = self.ldaseq.doc_topics(0) expected_doc_topic = 0.00066577896138482028 From 9c7b0eb02939dae8ecbb55bc601df103c4e92e46 Mon Sep 17 00:00:00 2001 From: Bhargav Srinivasa Date: Wed, 17 Aug 2016 00:29:21 +0530 Subject: [PATCH 38/38] Updated Notebook --- docs/notebooks/ldaseqmodel.ipynb | 1503 +++++++++--------------------- gensim/models/ldaseqmodel.py | 13 +- 2 files changed, 471 insertions(+), 1045 deletions(-) diff --git a/docs/notebooks/ldaseqmodel.ipynb b/docs/notebooks/ldaseqmodel.ipynb index d36460fd40..50b1465a3f 100644 --- a/docs/notebooks/ldaseqmodel.ipynb +++ b/docs/notebooks/ldaseqmodel.ipynb @@ -1,5 +1,63 @@ { "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Dynamic Topic Models" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Imagine you have a gigantic corpus which spans a couple of years. You want to find semantically similar documents; one from the very beginning of your time-line, and one from the very end. How would you do it?\n", + "This is where Dynamic Topic Models come in. By having a time-based element to topics, context is preserved while key-words may change.\n", + "\n", + "Dynamic Topic Models are used to model the evolution of topics in a corpus over time. The Dynamic Topic Model is part of a class of probabilistic topic models, like LDA.
\n", + "\n", + "While most traditional topic mining algorithms do not expect time-tagged data or take into account any prior ordering, Dynamic Topic Models (DTM) leverages the knowledge of different documents belonging to a different time-slice in an attempt to map how the words in a topic change over time.\n", + "\n", + "David Blei does a good job explaining the theory behind this in this [Google talk](https://www.youtube.com/watch?v=7BMsuyBPx90). If you prefer to directly read the [paper on DTM by Blei and Lafferty](http://repository.cmu.edu/cgi/viewcontent.cgi?article=2036&context=compsci), that should get you upto speed too." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Motivation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "But - why even undertake this, especially when Gensim itself have a wrapper?\n", + "The main motivation was the lack of documentation in the original code - and the fact that doing an only python version makes it easier to use gensim building blocks. For example, for setting up the Sufficient Statistics to initialize the DTM, you can just pass a pre-trained gensim LDA model!\n", + "\n", + "There is some clarity on how they built their code now - Variational Inference using Kalman Filters. I've tried to make things as clear as possible in the code, but it still needs some polishing. \n", + "\n", + "Any help through PRs would be greatly appreciated!\n", + "\n", + "I have been regularly blogging about my progress with implementing this, which you can find [here](http://rare-technologies.com/author/bhargav/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Use Case " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you would have seen the video or read the paper, it's use case would be pretty clear - and the example of modelling it on Science research papers gives us some pretty interesting results. It was used to not only catch how various themes of research such as Physics or Neuroscience evolved over the decades but also in identifying similar documents in a way not many other modelling algorithms can. While words may change over time, the fact that DTM can identify topics over time can help us find semantically similar documents over a long time-period.\n", + "\n", + "[This](http://rare-technologies.com/understanding-and-coding-dynamic-topic-models/) blog post is also useful in breaking down the ideas and theory behind DTM." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -7,6 +65,14 @@ "# Using LdaSeqModel for DTM" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Gensim already has a wrapper for original C++ DTM code, but the `LdaSeqModel` class is an effort to have a pure python implementation of the same.\n", + "Using it is very similar to using any other gensim topic-modelling algorithm, with all you need to start is an iterable gensim corpus, id2word and a list with the number of documents in each of your time-slices." 
+ ] + }, { "cell_type": "code", "execution_count": 1, "metadata": { "collapsed": false }, "outputs": [], "source": [ + "# setting up our imports\n", + "\n", "from gensim.models import ldaseqmodel\n", "from gensim.corpora import Dictionary, bleicorpus\n", "import numpy\n", - "from gensim import matutils" + "from gensim.matutils import hellinger" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "All you need to start using DTM is an iterable gensim corpus, id2word and a list with the number of documents in each of your time-slices." + "We will be loading the corpus and dictionary from disk. Here our corpus is in the Blei corpus format, but it can be any iterable corpus.\n", + "The data set here consists of news reports over 3 months downloaded from here and cleaned. \n", + "\n", + "TODO: better, more interesting data-set.\n", + "\n", + "### What is a time-slice?\n", + "A very important input for DTM to work is the `time_slice` input. It should be a list which contains the number of documents in each time-slice. In our case, the first month had 438 articles, the second 430 and the last month had 456 articles. This means we'd need an input which looks like this: `time_slice = [438, 430, 456]`. \n", + "\n", + "Once you have your corpus, id2word and time_slice ready, we're good to go!" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": false }, "outputs": [], "source": [ "# loading our corpus and dictionary\n", "dictionary = Dictionary.load('Corpus/news_dictionary')\n", "corpus = bleicorpus.BleiCorpus('Corpus/news_corpus')\n", - "# the corpus used here consists of news reports for 3 months\n", - "# the first month had 438 articles, the second 430 and the last month had 456 articles\n", "# it's very important that your corpus is saved in order of your time-slices!\n", "\n", "time_slice = [438, 430, 456]" ] }, + { "cell_type": "markdown", + "metadata": {}, + "source": [ + "For DTM to work it first needs the Sufficient Statistics from a trained LDA model on the *same* dataset. \n", + "By default LdaSeqModel trains its own model and passes those values on, but it can also accept a pre-trained gensim LDA model, or a numpy matrix which contains the Suff Stats.\n", + "We will be training our model in default mode, so LDA will first be performed on the dataset. The `passes` parameter instructs LdaModel on the number of passes.\n",
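+ "\n", + "For illustration, the two alternative initializations might look roughly like this (the `initialize='own', sstats=...` form is the one used in gensim's test-suite; the exact keyword for passing a pre-trained LDA model is an assumption here):\n", + "\n", + "```python\n", + "# from a numpy matrix holding the sufficient statistics\n", + "ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, initialize='own', sstats=sstats)\n", + "# from a pre-trained gensim LDA model (keyword name assumed)\n", + "ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, initialize='ldamodel', lda_model=lda)\n", + "```"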
+ ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": { "collapsed": false, "scrolled": true }, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - " EM iter 0\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2795842.25993\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2812881.60423 convergence is 0.00609452991812\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2816896.73671 convergence is 0.00142740898702\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2930495.62431\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2943686.33857 convergence is 0.00450118886052\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2946530.5326 convergence is 0.00096620145735\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 2988475.36794\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2999832.07399 convergence is 0.00380016719362\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3001904.64337 convergence is 0.000690895135268\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3194060.29327\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3200886.8353 convergence is 0.00213726148007\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3201665.6748 convergence is 0.000243319912892\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3024297.26659\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3036608.85235 convergence is 0.00407089140919\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3039113.82323 convergence is 0.000824923788358\n" - ] - }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/bhargavvader/Open_Source/gensim/gensim/models/ldaseqmodel.py:229: RuntimeWarning: divide by zero encountered in double_scalars\n", + "/Users/bhargavvader/Open_Source/gensim/gensim/models/ldaseqmodel.py:237: RuntimeWarning: divide by zero encountered in double_scalars\n", " convergence = numpy.fabs((bound - old_bound) / old_bound)\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 iteration lda seq bound is 12380633.093 , convergence is inf\n", - " EM iter 1\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2829286.67058\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2830997.57809 convergence is 0.000604713379885\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2831478.31217 convergence is 0.000169810843792\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2927605.52964\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2929057.40479 convergence is 0.000495925813536\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2929402.59379 convergence is 0.00011784985956\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3004352.00625\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3005350.5243 convergence is 0.000332357209613\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3005435.11891 convergence is 
2.81480023821e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3220542.7094\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3220760.43415 convergence is 6.76049856071e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3220639.56659 convergence is 3.75276446946e-05\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3037220.67774\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3038465.1934 convergence is 0.000409754770342\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3038734.42267 convergence is 8.86069955012e-05\n", - "1 iteration lda seq bound is 12477087.7812 , convergence is 0.00779077188489\n", - " EM iter 2\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2841564.67116\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2841826.17092 convergence is 9.20266760055e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2841775.38963 convergence is 1.78692460842e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2913525.7655\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2913793.47206 convergence is 9.1884053726e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2913728.58782 convergence is 2.22679619717e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3008639.33248\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3008684.52965 convergence is 1.50224626997e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3008504.95706 convergence is 5.9684751974e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3228028.69449\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3227924.54617 convergence is 3.22637540779e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3227755.68636 convergence is 5.23121900081e-05\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3041442.22218\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3041696.5172 convergence is 8.36100130206e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3041633.11375 convergence is 2.08447669116e-05\n", - "2 iteration lda seq bound is 12492137.3259 , convergence is 0.0012061744691\n", - " EM iter 3\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2851176.58861\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2851094.6521 convergence is 2.87377889514e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2850917.31 convergence is 6.22014100487e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2902424.21376\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2902404.40322 convergence is 6.82551448533e-06\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2902252.47495 convergence is 5.2345660204e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3010377.35587\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3010217.37454 convergence is 
5.3143282101e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3010013.49679 convergence is 6.77285820787e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3228520.29177\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3228412.71958 convergence is 3.33193469337e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3228273.5271 convergence is 4.31148352635e-05\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3046148.41811\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3046125.39059 convergence is 7.55955407865e-06\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3045993.69423 convergence is 4.32340568394e-05\n", - "3 iteration lda seq bound is 12496326.7188 , convergence is 0.000335362379951\n", - " EM iter 4\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2859704.50533\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2859532.10927 convergence is 6.02845739101e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2859332.75285 convergence is 6.97164478333e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2894057.62201\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2893952.0932 convergence is 3.64639640437e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2893800.16892 convergence is 5.24971638074e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3010946.76605\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3010773.12861 convergence is 5.7668719302e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3010598.73375 convergence is 5.79236138621e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3225779.3825\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3225703.00964 convergence is 2.36757861011e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3225603.816 convergence is 3.07510153059e-05\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3050784.05901\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3050687.43847 convergence is 3.16707230442e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3050550.57462 convergence is 4.48632825682e-05\n", - "Starting final iterations, max iter is 500\n", - "4 iteration lda seq bound is 12497423.1916 , convergence is 1.0\n", - " EM iter 5\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2867940.01385\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2867750.00547 convergence is 6.62525653333e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2867556.21842 convergence is 6.75745958157e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2888151.94195\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2888045.67103 convergence is 3.67954748786e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2887910.75423 convergence is 4.67156046258e-05\n", - "Fitting topic number 2\n", - "Computing bound, all 
times\n", - "initial sslm bound is 3010545.28465\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3010402.90674 convergence is 4.72930639881e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3010263.0772 convergence is 4.64487776458e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3221150.03434\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3221107.19895 convergence is 1.32981669086e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3221042.62666 convergence is 2.00466124495e-05\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3054943.36651\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3054834.4927 convergence is 3.56385680671e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3054707.10529 convergence is 4.17002673314e-05\n", - "Starting final iterations, max iter is 500\n", - "5 iteration lda seq bound is 12497478.8953 , convergence is 1.0\n", - " EM iter 6\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2875890.40917\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2875705.24398 convergence is 6.43853428804e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2875525.5102 convergence is 6.25007658769e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2883511.77424\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2883418.57027 convergence is 3.23230749008e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2883306.61094 convergence is 3.88286794818e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3009632.78051\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3009521.76181 convergence is 3.68877890868e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3009415.76799 convergence is 3.52194904496e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3215875.1213\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3215875.49503 convergence is 1.16214977711e-07\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3058777.93036\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3058669.56593 convergence is 3.54273604815e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3058555.84714 convergence is 3.71791683303e-05\n", - "Bound went down, increasing iterations to 500\n", - "Starting final iterations, max iter is 500\n", - "6 iteration lda seq bound is 12497288.2591 , convergence is 1.0\n", - " EM iter 7\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2884123.59799\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2883949.28156 convergence is 6.0440001425e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2883782.21302 convergence is 5.79304702412e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2880009.90625\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2879951.12844 convergence is 2.04088895185e-05\n", - "Computing bound, all times\n", - "2 iteration 
lda seq bound is 2879860.94548 convergence is 3.1314059533e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3008356.80984\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3008290.43213 convergence is 2.20644384588e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3008212.20925 convergence is 2.60024369493e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3209274.69329\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3209282.25234 convergence is 2.35537460394e-06\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3209265.7869 convergence is 5.13056805011e-06\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3062701.58157\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3062602.39896 convergence is 3.23840276228e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3062502.58562 convergence is 3.25910201525e-05\n", - "Bound went down, increasing iterations to 500\n", - "Starting final iterations, max iter is 500\n", - "7 iteration lda seq bound is 12497097.2668 , convergence is 1.0\n", - " EM iter 8\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2892170.59016\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2892006.6712 convergence is 5.66767946136e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2891852.13443 convergence is 5.34358283376e-05\n", - "Fitting topic number 1\n", - "Computing bound, all times\n", - "initial sslm bound is 2876732.64307\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2876675.57192 convergence is 1.98388820242e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2876605.38147 convergence is 2.43998464586e-05\n", - "Fitting topic number 2\n", - "Computing bound, all times\n", - "initial sslm bound is 3006670.49804\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3006628.08687 convergence is 1.41056918599e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3006573.73197 convergence is 1.80783591447e-05\n", - "Fitting topic number 3\n", - "Computing bound, all times\n", - "initial sslm bound is 3203155.38765\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3203201.54249 convergence is 1.44091779548e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3203201.98263 convergence is 1.37406132285e-07\n", - "Fitting topic number 4\n", - "Computing bound, all times\n", - "initial sslm bound is 3066381.72995\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 3066289.15706 convergence is 3.01896174762e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 3066201.46142 convergence is 2.85999251479e-05\n", - "Bound went down, increasing iterations to 500\n", - "Starting final iterations, max iter is 500\n", - "8 iteration lda seq bound is 12496985.2321 , convergence is 1.0\n", - " EM iter 9\n", - "E Step\n", - "M Step\n", - "Fitting topic number 0\n", - "Computing bound, all times\n", - "initial sslm bound is 2900164.32867\n", - "Computing bound, all times\n", - "1 iteration lda seq bound is 2900010.95471 convergence is 5.28845757756e-05\n", - "Computing bound, all times\n", - "2 iteration lda seq bound is 2899865.14466 convergence is 
5.02791361343e-05\n", [... verbose training log trimmed: EM iterations 9 to 20 repeat the same per-topic pattern of "Fitting topic number k", "Computing bound, all times" and per-iteration lda seq bound / convergence values; only the closing lines of iteration 20 are kept below ...]\n", - "2
iteration lda seq bound is 3114977.79587 convergence is 1.97032857945e-05\n", - "Starting final iterations, max iter is 500\n", - "20 iteration lda seq bound is 12500594.8258 , convergence is 1.0\n" - ] } ], "source": [ - "# now, we set up the model.\n", - "\n", "ldaseq = ldaseqmodel.LdaSeqModel(corpus=corpus, id2word=dictionary, time_slice=time_slice, num_topics=5, passes=20)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now that our model is trained, let's see what our results look like.\n", + "\n", + "### Results\n", + "Much like LDA, the points of interest would be in what the topics are and how the documents are made up of these topics.\n", + "In DTM we have the added interest of seeing how these topics evolve over time.\n", + "\n", + "Let's go through some of the functions to print Topics and analyse documents." + ] + }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 11, "metadata": { "collapsed": false }, @@ -918,122 +174,122 @@ { "data": { "text/plain": [ - "[[(0.0040000000000000001, 'use'),\n", - " (0.0040000000000000001, 'users'),\n", - " (0.0040000000000000001, 'mobile'),\n", - " (0.0040000000000000001, 'technology'),\n", - " (0.0040000000000000001, 'net'),\n", - " (0.0030000000000000001, 'security'),\n", - " (0.0030000000000000001, 'software'),\n", - " (0.0030000000000000001, 'information'),\n", - " (0.0030000000000000001, 'using'),\n", - " (0.0030000000000000001, 'used'),\n", - " (0.0030000000000000001, 'like'),\n", - " (0.0030000000000000001, 'make'),\n", - " (0.0030000000000000001, 'digital'),\n", - " (0.0030000000000000001, 'internet'),\n", - " (0.0030000000000000001, 'phone'),\n", - " (0.0030000000000000001, 'online'),\n", - " (0.0030000000000000001, 'computer'),\n", - " (0.0030000000000000001, 'search'),\n", - " (0.0030000000000000001, 'system'),\n", - " (0.0030000000000000001, 'service')],\n", - " [(0.0070000000000000001, 'government'),\n", - " (0.0040000000000000001, 'blair'),\n", - " (0.0040000000000000001, 'minister'),\n", - " (0.0040000000000000001, 'labour'),\n", - " (0.0030000000000000001, 'year'),\n", - " (0.0030000000000000001, 'public'),\n", - " (0.0030000000000000001, 'last'),\n", - " (0.0030000000000000001, 'prime'),\n", - " (0.0030000000000000001, 'economic'),\n", - " (0.002, 'election'),\n", - " (0.002, 'uk'),\n", - " (0.002, 'party'),\n", - " (0.002, 'growth'),\n", - " (0.002, 'plans'),\n", - " (0.002, 'brown'),\n", - " (0.002, 'european'),\n", - " (0.002, 'may'),\n", - " (0.002, 'market'),\n", - " (0.002, 'economy'),\n", - " (0.002, 'next')],\n", - " [(0.0080000000000000002, 'best'),\n", + "[[(0.01, 'best'),\n", " (0.0060000000000000001, 'film'),\n", " (0.0050000000000000001, 'music'),\n", - " (0.0040000000000000001, 'last'),\n", + " (0.0050000000000000001, 'last'),\n", + " (0.0040000000000000001, 'number'),\n", + " (0.0040000000000000001, 'tv'),\n", " (0.0040000000000000001, 'show'),\n", " (0.0040000000000000001, 'top'),\n", - " (0.0040000000000000001, 'number'),\n", - " (0.0040000000000000001, 'first'),\n", - " (0.0030000000000000001, 'star'),\n", - " (0.0030000000000000001, 'award'),\n", - " (0.002, 'uk'),\n", - " (0.002, 'tv'),\n", - " (0.002, 'band'),\n", + " (0.0030000000000000001, 'uk'),\n", + " (0.0030000000000000001, 'first'),\n", + " (0.0030000000000000001, 'year'),\n", + " (0.0030000000000000001, 'band'),\n", + " (0.002, 'award'),\n", + " (0.002, 'million'),\n", + " (0.002, 'record'),\n", " (0.002, 'three'),\n", - " (0.002, 'including'),\n", - " (0.002, 'game'),\n", + " (0.002, 'sales'),\n", " 
(0.002, 'bbc'),\n", - " (0.002, 'album'),\n", - " (0.002, 'british'),\n", - " (0.002, 'awards')],\n", - " [(0.0040000000000000001, 'court'),\n", + " (0.002, 'including'),\n", + " (0.002, 'british')],\n", + " [(0.0040000000000000001, 'mobile'),\n", + " (0.0030000000000000001, 'technology'),\n", + " (0.0030000000000000001, 'use'),\n", " (0.0030000000000000001, 'last'),\n", - " (0.0030000000000000001, 'first'),\n", + " (0.002, 'market'),\n", " (0.002, 'firm'),\n", - " (0.002, 'case'),\n", - " (0.002, 'oil'),\n", - " (0.002, 'company'),\n", - " (0.002, 'police'),\n", - " (0.002, 'former'),\n", - " (0.002, 'since'),\n", - " (0.002, 'yukos'),\n", - " (0.002, 'legal'),\n", - " (0.002, 'chief'),\n", - " (0.002, 'home'),\n", - " (0.002, 'three'),\n", + " (0.002, 'firms'),\n", + " (0.002, 'net'),\n", + " (0.002, 'much'),\n", + " (0.002, 'phone'),\n", " (0.002, 'year'),\n", - " (0.002, 'rights'),\n", - " (0.002, 'russian'),\n", - " (0.002, 'part'),\n", - " (0.002, 'club')],\n", - " [(0.0050000000000000001, 'chelsea'),\n", - " (0.0050000000000000001, 'game'),\n", - " (0.0040000000000000001, 'players'),\n", - " (0.0040000000000000001, 'league'),\n", - " (0.0040000000000000001, 'think'),\n", - " (0.0040000000000000001, 'cup'),\n", - " (0.0040000000000000001, 'united'),\n", - " (0.0040000000000000001, 'arsenal'),\n", + " (0.002, 'make'),\n", + " (0.002, 'companies'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'digital'),\n", + " (0.002, 'european'),\n", + " (0.002, 'economic'),\n", + " (0.002, 'company'),\n", + " (0.002, 'growth'),\n", + " (0.002, 'government')],\n", + " [(0.0040000000000000001, 'think'),\n", " (0.0040000000000000001, 'club'),\n", - " (0.0040000000000000001, 'play'),\n", - " (0.0030000000000000001, 'win'),\n", - " (0.0030000000000000001, 'manager'),\n", + " (0.0040000000000000001, 'like'),\n", + " (0.0040000000000000001, 'want'),\n", + " (0.0030000000000000001, \"don't\"),\n", + " (0.0030000000000000001, 'game'),\n", " (0.0030000000000000001, 'football'),\n", - " (0.0030000000000000001, 'liverpool'),\n", - " (0.0030000000000000001, 'good'),\n", - " (0.0030000000000000001, 'first'),\n", " (0.0030000000000000001, 'last'),\n", - " (0.0030000000000000001, 'got'),\n", - " (0.0030000000000000001, 'want'),\n", - " (0.0030000000000000001, 'like')]]" + " (0.0030000000000000001, 'make'),\n", + " (0.0030000000000000001, 'way'),\n", + " (0.0030000000000000001, 'go'),\n", + " (0.0030000000000000001, 'time'),\n", + " (0.0030000000000000001, 'real'),\n", + " (0.002, 'players'),\n", + " (0.002, 'bbc'),\n", + " (0.002, 'going'),\n", + " (0.002, 'know'),\n", + " (0.002, 'manager'),\n", + " (0.002, 'liverpool'),\n", + " (0.002, 'got')],\n", + " [(0.0050000000000000001, 'government'),\n", + " (0.0040000000000000001, 'blair'),\n", + " (0.0040000000000000001, 'labour'),\n", + " (0.0030000000000000001, 'minister'),\n", + " (0.0030000000000000001, 'security'),\n", + " (0.0030000000000000001, 'public'),\n", + " (0.0030000000000000001, 'prime'),\n", + " (0.0030000000000000001, 'election'),\n", + " (0.0030000000000000001, 'party'),\n", + " (0.002, 'brown'),\n", + " (0.002, 'search'),\n", + " (0.002, 'make'),\n", + " (0.002, 'users'),\n", + " (0.002, 'howard'),\n", + " (0.002, 'bbc'),\n", + " (0.002, 'lord'),\n", + " (0.002, 'say'),\n", + " (0.002, 'home'),\n", + " (0.002, 'tory'),\n", + " (0.002, 'secretary')],\n", + " [(0.0050000000000000001, 'game'),\n", + " (0.0050000000000000001, 'chelsea'),\n", + " (0.0050000000000000001, 'cup'),\n", + " (0.0040000000000000001, 'first'),\n", + " (0.0040000000000000001, 
'win'),\n", + " (0.0040000000000000001, 'united'),\n", + " (0.0040000000000000001, 'games'),\n", + " (0.0030000000000000001, 'league'),\n", + " (0.0030000000000000001, 'arsenal'),\n", + " (0.0030000000000000001, 'last'),\n", + " (0.0030000000000000001, 'home'),\n", + " (0.0030000000000000001, 'play'),\n", + " (0.0030000000000000001, 'players'),\n", + " (0.0030000000000000001, 'good'),\n", + " (0.0030000000000000001, 'side'),\n", + " (0.0030000000000000001, 'world'),\n", + " (0.0030000000000000001, 'goal'),\n", + " (0.002, 'ball'),\n", + " (0.002, 'second'),\n", + " (0.002, 'champions')]]" ] }, - "execution_count": 16, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# to print all topics, use `print_topics`. \n", - "\n", - "ldaseq.print_topics(0)" + "# the input parameter to `print_topics` is only a time-slice option. By passing `0` we are seeing the topics in the 1st time-slice.\n", + "ldaseq.print_topics(time=0)" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 22, "metadata": { "collapsed": false }, @@ -1041,69 +297,69 @@ { "data": { "text/plain": [ - "[[(0.0040000000000000001, 'use'),\n", - " (0.0040000000000000001, 'users'),\n", - " (0.0040000000000000001, 'mobile'),\n", - " (0.0040000000000000001, 'technology'),\n", - " (0.0040000000000000001, 'net'),\n", - " (0.0030000000000000001, 'security'),\n", - " (0.0030000000000000001, 'software'),\n", - " (0.0030000000000000001, 'information'),\n", - " (0.0030000000000000001, 'using'),\n", - " (0.0030000000000000001, 'used'),\n", - " (0.0030000000000000001, 'like'),\n", - " (0.0030000000000000001, 'make'),\n", - " (0.0030000000000000001, 'digital'),\n", - " (0.0030000000000000001, 'internet'),\n", - " (0.0030000000000000001, 'phone'),\n", - " (0.0030000000000000001, 'online'),\n", - " (0.0030000000000000001, 'computer'),\n", - " (0.0030000000000000001, 'search'),\n", - " (0.0030000000000000001, 'system'),\n", - " (0.0030000000000000001, 'service')],\n", - " [(0.0040000000000000001, 'use'),\n", - " (0.0040000000000000001, 'technology'),\n", - " (0.0040000000000000001, 'users'),\n", - " (0.0040000000000000001, 'mobile'),\n", - " (0.0040000000000000001, 'net'),\n", - " (0.0030000000000000001, 'software'),\n", - " (0.0030000000000000001, 'information'),\n", - " (0.0030000000000000001, 'security'),\n", - " (0.0030000000000000001, 'using'),\n", - " (0.0030000000000000001, 'digital'),\n", - " (0.0030000000000000001, 'used'),\n", - " (0.0030000000000000001, 'like'),\n", - " (0.0030000000000000001, 'make'),\n", - " (0.0030000000000000001, 'internet'),\n", - " (0.0030000000000000001, 'phone'),\n", - " (0.0030000000000000001, 'online'),\n", - " (0.0030000000000000001, 'computer'),\n", - " (0.0030000000000000001, 'system'),\n", - " (0.0030000000000000001, 'service'),\n", + "[[(0.0040000000000000001, 'mobile'),\n", + " (0.0030000000000000001, 'technology'),\n", + " (0.0030000000000000001, 'use'),\n", + " (0.0030000000000000001, 'last'),\n", + " (0.002, 'market'),\n", + " (0.002, 'firm'),\n", + " (0.002, 'firms'),\n", + " (0.002, 'net'),\n", + " (0.002, 'much'),\n", + " (0.002, 'phone'),\n", + " (0.002, 'year'),\n", + " (0.002, 'make'),\n", + " (0.002, 'companies'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'digital'),\n", + " (0.002, 'european'),\n", + " (0.002, 'economic'),\n", + " (0.002, 'company'),\n", + " (0.002, 'growth'),\n", + " (0.002, 'government')],\n", + " [(0.0030000000000000001, 'mobile'),\n", + " (0.0030000000000000001, 'technology'),\n", + " (0.0030000000000000001, 'use'),\n", 
+ " (0.002, 'market'),\n", + " (0.002, 'last'),\n", + " (0.002, 'firms'),\n", + " (0.002, 'firm'),\n", + " (0.002, 'phone'),\n", + " (0.002, 'much'),\n", + " (0.002, 'net'),\n", + " (0.002, 'make'),\n", + " (0.002, 'year'),\n", + " (0.002, 'digital'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'companies'),\n", + " (0.002, 'economic'),\n", + " (0.002, 'european'),\n", + " (0.002, 'company'),\n", + " (0.002, 'growth'),\n", " (0.002, 'broadband')],\n", - " [(0.0040000000000000001, 'use'),\n", - " (0.0040000000000000001, 'mobile'),\n", - " (0.0040000000000000001, 'technology'),\n", - " (0.0040000000000000001, 'users'),\n", - " (0.0040000000000000001, 'net'),\n", - " (0.0030000000000000001, 'software'),\n", - " (0.0030000000000000001, 'information'),\n", - " (0.0030000000000000001, 'using'),\n", - " (0.0030000000000000001, 'security'),\n", - " (0.0030000000000000001, 'digital'),\n", - " (0.0030000000000000001, 'used'),\n", - " (0.0030000000000000001, 'like'),\n", - " (0.0030000000000000001, 'make'),\n", - " (0.0030000000000000001, 'phone'),\n", - " (0.0030000000000000001, 'internet'),\n", - " (0.0030000000000000001, 'online'),\n", - " (0.0030000000000000001, 'computer'),\n", - " (0.0030000000000000001, 'service'),\n", - " (0.0030000000000000001, 'system'),\n", - " (0.0030000000000000001, 'broadband')]]" + " [(0.0040000000000000001, 'mobile'),\n", + " (0.0030000000000000001, 'technology'),\n", + " (0.0030000000000000001, 'use'),\n", + " (0.0030000000000000001, 'market'),\n", + " (0.002, 'phone'),\n", + " (0.002, 'firms'),\n", + " (0.002, 'last'),\n", + " (0.002, 'much'),\n", + " (0.002, 'firm'),\n", + " (0.002, 'make'),\n", + " (0.002, 'net'),\n", + " (0.002, 'digital'),\n", + " (0.002, 'year'),\n", + " (0.002, 'uk'),\n", + " (0.002, 'companies'),\n", + " (0.002, 'broadband'),\n", + " (0.002, 'economic'),\n", + " (0.002, 'european'),\n", + " (0.002, 'company'),\n", + " (0.002, 'next')]]" ] }, - "execution_count": 5, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1111,37 +367,130 @@ "source": [ "# to fix a topic and see it evolve, use `print_topic_times`\n", "\n", - "ldaseq.print_topic_times(0) # evolution of 0th topic" + "ldaseq.print_topic_times(topic=1) # evolution of 1st topic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you look at the lower frequencies; the word broadband is creeping itself up into prominence in topic number 1. \n", + "We've had our fun looking at topics, now let us see how to analyse documents.\n", + "\n", + "### Doc-Topics\n", + "the function `doc_topics` checks the topic proportions on documents already trained on. It accepts the document number in the corpus as an input.\n", + "\n", + "Let's pick up document number 558 arbitrarily and have a look." 
] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 37, "metadata": { "collapsed": false }, "outputs": [ { - "data": { - "text/plain": [ - "array([ 4.94926998e-05, 4.94926998e-05, 9.99802029e-01,\n", - " 4.94926998e-05, 4.94926998e-05])" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "['set', 'time,\"', 'chairman', 'decision', 'news', 'director', 'former', 'vowed', '\"it', 'results', 'club', 'third', 'home', 'paul', 'saturday.', 'south', 'conference', 'leading', '\"some', 'survival', 'needed', 'coach', \"don't\", 'every', 'trouble', 'desperate', 'eight', 'first', 'win', 'going', 'park', 'near', 'chance', 'manager', 'league', 'milan', 'games', 'go', 'game', 'foot', 'say', 'upset', \"i'm\", 'poor', 'season.', 'executive', 'road', '24', 'debut', 'portsmouth.', 'give', 'claiming', 'steve', 'break', 'rivals', 'boss', 'kevin', 'premiership', 'little', 'left', 'table.', 'life', 'join', 'years.', 'bring', 'season,', 'director.', 'became', 'st', 'according', 'official', 'hope', 'shocked', 'though', 'phone', 'charge', '14', 'website.', 'time,', 'claimed', 'kept', 'bond', 'appointment', 'unveil', 'november', 'picked', 'confirmed,', 'believed', 'deep', 'position', 'surprised', 'negotiations', 'talks', 'gmt', 'middlesbrough', 'replaced', 'appear', 'football,', '\"i\\'m', 'charge.', 'saints', 'southampton', 'sturrock', 'wednesday.', 'harry', 'poised', 'ninth', 'quit', 'relieved', 'chance.\"', 'decision.\"', 'hero', 'redknapp,', 'redknapp', \"saints'\", 'first-team', \"wouldn't\", \"mary's.\", 'portsmouth', \"redknapp's\", 'pompey', 'academy', \"harry's\", 'cult', 'rupert', 'time\".', 'coast', '57,', 'succeed', 'duties', \"'i\", 'bitter,', \"mandaric's\", \"portsmouth's\", 'wigley,', 'wigley', \"southampton',\", '1500', 'mandaric', \"'absolutely\", 'lowe', '\"disappointed\"', 'velimir', 'not\\',\"', 'disgusted', 'disappointed,', 'mandaric,', 'fratton', 'replaces', 'masterminding', 'angry,', 'vowed:', 'informed.\"', 'zajec']\n" + ] } ], "source": [ "# before checking document-topic proportions with `doc_topics`, let's look at the words in document 558\n", + "words = [dictionary[word_id] for word_id, count in ldaseq.corpus.corpus[558]]\n", + "print (words)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's pretty clear that it's a news article about football. What topics will it likely be composed of?" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 5.46298825e-05 5.46298825e-05 7.24590312e-01 5.46298825e-05\n", + " 2.75245799e-01]\n" + ] + } + ], + "source": [ + "doc_1 = ldaseq.doc_topics(558) # check the topic distribution of document number 558 in the corpus\n", + "print (doc_1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It's largely made up of topics 3 and 5 - and if we go back and inspect our topics, it's quite a good match." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If we wish to analyse a document not in our training set, we can simply pass the doc to the model, similar to the `__getitem__` function of `LdaModel`.\n", + "\n", + "Let's let our document be a hypothetical news article about the effect of Ryan Giggs buying mobiles on the British economy."
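Before moving on, a quick aside: the raw proportion vector that `doc_topics` returns is easier to read once sorted. A small sketch, assuming nothing beyond the numpy array shown above:

    import numpy

    # list a trained document's topics from most to least dominant (0-based indices)
    doc = ldaseq.doc_topics(558)
    for topic_id in numpy.argsort(doc)[::-1]:
        if doc[topic_id] > 0.01:  # skip the near-zero smoothing mass
            print("topic %d: %.3f" % (topic_id, doc[topic_id]))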
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0.00110497 0.65302349 0.34366159 0.00110497 0.00110497]\n" + ] + } + ], + "source": [ + "doc_2 = ['economy', 'bank', 'mobile', 'phone', 'markets', 'buy', 'football', 'united', 'giggs']\n", + "doc_2 = dictionary.doc2bow(doc_2)\n", + "doc_2 = ldaseq[doc_2]\n", + "print (doc_2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Pretty neat! Topics 2 and 3 are about technology, the market and football, so this works well for us." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Distances between documents\n", + "\n", + "One of the handier uses of DTM is that we can compare documents from different time-frames and see how similar they are topic-wise. This is especially useful when the actual words barely overlap across those time-periods.\n", + "\n", + "The current dataset doesn't provide enough diversity for this to be a compelling example, but we will illustrate how to do it nevertheless." + ] + }, + { + "cell_type": "code", + "execution_count": 54, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ - "array([ 0.00327869, 0.98688525, 0.00327869, 0.00327869, 0.00327869])" + "0.69071218819511226" ] }, - "execution_count": 7, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# to check for an unseen document\n", + "from gensim.matutils import hellinger\n", + "hellinger(doc_1, doc_2)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that the Hellinger distance is 0 for identical distributions and 1 for maximally different ones: doc_1 and doc_2 share topic 3 but are otherwise dominated by different topics, so we get a fairly high value here.\n", + "For more information on how to use the gensim distance metrics, check out [this notebook](https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/distance_metrics.ipynb)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Performance\n", + "\n", + "The code currently runs 5 to 7 times slower than the original C++ DTM code. The bottleneck is the scipy `optimize.fmin_cg` method used for updating obs; speeding this up would significantly close the gap.\n", + "\n", + "Since it works with streamed, iterable gensim corpora, the memory footprint is also small and independent of corpus size." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The advantage of the Python port is that, unlike the C++ code, we needn't treat it as a black box; PRs that improve the code, clarify the documentation or speed things up are all welcome. It is also pure Python and needs no dependencies beyond what gensim already requires. The added ability to analyse new, unseen documents is a plus as well!\n", "\n", - "ldaseq[[(1, 1), (4, 2)]]" + "### DTM wrapper comparison\n", + "Let's now compare these results with the DTM wrapper."
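First, though, a quick look under the hood of the metric used above: for dense distributions the Hellinger distance is easy to compute by hand. A minimal sketch, equivalent in spirit to gensim's `matutils.hellinger`:

    import numpy

    def hellinger_distance(p, q):
        # H(p, q) = sqrt(0.5 * sum_i (sqrt(p_i) - sqrt(q_i)) ** 2), ranging from 0 to 1
        p, q = numpy.asarray(p), numpy.asarray(q)
        return numpy.sqrt(0.5 * ((numpy.sqrt(p) - numpy.sqrt(q)) ** 2).sum())

    # hellinger_distance(doc_1, doc_2) reproduces the ~0.69 value computed above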
] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 56, "metadata": { "collapsed": false }, "outputs": [], "source": [ - "# now let's compare this to the DTM wrapper.\n", "from gensim.models.wrappers.dtmmodel import DtmModel\n", "\n", "\n", "dtm_path = \"/Users/bhargavvader/Downloads/dtm_release/dtm/main\"\n", "dtm_model = DtmModel(dtm_path, corpus, time_slice, num_topics=5, id2word=dictionary, initialize_lda=True)\n", "dtm_model.save('dtm_news')\n", - "ldaseq.save('ldaseq_news')" + "ldaseq.save('ldaseq_news')\n", + "\n", + "# if we've saved before, simply load the model back\n", + "dtm_model = DtmModel.load('dtm_news')" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 58, "metadata": { "collapsed": false }, "outputs": [], "source": [ + "# setting up the topic-term matrix from the DTM wrapper, for comparison\n", + "\n", + "from gensim import matutils\n", "num_topics = 5\n", "topic_term = dtm_model.lambda_[:,:,0] # the lambda matrix holds topic-term weights for every time-slice; [:,:,0] selects the first\n", "\n", @@ -1234,7 +615,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 59, "metadata": { "collapsed": false }, "outputs": [ { "data": { "text/plain": [ "\n", "\n", "\n", - "
\n", + "
\n", "