piskvorky · tmylk · Aug 25, 2016 · Aug 19, 2016 · Aug 21, 2016 · Aug 21, 2016
diff --git a/gensim/models/ldaseqmodel.py b/gensim/models/ldaseqmodel.py
@@ -183,7 +183,7 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter):
 
         while iter_ < em_min_iter or ((convergence > LDASQE_EM_THRESHOLD) and iter_ <= em_max_iter):
 
-            logger.info(" EM iter ", iter_)
+            logger.info(" EM iter %i", iter_)
             logger.info("E Step")
             # TODO: bound is initialized to 0
             old_bound = bound
@@ -211,15 +211,15 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter):
                 # if max_iter is too low, increase iterations.
                 if lda_inference_max_iter < LOWER_ITER:
                     lda_inference_max_iter *= ITER_MULT_LOW
-                logger.info("Bound went down, increasing iterations to", lda_inference_max_iter)
+                logger.info("Bound went down, increasing iterations to %i", lda_inference_max_iter)
 
             # check for convergence
             convergence = numpy.fabs((bound - old_bound) / old_bound)
 
             if convergence < LDASQE_EM_THRESHOLD:
 
                 lda_inference_max_iter = MAX_ITER
-                logger.info("Starting final iterations, max iter is", lda_inference_max_iter)
+                logger.info("Starting final iterations, max iter is %i", lda_inference_max_iter)
                 convergence = 1.0
 
             logger.info(iter_, "iteration lda seq bound is", bound, ", convergence is ", convergence)
@@ -318,7 +318,7 @@ def fit_lda_seq_topics(self, topic_suffstats):
         lhood_term = 0
 
         for k, chain in enumerate(self.topic_chains):
-            logger.info("Fitting topic number", k)
+            logger.info("Fitting topic number %i", k)
             lhood_term = sslm.fit_sslm(chain, topic_suffstats[k])
             lhood += lhood_term
 
@@ -371,6 +371,53 @@ def doc_topics(self, doc_number):
         return doc_topic[doc_number]
 
 
+    def DTMvis(self, time, corpus):
+        """
+        Return the inputs `pyLDAvis.prepare` needs to visualise the topics of one time-slice.
+
+        `time` is the index of the time-slice to visualise; `corpus` is an iterable of documents,
+        each a sized iterable of (term_id, count) pairs.
+        Returns (doc_topic, topic_term, doc_lengths, term_frequency, vocab).
+        """
+        # normalise each row of the gammas so that a document's topic proportions sum to one.
+        doc_topic = numpy.copy(self.gammas)
+        doc_topic /= doc_topic.sum(axis=1)[:, numpy.newaxis]
+
+        # iterate over the chains themselves (not `enumerate`, whose (index, chain) tuples have no `e_log_prob`).
+        topic_term = []
+        for chain in self.topic_chains:
+            # take the `time` entry of the transposed `e_log_prob`, exponentiate it and normalise it to sum to one.
+            topic = numpy.exp(numpy.transpose(chain.e_log_prob)[time])
+            topic_term.append(topic / topic.sum())
+
+        # accumulate the count of every term over the corpus; a document's length is its number of pairs.
+        term_frequency = [0] * self.vocab_len
+        doc_lengths = []
+        for doc in corpus:
+            doc_lengths.append(len(doc))
+            for term_id, count in doc:
+                term_frequency[term_id] += count
+
+        # vocabulary in id order, so that position i holds the word of term id i.
+        vocab = [self.id2word[i] for i in range(len(self.id2word))]
+
+        return doc_topic, numpy.array(topic_term), doc_lengths, term_frequency, vocab
+
+
+    def DTMcoherence(self, time):
+        """
+        Return the words of every topic of time-slice `time`, with the probability values dropped.
+
+        The result is a list of word lists (one per topic) usable for "u_mass" or "c_v" coherence.
+        """
+        # `print_topics(time)` gives, per topic, a sequence of (word, value) pairs; keep the word only.
+        words_per_topic = [
+            [word for word, _ in topic_pairs]
+            for topic_pairs in self.print_topics(time)
+        ]
+
+        return words_per_topic
+
     def __getitem__(self, doc):
         """
         Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
@@ -584,7 +631,7 @@ def fit_sslm(self, sstats):
         if model == "DIM":
             bound = self.compute_bound_fixed(sstats, totals)
 
-        logger.info("initial sslm bound is ", bound)
+        logger.info("initial sslm bound is %f", bound)
 
         while converged > sslm_fit_threshold and iter_ < sslm_max_iter:
             iter_ += 1
@@ -597,7 +644,7 @@ def fit_sslm(self, sstats):
                 bound = self.compute_bound_fixed(sstats, totals)
 
             converged = numpy.fabs((bound - old_bound) / old_bound)
-            logger.info(iter_, " iteration lda seq bound is ", bound, " convergence is", converged)
+            logger.info("iteration %i iteration lda seq bound is %f convergence is %f", iter_, bound, converged)
 
         self.e_log_prob = self.compute_expected_log_prob()
         return bound

diff --git a/gensim/models/wrappers/dtmmodel.py b/gensim/models/wrappers/dtmmodel.py
@@ -303,3 +303,47 @@ def show_topic(self, topicid, time, num_words=50):
     def print_topic(self, topicid, time, num_words=10):
         """Return the given topic, formatted as a string."""
         return ' + '.join(['%.3f*%s' % v for v in self.show_topic(topicid, time, num_words)])
+
+    def DTMvis(self, corpus, time):
+        """
+        Return the inputs `pyLDAvis.prepare` needs to visualise the topics of one time-slice.
+
+        `corpus` is an iterable of documents, each a sized iterable of (term_id, count) pairs;
+        `time` is the index of the time-slice to visualise.
+        Returns (doc_topic, topic_term, doc_lengths, term_frequency, vocab).
+        """
+        # assumes `lambda_` holds log values indexed as (topic, term, time) -- TODO confirm.
+        # normalise every topic row separately so each one sums to one; dividing by the grand total and
+        # scaling by `num_topics` only achieves that when all rows happen to carry the same mass.
+        topic_term = np.exp(self.lambda_[:, :, time])
+        topic_term = topic_term / topic_term.sum(axis=1)[:, np.newaxis]
+
+        doc_topic = self.gamma_
+
+        # accumulate the count of every term over the corpus; a document's length is its number of pairs.
+        term_frequency = [0] * self.num_terms
+        doc_lengths = []
+        for doc in corpus:
+            doc_lengths.append(len(doc))
+            for term_id, count in doc:
+                term_frequency[term_id] += count
+
+        vocab = [self.id2word[i] for i in range(len(self.id2word))]
+        return doc_topic, topic_term, doc_lengths, term_frequency, vocab
+
+    def DTMcoherence(self, time, num_words=20):
+        """
+        Return the words `show_topic` gives for every topic of time-slice `time`, without probability values.
+
+        The result is a list of word lists (one per topic) usable for "u_mass" or "c_v" coherence.
+
+        TODO: because of the print format, this can currently only return the 1st time-slice.
+              should we fix the coherence printing or change the print statements to mirror DTM python?
+        """
+        # `show_topic` gives (probability, word) pairs for one topic; keep the word only.
+        words_per_topic = []
+        for topic_id in range(self.num_topics):
+            pairs = self.show_topic(topicid=topic_id, time=time, num_words=num_words)
+            words_per_topic.append([word for _, word in pairs])
+
+        return words_per_topic