Skip to content

Commit

Permalink
Add 'diagonal' parameter for LdaModel.diff (#1448)
Browse files Browse the repository at this point in the history
* add flags for diagonal and annotation

* make matrix default

* remove duplication

* raise error on differing no. of topics

* add docstrings

* Fix flake8

* rename annotation matrix variable

* add tests

* fix indent

* flake8 fixes
  • Loading branch information
parulsethi authored and menshikh-iv committed Aug 3, 2017
1 parent 5f63081 commit 3cb8495
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 16 deletions.
48 changes: 32 additions & 16 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -971,16 +971,18 @@ def get_term_topics(self, word_id, minimum_probability=None):

return values

def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, normed=True):
def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10, diagonal=False, annotation=True, normed=True):
"""
Calculate difference topic2topic between two Lda models
`other` instances of `LdaMulticore` or `LdaModel`
`distance` is function that will be applied to calculate difference between any topic pair.
Available values: `kullback_leibler`, `hellinger` and `jaccard`
`num_words` is quantity of most relevant words that used if distance == `jaccard` (also used for annotation)
`n_ann_terms` is max quantity of words in intersection/symmetric difference between topics (used for annotation)
`diagonal` set to True if the difference is required only between the identical topic no.s (returns diagonal of diff matrix)
`annotation` whether the intersection or difference of words between two topics should be returned
Returns a matrix Z with shape (m1.num_topics, m2.num_topics), where Z[i][j] - difference between topic_i and topic_j
and matrix annotation with shape (m1.num_topics, m2.num_topics, 2, None),
and matrix annotation (if True) with shape (m1.num_topics, m2.num_topics, 2, None),
where:
annotation[i][j] = [[`int_1`, `int_2`, ...], [`diff_1`, `diff_2`, ...]] and
Expand Down Expand Up @@ -1013,35 +1015,49 @@ def diff(self, other, distance="kullback_leibler", num_words=100, n_ann_terms=10
distance_func = distances[distance]
d1, d2 = self.state.get_lambda(), other.state.get_lambda()
t1_size, t2_size = d1.shape[0], d2.shape[0]
annotation_terms = None

fst_topics = [{w for (w, _) in self.show_topic(topic, topn=num_words)} for topic in xrange(t1_size)]
snd_topics = [{w for (w, _) in other.show_topic(topic, topn=num_words)} for topic in xrange(t2_size)]

if distance == "jaccard":
d1, d2 = fst_topics, snd_topics

z = np.zeros((t1_size, t2_size))
for topic1 in range(t1_size):
for topic2 in range(t2_size):
z[topic1][topic2] = distance_func(d1[topic1], d2[topic2])

if normed:
if np.abs(np.max(z)) > 1e-8:
z /= np.max(z)

annotation = [[None] * t1_size for _ in range(t2_size)]
if diagonal:
assert t1_size == t2_size, "Both input models should have same no. of topics, as the diagonal will only be valid in a square matrix"
# initialize z and annotation array
z = np.zeros(t1_size)
if annotation:
annotation_terms = np.zeros(t1_size, dtype=list)
else:
# initialize z and annotation matrix
z = np.zeros((t1_size, t2_size))
if annotation:
annotation_terms = np.zeros((t1_size, t2_size), dtype=list)

# iterate over each cell in the initialized z and annotation
for topic in np.ndindex(z.shape):
topic1 = topic[0]
if diagonal:
topic2 = topic1
else:
topic2 = topic[1]

for topic1 in range(t1_size):
for topic2 in range(t2_size):
z[topic] = distance_func(d1[topic1], d2[topic2])
if annotation:
pos_tokens = fst_topics[topic1] & snd_topics[topic2]
neg_tokens = fst_topics[topic1].symmetric_difference(snd_topics[topic2])

pos_tokens = sample(pos_tokens, min(len(pos_tokens), n_ann_terms))
neg_tokens = sample(neg_tokens, min(len(neg_tokens), n_ann_terms))

annotation[topic1][topic2] = [pos_tokens, neg_tokens]
annotation_terms[topic] = [pos_tokens, neg_tokens]

if normed:
if np.abs(np.max(z)) > 1e-8:
z /= np.max(z)

return z, annotation
return z, annotation_terms

def __getitem__(self, bow, eps=None):
"""
Expand Down
26 changes: 26 additions & 0 deletions gensim/test/test_tmdiff.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# Copyright (C) 2016 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import logging
import unittest
import numpy as np

Expand Down Expand Up @@ -31,14 +32,22 @@ def setUp(self):
self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, num_topics=self.num_topics, passes=10)

def testBasic(self):
    """Check the shapes returned by `diff` in full-matrix and diagonal mode.

    The model is compared against itself, so both sides have
    `self.num_topics` topics.
    """
    # Matrix case: one distance and one annotation entry per
    # (topic_i, topic_j) pair.
    mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms)

    self.assertEqual(mdiff.shape, (self.num_topics, self.num_topics))
    # `assertEqual` instead of the deprecated `assertEquals` alias, which
    # was removed from unittest in Python 3.12.
    self.assertEqual(len(annotation), self.num_topics)
    self.assertEqual(len(annotation[0]), self.num_topics)

    # Diagonal case: one distance and one annotation entry per topic.
    mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, diagonal=True)

    self.assertEqual(mdiff.shape, (self.num_topics,))
    self.assertEqual(len(annotation), self.num_topics)

def testIdentity(self):
for dist_name in ["hellinger", "kullback_leibler", "jaccard"]:
# test for matrix case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name)

for row in annotation:
Expand All @@ -51,6 +60,23 @@ def testIdentity(self):
if dist_name == "jaccard":
self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

# test for diagonal case
mdiff, annotation = self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance=dist_name, diagonal=True)

for (int_tokens, diff_tokens) in annotation:
self.assertEquals(diff_tokens, [])
self.assertEquals(len(int_tokens), self.n_ann_terms)

self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

if dist_name == "jaccard":
self.assertTrue(np.allclose(mdiff, np.zeros(mdiff.shape, dtype=mdiff.dtype)))

def testInput(self):
    """Check that `diff` raises ValueError when called with an unknown distance name."""
    # Unsupported distance name, with a real model passed as `other`.
    with self.assertRaises(ValueError):
        self.model.diff(self.model, n_ann_terms=self.n_ann_terms, distance='something')

    # Same unsupported distance name, with an empty list passed as `other`.
    with self.assertRaises(ValueError):
        self.model.diff([], n_ann_terms=self.n_ann_terms, distance='something')


if __name__ == '__main__':
    # Emit timestamped DEBUG-level log records while the test suite runs.
    log_format = '%(asctime)s : %(levelname)s : %(message)s'
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    unittest.main()

0 comments on commit 3cb8495

Please sign in to comment.