Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] HDP #1055

Merged
merged 9 commits into from
Dec 27, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ None

0.13.4, 2016-12-22

* Added suggested lda model method and print methods to HDP class (@bhargavvader, [#1055](https://github.com/RaRe-Technologies/gensim/pull/1055))
* New class KeyedVectors to store embedding separate from training code (@anmol01gulati and @droudy, [#980](https://github.com/RaRe-Technologies/gensim/pull/980))
* Evaluation of word2vec models against semantic similarity datasets like SimLex-999 (@akutuzov, [#1047](https://github.com/RaRe-Technologies/gensim/pull/1047))
* TensorBoard word embedding visualisation of Gensim Word2vec format (@loretoparisi, [#1051](https://github.com/RaRe-Technologies/gensim/pull/1051))
Expand Down
61 changes: 58 additions & 3 deletions gensim/models/hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@
import scipy.special as sp

from gensim import interfaces, utils, matutils
from gensim.models import basemodel
from gensim.models import basemodel, ldamodel
from six.moves import xrange

logger = logging.getLogger(__name__)
Expand All @@ -56,6 +56,7 @@ def dirichlet_expectation(alpha):
return(sp.psi(alpha) - sp.psi(np.sum(alpha, 1))[:, np.newaxis])



def expect_log_sticks(sticks):
"""
For stick-breaking hdp, return the E[log(sticks)]
Expand Down Expand Up @@ -130,7 +131,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1,
gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
outputdir=None):
outputdir=None, random_state=None):
"""
`gamma`: first level concentration
`alpha`: second level concentration
Expand All @@ -151,6 +152,8 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.max_time = max_time
self.outputdir = outputdir

self.random_state = utils.get_random_state(random_state)

self.lda_alpha = None
self.lda_beta = None

Expand All @@ -169,7 +172,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
self.m_var_sticks[1] = range(T - 1, 0, -1)
self.m_varphi_ss = np.zeros(T)

self.m_lambda = np.random.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_lambda = self.random_state.gamma(1.0, 1.0, (T, self.m_W)) * self.m_D * 100 / (T * self.m_W) - eta
self.m_eta = eta
self.m_Elogbeta = dirichlet_expectation(self.m_eta + self.m_lambda)

Expand Down Expand Up @@ -442,6 +445,21 @@ def update_expectations(self):
self.m_timestamp[:] = self.m_updatect
self.m_status_up_to_date = True

def show_topic(self, topic_id, num_words=20, log=False, formatted=False):
    """
    Show the `num_words` most probable words of the single topic `topic_id`.

    The work is delegated to `HdpTopicFormatter.show_topic`, built over
    `m_lambda + m_eta`; `num_words`, `log` and `formatted` are passed
    through to it unchanged and its result is returned as-is.

    """
    # bring the cached expectations up to date before reading m_lambda
    if not self.m_status_up_to_date:
        self.update_expectations()
    formatter = HdpTopicFormatter(self.id2word, self.m_lambda + self.m_eta)
    return formatter.show_topic(topic_id, num_words, log, formatted)

def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True):
"""
Print the `num_words` most probable words for `topics` number of topics.
Expand Down Expand Up @@ -510,6 +528,17 @@ def hdp_to_lda(self):

return (alpha, beta)

def suggested_lda_model(self):
"""
Returns the closest ldamodel object corresponding to the current hdp model.
The hdp_to_lda method only returns the corresponding alpha, beta values, and this method returns a trained ldamodel.
The num_topics is m_T (default is 150) so as to preserve the matrix shapes when we assign alpha and beta.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

how is it different from hdp_to_lda? Add a comment

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed duplicate code, added comment.

"""
alpha, beta = self.hdp_to_lda()
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
ldam.expElogbeta[:] = beta
return ldam

def evaluate_test_corpus(self, corpus):
logger.info('TEST: evaluating test corpus')
if self.lda_alpha is None or self.lda_beta is None:
Expand Down Expand Up @@ -589,6 +618,32 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

return shown

def print_topic(self, topic_id, num_words):
    """Return what `show_topic` yields for `topic_id` and `num_words` with `formatted=True`."""
    return self.show_topic(topic_id, num_words, formatted=True)

def show_topic(self, topic_id, num_words, log=False, formatted=False):
    """
    Show the `num_words` highest-weighted terms of topic `topic_id`.

    The topic's row of `self.data` is normalised to sum to one and its
    entries are ranked by descending weight. With `formatted=False` the
    output of `show_topic_terms` is returned directly. With
    `formatted=True` the terms go through `format_topic` and element 1 of
    its result is returned; `log` only has an effect in that case.
    """
    row = list(self.data[topic_id, :])
    weights = row / sum(row)

    # (weight, word id) pairs, heaviest first; the sort is stable, so ties keep id order
    ranked = sorted(
        zip(weights, xrange(len(weights))),
        key=lambda pair: pair[0],
        reverse=True,
    )
    topic_terms = self.show_topic_terms(ranked, num_words)

    if not formatted:
        return topic_terms

    topic = self.format_topic(topic_id, topic_terms)

    # assuming we only output formatted topics
    if log:
        logger.info(topic)

    # presumably format_topic yields a (topic_id, text) pair -- only element 1 is handed back
    return topic[1]


def show_topic_terms(self, topic_data, num_words):
    """
    Turn the first `num_words` (weight, word id) pairs of `topic_data` into
    (word, weight) pairs, looking each word up in `self.dictionary`.
    """
    selected = topic_data[:num_words]
    return [(self.dictionary[word_id], weight) for weight, word_id in selected]

Expand Down
15 changes: 1 addition & 14 deletions gensim/models/ldamodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,19 +92,6 @@ def update_dir_prior(prior, N, logphat, rho):

return prior

def get_random_state(seed):
    """
    Turn seed into a np.random.RandomState instance.

    Method originally from maciejkula/glove-python, and written by @joshloyal

    `seed` may be:

    * None or the `np.random` module -- numpy's global RandomState is returned;
    * an integer -- a new RandomState seeded with it is returned;
    * a RandomState instance -- returned unchanged.

    Any other value raises ValueError.
    """
    if seed is None or seed is np.random:
        return np.random.mtrand._rand
    if isinstance(seed, (numbers.Integral, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    # `seed` is wrapped in a 1-tuple: a bare tuple on the right-hand side of `%`
    # would be unpacked as the format arguments, raising TypeError (or printing
    # only its single element) instead of the intended ValueError.
    raise ValueError(
        '%r cannot be used to seed a np.random.RandomState instance' % (seed,)
    )

class LdaState(utils.SaveLoad):
"""
Expand Down Expand Up @@ -314,7 +301,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

self.eta, self.optimize_eta = self.init_dir_prior(eta, 'eta')

self.random_state = get_random_state(random_state)
self.random_state = utils.get_random_state(random_state)

assert (self.eta.shape == (self.num_terms,) or self.eta.shape == (self.num_topics, self.num_terms)), (
"Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" %
Expand Down
24 changes: 21 additions & 3 deletions gensim/test/test_hdpmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from gensim import matutils
from gensim.test import basetests

import numpy as np

module_path = os.path.dirname(__file__) # needed because sample data files are located in the same folder
datapath = lambda fname: os.path.join(module_path, 'test_data', fname)
Expand Down Expand Up @@ -51,12 +52,29 @@ class TestHdpModel(unittest.TestCase, basetests.TestBaseTopicModel):
def setUp(self):
    """Load the test corpus and build the HdpModel instance used by the tests."""
    self.corpus = mmcorpus.MmCorpus(datapath('testcorpus.mm'))
    self.class_ = hdpmodel.HdpModel
    # NOTE(review): `corpus` and `dictionary` are not the `self.corpus` set above;
    # presumably module-level fixtures -- confirm.
    self.model = self.class_(corpus, id2word=dictionary)
    # np.random.seed(0) seeds numpy's global generator and returns None, so the
    # model is actually given random_state=None.
    self.model = self.class_(corpus, id2word=dictionary, random_state=np.random.seed(0))

def testShowTopic(self):
# TODO create show_topic in HdpModel and then test
def testTopicValues(self):
    """
    Check the first entry returned by `show_topics`: its topic id, and the
    probability and word of its leading term.
    """
    first_topic = self.model.show_topics()[0]
    # the leading term is everything before the first '+'; the expected word
    # keeps a trailing space, presumably left over from the term separator
    leading_term = first_topic[1].split('+')[0]
    prob, word = leading_term.split('*')

    self.assertEqual(first_topic[0], 0)
    self.assertEqual(prob, '0.264')
    self.assertEqual(word, 'trees ')

def testLDAmodel(self):
    """
    Create ldamodel object, and check if the corresponding alphas are equal.
    """
    ldam = self.model.suggested_lda_model()
    # Only the first alpha component is compared.
    # NOTE(review): relies on self.model.lda_alpha being populated (HdpModel.__init__
    # initialises it to None) -- confirm it is set while the model is trained.
    self.assertEqual(ldam.alpha[0], self.model.lda_alpha[0])


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()
16 changes: 16 additions & 0 deletions gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
import subprocess

import numpy
import numbers
import scipy.sparse

if sys.version_info[0] >= 3:
Expand Down Expand Up @@ -80,6 +81,21 @@ def smart_open(fname, mode='rb'):
RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE)


def get_random_state(seed):
""" Turn seed into a np.random.RandomState instance.

Method originally from maciejkula/glove-python, and written by @joshloyal
"""
if seed is None or seed is numpy.random:
return numpy.random.mtrand._rand
if isinstance(seed, (numbers.Integral, numpy.integer)):
return numpy.random.RandomState(seed)
if isinstance(seed, numpy.random.RandomState):
return seed
raise ValueError('%r cannot be used to seed a numpy.random.RandomState'
Copy link
Owner

@piskvorky piskvorky Dec 27, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No vertical indent in gensim, please use hanging indent.

@tmylk this keeps happening over and over -- watch out for this in reviews.

Copy link
Contributor Author

@bhargavvader bhargavvader Dec 27, 2016

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, I had just copy-pasted this from the existing ldamodel code and missed this. Fixing it in a new PR where I make some changes to utils.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@bhargavvader Thanks!

' instance' % seed)


def synchronous(tlockname):
"""
A decorator to place an instance-based lock around a method.
Expand Down