diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 009179e26f..ef95e630d6 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -35,6 +35,7 @@ import logging import numpy # for arrays, array broadcasting etc. +import numbers from gensim import interfaces, utils, matutils from itertools import chain @@ -66,6 +67,29 @@ def dirichlet_expectation(alpha): result = psi(alpha) - psi(numpy.sum(alpha, 1))[:, numpy.newaxis] return result.astype(alpha.dtype) # keep the same precision as input +def update_dir_prior(prior, N, logphat, rho): + """ + Updates a given prior using Newton's method, described in + **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** + http://jonathan-huang.org/research/dirichlet/dirichlet.pdf + """ + dprior = numpy.copy(prior) + gradf = N * (psi(numpy.sum(prior)) - psi(prior) + logphat) + + c = N * polygamma(1, numpy.sum(prior)) + q = -N * polygamma(1, prior) + + b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) + + dprior = -(gradf - b) / q + + if all(rho * dprior + prior > 0): + prior += rho * dprior + else: + logger.warning("updated prior not positive") + + return prior + class LdaState(utils.SaveLoad): """ @@ -200,11 +224,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word - distributions, or a matrix of shape num_topics x num_words, - which can be used to impose asymmetric priors over the word - distribution on a per-topic basis. This may be useful if you - want to seed certain topics with particular words by boosting - the priors for those words. + distributions, or a matrix of shape num_topics x num_words, which can + be used to impose asymmetric priors over the word distribution on a + per-topic basis. This may be useful if you want to seed certain topics + with particular words by boosting the priors for those words. It also + supports the special value 'auto', which learns an asymmetric prior + directly from your data. Turn on `distributed` to force distributed computing (see the `web tutorial `_ on how to set up a cluster of machines for gensim). @@ -258,26 +283,16 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.eval_every = eval_every self.optimize_alpha = alpha == 'auto' - if alpha == 'symmetric' or alpha is None: - logger.info("using symmetric alpha at %s", 1.0 / num_topics) - self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]) - elif alpha == 'asymmetric': - self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)]) - self.alpha /= self.alpha.sum() - logger.info("using asymmetric alpha %s", list(self.alpha)) - elif alpha == 'auto': - self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]) - logger.info("using autotuned alpha, starting with %s", list(self.alpha)) - else: - # must be either float or an array of floats, of size num_topics - self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics) - if len(self.alpha) != num_topics: - raise RuntimeError("invalid alpha shape (must match num_topics)") + self.alpha = self.init_dir_prior(alpha, 'alpha') - if eta is None: - self.eta = 1.0 / num_topics - else: - self.eta = eta + assert self.alpha.shape == (num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics) + + self.optimize_eta = eta == 'auto' + self.eta = self.init_dir_prior(eta, 'eta') + + assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), ( + "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % + (str(self.eta.shape), num_topics, num_topics, self.num_terms)) # VB constants self.iterations = iterations @@ -314,6 +329,36 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, if corpus is not None: self.update(corpus) + def init_dir_prior(self, prior, name): + if prior == 'symmetric' or prior is None: + logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) + init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) + elif prior == 'asymmetric': + init_prior = numpy.asarray([1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)]) + init_prior /= init_prior.sum() + logger.info("using asymmetric %s %s", name, list(init_prior)) + elif prior == 'auto': + init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) + logger.info("using autotuned %s, starting with %s", name, list(init_prior)) + elif isinstance(prior, list): + init_prior = numpy.asarray(prior) + elif isinstance(prior, numpy.ndarray): + init_prior = prior + elif isinstance(prior, numpy.number) or isinstance(prior, numbers.Real): + init_prior = numpy.asarray([prior] * self.num_topics) + else: + raise ValueError("%s must be either a numpy array of scalars, list of scalars, or scalar" % name) + + if name == 'eta': + # please note the difference in shapes between alpha and eta: + # alpha is a row: [0.1, 0.1] + # eta is a column: [[0.1], + # [0.1]] + if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics): + init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics + + return init_prior + def __str__(self): return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \ (self.num_terms, self.num_topics, self.decay, self.chunksize) @@ -425,34 +470,34 @@ def do_estep(self, chunk, state=None): state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator return gamma + def update_alpha(self, gammat, rho): """ Update parameters for the Dirichlet prior on the per-document topic weights `alpha` given the last `gammat`. - - Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** - http://jonathan-huang.org/research/dirichlet/dirichlet.pdf - """ N = float(len(gammat)) logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N - dalpha = numpy.copy(self.alpha) - gradf = N * (psi(numpy.sum(self.alpha)) - psi(self.alpha) + logphat) - c = N * polygamma(1, numpy.sum(self.alpha)) - q = -N * polygamma(1, self.alpha) + self.alpha = update_dir_prior(self.alpha, N, logphat, rho) + logger.info("optimized alpha %s", list(self.alpha)) - b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) + return self.alpha - dalpha = -(gradf - b) / q + def update_eta(self, lambdat, rho): + """ + Update parameters for the Dirichlet prior on the per-topic + word weights `eta` given the last `lambdat`. + """ + if self.eta.shape[1] != 1: + raise ValueError("Can't use update_eta with eta matrices, only column vectors.") + N = float(lambdat.shape[1]) + logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics,1)) - if all(rho * dalpha + self.alpha > 0): - self.alpha += rho * dalpha - else: - logger.warning("updated alpha not positive") - logger.info("optimized alpha %s", list(self.alpha)) + self.eta = update_dir_prior(self.eta, N, logphat, rho) + logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics)))) - return self.alpha + return self.eta def log_perplexity(self, chunk, total_docs=None): """ @@ -629,6 +674,9 @@ def do_mstep(self, rho, other, extra_pass=False): self.print_topics(5) logger.info("topic diff=%f, rho=%f", numpy.mean(numpy.abs(diff)), rho) + if self.optimize_eta: + self.update_eta(self.state.get_lambda(), rho) + if not extra_pass: # only update if this isn't an additional pass self.num_updates += other.numdocs @@ -846,9 +894,9 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): Save the model to file. Large internal arrays may be stored into separate files, with `fname` as prefix. - + `separately` can be used to define which arrays should be stored in separate files. - + `ignore` parameter can be used to define which variables should be ignored, i.e. left out from the pickled lda model. By default the internal `state` is ignored as it uses its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher @@ -870,7 +918,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): """ if self.state is not None: self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs) - + # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if # someone sets the ignore list themselves if ignore is not None and ignore: diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index f7a2eaf820..ba0df78b38 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -77,6 +77,132 @@ def testTransform(self): (i, sorted(vec), sorted(expected))) self.assertTrue(passed) + def testAlphaAuto(self): + model1 = self.class_(corpus, id2word=dictionary, alpha='symmetric', passes=10) + modelauto = self.class_(corpus, id2word=dictionary, alpha='auto', passes=10) + + # did we learn something? + self.assertFalse(all(numpy.equal(model1.alpha, modelauto.alpha))) + + def testAlpha(self): + kwargs = dict( + id2word=dictionary, + num_topics=2, + alpha=None + ) + expected_shape = (2,) + + # should not raise anything + self.class_(**kwargs) + + kwargs['alpha'] = 'symmetric' + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.5, 0.5]))) + + kwargs['alpha'] = 'asymmetric' + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(numpy.allclose(model.alpha, [0.630602, 0.369398])) + + kwargs['alpha'] = 0.3 + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3]))) + + kwargs['alpha'] = 3 + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([3, 3]))) + + kwargs['alpha'] = [0.3, 0.3] + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3]))) + + # all should raise an exception for being wrong shape + kwargs['alpha'] = [0.3, 0.3, 0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = [[0.3], [0.3]] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = [0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = "gensim is cool" + self.assertRaises(ValueError, self.class_, **kwargs) + + + def testEtaAuto(self): + model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10) + modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10) + + # did we learn something? + self.assertFalse(all(numpy.equal(model1.eta, modelauto.eta))) + + def testEta(self): + kwargs = dict( + id2word=dictionary, + num_topics=2, + eta=None + ) + expected_shape = (2, 1) + + # should not raise anything + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]]))) + + kwargs['eta'] = 'symmetric' + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]]))) + + kwargs['eta'] = 'asymmetric' + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(numpy.allclose(model.eta, [[0.630602], [0.369398]])) + + kwargs['eta'] = 0.3 + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + kwargs['eta'] = 3 + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[3], [3]]))) + + kwargs['eta'] = [[0.3], [0.3]] + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + kwargs['eta'] = [0.3, 0.3] + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + # should be ok with num_topics x num_terms + testeta = numpy.array([[0.5] * len(dictionary)] * 2) + kwargs['eta'] = testeta + self.class_(**kwargs) + + # all should raise an exception for being wrong shape + kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape))) + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = [0.3, 0.3, 0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = [0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = "gensim is cool" + self.assertRaises(ValueError, self.class_, **kwargs) + + def testTopTopics(self): top_topics = self.model.top_topics(self.corpus) @@ -254,6 +380,11 @@ def setUp(self): self.class_ = ldamulticore.LdaMulticore self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + # override LdaModel because multicore does not allow alpha=auto + def testAlphaAuto(self): + self.assertRaises(RuntimeError, self.class_, alpha='auto') + + #endclass TestLdaMulticore