From b96975e3a97abcf38e366d07343cafa28697be7e Mon Sep 17 00:00:00 2001 From: Joshua Charles Campbell Date: Wed, 12 Aug 2015 18:05:37 -0600 Subject: [PATCH 1/5] Add automatic updates for eta parameter --- gensim/models/ldamodel.py | 58 +++++++++++++++++++++++++++++++++------ 1 file changed, 49 insertions(+), 9 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 009179e26f..c016617759 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -200,11 +200,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, prior directly from your data. `eta` can be a scalar for a symmetric prior over topic/word - distributions, or a matrix of shape num_topics x num_words, - which can be used to impose asymmetric priors over the word - distribution on a per-topic basis. This may be useful if you - want to seed certain topics with particular words by boosting - the priors for those words. + distributions, or a matrix of shape num_topics x num_words, which can + be used to impose asymmetric priors over the word distribution on a + per-topic basis. This may be useful if you want to seed certain topics + with particular words by boosting the priors for those words. It also + supports the special value 'auto', which learns an asymmetric prior + directly from your data. Turn on `distributed` to force distributed computing (see the `web tutorial `_ on how to set up a cluster of machines for gensim). @@ -274,8 +275,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, if len(self.alpha) != num_topics: raise RuntimeError("invalid alpha shape (must match num_topics)") - if eta is None: + self.optimize_eta = eta == 'auto' + if eta == 'symmetric' or eta is None: + logger.info("using symmetric eta at %s", 1.0 / num_topics) self.eta = 1.0 / num_topics + elif eta == 'auto': + # this needs to be a column vector of length num_topics + self.eta = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]).reshape((num_topics,1)) + logger.info("using autotuned eta, starting with %s", list(self.eta)) else: self.eta = eta @@ -454,6 +461,36 @@ def update_alpha(self, gammat, rho): return self.alpha + def update_eta(self, lambdat, rho): + """ + Update parameters for the Dirichlet prior on the per-topic + word weights `eta` given the last `lambdat`. + + Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** + http://jonathan-huang.org/research/dirichlet/dirichlet.pdf + + """ + if self.eta.shape[1] != 1: + raise ValueError("Can't use update_eta with eta matrices, only column vectors.") + N = float(lambdat.shape[1]) + logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics,1)) + deta = numpy.copy(self.eta) + gradf = N * (psi(numpy.sum(self.eta)) - psi(self.eta) + logphat) + + c = N * polygamma(1, numpy.sum(self.eta)) + q = -N * polygamma(1, self.eta) + + b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) + + deta = -(gradf - b) / q + if all(rho * deta + self.eta > 0): + self.eta += rho * deta + else: + logger.warning("updated eta not positive") + logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics)))) + + return self.eta + def log_perplexity(self, chunk, total_docs=None): """ Calculate and return per-word likelihood bound, using the `chunk` of @@ -629,6 +666,9 @@ def do_mstep(self, rho, other, extra_pass=False): self.print_topics(5) logger.info("topic diff=%f, rho=%f", numpy.mean(numpy.abs(diff)), rho) + if self.optimize_eta: + self.update_eta(self.state.get_lambda(), rho) + if not extra_pass: # only update if this isn't an additional pass self.num_updates += other.numdocs @@ -846,9 +886,9 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): Save the model to file. Large internal arrays may be stored into separate files, with `fname` as prefix. - + `separately` can be used to define which arrays should be stored in separate files. - + `ignore` parameter can be used to define which variables should be ignored, i.e. left out from the pickled lda model. By default the internal `state` is ignored as it uses its own serialisation not the one provided by `LdaModel`. The `state` and `dispatcher @@ -870,7 +910,7 @@ def save(self, fname, ignore=['state', 'dispatcher'], *args, **kwargs): """ if self.state is not None: self.state.save(utils.smart_extension(fname, '.state'), *args, **kwargs) - + # make sure 'state' and 'dispatcher' are ignored from the pickled object, even if # someone sets the ignore list themselves if ignore is not None and ignore: From e7ba07b70efbf4988c609ecf7993c5a08dd6fcba Mon Sep 17 00:00:00 2001 From: Christopher Corley Date: Sat, 10 Oct 2015 17:59:22 -0400 Subject: [PATCH 2/5] Add testing for LdaModel alpha,eta 'auto' settings --- gensim/test/test_ldamodel.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index f7a2eaf820..e7f7e3b660 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -77,6 +77,20 @@ def testTransform(self): (i, sorted(vec), sorted(expected))) self.assertTrue(passed) + def testAlpha(self): + model1 = self.class_(corpus, id2word=dictionary, alpha='symmetric', passes=10) + modelauto = self.class_(corpus, id2word=dictionary, alpha='auto', passes=10) + + # did we learn something? + self.assertFalse(all(numpy.equal(model1.alpha, modelauto.alpha))) + + def testEta(self): + model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10) + modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10) + + # did we learn something? + self.assertFalse(all(numpy.equal(model1.eta, modelauto.eta))) + def testTopTopics(self): top_topics = self.model.top_topics(self.corpus) @@ -254,6 +268,11 @@ def setUp(self): self.class_ = ldamulticore.LdaMulticore self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) + # override LdaModel because multicore does not allow alpha=auto + def testAlpha(self): + pass + + #endclass TestLdaMulticore From f331d0c53413a59ec7d8de7601d7ae520582600a Mon Sep 17 00:00:00 2001 From: Christopher Corley Date: Sun, 11 Oct 2015 11:34:05 -0400 Subject: [PATCH 3/5] Refactor out prior updates into single function --- gensim/models/ldamodel.py | 59 +++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index c016617759..7bb96b3a17 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -66,6 +66,29 @@ def dirichlet_expectation(alpha): result = psi(alpha) - psi(numpy.sum(alpha, 1))[:, numpy.newaxis] return result.astype(alpha.dtype) # keep the same precision as input +def update_dir_prior(prior, N, logphat, rho): + """ + Updates a given prior using Newton's method, described in + **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** + http://jonathan-huang.org/research/dirichlet/dirichlet.pdf + """ + dprior = numpy.copy(prior) + gradf = N * (psi(numpy.sum(prior)) - psi(prior) + logphat) + + c = N * polygamma(1, numpy.sum(prior)) + q = -N * polygamma(1, prior) + + b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) + + dprior = -(gradf - b) / q + + if all(rho * dprior + prior > 0): + prior += rho * dprior + else: + logger.warning("updated prior not positive") + + return prior + class LdaState(utils.SaveLoad): """ @@ -432,31 +455,16 @@ def do_estep(self, chunk, state=None): state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator return gamma + def update_alpha(self, gammat, rho): """ Update parameters for the Dirichlet prior on the per-document topic weights `alpha` given the last `gammat`. - - Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** - http://jonathan-huang.org/research/dirichlet/dirichlet.pdf - """ N = float(len(gammat)) logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N - dalpha = numpy.copy(self.alpha) - gradf = N * (psi(numpy.sum(self.alpha)) - psi(self.alpha) + logphat) - - c = N * polygamma(1, numpy.sum(self.alpha)) - q = -N * polygamma(1, self.alpha) - - b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) - dalpha = -(gradf - b) / q - - if all(rho * dalpha + self.alpha > 0): - self.alpha += rho * dalpha - else: - logger.warning("updated alpha not positive") + self.alpha = update_dir_prior(self.alpha, N, logphat, rho) logger.info("optimized alpha %s", list(self.alpha)) return self.alpha @@ -465,28 +473,13 @@ def update_eta(self, lambdat, rho): """ Update parameters for the Dirichlet prior on the per-topic word weights `eta` given the last `lambdat`. - - Uses Newton's method, described in **Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.** - http://jonathan-huang.org/research/dirichlet/dirichlet.pdf - """ if self.eta.shape[1] != 1: raise ValueError("Can't use update_eta with eta matrices, only column vectors.") N = float(lambdat.shape[1]) logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat.transpose()) / N).reshape((self.num_topics,1)) - deta = numpy.copy(self.eta) - gradf = N * (psi(numpy.sum(self.eta)) - psi(self.eta) + logphat) - - c = N * polygamma(1, numpy.sum(self.eta)) - q = -N * polygamma(1, self.eta) - b = numpy.sum(gradf / q) / (1 / c + numpy.sum(1 / q)) - - deta = -(gradf - b) / q - if all(rho * deta + self.eta > 0): - self.eta += rho * deta - else: - logger.warning("updated eta not positive") + self.eta = update_dir_prior(self.eta, N, logphat, rho) logger.info("optimized eta %s", list(self.eta.reshape((self.num_topics)))) return self.eta From 5fe33f280656815137b4d95d596a0cc414433dfa Mon Sep 17 00:00:00 2001 From: Christopher Corley Date: Sun, 18 Oct 2015 14:30:10 -0400 Subject: [PATCH 4/5] Makes eta and alpha init sequences similar and add tests for each. --- gensim/models/ldamodel.py | 43 ++++++++++--- gensim/test/test_ldamodel.py | 120 +++++++++++++++++++++++++++++++++-- 2 files changed, 151 insertions(+), 12 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index 7bb96b3a17..fcf26bf400 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -35,6 +35,7 @@ import logging import numpy # for arrays, array broadcasting etc. +import numbers from gensim import interfaces, utils, matutils from itertools import chain @@ -292,22 +293,48 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, elif alpha == 'auto': self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]) logger.info("using autotuned alpha, starting with %s", list(self.alpha)) + elif isinstance(alpha, list): + self.alpha = numpy.asarray(alpha) + elif isinstance(alpha, numpy.ndarray): + self.alpha = alpha + elif isinstance(alpha, numpy.number) or isinstance(alpha, numbers.Real): + self.alpha = numpy.asarray([alpha] * num_topics) else: - # must be either float or an array of floats, of size num_topics - self.alpha = alpha if isinstance(alpha, numpy.ndarray) else numpy.asarray([alpha] * num_topics) - if len(self.alpha) != num_topics: - raise RuntimeError("invalid alpha shape (must match num_topics)") + raise ValueError("alpha must be either a numpy array of scalars, list of scalars, or scalar") + assert self.alpha.shape == (num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics) + + # please note the difference in init between alpha and eta: + # alpha is a row: [0.1, 0.1] + # eta is a column: [[0.1], + # [0.1]] self.optimize_eta = eta == 'auto' if eta == 'symmetric' or eta is None: logger.info("using symmetric eta at %s", 1.0 / num_topics) - self.eta = 1.0 / num_topics + self.eta = numpy.asarray([[1.0 / num_topics] for i in xrange(num_topics)]) + elif eta == 'asymmetric': + self.eta = numpy.asarray([[1.0 / (i + numpy.sqrt(num_topics))] for i in xrange(num_topics)]) + self.eta /= self.eta.sum() + logger.info("using asymmetric eta %s", list(self.eta)) elif eta == 'auto': - # this needs to be a column vector of length num_topics - self.eta = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]).reshape((num_topics,1)) + self.eta = numpy.asarray([[1.0 / num_topics] for i in xrange(num_topics)]) logger.info("using autotuned eta, starting with %s", list(self.eta)) - else: + elif isinstance(eta, list): + self.eta = numpy.asarray(eta) + elif isinstance(eta, numpy.ndarray): self.eta = eta + elif isinstance(eta, numpy.number) or isinstance(eta, numbers.Real): + self.eta = numpy.asarray([[eta]] * num_topics) + else: + raise ValueError("eta must be either a numpy array of scalars, list of scalars, or scalar") + + if self.eta.shape == (num_topics,) or self.eta.shape == (1, num_topics): + # client sent in something in the wrong shape, but in this case is a simple mistake that we can fix. + self.eta = self.eta.reshape((num_topics, 1)) # this statement throws ValueError if eta did not match num_topics + + assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), ( + "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % + (str(self.eta.shape), num_topics, num_topics, self.num_terms)) # VB constants self.iterations = iterations diff --git a/gensim/test/test_ldamodel.py b/gensim/test/test_ldamodel.py index e7f7e3b660..ba0df78b38 100644 --- a/gensim/test/test_ldamodel.py +++ b/gensim/test/test_ldamodel.py @@ -77,20 +77,132 @@ def testTransform(self): (i, sorted(vec), sorted(expected))) self.assertTrue(passed) - def testAlpha(self): + def testAlphaAuto(self): model1 = self.class_(corpus, id2word=dictionary, alpha='symmetric', passes=10) modelauto = self.class_(corpus, id2word=dictionary, alpha='auto', passes=10) # did we learn something? self.assertFalse(all(numpy.equal(model1.alpha, modelauto.alpha))) - def testEta(self): + def testAlpha(self): + kwargs = dict( + id2word=dictionary, + num_topics=2, + alpha=None + ) + expected_shape = (2,) + + # should not raise anything + self.class_(**kwargs) + + kwargs['alpha'] = 'symmetric' + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.5, 0.5]))) + + kwargs['alpha'] = 'asymmetric' + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(numpy.allclose(model.alpha, [0.630602, 0.369398])) + + kwargs['alpha'] = 0.3 + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3]))) + + kwargs['alpha'] = 3 + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([3, 3]))) + + kwargs['alpha'] = [0.3, 0.3] + model = self.class_(**kwargs) + self.assertEqual(model.alpha.shape, expected_shape) + self.assertTrue(all(model.alpha == numpy.array([0.3, 0.3]))) + + # all should raise an exception for being wrong shape + kwargs['alpha'] = [0.3, 0.3, 0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = [[0.3], [0.3]] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = [0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['alpha'] = "gensim is cool" + self.assertRaises(ValueError, self.class_, **kwargs) + + + def testEtaAuto(self): model1 = self.class_(corpus, id2word=dictionary, eta='symmetric', passes=10) modelauto = self.class_(corpus, id2word=dictionary, eta='auto', passes=10) # did we learn something? self.assertFalse(all(numpy.equal(model1.eta, modelauto.eta))) + def testEta(self): + kwargs = dict( + id2word=dictionary, + num_topics=2, + eta=None + ) + expected_shape = (2, 1) + + # should not raise anything + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]]))) + + kwargs['eta'] = 'symmetric' + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.5], [0.5]]))) + + kwargs['eta'] = 'asymmetric' + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(numpy.allclose(model.eta, [[0.630602], [0.369398]])) + + kwargs['eta'] = 0.3 + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + kwargs['eta'] = 3 + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[3], [3]]))) + + kwargs['eta'] = [[0.3], [0.3]] + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + kwargs['eta'] = [0.3, 0.3] + model = self.class_(**kwargs) + self.assertEqual(model.eta.shape, expected_shape) + self.assertTrue(all(model.eta == numpy.array([[0.3], [0.3]]))) + + # should be ok with num_topics x num_terms + testeta = numpy.array([[0.5] * len(dictionary)] * 2) + kwargs['eta'] = testeta + self.class_(**kwargs) + + # all should raise an exception for being wrong shape + kwargs['eta'] = testeta.reshape(tuple(reversed(testeta.shape))) + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = [0.3, 0.3, 0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = [0.3] + self.assertRaises(AssertionError, self.class_, **kwargs) + + kwargs['eta'] = "gensim is cool" + self.assertRaises(ValueError, self.class_, **kwargs) + + def testTopTopics(self): top_topics = self.model.top_topics(self.corpus) @@ -269,8 +381,8 @@ def setUp(self): self.model = self.class_(corpus, id2word=dictionary, num_topics=2, passes=100) # override LdaModel because multicore does not allow alpha=auto - def testAlpha(self): - pass + def testAlphaAuto(self): + self.assertRaises(RuntimeError, self.class_, alpha='auto') #endclass TestLdaMulticore From 1350a422dfc0be5bdb268b6254c383839f337789 Mon Sep 17 00:00:00 2001 From: Christopher Corley Date: Sun, 18 Oct 2015 15:06:45 -0400 Subject: [PATCH 5/5] Refactor common alpha/eta initialization into method --- gensim/models/ldamodel.py | 76 +++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 44 deletions(-) diff --git a/gensim/models/ldamodel.py b/gensim/models/ldamodel.py index fcf26bf400..ef95e630d6 100755 --- a/gensim/models/ldamodel.py +++ b/gensim/models/ldamodel.py @@ -283,54 +283,12 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, self.eval_every = eval_every self.optimize_alpha = alpha == 'auto' - if alpha == 'symmetric' or alpha is None: - logger.info("using symmetric alpha at %s", 1.0 / num_topics) - self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]) - elif alpha == 'asymmetric': - self.alpha = numpy.asarray([1.0 / (i + numpy.sqrt(num_topics)) for i in xrange(num_topics)]) - self.alpha /= self.alpha.sum() - logger.info("using asymmetric alpha %s", list(self.alpha)) - elif alpha == 'auto': - self.alpha = numpy.asarray([1.0 / num_topics for i in xrange(num_topics)]) - logger.info("using autotuned alpha, starting with %s", list(self.alpha)) - elif isinstance(alpha, list): - self.alpha = numpy.asarray(alpha) - elif isinstance(alpha, numpy.ndarray): - self.alpha = alpha - elif isinstance(alpha, numpy.number) or isinstance(alpha, numbers.Real): - self.alpha = numpy.asarray([alpha] * num_topics) - else: - raise ValueError("alpha must be either a numpy array of scalars, list of scalars, or scalar") + self.alpha = self.init_dir_prior(alpha, 'alpha') assert self.alpha.shape == (num_topics,), "Invalid alpha shape. Got shape %s, but expected (%d, )" % (str(self.alpha.shape), num_topics) - # please note the difference in init between alpha and eta: - # alpha is a row: [0.1, 0.1] - # eta is a column: [[0.1], - # [0.1]] self.optimize_eta = eta == 'auto' - if eta == 'symmetric' or eta is None: - logger.info("using symmetric eta at %s", 1.0 / num_topics) - self.eta = numpy.asarray([[1.0 / num_topics] for i in xrange(num_topics)]) - elif eta == 'asymmetric': - self.eta = numpy.asarray([[1.0 / (i + numpy.sqrt(num_topics))] for i in xrange(num_topics)]) - self.eta /= self.eta.sum() - logger.info("using asymmetric eta %s", list(self.eta)) - elif eta == 'auto': - self.eta = numpy.asarray([[1.0 / num_topics] for i in xrange(num_topics)]) - logger.info("using autotuned eta, starting with %s", list(self.eta)) - elif isinstance(eta, list): - self.eta = numpy.asarray(eta) - elif isinstance(eta, numpy.ndarray): - self.eta = eta - elif isinstance(eta, numpy.number) or isinstance(eta, numbers.Real): - self.eta = numpy.asarray([[eta]] * num_topics) - else: - raise ValueError("eta must be either a numpy array of scalars, list of scalars, or scalar") - - if self.eta.shape == (num_topics,) or self.eta.shape == (1, num_topics): - # client sent in something in the wrong shape, but in this case is a simple mistake that we can fix. - self.eta = self.eta.reshape((num_topics, 1)) # this statement throws ValueError if eta did not match num_topics + self.eta = self.init_dir_prior(eta, 'eta') assert (self.eta.shape == (num_topics, 1) or self.eta.shape == (num_topics, self.num_terms)), ( "Invalid eta shape. Got shape %s, but expected (%d, 1) or (%d, %d)" % @@ -371,6 +329,36 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, if corpus is not None: self.update(corpus) + def init_dir_prior(self, prior, name): + if prior == 'symmetric' or prior is None: + logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics) + init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) + elif prior == 'asymmetric': + init_prior = numpy.asarray([1.0 / (i + numpy.sqrt(self.num_topics)) for i in xrange(self.num_topics)]) + init_prior /= init_prior.sum() + logger.info("using asymmetric %s %s", name, list(init_prior)) + elif prior == 'auto': + init_prior = numpy.asarray([1.0 / self.num_topics for i in xrange(self.num_topics)]) + logger.info("using autotuned %s, starting with %s", name, list(init_prior)) + elif isinstance(prior, list): + init_prior = numpy.asarray(prior) + elif isinstance(prior, numpy.ndarray): + init_prior = prior + elif isinstance(prior, numpy.number) or isinstance(prior, numbers.Real): + init_prior = numpy.asarray([prior] * self.num_topics) + else: + raise ValueError("%s must be either a numpy array of scalars, list of scalars, or scalar" % name) + + if name == 'eta': + # please note the difference in shapes between alpha and eta: + # alpha is a row: [0.1, 0.1] + # eta is a column: [[0.1], + # [0.1]] + if init_prior.shape == (self.num_topics,) or init_prior.shape == (1, self.num_topics): + init_prior = init_prior.reshape((self.num_topics, 1)) # this statement throws ValueError if eta did not match self.num_topics + + return init_prior + def __str__(self): return "LdaModel(num_terms=%s, num_topics=%s, decay=%s, chunksize=%s)" % \ (self.num_terms, self.num_topics, self.decay, self.chunksize)