Adding dtype to LDAModel to speed it up #1656

Merged: 21 commits, merged on Nov 14, 2017.

3 changes: 1 addition & 2 deletions gensim/matutils.py
@@ -602,13 +602,12 @@ def jaccard_distance(set1, set2):
def dirichlet_expectation(alpha):
"""
For a vector `theta~Dir(alpha)`, compute `E[log(theta)]`.

"""
if len(alpha.shape) == 1:
result = psi(alpha) - psi(np.sum(alpha))
else:
result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]
return result.astype(alpha.dtype) # keep the same precision as input
return result.astype(alpha.dtype, copy=False) # keep the same precision as input


def qr_destroy(la):
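
For context on the `copy=False` change above: `ndarray.astype` returns the input array itself when the requested dtype (and layout) already match, so the conversion becomes a no-op instead of a fresh allocation. A minimal sketch, illustrative only and not part of the diff:

    import numpy as np
    from scipy.special import psi  # digamma, the same function used by dirichlet_expectation

    alpha = np.random.gamma(1.0, size=(10, 5)).astype(np.float32)
    result = psi(alpha) - psi(np.sum(alpha, 1))[:, np.newaxis]   # stays float32

    converted = result.astype(alpha.dtype, copy=False)
    print(converted.dtype)       # float32
    print(converted is result)   # True -- no extra copy, unlike plain astype(alpha.dtype)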

4 changes: 4 additions & 0 deletions gensim/models/atmodel.py
@@ -70,6 +70,7 @@ def __init__(self, eta, lambda_shape, gamma_shape):
self.sstats = np.zeros(lambda_shape)
self.gamma = np.zeros(gamma_shape)
self.numdocs = 0
self.dtype = np.float64 # To be compatible with LdaState


def construct_doc2author(corpus, author2doc):
@@ -203,6 +204,9 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, author2doc=None, d
>>> model = AuthorTopicModel(corpus, num_topics=50, author2doc=author2doc, id2word=id2word, alpha='auto', eval_every=5) # train asymmetric alpha from data

"""
# NOTE: this doesn't call constructor of a base class, but duplicates most of this code
# so we have to set dtype to float64 default here
self.dtype = np.float64

# NOTE: as distributed version of this model is not implemented, "distributed" is set to false. Some of the
# infrastructure to implement a distributed author-topic model is already in place, such as the AuthorTopicState.

2 changes: 1 addition & 1 deletion gensim/models/hdpmodel.py
@@ -538,7 +538,7 @@ def suggested_lda_model(self):
The num_topics is m_T (default is 150) so as to preserve the matrix shapes when we assign alpha and beta.
"""
alpha, beta = self.hdp_to_lda()
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state)
ldam = ldamodel.LdaModel(num_topics=self.m_T, alpha=alpha, id2word=self.id2word, random_state=self.random_state, dtype=np.float64)
ldam.expElogbeta[:] = beta
return ldam

89 changes: 66 additions & 23 deletions gensim/models/ldamodel.py
@@ -60,7 +60,7 @@ def update_dir_prior(prior, N, logphat, rho):
**Huang: Maximum Likelihood Estimation of Dirichlet Distribution Parameters.**
http://jonathan-huang.org/research/dirichlet/dirichlet.pdf
"""
dprior = np.copy(prior)
dprior = np.copy(prior) # TODO: unused var???
gradf = N * (psi(np.sum(prior)) - psi(prior) + logphat)

c = N * polygamma(1, np.sum(prior))
@@ -87,10 +87,11 @@ class LdaState(utils.SaveLoad):

"""

def __init__(self, eta, shape):
self.eta = eta
self.sstats = np.zeros(shape)
def __init__(self, eta, shape, dtype=np.float32):
self.eta = eta.astype(dtype, copy=False)
self.sstats = np.zeros(shape, dtype=dtype)
self.numdocs = 0
self.dtype = dtype

def reset(self):
"""
@@ -165,6 +166,17 @@ def get_lambda(self):

def get_Elogbeta(self):
return dirichlet_expectation(self.get_lambda())

@classmethod
def load(cls, fname, *args, **kwargs):
result = super(LdaState, cls).load(fname, *args, **kwargs)

# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)

return result
# endclass LdaState
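
A quick illustration of what the new `dtype` argument changes in the state arrays (a sketch against the post-merge API; the `eta` values are arbitrary):

    import numpy as np
    from gensim.models.ldamodel import LdaState

    eta = np.full(1000, 1.0 / 50)                        # float64 on the way in
    state = LdaState(eta, (50, 1000), dtype=np.float32)

    print(state.eta.dtype)      # float32 -- eta is cast by the constructor
    print(state.sstats.dtype)   # float32 -- sufficient statistics allocated directly in float32
    print(state.dtype)          # <class 'numpy.float32'>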


@@ -191,7 +203,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
alpha='symmetric', eta=None, decay=0.5, offset=1.0, eval_every=10,
iterations=50, gamma_threshold=0.001, minimum_probability=0.01,
random_state=None, ns_conf=None, minimum_phi_value=0.01,
per_word_topics=False, callbacks=None):
per_word_topics=False, callbacks=None, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -233,9 +245,11 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,

`minimum_probability` controls filtering the topics returned for a document (bow).

`random_state` can be a np.random.RandomState object or the seed for one
`random_state` can be a np.random.RandomState object or the seed for one.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training.

`callbacks` a list of metric callbacks to log/visualize evaluation metrics of topic model during training
`dtype` is the data type to use for calculations inside the model. All inputs are also converted to this dtype.

Example:

@@ -247,6 +261,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
>>> lda = LdaModel(corpus, num_topics=50, alpha='auto', eval_every=5) # train asymmetric alpha from data

"""
self.dtype = dtype

# store user-supplied parameters
self.id2word = id2word
@@ -330,10 +345,14 @@ def __init__(self, corpus=None, num_topics=100, id2word=None,
raise RuntimeError("failed to initialize distributed LDA (%s)" % err)

# Initialize the variational distribution q(beta|lambda)
self.state = LdaState(self.eta, (self.num_topics, self.num_terms))
self.state.sstats = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.state = LdaState(self.eta, (self.num_topics, self.num_terms), dtype=self.dtype)
self.state.sstats[...] = self.random_state.gamma(100., 1. / 100., (self.num_topics, self.num_terms))
self.expElogbeta = np.exp(dirichlet_expectation(self.state.sstats))

# Check that we haven't accidentally fallen back to np.float64
assert self.eta.dtype == self.dtype
assert self.expElogbeta.dtype == self.dtype

# if a training corpus was provided, start estimating the model right away
if corpus is not None:
use_numpy = self.dispatcher is not None
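
The switch from `self.state.sstats = ...` to `self.state.sstats[...] = ...` matters here: writing into the existing buffer casts the float64 gamma sample to the state's dtype, whereas rebinding the attribute would silently swap in a float64 array. A small illustration with generic names, not from the diff:

    import numpy as np

    rng = np.random.RandomState(0)
    sstats = np.zeros((2, 3), dtype=np.float32)

    sstats[...] = rng.gamma(100., 1. / 100., (2, 3))   # in-place: sample cast to float32
    print(sstats.dtype)   # float32

    sstats = rng.gamma(100., 1. / 100., (2, 3))        # rebinding: dtype silently becomes float64
    print(sstats.dtype)   # float64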
@@ -354,25 +373,25 @@ def init_dir_prior(self, prior, name):

if isinstance(prior, six.string_types):
if prior == 'symmetric':
logger.info("using symmetric %s at %s", name, 1.0 / prior_shape)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
logger.info("using symmetric %s at %s", name, 1.0 / self.num_topics)
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
elif prior == 'asymmetric':
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / (i + np.sqrt(prior_shape)) for i in xrange(prior_shape)], dtype=self.dtype)
init_prior /= init_prior.sum()
logger.info("using asymmetric %s %s", name, list(init_prior))
elif prior == 'auto':
is_auto = True
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)])
init_prior = np.asarray([1.0 / self.num_topics for i in xrange(prior_shape)], dtype=self.dtype)
if name == 'alpha':
logger.info("using autotuned %s, starting with %s", name, list(init_prior))
else:
raise ValueError("Unable to determine proper %s value given '%s'" % (name, prior))
elif isinstance(prior, list):
init_prior = np.asarray(prior)
init_prior = np.asarray(prior, dtype=self.dtype)
elif isinstance(prior, np.ndarray):
init_prior = prior
init_prior = prior.astype(self.dtype, copy=False)
elif isinstance(prior, np.number) or isinstance(prior, numbers.Real):
init_prior = np.asarray([prior] * prior_shape)
init_prior = np.asarray([prior] * prior_shape, dtype=self.dtype)
else:
raise ValueError("%s must be either a np array of scalars, list of scalars, or scalar" % name)

@@ -385,6 +404,7 @@ def __str__(self):

def sync_state(self):
self.expElogbeta = np.exp(self.state.get_Elogbeta())
assert self.expElogbeta.dtype == self.dtype

def clear(self):
"""Clear model state (free up some memory). Used in the distributed algo."""
@@ -418,11 +438,15 @@ def inference(self, chunk, collect_sstats=False):
logger.debug("performing inference on a chunk of %i documents", len(chunk))

# Initialize the variational distribution q(theta|gamma) for the chunk
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics))
gamma = self.random_state.gamma(100., 1. / 100., (len(chunk), self.num_topics)).astype(self.dtype, copy=False)
Elogtheta = dirichlet_expectation(gamma)
expElogtheta = np.exp(Elogtheta)

assert Elogtheta.dtype == self.dtype
assert expElogtheta.dtype == self.dtype

if collect_sstats:
sstats = np.zeros_like(self.expElogbeta)
sstats = np.zeros_like(self.expElogbeta, dtype=self.dtype)
else:
sstats = None
converged = 0
@@ -437,7 +461,7 @@ def inference(self, chunk, collect_sstats=False):
ids = [int(idx) for idx, _ in doc]
else:
ids = [idx for idx, _ in doc]
cts = np.array([cnt for _, cnt in doc])
cts = np.array([cnt for _, cnt in doc], dtype=self.dtype)
gammad = gamma[d, :]
Elogthetad = Elogtheta[d, :]
expElogthetad = expElogtheta[d, :]
@@ -464,6 +488,7 @@ def inference(self, chunk, collect_sstats=False):
converged += 1
break
gamma[d, :] = gammad
assert gammad.dtype == self.dtype
if collect_sstats:
# Contribution of document d to the expected sufficient
# statistics for the M step.
@@ -478,6 +503,9 @@ def inference(self, chunk, collect_sstats=False):
# sstats[k, w] = \sum_d n_{dw} * phi_{dwk}
# = \sum_d n_{dw} * exp{Elogtheta_{dk} + Elogbeta_{kw}} / phinorm_{dw}.
sstats *= self.expElogbeta
assert sstats.dtype == self.dtype

assert gamma.dtype == self.dtype
return gamma, sstats

def do_estep(self, chunk, state=None):
@@ -491,6 +519,7 @@ def do_estep(self, chunk, state=None):
gamma, sstats = self.inference(chunk, collect_sstats=True)
state.sstats += sstats
state.numdocs += gamma.shape[0] # avoids calling len(chunk) on a generator
assert gamma.dtype == self.dtype
return gamma

def update_alpha(self, gammat, rho):
@@ -500,10 +529,12 @@ def update_alpha(self, gammat, rho):
"""
N = float(len(gammat))
logphat = sum(dirichlet_expectation(gamma) for gamma in gammat) / N
assert logphat.dtype == self.dtype

self.alpha = update_dir_prior(self.alpha, N, logphat, rho)
logger.info("optimized alpha %s", list(self.alpha))

assert self.alpha.dtype == self.dtype
return self.alpha

def update_eta(self, lambdat, rho):
@@ -513,9 +544,11 @@ def update_eta(self, lambdat, rho):
"""
N = float(lambdat.shape[0])
logphat = (sum(dirichlet_expectation(lambda_) for lambda_ in lambdat) / N).reshape((self.num_terms,))
assert logphat.dtype == self.dtype

self.eta = update_dir_prior(self.eta, N, logphat, rho)

assert self.eta.dtype == self.dtype
return self.eta

def log_perplexity(self, chunk, total_docs=None):
@@ -647,7 +680,7 @@ def rho():
logger.info('initializing %s workers', self.numworkers)
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False

reallen = 0
@@ -691,7 +724,7 @@ def rho():
logger.info('initializing workers')
self.dispatcher.reset(self.state)
else:
other = LdaState(self.eta, self.state.sstats.shape)
other = LdaState(self.eta, self.state.sstats.shape, self.dtype)
dirty = False
# endfor single corpus iteration

@@ -772,6 +805,9 @@ def bound(self, corpus, gamma=None, subsample_ratio=1.0):
gammad = gamma[d]
Elogthetad = dirichlet_expectation(gammad)

assert gammad.dtype == self.dtype
assert Elogthetad.dtype == self.dtype

# E[log p(doc | theta, beta)]
score += np.sum(cnt * logsumexp(Elogthetad + Elogbeta[:, int(id)]) for id, cnt in doc)

@@ -820,6 +856,7 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True):

# add a little random jitter, to randomize results around the same alpha
sort_alpha = self.alpha + 0.0001 * self.random_state.rand(len(self.alpha))
# random_state.rand returns float64, but converting back to dtype won't speed up anything
Contributor: Maybe .astype (for consistency only)?

Contributor Author: Consistency vs one additional array copy. I'm not sure :)


sorted_topics = list(matutils.argsort(sort_alpha))
chosen_topics = sorted_topics[:num_topics // 2] + sorted_topics[-num_topics // 2:]
@@ -856,7 +893,7 @@ def show_topic(self, topicid, topn=10):
def get_topics(self):
"""
Returns:
np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents
np.ndarray: `num_topics` x `vocabulary_size` array of floats (self.dtype) which represents
the term topic matrix learned during inference.
"""
topics = self.state.get_lambda()
@@ -1028,6 +1065,7 @@ def diff(self, other, distance="kullback_leibler", num_words=100,
>>> print(mdiff) # get matrix with difference for each topic pair from `m1` and `m2`
>>> print(annotation) # get array with positive/negative words for each topic pair from `m1` and `m2`

Note: this ignores differences in model dtypes
"""

distances = {
@@ -1186,9 +1224,14 @@ def load(cls, fname, *args, **kwargs):
result.random_state = utils.get_random_state(None) # using default value `get_random_state(None)`
logging.warning("random_state not set so using default value")

# dtype could be absent in old models
if not hasattr(result, 'dtype'):
result.dtype = np.float64  # float64 was implicitly used before (because it's the default in numpy)
logging.info("dtype was not set in saved %s file %s, assuming np.float64", result.__class__.__name__, fname)
Contributor: Maybe warn?

Contributor Author: See #1656 (comment) for discussion. Although it's the expected state when loading an old model, maybe a warning is still a good thing.


state_fname = utils.smart_extension(fname, '.state')
try:
result.state = super(LdaModel, cls).load(state_fname, *args, **kwargs)
result.state = LdaState.load(state_fname, *args, **kwargs)
except Exception as e:
logging.warning("failed to load state from %s: %s", state_fname, e)

6 changes: 4 additions & 2 deletions gensim/models/ldamulticore.py
@@ -49,6 +49,8 @@

import logging

import numpy as np

from gensim import utils
from gensim.models.ldamodel import LdaModel, LdaState

@@ -82,7 +84,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
chunksize=2000, passes=1, batch=False, alpha='symmetric',
eta=None, decay=0.5, offset=1.0, eval_every=10, iterations=50,
gamma_threshold=0.001, random_state=None, minimum_probability=0.01,
minimum_phi_value=0.01, per_word_topics=False):
minimum_phi_value=0.01, per_word_topics=False, dtype=np.float32):
"""
If given, start training from the iterable `corpus` straight away. If not given,
the model is left untrained (presumably because you want to call `update()` manually).
@@ -148,7 +150,7 @@ def __init__(self, corpus=None, num_topics=100, id2word=None, workers=None,
id2word=id2word, chunksize=chunksize, passes=passes, alpha=alpha, eta=eta,
decay=decay, offset=offset, eval_every=eval_every, iterations=iterations,
gamma_threshold=gamma_threshold, random_state=random_state, minimum_probability=minimum_probability,
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics
minimum_phi_value=minimum_phi_value, per_word_topics=per_word_topics, dtype=dtype
)

def update(self, corpus, chunks_as_numpy=False):
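
LdaMulticore only forwards `dtype` to the base LdaModel constructor, so the same switch applies to the parallel trainer (a sketch; reuses the toy `corpus` and `dictionary` from the previous example):

    import numpy as np
    from gensim.models import LdaMulticore

    lda_mc = LdaMulticore(corpus, id2word=dictionary, num_topics=2,
                          workers=2, dtype=np.float64)
    print(lda_mc.state.sstats.dtype)   # float64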

7 changes: 4 additions & 3 deletions gensim/models/ldaseqmodel.py
@@ -130,7 +130,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
passes=passes, alpha=self.alphas, random_state=random_state
passes=passes, alpha=self.alphas, random_state=random_state,
dtype=np.float64
Contributor: Maybe it would be a good idea to change the default behaviour (to float32)? CC @piskvorky @xelez

Contributor Author: Not now; LdaSeqModel will require modifications similar to those I made in LdaModel to handle dtype properly.

)
self.sstats = np.transpose(lda_model.state.sstats)
if initialize == 'ldamodel':
@@ -244,7 +245,7 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
vocab_len = self.vocab_len
bound = 0.0

lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word)
lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

@@ -419,7 +420,7 @@ def __getitem__(self, doc):
"""
Similar to the LdaModel __getitem__ function, it returns topic proportions of a document passed.
"""
lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word)
lda_model = ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)


3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldamallet.py
@@ -373,7 +373,8 @@ def malletmodel2ldamodel(mallet_model, gamma_threshold=0.001, iterations=50):
model_gensim = LdaModel(
id2word=mallet_model.id2word, num_topics=mallet_model.num_topics,
alpha=mallet_model.alpha, iterations=iterations,
gamma_threshold=gamma_threshold
gamma_threshold=gamma_threshold,
dtype=numpy.float64  # don't lose precision when converting from MALLET
)
model_gensim.expElogbeta[:] = mallet_model.wordtopics
return model_gensim
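
The converted model keeps MALLET's double-precision topics rather than down-casting them; a sketch of the round trip (the MALLET path is a placeholder, and the toy `corpus`/`dictionary` from the earlier example are reused):

    from gensim.models.wrappers.ldamallet import LdaMallet, malletmodel2ldamodel

    mallet_lda = LdaMallet('/path/to/mallet-2.0.8/bin/mallet', corpus=corpus,
                           num_topics=2, id2word=dictionary)
    gensim_lda = malletmodel2ldamodel(mallet_lda)
    print(gensim_lda.dtype)   # <class 'numpy.float64'> -- MALLET's precision is preserved
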
3 changes: 2 additions & 1 deletion gensim/models/wrappers/ldavowpalwabbit.py
@@ -586,7 +586,8 @@ def vwmodel2ldamodel(vw_model, iterations=50):
model_gensim = LdaModel(
num_topics=vw_model.num_topics, id2word=vw_model.id2word, chunksize=vw_model.chunksize,
passes=vw_model.passes, alpha=vw_model.alpha, eta=vw_model.eta, decay=vw_model.decay,
offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold
offset=vw_model.offset, iterations=iterations, gamma_threshold=vw_model.gamma_threshold,
dtype=numpy.float32
)
model_gensim.expElogbeta[:] = vw_model._get_topics()
return model_gensim