Fix light linting issues in LdaSeqModel (#2360)
* Fix light linting issues in `LdaSeqModel`

* fix build

* fix linting
horpto authored and menshikh-iv committed Jan 29, 2019
1 parent 80406c2 commit 9242a6b
Showing 1 changed file with 58 additions and 62 deletions: gensim/models/ldaseqmodel.py
@@ -58,6 +58,7 @@
from scipy.special import digamma, gammaln
from scipy import optimize
import logging
from six.moves import range, zip

logger = logging.getLogger(__name__)
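As an aside on the new import: `six.moves` aliases `range` and `zip` to their lazy Python 2 counterparts (`xrange`, `itertools.izip`) and leaves the Python 3 builtins untouched, which is presumably why it accompanies the loop rewrites below. A minimal standalone sketch, not part of the diff, assuming `six` is installed:

from six.moves import range, zip

# On Python 2 these resolve to xrange/izip; on Python 3 the builtins are
# returned unchanged, so no intermediate lists are materialised here.
dot = sum(a * b for a, b in zip(range(3), range(3, 6)))
print(dot)  # 0*3 + 1*4 + 2*5 = 14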

@@ -126,7 +127,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
logger.warning("no word id mapping provided; initializing from corpus, assuming identity")
self.id2word = utils.dict_from_corpus(corpus)
self.vocab_len = len(self.id2word)
elif len(self.id2word) > 0:
elif self.id2word:
self.vocab_len = len(self.id2word)
else:
self.vocab_len = 0
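The `elif self.id2word:` form relies on an empty mapping being falsy; since gensim's `Dictionary` (like a plain `dict`) defines `__len__`, the truth test is equivalent to the old length comparison. A quick standalone check with a plain dict as a stand-in:

empty, filled = {}, {0: "model", 1: "topic"}

assert not empty and len(empty) == 0       # falsy, same as len(...) > 0 being False
assert filled and len(filled) > 0          # truthy once it has entries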
@@ -142,12 +143,6 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
if self.time_slice is not None:
self.num_time_slices = len(time_slice)

max_doc_len = 0
for line_no, line in enumerate(corpus):
if len(line) > max_doc_len:
max_doc_len = len(line)
self.max_doc_len = max_doc_len

self.num_topics = num_topics
self.num_time_slices = len(time_slice)
self.alphas = np.full(num_topics, alphas)
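The `max_doc_len` accumulator loop removed here reappears further down as a single `max()` over a generator expression, run only when a corpus is actually provided. A toy bag-of-words corpus, not from the commit, showing the two forms agree:

corpus = [[(0, 1)], [(0, 2), (3, 1), (5, 4)], [(1, 1), (2, 2)]]   # toy BoW corpus

max_doc_len = 0
for line in corpus:
    if len(line) > max_doc_len:
        max_doc_len = len(line)

assert max_doc_len == max(len(line) for line in corpus) == 3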
@@ -157,7 +152,7 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_
# the sslm class is described below and contains information
# on topic-word probabilities and doc-topic probabilities.
self.topic_chains = []
for topic in range(0, num_topics):
for topic in range(num_topics):
sslm_ = sslm(
num_time_slices=self.num_time_slices, vocab_len=self.vocab_len, num_topics=self.num_topics,
chain_variance=chain_variance, obs_variance=obs_variance
@@ -172,6 +167,8 @@ def __init__(self, corpus=None, time_slice=None, id2word=None, alphas=0.01, num_

# if a corpus and time_slice is provided, depending on the user choice of initializing LDA, we start DTM.
if corpus is not None and time_slice is not None:
self.max_doc_len = max(len(line) for line in corpus)

if initialize == 'gensim':
lda_model = ldamodel.LdaModel(
corpus, id2word=self.id2word, num_topics=self.num_topics,
@@ -268,12 +265,12 @@ def fit_lda_seq(self, corpus, lda_inference_max_iter, em_min_iter, em_max_iter,

# initiate sufficient statistics
topic_suffstats = []
for topic in range(0, num_topics):
topic_suffstats.append(np.resize(np.zeros(vocab_len * data_len), (vocab_len, data_len)))
for topic in range(num_topics):
topic_suffstats.append(np.zeros((vocab_len, data_len)))

# set up variables
gammas = np.resize(np.zeros(corpus_len * num_topics), (corpus_len, num_topics))
lhoods = np.resize(np.zeros(corpus_len * num_topics + 1), (corpus_len, num_topics + 1))
gammas = np.zeros((corpus_len, num_topics))
lhoods = np.zeros((corpus_len, num_topics + 1))
# compute the likelihood of a sequential corpus under an LDA
# seq model and find the evidence lower bound. This is the E - Step
bound, gammas = \
@@ -346,7 +343,7 @@ def lda_seq_infer(self, corpus, topic_suffstats, gammas, lhoods,
bound = 0.0

lda = ldamodel.LdaModel(num_topics=num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda.topics = np.array(np.split(np.zeros(vocab_len * num_topics), vocab_len))
lda.topics = np.zeros((vocab_len, num_topics))
ldapost = LdaPost(max_doc_len=self.max_doc_len, num_topics=num_topics, lda=lda)

model = "DTM"
@@ -460,8 +457,8 @@ def make_lda_seq_slice(self, lda, time):
The stationary model updated to reflect the passed time slice.
"""
for k in range(0, self.num_topics):
lda.topics[:, k] = np.copy(self.topic_chains[k].e_log_prob[:, time])
for k in range(self.num_topics):
lda.topics[:, k] = self.topic_chains[k].e_log_prob[:, time]

lda.alpha = np.copy(self.alphas)
return lda
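Dropping `np.copy` on the right-hand side is safe here because assigning into a column slice already copies the values element-wise; the source chain is never aliased. A small standalone illustration with made-up arrays:

import numpy as np

topics = np.zeros((4, 2))
e_log_prob = np.arange(12.0).reshape(4, 3)        # stand-in for chain.e_log_prob

topics[:, 0] = e_log_prob[:, 1]                   # slice assignment copies values
topics[:, 0] += 100.0                             # mutate the destination only

assert e_log_prob[0, 1] == 1.0                    # source column is untouched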
@@ -507,7 +504,7 @@ def print_topic_times(self, topic, top_terms=20):
"""
topics = []
for time in range(0, self.num_time_slices):
for time in range(self.num_time_slices):
topics.append(self.print_topic(topic, time, top_terms))

return topics
@@ -530,7 +527,7 @@ def print_topics(self, time=0, top_terms=20):
probability.
"""
return [self.print_topic(topic, time, top_terms) for topic in range(0, self.num_topics)]
return [self.print_topic(topic, time, top_terms) for topic in range(self.num_topics)]

def print_topic(self, topic, time=0, top_terms=20):
"""Get the list of words most relevant to the given topic.
@@ -578,8 +575,7 @@ def doc_topics(self, doc_number):
Probability for each topic in the mixture (essentially a point in the `self.num_topics - 1` simplex.
"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis]
return doc_topic[doc_number]

def dtm_vis(self, time, corpus):
@@ -608,22 +604,25 @@ def dtm_vis(self, time, corpus):
The set of unique terms existing in the corpus's vocabulary.
"""
doc_topic = np.copy(self.gammas)
doc_topic /= doc_topic.sum(axis=1)[:, np.newaxis]
doc_topic = self.gammas / self.gammas.sum(axis=1)[:, np.newaxis]

def normalize(x):
return x / x.sum()

topic_term = [
np.exp(np.transpose(chain.e_log_prob)[time]) / np.exp(np.transpose(chain.e_log_prob)[time]).sum()
normalize(np.exp(chain.e_log_prob.T[time]))
for k, chain in enumerate(self.topic_chains)
]

doc_lengths = [len(doc) for doc_no, doc in enumerate(corpus)]

doc_lengths = []
term_frequency = np.zeros(self.vocab_len)
for doc_no, doc in enumerate(corpus):
for pair in doc:
term_frequency[pair[0]] += pair[1]
doc_lengths.append(len(doc))

for term, freq in doc:
term_frequency[term] += freq

vocab = [self.id2word[i] for i in range(0, len(self.id2word))]
vocab = [self.id2word[i] for i in range(len(self.id2word))]

return doc_topic, np.array(topic_term), doc_lengths, term_frequency, vocab
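Both `doc_topics()` and `dtm_vis()` now row-normalise `self.gammas` in a single expression, and `dtm_vis()` funnels each per-topic term distribution through the small `normalize()` helper, with `chain.e_log_prob.T` replacing the spelled-out `np.transpose(...)`. A standalone sketch with toy numbers:

import numpy as np

gammas = np.array([[1.0, 3.0], [2.0, 2.0]])
doc_topic = gammas / gammas.sum(axis=1)[:, np.newaxis]
assert np.allclose(doc_topic.sum(axis=1), 1.0)          # each row is a distribution

def normalize(x):
    return x / x.sum()

e_log_prob = np.log([[0.2, 0.5], [0.8, 0.5]])           # toy vocab x time matrix
topic_term = normalize(np.exp(e_log_prob.T[0]))         # same as np.transpose(...)[0]
assert np.allclose(topic_term, [0.2, 0.8])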

@@ -668,13 +667,13 @@ def __getitem__(self, doc):
Probabilities for each topic in the mixture. This is essentially a point in the `num_topics - 1` simplex.
"""
lda_model = \
ldamodel.LdaModel(num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.array(np.split(np.zeros(self.vocab_len * self.num_topics), self.vocab_len))
lda_model = ldamodel.LdaModel(
num_topics=self.num_topics, alpha=self.alphas, id2word=self.id2word, dtype=np.float64)
lda_model.topics = np.zeros((self.vocab_len, self.num_topics))
ldapost = LdaPost(num_topics=self.num_topics, max_doc_len=len(doc), lda=lda_model, doc=doc)

time_lhoods = []
for time in range(0, self.num_time_slices):
for time in range(self.num_time_slices):
lda_model = self.make_lda_seq_slice(lda_model, time) # create lda_seq slice
lhood = LdaPost.fit_lda_post(ldapost, 0, time, self)
time_lhoods.append(lhood)
@@ -706,12 +705,12 @@ def __init__(self, vocab_len=None, num_time_slices=None, num_topics=None, obs_va
self.num_topics = num_topics

# setting up matrices
self.obs = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.e_log_prob = np.array(np.split(np.zeros(num_time_slices * vocab_len), vocab_len))
self.mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_mean = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.fwd_variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.variance = np.array(np.split(np.zeros((num_time_slices + 1) * vocab_len), vocab_len))
self.obs = np.zeros((vocab_len, num_time_slices))
self.e_log_prob = np.zeros((vocab_len, num_time_slices))
self.mean = np.zeros((vocab_len, num_time_slices + 1))
self.fwd_mean = np.zeros((vocab_len, num_time_slices + 1))
self.fwd_variance = np.zeros((vocab_len, num_time_slices + 1))
self.variance = np.zeros((vocab_len, num_time_slices + 1))
self.zeta = np.zeros(num_time_slices)

# the following are class variables which are to be integrated during Document Influence Model
@@ -896,9 +895,9 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats):
T = self.num_time_slices

log_norm_counts = np.copy(sstats)
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts = log_norm_counts + 1.0 / W
log_norm_counts = log_norm_counts / sum(log_norm_counts)
log_norm_counts /= sum(log_norm_counts)
log_norm_counts += 1.0 / W
log_norm_counts /= sum(log_norm_counts)
log_norm_counts = np.log(log_norm_counts)

# setting variational observations to transformed counts
Expand All @@ -908,7 +907,7 @@ def sslm_counts_init(self, obs_variance, chain_variance, sstats):
self.chain_variance = chain_variance

# compute post variance, mean
for w in range(0, W):
for w in range(W):
self.variance[w], self.fwd_variance[w] = self.compute_post_variance(w, self.chain_variance)
self.mean[w], self.fwd_mean[w] = self.compute_post_mean(w, self.chain_variance)
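The switch to `/=` and `+=` is purely cosmetic for these float arrays; the smoothing-and-log transform behaves exactly as before. A standalone run with toy sufficient statistics, not from the commit:

import numpy as np

W = 4
sstats_col = np.array([4.0, 0.0, 1.0, 3.0])      # toy counts for one topic

log_norm_counts = np.copy(sstats_col)
log_norm_counts /= sum(log_norm_counts)          # normalise to a distribution
log_norm_counts += 1.0 / W                       # smooth so log() never sees zero
log_norm_counts /= sum(log_norm_counts)          # renormalise
log_norm_counts = np.log(log_norm_counts)

assert np.isfinite(log_norm_counts).all()
assert np.isclose(np.exp(log_norm_counts).sum(), 1.0)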

@@ -944,7 +943,7 @@ def fit_sslm(self, sstats):

# computing variance, fwd_variance
self.variance, self.fwd_variance = \
(np.array(x) for x in list(zip(*[self.compute_post_variance(w, self.chain_variance) for w in range(0, W)])))
(np.array(x) for x in zip(*(self.compute_post_variance(w, self.chain_variance) for w in range(W))))

# column sum of sstats
totals = sstats.sum(axis=0)
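The `fit_sslm` change keeps the `zip(*...)` transpose trick but feeds it generators instead of intermediate lists: each `compute_post_variance` call returns a `(variance, fwd_variance)` pair, and unzipping the pairs gives one `(W, T + 1)` matrix per component. A standalone sketch with a dummy stand-in for `compute_post_variance`:

import numpy as np

T = 3

def compute_post_variance(w, chain_variance):
    # Dummy stand-in: the real method runs a forward/backward variance
    # recursion, but it likewise returns two rows of length T + 1.
    return np.full(T + 1, w + chain_variance), np.full(T + 1, w * chain_variance)

W, chain_variance = 4, 0.005

variance, fwd_variance = (
    np.array(x)
    for x in zip(*(compute_post_variance(w, chain_variance) for w in range(W)))
)

assert variance.shape == fwd_variance.shape == (W, T + 1)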
@@ -1006,19 +1005,18 @@ def compute_bound(self, sstats, totals):
chain_variance = self.chain_variance
# computing mean, fwd_mean
self.mean, self.fwd_mean = \
(np.array(x) for x in zip(*[self.compute_post_mean(w, self.chain_variance) for w in range(0, w)]))
(np.array(x) for x in zip(*(self.compute_post_mean(w, self.chain_variance) for w in range(w))))
self.zeta = self.update_zeta()

for w in range(0, w):
val += (self.variance[w][0] - self.variance[w][t]) / 2 * chain_variance
val = sum(self.variance[w][0] - self.variance[w][t] for w in range(w)) / 2 * chain_variance

logger.info("Computing bound, all times")

for t in range(1, t + 1):
term_1 = 0.0
term_2 = 0.0
ent = 0.0
for w in range(0, w):
for w in range(w):

m = self.mean[w][t]
prev_m = self.mean[w][t - 1]
@@ -1071,14 +1069,14 @@ def update_obs(self, sstats, totals):
T = self.num_time_slices

runs = 0
mean_deriv_mtx = np.resize(np.zeros(T * (T + 1)), (T, T + 1))
mean_deriv_mtx = np.zeros((T, T + 1))

norm_cutoff_obs = None
for w in range(0, W):
for w in range(W):
w_counts = sstats[w]
counts_norm = 0
# now we find L2 norm of w_counts
for i in range(0, len(w_counts)):
for i in range(len(w_counts)):
counts_norm += w_counts[i] * w_counts[i]

counts_norm = np.sqrt(counts_norm)
@@ -1091,10 +1089,8 @@
w_counts = np.zeros(len(w_counts))

# TODO: apply lambda function
for t in range(0, T):
mean_deriv = mean_deriv_mtx[t]
mean_deriv = self.compute_mean_deriv(w, t, mean_deriv)
mean_deriv_mtx[t] = mean_deriv
for t in range(T):
mean_deriv_mtx[t] = self.compute_mean_deriv(w, t, mean_deriv_mtx[t])

deriv = np.zeros(T)
args = self, w_counts, totals, mean_deriv_mtx, w, deriv
@@ -1207,10 +1203,10 @@ def compute_obs_deriv(self, word, word_counts, totals, mean_deriv_mtx, deriv):
# temp_vector holds temporary zeta values
self.temp_vect = np.zeros(T)

for u in range(0, T):
for u in range(T):
self.temp_vect[u] = np.exp(mean[u + 1] + variance[u + 1] / 2)

for t in range(0, T):
for t in range(T):
mean_deriv = mean_deriv_mtx[t]
term1 = 0
term2 = 0
@@ -1280,8 +1276,8 @@ def __init__(self, doc=None, lda=None, max_doc_len=None, num_topics=None, gamma=
self.lhood = np.zeros(num_topics + 1)

if max_doc_len is not None and num_topics is not None:
self.phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
self.log_phi = np.resize(np.zeros(max_doc_len * num_topics), (max_doc_len, num_topics))
self.phi = np.zeros((max_doc_len, num_topics))
self.log_phi = np.zeros((max_doc_len, num_topics))

# the following are class variables which are to be integrated during Document Influence Model

@@ -1314,12 +1310,12 @@ def update_phi(self, doc_number, time):
# digamma values
dig = np.zeros(num_topics)

for k in range(0, num_topics):
for k in range(num_topics):
dig[k] = digamma(self.gamma[k])

n = 0 # keep track of iterations for phi, log_phi
for word_id, count in self.doc:
for k in range(0, num_topics):
for k in range(num_topics):
self.log_phi[n][k] = dig[k] + self.lda.topics[word_id][k]

log_phi_row = self.log_phi[n]
@@ -1355,7 +1351,7 @@ def update_gamma(self):
n = 0 # keep track of number of iterations for phi, log_phi
for word_id, count in self.doc:
phi_row = self.phi[n]
for k in range(0, self.lda.num_topics):
for k in range(self.lda.num_topics):
self.gamma[k] += phi_row[k] * count
n += 1

@@ -1392,7 +1388,7 @@ def compute_lda_lhood(self):
digsum = digamma(gamma_sum)

model = "DTM" # noqa:F841
for k in range(0, num_topics):
for k in range(num_topics):
# below code only to be used in DIM mode
# if ldapost.doc_weight is not None and (model == "DIM" or model == "fixed"):
# influence_topic = ldapost.doc_weight[k]
@@ -1518,7 +1514,7 @@ def update_lda_seq_ss(self, time, doc, topic_suffstats):
"""
num_topics = self.lda.num_topics

for k in range(0, num_topics):
for k in range(num_topics):
topic_ss = topic_suffstats[k]
n = 0
for word_id, count in self.doc:
