-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Correctly process empty documents in AuthorTopicModel
#2133
Changes from 4 commits
accc625
e3e47ef
7b7633d
db74531
8aa04b2
ddf8dec
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -460,11 +460,12 @@ def inference(self, chunk, author2doc, doc2author, rhot, collect_sstats=False, c | |
# make sure the term IDs are ints, otherwise np will get upset | ||
ids = [int(idx) for idx, _ in doc] | ||
else: | ||
ids = [idx for idx, _ in doc] | ||
cts = np.array([cnt for _, cnt in doc]) | ||
ids = [id for id, _ in doc] | ||
ids = np.array(ids, dtype=np.integer) | ||
cts = np.array([cnt for _, cnt in doc], dtype=np.integer) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not familiar with this There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No difference in our case import numpy as np
arr1, arr2 = [1, 2, 3], []
assert np.array(arr1, dtype=np.int).dtype == \
np.array(arr1, dtype=np.integer).dtype == \
np.array(arr2, dtype=np.int).dtype == \
np.array(arr2, dtype=np.integer).dtype There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. all of it "casted" to |
||
|
||
# Get all authors in current document, and convert the author names to integer IDs. | ||
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] | ||
authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer) | ||
|
||
gammad = self.state.gamma[authors_d, :] # gamma of document d before update. | ||
tilde_gamma = gammad.copy() # gamma that will be updated. | ||
|
@@ -972,9 +973,9 @@ def bound(self, chunk, chunk_doc_idx=None, subsample_ratio=1.0, author2doc=None, | |
else: | ||
doc_no = d | ||
# Get all authors in current document, and convert the author names to integer IDs. | ||
authors_d = [self.author2id[a] for a in self.doc2author[doc_no]] | ||
ids = np.array([id for id, _ in doc]) # Word IDs in doc. | ||
cts = np.array([cnt for _, cnt in doc]) # Word counts. | ||
authors_d = np.array([self.author2id[a] for a in self.doc2author[doc_no]], dtype=np.integer) | ||
ids = np.array([id for id, _ in doc], dtype=np.integer) # Word IDs in doc. | ||
cts = np.array([cnt for _, cnt in doc], dtype=np.integer) # Word counts. | ||
|
||
if d % self.chunksize == 0: | ||
logger.debug("bound: at document #%i in chunk", d) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -35,7 +35,6 @@ | |
# increases the bound. | ||
# Test that models are compatiple across versions, as done in LdaModel. | ||
|
||
|
||
# Assign some authors randomly to the documents above. | ||
author2doc = { | ||
'john': [0, 1, 2, 3, 4, 5, 6], | ||
|
@@ -110,6 +109,19 @@ def testBasic(self): | |
jill_topics = matutils.sparse2full(jill_topics, model.num_topics) | ||
self.assertTrue(all(jill_topics > 0)) | ||
|
||
def testEmptyDocument(self): | ||
_local_texts = common_texts + [['only_occurs_once_in_corpus_and_alone_in_doc']] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why vars starts from underscore? please remove underscores from start |
||
_dictionary = Dictionary(_local_texts) | ||
_dictionary.filter_extremes(no_below=2) | ||
_corpus = [_dictionary.doc2bow(text) for text in _local_texts] | ||
_a2d = author2doc.copy() | ||
_a2d['joaquin'] = [len(_local_texts) - 1] | ||
try: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No need try/except section in test, if test raise unexpected exception - this means that test broken |
||
_ = self.class_(_corpus, author2doc=_a2d, id2word=_dictionary, num_topics=2) | ||
except IndexError: | ||
raise IndexError("error occurs in 1.0.0 release tag") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ??? |
||
assert(_) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better to retrieve vector for any document or corpus (instead of assertion) as "sanity check" action, because |
||
|
||
def testAuthor2docMissing(self): | ||
# Check that the results are the same if author2doc is constructed automatically from doc2author. | ||
model = self.class_( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please revert back
idx
(id
is built-in function name)