Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

KeyedVecs refactoring for word2vec #980

Merged
merged 43 commits into from
Nov 13, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
43 commits
Select commit Hold shift + click to select a range
55a4fc9
updated refactor
Aug 18, 2016
e916f7e
commit missed file
Aug 18, 2016
e5416ed
docstring added
Aug 18, 2016
e64766b
more refactoring
Aug 19, 2016
c34cf37
add missing docstring
Aug 19, 2016
c9b31f9
fix docstring format
Aug 19, 2016
a0329af
clearer docstring
droudy Aug 19, 2016
0c0e2fa
minor typo in word2vec wmdistance
jayantj Sep 2, 2016
cdefeb0
pyemd error in keyedvecs
jayantj Sep 8, 2016
1aec5a2
relative import of keyedvecs from word2vec fails
jayantj Sep 8, 2016
e7368a3
bug in init_sims in word2vec
jayantj Sep 8, 2016
fe283c2
property descriptors for syn0, syn0norm, index2word, vocab - fixes bu…
jayantj Sep 8, 2016
9b36bc4
tests for loading older word2vec models
jayantj Sep 9, 2016
dfe1893
backwards compatibility for loading older models
jayantj Sep 9, 2016
4a03f20
test for syn0norm not saved to file
jayantj Sep 9, 2016
09b6ebe
syn0norm not saved to file for KeyedVectors
jayantj Sep 9, 2016
7df4138
tests and fix for accuracy
jayantj Sep 9, 2016
4c54d9b
minor bug in finalized vocab check
jayantj Sep 9, 2016
a28f9f1
warnings for direct syn0/syn0norm access
jayantj Sep 9, 2016
bf1182e
fixes use of most_similar in accuracy
jayantj Sep 10, 2016
5a6b97b
changes logging level to ERROR in word2vec tests
jayantj Sep 10, 2016
cfb2e1c
renames kv to wv in word2vec
jayantj Sep 12, 2016
b002765
minor bugs with checking existence of syn0
jayantj Sep 12, 2016
27c0a14
replaces syn0 and syn0norm with wv.syn0 and wv.syn0norm in tests and …
jayantj Sep 12, 2016
81f8cbb
adds changelog
jayantj Sep 12, 2016
7f98c8d
Merge branch 'develop' into keyedvecs
jayantj Oct 16, 2016
1b282ab
updates tests for loading word2vec models for different python versions
jayantj Oct 16, 2016
c95f95f
Merge remote-tracking branch 'droudy/keyedvecs' into keyedvecs
anmolgulati Oct 25, 2016
a703bc8
Added separate word2vec model explicitly for python version 3.4.
anmolgulati Oct 26, 2016
dbb098b
Added saved word2vec model for python 3.4 files
anmolgulati Oct 26, 2016
d7b067e
Removed blank line in test_wikicorpus.py
anmolgulati Oct 26, 2016
fd80b89
Increased window size in test_sg_hs_online
anmolgulati Oct 26, 2016
7a6c281
Merge remote-tracking branch 'rare/develop' into keyedvecs
anmolgulati Oct 31, 2016
0690597
PR #986 merged in wmd in keyedvectors.py
anmolgulati Oct 31, 2016
82e0c49
Added deprecation warnings in Word2vec class attributes for future re…
anmolgulati Nov 10, 2016
d35d5c5
Merge remote-tracking branch 'rare/develop' into keyedvecs
anmolgulati Nov 10, 2016
6e63cf8
Merged rare/develop into keyedvecs removing conflicts.
anmolgulati Nov 10, 2016
3438064
Merge branch 'keyedvecs' of https://github.com/anmol01gulati/gensim i…
anmolgulati Nov 10, 2016
3775552
Merged rare/develop into keyedvecs and resolved conflicts.
anmolgulati Nov 10, 2016
9e99459
Changed numpy to np in test_word2vec.py
anmolgulati Nov 10, 2016
f6adac5
Increased window size in test_cbow_hs_online
anmolgulati Nov 11, 2016
6c60c1d
Removed blank line in test_ldamodel, work around for Travis-CI issue …
anmolgulati Nov 11, 2016
c59e8a7
Removed logging during import
anmolgulati Nov 11, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,16 +130,16 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

"""
if word_vectors is None:
- word_vectors = model.syn0
+ word_vectors = model.wv.syn0
if word_locks is None:
word_locks = model.syn0_lockf
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
- model.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+ model.wv.vocab[w].sample_int > model.random.rand() * 2**32]

for pos, word in enumerate(word_vocabs):
reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code
Expand Down Expand Up @@ -185,21 +185,21 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

"""
if word_vectors is None:
- word_vectors = model.syn0
+ word_vectors = model.wv.syn0
if word_locks is None:
word_locks = model.syn0_lockf
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
if doctag_locks is None:
doctag_locks = model.docvecs.doctag_syn0_lockf

- word_vocabs = [model.vocab[w] for w in doc_words if w in model.vocab and
- model.vocab[w].sample_int > model.random.rand() * 2**32]
+ word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab and
+ model.wv.vocab[w].sample_int > model.random.rand() * 2**32]
doctag_len = len(doctag_indexes)
if doctag_len != model.dm_tag_count:
return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?)

- null_word = model.vocab['\0']
+ null_word = model.wv.vocab['\0']
pre_pad_count = model.window
post_pad_count = model.window
padded_document_indexes = (
Expand All @@ -214,7 +214,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,
+ padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words
)
word_context_len = len(word_context_indexes)
- predict_word = model.vocab[model.index2word[padded_document_indexes[pos]]]
+ predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]]
# numpy advanced-indexing copies; concatenate, flatten to 1d
l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel()
neu1e = train_cbow_pair(model, predict_word, None, l1, alpha,
Expand Down
6 changes: 3 additions & 3 deletions gensim/models/doc2vec_inner.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None,

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
- word_vectors = model.syn0
+ word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down Expand Up @@ -405,7 +405,7 @@ def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=N

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
- word_vectors = model.syn0
+ word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down Expand Up @@ -567,7 +567,7 @@ def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None,

# default vectors, locks from syn0/doctag_syn0
if word_vectors is None:
- word_vectors = model.syn0
+ word_vectors = model.wv.syn0
_word_vectors = <REAL_t *>(np.PyArray_DATA(word_vectors))
if doctag_vectors is None:
doctag_vectors = model.docvecs.doctag_syn0
Expand Down
Loading