-
-
Notifications
You must be signed in to change notification settings - Fork 4.4k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
KeyedVectors & *2Vec API streamlining, consistency #2698
Changes from 35 commits
7e642a2
b8de987
38343d6
a255e8c
4e334c1
a16cec5
d4267f8
f6e7aa6
470b119
cd02b8b
0c77ae4
cfa723d
a4f7b77
4412696
65c2b2d
1d0f52f
8123596
c5efb24
2c234dd
9910404
658813f
3cdb1d6
10d9f55
79af68e
8875d8b
4b7566e
1baab2a
8d2f1fe
fc65525
4657b14
b5ff29b
098119b
318a858
fe3ae31
d503205
679dde9
411473b
45fd5f6
5acc5f5
5764f8c
e49ae4c
278c2bd
5c7eb1c
23805d1
f5b902c
7b571b2
87860c5
39fe128
15152ff
3d424a2
99f7009
2bb8abf
33c6508
8f17d6d
cb33e46
581ef06
9f21cba
d912616
583bbe6
0330cfc
9caf217
14dd9f5
8674949
0d2679a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
This file was deleted.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -435,7 +435,7 @@ def _get_field_from_model(model, field): | |
requested field name, fields are listed in the `_NEW_HEADER_FORMAT` list | ||
""" | ||
if field == 'bucket': | ||
return model.trainables.bucket | ||
return model.bucket | ||
elif field == 'dim': | ||
return model.vector_size | ||
elif field == 'epoch': | ||
|
@@ -457,7 +457,7 @@ def _get_field_from_model(model, field): | |
elif field == 'minn': | ||
return model.wv.min_n | ||
elif field == 'min_count': | ||
return model.vocabulary.min_count | ||
return model.min_count | ||
elif field == 'model': | ||
# `model` => cbow:1, sg:2, sup:3 | ||
# cbow = continuous bag of words (default) | ||
|
@@ -467,7 +467,7 @@ def _get_field_from_model(model, field): | |
elif field == 'neg': | ||
return model.negative | ||
elif field == 't': | ||
return model.vocabulary.sample | ||
return model.sample | ||
elif field == 'word_ngrams': | ||
# This is skipped in gensim loading setting, using the default from FB C++ code | ||
return 1 | ||
|
@@ -531,9 +531,9 @@ def _dict_save(fout, model, encoding): | |
# In the unsupervised case we have only words (no labels). Hence both fields | ||
# are equal. | ||
|
||
fout.write(np.int32(len(model.wv.vocab)).tobytes()) | ||
fout.write(np.int32(len(model.wv)).tobytes()) | ||
|
||
fout.write(np.int32(len(model.wv.vocab)).tobytes()) | ||
fout.write(np.int32(len(model.wv)).tobytes()) | ||
|
||
# nlabels=0 <- no labels we are in unsupervised mode | ||
fout.write(np.int32(0).tobytes()) | ||
|
@@ -544,7 +544,7 @@ def _dict_save(fout, model, encoding): | |
fout.write(np.int64(-1)) | ||
|
||
for word in model.wv.index2word: | ||
word_count = model.wv.vocab[word].count | ||
word_count = model.wv.get_vecattr(word, 'count') | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Check: what's going on here, what's this API? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Discussed more following #2698 (comment) |
||
fout.write(word.encode(encoding)) | ||
fout.write(_END_OF_WORD_MARKER) | ||
fout.write(np.int64(word_count).tobytes()) | ||
|
@@ -572,7 +572,7 @@ def _input_save(fout, model): | |
ngrams_n, ngrams_dim = model.wv.vectors_ngrams.shape | ||
|
||
assert vocab_dim == ngrams_dim | ||
assert vocab_n == len(model.wv.vocab) | ||
assert vocab_n == len(model.wv) | ||
assert ngrams_n == model.wv.bucket | ||
|
||
fout.write(struct.pack('@2q', vocab_n + ngrams_n, vocab_dim)) | ||
|
@@ -596,9 +596,9 @@ def _output_save(fout, model): | |
saved model | ||
""" | ||
if model.hs: | ||
hidden_output = model.trainables.syn1 | ||
hidden_output = model.syn1 | ||
if model.negative: | ||
hidden_output = model.trainables.syn1neg | ||
hidden_output = model.syn1neg | ||
|
||
hidden_n, hidden_dim = hidden_output.shape | ||
fout.write(struct.pack('@2q', hidden_n, hidden_dim)) | ||
|
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think the general preference in gensim is to avoid `format`. @piskvorky Is that still the case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes. Preference is `%`, and then move to `f""`, which is infinitely nicer once we drop py3.6.