Allow initialization with max_final_vocab in lieu of min_count for gensim.models.Word2Vec. Fix #465 (#1915)
Changes from 5 commits
gensim/models/word2vec.py
@@ -425,7 +425,8 @@ class Word2Vec(BaseWordEmbeddingsModel):
     def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
-                 trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()):
+                 trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
+                 use_max_vocab=False, max_vocab=None):
         """
         Initialize the model from an iterable of `sentences`. Each sentence is a
         list of words (unicode strings) that will be used for training.
@@ -510,14 +511,16 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
         >>> say_vector = model['say']  # get vector for word

         """
+        self.use_max_vocab = use_max_vocab
+        self.max_vocab = max_vocab
+
         self.callbacks = callbacks
         self.load = call_on_class_only

         self.wv = Word2VecKeyedVectors(size)
         self.vocabulary = Word2VecVocab(
             max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word)
+            sorted_vocab=bool(sorted_vocab), null_word=null_word, use_max_vocab=use_max_vocab, max_vocab=max_vocab)
         self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

         super(Word2Vec, self).__init__(
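For context, a minimal usage sketch with the constructor arguments as they stand in this revision (use_max_vocab / max_vocab are the names from the diff above and may still change per review; the corpus and cap are made up):

```python
from gensim.models import Word2Vec

# Toy corpus; each sentence is a list of tokens.
sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
]

# Cap the raw vocabulary during the initial scan instead of relying on min_count alone.
model = Word2Vec(sentences, size=50, min_count=1, use_max_vocab=True, max_vocab=100)
print(len(model.wv.vocab))
```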
@@ -1131,14 +1134,17 @@ def __iter__(self):


 class Word2VecVocab(utils.SaveLoad):
-    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
+                 use_max_vocab=False, max_vocab=None):
Review comment: No need to add 2 parameters,
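To make that suggestion concrete, here is a rough standalone sketch (not gensim code; the function name and exact rule are assumptions) of how a single max_final_vocab parameter could subsume both flags: when it is set, derive the smallest min_count that keeps at most max_final_vocab distinct words, otherwise just use the caller's min_count.

```python
def effective_min_count(raw_vocab, min_count, max_final_vocab=None):
    """Illustrative only: pick a min_count so that at most
    `max_final_vocab` distinct words survive the frequency cut.
    `raw_vocab` maps word -> raw frequency."""
    if max_final_vocab is None or len(raw_vocab) <= max_final_vocab:
        return min_count
    # Frequencies in descending order; the cut-off must exceed the
    # frequency of the first word we can no longer afford to keep.
    sorted_counts = sorted(raw_vocab.values(), reverse=True)
    calc_min_count = sorted_counts[max_final_vocab] + 1
    return max(calc_min_count, min_count)

# {'a': 10, 'b': 7, 'c': 3, 'd': 1} with max_final_vocab=2 -> min_count 4,
# so only 'a' and 'b' survive.
print(effective_min_count({'a': 10, 'b': 7, 'c': 3, 'd': 1}, 1, max_final_vocab=2))
```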
         self.max_vocab_size = max_vocab_size
         self.min_count = min_count
         self.sample = sample
         self.sorted_vocab = sorted_vocab
         self.null_word = null_word
         self.cum_table = None  # for negative sampling
         self.raw_vocab = None
+        self.use_max_vocab = use_max_vocab
Review comment: problem with backward compatibility, here and above (when you add the new attribute, you should modify
Review comment: Is this where I should make changes?
+        self.max_vocab = max_vocab

     def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
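On the backward-compatibility point above: models pickled before this change will lack the new attributes, so anything that reads them after loading an old model would raise AttributeError. A hedged sketch of the usual fix (the helper below is illustrative; in the real patch the defaults would be backfilled inside the class's load path):

```python
def backfill_new_attrs(obj):
    """Give objects loaded from pre-PR pickles defaults for the
    attributes introduced here (illustrative helper only)."""
    if not hasattr(obj, 'use_max_vocab'):
        obj.use_max_vocab = False  # old models never capped the raw vocab
    if not hasattr(obj, 'max_vocab'):
        obj.max_vocab = None
    return obj
```

Something like this would be applied to both the Word2Vec instance and its vocabulary object right after loading.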
@@ -1176,6 +1182,25 @@ def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         )
         corpus_count = sentence_no + 1
         self.raw_vocab = vocab
+
+        if self.use_max_vocab:
+            import operator
+
+            if self.max_vocab is None:
+                raise ValueError('max_vocab not defined')
+
+            sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
+            curr_count = 0
+            final_vocab = {}
+            for item in sorted_vocab:
+                curr_count += item[1]
+                if curr_count < self.max_vocab:
+                    final_vocab[item[0]] = item[1]
+                else:
+                    break
+
+            self.raw_vocab = final_vocab
+
         return total_words, corpus_count

     def sort_vocab(self, wv):
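Note that as written in this revision, curr_count accumulates raw token frequencies, so max_vocab bounds the cumulative count of retained tokens rather than the number of distinct words directly. A tiny standalone rerun of the same loop (toy numbers) shows the effect:

```python
import operator

vocab = {'the': 50, 'cat': 20, 'sat': 10, 'mat': 5}
max_vocab = 75

# Same trimming rule as the scan_vocab addition above, outside the class.
sorted_vocab = sorted(vocab.items(), key=operator.itemgetter(1), reverse=True)
curr_count = 0
final_vocab = {}
for word, count in sorted_vocab:
    curr_count += count
    if curr_count < max_vocab:
        final_vocab[word] = count
    else:
        break

print(final_vocab)  # {'the': 50, 'cat': 20}; adding 'sat' would push the total to 80
```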
Review comment: Should it be implemented only for word2vec (or for other *2vec models too)?
CC: @gojomo