Allow initialization with max_final_vocab in lieu of min_count for gensim.models.Word2Vec. Fix #465 (#1915)
Changes from 26 commits
```diff
@@ -425,7 +425,8 @@ class Word2Vec(BaseWordEmbeddingsModel):
     def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
                  max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
                  sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
-                 trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()):
+                 trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
+                 max_final_vocab=None):
         """
         Initialize the model from an iterable of `sentences`. Each sentence is a
         list of words (unicode strings) that will be used for training.
```
```diff
@@ -462,6 +463,10 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
             Limits the RAM during vocabulary building; if there are more unique
             words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
             Set to `None` for no limit.
+        max_final_vocab : int
+            Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified
+            min_count is more than the calculated min_count, the specified min_count will be used.
+            Set to `None` if not required.
         sample : float
             The threshold for configuring which higher-frequency words are randomly downsampled,
             useful range is (0, 1e-5).
```
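For reference, a minimal usage sketch of the new parameter. This is my own example, not part of the diff: the toy corpus and numbers are made up, and it assumes the gensim 3.x constructor signature shown above.

```python
from gensim.models import Word2Vec

# Toy corpus: any iterable of tokenized sentences works.
sentences = [
    ["human", "interface", "computer"],
    ["survey", "user", "computer", "system", "response", "time"],
    ["graph", "trees", "graph", "minors", "survey"],
]

# Ask for a vocabulary of at most 3 words; a matching min_count is derived
# automatically from the corpus frequencies (frequency ties can push the
# final size below the cap).
model = Word2Vec(sentences, size=50, min_count=1, max_final_vocab=3)
print(len(model.wv.vocab))  # <= 3
```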
```diff
@@ -510,14 +515,15 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
           >>> say_vector = model['say']  # get vector for word

         """
+        self.max_final_vocab = max_final_vocab

         self.callbacks = callbacks
         self.load = call_on_class_only

         self.wv = Word2VecKeyedVectors(size)
         self.vocabulary = Word2VecVocab(
             max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
-            sorted_vocab=bool(sorted_vocab), null_word=null_word)
+            sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab)
         self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

         super(Word2Vec, self).__init__(
```
```diff
@@ -1131,14 +1137,16 @@ def __iter__(self):


 class Word2VecVocab(utils.SaveLoad):
-    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
+    def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
+                 max_final_vocab=None):
         self.max_vocab_size = max_vocab_size
         self.min_count = min_count
         self.sample = sample
         self.sorted_vocab = sorted_vocab
         self.null_word = null_word
         self.cum_table = None  # for negative sampling
         self.raw_vocab = None
+        self.max_final_vocab = max_final_vocab

     def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
         """Do an initial scan of all words appearing in sentences."""
```
```diff
@@ -1204,6 +1212,23 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
         sample = sample or self.sample
         drop_total = drop_unique = 0

+        # set effective_min_count to min_count in case max_final_vocab isn't set
+        self.effective_min_count = min_count
+
+        # if max_final_vocab is specified instead of min_count
+        # pick a min_count which satisfies max_final_vocab as well as possible
+        if self.max_final_vocab is not None:
+            sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True)
+            calc_min_count = 1
+
+            if self.max_final_vocab < len(sorted_vocab):
+                calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1
+
+            self.effective_min_count = max(calc_min_count, min_count)
+            logger.info("max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d",
+                self.max_final_vocab, min_count, calc_min_count, self.effective_min_count
+            )
+
```
Review comment: The behavior/logic here in this block is all correct, but it could be much shorter, with fewer branches and log lines, and thus clearer to read and maintain. For example, you could start by setting … Separately: (1) for logging, our usual practice is to prefer the multiple-arguments method of passing values-to-be-formatted over interpolating them into the string directly; (2) it'd be good to keep the effective value as a property on the model for later debugging/understanding, such as in …

Reply: Thank you for the notes.

Review comment: No apologies necessary, progress has been good and constant! Often the full set of concerns/tradeoffs only becomes clear after trying initial approaches… and noticing all the consistency issues and related supporting work (like updating …).
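On the reviewer's logging point (1), the contrast is between lazy, multiple-argument formatting and eager string interpolation. A small generic illustration, not taken from the PR:

```python
import logging

logger = logging.getLogger(__name__)
effective_min_count = 5  # stand-in value for illustration only

# Preferred: pass values as extra arguments; the message string is only
# built if the record is actually emitted at the current log level.
logger.info("effective_min_count=%d", effective_min_count)

# Discouraged: eager interpolation always builds the string, even when
# INFO-level logging is disabled.
logger.info("effective_min_count=%d" % effective_min_count)
```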
```diff
         if not update:
             logger.info("Loading a fresh vocabulary")
             retain_total, retain_words = 0, []
```
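To make the effective_min_count arithmetic from the added block above concrete, here is a self-contained sketch of the same calculation on a made-up frequency dictionary (the words and counts are illustrative, not from the PR):

```python
# Raw word counts as scan_vocab would collect them (made-up numbers).
raw_vocab = {"the": 120, "cat": 40, "sat": 40, "on": 15, "mat": 3, "hat": 1}
max_final_vocab = 3
min_count = 5

# Sort words by descending frequency.
sorted_words = sorted(raw_vocab, key=raw_vocab.get, reverse=True)

# The word just past the cutoff ("on") has count 15, so any min_count of 16
# or more keeps at most 3 words.
calc_min_count = 1
if max_final_vocab < len(sorted_words):
    calc_min_count = raw_vocab[sorted_words[max_final_vocab]] + 1

effective_min_count = max(calc_min_count, min_count)
print(calc_min_count, effective_min_count)  # 16 16
```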
```diff
@@ -1216,7 +1241,7 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
             wv.vocab = {}

             for word, v in iteritems(self.raw_vocab):
-                if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
+                if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule):
                     retain_words.append(word)
                     retain_total += v
                     if not dry_run:
```
@@ -1228,21 +1253,21 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr | |
original_unique_total = len(retain_words) + drop_unique | ||
retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) | ||
logger.info( | ||
"min_count=%d retains %i unique words (%i%% of original %i, drops %i)", | ||
min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique | ||
"effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", | ||
self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique | ||
) | ||
original_total = retain_total + drop_total | ||
retain_pct = retain_total * 100 / max(original_total, 1) | ||
logger.info( | ||
"min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", | ||
min_count, retain_total, retain_pct, original_total, drop_total | ||
"effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", | ||
self.effective_min_count, retain_total, retain_pct, original_total, drop_total | ||
) | ||
else: | ||
logger.info("Updating model with new vocabulary") | ||
new_total = pre_exist_total = 0 | ||
new_words = pre_exist_words = [] | ||
for word, v in iteritems(self.raw_vocab): | ||
if keep_vocab_item(word, v, min_count, trim_rule=trim_rule): | ||
if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): | ||
if word in wv.vocab: | ||
pre_exist_words.append(word) | ||
pre_exist_total += v | ||
|
Review comment: I would put this outside the max_final_vocab branch, so effective_min_count is logged the same way even in the simple case where max_final_vocab is unset.

Follow-up: @menshikh-iv was this comment addressed?
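A rough sketch of the rearrangement this comment asks for; this is a hypothetical free-standing function of my own, not the code that was merged. The idea is to adjust the value only inside the branch and log the outcome once, unconditionally, so the message appears even when max_final_vocab is None.

```python
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def pick_effective_min_count(raw_vocab, min_count, max_final_vocab=None):
    """Mirror of the PR's logic, with the log call hoisted out of the branch."""
    calc_min_count = 1
    if max_final_vocab is not None:
        sorted_vocab = sorted(raw_vocab, key=raw_vocab.get, reverse=True)
        if max_final_vocab < len(sorted_vocab):
            calc_min_count = raw_vocab[sorted_vocab[max_final_vocab]] + 1
    effective_min_count = max(calc_min_count, min_count)
    # Logged unconditionally, so the effective value shows up even when
    # max_final_vocab is None; %s lets None format cleanly.
    logger.info(
        "max_final_vocab=%s, min_count=%d, calc_min_count=%d, effective_min_count=%d",
        max_final_vocab, min_count, calc_min_count, effective_min_count
    )
    return effective_min_count


counts = {"the": 120, "cat": 40, "sat": 40, "on": 15}
pick_effective_min_count(counts, min_count=5)                     # logs effective_min_count=5
pick_effective_min_count(counts, min_count=5, max_final_vocab=2)  # logs effective_min_count=41
```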