Allow initialization with max_final_vocab in lieu of min_count for gensim.models.Word2Vec. Fix #465 #1915

Merged
merged 29 commits into from Mar 22, 2018
Changes from 26 commits
29 commits
e249ed4
handle deprecation
aneesh-joshi Feb 8, 2018
62f6c82
Merge branch 'develop' of https://github.com/RaRe-Technologies/gensim…
aneesh-joshi Feb 14, 2018
1677e98
handle max_count
aneesh-joshi Feb 18, 2018
e8c08f8
change flag name
aneesh-joshi Feb 18, 2018
258d033
make flake8 compatible
aneesh-joshi Feb 18, 2018
875c65c
move max_vocab to prepare vocab
aneesh-joshi Feb 20, 2018
0aa8426
correct max_vocab semantics
aneesh-joshi Feb 20, 2018
390f333
remove unnecessary nextline
aneesh-joshi Feb 20, 2018
8c508c7
fix bug and make flake8 complaint
aneesh-joshi Feb 21, 2018
c826b19
refactor code and change sorting to key based
aneesh-joshi Feb 22, 2018
35dc681
add tests
aneesh-joshi Mar 5, 2018
67f6a14
introduce effective_min_count
aneesh-joshi Mar 5, 2018
7b1f612
make flake8 compliant
aneesh-joshi Mar 5, 2018
fafee70
remove clobbering of min_count
aneesh-joshi Mar 7, 2018
9d99660
remove min_count assertion
aneesh-joshi Mar 7, 2018
6c06fbc
.\gensim\models\word2vec.py
aneesh-joshi Mar 7, 2018
c5a0e6e
Revert ".\gensim\models\word2vec.py"
aneesh-joshi Mar 7, 2018
fdd2aab
rename max_vocab to max_final_vocab
aneesh-joshi Mar 7, 2018
974d587
update test to max_final_vocab
aneesh-joshi Mar 7, 2018
ddb3556
move and modify comment docs
aneesh-joshi Mar 7, 2018
c54d8a9
make flake8 compliant
aneesh-joshi Mar 7, 2018
f379616
refactor word2vec.py
aneesh-joshi Mar 8, 2018
46d3885
handle possible old model load errors
aneesh-joshi Mar 11, 2018
2cf5625
include effective_min_count tests
aneesh-joshi Mar 11, 2018
8578e3d
make flake compliant
aneesh-joshi Mar 11, 2018
a43fea3
remove check for max_final_vocab
aneesh-joshi Mar 13, 2018
340a8cf
include backward compat for 3.3 models
aneesh-joshi Mar 15, 2018
0b62407
remove unnecessary newline
aneesh-joshi Mar 15, 2018
5b7a6c2
add test case for max_final_vocab
aneesh-joshi Mar 19, 2018
1 change: 1 addition & 0 deletions gensim/models/deprecated/word2vec.py
@@ -183,6 +183,7 @@ def load_old_word2vec(*args, **kwargs):
new_model.trainables.syn1neg = old_model.syn1neg
if hasattr(old_model, 'syn0_lockf'):
new_model.trainables.vectors_lockf = old_model.syn0_lockf

# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
43 changes: 34 additions & 9 deletions gensim/models/word2vec.py
@@ -425,7 +425,8 @@ class Word2Vec(BaseWordEmbeddingsModel):
def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=()):
trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(),
max_final_vocab=None):
"""
Initialize the model from an iterable of `sentences`. Each sentence is a
list of words (unicode strings) that will be used for training.
@@ -462,6 +463,10 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
Limits the RAM during vocabulary building; if there are more unique
words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM.
Set to `None` for no limit.
max_final_vocab : int
Limits the vocab to a target vocab size by automatically picking a matching min_count. If the specified
min_count is more than the calculated min_count, the specified min_count will be used.
Set to `None` if not required.
sample : float
The threshold for configuring which higher-frequency words are randomly downsampled,
useful range is (0, 1e-5).
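
For orientation, a minimal usage sketch of the new parameter described above; the corpus variable and the target size are illustrative, not taken from the PR:

from gensim.models import Word2Vec

# my_sentences is a hypothetical iterable of tokenized sentences.
# Cap the surviving vocabulary at roughly 20000 words: a min_count large enough
# to reach the cap is picked automatically, unless the explicit min_count=5 is
# already more restrictive than the calculated one.
model = Word2Vec(my_sentences, size=100, min_count=5, max_final_vocab=20000)
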
@@ -510,14 +515,15 @@ def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5,
>>> say_vector = model['say'] # get vector for word

"""
self.max_final_vocab = max_final_vocab

self.callbacks = callbacks
self.load = call_on_class_only

self.wv = Word2VecKeyedVectors(size)
self.vocabulary = Word2VecVocab(
max_vocab_size=max_vocab_size, min_count=min_count, sample=sample,
sorted_vocab=bool(sorted_vocab), null_word=null_word)
sorted_vocab=bool(sorted_vocab), null_word=null_word, max_final_vocab=max_final_vocab)
self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn)

super(Word2Vec, self).__init__(
@@ -1131,14 +1137,16 @@ def __iter__(self):


class Word2VecVocab(utils.SaveLoad):
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0):
def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0,
max_final_vocab=None):
self.max_vocab_size = max_vocab_size
self.min_count = min_count
self.sample = sample
self.sorted_vocab = sorted_vocab
self.null_word = null_word
self.cum_table = None # for negative sampling
self.raw_vocab = None
self.max_final_vocab = max_final_vocab

def scan_vocab(self, sentences, progress_per=10000, trim_rule=None):
"""Do an initial scan of all words appearing in sentences."""
@@ -1204,6 +1212,23 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
sample = sample or self.sample
drop_total = drop_unique = 0

# set effective_min_count to min_count in case max_final_vocab isn't set
self.effective_min_count = min_count

# if max_final_vocab is specified instead of min_count
# pick a min_count which satisfies max_final_vocab as well as possible
if self.max_final_vocab is not None:
sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True)
calc_min_count = 1

if self.max_final_vocab < len(sorted_vocab):
calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1

self.effective_min_count = max(calc_min_count, min_count)
logger.info("max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would put this outside the max_final_vocab branch, so effective_min_count logged same way even in the simple case of max_final_vocab unset.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@menshikh-iv was this comment addressed?

self.max_final_vocab, min_count, calc_min_count, self.effective_min_count
)

Collaborator:
The behavior/logic in this block is all correct, but it could be much shorter, with fewer branches and log lines – and thus clearer to read and maintain. For example, you could start by setting calc_min_count=1. Then, if a max_final_vocab is set, and it is smaller than the sorted_vocab length, increase calc_min_count to the new required value. (That is: no else needed.) Then at the top level set effective_min_count = max(calc_min_count, min_count) – no comparisons, else branches, or multiple-alternative assignments needed. Finally, a single INFO log line showing max_final_vocab, calc_min_count, min_count, and effective_min_count will reveal all that anyone would need to know about what min-count handling occurred.

Separately: (1) for logging, our usual practice is to prefer the multiple-arguments method of passing values-to-be-formatted over direct %-operator formatting. (This alternative avoids ever having to interpolate the string if logging is not enabled at the given level.) For example, use logger.info('something %d %d', some_value1, some_value2) rather than logger.info('something %d %d' % (some_value1, some_value2)).

(2) It would be good to keep the effective value as a property on the model for later debugging/understanding – such as in self.effective_min_count.

Contributor Author:
Thank you for the notes. I apologize for the bad code and for this PR taking so long. Will make changes.

Collaborator (@gojomo, Mar 7, 2018):
No apologies necessary, progress has been good & constant! Often the full set of concerns/tradeoffs only becomes clear after trying initial approaches… and noticing all the consistency issues, and related supporting work (like updating load() for backward compatibility of model properties), becomes much easier, almost second nature, after more experience with such codebases.
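
Roughly what the suggested simplification could look like, written here as a standalone helper rather than the code that was actually merged; raw_vocab is assumed to be the word-to-count dict produced by scan_vocab:

import logging

logger = logging.getLogger(__name__)

def pick_effective_min_count(raw_vocab, min_count, max_final_vocab=None):
    # Start from the least restrictive cutoff and only raise it if a cap is requested.
    calc_min_count = 1
    if max_final_vocab is not None:
        ordered = sorted(raw_vocab, key=raw_vocab.get, reverse=True)
        if max_final_vocab < len(ordered):
            # One more than the count of the most frequent word that must be dropped.
            calc_min_count = raw_vocab[ordered[max_final_vocab]] + 1
    effective_min_count = max(calc_min_count, min_count)
    # Arguments are passed separately so the string is only interpolated if INFO is enabled.
    logger.info(
        "max_final_vocab=%s, calc_min_count=%d, min_count=%d, effective_min_count=%d",
        max_final_vocab, calc_min_count, min_count, effective_min_count
    )
    return effective_min_count

This mirrors the logic in prepare_vocab above but logs unconditionally, which is what the first comment in this thread asks for.
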

if not update:
logger.info("Loading a fresh vocabulary")
retain_total, retain_words = 0, []
@@ -1216,7 +1241,7 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
wv.vocab = {}

for word, v in iteritems(self.raw_vocab):
if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule):
retain_words.append(word)
retain_total += v
if not dry_run:
@@ -1228,21 +1253,21 @@ def prepare_vocab(self, hs, negative, wv, update=False, keep_raw_vocab=False, tr
original_unique_total = len(retain_words) + drop_unique
retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1)
logger.info(
"min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique
"effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)",
self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique
)
original_total = retain_total + drop_total
retain_pct = retain_total * 100 / max(original_total, 1)
logger.info(
"min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
min_count, retain_total, retain_pct, original_total, drop_total
"effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)",
self.effective_min_count, retain_total, retain_pct, original_total, drop_total
)
else:
logger.info("Updating model with new vocabulary")
new_total = pre_exist_total = 0
new_words = pre_exist_words = []
for word, v in iteritems(self.raw_vocab):
if keep_vocab_item(word, v, min_count, trim_rule=trim_rule):
if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule):
if word in wv.vocab:
pre_exist_words.append(word)
pre_exist_total += v
21 changes: 21 additions & 0 deletions gensim/test/test_word2vec.py
@@ -145,6 +145,27 @@ def testTotalWordCount(self):
total_words = model.vocabulary.scan_vocab(sentences)[0]
self.assertEqual(total_words, 29)

def testMaxFinalVocab(self):
# Test for less restricting effect of max_final_vocab
# max_final_vocab is specified but has no effect
model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0)
model.vocabulary.scan_vocab(sentences)
reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
self.assertEqual(reported_values['drop_unique'], 11)
self.assertEqual(reported_values['retain_total'], 4)
self.assertEqual(reported_values['num_retained_words'], 1)
self.assertEqual(model.vocabulary.effective_min_count, 4)

# Test for more restricting effect of max_final_vocab
# results in setting a min_count more restricting than specified min_count
model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0)
model.vocabulary.scan_vocab(sentences)
reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0)
self.assertEqual(reported_values['drop_unique'], 8)
self.assertEqual(reported_values['retain_total'], 13)
self.assertEqual(reported_values['num_retained_words'], 4)
self.assertEqual(model.vocabulary.effective_min_count, 3)
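
A worked reading of the second case, derived from the asserted values (an illustration, not part of the PR): the 12 unique words split into one with count 4, three with count 3, and eight with count 2, so the word at index max_final_vocab=4 of the frequency-sorted vocab has count 2, giving calc_min_count = 2 + 1 = 3 and effective_min_count = max(3, 2) = 3; exactly the four words with count >= 3 survive, totalling 4 + 3 + 3 + 3 = 13 tokens. A quick re-derivation, assuming `sentences` is the 29-word fixture checked by testTotalWordCount above:

from collections import Counter

counts = Counter(word for sentence in sentences for word in sentence)
ordered = sorted(counts, key=counts.get, reverse=True)
calc_min_count = counts[ordered[4]] + 1        # 5th most frequent word occurs twice -> 3
effective_min_count = max(calc_min_count, 2)   # the specified min_count=2 is less restrictive
retained = [w for w in ordered if counts[w] >= effective_min_count]
assert len(retained) == 4
assert sum(counts[w] for w in retained) == 13
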

def testOnlineLearning(self):
"""Test that the algorithm is able to add new words to the
vocabulary and to a trained model when using a sorted vocabulary"""