Skip to content

Commit

Permalink
check hs and negative. add tests (#3443)
Browse files Browse the repository at this point in the history
  • Loading branch information
gau-nernst authored Feb 17, 2023
1 parent f35faae commit f260d1e
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 3 deletions.
15 changes: 13 additions & 2 deletions gensim/models/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,11 +286,11 @@ def __init__(
Training algorithm: 1 for skip-gram; otherwise CBOW.
hs : {0, 1}, optional
If 1, hierarchical softmax will be used for model training.
If 0, and `negative` is non-zero, negative sampling will be used.
If 0, hierarchical softmax will not be used for model training.
negative : int, optional
If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
should be drawn (usually between 5-20).
If set to 0, no negative sampling is used.
If 0, negative sampling will not be used.
ns_exponent : float, optional
The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion
to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more
Expand Down Expand Up @@ -1536,6 +1536,17 @@ def _check_training_sanity(self, epochs=0, total_examples=None, total_words=None
If the combination of input parameters is inconsistent.
"""
if (not self.hs) and (not self.negative):
raise ValueError(
"You must set either 'hs' or 'negative' to be positive for proper training. "
"When both 'hs=0' and 'negative=0', there will be no training."
)
if self.hs and self.negative:
logger.warning(
"Both hierarchical softmax and negative sampling are activated. "
"This is probably a mistake. You should set either 'hs=0' "
"or 'negative=0' to disable one of them. "
)
if self.alpha > self.min_alpha_yet_reached:
logger.warning("Effective 'alpha' higher than previous training cycles")

Expand Down
15 changes: 14 additions & 1 deletion gensim/test/test_word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -888,7 +888,7 @@ def test_predict_output_word(self):
self.assertRaises(RuntimeError, binary_model_with_neg.predict_output_word, ['system', 'human'])

# negative sampling scheme not used
model_without_neg = word2vec.Word2Vec(sentences, min_count=1, negative=0)
model_without_neg = word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=0)
self.assertRaises(RuntimeError, model_without_neg.predict_output_word, ['system', 'human'])

# passing indices instead of words in context
Expand Down Expand Up @@ -1033,6 +1033,19 @@ def test_train_warning(self, loglines):
warning = "Effective 'alpha' higher than previous training cycles"
self.assertTrue(warning in str(loglines))

@log_capture()
def test_train_hs_and_neg(self, loglines):
    """Check sanity handling of the hs/negative parameter combination.

    Training with both hs=0 and negative=0 must raise ValueError
    (nothing would be trained), while enabling both hs and negative
    at once must emit a warning.
    """
    # Disabling both output layers leaves no training objective at all.
    self.assertRaises(ValueError, word2vec.Word2Vec, sentences, min_count=1, hs=0, negative=0)

    # Enabling both is legal but almost certainly a mistake — a warning is expected.
    word2vec.Word2Vec(sentences, min_count=1, hs=1, negative=5)
    expected = "Both hierarchical softmax and negative sampling are activated."
    self.assertTrue(expected in str(loglines))

def test_train_with_explicit_param(self):
model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0)
model.build_vocab(sentences)
Expand Down

0 comments on commit f260d1e

Please sign in to comment.