Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove Doc2Vec's own parameter defaults so that they no longer override Word2Vec's defaults. Fixes #795, #929.

Merged
merged 12 commits
Nov 22, 2016
17 changes: 9 additions & 8 deletions gensim/models/doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,9 +529,8 @@ def repeat(self, word_count):

class Doc2Vec(Word2Vec):
"""Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf"""
def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
max_vocab_size=None, sample=0, seed=1, workers=1, min_alpha=0.0001,
dm=1, hs=1, negative=0, dbow_words=0, dm_mean=0, dm_concat=0, dm_tag_count=1,
def __init__(self, documents=None, dm_mean=None,
dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1,
docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs):
"""
Initialize the model from an iterable of `documents`. Each document is a
Expand Down Expand Up @@ -600,18 +599,20 @@ def __init__(self, documents=None, size=300, alpha=0.025, window=8, min_count=5,
of the model.

"""

super(Doc2Vec, self).__init__(
size=size, alpha=alpha, window=window, min_count=min_count, max_vocab_size=max_vocab_size,
sample=sample, seed=seed, workers=workers, min_alpha=min_alpha,
sg=(1+dm) % 2, hs=hs, negative=negative, cbow_mean=dm_mean,
sg=(1 + dm) % 2,
null_word=dm_concat, **kwargs)

if dm_mean is not None:
self.cbow_mean = dm_mean

self.dbow_words = dbow_words
self.dm_concat = dm_concat
self.dm_tag_count = dm_tag_count
if self.dm and self.dm_concat:
self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size
else:
self.layer1_size = size

self.docvecs = docvecs or DocvecsArray(docvecs_mapfile)
self.comment = comment
if documents is not None:
Expand Down
11 changes: 6 additions & 5 deletions gensim/test/test_doc2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ def test_int_doctags(self):
model = doc2vec.Doc2Vec(min_count=1)
model.build_vocab(corpus)
self.assertEqual(len(model.docvecs.doctag_syn0), 300)
self.assertEqual(model.docvecs[0].shape, (300,))
self.assertEqual(model.docvecs[0].shape, (100,))
self.assertRaises(KeyError, model.__getitem__, '_*0')

def test_missing_string_doctag(self):
Expand All @@ -108,9 +108,10 @@ def test_string_doctags(self):

model = doc2vec.Doc2Vec(min_count=1)
model.build_vocab(corpus)

self.assertEqual(len(model.docvecs.doctag_syn0), 300)
self.assertEqual(model.docvecs[0].shape, (300,))
self.assertEqual(model.docvecs['_*0'].shape, (300,))
self.assertEqual(model.docvecs[0].shape, (100,))
self.assertEqual(model.docvecs['_*0'].shape, (100,))
self.assertTrue(all(model.docvecs['_*0'] == model.docvecs[0]))
self.assertTrue(max(d.offset for d in model.docvecs.doctags.values()) < len(model.docvecs.doctags))
self.assertTrue(max(model.docvecs._int_index(str_key) for str_key in model.docvecs.doctags.keys()) < len(model.docvecs.doctag_syn0))
Expand Down Expand Up @@ -168,15 +169,15 @@ def model_sanity(self, model):
def test_training(self):
"""Test doc2vec training."""
corpus = DocsLeeCorpus()
model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20)
model = doc2vec.Doc2Vec(size=100, min_count=2, iter=20, workers=1)
model.build_vocab(corpus)
self.assertEqual(model.docvecs.doctag_syn0.shape, (300, 100))
model.train(corpus)

self.model_sanity(model)

# build vocab and train in one step; must be the same as above
model2 = doc2vec.Doc2Vec(corpus, size=100, min_count=2, iter=20)
model2 = doc2vec.Doc2Vec(corpus, size=100, min_count=2, iter=20, workers=1)
self.models_equal(model, model2)

def test_dbow_hs(self):
Expand Down