Skip to content

Commit

Permalink
rm Word2VecTrainables class
Browse files Browse the repository at this point in the history
  • Loading branch information
gojomo committed Dec 19, 2019
1 parent b9ab036 commit 5479561
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 140 deletions.
23 changes: 10 additions & 13 deletions gensim/models/base_any2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,6 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor
A subclass should initialize the following attributes:
* self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example)
* self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example)
* self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example)
"""
self.vector_size = int(vector_size)
Expand Down Expand Up @@ -814,7 +812,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p
self.corpus_total_words = total_words
report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self)
self.prepare_weights(update=update, vocabulary=self)

def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False):
"""Build vocabulary from a dictionary of word frequencies.
Expand Down Expand Up @@ -861,8 +859,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No
# trim by min_count & precalculate downsampling
report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update)
report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words'])
self.trainables.prepare_weights(
self.hs, self.negative, self.wv, update=update, vocabulary=self) # build tables & arrays
self.prepare_weights(update=update, vocabulary=self) # build tables & arrays

def estimate_memory(self, vocab_size=None, report=None):
"""Estimate required memory for a model using current settings and provided vocabulary size.
Expand All @@ -885,9 +882,9 @@ def estimate_memory(self, vocab_size=None, report=None):
report['vocab'] = vocab_size * (700 if self.hs else 500)
report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize
if self.hs:
report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize
if self.negative:
report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize
report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize
report['total'] = sum(report.values())
logger.info(
"estimated required memory for %i words and %i dimensions: %i bytes",
Expand Down Expand Up @@ -1004,8 +1001,8 @@ def _get_thread_working_mem(self):
Each worker threads private work memory.
"""
work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory
neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL)
work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory
neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)
return work, neu1

def _raw_word_count(self, job):
Expand Down Expand Up @@ -1078,7 +1075,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N
logger.info(
"training model with %i workers on %i vocabulary and %i features, "
"using sg=%s hs=%s sample=%s negative=%s window=%s",
self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg,
self.workers, len(self.wv.vocab), self.layer1_size, self.sg,
self.hs, self.sample, self.negative, self.window
)

Expand Down Expand Up @@ -1122,10 +1119,10 @@ def load(cls, *args, **kwargs):
model.corpus_count = None
if not hasattr(model, 'corpus_total_words'):
model.corpus_total_words = None
if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL)
if not hasattr(model, 'vectors_lockf') and hasattr(model.wv, 'vectors'):
model.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL)
if not hasattr(model, 'random'):
model.random = random.RandomState(model.trainables.seed)
model.random = random.RandomState(model.seed)
if not hasattr(model, 'train_count'):
model.train_count = 0
model.total_train_time = 0
Expand Down
8 changes: 4 additions & 4 deletions gensim/models/deprecated/word2vec.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def load_old_word2vec(*args, **kwargs):
old_model = Word2Vec.load(*args, **kwargs)
vector_size = getattr(old_model, 'vector_size', old_model.layer1_size)
params = {
'size': vector_size,
'vector_size': vector_size,
'alpha': old_model.alpha,
'window': old_model.window,
'min_count': old_model.min_count,
Expand All @@ -195,11 +195,11 @@ def load_old_word2vec(*args, **kwargs):
if hasattr(old_model.wv, 'syn0norm'):
new_model.wv.vectors_norm = old_model.wv.syn0norm
if hasattr(old_model, 'syn1'):
new_model.trainables.syn1 = old_model.syn1
new_model.syn1 = old_model.syn1
if hasattr(old_model, 'syn1neg'):
new_model.trainables.syn1neg = old_model.syn1neg
new_model.syn1neg = old_model.syn1neg
if hasattr(old_model, 'syn0_lockf'):
new_model.trainables.vectors_lockf = old_model.syn0_lockf
new_model.vectors_lockf = old_model.syn0_lockf
# set vocabulary attributes
new_model.wv.vocab = old_model.wv.vocab
new_model.wv.index2word = old_model.wv.index2word
Expand Down
5 changes: 5 additions & 0 deletions gensim/models/keyedvectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,11 @@ def _load_specials(self, *args, **kwargs):
# fixup rename/consolidation into index2key of older index2word, index2entity
if not hasattr(self, 'index2key'):
    # Pop BOTH legacy attribute names so neither lingers in __dict__ after the
    # up-conversion; prefer index2word, fall back to index2entity, else None.
    # (Previously 'index2word' was popped twice, so index2entity was never migrated.)
    legacy_index2entity = self.__dict__.pop('index2entity', None)
    self.index2key = self.__dict__.pop('index2word', legacy_index2entity)
# fixup rename into vectors of older syn0
if not hasattr(self, 'vectors'):
self.vectors = self.__dict__.pop('syn0', None)
self.vectors_norm = None
self.vector_size = self.vectors.shape[1]
# fixup rename of vocab into map
if 'map' not in self.__dict__:
self.map = self.__dict__.pop('vocab', None)
Expand Down
Loading

0 comments on commit 5479561

Please sign in to comment.