From 5479561f66c246ada4278daed41c506514ad7532 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Wed, 18 Dec 2019 21:59:56 -0800 Subject: [PATCH] rm Word2VecTrainables class --- gensim/models/base_any2vec.py | 23 ++-- gensim/models/deprecated/word2vec.py | 8 +- gensim/models/keyedvectors.py | 5 + gensim/models/word2vec.py | 153 ++++++++++++++------------- gensim/models/word2vec_inner.pyx | 10 +- gensim/test/test_word2vec.py | 96 +++++++++-------- 6 files changed, 155 insertions(+), 140 deletions(-) diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py index 62cce8d992..3cffa01ebd 100644 --- a/gensim/models/base_any2vec.py +++ b/gensim/models/base_any2vec.py @@ -83,8 +83,6 @@ def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_wor A subclass should initialize the following attributes: * self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example) - * self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example) - * self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example) """ self.vector_size = int(vector_size) @@ -814,7 +812,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p self.corpus_total_words = total_words report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self) + self.prepare_weights(update=update, vocabulary=self) def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """Build vocabulary from a dictionary of word frequencies. @@ -861,8 +859,7 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No # trim by min_count & precalculate downsampling report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, update=update, vocabulary=self) # build tables & arrays + self.prepare_weights(update=update, vocabulary=self) # build tables & arrays def estimate_memory(self, vocab_size=None, report=None): """Estimate required memory for a model using current settings and provided vocabulary size. @@ -885,9 +882,9 @@ def estimate_memory(self, vocab_size=None, report=None): report['vocab'] = vocab_size * (700 if self.hs else 500) report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize if self.hs: - report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize + report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize if self.negative: - report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize + report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize report['total'] = sum(report.values()) logger.info( "estimated required memory for %i words and %i dimensions: %i bytes", @@ -1004,8 +1001,8 @@ def _get_thread_working_mem(self): Each worker threads private work memory. 
""" - work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) + work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) return work, neu1 def _raw_word_count(self, job): @@ -1078,7 +1075,7 @@ def _check_training_sanity(self, epochs=None, total_examples=None, total_words=N logger.info( "training model with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative, self.window ) @@ -1122,10 +1119,10 @@ def load(cls, *args, **kwargs): model.corpus_count = None if not hasattr(model, 'corpus_total_words'): model.corpus_total_words = None - if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'): - model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL) + if not hasattr(model, 'vectors_lockf') and hasattr(model.wv, 'vectors'): + model.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL) if not hasattr(model, 'random'): - model.random = random.RandomState(model.trainables.seed) + model.random = random.RandomState(model.seed) if not hasattr(model, 'train_count'): model.train_count = 0 model.total_train_time = 0 diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py index 6e17e05dc5..279dbbd53e 100644 --- a/gensim/models/deprecated/word2vec.py +++ b/gensim/models/deprecated/word2vec.py @@ -169,7 +169,7 @@ def load_old_word2vec(*args, **kwargs): old_model = Word2Vec.load(*args, **kwargs) vector_size = getattr(old_model, 'vector_size', old_model.layer1_size) params = { - 'size': vector_size, + 'vector_size': vector_size, 'alpha': old_model.alpha, 'window': old_model.window, 'min_count': old_model.min_count, @@ -195,11 +195,11 @@ def load_old_word2vec(*args, **kwargs): if hasattr(old_model.wv, 'syn0norm'): new_model.wv.vectors_norm = old_model.wv.syn0norm if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 + new_model.syn1 = old_model.syn1 if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg + new_model.syn1neg = old_model.syn1neg if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf + new_model.vectors_lockf = old_model.syn0_lockf # set vocabulary attributes new_model.wv.vocab = old_model.wv.vocab new_model.wv.index2word = old_model.wv.index2word diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d4882778d0..e50f6833f8 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -220,6 +220,11 @@ def _load_specials(self, *args, **kwargs): # fixup rename/consolidation into index2key of older index2word, index2entity if not hasattr(self, 'index2key'): self.index2key = self.__dict__.pop('index2word', self.__dict__.pop('index2word', None)) + # fixup rename into vectors of older syn0 + if not hasattr(self, 'vectors'): + self.vectors = self.__dict__.pop('syn0', None) + self.vectors_norm = None + self.vector_size = self.vectors.shape[1] # fixup rename of vocab into map if 'map' not in self.__dict__: self.map = self.__dict__.pop('vocab', None) diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index bdb71c71ce..bb286ce407 100755 --- 
a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -229,7 +229,7 @@ def score_cbow_pair(model, word, l1): class Word2Vec(BaseWordEmbeddingsModel): - def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), @@ -262,7 +262,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). - size : int, optional + vector_size : int, optional Dimensionality of the word vectors. window : int, optional Maximum distance between the current and predicted word within a sentence. @@ -354,13 +354,6 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind This object essentially contains the mapping between words and embeddings. After training, it can be used directly to query those embeddings in various ways. See the module level docstring for examples. - trainables : :class:`~gensim.models.word2vec.Word2VecTrainables` - This object represents the inner shallow neural network used to train the embeddings. The semantics - of the network differ slightly in the two available training modes (CBOW or SG) but you can think of it - as a NN with single projection and hidden layer which we train on the corpus. The weights are then used - as our embeddings (which means that the size of the hidden layer is equal to the number of features - `self.size`). 
- """ self.max_final_vocab = max_final_vocab self.max_vocab_size = max_vocab_size @@ -371,14 +364,16 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind self.cum_table = None # for negative sampling self.raw_vocab = None - self.wv = KeyedVectors(size) + self.wv = KeyedVectors(vector_size) - self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) + self.hashfxn = hashfxn + self.layer1_size = vector_size + self.seed = seed self.load = call_on_class_only super(Word2Vec, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=vector_size, epochs=iter, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss) @@ -630,6 +625,54 @@ def make_cum_table(self, domain=2**31 - 1): if len(self.cum_table) > 0: assert self.cum_table[-1] == domain + def prepare_weights(self, update=False, vocabulary=None): + """Build tables and model weights based on final vocabulary settings.""" + # set initial input/projection and hidden weights + if not update: + self.reset_weights() + else: + self.update_weights() + + @deprecated("Use gensim.models.keyedvectors.pseudorandom_weak_vector() directly") + def seeded_vector(self, seed_string, vector_size): + return pseudorandom_weak_vector(vector_size, seed_string=seed_string, hashfxn=self.hashfxn) + + def reset_weights(self): + """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" + logger.info("resetting layer weights") + self.wv.resize_vectors() + self.wv.randomly_initialize_vectors(seed=self.seed) + if self.hs: + self.syn1 = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) + if self.negative: + self.syn1neg = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) + + self.vectors_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning + + def update_weights(self): + """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" + logger.info("updating layer weights") + new_range = self.wv.resize_vectors() + gained_vocab = len(new_range) + self.wv.randomly_initialize_vectors(indexes=new_range) + + # Raise an error if an online update is run before initial training on a corpus + if not len(self.wv.vectors): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." 
+ ) + + if self.hs: + self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + if self.negative: + pad = zeros((gained_vocab, self.layer1_size), dtype=REAL) + self.syn1neg = vstack([self.syn1neg, pad]) + self.wv.vectors_norm = None + + # do not suppress learning for already learned words + self.vectors_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning + def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, total_examples=None, total_words=None, **kwargs): work, neu1 = thread_private_mem @@ -792,7 +835,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, self.hs, + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative ) @@ -808,7 +851,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = job_queue.get() if job is None: # signal to finish @@ -941,7 +984,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut if word in self.wv.vocab: overlap_count += 1 self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes + self.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes else: for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") @@ -951,7 +994,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut if word in self.wv.vocab: overlap_count += 1 self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes + self.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) def predict_output_word(self, context_words_list, topn=10): @@ -976,7 +1019,7 @@ def predict_output_word(self, context_words_list, topn=10): "so you need to have run word2vec with negative > 0 for this to work." 
) - if not hasattr(self.wv, 'vectors') or not hasattr(self.trainables, 'syn1neg'): + if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab] @@ -991,7 +1034,7 @@ def predict_output_word(self, context_words_list, topn=10): l1 /= len(word2_indices) # propagate hidden -> output and take softmax to get probabilities - prob_values = exp(dot(l1, self.trainables.syn1neg.T)) + prob_values = exp(dot(l1, self.syn1neg.T)) prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) # returning the most probable output words with their probabilities @@ -1018,7 +1061,7 @@ def reset_from(self, other_model): self.wv.index2key = other_model.wv.index2key self.cum_table = other_model.cum_table self.corpus_count = other_model.corpus_count - self.trainables.reset_weights(self.hs, self.negative, self.wv) + self.reset_weights() def __str__(self): """Human readable representation of the model's state. @@ -1083,12 +1126,20 @@ def load(cls, *args, **kwargs): try: model = super(Word2Vec, cls).load(*args, **kwargs) # for backward compatibility + if not hasattr(model, 'epochs'): + model.epochs = model.iter + del model.iter if not hasattr(model, 'max_final_vocab'): model.max_final_vocab = None if hasattr(model, 'vocabulary'): # re-integrate state that had been moved for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): setattr(model, a, getattr(model.vocabulary, a)) del model.vocabulary + if hasattr(model, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(model.trainables, a): + setattr(model, a, getattr(model.trainables, a)) + del model.trainables return model except AttributeError: logger.info('Model saved using code from earlier Gensim Version. 
Re-loading old model in a compatible way.') @@ -1327,6 +1381,11 @@ class Word2VecVocab(utils.SaveLoad): pass +class Word2VecTrainables(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass + + class Heapitem(namedtuple('Heapitem', 'count, index, left, right')): def __lt__(self, other): return self.count < other.count @@ -1394,62 +1453,6 @@ def _assign_binary_codes(vocab): logger.info("built huffman tree with maximum node depth %i", max_depth) -class Word2VecTrainables(utils.SaveLoad): - def __init__(self, vector_size=100, seed=1, hashfxn=hash): - """Represents the inner shallow neural network used to train :class:`~gensim.models.word2vec.Word2Vec`.""" - self.hashfxn = hashfxn - self.layer1_size = vector_size - self.seed = seed - - def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None): - """Build tables and model weights based on final vocabulary settings.""" - # set initial input/projection and hidden weights - if not update: - self.reset_weights(hs, negative, wv) - else: - self.update_weights(hs, negative, wv) - - @deprecated("Use gensim.models.keyedvectors.pseudorandom_weak_vector() directly") - def seeded_vector(self, seed_string, vector_size): - return pseudorandom_weak_vector(vector_size, seed_string=seed_string, hashfxn=self.hashfxn) - - def reset_weights(self, hs, negative, wv): - """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" - logger.info("resetting layer weights") - wv.resize_vectors() - wv.randomly_initialize_vectors(seed=self.seed) - if hs: - self.syn1 = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - if negative: - self.syn1neg = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - def update_weights(self, hs, negative, wv): - """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" - logger.info("updating layer weights") - new_range = wv.resize_vectors() - gained_vocab = len(new_range) - wv.randomly_initialize_vectors(indexes=new_range) - - # Raise an error if an online update is run before initial training on a corpus - if not len(wv.vectors): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." 
- ) - - if hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - if negative: - pad = zeros((gained_vocab, self.layer1_size), dtype=REAL) - self.syn1neg = vstack([self.syn1neg, pad]) - wv.vectors_norm = None - - # do not suppress learning for already learned words - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ # -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 776d4b2308..8d9327d3c2 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -476,15 +476,15 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1 c[0].running_training_loss = model.running_training_loss c[0].syn0 = (np.PyArray_DATA(model.wv.vectors)) - c[0].word_locks = (np.PyArray_DATA(model.trainables.vectors_lockf)) + c[0].word_locks = (np.PyArray_DATA(model.vectors_lockf)) c[0].alpha = alpha c[0].size = model.wv.vector_size if c[0].hs: - c[0].syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c[0].syn1 = (np.PyArray_DATA(model.syn1)) if c[0].negative: - c[0].syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) + c[0].syn1neg = (np.PyArray_DATA(model.syn1neg)) c[0].cum_table = (np.PyArray_DATA(model.cum_table)) c[0].cum_table_len = len(model.cum_table) if c[0].negative or c[0].sample: @@ -709,7 +709,7 @@ def score_sentence_sg(model, sentence, _work): cdef long result = 0 cdef int sentence_len - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) @@ -804,7 +804,7 @@ def score_sentence_cbow(model, sentence, _work, _neu1): cdef int i, j, k cdef long result = 0 - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index c2e0900f99..8018d4dc7a 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -75,8 +75,8 @@ def testBuildVocabFromFreq(self): 'survey': 2, 'user': 3, 'human': 2, 'time': 2, 'interface': 2, 'response': 2 } - model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_hs.build_vocab_from_freq(freq_dict) model_neg.build_vocab_from_freq(freq_dict) self.assertEqual(len(model_hs.wv.vocab), 12) @@ -123,7 +123,7 @@ def testPruneVocab(self): ["system", "eps"], ["graph", "system"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 2) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) @@ -135,21 +135,21 @@ def testPruneVocab(self): ["graph", "system"], ["minors", "survey", "minors", "survey", "minors"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, 
seed=42, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 3) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['minors'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) def testTotalWordCount(self): - model = word2vec.Word2Vec(size=10, min_count=0, seed=42) + model = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42) total_words = model.scan_vocab(sentences)[0] self.assertEqual(total_words, 29) def testMaxFinalVocab(self): # Test for less restricting effect of max_final_vocab # max_final_vocab is specified but has no effect - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=4, sample=0) model.scan_vocab(sentences) reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 11) @@ -159,7 +159,7 @@ def testMaxFinalVocab(self): # Test for more restricting effect of max_final_vocab # results in setting a min_count more restricting than specified min_count - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=2, sample=0) model.scan_vocab(sentences) reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 8) @@ -170,8 +170,8 @@ def testMaxFinalVocab(self): def testOnlineLearning(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" - model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(new_sentences, update=True) @@ -185,7 +185,7 @@ def testOnlineLearningAfterSave(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -202,8 +202,10 @@ def testOnlineLearningFromFile(self): utils.save_as_line_sentence(sentences, corpus_file) utils.save_as_line_sentence(new_sentences, new_corpus_file) - model_hs = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=1, negative=0) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(corpus_file=new_corpus_file, update=True) @@ -227,7 +229,8 @@ def testOnlineLearningAfterSaveFromFile(self): 
utils.save_as_line_sentence(new_sentences, new_corpus_file) tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -370,13 +373,13 @@ def testLoadPreKeyedVectorModel(self): model_file = 'word2vec_pre_kv%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) # Model stored in multiple files model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" @@ -524,11 +527,11 @@ def testVocab(self): def testTraining(self): """Test word2vec training.""" # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -541,7 +544,7 @@ def testTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") @@ -551,11 +554,11 @@ def testTrainingFromFile(self): with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: utils.save_as_line_sentence(sentences, tf) - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(corpus_file=tf) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(corpus_file=tf, total_words=model.corpus_total_words, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -569,7 +572,7 @@ def testTrainingFromFile(self): def testScoring(self): """Test word2vec scoring.""" - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) # just score and make sure they exist scores = model.score(sentences, len(sentences)) @@ -580,14 +583,14 @@ def testLocking(self): corpus = LeeCorpus() # 
build vocabulary, don't train yet for sg in range(2): # test both cbow and sg - model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) + model = word2vec.Word2Vec(vector_size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) model.build_vocab(corpus) # remember two vectors locked0 = np.copy(model.wv.vectors[0]) unlocked1 = np.copy(model.wv.vectors[1]) # lock the vector in slot 0 against change - model.trainables.vectors_lockf[0] = 0.0 + model.vectors_lockf[0] = 0.0 model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs) self.assertFalse((unlocked1 == model.wv.vectors[1]).all()) # unlocked vector should vary @@ -709,7 +712,7 @@ def test_cbow_neg_fromfile(self): self.model_sanity(model, with_corpus_file=True) def test_cosmul(self): - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) sims = model.wv.most_similar_cosmul('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -723,10 +726,10 @@ def testTrainingCbow(self): """Test CBOW word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -739,17 +742,17 @@ def testTrainingCbow(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=1, negative=0) self.models_equal(model, model2) def testTrainingSgNegative(self): """Test skip-gram (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=1, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -762,17 +765,17 @@ def testTrainingSgNegative(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=1, hs=0, negative=2) self.models_equal(model, model2) def testTrainingCbowNegative(self): """Test CBOW (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = 
word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -785,13 +788,13 @@ def testTrainingCbowNegative(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=0, negative=2) self.models_equal(model, model2) def testSimilarities(self): """Test similarity and n_similarity methods.""" # The model is trained using CBOW - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) @@ -803,7 +806,7 @@ def testSimilarities(self): def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) wordsims = model.wv.similar_by_word('graph', topn=10) wordsims2 = model.wv.most_similar(positive='graph', topn=10) vectorsims = model.wv.similar_by_vector(model.wv['graph'], topn=10) @@ -833,9 +836,9 @@ def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word])) @@ -871,8 +874,8 @@ def testLoadOldModel(self): self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) + self.assertTrue(model.vectors_lockf.shape == (12,)) self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) @@ -886,8 +889,8 @@ def testLoadOldModelSeparates(self): self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) + self.assertTrue(model.vectors_lockf.shape == (12,)) self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) @@ -949,7 +952,14 @@ def _check_old_version(self, 
old_version): model = word2vec.Word2Vec.load(saved_models_dir.format(old_version)) self.assertIsNone(model.corpus_total_words) self.assertTrue(len(model.wv.vocab) == 3) - self.assertTrue(model.wv.vectors.shape == (3, 4)) + try: + self.assertTrue(model.wv.vectors.shape == (3, 4)) + except AttributeError as ae: + print("WV") + print(model.wv) + print(dir(model.wv)) + print(model.wv.syn0) + raise ae # check if similarity search and online training works. self.assertTrue(len(model.wv.most_similar('sentence')) == 2) model.build_vocab(list_corpus, update=True) @@ -989,7 +999,7 @@ def testTrainWarning(self, l): self.assertTrue(warning in str(l)) def test_train_with_explicit_param(self): - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) with self.assertRaises(ValueError): model.train(sentences, total_examples=model.corpus_count)
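Illustrative usage sketch (assumes a gensim checkout that includes this patch; the toy corpus and printed shapes are purely for demonstration): with Word2VecTrainables removed, the constructor takes `vector_size` instead of `size`, and the trainable layers are reached directly on the model rather than through `model.trainables`.

    from gensim.models import word2vec

    sentences = [
        ["human", "interface", "computer"],
        ["survey", "user", "computer", "system", "response", "time"],
        ["graph", "minors", "survey"],
    ]

    # `size` is now `vector_size`; other parameters are unchanged
    model = word2vec.Word2Vec(sentences, vector_size=10, min_count=1, seed=42, hs=0, negative=5)

    print(model.wv.vectors.shape)      # word vectors, still on model.wv
    print(model.syn1neg.shape)         # was model.trainables.syn1neg
    print(model.vectors_lockf.shape)   # was model.trainables.vectors_lockf
    print(model.layer1_size)           # was model.trainables.layer1_size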