From e69fff895d309082400b97530c0821d31ffea734 Mon Sep 17 00:00:00 2001 From: Gordon Mohr Date: Tue, 17 Dec 2019 18:33:26 -0800 Subject: [PATCH] rm obsolete Vocab/Trainable/abstract/Wrapper classes, persistent callbacks (bug #2136), outdated tests/warnings; update usages --- docs/src/apiref.rst | 7 - docs/src/models/base_any2vec.rst | 10 - docs/src/models/deprecated/doc2vec.rst | 9 - docs/src/models/deprecated/fasttext.rst | 10 - .../models/deprecated/fasttext_wrapper.rst | 10 - docs/src/models/deprecated/keyedvectors.rst | 9 - docs/src/models/deprecated/word2vec.rst | 9 - docs/src/models/wrappers/fasttext.rst | 9 - gensim/models/__init__.py | 1 - gensim/models/base_any2vec.py | 1251 ----------- gensim/models/callbacks.py | 14 +- gensim/models/deprecated/__init__.py | 1 - gensim/models/deprecated/doc2vec.py | 1044 --------- gensim/models/deprecated/fasttext.py | 711 ------ gensim/models/deprecated/fasttext_wrapper.py | 461 ---- gensim/models/deprecated/keyedvectors.py | 1115 ---------- gensim/models/deprecated/old_saveload.py | 398 ---- gensim/models/deprecated/word2vec.py | 1907 ---------------- gensim/models/doc2vec.py | 335 ++- gensim/models/doc2vec_inner.pyx | 20 +- gensim/models/fasttext.py | 325 +-- gensim/models/fasttext_inner.pyx | 14 +- gensim/models/keyedvectors.py | 9 +- gensim/models/word2vec.py | 1909 ++++++++++++----- gensim/models/word2vec_inner.pyx | 16 +- gensim/models/wrappers/__init__.py | 1 - gensim/models/wrappers/fasttext.py | 40 - gensim/sklearn_api/d2vmodel.py | 19 +- gensim/sklearn_api/ftmodel.py | 22 +- gensim/sklearn_api/w2vmodel.py | 22 +- gensim/test/test_doc2vec.py | 74 +- gensim/test/test_fasttext.py | 119 +- gensim/test/test_fasttext_wrapper.py | 382 ---- gensim/test/test_keras_integration.py | 4 +- gensim/test/test_keyedvectors.py | 6 +- gensim/test/test_poincare.py | 8 +- gensim/test/test_sklearn_api.py | 48 +- gensim/test/test_translation_matrix.py | 27 +- gensim/test/test_word2vec.py | 150 +- 39 files changed, 1891 insertions(+), 8635 deletions(-) delete mode 100644 docs/src/models/base_any2vec.rst delete mode 100644 docs/src/models/deprecated/doc2vec.rst delete mode 100644 docs/src/models/deprecated/fasttext.rst delete mode 100644 docs/src/models/deprecated/fasttext_wrapper.rst delete mode 100644 docs/src/models/deprecated/keyedvectors.rst delete mode 100644 docs/src/models/deprecated/word2vec.rst delete mode 100644 docs/src/models/wrappers/fasttext.rst delete mode 100644 gensim/models/base_any2vec.py delete mode 100644 gensim/models/deprecated/__init__.py delete mode 100644 gensim/models/deprecated/doc2vec.py delete mode 100644 gensim/models/deprecated/fasttext.py delete mode 100644 gensim/models/deprecated/fasttext_wrapper.py delete mode 100644 gensim/models/deprecated/keyedvectors.py delete mode 100644 gensim/models/deprecated/old_saveload.py delete mode 100644 gensim/models/deprecated/word2vec.py delete mode 100644 gensim/models/wrappers/fasttext.py delete mode 100644 gensim/test/test_fasttext_wrapper.py diff --git a/docs/src/apiref.rst b/docs/src/apiref.rst index e20c1e2f1f..1e3e341487 100644 --- a/docs/src/apiref.rst +++ b/docs/src/apiref.rst @@ -61,13 +61,6 @@ Modules: models/wrappers/ldavowpalwabbit.rst models/wrappers/wordrank models/wrappers/varembed - models/wrappers/fasttext - models/deprecated/doc2vec - models/deprecated/fasttext - models/deprecated/word2vec - models/deprecated/keyedvectors - models/deprecated/fasttext_wrapper - models/base_any2vec similarities/docsim similarities/termsim similarities/index diff --git 
a/docs/src/models/base_any2vec.rst b/docs/src/models/base_any2vec.rst deleted file mode 100644 index e6685cda66..0000000000 --- a/docs/src/models/base_any2vec.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.base_any2vec` -- Base classes for any2vec models -============================================================= - -.. automodule:: gensim.models.base_any2vec - :synopsis: Base classes for any2vec models - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/doc2vec.rst b/docs/src/models/deprecated/doc2vec.rst deleted file mode 100644 index e8fb2d96b3..0000000000 --- a/docs/src/models/deprecated/doc2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.doc2vec` -- Deep learning with paragraph2vec -==================================================================== - -.. automodule:: gensim.models.deprecated.doc2vec - :synopsis: Deep learning with doc2vec - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/fasttext.rst b/docs/src/models/deprecated/fasttext.rst deleted file mode 100644 index 08de0234d2..0000000000 --- a/docs/src/models/deprecated/fasttext.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.deprecated.fasttext` -- FastText model -=================================================== - -.. automodule:: gensim.models.deprecated.fasttext - :synopsis: FastText model - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/fasttext_wrapper.rst b/docs/src/models/deprecated/fasttext_wrapper.rst deleted file mode 100644 index 020504de24..0000000000 --- a/docs/src/models/deprecated/fasttext_wrapper.rst +++ /dev/null @@ -1,10 +0,0 @@ -:mod:`models.deprecated.fasttext_wrapper` -- Wrapper for Facebook implementation of FastText model -================================================================================================== - -.. automodule:: gensim.models.deprecated.fasttext_wrapper - :synopsis: FastText model - :members: - :inherited-members: - :special-members: __getitem__ - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/keyedvectors.rst b/docs/src/models/deprecated/keyedvectors.rst deleted file mode 100644 index 7d55cbc798..0000000000 --- a/docs/src/models/deprecated/keyedvectors.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.keyedvectors` -- Store and query word vectors -===================================================================== - -.. automodule:: gensim.models.deprecated.keyedvectors - :synopsis: Store and query word vectors - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/deprecated/word2vec.rst b/docs/src/models/deprecated/word2vec.rst deleted file mode 100644 index 3b80aaf196..0000000000 --- a/docs/src/models/deprecated/word2vec.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.deprecated.word2vec` -- Deep learning with word2vec -================================================================ - -.. 
automodule:: gensim.models.deprecated.word2vec - :synopsis: Deep learning with word2vec - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/docs/src/models/wrappers/fasttext.rst b/docs/src/models/wrappers/fasttext.rst deleted file mode 100644 index 4476cc7b43..0000000000 --- a/docs/src/models/wrappers/fasttext.rst +++ /dev/null @@ -1,9 +0,0 @@ -:mod:`models.wrappers.fasttext` -- Wrapper for FastText implementation from Facebook -==================================================================================== - -.. automodule:: gensim.models.wrappers.fasttext - :synopsis: FastText - :members: - :inherited-members: - :undoc-members: - :show-inheritance: diff --git a/gensim/models/__init__.py b/gensim/models/__init__.py index 96ca698b27..ee054b167d 100644 --- a/gensim/models/__init__.py +++ b/gensim/models/__init__.py @@ -23,7 +23,6 @@ from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from . import wrappers # noqa:F401 -from . import deprecated # noqa:F401 from gensim import interfaces, utils diff --git a/gensim/models/base_any2vec.py b/gensim/models/base_any2vec.py deleted file mode 100644 index f0a33ba7ff..0000000000 --- a/gensim/models/base_any2vec.py +++ /dev/null @@ -1,1251 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Shiva Manne -# Copyright (C) 2018 RaRe Technologies s.r.o. -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -r"""This module contains base classes required for implementing \*2vec algorithms. - -The class hierarchy is designed to facilitate adding more concrete implementations for creating embeddings. -In the most general case, the purpose of this class is to transform an arbitrary representation to a numerical vector -(embedding). This is represented by the base :class:`~gensim.models.base_any2vec.BaseAny2VecModel`. The input space in -most cases (in the NLP field at least) is plain text. For this reason, we enrich the class hierarchy with the abstract -:class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` to be used as a base for models where the input -space is text. - -Notes ------ -Even though this is the usual case, not all embeddings transform text, such as the -:class:`~gensim.models.poincare.PoincareModel` that embeds graphs. - -See Also --------- -:class:`~gensim.models.word2vec.Word2Vec`. - Word2Vec model - embeddings for words. -:class:`~gensim.models.fasttext.FastText`. - FastText model - embeddings for words (ngram-based). -:class:`~gensim.models.doc2vec.Doc2Vec`. - Doc2Vec model - embeddings for documents. -:class:`~gensim.models.poincare.PoincareModel` - Poincare model - embeddings for graphs. - -""" - -from gensim import utils -import logging -from timeit import default_timer -import threading -from six.moves import range -from six import itervalues, string_types -from gensim import matutils -from numpy import float32 as REAL, ones, random, dtype -from types import GeneratorType -import os -import copy - - -try: - from queue import Queue -except ImportError: - from Queue import Queue - -logger = logging.getLogger(__name__) - - -class BaseAny2VecModel(utils.SaveLoad): - def __init__(self, workers=3, vector_size=100, epochs=5, callbacks=(), batch_words=10000): - r"""Base class for training, using and evaluating \*2vec model. - - Contains implementation for multi-threaded training. 
The purpose of this class is to provide a - reference interface for concrete embedding implementations, whether the input space is a corpus - of words, documents or anything else. At the same time, functionality that we expect to be common - for those implementations is provided here to avoid code duplication. - - In the special but usual case where the input space consists of words, a more specialized layer - is provided, consider inheriting from :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` - - Parameters - ---------- - workers : int, optional - Number of working threads, used for multithreading. - vector_size : int, optional - Dimensionality of the feature vectors. - epochs : int, optional - Number of iterations (epochs) of training through the corpus. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - batch_words : int, optional - Number of words to be processed by a single job. - - Notes - ----- - A subclass should initialize the following attributes: - - * self.kv - keyed vectors in model (see :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors` as example) - * self.vocabulary - vocabulary (see :class:`~gensim.models.word2vec.Word2VecVocab` as example) - * self.trainables - internal matrices (see :class:`~gensim.models.word2vec.Word2VecTrainables` as example) - - """ - self.vector_size = int(vector_size) - self.workers = int(workers) - self.epochs = epochs - self.train_count = 0 - self.total_train_time = 0 - self.batch_words = batch_words - self.model_trimmed_post_training = False - self.callbacks = callbacks - - def _get_job_params(self, cur_epoch): - """Get job parameters required for each batch.""" - raise NotImplementedError() - - def _set_train_params(self, **kwargs): - """Set model parameters required for training.""" - raise NotImplementedError() - - def _update_job_params(self, job_params, epoch_progress, cur_epoch): - """Get updated job parameters based on the epoch_progress and cur_epoch.""" - raise NotImplementedError() - - def _get_thread_working_mem(self): - """Get private working memory per thread.""" - raise NotImplementedError() - - def _raw_word_count(self, job): - """Get the number of words in a given job.""" - raise NotImplementedError() - - def _clear_post_train(self): - """Resets certain properties of the model post training. eg. `keyedvectors.vectors_norm`.""" - raise NotImplementedError() - - def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=None, total_words=None, **kwargs): - raise NotImplementedError() - - def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): - """Train a single batch. Return 2-tuple `(effective word count, total word count)`.""" - raise NotImplementedError() - - def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): - """Check that the training parameters provided make sense. e.g. 
raise error if `epochs` not provided.""" - raise NotImplementedError() - - def _check_input_data_sanity(self, data_iterable=None, corpus_file=None): - """Check that only one argument is None.""" - if not (data_iterable is None) ^ (corpus_file is None): - raise ValueError("You must provide only one of singlestream or corpus_file arguments.") - - def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0, - total_examples=None, total_words=None, **kwargs): - """Train the model on a `corpus_file` in LineSentence format. - - This function will be called in parallel by multiple workers (threads or processes) to make - optimal use of multicore machines. - - Parameters - ---------- - corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - thread_id : int - Thread index starting from 0 to `number of workers - 1`. - offset : int - Offset (in bytes) in the `corpus_file` for particular worker. - cython_vocab : :class:`~gensim.models.word2vec_inner.CythonVocab` - Copy of the vocabulary in order to access it without GIL. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - """ - thread_private_mem = self._get_thread_working_mem() - - examples, tally, raw_tally = self._do_train_epoch( - corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, - total_examples=total_examples, total_words=total_words, **kwargs) - - progress_queue.put((examples, tally, raw_tally)) - progress_queue.put(None) - - def _worker_loop(self, job_queue, progress_queue): - """Train the model, lifting batches of data from the queue. - - This function will be called in parallel by multiple workers (threads or processes) to make - optimal use of multicore machines. - - Parameters - ---------- - job_queue : Queue of (list of objects, (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - thread_private_mem = self._get_thread_working_mem() - jobs_processed = 0 - while True: - job = job_queue.get() - if job is None: - progress_queue.put(None) - break # no more jobs => quit this worker - data_iterable, job_parameters = job - - for callback in self.callbacks: - callback.on_batch_begin(self) - - tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) - - for callback in self.callbacks: - callback.on_batch_end(self) - - progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress - jobs_processed += 1 - logger.debug("worker exiting, processed %i jobs", jobs_processed) - - def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): - """Fill the jobs queue using the data found in the input stream. - - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is a dictionary of parameters. - - Parameters - ---------- - data_iterator : iterable of list of objects - The input dataset. This will be split in chunks and these chunks will be pushed to the queue. - job_queue : Queue of (list of object, dict of (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus. Used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus. Used to log progress. - - """ - job_batch, batch_size = [], 0 - pushed_words, pushed_examples = 0, 0 - next_job_params = self._get_job_params(cur_epoch) - job_no = 0 - - for data_idx, data in enumerate(data_iterator): - data_length = self._raw_word_count([data]) - - # can we fit this sentence into the existing job batch? - if batch_size + data_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(data) - batch_size += data_length - else: - job_no += 1 - job_queue.put((job_batch, next_job_params)) - - # update the learning rate for the next job - if total_examples: - # examples-based decay - pushed_examples += len(job_batch) - epoch_progress = 1.0 * pushed_examples / total_examples - else: - # words-based decay - pushed_words += self._raw_word_count(job_batch) - epoch_progress = 1.0 * pushed_words / total_words - next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [data], data_length - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - job_no += 1 - job_queue.put((job_batch, next_job_params)) - - if job_no == 0 and self.train_count == 0: - logger.warning( - "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." - ) - - # give the workers heads up that they can finish -- no more work! 
- for _ in range(self.workers): - job_queue.put(None) - logger.debug("job loop exiting, total %i jobs", job_no) - - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed): - raise NotImplementedError() - - def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode): - raise NotImplementedError() - - def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): - raise NotImplementedError() - - def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, - total_words=None, report_delay=1.0, is_corpus_file_mode=None): - """Get the progress report for a single training epoch. - - Parameters - ---------- - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - job_queue : Queue of (list of object, dict of (str, int)) - A queue of jobs still to be processed. The worker will take up jobs from this queue. - Each job is represented by a tuple where the first element is the corpus chunk to be processed and - the second is the dictionary of parameters. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus. Used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus. Used to log progress. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - is_corpus_file_mode : bool, optional - Whether training is file-based (corpus_file argument) or not. - - Returns - ------- - (int, int, int) - The epoch report consisting of three elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - example_count, trained_word_count, raw_word_count = 0, 0, 0 - start, next_report = default_timer() - 0.00001, 1.0 - job_tally = 0 - unfinished_worker_count = self.workers - - while unfinished_worker_count > 0: - report = progress_queue.get() # blocks if workers too slow - if report is None: # a thread reporting that it finished - unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) - continue - examples, trained_words, raw_words = report - job_tally += 1 - - # update progress stats - example_count += examples - trained_word_count += trained_words # only words in vocab & sampled - raw_word_count += raw_words - - # log progress once every report_delay seconds - elapsed = default_timer() - start - if elapsed >= next_report: - self._log_progress( - job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed) - next_report = elapsed + report_delay - # all done; report the final stats - elapsed = default_timer() - start - self._log_epoch_end( - cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode) - self.total_train_time += elapsed - return trained_word_count, raw_word_count, job_tally - - def _train_epoch_corpusfile(self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, **kwargs): - """Train the model for a single epoch. - - Parameters - ---------- - corpus_file : str - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. Must be provided in order to seek in `corpus_file`. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int, int) - The training report for this epoch consisting of three elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - if not total_words: - raise ValueError("total_words must be provided alongside corpus_file argument.") - - from gensim.models.word2vec_corpusfile import CythonVocab - from gensim.models.fasttext import FastText - cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) - - progress_queue = Queue() - - corpus_file_size = os.path.getsize(corpus_file) - - thread_kwargs = copy.copy(kwargs) - thread_kwargs['cur_epoch'] = cur_epoch - thread_kwargs['total_examples'] = total_examples - thread_kwargs['total_words'] = total_words - workers = [ - threading.Thread( - target=self._worker_loop_corpusfile, - args=( - corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue - ), - kwargs=thread_kwargs - ) for thread_id in range(self.workers) - ] - - for thread in workers: - thread.daemon = True - thread.start() - - trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch, - total_examples=total_examples, total_words=total_words, is_corpus_file_mode=True) - - return trained_word_count, raw_word_count, job_tally - - def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, - queue_factor=2, report_delay=1.0): - """Train the model for a single epoch. - - Parameters - ---------- - data_iterable : iterable of list of object - The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - cur_epoch : int, optional - The current training epoch, needed to compute the training parameters for each job. - For example in many implementations the learning rate would be dropping with the number of epochs. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - - Returns - ------- - (int, int, int) - The training report for this epoch consisting of three elements: - * Size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. 
- - """ - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [ - threading.Thread( - target=self._worker_loop, - args=(job_queue, progress_queue,)) - for _ in range(self.workers) - ] - - workers.append(threading.Thread( - target=self._job_producer, - args=(data_iterable, job_queue), - kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words})) - - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( - progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, - report_delay=report_delay, is_corpus_file_mode=False) - - return trained_word_count, raw_word_count, job_tally - - def train(self, data_iterable=None, corpus_file=None, epochs=None, total_examples=None, - total_words=None, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): - """Train the model for multiple epochs using multiple workers. - - Parameters - ---------- - data_iterable : iterable of list of object - The input corpus. This will be split in chunks and these chunks will be pushed to the queue. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - If you use this argument instead of `data_iterable`, you must provide `total_words` argument as well. - epochs : int, optional - Number of epochs (training iterations over the whole input) of training. - total_examples : int, optional - Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences - in a corpus, used to log progress. - total_words : int, optional - Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw - words in a corpus, used to log progress. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Number of seconds between two consecutive progress report messages in the logger. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks to execute at specific stages during training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int) - The total training report consisting of two elements: - * size of total data processed, for example number of sentences in the whole corpus. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). 
- - """ - self._set_train_params(**kwargs) - if callbacks: - self.callbacks = callbacks - self.epochs = epochs - self._check_training_sanity( - epochs=epochs, - total_examples=total_examples, - total_words=total_words, **kwargs) - - for callback in self.callbacks: - callback.on_train_begin(self) - - trained_word_count = 0 - raw_word_count = 0 - start = default_timer() - 0.00001 - job_tally = 0 - - for cur_epoch in range(self.epochs): - for callback in self.callbacks: - callback.on_epoch_begin(self) - - if data_iterable is not None: - trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch( - data_iterable, cur_epoch=cur_epoch, total_examples=total_examples, - total_words=total_words, queue_factor=queue_factor, report_delay=report_delay) - else: - trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile( - corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, **kwargs) - - trained_word_count += trained_word_count_epoch - raw_word_count += raw_word_count_epoch - job_tally += job_tally_epoch - - for callback in self.callbacks: - callback.on_epoch_end(self) - - # Log overall time - total_elapsed = default_timer() - start - self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally) - - self.train_count += 1 # number of times train() has been called - self._clear_post_train() - - for callback in self.callbacks: - callback.on_train_end(self) - return trained_word_count, raw_word_count - - @classmethod - def load(cls, fname_or_handle, **kwargs): - """Load a previously saved object (using :meth:`gensim.models.base_any2vec.BaseAny2VecModel.save`) from a file. - - Parameters - ---------- - fname_or_handle : {str, file-like object} - Path to file that contains needed object or handle to an open file. - **kwargs : object - Keyword arguments propagated to :meth:`~gensim.utils.SaveLoad.load`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.save` - Method for save a model. - - Returns - ------- - object - Object loaded from `fname_or_handle`. - - Raises - ------ - IOError - When methods are called on an instance (should be called on a class, this is a class method). - - """ - return super(BaseAny2VecModel, cls).load(fname_or_handle, **kwargs) - - def save(self, fname_or_handle, **kwargs): - """Save the object to file. - - Parameters - ---------- - fname_or_handle : {str, file-like object} - Path to file where the model will be persisted. - **kwargs : object - Key word arguments propagated to :meth:`~gensim.utils.SaveLoad.save`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseAny2VecModel.load` - Method for load model after current method. - - """ - super(BaseAny2VecModel, self).save(fname_or_handle, **kwargs) - - -class BaseWordEmbeddingsModel(BaseAny2VecModel): - def __init__(self, sentences=None, corpus_file=None, workers=3, vector_size=100, epochs=5, callbacks=(), - batch_words=10000, trim_rule=None, sg=0, alpha=0.025, window=5, seed=1, hs=0, negative=5, - ns_exponent=0.75, cbow_mean=1, min_alpha=0.0001, compute_loss=False, **kwargs): - """Base class containing common methods for training, using & evaluating word embeddings learning models. - - Parameters - ---------- - sentences : iterable of list of str, optional - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. 
- See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). - workers : int, optional - Number of working threads, used for multiprocessing. - vector_size : int, optional - Dimensionality of the feature vectors. - epochs : int, optional - Number of iterations (epochs) of training through the corpus. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - batch_words : int, optional - Number of words to be processed by a single job. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - sg : {1, 0}, optional - Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - alpha : float, optional - The beginning learning rate. This will linearly reduce with iterations until it reaches `min_alpha`. - window : int, optional - The maximum distance between the current and predicted word within a sentence. - seed : int, optional - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. - Note that for a fully deterministically-reproducible run, you must also limit the model to a single worker - thread (`workers=1`), to eliminate ordering jitter from OS thread scheduling. - In Python 3, reproducibility between interpreter launches also requires use of the `PYTHONHASHSEED` - environment variable to control hash randomization. - hs : {1,0}, optional - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int, optional - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : {1,0}, optional - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - min_alpha : float, optional - Final learning rate. Drops linearly with the number of iterations from `alpha`. - compute_loss : bool, optional - If True, loss will be computed while training the Word2Vec model and stored in - :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss` attribute. 
- **kwargs : object - Key word arguments needed to allow children classes to accept more arguments. - - See Also - -------- - :class:`~gensim.models.word2vec.Word2Vec`. - Word2Vec model - embeddings for words. - :class:`~gensim.models.fasttext.FastText`. - FastText model - embeddings for words (ngram-based). - :class:`~gensim.models.doc2vec.Doc2Vec`. - Doc2Vec model - embeddings for documents. - :class:`~gensim.models.poincare.PoincareModel` - Poincare model - embeddings for graphs. - - """ - self.sg = int(sg) - if vector_size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") - self.alpha = float(alpha) - self.window = int(window) - self.random = random.RandomState(seed) - self.min_alpha = float(min_alpha) - self.hs = int(hs) - self.negative = int(negative) - self.ns_exponent = ns_exponent - self.cbow_mean = int(cbow_mean) - self.compute_loss = bool(compute_loss) - self.running_training_loss = 0 - self.min_alpha_yet_reached = float(alpha) - self.corpus_count = 0 - self.corpus_total_words = 0 - - super(BaseWordEmbeddingsModel, self).__init__( - workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words) - - if sentences is not None or corpus_file is not None: - self._check_input_data_sanity(data_iterable=sentences, corpus_file=corpus_file) - if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError("You must pass string as the corpus_file argument.") - elif isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.") - - self.build_vocab(sentences=sentences, corpus_file=corpus_file, trim_rule=trim_rule) - self.train( - sentences=sentences, corpus_file=corpus_file, total_examples=self.corpus_count, - total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, compute_loss=compute_loss) - else: - if trim_rule is not None: - logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. " - "trim_rule provided, if any, will be ignored.") - - def _clear_post_train(self): - raise NotImplementedError() - - def _do_train_job(self, data_iterable, job_parameters, thread_private_mem): - raise NotImplementedError() - - def _set_train_params(self, **kwargs): - raise NotImplementedError() - - def __str__(self): - """Get a human readable representation of the object. - - Returns - ------- - str - A human readable string containing the class name, as well as the size of dictionary, number of - features and starting learning rate used by the object. - - """ - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha - ) - - def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, - keep_raw_vocab=False, trim_rule=None, **kwargs): - """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - - Parameters - ---------- - sentences : iterable of list of str - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. 
- corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). - update : bool - If true, the new words in `sentences` will be added to model's vocab. - progress_per : int, optional - Indicates how many words to process before showing/updating the progress. - keep_raw_vocab : bool, optional - If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. - - **kwargs : object - Key word arguments propagated to `self.vocabulary.prepare_vocab` - - """ - total_words, corpus_count = self.vocabulary.scan_vocab( - sentences=sentences, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) - self.corpus_count = corpus_count - self.corpus_total_words = total_words - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, **kwargs) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights(self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) - - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """Build vocabulary from a dictionary of word frequencies. - - Parameters - ---------- - word_freq : dict of (str, int) - A mapping from a word in the vocabulary to its frequency count. - keep_raw_vocab : bool, optional - If False, delete the raw vocabulary after the scaling is done to free up RAM. - corpus_count : int, optional - Even if no corpus is provided, this argument can set corpus_count explicitly. - trim_rule : function, optional - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - The rule, if given, is only used to prune vocabulary during current method call and is not stored as part - of the model. - - The input parameters are of the following types: - * `word` (str) - the word we are examining - * `count` (int) - the word's frequency count in the corpus - * `min_count` (int) - the minimum count threshold. 
- - update : bool, optional - If true, the new provided words in `word_freq` dict will be added to model's vocab. - - """ - logger.info("Processing provided word frequencies") - # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) - # to be directly the raw vocab - raw_vocab = word_freq - logger.info( - "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) - ) - - # Since no sentences are provided, this is to control the corpus_count. - self.corpus_count = corpus_count or 0 - self.vocabulary.raw_vocab = raw_vocab - - # trim by min_count & precalculate downsampling - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, update=update) - report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, update=update, vocabulary=self.vocabulary) # build tables & arrays - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings and provided vocabulary size. - - Parameters - ---------- - vocab_size : int, optional - Number of unique tokens in the vocabulary - report : dict of (str, int), optional - A dictionary from string representations of the model's memory consuming members to their size in bytes. - - Returns - ------- - dict of (str, int) - A dictionary from string representations of the model's memory consuming members to their size in bytes. - - """ - vocab_size = vocab_size or len(self.wv.vocab) - report = report or {} - report['vocab'] = vocab_size * (700 if self.hs else 500) - report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize - if self.hs: - report['syn1'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize - if self.negative: - report['syn1neg'] = vocab_size * self.trainables.layer1_size * dtype(REAL).itemsize - report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) - return report - - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), **kwargs): - """Train the model. If the hyper-parameters are passed, they override the ones set in the constructor. - - Parameters - ---------- - sentences : iterable of list of str - Can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` module for such examples. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. - You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). - total_examples : int, optional - Count of sentences. - total_words : int, optional - Count of raw words in sentences. - epochs : int, optional - Number of iterations (epochs) over the corpus. - start_alpha : float, optional - Initial learning rate. - end_alpha : float, optional - Final learning rate. 
Drops linearly with the number of iterations from `start_alpha`. - word_count : int, optional - Count of words already trained. Leave this to 0 for the usual case of training on all words in sentences. - queue_factor : int, optional - Multiplier for size of queue -> size = number of workers * queue_factor. - report_delay : float, optional - Seconds to wait before reporting progress. - compute_loss : bool, optional - If True, loss will be computed while training the Word2Vec model and stored in - :attr:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.running_training_loss`. - callbacks : list of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - List of callbacks that need to be executed/run at specific stages during training. - **kwargs : object - Additional key word parameters for the specific model inheriting from this class. - - Returns - ------- - (int, int) - Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count). - - """ - - self.alpha = start_alpha or self.alpha - self.min_alpha = end_alpha or self.min_alpha - self.compute_loss = compute_loss - self.running_training_loss = 0.0 - return super(BaseWordEmbeddingsModel, self).train( - data_iterable=sentences, corpus_file=corpus_file, total_examples=total_examples, - total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, - queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks, - **kwargs) - - def _get_job_params(self, cur_epoch): - """Get the learning rate used in the current epoch. - - Parameters - ---------- - cur_epoch : int - Current iteration through the corpus - - Returns - ------- - float - The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). - - """ - alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) - return alpha - - def _update_job_params(self, job_params, epoch_progress, cur_epoch): - """Get the correct learning rate for the next iteration. - - Parameters - ---------- - job_params : dict of (str, obj) - UNUSED. - epoch_progress : float - Ratio of finished work in the current epoch. - cur_epoch : int - Number of current iteration. - - Returns - ------- - float - The learning rate to be used in the next training epoch. - - """ - start_alpha = self.alpha - end_alpha = self.min_alpha - progress = (cur_epoch + epoch_progress) / self.epochs - next_alpha = start_alpha - (start_alpha - end_alpha) * progress - next_alpha = max(end_alpha, next_alpha) - self.min_alpha_yet_reached = next_alpha - return next_alpha - - def _get_thread_working_mem(self): - """Computes the memory used per worker thread. - - Returns - ------- - (np.ndarray, np.ndarray) - Each worker threads private work memory. - - """ - work = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) # per-thread private work memory - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) - return work, neu1 - - def _raw_word_count(self, job): - """Get the number of words in a given job. - - Parameters - ---------- - job: iterable of list of str - The corpus chunk processed in a single batch. - - Returns - ------- - int - Number of raw words in the corpus chunk. - - """ - return sum(len(sentence) for sentence in job) - - def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): - """Checks whether the training parameters make sense. 
- - Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` - and raises warning or errors depending on the severity of the issue in case an inconsistent parameter - combination is detected. - - Parameters - ---------- - epochs : int, optional - Number of training epochs. Must have a (non None) value. - total_examples : int, optional - Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. - total_words : int, optional - Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. - **kwargs : object - Unused. Present to preserve signature among base and inherited implementations. - - Raises - ------ - RuntimeError - If one of the required training pre/post processing steps have not been performed. - ValueError - If the combination of input parameters is inconsistent. - - """ - if self.alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") - if self.model_trimmed_post_training: - raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") - - if not self.wv.vocab: # should be set by `build_vocab` - raise RuntimeError("you must first build vocabulary before training the model") - if not len(self.wv.vectors): - raise RuntimeError("you must initialize vectors before training the model") - - if not hasattr(self, 'corpus_count'): - raise ValueError( - "The number of examples in the training corpus is missing. " - "Please make sure this is set inside `build_vocab` function." - "Call the `build_vocab` function before calling `train`." - ) - - if total_words is None and total_examples is None: - raise ValueError( - "You must specify either total_examples or total_words, for proper job parameters updation" - "and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) - if epochs is None: - raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.epochs.") - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, - self.hs, self.vocabulary.sample, self.negative, self.window - ) - - @classmethod - def load(cls, *args, **kwargs): - """Load a previously saved object (using :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save`) from file. - - Also initializes extra instance attributes in case the loaded model does not include them. - `*args` or `**kwargs` **MUST** include the fname argument (path to saved file). - See :meth:`~gensim.utils.SaveLoad.load`. - - Parameters - ---------- - *args : object - Positional arguments passed to :meth:`~gensim.utils.SaveLoad.load`. - **kwargs : object - Key word arguments passed to :meth:`~gensim.utils.SaveLoad.load`. - - See Also - -------- - :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.save` - Method for save a model. - - Returns - ------- - :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` - Model loaded from disk. - - Raises - ------ - IOError - When methods are called on instance (should be called from class). 
- - """ - model = super(BaseWordEmbeddingsModel, cls).load(*args, **kwargs) - if not hasattr(model, 'ns_exponent'): - model.ns_exponent = 0.75 - if not hasattr(model.vocabulary, 'ns_exponent'): - model.vocabulary.ns_exponent = 0.75 - if model.negative and hasattr(model.wv, 'index2word'): - model.vocabulary.make_cum_table(model.wv) # rebuild cum_table from vocabulary - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - if not hasattr(model.trainables, 'vectors_lockf') and hasattr(model.wv, 'vectors'): - model.trainables.vectors_lockf = ones(len(model.wv.vectors), dtype=REAL) - if not hasattr(model, 'random'): - model.random = random.RandomState(model.trainables.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - return model - - def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples, - raw_word_count, total_words, trained_word_count, elapsed): - """Callback used to log progress for long running jobs. - - Parameters - ---------- - job_queue : Queue of (list of object, dict of (str, float)) - The queue of jobs still to be performed by workers. Each job is represented as a tuple containing - the batch of data to be processed and the parameters to be used for the processing as a dict. - progress_queue : Queue of (int, int, int) - A queue of progress reports. Each report is represented as a tuple of these 3 elements: - * size of data chunk processed, for example number of sentences in the corpus chunk. - * Effective word count used in training (after ignoring unknown words and trimming the sentence length). - * Total word count used in training. - cur_epoch : int - The current training iteration through the corpus. - example_count : int - Number of examples (could be sentences for example) processed until now. - total_examples : int - Number of all examples present in the input corpus. - raw_word_count : int - Number of words used in training until now. - total_words : int - Number of all words in the input corpus. - trained_word_count : int - Number of effective words used in training until now (after ignoring unknown words and trimming - the sentence length). - elapsed : int - Elapsed time since the beginning of training in seconds. - - Notes - ----- - If you train the model via `corpus_file` argument, there is no job_queue, so reported job_queue size will - always be equal to -1. - - """ - if total_examples: - # examples-based progress % - logger.info( - "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, - -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) - ) - else: - # words-based progress % - logger.info( - "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) - ) - - def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, - trained_word_count, elapsed, is_corpus_file_mode): - """Callback used to log the end of a training epoch. - - Parameters - ---------- - cur_epoch : int - The current training iteration through the corpus. - example_count : int - Number of examples (could be sentences for example) processed until now. 
- total_examples : int - Number of all examples present in the input corpus. - raw_word_count : int - Number of words used in training until now. - total_words : int - Number of all words in the input corpus. - trained_word_count : int - Number of effective words used in training until now (after ignoring unknown words and trimming - the sentence length). - elapsed : int - Elapsed time since the beginning of training in seconds. - is_corpus_file_mode : bool - Whether training is file-based (corpus_file argument) or not. - - Warnings - -------- - In case the corpus is changed while the epoch was running. - - """ - logger.info( - "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) - - # don't warn if training in file-based mode, because it's expected behavior - if is_corpus_file_mode: - return - - # check that the input corpus hasn't changed during iteration - if total_examples and total_examples != example_count: - logger.warning( - "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, - example_count, total_examples - ) - if total_words and total_words != raw_word_count: - logger.warning( - "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, - raw_word_count, total_words - ) - - def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): - """Callback to log the end of training. - - Parameters - ---------- - raw_word_count : int - Number of words used in the whole training. - trained_word_count : int - Number of effective words used in training (after ignoring unknown words and trimming the sentence length). - total_elapsed : int - Total time spent during training in seconds. - job_tally : int - Total number of jobs processed during training. - - """ - logger.info( - "training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed - ) diff --git a/gensim/models/callbacks.py b/gensim/models/callbacks.py index cfa29d1998..27dbca4dce 100644 --- a/gensim/models/callbacks.py +++ b/gensim/models/callbacks.py @@ -569,7 +569,7 @@ def on_epoch_end(self, epoch, topics=None): class CallbackAny2Vec(object): - """Base class to build callbacks for :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel`. + """Base class to build callbacks for :class:`~gensim.models.word2vec.Word2Vec` & subclasses. Callbacks are used to apply custom functions over the model at specific points during training (epoch start, batch end etc.). This is a base class and its purpose is to be inherited by @@ -584,7 +584,7 @@ def on_epoch_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -595,7 +595,7 @@ def on_epoch_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -606,7 +606,7 @@ def on_batch_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. 
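To make the callback hooks documented here concrete, a minimal sketch of a `CallbackAny2Vec` subclass that counts finished epochs; the corpus name `sentences` is assumed for illustration only:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> from gensim.models.callbacks import CallbackAny2Vec
    >>>
    >>> class EpochCounter(CallbackAny2Vec):
    ...     def __init__(self):
    ...         self.epochs_done = 0
    ...     def on_epoch_end(self, model):
    ...         self.epochs_done += 1
    >>>
    >>> counter = EpochCounter()
    >>> model = Word2Vec(sentences, min_count=1, callbacks=[counter])  # counter.epochs_done == model.epochs afterwards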
""" @@ -617,7 +617,7 @@ def on_batch_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -628,7 +628,7 @@ def on_train_begin(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ @@ -639,7 +639,7 @@ def on_train_end(self, model): Parameters ---------- - model : :class:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel` + model : :class:`~gensim.models.word2vec.Word2Vec` or subclass Current model. """ diff --git a/gensim/models/deprecated/__init__.py b/gensim/models/deprecated/__init__.py deleted file mode 100644 index cfa71654f5..0000000000 --- a/gensim/models/deprecated/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""This package contains some deprecated implementations of algorithm, will be removed soon.""" diff --git a/gensim/models/deprecated/doc2vec.py b/gensim/models/deprecated/doc2vec.py deleted file mode 100644 index 41f74fdc6b..0000000000 --- a/gensim/models/deprecated/doc2vec.py +++ /dev/null @@ -1,1044 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2013 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.doc2vec` instead. - - - -Deep learning via the distributed memory and distributed bag of words models from -[1]_, using either hierarchical softmax or negative sampling [2]_ [3]_. See [#tutorial]_ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) -doc2vec training** (70x speedup [blog]_). - -Initialize a model with e.g.:: - -.. sourcecode:: pycon - - >>> model = Doc2Vec(documents, size=100, window=8, min_count=5, workers=4) - -Persist a model to disk with:: - -.. sourcecode:: pycon - - >>> model.save(fname) - >>> model = Doc2Vec.load(fname) # you can continue training with the loaded model! - -If you're finished training a model (=no more updates, only querying), you can do - -.. sourcecode:: pycon - - >>> model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True): - -to trim unneeded model memory = use (much) less RAM. - - - -.. [1] Quoc Le and Tomas Mikolov. Distributed Representations of Sentences and Documents. - http://arxiv.org/pdf/1405.4053v2.pdf -.. [2] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. - Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [3] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. -.. [blog] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ - -.. 
[#tutorial] Doc2vec in gensim tutorial, - https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-lee.ipynb - - - -""" - -import logging -import os - -try: - from queue import Queue -except ImportError: - from Queue import Queue # noqa:F401 - -from collections import namedtuple, defaultdict -from timeit import default_timer - -from numpy import zeros, sum as np_sum, add as np_add, concatenate, \ - repeat as np_repeat, array, float32 as REAL, empty, ones, memmap as np_memmap, \ - sqrt, newaxis, ndarray, dot, vstack, dtype, divide as np_divide, integer - -from gensim import utils -from gensim.utils import call_on_class_only, deprecated -from gensim.models.deprecated.word2vec import Word2Vec, train_cbow_pair, train_sg_pair, train_batch_sg,\ - MAX_WORDS_IN_BATCH -from gensim.models.deprecated.keyedvectors import KeyedVectors -from gensim.models.doc2vec import Doc2Vec as NewDoc2Vec -from gensim.models.deprecated.old_saveload import SaveLoad - -from gensim import matutils # utility fnc for pickling, common scipy operations etc -from six.moves import zip, range -from six import string_types, integer_types - -logger = logging.getLogger(__name__) - - -def load_old_doc2vec(*args, **kwargs): - old_model = Doc2Vec.load(*args, **kwargs) - params = { - 'dm_mean': old_model.__dict__.get('dm_mean', None), - 'dm': old_model.dm, - 'dbow_words': old_model.dbow_words, - 'dm_concat': old_model.dm_concat, - 'dm_tag_count': old_model.dm_tag_count, - 'docvecs_mapfile': old_model.__dict__.get('docvecs_mapfile', None), - 'comment': old_model.__dict__.get('comment', None), - 'vector_size': old_model.vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.sample, - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.hashfxn, - 'epochs': old_model.iter, - 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), - 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), - 'compute_loss': old_model.__dict__.get('compute_loss', None) - } - new_model = NewDoc2Vec(**params) - # set word2vec trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - if hasattr(old_model.wv, 'syn0norm'): - new_model.docvecs.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - - # set doc2vec trainables attributes - new_model.docvecs.vectors_docs = old_model.docvecs.doctag_syn0 - if hasattr(old_model.docvecs, 'doctag_syn0norm'): - new_model.docvecs.vectors_docs_norm = old_model.docvecs.doctag_syn0norm - if hasattr(old_model.docvecs, 'doctag_syn0_lockf'): - new_model.trainables.vectors_docs_lockf = old_model.docvecs.doctag_syn0_lockf - if hasattr(old_model.docvecs, 'mapfile_path'): - new_model.docvecs.mapfile_path = old_model.docvecs.mapfile_path - - # set word2vec vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.cum_table - - # set doc2vec vocabulary attributes - new_model.docvecs.doctags = old_model.docvecs.doctags - new_model.docvecs.count = 
old_model.docvecs.count - if hasattr(old_model.docvecs, 'max_rawint'): # `doc2vec` models before `0.12.3` do not have these 2 attributes - new_model.docvecs.max_rawint = old_model.docvecs.__dict__.get('max_rawint') - new_model.docvecs.offset2doctag = old_model.docvecs.__dict__.get('offset2doctag') - else: - # Doc2Vec models before Gensim version 0.12.3 did not have `max_rawint` and `offset2doctag` as they did not - # mixing of string and int tags. This implies the new attribute `offset2doctag` equals the old `index2doctag` - # (which was only filled if the documents had string tags). - # This also implies that the new attribute, `max_rawint`(highest rawint-indexed doctag) would either be equal - # to the initial value -1, in case only string tags are used or would be equal to `count` if only int indexing - # was used. - new_model.docvecs.max_rawint = -1 if old_model.docvecs.index2doctag else old_model.docvecs.count - 1 - new_model.docvecs.offset2doctag = old_model.docvecs.index2doctag - # now upconvert that to gensim-4.0.0+ - new_model.docvecs._upconvert_old_d2vkv() - - new_model.train_count = old_model.__dict__.get('train_count', None) - new_model.corpus_count = old_model.__dict__.get('corpus_count', None) - new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None) - new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) - new_model.total_train_time = old_model.__dict__.get('total_train_time', None) - new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) - new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None) - - return new_model - - -def train_document_dbow(model, doc_words, doctag_indexes, alpha, work=None, - train_words=False, learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """ - Update distributed bag of words model ("PV-DBOW") by training on a single document. - - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. - - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - If `train_words` is True, simultaneously train word-to-word (not just doc-to-word) - examples, exactly as per Word2Vec skip-gram training. (Without this option, - word vectors are neither consulted nor updated during DBOW doc vector training.) - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from doc2vec_inner instead. 
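As a usage-level sketch of the PV-DBOW mode this helper implements (with `dbow_words=1` playing the role of `train_words=True`, i.e. word vectors are trained alongside doc vectors); the two-document corpus is illustrative only:

.. sourcecode:: pycon

    >>> from gensim.models.doc2vec import Doc2Vec, TaggedDocument
    >>> docs = [TaggedDocument(['machine', 'learning', 'rocks'], [0]),
    ...         TaggedDocument(['deep', 'learning', 'rocks'], [1])]
    >>> model = Doc2Vec(docs, dm=0, dbow_words=1, vector_size=50, min_count=1, epochs=10)
    >>> vec = model.infer_vector(['machine', 'learning'])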
- - """ - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - if train_words and learn_words: - train_batch_sg(model, [doc_words], alpha, work) - for doctag_index in doctag_indexes: - for word in doc_words: - train_sg_pair( - model, word, doctag_index, alpha, learn_vectors=learn_doctags, learn_hidden=learn_hidden, - context_vectors=doctag_vectors, context_locks=doctag_locks - ) - - return len(doc_words) - - -def train_document_dm(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, - learn_doctags=True, learn_words=True, learn_hidden=True, - word_vectors=None, word_locks=None, doctag_vectors=None, doctag_locks=None): - """ - Update distributed memory model ("PV-DM") by training on a single document. - - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. This - method implements the DM model with a projection (input) layer that is - either the sum or mean of the context vectors, depending on the model's - `dm_mean` configuration field. See `train_document_dm_concat()` for the DM - model with a concatenated input layer. - - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have a C compiler, gensim - will use the optimized version from doc2vec_inner instead. - - """ - if word_vectors is None: - word_vectors = model.wv.syn0 - if word_locks is None: - word_locks = model.syn0_lockf - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original doc2vec code - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indexes = [word2.index for pos2, word2 in window_pos if pos2 != pos] - l1 = np_sum(word_vectors[word2_indexes], axis=0) + np_sum(doctag_vectors[doctag_indexes], axis=0) - count = len(word2_indexes) + len(doctag_indexes) - if model.cbow_mean and count > 1: - l1 /= count - neu1e = train_cbow_pair(model, word, word2_indexes, l1, alpha, - learn_vectors=False, learn_hidden=learn_hidden) - if not model.cbow_mean and count > 1: - neu1e /= count - if learn_doctags: - for i in doctag_indexes: - doctag_vectors[i] += neu1e * doctag_locks[i] - if learn_words: - for i in word2_indexes: - word_vectors[i] += neu1e * word_locks[i] - - return len(word_vocabs) - - -def train_document_dm_concat(model, doc_words, doctag_indexes, alpha, work=None, neu1=None, learn_doctags=True, - learn_words=True, learn_hidden=True, word_vectors=None, word_locks=None, - doctag_vectors=None, doctag_locks=None): - """ - Update distributed memory model ("PV-DM") by training on a single document, using a - concatenation of the context window word vectors (rather than a sum or average). 
- - Called internally from `Doc2Vec.train()` and `Doc2Vec.infer_vector()`. - - The document is provided as `doc_words`, a list of word tokens which are looked up - in the model's vocab dictionary, and `doctag_indexes`, which provide indexes - into the doctag_vectors array. - - Any of `learn_doctags', `learn_words`, and `learn_hidden` may be set False to - prevent learning-updates to those respective model weights, as if using the - (partially-)frozen model to infer other compatible vectors. - - This is the non-optimized, Python version. If you have a C compiler, gensim - will use the optimized version from doc2vec_inner instead. - - """ - if word_vectors is None: - word_vectors = model.wv.syn0 - if word_locks is None: - word_locks = model.syn0_lockf - if doctag_vectors is None: - doctag_vectors = model.docvecs.doctag_syn0 - if doctag_locks is None: - doctag_locks = model.docvecs.doctag_syn0_lockf - - word_vocabs = [model.wv.vocab[w] for w in doc_words if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - doctag_len = len(doctag_indexes) - if doctag_len != model.dm_tag_count: - return 0 # skip doc without expected number of doctag(s) (TODO: warn/pad?) - - null_word = model.wv.vocab['\0'] - pre_pad_count = model.window - post_pad_count = model.window - padded_document_indexes = ( - (pre_pad_count * [null_word.index]) # pre-padding - + [word.index for word in word_vocabs if word is not None] # elide out-of-Vocabulary words - + (post_pad_count * [null_word.index]) # post-padding - ) - - for pos in range(pre_pad_count, len(padded_document_indexes) - post_pad_count): - word_context_indexes = ( - padded_document_indexes[(pos - pre_pad_count): pos] # preceding words - + padded_document_indexes[(pos + 1):(pos + 1 + post_pad_count)] # following words - ) - predict_word = model.wv.vocab[model.wv.index2word[padded_document_indexes[pos]]] - # numpy advanced-indexing copies; concatenate, flatten to 1d - l1 = concatenate((doctag_vectors[doctag_indexes], word_vectors[word_context_indexes])).ravel() - neu1e = train_cbow_pair(model, predict_word, None, l1, alpha, - learn_hidden=learn_hidden, learn_vectors=False) - - # filter by locks and shape for addition to source vectors - e_locks = concatenate((doctag_locks[doctag_indexes], word_locks[word_context_indexes])) - neu1e_r = (neu1e.reshape(-1, model.vector_size) - * np_repeat(e_locks, model.vector_size).reshape(-1, model.vector_size)) - - if learn_doctags: - np_add.at(doctag_vectors, doctag_indexes, neu1e_r[:doctag_len]) - if learn_words: - np_add.at(word_vectors, word_context_indexes, neu1e_r[doctag_len:]) - - return len(padded_document_indexes) - pre_pad_count - post_pad_count - - -class TaggedDocument(namedtuple('TaggedDocument', 'words tags')): - """ - A single document, made up of `words` (a list of unicode string tokens) - and `tags` (a list of tokens). Tags may be one or more unicode string - tokens, but typical practice (which will also be most memory-efficient) is - for the tags list to include a unique integer id as the only tag. - - Replaces "sentence as a list of words" from Word2Vec. - - """ - - def __str__(self): - return '%s(%s, %s)' % (self.__class__.__name__, self.words, self.tags) - - -# for compatibility -@deprecated("Class will be removed in 4.0.0, use TaggedDocument instead") -class LabeledSentence(TaggedDocument): - pass - - -class DocvecsArray(SaveLoad): - """ - Default storage of doc vectors during/after training, in a numpy array. 
- - As the 'docvecs' property of a Doc2Vec model, allows access and - comparison of document vectors. - - .. sourcecode:: pycon - - >>> docvec = d2v_model.docvecs[99] - >>> docvec = d2v_model.docvecs['SENT_99'] # if string tag used in training - >>> sims = d2v_model.docvecs.most_similar(99) - >>> sims = d2v_model.docvecs.most_similar('SENT_99') - >>> sims = d2v_model.docvecs.most_similar(docvec) - - If only plain int tags are presented during training, the dict (of - string tag -> index) and list (of index -> string tag) stay empty, - saving memory. - - Supplying a mapfile_path (as by initializing a Doc2Vec model with a - 'docvecs_mapfile' value) will use a pair of memory-mapped - files as the array backing for doctag_syn0/doctag_syn0_lockf values. - - The Doc2Vec model automatically uses this class, but a future alternative - implementation, based on another persistence mechanism like LMDB, LevelDB, - or SQLite, should also be possible. - """ - - def __init__(self, mapfile_path=None): - self.doctags = {} # string -> Doctag (only filled if necessary) - self.max_rawint = -1 # highest rawint-indexed doctag - self.offset2doctag = [] # int offset-past-(max_rawint+1) -> String (only filled if necessary) - self.count = 0 - self.mapfile_path = mapfile_path - - def note_doctag(self, key, document_no, document_length): - """Note a document tag during initial corpus scan, for structure sizing.""" - if isinstance(key, integer_types + (integer,)): - self.max_rawint = max(self.max_rawint, key) - else: - if key in self.doctags: - self.doctags[key] = self.doctags[key].repeat(document_length) - else: - self.doctags[key] = Doctag(len(self.offset2doctag), document_length, 1) - self.offset2doctag.append(key) - self.count = self.max_rawint + 1 + len(self.offset2doctag) - - def indexed_doctags(self, doctag_tokens): - """Return indexes and backing-arrays used in training examples.""" - return ([self._int_index(index) for index in doctag_tokens if index in self], - self.doctag_syn0, self.doctag_syn0_lockf, doctag_tokens) - - def trained_item(self, indexed_tuple): - """Persist any changes made to the given indexes (matching tuple previously - returned by indexed_doctags()); a no-op for this implementation""" - pass - - def _int_index(self, index): - """Return int index for either string or int index""" - if isinstance(index, integer_types + (integer,)): - return index - else: - return self.max_rawint + 1 + self.doctags[index].offset - - @deprecated("Method will be removed in 4.0.0, use self.index_to_doctag instead") - def _key_index(self, i_index, missing=None): - """Return string index for given int index, if available""" - return self.index_to_doctag(i_index) - - def index_to_doctag(self, i_index): - """Return string key for given i_index, if available. Otherwise return raw int doctag (same int).""" - candidate_offset = i_index - self.max_rawint - 1 - if 0 <= candidate_offset < len(self.offset2doctag): - return self.offset2doctag[candidate_offset] - else: - return i_index - - def __getitem__(self, index): - """ - Accept a single key (int or string tag) or list of keys as input. - - If a single string or int, return designated tag's vector - representation, as a 1D numpy array. - - If a list, return designated tags' vector representations as a - 2D numpy array: #tags x #vector_size. 
- """ - if isinstance(index, string_types + integer_types + (integer,)): - return self.doctag_syn0[self._int_index(index)] - - return vstack([self[i] for i in index]) - - def __len__(self): - return self.count - - def __contains__(self, index): - if isinstance(index, integer_types + (integer,)): - return index < self.count - else: - return index in self.doctags - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) - super(DocvecsArray, self).save(*args, **kwargs) - - def borrow_from(self, other_docvecs): - self.count = other_docvecs.count - self.doctags = other_docvecs.doctags - self.offset2doctag = other_docvecs.offset2doctag - - def clear_sims(self): - self.doctag_syn0norm = None - - def estimated_lookup_memory(self): - """Estimated memory for tag lookup; 0 if using pure int tags.""" - return 60 * len(self.offset2doctag) + 140 * len(self.doctags) - - def reset_weights(self, model): - length = max(len(self.doctags), self.count) - if self.mapfile_path: - self.doctag_syn0 = np_memmap( - self.mapfile_path + '.doctag_syn0', dtype=REAL, mode='w+', shape=(length, model.vector_size) - ) - self.doctag_syn0_lockf = np_memmap( - self.mapfile_path + '.doctag_syn0_lockf', dtype=REAL, mode='w+', shape=(length,) - ) - self.doctag_syn0_lockf.fill(1.0) - else: - self.doctag_syn0 = empty((length, model.vector_size), dtype=REAL) - self.doctag_syn0_lockf = ones((length,), dtype=REAL) # zeros suppress learning - - for i in range(length): - # construct deterministic seed from index AND model seed - seed = "%d %s" % (model.seed, self.index_to_doctag(i)) - self.doctag_syn0[i] = model.seeded_vector(seed) - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training or inference** after doing a replace. - The model becomes effectively read-only = you can call `most_similar`, `similarity` - etc., but not `train` or `infer_vector`. - - """ - if getattr(self, 'doctag_syn0norm', None) is None or replace: - logger.info("precomputing L2-norms of doc weight vectors") - if replace: - for i in range(self.doctag_syn0.shape[0]): - self.doctag_syn0[i, :] /= sqrt((self.doctag_syn0[i, :] ** 2).sum(-1)) - self.doctag_syn0norm = self.doctag_syn0 - else: - if self.mapfile_path: - self.doctag_syn0norm = np_memmap( - self.mapfile_path + '.doctag_syn0norm', dtype=REAL, - mode='w+', shape=self.doctag_syn0.shape) - else: - self.doctag_syn0norm = empty(self.doctag_syn0.shape, dtype=REAL) - np_divide(self.doctag_syn0, sqrt((self.doctag_syn0 ** 2).sum(-1))[..., newaxis], self.doctag_syn0norm) - - def most_similar(self, positive=None, negative=None, topn=10, clip_start=0, clip_end=None, indexer=None): - """ - Find the top-N most similar docvecs known from training. Positive docs contribute - positively towards the similarity, negative docs negatively. - - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given docs. Docs may be specified as vectors, integer indexes - of trained docvecs, or if the documents were originally presented with string tags, - by the corresponding tags. - - The 'clip_start' and 'clip_end' allow limiting results to a particular contiguous - range of the underlying doctag_syn0norm vectors. 
(This may be useful if the ordering - there was chosen to be significant, such as more popular tag IDs in lower indexes.) - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - clip_end = clip_end or len(self.doctag_syn0norm) - - if isinstance(positive, string_types + integer_types + (integer,)) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each doc, if not already present; default to 1.0 for positive and -1.0 for negative docs - positive = [ - (doc, 1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in positive - ] - negative = [ - (doc, -1.0) if isinstance(doc, string_types + integer_types + (ndarray, integer)) - else doc for doc in negative - ] - - # compute the weighted average of all docs - all_docs, mean = set(), [] - for doc, weight in positive + negative: - if isinstance(doc, ndarray): - mean.append(weight * doc) - elif doc in self.doctags or doc < self.count: - mean.append(weight * self.doctag_syn0norm[self._int_index(doc)]) - all_docs.add(self._int_index(doc)) - else: - raise KeyError("doc '%s' not in trained set" % doc) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - if indexer is not None: - return indexer.most_similar(mean, topn) - - dists = dot(self.doctag_syn0norm[clip_start:clip_end], mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_docs), reverse=True) - # ignore (don't return) docs from the input - result = [ - (self.index_to_doctag(sim + clip_start), float(dists[sim])) - for sim in best - if (sim + clip_start) not in all_docs - ] - return result[:topn] - - def doesnt_match(self, docs): - """ - Which doc from the given list doesn't go with the others? - - (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - self.init_sims() - - docs = [doc for doc in docs if doc in self.doctags or 0 <= doc < self.count] # filter out unknowns - logger.debug("using docs %s", docs) - if not docs: - raise ValueError("cannot select a doc from an empty list") - vectors = vstack(self.doctag_syn0norm[self._int_index(doc)] for doc in docs).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) - dists = dot(vectors, mean) - return sorted(zip(dists, docs))[0][1] - - def similarity(self, d1, d2): - """ - Compute cosine similarity between two docvecs in the trained set, specified by int index or - string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - return dot(matutils.unitvec(self[d1]), matutils.unitvec(self[d2])) - - def n_similarity(self, ds1, ds2): - """ - Compute cosine similarity between two sets of docvecs from the trained set, specified by int - index or string tag. (TODO: Accept vectors of out-of-training-set docs, as if from inference.) - - """ - v1 = [self[doc] for doc in ds1] - v2 = [self[doc] for doc in ds2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - def similarity_unseen_docs(self, model, doc_words1, doc_words2, alpha=0.1, min_alpha=0.0001, steps=5): - """ - Compute cosine similarity between two post-bulk out of training documents. - - Document should be a list of (word) tokens. 
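A sketch of how this helper is typically invoked on two unseen token lists, mirroring the (deprecated) signature shown here; `model` is an assumed, already-trained Doc2Vec instance:

.. sourcecode:: pycon

    >>> tokens_1 = ['machine', 'learning', 'is', 'fun']
    >>> tokens_2 = ['deep', 'learning', 'is', 'fun']
    >>> sim = model.docvecs.similarity_unseen_docs(model, tokens_1, tokens_2, alpha=0.1, min_alpha=0.0001, steps=5)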
- """ - d1 = model.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) - d2 = model.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) - return dot(matutils.unitvec(d1), matutils.unitvec(d2)) - - -class Doctag(namedtuple('Doctag', 'offset, word_count, doc_count')): - """A string document tag discovered during the initial vocabulary - scan. (The document-vector equivalent of a Vocab object.) - - Will not be used if all presented document tags are ints. - - The offset is only the true index into the doctags_syn0/doctags_syn0_lockf - if-and-only-if no raw-int tags were used. If any raw-int tags were used, - string Doctag vectors begin at index (max_rawint + 1), so the true index is - (rawint_index + 1 + offset). See also DocvecsArray.index_to_doctag(). - """ - __slots__ = () - - def repeat(self, word_count): - return self._replace(word_count=self.word_count + word_count, doc_count=self.doc_count + 1) - - -class Doc2Vec(Word2Vec): - """Class for training, using and evaluating neural networks described in http://arxiv.org/pdf/1405.4053v2.pdf""" - - def __init__(self, documents=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, - docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, **kwargs): - """ - Initialize the model from an iterable of `documents`. Each document is a - TaggedDocument object that will be used for training. - - The `documents` iterable can be simply a list of TaggedDocument elements, but for larger corpora, - consider an iterable that streams the documents directly from disk/network. - - If you don't supply `documents`, the model is left uninitialized -- use if - you plan to initialize it in some other way. - - `dm` defines the training algorithm. By default (`dm=1`), 'distributed memory' (PV-DM) is used. - Otherwise, `distributed bag of words` (PV-DBOW) is employed. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the predicted word and context words used for prediction - within a document. - - `alpha` is the initial learning rate (will linearly drop to `min_alpha` as training progresses). - - `seed` = for the random number generator. - Note that for a fully deterministically-reproducible run, you must also limit the model to - a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python - 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED - environment variable to control hash randomization.) - - `min_count` = ignore all words with total frequency lower than this. - - `max_vocab_size` = limit RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types - need about 1GB of RAM. Set to `None` for no limit (default). - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, values of 1e-5 (or lower) may also be useful, set to 0.0 to disable downsampling. - - `workers` = use this many worker threads to train the model (=faster training with multicore machines). - - `iter` = number of iterations (epochs) over the corpus. The default inherited from Word2Vec is 5, - but values of 10 or 20 are common in published 'Paragraph Vector' experiments. - - `hs` = if 1, hierarchical softmax will be used for model training. - If set to 0 (default), and `negative` is non-zero, negative sampling will be used. 
- - `negative` = if > 0, negative sampling will be used, the int for negative - specifies how many "noise words" should be drawn (usually between 5-20). - Default is 5. If set to 0, no negative samping is used. - - `dm_mean` = if 0 (default), use the sum of the context word vectors. If 1, use the mean. - Only applies when dm is used in non-concatenative mode. - - `dm_concat` = if 1, use concatenation of context vectors rather than sum/average; - default is 0 (off). Note concatenation results in a much-larger model, as the input - is no longer the size of one (sampled or arithmetically combined) word vector, but the - size of the tag(s) and all words in the context strung together. - - `dm_tag_count` = expected constant number of document tags per document, when using - dm_concat mode; default is 1. - - `dbow_words` if set to 1 trains word-vectors (in skip-gram fashion) simultaneous with DBOW - doc-vector training; default is 0 (faster training of doc-vectors only). - - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either util.RULE_DISCARD, util.RULE_KEEP or util.RULE_DEFAULT. - Note: The rule, if given, is only used prune vocabulary during build_vocab() and is not stored as part - of the model. - """ - - if 'sentences' in kwargs: - raise DeprecationWarning( - "Parameter 'sentences' was renamed to 'documents', and will be removed in 4.0.0, " - "use 'documents' instead." - ) - - super(Doc2Vec, self).__init__( - sg=(1 + dm) % 2, - null_word=dm_concat, - **kwargs) - - self.load = call_on_class_only - - if dm_mean is not None: - self.cbow_mean = dm_mean - - self.dbow_words = dbow_words - self.dm_concat = dm_concat - self.dm_tag_count = dm_tag_count - if self.dm and self.dm_concat: - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - - self.docvecs = docvecs or DocvecsArray(docvecs_mapfile) - self.comment = comment - if documents is not None: - self.build_vocab(documents, trim_rule=trim_rule) - self.train(documents, total_examples=self.corpus_count, epochs=self.iter) - - @property - def dm(self): - return not self.sg # opposite of SG - - @property - def dbow(self): - return self.sg # same as SG - - def clear_sims(self): - super(Doc2Vec, self).clear_sims() - self.docvecs.clear_sims() - - def reset_weights(self): - if self.dm and self.dm_concat: - # expand l1 size to match concatenated tags+words length - self.layer1_size = (self.dm_tag_count + (2 * self.window)) * self.vector_size - logger.info("using concatenative %d-dimensional layer1", self.layer1_size) - super(Doc2Vec, self).reset_weights() - self.docvecs.reset_weights(self) - - def reset_from(self, other_model): - """Reuse shareable structures from other_model.""" - self.docvecs.borrow_from(other_model.docvecs) - super(Doc2Vec, self).reset_from(other_model) - - def scan_vocab(self, documents, progress_per=10000, trim_rule=None, update=False): - logger.info("collecting all words and their counts") - document_no = -1 - total_words = 0 - min_reduce = 1 - interval_start = default_timer() - 0.00001 # guard against next sample being identical - interval_count = 0 - checked_string_types = 0 - vocab = defaultdict(int) - for document_no, document in enumerate(documents): - if not checked_string_types: - if isinstance(document.words, string_types): - logger.warning( 
- "Each 'words' should be a list of words (usually unicode strings). " - "First 'words' here is instead plain %s.", - type(document.words) - ) - checked_string_types += 1 - if document_no % progress_per == 0: - interval_rate = (total_words - interval_count) / (default_timer() - interval_start) - logger.info( - "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), len(self.docvecs) - ) - interval_start = default_timer() - interval_count = total_words - document_length = len(document.words) - - for tag in document.tags: - self.docvecs.note_doctag(tag, document_no, document_length) - - for word in document.words: - vocab[word] += 1 - total_words += len(document.words) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - logger.info( - "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(vocab), len(self.docvecs), document_no + 1, total_words - ) - self.corpus_count = document_no + 1 - self.raw_vocab = vocab - - def _do_train_job(self, job, alpha, inits): - work, neu1 = inits - tally = 0 - for doc in job: - indexed_doctags = self.docvecs.indexed_doctags(doc.tags) - doctag_indexes, doctag_vectors, doctag_locks, ignored = indexed_doctags - if self.sg: - tally += train_document_dbow( - self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - elif self.dm_concat: - tally += train_document_dm_concat( - self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - else: - tally += train_document_dm( - self, doc.words, doctag_indexes, alpha, work, neu1, - doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - self.docvecs.trained_item(indexed_doctags) - return tally, self._raw_word_count(job) - - def _raw_word_count(self, job): - """Return the number of words in a given job.""" - return sum(len(sentence.words) for sentence in job) - - def infer_vector(self, doc_words, alpha=0.1, min_alpha=0.0001, steps=5): - """ - Infer a vector for given post-bulk training document. - - Document should be a list of (word) tokens. 
- """ - doctag_vectors = empty((1, self.vector_size), dtype=REAL) - doctag_vectors[0] = self.seeded_vector(' '.join(doc_words)) - doctag_locks = ones(1, dtype=REAL) - doctag_indexes = [0] - - work = zeros(self.layer1_size, dtype=REAL) - if not self.sg: - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - - for i in range(steps): - if self.sg: - train_document_dbow( - self, doc_words, doctag_indexes, alpha, work, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - elif self.dm_concat: - train_document_dm_concat( - self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - else: - train_document_dm( - self, doc_words, doctag_indexes, alpha, work, neu1, - learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctag_locks=doctag_locks - ) - alpha = ((alpha - min_alpha) / (steps - i)) + min_alpha - - return doctag_vectors[0] - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings.""" - report = report or {} - report['doctag_lookup'] = self.docvecs.estimated_lookup_memory() - report['doctag_syn0'] = self.docvecs.count * self.vector_size * dtype(REAL).itemsize - return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - - def __str__(self): - """Abbreviated name reflecting major configuration paramaters.""" - segments = [] - if self.comment: - segments.append('"%s"' % self.comment) - if self.sg: - if self.dbow_words: - segments.append('dbow+w') # also training words - else: - segments.append('dbow') # PV-DBOW (skip-gram-style) - - else: # PV-DM... - if self.dm_concat: - segments.append('dm/c') # ...with concatenative context layer - else: - if self.cbow_mean: - segments.append('dm/m') - else: - segments.append('dm/s') - segments.append('d%d' % self.vector_size) # dimensions - if self.negative: - segments.append('n%d' % self.negative) # negative samples - if self.hs: - segments.append('hs') - if not self.sg or (self.sg and self.dbow_words): - segments.append('w%d' % self.window) # window size, when relevant - if self.min_count > 1: - segments.append('mc%d' % self.min_count) - if self.sample > 0: - segments.append('s%g' % self.sample) - if self.workers > 1: - segments.append('t%d' % self.workers) - return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) - - def delete_temporary_training_data(self, keep_doctags_vectors=True, keep_inference=True): - """ - Discard parameters that are used in training and score. Use if you're sure you're done training a model. - Set `keep_doctags_vectors` to False if you don't want to save doctags vectors, - in this case you can't to use docvecs's most_similar, similarity etc. methods. - Set `keep_inference` to False if you don't want to store parameters that is used for infer_vector method - """ - if not keep_inference: - self._minimize_model(False, False, False) - if self.docvecs and hasattr(self.docvecs, 'doctag_syn0') and not keep_doctags_vectors: - del self.docvecs.doctag_syn0 - if self.docvecs and hasattr(self.docvecs, 'doctag_syn0_lockf'): - del self.docvecs.doctag_syn0_lockf - - def save_word2vec_format(self, fname, doctag_vec=False, word_vec=True, prefix='*dt_', fvocab=None, binary=False): - """ - Store the input-hidden weight matrix. 
- - `fname` is the file used to save the vectors in - `doctag_vec` is an optional boolean indicating whether to store document vectors - `word_vec` is an optional boolean indicating whether to store word vectors - (if both doctag_vec and word_vec are True, then both vectors are stored in the same file) - `prefix` to uniquely identify doctags from word vocab, and avoid collision - in case of repeated string in doctag and word vocab - `fvocab` is an optional file used to save the vocabulary - `binary` is an optional boolean indicating whether the data is to be saved - in binary word2vec format (default: False) - - """ - total_vec = len(self.wv.vocab) + len(self.docvecs) - # save word vectors - if word_vec: - if not doctag_vec: - total_vec = len(self.wv.vocab) - KeyedVectors.save_word2vec_format(self.wv, fname, fvocab, binary, total_vec) - # save document vectors - if doctag_vec: - with utils.open(fname, 'ab') as fout: - if not word_vec: - total_vec = len(self.docvecs) - logger.info("storing %sx%s projection weights into %s", total_vec, self.vector_size, fname) - fout.write(utils.to_utf8("%s %s\n" % (total_vec, self.vector_size))) - # store as in input order - for i in range(len(self.docvecs)): - doctag = u"%s%s" % (prefix, self.docvecs.index_to_doctag(i)) - row = self.docvecs.doctag_syn0[i] - if binary: - fout.write(utils.to_utf8(doctag) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (doctag, ' '.join("%f" % val for val in row)))) - - -class TaggedBrownCorpus(object): - """Iterate over documents from the Brown corpus (part of NLTK data), yielding - each document out as a TaggedDocument object.""" - - def __init__(self, dirname): - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as f: - for item_no, line in enumerate(f): - line = utils.to_unicode(line) - # each file line is a single document in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty documents - continue - yield TaggedDocument(words, ['%s_SENT_%s' % (fname, item_no)]) - - -class TaggedLineDocument(object): - """Simple format: one document = one line = one TaggedDocument object. - - Words are expected to be already preprocessed and separated by whitespace, - tags are constructed automatically from the document line number.""" - - def __init__(self, source): - """ - `source` can be either a string (filename) or a file object. 
- - Example:: - - documents = TaggedLineDocument('myfile.txt') - - Or for compressed files:: - - documents = TaggedLineDocument('compressed_text.txt.bz2') - documents = TaggedLineDocument('compressed_text.txt.gz') - - """ - self.source = source - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for item_no, line in enumerate(self.source): - yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for item_no, line in enumerate(fin): - yield TaggedDocument(utils.to_unicode(line).split(), [item_no]) diff --git a/gensim/models/deprecated/fasttext.py b/gensim/models/deprecated/fasttext.py deleted file mode 100644 index 0d46b6f1cc..0000000000 --- a/gensim/models/deprecated/fasttext.py +++ /dev/null @@ -1,711 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# Authors: Chinmaya Pancholi , Shiva Manne -# Copyright (C) 2017 RaRe Technologies s.r.o. - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.fasttext` instead. - - -Learn word representations via fasttext's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_. - -Notes ------ -There are more ways to get word vectors in Gensim than just FastText. -See wrappers for VarEmbed and WordRank or Word2Vec - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words. - -For a tutorial on gensim's native fasttext, refer to the noteboook -- [2]_ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) fasttext training** - -.. [1] P. Bojanowski, E. Grave, A. Joulin, T. Mikolov - Enriching Word Vectors with Subword Information. In arXiv preprint arXiv:1607.04606. - https://arxiv.org/abs/1607.04606 - -.. 
[2] https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/FastText_Tutorial.ipynb - -""" - -import logging - -import numpy as np -from numpy import zeros, ones, vstack, sum as np_sum, empty, float32 as REAL - -from gensim.models.deprecated.word2vec import Word2Vec, train_sg_pair, train_cbow_pair -from gensim.models.deprecated.fasttext_wrapper import FastTextKeyedVectors -from gensim.models.deprecated.fasttext_wrapper import FastText as Ft_Wrapper, compute_ngrams, ft_hash -from gensim.models.fasttext import FastText as NewFastText - -logger = logging.getLogger(__name__) - -MAX_WORDS_IN_BATCH = 10000 - - -def load_old_fasttext(*args, **kwargs): - old_model = FastText.load(*args, **kwargs) - params = { - 'size': old_model.vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.sample, - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'sg': old_model.sg, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.hashfxn, - 'iter': old_model.iter, - 'null_word': old_model.null_word, - 'sorted_vocab': old_model.sorted_vocab, - 'batch_words': old_model.batch_words, - 'min_n': old_model.min_n, - 'max_n': old_model.max_n, - 'word_ngrams': old_model.word_ngrams, - 'bucket': old_model.bucket - } - new_model = NewFastText(**params) - # set trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - new_model.wv.vectors_vocab = old_model.wv.syn0_vocab - new_model.wv.vectors_ngrams = old_model.wv.syn0_ngrams - if hasattr(old_model.wv, 'syn0norm'): - new_model.wv.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - - if hasattr(old_model, 'syn0_vocab_lockf'): - new_model.trainables.vectors_vocab_lockf = old_model.syn0_vocab_lockf - if hasattr(old_model, 'syn0_ngrams_lockf'): - new_model.trainables.vectors_ngrams_lockf = old_model.syn0_ngrams_lockf - if hasattr(old_model.wv, 'syn0_vocab_norm'): - new_model.trainables.vectors_vocab_norm = old_model.wv.syn0_vocab_norm - if hasattr(old_model.wv, 'syn0_ngrams_norm'): - new_model.trainables.vectors_ngrams_norm = old_model.wv.syn0_ngrams_norm - - # set vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.cum_table - - new_model.wv.hash2index = old_model.wv.hash2index - - new_model.train_count = old_model.train_count - new_model.corpus_count = old_model.corpus_count - new_model.corpus_total_words = old_model.corpus_total_words - new_model.running_training_loss = old_model.running_training_loss - new_model.total_train_time = old_model.total_train_time - new_model.min_alpha_yet_reached = old_model.min_alpha_yet_reached - new_model.model_trimmed_post_training = old_model.model_trimmed_post_training - - new_model.trainables.num_ngram_vectors = old_model.num_ngram_vectors - - return new_model - - -def train_batch_cbow(model, sentences, alpha, work=None, neu1=None): - """Update CBOW model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. 
Called internally from :meth:`gensim.models.fasttext.FastText.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from fasttext_inner instead. - - Parameters - ---------- - model : :class:`~gensim.models.fasttext.FastText` - `FastText` instance. - sentences : iterable of iterables - Iterable of the sentences directly from disk/network. - alpha : float - Learning rate. - work : :class:`numpy.ndarray` - Private working memory for each worker. - neu1 : :class:`numpy.ndarray` - Private working memory for each worker. - - Returns - ------- - int - Effective number of words trained. - - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - - word2_subwords = [] - vocab_subwords_indices = [] - ngrams_subwords_indices = [] - - for index in word2_indices: - vocab_subwords_indices += [index] - word2_subwords += model.wv.ngrams_word[model.wv.index2word[index]] - - for subword in word2_subwords: - ngrams_subwords_indices.append(model.wv.ngrams[subword]) - - l1_vocab = np_sum(model.wv.syn0_vocab[vocab_subwords_indices], axis=0) # 1 x vector_size - l1_ngrams = np_sum(model.wv.syn0_ngrams[ngrams_subwords_indices], axis=0) # 1 x vector_size - - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) - subwords_indices = [vocab_subwords_indices] + [ngrams_subwords_indices] - if (subwords_indices[0] or subwords_indices[1]) and model.cbow_mean: - l1 /= (len(subwords_indices[0]) + len(subwords_indices[1])) - - # train on the sliding window for target word - train_cbow_pair(model, word, subwords_indices, l1, alpha, is_ft=True) - result += len(word_vocabs) - return result - - -def train_batch_sg(model, sentences, alpha, work=None, neu1=None): - """Update skip-gram model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from :meth:`gensim.models.fasttext.FastText.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from fasttext_inner instead. - - Parameters - ---------- - model : :class:`~gensim.models.fasttext.FastText` - `FastText` instance. - sentences : iterable of iterables - Iterable of the sentences directly from disk/network. - alpha : float - Learning rate. - work : :class:`numpy.ndarray` - Private working memory for each worker. - neu1 : :class:`numpy.ndarray` - Private working memory for each worker. - - Returns - ------- - int - Effective number of words trained. 
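At the user level, the subword machinery this batch routine relies on is what lets FastText return vectors for out-of-vocabulary words; a small sketch (the corpus is illustrative, and the lookup goes through `model.wv` as elsewhere in this patch):

.. sourcecode:: pycon

    >>> from gensim.models import FastText
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>> model = FastText(sentences, sg=1, min_count=1, min_n=3, max_n=6)
    >>> oov_vector = model.wv['saying']  # not in the vocab; assembled from char n-grams shared with 'say'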
- - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - model.window + reduced_window) - - subwords_indices = [word.index] - word2_subwords = model.wv.ngrams_word[model.wv.index2word[word.index]] - - for subword in word2_subwords: - subwords_indices.append(model.wv.ngrams[subword]) - - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): - if pos2 != pos: # don't train on the `word` itself - train_sg_pair(model, model.wv.index2word[word2.index], subwords_indices, alpha, is_ft=True) - - result += len(word_vocabs) - return result - - -class FastText(Word2Vec): - """Class for training, using and evaluating word representations learned using method - described in [1]_ aka Fasttext. - - The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save()` and - :meth:`~gensim.models.fasttext.FastText.load()` methods, or loaded in a format compatible with the original - fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format()`. - - """ - def __init__( - self, sentences=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, - bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH): - """Initialize the model from an iterable of `sentences`. Each sentence is a - list of words (unicode strings) that will be used for training. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - If you don't supply `sentences`, the model is left uninitialized -- use if you plan to initialize it - in some other way. - sg : int {1, 0} - Defines the training algorithm. If 1, skip-gram is used, otherwise, CBOW is employed. - size : int - Dimensionality of the feature vectors. - window : int - The maximum distance between the current and predicted word within a sentence. - alpha : float - The initial learning rate. - min_alpha : float - Learning rate will linearly drop to `min_alpha` as training progresses. - seed : int - Seed for the random number generator. Initial vectors for each word are seeded with a hash of - the concatenation of word + `str(seed)`. Note that for a fully deterministically-reproducible run, - you must also limit the model to a single worker thread (`workers=1`), to eliminate ordering jitter - from OS thread scheduling. (In Python 3, reproducibility between interpreter launches also requires - use of the `PYTHONHASHSEED` environment variable to control hash randomization). - min_count : int - Ignores all words with total frequency lower than this. 
- max_vocab_size : int - Limits the RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types need about 1GB of RAM. - Set to `None` for no limit. - sample : float - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - workers : int - Use these many worker threads to train the model (=faster training with multicore machines). - hs : int {1,0} - If 1, hierarchical softmax will be used for model training. - If set to 0, and `negative` is non-zero, negative sampling will be used. - negative : int - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" - should be drawn (usually between 5-20). - If set to 0, no negative sampling is used. - cbow_mean : int {1,0} - If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. - hashfxn : function - Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int - Number of iterations (epochs) over the corpus. - trim_rule : function - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - sorted_vocab : int {1,0} - If 1, sort the vocabulary by descending frequency before assigning word indexes. - batch_words : int - Target size (in words) for batches of examples passed to worker threads (and - thus cython routines).(Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - min_n : int - Min length of char ngrams to be used for training word representations. - max_n : int - Max length of char ngrams to be used for training word representations. Set `max_n` to be - lesser than `min_n` to avoid char ngrams being used. - word_ngrams : int {1,0} - If 1, uses enriches word vectors with subword(ngrams) information. - If 0, this is equivalent to word2vec. - bucket : int - Character ngrams are hashed into a fixed number of buckets, in order to limit the - memory usage of the model. This option specifies the number of buckets used by the model. - - Examples - -------- - Initialize and train a `FastText` model - - .. 
sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(sentences, min_count=1) - >>> say_vector = model['say'] # get vector for word - >>> of_vector = model['of'] # get vector for out-of-vocab word - - """ - # fastText specific params - self.bucket = bucket - self.word_ngrams = word_ngrams - self.min_n = min_n - self.max_n = max_n - if self.word_ngrams <= 1 and self.max_n == 0: - self.bucket = 0 - - super(FastText, self).__init__( - sentences=sentences, size=size, alpha=alpha, window=window, min_count=min_count, - max_vocab_size=max_vocab_size, sample=sample, seed=seed, workers=workers, min_alpha=min_alpha, - sg=sg, hs=hs, negative=negative, cbow_mean=cbow_mean, hashfxn=hashfxn, iter=iter, null_word=null_word, - trim_rule=trim_rule, sorted_vocab=sorted_vocab, batch_words=batch_words) - - def initialize_word_vectors(self): - """Initializes FastTextKeyedVectors instance to store all vocab/ngram vectors for the model.""" - self.wv = FastTextKeyedVectors() - self.wv.min_n = self.min_n - self.wv.max_n = self.max_n - - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - """Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - keep_raw_vocab : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. - trim_rule : function - Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, - be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), - or a callable that accepts parameters (word, count, min_count) and returns either - :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - progress_per : int - Indicates how many words to process before showing/updating the progress. - update: bool - If true, the new words in `sentences` will be added to model's vocab. - - Example - ------- - Train a model and update vocab for online training - - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences_1 = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> sentences_2 = [["dude", "say", "wazzup!"]] - >>> - >>> model = FastText(min_count=1) - >>> model.build_vocab(sentences_1) - >>> model.train(sentences_1, total_examples=model.corpus_count, epochs=model.iter) - >>> model.build_vocab(sentences_2, update=True) - >>> model.train(sentences_2, total_examples=model.corpus_count, epochs=model.iter) - - """ - if update: - if not len(self.wv.vocab): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. 
" - "First build the vocabulary of your model with a corpus " - "before doing an online update.") - self.old_vocab_len = len(self.wv.vocab) - self.old_hash2index_len = len(self.wv.hash2index) - - super(FastText, self).build_vocab( - sentences, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, progress_per=progress_per, update=update) - self.init_ngrams(update=update) - - def init_ngrams(self, update=False): - """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. - - Parameters - ---------- - update : bool - If True, the new vocab words and their new ngrams word vectors are initialized - with random uniform distribution and updated/added to the existing vocab word and ngram vectors. - - """ - if not update: - self.wv.ngrams = {} - self.wv.syn0_vocab = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) - self.syn0_vocab_lockf = ones((len(self.wv.vocab), self.vector_size), dtype=REAL) - - self.wv.syn0_ngrams = empty((self.bucket, self.vector_size), dtype=REAL) - self.syn0_ngrams_lockf = ones((self.bucket, self.vector_size), dtype=REAL) - - all_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) - all_ngrams += self.wv.ngrams_word[w] - - all_ngrams = list(set(all_ngrams)) - self.num_ngram_vectors = len(all_ngrams) - logger.info("Total number of ngrams is %d", len(all_ngrams)) - - self.wv.hash2index = {} - ngram_indices = [] - new_hash_count = 0 - for i, ngram in enumerate(all_ngrams): - ngram_hash = ft_hash(ngram) % self.bucket - if ngram_hash in self.wv.hash2index: - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - else: - ngram_indices.append(ngram_hash % self.bucket) - self.wv.hash2index[ngram_hash] = new_hash_count - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - new_hash_count = new_hash_count + 1 - - self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) - self.syn0_ngrams_lockf = self.syn0_ngrams_lockf.take(ngram_indices, axis=0) - self.reset_ngram_weights() - else: - new_ngrams = [] - for w, v in self.wv.vocab.items(): - self.wv.ngrams_word[w] = compute_ngrams(w, self.min_n, self.max_n) - new_ngrams += [ng for ng in self.wv.ngrams_word[w] if ng not in self.wv.ngrams] - - new_ngrams = list(set(new_ngrams)) - logger.info("Number of new ngrams is %d", len(new_ngrams)) - new_hash_count = 0 - for i, ngram in enumerate(new_ngrams): - ngram_hash = ft_hash(ngram) % self.bucket - if ngram_hash not in self.wv.hash2index: - self.wv.hash2index[ngram_hash] = new_hash_count + self.old_hash2index_len - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - new_hash_count = new_hash_count + 1 - else: - self.wv.ngrams[ngram] = self.wv.hash2index[ngram_hash] - - rand_obj = np.random - rand_obj.seed(self.seed) - new_vocab_rows = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, - (len(self.wv.vocab) - self.old_vocab_len, self.vector_size) - ).astype(REAL) - new_vocab_lockf_rows = ones((len(self.wv.vocab) - self.old_vocab_len, self.vector_size), dtype=REAL) - new_ngram_rows = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, - (len(self.wv.hash2index) - self.old_hash2index_len, self.vector_size) - ).astype(REAL) - new_ngram_lockf_rows = ones( - (len(self.wv.hash2index) - self.old_hash2index_len, - self.vector_size), - dtype=REAL) - - self.wv.syn0_vocab = vstack([self.wv.syn0_vocab, new_vocab_rows]) - self.syn0_vocab_lockf = 
vstack([self.syn0_vocab_lockf, new_vocab_lockf_rows]) - self.wv.syn0_ngrams = vstack([self.wv.syn0_ngrams, new_ngram_rows]) - self.syn0_ngrams_lockf = vstack([self.syn0_ngrams_lockf, new_ngram_lockf_rows]) - - def reset_ngram_weights(self): - """Reset all projection weights to an initial (untrained) state, - but keep the existing vocabulary and their ngrams. - - """ - rand_obj = np.random - rand_obj.seed(self.seed) - for index in range(len(self.wv.vocab)): - self.wv.syn0_vocab[index] = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size - ).astype(REAL) - for index in range(len(self.wv.hash2index)): - self.wv.syn0_ngrams[index] = rand_obj.uniform( - -1.0 / self.vector_size, 1.0 / self.vector_size, self.vector_size - ).astype(REAL) - - def _do_train_job(self, sentences, alpha, inits): - """Train a single batch of sentences. Return 2-tuple `(effective word count after - ignoring unknown words and sentence length trimming, total word count)`. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - alpha : float - The current learning rate. - inits : (:class:`numpy.ndarray`, :class:`numpy.ndarray`) - Each worker's private work memory. - - Returns - ------- - (int, int) - Tuple of (effective word count after ignoring unknown words and sentence length trimming, total word count) - - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, neu1) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1) - - return tally, self._raw_word_count(sentences) - - def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0): - """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). - For FastText, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate - progress-percentage logging, either total_examples (count of sentences) or total_words (count of - raw words in sentences) **MUST** be provided (if the corpus is the same as was provided to - :meth:`~gensim.models.fasttext.FastText.build_vocab()`, the count of examples in that corpus - will be available in the model's :attr:`corpus_count` property). - - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument **MUST** be provided. In the common and recommended case, - where :meth:`~gensim.models.fasttext.FastText.train()` is only called once, - the model's cached `iter` value should be supplied as `epochs` value. - - Parameters - ---------- - sentences : iterable of iterables - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. 
- See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - total_examples : int - Count of sentences. - total_words : int - Count of raw words in sentences. - epochs : int - Number of iterations (epochs) over the corpus. - start_alpha : float - Initial learning rate. - end_alpha : float - Final learning rate. Drops linearly from `start_alpha`. - word_count : int - Count of words already trained. Set this to 0 for the usual - case of training on all words in sentences. - queue_factor : int - Multiplier for size of queue (number of workers * queue_factor). - report_delay : float - Seconds to wait before reporting progress. - - Examples - -------- - - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(min_count=1) - >>> model.build_vocab(sentences) - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) - - """ - self.neg_labels = [] - if self.negative > 0: - # precompute negative labels optimization for pure-python training - self.neg_labels = zeros(self.negative + 1) - self.neg_labels[0] = 1. - - Word2Vec.train( - self, sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha) - self.get_vocab_word_vecs() - - def __getitem__(self, word): - """Get `word` representations in vector space, as a 1D numpy array. - - Parameters - ---------- - word : str - A single word whose vector needs to be returned. - - Returns - ------- - :class:`numpy.ndarray` - The word's representations in vector space, as a 1D numpy array. - - Raises - ------ - KeyError - For words with all ngrams absent, a KeyError is raised. - - Example - ------- - .. sourcecode:: pycon - - >>> from gensim.models import FastText - >>> from gensim.test.utils import datapath - >>> - >>> trained_model = FastText.load_fasttext_format(datapath('lee_fasttext')) - >>> meow_vector = trained_model['hello'] # get vector for word - - """ - return self.word_vec(word) - - def get_vocab_word_vecs(self): - """Calculate vectors for words in vocabulary and stores them in `wv.syn0`.""" - for w, v in self.wv.vocab.items(): - word_vec = np.copy(self.wv.syn0_vocab[v.index]) - ngrams = self.wv.ngrams_word[w] - ngram_weights = self.wv.syn0_ngrams - for ngram in ngrams: - word_vec += ngram_weights[self.wv.ngrams[ngram]] - word_vec /= (len(ngrams) + 1) - self.wv.syn0[v.index] = word_vec - - def word_vec(self, word, use_norm=False): - """Get the word's representations in vector space, as a 1D numpy array. - - Parameters - ---------- - word : str - A single word whose vector needs to be returned. - use_norm : bool - If True, returns normalized vector. - - Returns - ------- - :class:`numpy.ndarray` - The word's representations in vector space, as a 1D numpy array. - - Raises - ------ - KeyError - For words with all ngrams absent, a KeyError is raised. - - Example - ------- - .. 
sourcecode:: pycon - - >>> from gensim.models import FastText - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = FastText(sentences, min_count=1) - >>> meow_vector = model.word_vec('meow') # get vector for word - - """ - return FastTextKeyedVectors.word_vec(self.wv, word, use_norm=use_norm) - - @classmethod - def load_fasttext_format(cls, *args, **kwargs): - """Load a :class:`~gensim.models.fasttext.FastText` model from a format compatible with - the original fasttext implementation. - - Parameters - ---------- - fname : str - Path to the file. - - """ - return Ft_Wrapper.load_fasttext_format(*args, **kwargs) - - def save(self, *args, **kwargs): - """Save the model. This saved model can be loaded again using :func:`~gensim.models.fasttext.FastText.load`, - which supports online training and getting vectors for out-of-vocabulary words. - - Parameters - ---------- - fname : str - Path to the file. - - """ - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastText, self).save(*args, **kwargs) diff --git a/gensim/models/deprecated/fasttext_wrapper.py b/gensim/models/deprecated/fasttext_wrapper.py deleted file mode 100644 index 727db0e1e0..0000000000 --- a/gensim/models/deprecated/fasttext_wrapper.py +++ /dev/null @@ -1,461 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Jayant Jain -# Copyright (C) 2017 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.2.0 - Use :mod:`gensim.models.fasttext` instead. - - -Python wrapper around word representation learning from FastText, a library for efficient learning -of word representations and sentence classification [1]. - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words, using the fastText C implementation. - -The wrapped model can NOT be updated with new documents for online training -- use gensim's -`Word2Vec` for that. - -Example: -.. sourcecode:: pycon - - >>> from gensim.models.wrappers import FastText - >>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') - >>> print model['forests'] # prints vector for given out-of-vocabulary word - -.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information - - - -""" - - -import logging -import tempfile -import os -import struct - -import numpy as np -from numpy import float32 as REAL, sqrt, newaxis -from gensim import utils -from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab -from gensim.models.deprecated.word2vec import Word2Vec - -logger = logging.getLogger(__name__) - -try: - FileNotFoundError -except NameError: - FileNotFoundError = IOError - -FASTTEXT_FILEFORMAT_MAGIC = 793712314 - - -class FastTextKeyedVectors(KeyedVectors): - """ - Class to contain vectors, vocab and ngrams for the FastText training class and other methods not directly - involved in training such as most_similar(). 
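`get_vocab_word_vecs` above composes each in-vocabulary word vector from the word's own `syn0_vocab` row plus the rows of all of its ngrams, divided by the number of ngrams plus one. A toy numpy sketch of just that arithmetic (array names and values are illustrative, not from a real model):

.. sourcecode:: pycon

    >>> import numpy as np
    >>> vocab_row = np.array([1., 1., 1.], dtype=np.float32)        # the word's own syn0_vocab row
    >>> ngram_rows = np.array([[2., 2., 2.],                        # rows for the word's two ngrams
    ...                        [3., 3., 3.]], dtype=np.float32)
    >>> (vocab_row + ngram_rows.sum(axis=0)) / (len(ngram_rows) + 1)
    array([2., 2., 2.], dtype=float32)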
- Subclasses KeyedVectors to implement oov lookups, storing ngrams and other FastText specific methods - - """ - - def __init__(self): - super(FastTextKeyedVectors, self).__init__() - self.syn0_vocab = None - self.syn0_vocab_norm = None - self.syn0_ngrams = None - self.syn0_ngrams_norm = None - self.ngrams = {} - self.hash2index = {} - self.ngrams_word = {} - self.min_n = 0 - self.max_n = 0 - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastTextKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - The word can be out-of-vocabulary as long as ngrams for the word are present. - For words with all ngrams absent, a KeyError is raised. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - return super(FastTextKeyedVectors, self).word_vec(word, use_norm) - else: - word_vec = np.zeros(self.syn0_ngrams.shape[1], dtype=np.float32) - ngrams = compute_ngrams(word, self.min_n, self.max_n) - ngrams = [ng for ng in ngrams if ng in self.ngrams] - if use_norm: - ngram_weights = self.syn0_ngrams_norm - else: - ngram_weights = self.syn0_ngrams - for ngram in ngrams: - word_vec += ngram_weights[self.ngrams[ngram]] - if word_vec.any(): - return word_vec / len(ngrams) - else: # No ngrams of the word are present in self.ngrams - raise KeyError('all ngrams for word %s absent from model' % word) - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training** after doing a replace. The model becomes - effectively read-only = you can only call `most_similar`, `similarity` etc. - - """ - super(FastTextKeyedVectors, self).init_sims(replace) - if getattr(self, 'syn0_ngrams_norm', None) is None or replace: - logger.info("precomputing L2-norms of ngram weight vectors") - if replace: - for i in range(self.syn0_ngrams.shape[0]): - self.syn0_ngrams[i, :] /= sqrt((self.syn0_ngrams[i, :] ** 2).sum(-1)) - self.syn0_ngrams_norm = self.syn0_ngrams - else: - self.syn0_ngrams_norm = \ - (self.syn0_ngrams / sqrt((self.syn0_ngrams ** 2).sum(-1))[..., newaxis]).astype(REAL) - - def __contains__(self, word): - """ - Check if `word` or any character ngrams in `word` are present in the vocabulary. - A vector for the word is guaranteed to exist if `__contains__` returns True. - """ - if word in self.vocab: - return True - else: - char_ngrams = compute_ngrams(word, self.min_n, self.max_n) - return any(ng in self.ngrams for ng in char_ngrams) - - @classmethod - def load_word2vec_format(cls, *args, **kwargs): - """Not suppported. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" - raise NotImplementedError("Not supported. Use gensim.models.KeyedVectors.load_word2vec_format instead.") - - -class FastText(Word2Vec): - """ - Class for word vector training using FastText. Communication between FastText and Python - takes place by working with data files on disk and calling the FastText binary with - subprocess.call(). 
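The out-of-vocabulary branch of `word_vec` above sums the rows of whichever ngrams of the query word are known and divides by their count. A standalone numpy sketch of that averaging, with a toy ngram table whose contents are purely illustrative:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> syn0_ngrams = np.arange(12, dtype=np.float32).reshape(4, 3)   # four ngram vectors of size 3
    >>> ngram_index = {'<wo': 0, 'woo': 1, 'oof': 2}                  # ngram -> row, like wv.ngrams
    >>> known = ['<wo', 'woo']           # ngrams of the OOV query word that are present in the table
    >>> vec = np.zeros(3, dtype=np.float32)
    >>> for ng in known:
    ...     vec += syn0_ngrams[ngram_index[ng]]
    >>> vec / len(known)                 # the vector returned for the OOV word
    array([1.5, 2.5, 3.5], dtype=float32)

When none of the word's ngrams are known, the real method raises `KeyError` instead of returning a vector.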
- Implements functionality similar to [fasttext.py](https://github.com/salestock/fastText.py), - improving speed and scope of functionality like `most_similar`, `similarity` by extracting vectors - into numpy matrix. - - Warnings - -------- - .. deprecated:: 3.2.0 - Use :class:`gensim.models.fasttext.FastText` instead of :class:`gensim.models.wrappers.fasttext.FastText`. - - - """ - - def initialize_word_vectors(self): - self.wv = FastTextKeyedVectors() - - @classmethod - def train(cls, ft_path, corpus_file, output_file=None, model='cbow', size=100, alpha=0.025, window=5, min_count=5, - word_ngrams=1, loss='ns', sample=1e-3, negative=5, iter=5, min_n=3, max_n=6, sorted_vocab=1, threads=12): - """ - `ft_path` is the path to the FastText executable, e.g. `/home/kofola/fastText/fasttext`. - - `corpus_file` is the filename of the text file to be used for training the FastText model. - Expects file to contain utf-8 encoded text. - - `model` defines the training algorithm. By default, cbow is used. Accepted values are - 'cbow', 'skipgram'. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the current and predicted word within a sentence. - - `alpha` is the initial learning rate. - - `min_count` = ignore all words with total occurrences lower than this. - - `word_ngram` = max length of word ngram - - `loss` = defines training objective. Allowed values are `hs` (hierarchical softmax), - `ns` (negative sampling) and `softmax`. Defaults to `ns` - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, useful range is (0, 1e-5). - - `negative` = the value for negative specifies how many "noise words" should be drawn - (usually between 5-20). Default is 5. If set to 0, no negative samping is used. - Only relevant when `loss` is set to `ns` - - `iter` = number of iterations (epochs) over the corpus. Default is 5. - - `min_n` = min length of char ngrams to be used for training word representations. Default is 3. - - `max_n` = max length of char ngrams to be used for training word representations. Set `max_n` to be - lesser than `min_n` to avoid char ngrams being used. Default is 6. - - `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before - assigning word indexes. - - `threads` = number of threads to use. Default is 12. - - """ - ft_path = ft_path - output_file = output_file or os.path.join(tempfile.gettempdir(), 'ft_model') - ft_args = { - 'input': corpus_file, - 'output': output_file, - 'lr': alpha, - 'dim': size, - 'ws': window, - 'epoch': iter, - 'minCount': min_count, - 'wordNgrams': word_ngrams, - 'neg': negative, - 'loss': loss, - 'minn': min_n, - 'maxn': max_n, - 'thread': threads, - 't': sample - } - cmd = [ft_path, model] - for option, value in ft_args.items(): - cmd.append("-%s" % option) - cmd.append(str(value)) - - utils.check_output(args=cmd) - model = cls.load_fasttext_format(output_file) - cls.delete_training_files(output_file) - return model - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'syn0_vocab_norm', 'syn0_ngrams_norm']) - super(FastText, self).save(*args, **kwargs) - - @classmethod - def load_fasttext_format(cls, model_file, encoding='utf8'): - """ - Load the input-hidden weight matrix from the fast text output files. 
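The `train` classmethod above launches the fastText binary with a flat `-option value` argument list built from `ft_args`. A standalone sketch of that command assembly; the binary path and parameter values are illustrative only:

.. sourcecode:: pycon

    >>> ft_args = {'input': 'text8', 'output': '/tmp/ft_model', 'dim': 100, 'ws': 5, 'epoch': 5}
    >>> cmd = ['/home/user/fastText/fasttext', 'skipgram']
    >>> for option, value in ft_args.items():
    ...     cmd.extend(['-%s' % option, str(value)])
    >>> ' '.join(cmd)
    '/home/user/fastText/fasttext skipgram -input text8 -output /tmp/ft_model -dim 100 -ws 5 -epoch 5'

The real method then runs the command through `utils.check_output` and loads the resulting `.bin` file via `load_fasttext_format`.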
- - Note that due to limitations in the FastText API, you cannot continue training - with a model loaded this way, though you can query for word similarity etc. - - `model_file` is the path to the FastText output files. - FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` - - Expected value for this example: `/path/to/model` or `/path/to/model.bin`, - as gensim requires only `.bin` file to load entire fastText model. - - """ - model = cls() - if not model_file.endswith('.bin'): - model_file += '.bin' - model.file_name = model_file - model.load_binary_data(encoding=encoding) - return model - - @classmethod - def load(cls, *args, **kwargs): - model = super(FastText, cls).load(*args, **kwargs) - if hasattr(model.wv, 'syn0_all'): - setattr(model.wv, 'syn0_ngrams', model.wv.syn0_all) - delattr(model.wv, 'syn0_all') - return model - - @classmethod - def delete_training_files(cls, model_file): - """Deletes the files created by FastText training""" - try: - os.remove('%s.vec' % model_file) - os.remove('%s.bin' % model_file) - except FileNotFoundError: - logger.debug('Training files %s not found when attempting to delete', model_file) - pass - - def load_binary_data(self, encoding='utf8'): - """Loads data from the output binary file created by FastText training""" - - # TODO use smart_open again when https://github.com/RaRe-Technologies/smart_open/issues/207 will be fixed - with open(self.file_name, 'rb') as f: - self.load_model_params(f) - self.load_dict(f, encoding=encoding) - self.load_vectors(f) - - def load_model_params(self, file_handle): - magic, version = self.struct_unpack(file_handle, '@2i') - if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format - self.new_format = True - dim, ws, epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = \ - self.struct_unpack(file_handle, '@12i1d') - else: # older format - self.new_format = False - dim = magic - ws = version - epoch, min_count, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d') - # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc) - self.vector_size = dim - self.window = ws - self.iter = epoch - self.min_count = min_count - self.negative = neg - self.hs = loss == 1 - self.sg = model == 2 - self.bucket = bucket - self.wv.min_n = minn - self.wv.max_n = maxn - self.sample = t - - def load_dict(self, file_handle, encoding='utf8'): - vocab_size, nwords, nlabels = self.struct_unpack(file_handle, '@3i') - # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) - if nlabels > 0: - raise NotImplementedError("Supervised fastText models are not supported") - logger.info("loading %s words for fastText model from %s", vocab_size, self.file_name) - - self.struct_unpack(file_handle, '@1q') # number of tokens - if self.new_format: - pruneidx_size, = self.struct_unpack(file_handle, '@q') - for i in range(vocab_size): - word_bytes = b'' - char_byte = file_handle.read(1) - # Read vocab word - while char_byte != b'\x00': - word_bytes += char_byte - char_byte = file_handle.read(1) - word = word_bytes.decode(encoding) - count, _ = self.struct_unpack(file_handle, '@qb') - - self.wv.vocab[word] = Vocab(index=i, count=count) - self.wv.index2word.append(word) - - assert len(self.wv.vocab) == nwords, ( - 'mismatch between final vocab size ({} words), ' - 'and expected number of words ({} words)'.format(len(self.wv.vocab), nwords)) - if len(self.wv.vocab) != vocab_size: - # 
expecting to log this warning only for pretrained french vector, wiki.fr - logger.warning( - "mismatch between final vocab size (%s words), and expected vocab size (%s words)", - len(self.wv.vocab), vocab_size - ) - - if self.new_format: - for j in range(pruneidx_size): - self.struct_unpack(file_handle, '@2i') - - def load_vectors(self, file_handle): - if self.new_format: - self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc - num_vectors, dim = self.struct_unpack(file_handle, '@2q') - # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc) - assert self.vector_size == dim, ( - 'mismatch between vector size in model params ({}) and model vectors ({})' - .format(self.vector_size, dim) - ) - float_size = struct.calcsize('@f') - if float_size == 4: - dtype = np.dtype(np.float32) - elif float_size == 8: - dtype = np.dtype(np.float64) - - self.num_original_vectors = num_vectors - self.wv.syn0_ngrams = np.fromfile(file_handle, dtype=dtype, count=num_vectors * dim) - self.wv.syn0_ngrams = self.wv.syn0_ngrams.reshape((num_vectors, dim)) - assert self.wv.syn0_ngrams.shape == (self.bucket + len(self.wv.vocab), self.vector_size), \ - 'mismatch between actual weight matrix shape {} and expected shape {}'\ - .format( - self.wv.syn0_ngrams.shape, (self.bucket + len(self.wv.vocab), self.vector_size) - ) - - self.init_ngrams() - - def struct_unpack(self, file_handle, fmt): - num_bytes = struct.calcsize(fmt) - return struct.unpack(fmt, file_handle.read(num_bytes)) - - def init_ngrams(self): - """ - Computes ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. These - vectors are discarded here to save space. - - """ - self.wv.ngrams = {} - all_ngrams = [] - self.wv.syn0 = np.zeros((len(self.wv.vocab), self.vector_size), dtype=REAL) - - for w, vocab in self.wv.vocab.items(): - all_ngrams += compute_ngrams(w, self.wv.min_n, self.wv.max_n) - self.wv.syn0[vocab.index] += np.array(self.wv.syn0_ngrams[vocab.index]) - - all_ngrams = set(all_ngrams) - self.num_ngram_vectors = len(all_ngrams) - ngram_indices = [] - for i, ngram in enumerate(all_ngrams): - ngram_hash = ft_hash(ngram) - ngram_indices.append(len(self.wv.vocab) + ngram_hash % self.bucket) - self.wv.ngrams[ngram] = i - self.wv.syn0_ngrams = self.wv.syn0_ngrams.take(ngram_indices, axis=0) - - ngram_weights = self.wv.syn0_ngrams - - logger.info( - "loading weights for %s words for fastText model from %s", - len(self.wv.vocab), self.file_name - ) - - for w, vocab in self.wv.vocab.items(): - word_ngrams = compute_ngrams(w, self.wv.min_n, self.wv.max_n) - for word_ngram in word_ngrams: - self.wv.syn0[vocab.index] += np.array(ngram_weights[self.wv.ngrams[word_ngram]]) - - self.wv.syn0[vocab.index] /= (len(word_ngrams) + 1) - logger.info( - "loaded %s weight matrix for fastText model from %s", - self.wv.syn0.shape, self.file_name - ) - - -def compute_ngrams(word, min_n, max_n): - BOW, EOW = ('<', '>') # Used by FastText to attach to all words as prefix and suffix - extended_word = BOW + word + EOW - ngrams = [] - for ngram_length in range(min_n, min(len(extended_word), max_n) + 1): - for i in range(0, len(extended_word) - ngram_length + 1): - ngrams.append(extended_word[i:i + ngram_length]) - return ngrams - - -def ft_hash(string): - """ - Reproduces [hash method](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc) - used in fastText. 
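`compute_ngrams` above is self-contained enough to replay by hand; a sketch that restates the same logic and shows the ngrams produced for a short word with the default `min_n=3, max_n=6`:

.. sourcecode:: pycon

    >>> def compute_ngrams(word, min_n, max_n):      # same logic as the helper above
    ...     extended_word = '<' + word + '>'         # fastText pads every word with BOW/EOW markers
    ...     ngrams = []
    ...     for ngram_length in range(min_n, min(len(extended_word), max_n) + 1):
    ...         for i in range(0, len(extended_word) - ngram_length + 1):
    ...             ngrams.append(extended_word[i:i + ngram_length])
    ...     return ngrams
    ...
    >>> compute_ngrams('cat', 3, 6)
    ['<ca', 'cat', 'at>', '<cat', 'cat>', '<cat>']

Each of these strings is then passed through `ft_hash` and taken modulo `bucket` to select the ngram's vector row.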
- - """ - # Runtime warnings for integer overflow are raised, this is expected behaviour. These warnings are suppressed. - old_settings = np.seterr(all='ignore') - h = np.uint32(2166136261) - for c in string: - h = h ^ np.uint32(ord(c)) - h = h * np.uint32(16777619) - np.seterr(**old_settings) - return h diff --git a/gensim/models/deprecated/keyedvectors.py b/gensim/models/deprecated/keyedvectors.py deleted file mode 100644 index a8983909d0..0000000000 --- a/gensim/models/deprecated/keyedvectors.py +++ /dev/null @@ -1,1115 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2016 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.keyedvectors` instead. - - -Word vector storage and similarity look-ups. -Common code independent of the way the vectors are trained(Word2Vec, FastText, WordRank, VarEmbed etc) - -The word vectors are considered read-only in this class. - -Initialize the vectors by training e.g. Word2Vec: - -.. sourcecode:: pycon - - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - >>> word_vectors = model.wv - -Persist the word vectors to disk with: - -.. sourcecode:: pycon - - >>> word_vectors.save(fname) - >>> word_vectors = KeyedVectors.load(fname) - -The vectors can also be instantiated from an existing file on disk -in the original Google's word2vec C format as a KeyedVectors instance: - -.. sourcecode:: pycon - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - -You can perform various syntactic/semantic NLP word tasks with the vectors. Some of them -are already built-in: - -.. sourcecode:: pycon - - >>> word_vectors.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - >>> word_vectors.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] - - >>> word_vectors.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - >>> word_vectors.similarity('woman', 'man') - 0.73723527 - -Correlation with human opinion on word similarity: - -.. sourcecode:: pycon - - >>> word_vectors.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 - -And on analogies: - -.. sourcecode:: pycon - - >>> word_vectors.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - -and so on. - -""" -from __future__ import division # py3 "true division" - -import logging - -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty # noqa:F401 - -# If pyemd C extension is available, import it. 
-# If pyemd is attempted to be used, but isn't installed, ImportError will be raised in wmdistance -try: - from pyemd import emd - PYEMD_EXT = True -except (ImportError, ValueError): - PYEMD_EXT = False - -from numpy import dot, zeros, dtype, float32 as REAL,\ - double, array, vstack, fromstring, sqrt, newaxis,\ - ndarray, sum as np_sum, prod, ascontiguousarray,\ - argmax -import numpy as np - -from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.corpora.dictionary import Dictionary -from six import string_types, iteritems -from six.moves import range -from scipy import stats - - -logger = logging.getLogger(__name__) - - -class Vocab(object): - """ - A single vocabulary item, used internally for collecting per-word frequency/sampling info, - and for constructing binary trees (incl. both word leaves and inner nodes). - - """ - - def __init__(self, **kwargs): - self.count = 0 - self.__dict__.update(kwargs) - - def __lt__(self, other): # used for sorting in a priority queue - return self.count < other.count - - def __str__(self): - vals = ['%s:%r' % (key, self.__dict__[key]) for key in sorted(self.__dict__) if not key.startswith('_')] - return "%s(%s)" % (self.__class__.__name__, ', '.join(vals)) - - -class KeyedVectorsBase(utils.SaveLoad): - """ - Base class to contain vectors and vocab for any set of vectors which are each associated with a key. - - """ - - def __init__(self): - self.syn0 = [] - self.vocab = {} - self.index2word = [] - self.vector_size = None - - def save_word2vec_format(self, fname, fvocab=None, binary=False, total_vec=None): - """ - Store the input-hidden weight matrix in the same format used by the original - C word2vec-tool, for compatibility. - - `fname` is the file used to save the vectors in - `fvocab` is an optional file used to save the vocabulary - `binary` is an optional boolean indicating whether the data is to be saved - in binary word2vec format (default: False) - `total_vec` is an optional parameter to explicitly specify total no. of vectors - (in case word vectors are appended with document vectors afterwards) - - """ - if total_vec is None: - total_vec = len(self.vocab) - vector_size = self.syn0.shape[1] - if fvocab is not None: - logger.info("storing vocabulary in %s", fvocab) - with utils.open(fvocab, 'wb') as vout: - for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): - vout.write(utils.to_utf8("%s %s\n" % (word, vocab.count))) - logger.info("storing %sx%s projection weights into %s", total_vec, vector_size, fname) - assert (len(self.vocab), vector_size) == self.syn0.shape - with utils.open(fname, 'wb') as fout: - fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size))) - # store in sorted order: most frequent words at the top - for word, vocab in sorted(iteritems(self.vocab), key=lambda item: -item[1].count): - row = self.syn0[vocab.index] - if binary: - fout.write(utils.to_utf8(word) + b" " + row.tostring()) - else: - fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join("%f" % val for val in row)))) - - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """ - Load the input-hidden weight matrix from the original C word2vec-tool format. - - Note that the information stored in the file is incomplete (the binary tree is missing), - so while you can query for word similarity etc., you cannot continue training - with a model loaded this way. 
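`Vocab.__lt__` above orders entries by `count`, which is what allows them to sit directly in a priority queue, as the inline comment notes. A small self-contained sketch of that ordering:

.. sourcecode:: pycon

    >>> import heapq
    >>> class Vocab(object):                 # same tiny attribute bag as defined above
    ...     def __init__(self, **kwargs):
    ...         self.count = 0
    ...         self.__dict__.update(kwargs)
    ...     def __lt__(self, other):         # used for sorting in a priority queue
    ...         return self.count < other.count
    ...
    >>> heap = [Vocab(word='cat', count=10), Vocab(word='dog', count=3), Vocab(word='say', count=7)]
    >>> heapq.heapify(heap)
    >>> heapq.heappop(heap).word             # the least frequent entry comes out first
    'dog'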
- - `binary` is a boolean indicating whether the data is in binary word2vec format. - `norm_only` is a boolean indicating whether to only store normalised word2vec vectors in memory. - Word counts are read from `fvocab` filename, if set (this is the file generated - by `-save-vocab` flag of the original C tool). - - If you trained the C model using non-utf8 encoding for words, specify that - encoding in `encoding`. - - `unicode_errors`, default 'strict', is a string suitable to be passed as the `errors` - argument to the unicode() (Python 2.x) or str() (Python 3.x) function. If your source - file may include word tokens truncated in the middle of a multibyte unicode character - (as is common from the original word2vec.c tool), 'ignore' or 'replace' may help. - - `limit` sets a maximum number of word-vectors to read from the file. The default, - None, means read all. - - `datatype` (experimental) can coerce dimensions to a non-default float type (such - as np.float16) to save memory. (Such types may result in much slower bulk operations - or incompatibility with optimized routines.) - - """ - counts = None - if fvocab is not None: - logger.info("loading word counts from %s", fvocab) - counts = {} - with utils.open(fvocab, 'rb') as fin: - for line in fin: - word, count = utils.to_unicode(line).strip().split() - counts[word] = int(count) - - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if limit: - vocab_size = min(vocab_size, limit) - result = cls() - result.vector_size = vector_size - result.syn0 = zeros((vocab_size, vector_size), dtype=datatype) - - def add_word(word, weights): - word_id = len(result.vocab) - if word in result.vocab: - logger.warning("duplicate word '%s' in %s, ignoring all but first", word, fname) - return - if counts is None: - # most common scenario: no vocab file given. just make up some bogus counts, in descending order - result.vocab[word] = Vocab(index=word_id, count=vocab_size - word_id) - elif word in counts: - # use count from the vocab file - result.vocab[word] = Vocab(index=word_id, count=counts[word]) - else: - # vocab file given, but word is missing -- set count to None (TODO: or raise?) 
- logger.warning("vocabulary file is incomplete: '%s' is missing", word) - result.vocab[word] = Vocab(index=word_id, count=None) - result.syn0[word_id] = weights - result.index2word.append(word) - - if binary: - binary_len = dtype(REAL).itemsize * vector_size - for _ in range(vocab_size): - # mixed text and binary: read text first, then binary - word = [] - while True: - ch = fin.read(1) - if ch == b' ': - break - if ch == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - if ch != b'\n': # ignore newlines in front of words (some binary files have) - word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) - add_word(word, weights) - else: - for line_no in range(vocab_size): - line = fin.readline() - if line == b'': - raise EOFError("unexpected end of input; is count incorrect or file otherwise damaged?") - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] - add_word(word, weights) - if result.syn0.shape[0] != len(result.vocab): - logger.info( - "duplicate words detected, shrinking matrix size from %i to %i", - result.syn0.shape[0], len(result.vocab) - ) - result.syn0 = ascontiguousarray(result.syn0[: len(result.vocab)]) - assert (len(result.vocab), vector_size) == result.syn0.shape - - logger.info("loaded %s matrix from %s", result.syn0.shape, fname) - return result - - def similarity(self, w1, w2): - """ - Compute similarity between vectors of two input words. - To be implemented by child class. - - """ - raise NotImplementedError - - def distance(self, w1, w2): - """ - Compute distance between vectors of two input words. - To be implemented by child class. - - """ - raise NotImplementedError - - def distances(self, word_or_vector, other_words=()): - """ - Compute distances from given word or vector to all words in `other_words`. - If `other_words` is empty, return distance between `word_or_vectors` and all words in vocab. - To be implemented by child class. - - """ - raise NotImplementedError - - def word_vec(self, word): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.word_vec('office') - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - result = self.syn0[self.vocab[word].index] - result.setflags(write=False) - return result - else: - raise KeyError("word '%s' not in vocabulary" % word) - - def __getitem__(self, words): - """ - Accept a single word or a list of words as input. - - If a single word: returns the word's representations in vector space, as - a 1D numpy array. - - Multiple words: return the words' representations in vector space, as a - 2d numpy array: #words x #vector_size. Matrix rows are in the same order - as in input. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - >>> trained_model[['office', 'products']] - array([ -1.40128313e-02, ...] - [ -1.70425311e-03, ...] - ...) 
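The text variant of the C word2vec format parsed above is easy to produce by hand: a header line with the vocabulary size and vector size, then one whitespace-separated entry per word. A hedged sketch that writes a two-word file and reads it back through the non-deprecated `gensim.models.KeyedVectors` loader, which accepts the same format (the temporary path is illustrative):

.. sourcecode:: pycon

    >>> from gensim.models import KeyedVectors
    >>> lines = ['2 3',                      # header: <vocab size> <vector size>
    ...          'cat 0.1 0.2 0.3',          # one entry per line: word, then space-separated floats
    ...          'dog 0.4 0.5 0.6']
    >>> with open('/tmp/tiny_vectors.txt', 'w') as fout:
    ...     _ = fout.write('\n'.join(lines) + '\n')
    ...
    >>> kv = KeyedVectors.load_word2vec_format('/tmp/tiny_vectors.txt', binary=False)
    >>> kv['cat'].shape
    (3,)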
- - """ - if isinstance(words, string_types): - # allow calls like trained_model['office'], as a shorthand for trained_model[['office']] - return self.word_vec(words) - - return vstack([self.word_vec(word) for word in words]) - - def __contains__(self, word): - return word in self.vocab - - def most_similar_to_given(self, w1, word_list): - """Return the word from word_list most similar to w1. - - Args: - w1 (str): a word - word_list (list): list of words containing a word most similar to w1 - - Returns: - the word in word_list with the highest similarity to w1 - - Raises: - KeyError: If w1 or any word in word_list is not in the vocabulary - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar_to_given('music', ['water', 'sound', 'backpack', 'mouse']) - 'sound' - - >>> trained_model.most_similar_to_given('snake', ['food', 'pencil', 'animal', 'phone']) - 'animal' - - """ - return word_list[argmax([self.similarity(w1, word) for word in word_list])] - - def words_closer_than(self, w1, w2): - """ - Returns all words that are closer to `w1` than `w2` is to `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - list (str) - List of words that are closer to `w1` than `w2` is to `w1`. - - Examples - -------- - - .. sourcecode:: pycon - - >>> model.words_closer_than('carnivore.n.01', 'mammal.n.01') - ['dog.n.01', 'canine.n.02'] - - """ - all_distances = self.distances(w1) - w1_index = self.vocab[w1].index - w2_index = self.vocab[w2].index - closer_node_indices = np.where(all_distances < all_distances[w2_index])[0] - return [self.index2word[index] for index in closer_node_indices if index != w1_index] - - def rank(self, w1, w2): - """ - Rank of the distance of `w2` from `w1`, in relation to distances of all words from `w1`. - - Parameters - ---------- - w1 : str - Input word. - w2 : str - Input word. - - Returns - ------- - int - Rank of `w2` from `w1` in relation to all other nodes. - - Examples - -------- - - .. sourcecode:: pycon - - >>> model.rank('mammal.n.01', 'carnivore.n.01') - 3 - - """ - return len(self.words_closer_than(w1, w2)) + 1 - - -class EuclideanKeyedVectors(KeyedVectorsBase): - """ - Class to contain vectors and vocab for the Word2Vec training class and other w2v methods not directly - involved in training such as most_similar() - """ - - def __init__(self): - super(EuclideanKeyedVectors, self).__init__() - self.syn0norm = None - - @property - def wv(self): - return self - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm']) - super(EuclideanKeyedVectors, self).save(*args, **kwargs) - - def word_vec(self, word, use_norm=False): - """ - Accept a single word as input. - Returns the word's representations in vector space, as a 1D numpy array. - - If `use_norm` is True, returns the normalized word vector. - - Example: - - .. sourcecode:: pycon - - >>> trained_model['office'] - array([ -1.40128313e-02, ...]) - - """ - if word in self.vocab: - if use_norm: - result = self.syn0norm[self.vocab[word].index] - else: - result = self.syn0[self.vocab[word].index] - - result.setflags(write=False) - return result - else: - raise KeyError("word '%s' not in vocabulary" % word) - - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """ - Find the top-N most similar words. Positive words contribute positively towards the - similarity, negative words negatively. 
- - This method computes cosine similarity between a simple mean of the projection - weight vectors of the given words and the vectors for each word in the model. - The method corresponds to the `word-analogy` and `distance` scripts in the original - word2vec implementation. - - If topn is False, most_similar returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar('dog'), as a shorthand for most_similar(['dog']) - positive = [positive] - - # add weights for each word, if not already present; default to 1.0 for positive and -1.0 for negative words - positive = [ - (word, 1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in positive - ] - negative = [ - (word, -1.0) if isinstance(word, string_types + (ndarray,)) else word - for word in negative - ] - - # compute the weighted average of all words - all_words, mean = set(), [] - for word, weight in positive + negative: - if isinstance(word, ndarray): - mean.append(weight * word) - else: - mean.append(weight * self.word_vec(word, use_norm=True)) - if word in self.vocab: - all_words.add(self.vocab[word].index) - if not mean: - raise ValueError("cannot compute similarity with no input") - mean = matutils.unitvec(array(mean).mean(axis=0)).astype(REAL) - - if indexer is not None: - return indexer.most_similar(mean, topn) - - limited = self.syn0norm if restrict_vocab is None else self.syn0norm[:restrict_vocab] - dists = dot(limited, mean) - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def similar_by_word(self, word, topn=10, restrict_vocab=None): - """ - Find the top-N most similar words. - - If topn is False, similar_by_word returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. (This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.similar_by_word('graph') - [('user', 0.9999163150787354), ...] - - """ - return self.most_similar(positive=[word], topn=topn, restrict_vocab=restrict_vocab) - - def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """ - Find the top-N most similar words by vector. - - If topn is False, similar_by_vector returns the vector of similarity scores. - - `restrict_vocab` is an optional integer which limits the range of vectors which - are searched for most-similar values. For example, restrict_vocab=10000 would - only check the first 10000 word vectors in the vocabulary order. 
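`most_similar` above reduces to: average the (already normalized) query vectors, re-normalize, take dot products against every row of `syn0norm`, and argsort. A standalone numpy sketch of that pipeline, with random unit vectors standing in for a trained model:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> syn0norm = rng.randn(500, 20)
    >>> syn0norm /= np.linalg.norm(syn0norm, axis=1, keepdims=True)   # unit-length rows, as after init_sims()
    >>> mean = syn0norm[[7, 42]].mean(axis=0)     # average of the (normalized) query word vectors
    >>> mean /= np.linalg.norm(mean)              # matutils.unitvec equivalent
    >>> dists = syn0norm.dot(mean)                # cosine similarity against every vocabulary vector
    >>> best = np.argsort(-dists)[:10]            # indices of the ten nearest rows

The real method additionally drops the input words from the returned ranking.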
(This may be - meaningful if you've sorted the vocabulary by descending frequency.) - - Example:: - - >>> trained_model.similar_by_vector([1,2]) - [('survey', 0.9942699074745178), ...] - - """ - return self.most_similar(positive=[vector], topn=topn, restrict_vocab=restrict_vocab) - - def wmdistance(self, document1, document2): - """ - Compute the Word Mover's Distance between two documents. When using this - code, please consider citing the following papers: - - .. Ofir Pele and Michael Werman, "A linear time histogram metric for improved SIFT matching". - .. Ofir Pele and Michael Werman, "Fast and robust earth mover's distances". - .. Matt Kusner et al. "From Word Embeddings To Document Distances". - - Note that if one of the documents have no words that exist in the - Word2Vec vocab, `float('inf')` (i.e. infinity) will be returned. - - This method only works if `pyemd` is installed (can be installed via pip, but requires a C compiler). - - Example: - - .. sourcecode:: pycon - - >>> # Train word2vec model. - >>> model = Word2Vec(sentences) - - >>> # Some sentences to test. - >>> sentence_obama = 'Obama speaks to the media in Illinois'.lower().split() - >>> sentence_president = 'The president greets the press in Chicago'.lower().split() - - >>> # Remove their stopwords. - >>> from nltk.corpus import stopwords - >>> stopwords = nltk.corpus.stopwords.words('english') - >>> sentence_obama = [w for w in sentence_obama if w not in stopwords] - >>> sentence_president = [w for w in sentence_president if w not in stopwords] - - >>> # Compute WMD. - >>> distance = model.wmdistance(sentence_obama, sentence_president) - """ - - if not PYEMD_EXT: - raise ImportError("Please install pyemd Python package to compute WMD.") - - # Remove out-of-vocabulary words. - len_pre_oov1 = len(document1) - len_pre_oov2 = len(document2) - document1 = [token for token in document1 if token in self] - document2 = [token for token in document2 if token in self] - diff1 = len_pre_oov1 - len(document1) - diff2 = len_pre_oov2 - len(document2) - if diff1 > 0 or diff2 > 0: - logger.info('Removed %d and %d OOV words from document 1 and 2 (respectively).', diff1, diff2) - - if len(document1) == 0 or len(document2) == 0: - logger.info( - "At least one of the documents had no words that werein the vocabulary. " - "Aborting (returning inf)." - ) - return float('inf') - - dictionary = Dictionary(documents=[document1, document2]) - vocab_len = len(dictionary) - - if vocab_len == 1: - # Both documents are composed by a single unique token - return 0.0 - - # Sets for faster look-up. - docset1 = set(document1) - docset2 = set(document2) - - # Compute distance matrix. - distance_matrix = zeros((vocab_len, vocab_len), dtype=double) - for i, t1 in dictionary.items(): - for j, t2 in dictionary.items(): - if t1 not in docset1 or t2 not in docset2: - continue - # Compute Euclidean distance between word vectors. - distance_matrix[i, j] = sqrt(np_sum((self[t1] - self[t2])**2)) - - if np_sum(distance_matrix) == 0.0: - # `emd` gets stuck if the distance matrix contains only zeros. - logger.info('The distance matrix is all zeros. Aborting (returning inf).') - return float('inf') - - def nbow(document): - d = zeros(vocab_len, dtype=double) - nbow = dictionary.doc2bow(document) # Word frequencies. - doc_len = len(document) - for idx, freq in nbow: - d[idx] = freq / float(doc_len) # Normalized word frequencies. - return d - - # Compute nBOW representation of documents. - d1 = nbow(document1) - d2 = nbow(document2) - - # Compute WMD. 
- return emd(d1, d2, distance_matrix) - - def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """ - Find the top-N most similar words, using the multiplicative combination objective - proposed by Omer Levy and Yoav Goldberg in [4]_. Positive words still contribute - positively towards the similarity, negative words negatively, but with less - susceptibility to one large distance dominating the calculation. - - In the common analogy-solving case, of two positive and one negative examples, - this method is equivalent to the "3CosMul" objective (equation (4)) of Levy and Goldberg. - - Additional positive or negative examples contribute to the numerator or denominator, - respectively – a potentially sensible but untested extension of the method. (With - a single positive example, rankings will be the same as in the default most_similar.) - - Example: - - .. sourcecode:: pycon - - >>> trained_model.most_similar_cosmul(positive=['baghdad', 'england'], negative=['london']) - [(u'iraq', 0.8488819003105164), ...] - - .. [4] Omer Levy and Yoav Goldberg. Linguistic Regularities in Sparse and Explicit Word Representations, 2014. - - """ - if positive is None: - positive = [] - if negative is None: - negative = [] - - self.init_sims() - - if isinstance(positive, string_types) and not negative: - # allow calls like most_similar_cosmul('dog'), as a shorthand for most_similar_cosmul(['dog']) - positive = [positive] - - all_words = { - self.vocab[word].index for word in positive + negative - if not isinstance(word, ndarray) and word in self.vocab - } - - positive = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word - for word in positive - ] - negative = [ - self.word_vec(word, use_norm=True) if isinstance(word, string_types) else word - for word in negative - ] - - if not positive: - raise ValueError("cannot compute similarity with no input") - - # equation (4) of Levy & Goldberg "Linguistic Regularities...", - # with distances shifted to [0,1] per footnote (7) - pos_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in positive] - neg_dists = [((1 + dot(self.syn0norm, term)) / 2) for term in negative] - dists = prod(pos_dists, axis=0) / (prod(neg_dists, axis=0) + 0.000001) - - if not topn: - return dists - best = matutils.argsort(dists, topn=topn + len(all_words), reverse=True) - # ignore (don't return) words from the input - result = [(self.index2word[sim], float(dists[sim])) for sim in best if sim not in all_words] - return result[:topn] - - def doesnt_match(self, words): - """ - Which word from the given list doesn't go with the others? - - Example:: - - >>> trained_model.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - """ - self.init_sims() - - used_words = [word for word in words if word in self] - if len(used_words) != len(words): - ignored_words = set(words) - set(used_words) - logger.warning("vectors for words %s are not present in the model, ignoring these words", ignored_words) - if not used_words: - raise ValueError("cannot select a word from an empty list") - vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL) - mean = matutils.unitvec(vectors.mean(axis=0)).astype(REAL) - dists = dot(vectors, mean) - return sorted(zip(dists, used_words))[0][1] - - @staticmethod - def cosine_similarities(vector_1, vectors_all): - """ - Return cosine similarities between one vector and a set of other vectors. 
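`most_similar_cosmul` above shifts each cosine similarity into [0, 1] and combines the positive and negative terms multiplicatively (the 3CosMul objective). A standalone numpy sketch of the same arithmetic, again with random unit vectors in place of `syn0norm`:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> rng = np.random.RandomState(0)
    >>> syn0norm = rng.randn(1000, 20)
    >>> syn0norm /= np.linalg.norm(syn0norm, axis=1, keepdims=True)
    >>> positive = [syn0norm[1], syn0norm[2]]     # e.g. 'baghdad', 'england'
    >>> negative = [syn0norm[3]]                  # e.g. 'london'
    >>> pos_dists = [(1 + syn0norm.dot(term)) / 2 for term in positive]   # cosines shifted into [0, 1]
    >>> neg_dists = [(1 + syn0norm.dot(term)) / 2 for term in negative]
    >>> dists = np.prod(pos_dists, axis=0) / (np.prod(neg_dists, axis=0) + 0.000001)
    >>> best = np.argsort(-dists)[:10]            # highest 3CosMul scores (input rows included here)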
- - Parameters - ---------- - vector_1 : numpy.array - vector from which similarities are to be computed. - expected shape (dim,) - vectors_all : numpy.array - for each row in vectors_all, distance from vector_1 is computed. - expected shape (num_vectors, dim) - - Returns - ------- - numpy.array - Contains cosine distance between vector_1 and each row in vectors_all. - shape (num_vectors,) - - """ - norm = np.linalg.norm(vector_1) - all_norms = np.linalg.norm(vectors_all, axis=1) - dot_products = dot(vectors_all, vector_1) - similarities = dot_products / (norm * all_norms) - return similarities - - def distances(self, word_or_vector, other_words=()): - """ - Compute cosine distances from given word or vector to all words in `other_words`. - If `other_words` is empty, return distance between `word_or_vectors` and all words in vocab. - - Parameters - ---------- - word_or_vector : str or numpy.array - Word or vector from which distances are to be computed. - - other_words : iterable(str) or None - For each word in `other_words` distance from `word_or_vector` is computed. - If None or empty, distance of `word_or_vector` from all words in vocab is computed (including itself). - - Returns - ------- - numpy.array - Array containing distances to all words in `other_words` from input `word_or_vector`, - in the same order as `other_words`. - - Notes - ----- - Raises KeyError if either `word_or_vector` or any word in `other_words` is absent from vocab. - - """ - if isinstance(word_or_vector, string_types): - input_vector = self.word_vec(word_or_vector) - else: - input_vector = word_or_vector - if not other_words: - other_vectors = self.syn0 - else: - other_indices = [self.vocab[word].index for word in other_words] - other_vectors = self.syn0[other_indices] - return 1 - self.cosine_similarities(input_vector, other_vectors) - - def distance(self, w1, w2): - """ - Compute cosine distance between two words. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.distance('woman', 'man') - 0.34 - - >>> trained_model.distance('woman', 'woman') - 0.0 - - """ - return 1 - self.similarity(w1, w2) - - def similarity(self, w1, w2): - """ - Compute cosine similarity between two words. - - Example: - - .. sourcecode:: pycon - - >>> trained_model.similarity('woman', 'man') - 0.73723527 - - >>> trained_model.similarity('woman', 'woman') - 1.0 - - """ - return dot(matutils.unitvec(self[w1]), matutils.unitvec(self[w2])) - - def n_similarity(self, ws1, ws2): - """ - Compute cosine similarity between two sets of words. - - Example: - - .. 
sourcecode:: pycon - - >>> trained_model.n_similarity(['sushi', 'shop'], ['japanese', 'restaurant']) - 0.61540466561049689 - - >>> trained_model.n_similarity(['restaurant', 'japanese'], ['japanese', 'restaurant']) - 1.0000000000000004 - - >>> trained_model.n_similarity(['sushi'], ['restaurant']) == trained_model.similarity('sushi', 'restaurant') - True - - """ - if not(len(ws1) and len(ws2)): - raise ZeroDivisionError('At least one of the passed list is empty.') - v1 = [self[word] for word in ws1] - v2 = [self[word] for word in ws2] - return dot(matutils.unitvec(array(v1).mean(axis=0)), matutils.unitvec(array(v2).mean(axis=0))) - - @staticmethod - def log_accuracy(section): - correct, incorrect = len(section['correct']), len(section['incorrect']) - if correct + incorrect > 0: - logger.info( - "%s: %.1f%% (%i/%i)", - section['section'], 100.0 * correct / (correct + incorrect), correct, correct + incorrect - ) - - def accuracy(self, questions, restrict_vocab=30000, most_similar=most_similar, case_insensitive=True): - """ - Compute accuracy of the model. `questions` is a filename where lines are - 4-tuples of words, split into sections by ": SECTION NAME" lines. - See questions-words.txt in - https://storage.googleapis.com/google-code-archive-source/v2/code.google.com/word2vec/source-archive.zip - for an example. - - The accuracy is reported (=printed to log and returned as a list) for each - section separately, plus there's one aggregate summary at the end. - - Use `restrict_vocab` to ignore all questions containing a word not in the first `restrict_vocab` - words (default 30,000). This may be meaningful if you've sorted the vocabulary by descending frequency. - In case `case_insensitive` is True, the first `restrict_vocab` words are taken first, and then - case normalization is performed. - - Use `case_insensitive` to convert all words in questions and vocab to their uppercase form before - evaluating the accuracy (default True). Useful in case of case-mismatch between training tokens - and question words. In case of multiple case variants of a single word, the vector for the first - occurrence (also the most frequent if vocabulary is sorted) is taken. - - This method corresponds to the `compute-accuracy` script of the original C word2vec. 
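The prediction step inside `accuracy` reduces to ranking all unit-normalized vectors against an additive combination of the question words; a toy NumPy sketch, with random vectors standing in for a trained model:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> rng = np.random.default_rng(1)                 # toy setup only
    >>> M = rng.normal(size=(500, 20))
    >>> M /= np.linalg.norm(M, axis=1, keepdims=True)
    >>> a, b, c = M[10], M[11], M[12]                  # question "a is to b as c is to ?"
    >>> query = b - a + c
    >>> query /= np.linalg.norm(query)
    >>> sims = M.dot(query)
    >>> sims[[10, 11, 12]] = -np.inf                   # ignore the input words, as the loop below does
    >>> predicted = int(np.argmax(sims))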
- - """ - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) - - sections, section = [], None - with utils.open(questions, 'rb') as f: - for line_no, line in enumerate(f): - # TODO: use level3 BLAS (=evaluate multiple questions at once), for speed - line = utils.to_unicode(line) - if line.startswith(': '): - # a new section starts => store the old section - if section: - sections.append(section) - self.log_accuracy(section) - section = {'section': line.lstrip(': ').strip(), 'correct': [], 'incorrect': []} - else: - if not section: - raise ValueError("missing section header before line #%i in %s" % (line_no, questions)) - try: - if case_insensitive: - a, b, c, expected = [word.upper() for word in line.split()] - else: - a, b, c, expected = [word for word in line.split()] - except ValueError: - logger.info("skipping invalid line #%i in %s", line_no, questions) - continue - if a not in ok_vocab or b not in ok_vocab or c not in ok_vocab or expected not in ok_vocab: - logger.debug("skipping line #%i with OOV words: %s", line_no, line.strip()) - continue - - original_vocab = self.vocab - self.vocab = ok_vocab - ignore = {a, b, c} # input words to be ignored - predicted = None - # find the most likely prediction, ignoring OOV words and input words - sims = most_similar(self, positive=[b, c], negative=[a], topn=False, restrict_vocab=restrict_vocab) - self.vocab = original_vocab - for index in matutils.argsort(sims, reverse=True): - predicted = self.index2word[index].upper() if case_insensitive else self.index2word[index] - if predicted in ok_vocab and predicted not in ignore: - if predicted != expected: - logger.debug("%s: expected %s, predicted %s", line.strip(), expected, predicted) - break - if predicted == expected: - section['correct'].append((a, b, c, expected)) - else: - section['incorrect'].append((a, b, c, expected)) - if section: - # store the last section, too - sections.append(section) - self.log_accuracy(section) - - total = { - 'section': 'total', - 'correct': sum((s['correct'] for s in sections), []), - 'incorrect': sum((s['incorrect'] for s in sections), []), - } - self.log_accuracy(total) - sections.append(total) - return sections - - @staticmethod - def log_evaluate_word_pairs(pearson, spearman, oov, pairs): - logger.info('Pearson correlation coefficient against %s: %.4f', pairs, pearson[0]) - logger.info('Spearman rank-order correlation coefficient against %s: %.4f', pairs, spearman[0]) - logger.info('Pairs with unknown words ratio: %.1f%%', oov) - - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, - case_insensitive=True, dummy4unknown=False): - """ - Compute correlation of the model with human similarity judgments. `pairs` is a filename of a dataset where - lines are 3-tuples, each consisting of a word pair and a similarity value, separated by `delimiter`. - An example dataset is included in Gensim (test/test_data/wordsim353.tsv). More datasets can be found at - http://technion.ac.il/~ira.leviant/MultilingualVSMdata.html or https://www.cl.cam.ac.uk/~fh295/simlex.html. - - The model is evaluated using Pearson correlation coefficient and Spearman rank-order correlation coefficient - between the similarities from the dataset and the similarities produced by the model itself. - The results are printed to log and returned as a triple (pearson, spearman, ratio of pairs with unknown words). 
- - Use `restrict_vocab` to ignore all word pairs containing a word not in the first `restrict_vocab` - words (default 300,000). This may be meaningful if you've sorted the vocabulary by descending frequency. - If `case_insensitive` is True, the first `restrict_vocab` words are taken, and then case normalization - is performed. - - Use `case_insensitive` to convert all words in the pairs and vocab to their uppercase form before - evaluating the model (default True). Useful when you expect case-mismatch between training tokens - and words pairs in the dataset. If there are multiple case variants of a single word, the vector for the first - occurrence (also the most frequent if vocabulary is sorted) is taken. - - Use `dummy4unknown=True` to produce zero-valued similarities for pairs with out-of-vocabulary words. - Otherwise (default False), these pairs are skipped entirely. - """ - ok_vocab = [(w, self.vocab[w]) for w in self.index2word[:restrict_vocab]] - ok_vocab = {w.upper(): v for w, v in reversed(ok_vocab)} if case_insensitive else dict(ok_vocab) - - similarity_gold = [] - similarity_model = [] - oov = 0 - - original_vocab = self.vocab - self.vocab = ok_vocab - - with utils.open(pairs, 'rb') as f: - for line_no, line in enumerate(f): - line = utils.to_unicode(line) - if line.startswith('#'): - # May be a comment - continue - else: - try: - if case_insensitive: - a, b, sim = [word.upper() for word in line.split(delimiter)] - else: - a, b, sim = [word for word in line.split(delimiter)] - sim = float(sim) - except (ValueError, TypeError): - logger.info('skipping invalid line #%d in %s', line_no, pairs) - continue - if a not in ok_vocab or b not in ok_vocab: - oov += 1 - if dummy4unknown: - similarity_model.append(0.0) - similarity_gold.append(sim) - continue - else: - logger.debug('skipping line #%d with OOV words: %s', line_no, line.strip()) - continue - similarity_gold.append(sim) # Similarity from the dataset - similarity_model.append(self.similarity(a, b)) # Similarity from the model - self.vocab = original_vocab - spearman = stats.spearmanr(similarity_gold, similarity_model) - pearson = stats.pearsonr(similarity_gold, similarity_model) - oov_ratio = float(oov) / (len(similarity_gold) + oov) * 100 - - logger.debug('Pearson correlation coefficient against %s: %f with p-value %f', pairs, pearson[0], pearson[1]) - logger.debug( - 'Spearman rank-order correlation coefficient against %s: %f with p-value %f', - pairs, spearman[0], spearman[1] - ) - logger.debug('Pairs with unknown words: %d', oov) - self.log_evaluate_word_pairs(pearson, spearman, oov_ratio, pairs) - return pearson, spearman, oov_ratio - - def init_sims(self, replace=False): - """ - Precompute L2-normalized vectors. - - If `replace` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - - Note that you **cannot continue training** after doing a replace. The model becomes - effectively read-only = you can call `most_similar`, `similarity` etc., but not `train`. 
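The precomputation described in the `init_sims` docstring above is a plain row-wise L2 normalization; an equivalent standalone NumPy sketch, with a random matrix standing in for `syn0`:

.. sourcecode:: pycon

    >>> import numpy as np
    >>> syn0 = np.random.rand(1000, 100).astype(np.float32)   # stand-in for the model's word vectors
    >>> norms = np.sqrt((syn0 ** 2).sum(-1))[..., np.newaxis]
    >>> syn0norm = (syn0 / norms).astype(np.float32)           # replace=False: keep raw and normed copies
    >>> syn0 /= norms                                          # replace=True: normalize in place, saving memory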
- - """ - if getattr(self, 'syn0norm', None) is None or replace: - logger.info("precomputing L2-norms of word weight vectors") - if replace: - for i in range(self.syn0.shape[0]): - self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1)) - self.syn0norm = self.syn0 - else: - self.syn0norm = (self.syn0 / sqrt((self.syn0 ** 2).sum(-1))[..., newaxis]).astype(REAL) - - def get_keras_embedding(self, train_embeddings=False): - """ - Return a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings - """ - try: - from keras.layers import Embedding - except ImportError: - raise ImportError("Please install Keras to use this function") - weights = self.syn0 - - # set `trainable` as `False` to use the pretrained word embedding - # No extra mem usage here as `Embedding` layer doesn't create any new matrix for weights - layer = Embedding( - input_dim=weights.shape[0], output_dim=weights.shape[1], - weights=[weights], trainable=train_embeddings - ) - return layer - - -# For backward compatibility -KeyedVectors = EuclideanKeyedVectors diff --git a/gensim/models/deprecated/old_saveload.py b/gensim/models/deprecated/old_saveload.py deleted file mode 100644 index 750d83ed44..0000000000 --- a/gensim/models/deprecated/old_saveload.py +++ /dev/null @@ -1,398 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2018 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.utils` instead. - - -Class containing the old SaveLoad class with modeified `unpickle` function is support loading models saved using -an older gensim version. - -""" -from __future__ import with_statement - -import logging - -try: - import cPickle as _pickle -except ImportError: - import pickle as _pickle - -import re -import sys - -import numpy as np -import scipy.sparse - -from six import iteritems - -from gensim import utils - -if sys.version_info[0] >= 3: - unicode = str - -logger = logging.getLogger(__name__) - - -PAT_ALPHABETIC = re.compile(r'(((?![\d])\w)+)', re.UNICODE) -RE_HTML_ENTITY = re.compile(r'&(#?)([xX]?)(\w{1,8});', re.UNICODE) - - -class SaveLoad(object): - """Class which inherit from this class have save/load functions, which un/pickle them to disk. - - Warnings - -------- - This uses pickle for de/serializing, so objects must not contain unpicklable attributes, - such as lambda functions etc. - - """ - @classmethod - def load(cls, fname, mmap=None): - """Load a previously saved object (using :meth:`~gensim.utils.SaveLoad.save`) from file. - - Parameters - ---------- - fname : str - Path to file that contains needed object. - mmap : str, optional - Memory-map option. If the object was saved with large arrays stored separately, you can load these arrays - via mmap (shared memory) using `mmap='r'. - If the file being loaded is compressed (either '.gz' or '.bz2'), then `mmap=None` **must be** set. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.save` - - Returns - ------- - object - Object loaded from `fname`. - - Raises - ------ - IOError - When methods are called on instance (should be called from class). 
- - """ - logger.info("loading %s object from %s", cls.__name__, fname) - - compress, subname = SaveLoad._adapt_by_suffix(fname) - - obj = unpickle(fname) - obj._load_specials(fname, mmap, compress, subname) - logger.info("loaded %s", fname) - return obj - - def _load_specials(self, fname, mmap, compress, subname): - """Loads any attributes that were stored specially, and gives the same opportunity - to recursively included :class:`~gensim.utils.SaveLoad` instances. - - Parameters - ---------- - fname : str - Path to file that contains needed object. - mmap : str - Memory-map option. - compress : bool - Set to True if file is compressed. - subname : str - ... - - - """ - def mmap_error(obj, filename): - return IOError( - 'Cannot mmap compressed object %s in file %s. ' % (obj, filename) - + 'Use `load(fname, mmap=None)` or uncompress files manually.' - ) - - for attrib in getattr(self, '__recursive_saveloads', []): - cfname = '.'.join((fname, attrib)) - logger.info("loading %s recursively from %s.* with mmap=%s", attrib, cfname, mmap) - getattr(self, attrib)._load_specials(cfname, mmap, compress, subname) - - for attrib in getattr(self, '__numpys', []): - logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) - - if compress: - if mmap: - raise mmap_error(attrib, subname(fname, attrib)) - - val = np.load(subname(fname, attrib))['val'] - else: - val = np.load(subname(fname, attrib), mmap_mode=mmap) - - setattr(self, attrib, val) - - for attrib in getattr(self, '__scipys', []): - logger.info("loading %s from %s with mmap=%s", attrib, subname(fname, attrib), mmap) - sparse = unpickle(subname(fname, attrib)) - if compress: - if mmap: - raise mmap_error(attrib, subname(fname, attrib)) - - with np.load(subname(fname, attrib, 'sparse')) as f: - sparse.data = f['data'] - sparse.indptr = f['indptr'] - sparse.indices = f['indices'] - else: - sparse.data = np.load(subname(fname, attrib, 'data'), mmap_mode=mmap) - sparse.indptr = np.load(subname(fname, attrib, 'indptr'), mmap_mode=mmap) - sparse.indices = np.load(subname(fname, attrib, 'indices'), mmap_mode=mmap) - - setattr(self, attrib, sparse) - - for attrib in getattr(self, '__ignoreds', []): - logger.info("setting ignored attribute %s to None", attrib) - setattr(self, attrib, None) - - @staticmethod - def _adapt_by_suffix(fname): - """Give appropriate compress setting and filename formula. - - Parameters - ---------- - fname : str - Input filename. - - Returns - ------- - (bool, function) - First argument will be True if `fname` compressed. - - """ - compress, suffix = (True, 'npz') if fname.endswith('.gz') or fname.endswith('.bz2') else (False, 'npy') - return compress, lambda *args: '.'.join(args + (suffix,)) - - def _smart_save(self, fname, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. - - Parameters - ---------- - fname : str - Path to file. - separately : list, optional - Iterable of attributes than need to store distinctly. - sep_limit : int, optional - Limit for separation. - ignore : frozenset, optional - Attributes that shouldn't be store. - pickle_protocol : int, optional - Protocol number for pickle. - - Notes - ----- - If `separately` is None, automatically detect large - numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and - allows mmap'ing large arrays back on load efficiently. 
- - You can also set `separately` manually, in which case it must be - a list of attribute names to be stored in separate files. The - automatic check is not performed in this case. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.load` - - """ - logger.info("saving %s object under %s, separately %s", self.__class__.__name__, fname, separately) - - compress, subname = SaveLoad._adapt_by_suffix(fname) - - restores = self._save_specials(fname, separately, sep_limit, ignore, pickle_protocol, - compress, subname) - try: - pickle(self, fname, protocol=pickle_protocol) - finally: - # restore attribs handled specially - for obj, asides in restores: - for attrib, val in iteritems(asides): - setattr(obj, attrib, val) - logger.info("saved %s", fname) - - def _save_specials(self, fname, separately, sep_limit, ignore, pickle_protocol, compress, subname): - """Save aside any attributes that need to be handled separately, including - by recursion any attributes that are themselves :class:`~gensim.utils.SaveLoad` instances. - - Parameters - ---------- - fname : str - Output filename. - separately : list or None - Iterable of attributes than need to store distinctly - sep_limit : int - Limit for separation. - ignore : iterable of str - Attributes that shouldn't be store. - pickle_protocol : int - Protocol number for pickle. - compress : bool - If True - compress output with :func:`numpy.savez_compressed`. - subname : function - Produced by :meth:`~gensim.utils.SaveLoad._adapt_by_suffix` - - Returns - ------- - list of (obj, {attrib: value, ...}) - Settings that the caller should use to restore each object's attributes that were set aside - during the default :func:`~gensim.utils.pickle`. - - """ - asides = {} - sparse_matrices = (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix) - if separately is None: - separately = [] - for attrib, val in iteritems(self.__dict__): - if isinstance(val, np.ndarray) and val.size >= sep_limit: - separately.append(attrib) - elif isinstance(val, sparse_matrices) and val.nnz >= sep_limit: - separately.append(attrib) - - # whatever's in `separately` or `ignore` at this point won't get pickled - for attrib in separately + list(ignore): - if hasattr(self, attrib): - asides[attrib] = getattr(self, attrib) - delattr(self, attrib) - - recursive_saveloads = [] - restores = [] - for attrib, val in iteritems(self.__dict__): - if hasattr(val, '_save_specials'): # better than 'isinstance(val, SaveLoad)' if IPython reloading - recursive_saveloads.append(attrib) - cfname = '.'.join((fname, attrib)) - restores.extend(val._save_specials(cfname, None, sep_limit, ignore, pickle_protocol, compress, subname)) - - try: - numpys, scipys, ignoreds = [], [], [] - for attrib, val in iteritems(asides): - if isinstance(val, np.ndarray) and attrib not in ignore: - numpys.append(attrib) - logger.info("storing np array '%s' to %s", attrib, subname(fname, attrib)) - - if compress: - np.savez_compressed(subname(fname, attrib), val=np.ascontiguousarray(val)) - else: - np.save(subname(fname, attrib), np.ascontiguousarray(val)) - - elif isinstance(val, (scipy.sparse.csr_matrix, scipy.sparse.csc_matrix)) and attrib not in ignore: - scipys.append(attrib) - logger.info("storing scipy.sparse array '%s' under %s", attrib, subname(fname, attrib)) - - if compress: - np.savez_compressed( - subname(fname, attrib, 'sparse'), - data=val.data, - indptr=val.indptr, - indices=val.indices - ) - else: - np.save(subname(fname, attrib, 'data'), val.data) - np.save(subname(fname, attrib, 'indptr'), val.indptr) - 
np.save(subname(fname, attrib, 'indices'), val.indices) - - data, indptr, indices = val.data, val.indptr, val.indices - val.data, val.indptr, val.indices = None, None, None - - try: - # store array-less object - pickle(val, subname(fname, attrib), protocol=pickle_protocol) - finally: - val.data, val.indptr, val.indices = data, indptr, indices - else: - logger.info("not storing attribute %s", attrib) - ignoreds.append(attrib) - - self.__dict__['__numpys'] = numpys - self.__dict__['__scipys'] = scipys - self.__dict__['__ignoreds'] = ignoreds - self.__dict__['__recursive_saveloads'] = recursive_saveloads - except Exception: - # restore the attributes if exception-interrupted - for attrib, val in iteritems(asides): - setattr(self, attrib, val) - raise - return restores + [(self, asides)] - - def save(self, fname_or_handle, separately=None, sep_limit=10 * 1024**2, ignore=frozenset(), pickle_protocol=2): - """Save the object to file. - - Parameters - ---------- - fname_or_handle : str or file-like - Path to output file or already opened file-like object. If the object is a file handle, - no special array handling will be performed, all attributes will be saved to the same file. - separately : list of str or None, optional - If None - automatically detect large numpy/scipy.sparse arrays in the object being stored, and store - them into separate files. This avoids pickle memory errors and allows mmap'ing large arrays - back on load efficiently. - If list of str - this attributes will be stored in separate files, the automatic check - is not performed in this case. - sep_limit : int - Limit for automatic separation. - ignore : frozenset of str - Attributes that shouldn't be serialize/store. - pickle_protocol : int - Protocol number for pickle. - - See Also - -------- - :meth:`~gensim.utils.SaveLoad.load` - - """ - try: - _pickle.dump(self, fname_or_handle, protocol=pickle_protocol) - logger.info("saved %s object", self.__class__.__name__) - except TypeError: # `fname_or_handle` does not have write attribute - self._smart_save(fname_or_handle, separately, sep_limit, ignore, pickle_protocol=pickle_protocol) - - -def unpickle(fname): - """Load object from `fname`. - - Parameters - ---------- - fname : str - Path to pickle file. - - Returns - ------- - object - Python object loaded from `fname`. - - """ - with utils.open(fname, 'rb') as f: - file_bytes = f.read() - file_bytes = file_bytes.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec') - file_bytes = file_bytes.replace(b'gensim.models.keyedvectors', b'gensim.models.deprecated.keyedvectors') - file_bytes = file_bytes.replace(b'gensim.models.doc2vec', b'gensim.models.deprecated.doc2vec') - file_bytes = file_bytes.replace(b'gensim.models.fasttext', b'gensim.models.deprecated.fasttext') - file_bytes = file_bytes.replace( - b'gensim.models.wrappers.fasttext', b'gensim.models.deprecated.fasttext_wrapper') - if sys.version_info > (3, 0): - return _pickle.loads(file_bytes, encoding='latin1') - else: - return _pickle.loads(file_bytes) - - -def pickle(obj, fname, protocol=2): - """Pickle object `obj` to file `fname`. - - Parameters - ---------- - obj : object - Any python object. - fname : str - Path to pickle file. - protocol : int, optional - Pickle protocol number, default is 2 to support compatible across python 2.x and 3.x. 
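The module-path rewriting inside `unpickle` above can be illustrated on its own: the raw pickle byte stream is patched so classes saved under an old module path resolve against the relocated module, then loaded with a latin1 fallback for Python 2 pickles. The filename below is hypothetical; the byte replacement mirrors the deleted code.

.. sourcecode:: pycon

    >>> import pickle
    >>> with open('old_model.pkl', 'rb') as f:            # hypothetical model saved by an older gensim
    ...     payload = f.read()
    ...
    >>> payload = payload.replace(b'gensim.models.word2vec', b'gensim.models.deprecated.word2vec')
    >>> model = pickle.loads(payload, encoding='latin1')  # latin1 lets Python 3 read Python 2 numpy pickles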
- - """ - with utils.open(fname, 'wb') as fout: # 'b' for binary, needed on Windows - _pickle.dump(obj, fout, protocol=protocol) diff --git a/gensim/models/deprecated/word2vec.py b/gensim/models/deprecated/word2vec.py deleted file mode 100644 index d57a902c55..0000000000 --- a/gensim/models/deprecated/word2vec.py +++ /dev/null @@ -1,1907 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2013 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.3.0 - Use :mod:`gensim.models.word2vec` instead. - - -Produce word vectors with deep learning via word2vec's "skip-gram and CBOW models", using either -hierarchical softmax or negative sampling [1]_ [2]_. - -NOTE: There are more ways to get word vectors in Gensim than just Word2Vec. -See wrappers for FastText, VarEmbed and WordRank. - -The training algorithms were originally ported from the C package https://code.google.com/p/word2vec/ -and extended with additional functionality. - -For a blog tutorial on gensim word2vec, with an interactive web app trained on GoogleNews, -visit http://radimrehurek.com/2014/02/word2vec-tutorial/ - -**Make sure you have a C compiler before installing gensim, to use optimized (compiled) word2vec training** -(70x speedup compared to plain NumPy implementation [3]_). - -Initialize a model with e.g.: - -.. sourcecode:: pycon - - >>> model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4) - -Persist a model to disk with: - -.. sourcecode:: pycon - - >>> model.save(fname) - >>> model = Word2Vec.load(fname) # you can continue training with the loaded model! - -The word vectors are stored in a KeyedVectors instance in model.wv. -This separates the read-only word vector lookup operations in KeyedVectors from the training code in Word2Vec: - -.. sourcecode:: pycon - - >>> model.wv['computer'] # numpy vector of a word - array([-0.00449447, -0.00310097, 0.02421786, ...], dtype=float32) - -The word vectors can also be instantiated from an existing file on disk in the word2vec C format -as a KeyedVectors instance:: - - NOTE: It is impossible to continue training the vectors loaded from the C format because hidden weights, - vocabulary frequency and the binary tree is missing: - - .. sourcecode:: pycon - - >>> from gensim.models.keyedvectors import KeyedVectors - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.txt', binary=False) # C text format - >>> word_vectors = KeyedVectors.load_word2vec_format('/tmp/vectors.bin', binary=True) # C binary format - - -You can perform various NLP word tasks with the model. Some of them -are already built-in: - -.. sourcecode:: pycon - - >>> model.wv.most_similar(positive=['woman', 'king'], negative=['man']) - [('queen', 0.50882536), ...] - - >>> model.wv.most_similar_cosmul(positive=['woman', 'king'], negative=['man']) - [('queen', 0.71382287), ...] - - >>> model.wv.doesnt_match("breakfast cereal dinner lunch".split()) - 'cereal' - - >>> model.wv.similarity('woman', 'man') - 0.73723527 - -Probability of a text under the model: - -.. sourcecode:: pycon - - >>> model.score(["The fox jumped over a lazy dog".split()]) - 0.2158356 - -Correlation with human opinion on word similarity: - -.. sourcecode:: pycon - - >>> model.wv.evaluate_word_pairs(os.path.join(module_path, 'test_data','wordsim353.tsv')) - 0.51, 0.62, 0.13 - -And on analogies: - -.. 
sourcecode:: pycon - - >>> model.wv.accuracy(os.path.join(module_path, 'test_data', 'questions-words.txt')) - -and so on. - -If you're finished training a model (i.e. no more updates, only querying), -then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - -.. sourcecode:: pycon - - >>> word_vectors = model.wv - >>> del model - -to trim unneeded model memory = use much less RAM. - -Note that there is a :mod:`gensim.models.phrases` module which lets you automatically -detect phrases longer than one word. Using phrases, you can learn a word2vec model -where "words" are actually multiword expressions, such as `new_york_times` or `financial_crisis`: - -.. sourcecode:: pycon - - >>> bigram_transformer = gensim.models.Phrases(sentences) - >>> model = Word2Vec(bigram_transformer[sentences], size=100, ...) - -.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean. - Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013. -.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean. - Distributed Representations of Words and Phrases and their Compositionality. In Proceedings of NIPS, 2013. -.. [3] Optimizing word2vec in gensim, http://radimrehurek.com/2013/09/word2vec-in-python-part-two-optimizing/ -""" -from __future__ import division # py3 "true division" - -import logging -import sys -import os -import heapq -from timeit import default_timer -from copy import deepcopy -from collections import defaultdict -import threading -import itertools -import warnings - -from gensim.utils import keep_vocab_item, call_on_class_only -from gensim.models.deprecated.keyedvectors import KeyedVectors, Vocab -from gensim.models.word2vec import Word2Vec as NewWord2Vec -from gensim.models.deprecated.old_saveload import SaveLoad - -try: - from queue import Queue, Empty -except ImportError: - from Queue import Queue, Empty - -from numpy import exp, log, dot, zeros, outer, random, dtype, float32 as REAL,\ - uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ - empty, sum as np_sum, ones, logaddexp - -from scipy.special import expit - -from gensim import utils -from gensim import matutils # utility fnc for pickling, common scipy operations etc -from six import iteritems, itervalues, string_types -from six.moves import range -from types import GeneratorType - -logger = logging.getLogger(__name__) - -MAX_WORDS_IN_BATCH = 10000 - - -def load_old_word2vec(*args, **kwargs): - old_model = Word2Vec.load(*args, **kwargs) - vector_size = getattr(old_model, 'vector_size', old_model.layer1_size) - params = { - 'size': vector_size, - 'alpha': old_model.alpha, - 'window': old_model.window, - 'min_count': old_model.min_count, - 'max_vocab_size': old_model.__dict__.get('max_vocab_size', None), - 'sample': old_model.__dict__.get('sample', 1e-3), - 'seed': old_model.seed, - 'workers': old_model.workers, - 'min_alpha': old_model.min_alpha, - 'sg': old_model.sg, - 'hs': old_model.hs, - 'negative': old_model.negative, - 'cbow_mean': old_model.cbow_mean, - 'hashfxn': old_model.__dict__.get('hashfxn', hash), - 'iter': old_model.__dict__.get('iter', 5), - 'null_word': old_model.__dict__.get('null_word', 0), - 'sorted_vocab': old_model.__dict__.get('sorted_vocab', 1), - 'batch_words': old_model.__dict__.get('batch_words', MAX_WORDS_IN_BATCH), - 'compute_loss': old_model.__dict__.get('compute_loss', None) - } - new_model = NewWord2Vec(**params) - # set trainables attributes - new_model.wv.vectors = old_model.wv.syn0 - if hasattr(old_model.wv, 
'syn0norm'): - new_model.wv.vectors_norm = old_model.wv.syn0norm - if hasattr(old_model, 'syn1'): - new_model.trainables.syn1 = old_model.syn1 - if hasattr(old_model, 'syn1neg'): - new_model.trainables.syn1neg = old_model.syn1neg - if hasattr(old_model, 'syn0_lockf'): - new_model.trainables.vectors_lockf = old_model.syn0_lockf - # set vocabulary attributes - new_model.wv.vocab = old_model.wv.vocab - new_model.wv.index2word = old_model.wv.index2word - new_model.vocabulary.cum_table = old_model.__dict__.get('cum_table', None) - - new_model.train_count = old_model.__dict__.get('train_count', None) - new_model.corpus_count = old_model.__dict__.get('corpus_count', None) - new_model.corpus_total_words = old_model.__dict__.get('corpus_total_words', None) - new_model.running_training_loss = old_model.__dict__.get('running_training_loss', 0) - new_model.total_train_time = old_model.__dict__.get('total_train_time', None) - new_model.min_alpha_yet_reached = old_model.__dict__.get('min_alpha_yet_reached', old_model.alpha) - new_model.model_trimmed_post_training = old_model.__dict__.get('model_trimmed_post_training', None) - - return new_model - - -def train_batch_sg(model, sentences, alpha, work=None, compute_loss=False): - """ - Update skip-gram model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. - - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - - # now go over all words from the (reduced) window, predicting each one in turn - start = max(0, pos - model.window + reduced_window) - for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start): - # don't train on the `word` itself - if pos2 != pos: - train_sg_pair( - model, model.wv.index2word[word.index], word2.index, alpha, compute_loss=compute_loss - ) - - result += len(word_vocabs) - return result - - -def train_batch_cbow(model, sentences, alpha, work=None, neu1=None, compute_loss=False): - """ - Update CBOW model by training on a sequence of sentences. - - Each sentence is a list of string tokens, which are looked up in the model's - vocab dictionary. Called internally from `Word2Vec.train()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. 
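A pure-Python sketch of the (center, context) pair generation performed by `train_batch_sg` above, using the same randomly reduced window (`b` in the original C code); vocabulary lookup and subsampling are left out for brevity.

.. sourcecode:: pycon

    >>> import random
    >>> def sg_pairs(sentence, window=5):
    ...     pairs = []
    ...     for pos, word in enumerate(sentence):
    ...         reduced = random.randint(0, window - 1)        # randomly shrink the effective window
    ...         start = max(0, pos - window + reduced)
    ...         for pos2, word2 in enumerate(sentence[start:pos + window + 1 - reduced], start):
    ...             if pos2 != pos:                            # never pair the center word with itself
    ...                 pairs.append((word, word2))
    ...     return pairs
    ...
    >>> pairs = sg_pairs('the quick brown fox jumps over the lazy dog'.split(), window=2)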
- - """ - result = 0 - for sentence in sentences: - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab - and model.wv.vocab[w].sample_int > model.random.rand() * 2**32] - for pos, word in enumerate(word_vocabs): - reduced_window = model.random.randint(model.window) # `b` in the original word2vec code - start = max(0, pos - model.window + reduced_window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x vector_size - if word2_indices and model.cbow_mean: - l1 /= len(word2_indices) - train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss) - result += len(word_vocabs) - return result - - -def score_sentence_sg(model, sentence, work=None): - """ - Obtain likelihood score for a single sentence in a fitted skip-gram representaion. - - The sentence is a list of Vocab objects (or None, when the corresponding - word is not in the vocabulary). Called internally from `Word2Vec.score()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. - - """ - log_prob_sentence = 0.0 - if model.negative: - raise RuntimeError("scoring is only available for HS=True") - - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] - for pos, word in enumerate(word_vocabs): - if word is None: - continue # OOV word in the input sentence => skip - - # now go over all words from the window, predicting each one in turn - start = max(0, pos - model.window) - for pos2, word2 in enumerate(word_vocabs[start: pos + model.window + 1], start): - # don't train on OOV words and on the `word` itself - if word2 is not None and pos2 != pos: - log_prob_sentence += score_sg_pair(model, word, word2) - - return log_prob_sentence - - -def score_sentence_cbow(model, sentence, work=None, neu1=None): - """ - Obtain likelihood score for a single sentence in a fitted CBOW representaion. - - The sentence is a list of Vocab objects (or None, where the corresponding - word is not in the vocabulary. Called internally from `Word2Vec.score()`. - - This is the non-optimized, Python version. If you have cython installed, gensim - will use the optimized version from word2vec_inner instead. 
- - """ - log_prob_sentence = 0.0 - if model.negative: - raise RuntimeError("scoring is only available for HS=True") - - word_vocabs = [model.wv.vocab[w] for w in sentence if w in model.wv.vocab] - for pos, word in enumerate(word_vocabs): - if word is None: - continue # OOV word in the input sentence => skip - - start = max(0, pos - model.window) - window_pos = enumerate(word_vocabs[start:(pos + model.window + 1)], start) - word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)] - l1 = np_sum(model.wv.syn0[word2_indices], axis=0) # 1 x layer1_size - if word2_indices and model.cbow_mean: - l1 /= len(word2_indices) - log_prob_sentence += score_cbow_pair(model, word, l1) - - return log_prob_sentence - - -def train_sg_pair(model, word, context_index, alpha, learn_vectors=True, learn_hidden=True, - context_vectors=None, context_locks=None, compute_loss=False, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - if word not in model.wv.vocab: - return - predict_word = model.wv.vocab[word] # target word (NN output) - - if is_ft: - l1_vocab = context_vectors_vocab[context_index[0]] - l1_ngrams = np_sum(context_vectors_ngrams[context_index[1:]], axis=0) - if context_index: - l1 = np_sum([l1_vocab, l1_ngrams], axis=0) / len(context_index) - else: - l1 = context_vectors[context_index] # input word (NN input/projection layer) - lock_factor = context_locks[context_index] - - neu1e = zeros(l1.shape) - - if model.hs: - # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance) - l2a = deepcopy(model.syn1[predict_word.point]) # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1 - predict_word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[predict_word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0)**predict_word.code # `ch` function, 0 -> 1, 1 -> -1 - lprob = -log(expit(-sgn * prod_term)) - model.running_training_loss += sum(lprob) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [predict_word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != predict_word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - if is_ft: - 
model.wv.syn0_vocab[context_index[0]] += neu1e * context_locks_vocab[context_index[0]] - for i in context_index[1:]: - model.wv.syn0_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - l1 += neu1e * lock_factor # learn input -> hidden (mutates model.wv.syn0[word2.index], if that is l1) - return neu1e - - -def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, - compute_loss=False, context_vectors=None, context_locks=None, is_ft=False): - if context_vectors is None: - if is_ft: - context_vectors_vocab = model.wv.syn0_vocab - context_vectors_ngrams = model.wv.syn0_ngrams - else: - context_vectors = model.wv.syn0 - if context_locks is None: - if is_ft: - context_locks_vocab = model.syn0_vocab_lockf - context_locks_ngrams = model.syn0_ngrams_lockf - else: - context_locks = model.syn0_lockf - - neu1e = zeros(l1.shape) - - if model.hs: - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - prod_term = dot(l1, l2a.T) - fa = expit(prod_term) # propagate hidden -> output - ga = (1. - word.code - fa) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1[word.point] += outer(ga, l1) # learn hidden -> output - neu1e += dot(ga, l2a) # save error - - # loss component corresponding to hierarchical softmax - if compute_loss: - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - model.running_training_loss += sum(-log(expit(-sgn * prod_term))) - - if model.negative: - # use this word (label = 1) + `negative` other random words not from this sentence (label = 0) - word_indices = [word.index] - while len(word_indices) < model.negative + 1: - w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1])) - if w != word.index: - word_indices.append(w) - l2b = model.syn1neg[word_indices] # 2d matrix, k+1 x layer1_size - prod_term = dot(l1, l2b.T) - fb = expit(prod_term) # propagate hidden -> output - gb = (model.neg_labels - fb) * alpha # vector of error gradients multiplied by the learning rate - if learn_hidden: - model.syn1neg[word_indices] += outer(gb, l1) # learn hidden -> output - neu1e += dot(gb, l2b) # save error - - # loss component corresponding to negative sampling - if compute_loss: - model.running_training_loss -= sum(log(expit(-1 * prod_term[1:]))) # for the sampled words - model.running_training_loss -= log(expit(prod_term[0])) # for the output word - - if learn_vectors: - # learn input -> hidden, here for all words in the window separately - if is_ft: - if not model.cbow_mean and input_word_indices: - neu1e /= (len(input_word_indices[0]) + len(input_word_indices[1])) - for i in input_word_indices[0]: - context_vectors_vocab[i] += neu1e * context_locks_vocab[i] - for i in input_word_indices[1]: - context_vectors_ngrams[i] += neu1e * context_locks_ngrams[i] - else: - if not model.cbow_mean and input_word_indices: - neu1e /= len(input_word_indices) - for i in input_word_indices: - context_vectors[i] += neu1e * context_locks[i] - - return neu1e - - -def score_sg_pair(model, word, word2): - l1 = model.wv.syn0[word2.index] - l2a = deepcopy(model.syn1[word.point]) # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -def score_cbow_pair(model, word, l1): - l2a = model.syn1[word.point] # 2d matrix, codelen x layer1_size - sgn = (-1.0)**word.code # ch function, 0-> 1, 1 -> -1 - lprob = -logaddexp(0, -sgn * dot(l1, l2a.T)) - return sum(lprob) - - -class Word2Vec(SaveLoad): 
- """ - Class for training, using and evaluating neural networks described in https://code.google.com/p/word2vec/ - - If you're finished training a model (=no more updates, only querying) - then switch to the :mod:`gensim.models.KeyedVectors` instance in wv - - The model can be stored/loaded via its `save()` and `load()` methods, or stored/loaded in a format - compatible with the original word2vec implementation via `wv.save_word2vec_format()` - and `KeyedVectors.load_word2vec_format()`. - - """ - - def __init__(self, sentences=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, - trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False): - """ - Initialize the model from an iterable of `sentences`. Each sentence is a - list of words (unicode strings) that will be used for training. - - The `sentences` iterable can be simply a list, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`BrownCorpus`, :class:`Text8Corpus` or :class:`LineSentence` in - this module for such examples. - - If you don't supply `sentences`, the model is left uninitialized -- use if - you plan to initialize it in some other way. - - `sg` defines the training algorithm. By default (`sg=0`), CBOW is used. - Otherwise (`sg=1`), skip-gram is employed. - - `size` is the dimensionality of the feature vectors. - - `window` is the maximum distance between the current and predicted word within a sentence. - - `alpha` is the initial learning rate (will linearly drop to `min_alpha` as training progresses). - - `seed` = for the random number generator. Initial vectors for each - word are seeded with a hash of the concatenation of word + str(seed). - Note that for a fully deterministically-reproducible run, you must also limit the model to - a single worker thread, to eliminate ordering jitter from OS thread scheduling. (In Python - 3, reproducibility between interpreter launches also requires use of the PYTHONHASHSEED - environment variable to control hash randomization.) - - `min_count` = ignore all words with total frequency lower than this. - - `max_vocab_size` = limit RAM during vocabulary building; if there are more unique - words than this, then prune the infrequent ones. Every 10 million word types - need about 1GB of RAM. Set to `None` for no limit (default). - - `sample` = threshold for configuring which higher-frequency words are randomly downsampled; - default is 1e-3, useful range is (0, 1e-5). - - `workers` = use this many worker threads to train the model (=faster training with multicore machines). - - `hs` = if 1, hierarchical softmax will be used for model training. - If set to 0 (default), and `negative` is non-zero, negative sampling will be used. - - `negative` = if > 0, negative sampling will be used, the int for negative - specifies how many "noise words" should be drawn (usually between 5-20). - Default is 5. If set to 0, no negative samping is used. - - `cbow_mean` = if 0, use the sum of the context word vectors. If 1 (default), use the mean. - Only applies when cbow is used. - - `hashfxn` = hash function to use to randomly initialize weights, for increased - training reproducibility. Default is Python's rudimentary built in hash function. - - `iter` = number of iterations (epochs) over the corpus. Default is 5. 
- - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. - Note: The rule, if given, is only used to prune vocabulary during build_vocab() and is not stored as part - of the model. - - `sorted_vocab` = if 1 (default), sort the vocabulary by descending frequency before - assigning word indexes. - - `batch_words` = target size (in words) for batches of examples passed to worker threads (and - thus cython routines). Default is 10000. (Larger batches will be passed if individual - texts are longer than 10000 words, but the standard cython code truncates to that maximum.) - - """ - - self.load = call_on_class_only - - self.initialize_word_vectors() - self.sg = int(sg) - self.cum_table = None # for negative sampling - self.vector_size = int(size) - self.layer1_size = int(size) - if size % 4 != 0: - logger.warning("consider setting layer size to a multiple of 4 for greater performance") - self.alpha = float(alpha) - self.min_alpha_yet_reached = float(alpha) # To warn user if alpha increases - self.window = int(window) - self.max_vocab_size = max_vocab_size - self.seed = seed - self.random = random.RandomState(seed) - self.min_count = min_count - self.sample = sample - self.workers = int(workers) - self.min_alpha = float(min_alpha) - self.hs = hs - self.negative = negative - self.cbow_mean = int(cbow_mean) - self.hashfxn = hashfxn - self.iter = iter - self.null_word = null_word - self.train_count = 0 - self.total_train_time = 0 - self.sorted_vocab = sorted_vocab - self.batch_words = batch_words - self.model_trimmed_post_training = False - self.compute_loss = compute_loss - self.running_training_loss = 0 - if sentences is not None: - if isinstance(sentences, GeneratorType): - raise TypeError("You can't pass a generator as the sentences argument. Try an iterator.") - self.build_vocab(sentences, trim_rule=trim_rule) - self.train( - sentences, total_examples=self.corpus_count, epochs=self.iter, - start_alpha=self.alpha, end_alpha=self.min_alpha - ) - else: - if trim_rule is not None: - logger.warning( - "The rule, if given, is only used to prune vocabulary during build_vocab() " - "and is not stored as part of the model. Model initialized without sentences. " - "trim_rule provided, if any, will be ignored." - ) - - def initialize_word_vectors(self): - self.wv = KeyedVectors() - - def make_cum_table(self, power=0.75, domain=2**31 - 1): - """ - Create a cumulative-distribution table using stored vocabulary word counts for - drawing random words in the negative-sampling training routines. - - To draw a word index, choose a random integer up to the maximum value in the - table (cum_table[-1]), then finding that integer's sorted insertion point - (as if by bisect_left or ndarray.searchsorted()). That insertion point is the - drawn index, coming up in proportion equal to the increment at that slot. - - Called internally from 'build_vocab()'. 
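A NumPy sketch of the cumulative-distribution table described above and of the `searchsorted` draw it enables, with toy counts; the 0.75 power and the 2**31 - 1 domain mirror the implementation that follows.

.. sourcecode:: pycon

    >>> import numpy as np
    >>> counts = np.array([50, 20, 10, 5], dtype=np.float64)    # toy word counts, most frequent first
    >>> powered = counts ** 0.75                                 # smoothing power from the word2vec paper
    >>> domain = 2**31 - 1
    >>> cum_table = np.round(np.cumsum(powered) / powered.sum() * domain).astype(np.uint32)
    >>> rng = np.random.default_rng(0)
    >>> drawn = int(cum_table.searchsorted(rng.integers(cum_table[-1])))  # index drawn in proportion to count**0.75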
- """ - vocab_size = len(self.wv.index2word) - self.cum_table = zeros(vocab_size, dtype=uint32) - # compute sum of all power (Z in paper) - train_words_pow = 0.0 - for word_index in range(vocab_size): - train_words_pow += self.wv.vocab[self.wv.index2word[word_index]].count**power - cumulative = 0.0 - for word_index in range(vocab_size): - cumulative += self.wv.vocab[self.wv.index2word[word_index]].count**power - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) - if len(self.cum_table) > 0: - assert self.cum_table[-1] == domain - - def create_binary_tree(self): - """ - Create a binary Huffman tree using stored vocabulary word counts. Frequent words - will have shorter binary codes. Called internally from `build_vocab()`. - - """ - logger.info("constructing a huffman tree from %i words", len(self.wv.vocab)) - - # build the huffman tree - heap = list(itervalues(self.wv.vocab)) - heapq.heapify(heap) - for i in range(len(self.wv.vocab) - 1): - min1, min2 = heapq.heappop(heap), heapq.heappop(heap) - heapq.heappush( - heap, Vocab(count=min1.count + min2.count, index=i + len(self.wv.vocab), left=min1, right=min2) - ) - - # recurse over the tree, assigning a binary code to each vocabulary word - if heap: - max_depth, stack = 0, [(heap[0], [], [])] - while stack: - node, codes, points = stack.pop() - if node.index < len(self.wv.vocab): - # leaf node => store its path from the root - node.code, node.point = codes, points - max_depth = max(len(codes), max_depth) - else: - # inner node => continue recursion - points = array(list(points) + [node.index - len(self.wv.vocab)], dtype=uint32) - stack.append((node.left, array(list(codes) + [0], dtype=uint8), points)) - stack.append((node.right, array(list(codes) + [1], dtype=uint8), points)) - - logger.info("built huffman tree with maximum node depth %i", max_depth) - - def build_vocab(self, sentences, keep_raw_vocab=False, trim_rule=None, progress_per=10000, update=False): - """ - Build vocabulary from a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - """ - self.scan_vocab(sentences, progress_per=progress_per, trim_rule=trim_rule) # initial survey - # trim by min_count & precalculate downsampling - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - self.finalize_vocab(update=update) # build tables & arrays - - def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): - """ - Build vocabulary from a dictionary of word frequencies. - Build model vocabulary from a passed dictionary that contains (word,word count). - Words must be of type unicode strings. - - Parameters - ---------- - `word_freq` : dict - Word,Word_Count dictionary. - `keep_raw_vocab` : bool - If not true, delete the raw vocabulary after the scaling is done and free up RAM. - `corpus_count`: int - Even if no corpus is provided, this argument can set corpus_count explicitly. - `trim_rule` = vocabulary trimming rule, specifies whether certain words should remain - in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). - Can be None (min_count will be used), or a callable that accepts parameters (word, count, min_count) and - returns either `utils.RULE_DISCARD`, `utils.RULE_KEEP` or `utils.RULE_DEFAULT`. - `update`: bool - If true, the new provided words in `word_freq` dict will be added to model's vocab. - - Returns - -------- - None - - Examples - -------- - - .. 
sourcecode:: pycon - - >>> from gensim.models.word2vec import Word2Vec - >>> model = Word2Vec() - >>> model.build_vocab_from_freq({"Word1": 15, "Word2": 20}) - - """ - logger.info("Processing provided word frequencies") - # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) - # to be directly the raw vocab - raw_vocab = word_freq - logger.info( - "collected %i different raw word, with total frequency of %i", - len(raw_vocab), sum(itervalues(raw_vocab)) - ) - - # Since no sentences are provided, this is to control the corpus_count - self.corpus_count = corpus_count if corpus_count else 0 - self.raw_vocab = raw_vocab - - # trim by min_count & precalculate downsampling - self.scale_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) - self.finalize_vocab(update=update) # build tables & arrays - - def scan_vocab(self, sentences, progress_per=10000, trim_rule=None): - """Do an initial scan of all words appearing in sentences.""" - logger.info("collecting all words and their counts") - sentence_no = -1 - total_words = 0 - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): - if not checked_string_types: - if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). " - "First item here is instead plain %s.", - type(sentence) - ) - checked_string_types += 1 - if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab) - ) - for word in sentence: - vocab[word] += 1 - total_words += len(sentence) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(vocab), total_words, sentence_no + 1 - ) - self.corpus_count = sentence_no + 1 - self.raw_vocab = vocab - return total_words - - def scale_vocab(self, min_count=None, sample=None, dry_run=False, - keep_raw_vocab=False, trim_rule=None, update=False): - """ - Apply vocabulary settings for `min_count` (discarding less-frequent words) - and `sample` (controlling the downsampling of more-frequent words). - - Calling with `dry_run=True` will only simulate the provided settings and - report the size of the retained vocabulary, effective corpus length, and - estimated memory requirements. Results are both printed via logging and - returned as a dict. - - Delete the raw vocabulary after the scaling is done to free up RAM, - unless `keep_raw_vocab` is set. 
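The downsampling decision described above reduces to a per-word keep probability; a sketch of the formula applied in the loop below, with toy counts and `sample=1e-3`:

.. sourcecode:: pycon

    >>> from math import sqrt
    >>> sample, retain_total = 1e-3, 1000000                 # toy corpus: one million retained words
    >>> threshold_count = sample * retain_total
    >>> def keep_probability(count):
    ...     p = (sqrt(count / threshold_count) + 1) * (threshold_count / count)
    ...     return min(p, 1.0)                               # frequent words get p < 1 and are randomly dropped
    ...
    >>> p_frequent, p_rare = keep_probability(50000), keep_probability(500)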
- - """ - min_count = min_count or self.min_count - sample = sample or self.sample - drop_total = drop_unique = 0 - - if not update: - logger.info("Loading a fresh vocabulary") - retain_total, retain_words = 0, [] - # Discard words less-frequent than min_count - if not dry_run: - self.wv.index2word = [] - # make stored settings match these applied settings - self.min_count = min_count - self.sample = sample - self.wv.vocab = {} - - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, min_count, trim_rule=trim_rule): - retain_words.append(word) - retain_total += v - if not dry_run: - self.wv.vocab[word] = Vocab(count=v, index=len(self.wv.index2word)) - self.wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) - original_total = retain_total + drop_total - retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - min_count, retain_total, retain_pct, original_total, drop_total - ) - else: - logger.info("Updating model with new vocabulary") - new_total = pre_exist_total = 0 - new_words = pre_exist_words = [] - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, min_count, trim_rule=trim_rule): - if word in self.wv.vocab: - pre_exist_words.append(word) - pre_exist_total += v - if not dry_run: - self.wv.vocab[word].count += v - else: - new_words.append(word) - new_total += v - if not dry_run: - self.wv.vocab[word] = Vocab(count=v, index=len(self.wv.index2word)) - self.wv.index2word.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) - retain_words = new_words + pre_exist_words - retain_total = new_total + pre_exist_total - - # Precalculate each vocabulary item's threshold for sampling - if not sample: - # no words downsampled - threshold_count = retain_total - elif sample < 1.0: - # traditional meaning: set parameter as proportion of total - threshold_count = sample * retain_total - else: - # new shorthand: sample >= 1 means downsample all words with higher count than sample - threshold_count = int(sample * (3 + sqrt(5)) / 2) - - downsample_total, downsample_unique = 0, 0 - for w in retain_words: - v = self.raw_vocab[w] - word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) - if word_probability < 1.0: - downsample_unique += 1 - downsample_total += word_probability * v - else: - word_probability = 1.0 - downsample_total += v - if not dry_run: - self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) - - if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) - self.raw_vocab = defaultdict(int) - - logger.info("sample=%g downsamples %i 
most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) - - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'memory': self.estimate_memory(vocab_size=len(retain_words)) - } - - return report_values - - def finalize_vocab(self, update=False): - """Build tables and model weights based on final vocabulary settings.""" - if not self.wv.index2word: - self.scale_vocab() - if self.sorted_vocab and not update: - self.sort_vocab() - if self.hs: - # add info about each word's Huffman encoding - self.create_binary_tree() - if self.negative: - # build the table for drawing random words (for negative sampling) - self.make_cum_table() - if self.null_word: - # create null pseudo-word for padding when using concatenative L1 (run-of-words) - # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter - word, v = '\0', Vocab(count=1, sample_int=0) - v.index = len(self.wv.vocab) - self.wv.index2word.append(word) - self.wv.vocab[word] = v - # set initial input/projection and hidden weights - if not update: - self.reset_weights() - else: - self.update_weights() - - def sort_vocab(self): - """Sort the vocabulary so the most frequent words have the lowest indexes.""" - if len(self.wv.syn0): - raise RuntimeError("cannot sort vocabulary after model weights already initialized.") - self.wv.index2word.sort(key=lambda word: self.wv.vocab[word].count, reverse=True) - for i, word in enumerate(self.wv.index2word): - self.wv.vocab[word].index = i - - def reset_from(self, other_model): - """ - Borrow shareable pre-built structures (like vocab) from the other_model. Useful - if testing multiple models in parallel on the same corpus. - """ - self.wv.vocab = other_model.wv.vocab - self.wv.index2word = other_model.wv.index2word - self.cum_table = other_model.cum_table - self.corpus_count = other_model.corpus_count - self.reset_weights() - - def _do_train_job(self, sentences, alpha, inits): - """ - Train a single batch of sentences. Return 2-tuple `(effective word count after - ignoring unknown words and sentence length trimming, total word count)`. - """ - work, neu1 = inits - tally = 0 - if self.sg: - tally += train_batch_sg(self, sentences, alpha, work, self.compute_loss) - else: - tally += train_batch_cbow(self, sentences, alpha, work, neu1, self.compute_loss) - return tally, self._raw_word_count(sentences) - - def _raw_word_count(self, job): - """Return the number of words in a given job.""" - return sum(len(sentence) for sentence in job) - - def train(self, sentences, total_examples=None, total_words=None, - epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=None): - """ - Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). - For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.) - - To support linear learning-rate decay from (initial) alpha to min_alpha, and accurate - progres-percentage logging, either total_examples (count of sentences) or total_words (count of - raw words in sentences) MUST be provided. 
(If the corpus is the same as was provided to - `build_vocab()`, the count of examples in that corpus will be available in the model's - `corpus_count` property.) - - To avoid common mistakes around the model's ability to do multiple training passes itself, an - explicit `epochs` argument MUST be provided. In the common and recommended case, where `train()` - is only called once, the model's cached `iter` value should be supplied as `epochs` value. - """ - if self.model_trimmed_post_training: - raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method") - - if compute_loss: - self.compute_loss = compute_loss - self.running_training_loss = 0 - - logger.info( - "training model with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s negative=%s window=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, - self.hs, self.sample, self.negative, self.window - ) - - if not self.wv.vocab: - raise RuntimeError("you must first build vocabulary before training the model") - if not len(self.wv.syn0): - raise RuntimeError("you must first finalize vocabulary before training the model") - - if not hasattr(self, 'corpus_count'): - raise ValueError( - "The number of sentences in the training corpus is missing. " - "Did you load the model via KeyedVectors.load_word2vec_format?" - "Models loaded via load_word2vec_format don't support further training. " - "Instead start with a blank model, scan_vocab on the new corpus, " - "intersect_word2vec_format with the old model, then train." - ) - - if total_words is None and total_examples is None: - raise ValueError( - "You must specify either total_examples or total_words, for proper alpha and progress calculations. " - "The usual value is total_examples=model.corpus_count." - ) - if epochs is None: - raise ValueError("You must specify an explict epochs count. The usual value is epochs=model.iter.") - start_alpha = start_alpha or self.alpha - end_alpha = end_alpha or self.min_alpha - - job_tally = 0 - - if epochs > 1: - sentences = utils.RepeatCorpusNTimes(sentences, epochs) - total_words = total_words and total_words * epochs - total_examples = total_examples and total_examples * epochs - - def worker_loop(): - """Train the model, lifting lists of sentences from the job_queue.""" - work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - jobs_processed = 0 - while True: - job = job_queue.get() - if job is None: - progress_queue.put(None) - break # no more jobs => quit this worker - sentences, alpha = job - tally, raw_tally = self._do_train_job(sentences, alpha, (work, neu1)) - progress_queue.put((len(sentences), tally, raw_tally)) # report back progress - jobs_processed += 1 - logger.debug("worker exiting, processed %i jobs", jobs_processed) - - def job_producer(): - """Fill jobs queue using the input `sentences` iterator.""" - job_batch, batch_size = [], 0 - pushed_words, pushed_examples = 0, 0 - next_alpha = start_alpha - if next_alpha > self.min_alpha_yet_reached: - logger.warning("Effective 'alpha' higher than previous training cycles") - self.min_alpha_yet_reached = next_alpha - job_no = 0 - - for sent_idx, sentence in enumerate(sentences): - sentence_length = self._raw_word_count([sentence]) - - # can we fit this sentence into the existing job batch? 
- if batch_size + sentence_length <= self.batch_words: - # yes => add it to the current job - job_batch.append(sentence) - batch_size += sentence_length - else: - # no => submit the existing job - logger.debug( - "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) - job_no += 1 - job_queue.put((job_batch, next_alpha)) - - # update the learning rate for the next job - if end_alpha < next_alpha: - if total_examples: - # examples-based decay - pushed_examples += len(job_batch) - progress = 1.0 * pushed_examples / total_examples - else: - # words-based decay - pushed_words += self._raw_word_count(job_batch) - progress = 1.0 * pushed_words / total_words - next_alpha = start_alpha - (start_alpha - end_alpha) * progress - next_alpha = max(end_alpha, next_alpha) - - # add the sentence that didn't fit as the first item of a new job - job_batch, batch_size = [sentence], sentence_length - - # add the last job too (may be significantly smaller than batch_words) - if job_batch: - logger.debug( - "queueing job #%i (%i words, %i sentences) at alpha %.05f", - job_no, batch_size, len(job_batch), next_alpha - ) - job_no += 1 - job_queue.put((job_batch, next_alpha)) - - if job_no == 0 and self.train_count == 0: - logger.warning( - "train() called with an empty iterator (if not intended, " - "be sure to provide a corpus that offers restartable iteration = an iterable)." - ) - - # give the workers heads up that they can finish -- no more work! - for _ in range(self.workers): - job_queue.put(None) - logger.debug("job loop exiting, total %i jobs", job_no) - - # buffer ahead only a limited number of jobs.. this is the reason we can't simply use ThreadPool :( - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] - unfinished_worker_count = len(workers) - workers.append(threading.Thread(target=job_producer)) - - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - example_count, trained_word_count, raw_word_count = 0, 0, word_count - start, next_report = default_timer() - 0.00001, 1.0 - - while unfinished_worker_count > 0: - report = progress_queue.get() # blocks if workers too slow - if report is None: # a thread reporting that it finished - unfinished_worker_count -= 1 - logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) - continue - examples, trained_words, raw_words = report - job_tally += 1 - - # update progress stats - example_count += examples - trained_word_count += trained_words # only words in vocab & sampled - raw_word_count += raw_words - - # log progress once every report_delay seconds - elapsed = default_timer() - start - if elapsed >= next_report: - if total_examples: - # examples-based progress % - logger.info( - "PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * example_count / total_examples, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) - else: - # words-based progress % - logger.info( - "PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", - 100.0 * raw_word_count / total_words, trained_word_count / elapsed, - utils.qsize(job_queue), utils.qsize(progress_queue) - ) - next_report = elapsed + report_delay - - # all done; report the final stats - elapsed = default_timer() - start - 
logger.info( - "training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", - raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed - ) - if job_tally < 10 * self.workers: - logger.warning( - "under 10 jobs per worker: consider setting a smaller `batch_words' for smoother alpha decay" - ) - - # check that the input corpus hasn't changed during iteration - if total_examples and total_examples != example_count: - logger.warning( - "supplied example count (%i) did not equal expected count (%i)", example_count, total_examples - ) - if total_words and total_words != raw_word_count: - logger.warning( - "supplied raw word count (%i) did not equal expected count (%i)", raw_word_count, total_words - ) - - self.train_count += 1 # number of times train() has been called - self.total_train_time += elapsed - self.clear_sims() - return trained_word_count - - # basics copied from the train() function - def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): - """ - Score the log probability for a sequence of sentences (can be a once-only generator stream). - Each sentence must be a list of unicode strings. - This does not change the fitted model in any way (see Word2Vec.train() for that). - - We have currently only implemented score for the hierarchical softmax scheme, - so you need to have run word2vec with hs=1 and negative=0 for this to work. - - Note that you should specify total_sentences; we'll run into problems if you ask to - score more than this number of sentences but it is inefficient to set the value too high. - - See the article by [#taddy]_ and the gensim demo at [#deepir]_ for examples of - how to use such scores in document classification. - - .. [#taddy] Taddy, Matt. Document Classification by Inversion of Distributed Language Representations, - in Proceedings of the 2015 Conference of the Association of Computational Linguistics. - .. [#deepir] https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/deepir.ipynb - - """ - logger.info( - "scoring sentences with %i workers on %i vocabulary and %i features, " - "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, self.sample, self.negative - ) - - if not self.wv.vocab: - raise RuntimeError("you must first build vocabulary before scoring new data") - - if not self.hs: - raise RuntimeError( - "We have currently only implemented score for the hierarchical softmax scheme, " - "so you need to have run word2vec with hs=1 and negative=0 for this to work." - ) - - def worker_loop(): - """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" - work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) - neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) - while True: - job = job_queue.get() - if job is None: # signal to finish - break - ns = 0 - for sentence_id, sentence in job: - if sentence_id >= total_sentences: - break - if self.sg: - score = score_sentence_sg(self, sentence, work) - else: - score = score_sentence_cbow(self, sentence, work, neu1) - sentence_scores[sentence_id] = score - ns += 1 - progress_queue.put(ns) # report progress - - start, next_report = default_timer(), 1.0 - # buffer ahead only a limited number of jobs.. 
this is the reason we can't simply use ThreadPool :( - job_queue = Queue(maxsize=queue_factor * self.workers) - progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) - - workers = [threading.Thread(target=worker_loop) for _ in range(self.workers)] - for thread in workers: - thread.daemon = True # make interrupting the process with ctrl+c easier - thread.start() - - sentence_count = 0 - sentence_scores = matutils.zeros_aligned(total_sentences, dtype=REAL) - - push_done = False - done_jobs = 0 - jobs_source = enumerate(utils.grouper(enumerate(sentences), chunksize)) - - # fill jobs queue with (id, sentence) job items - while True: - try: - job_no, items = next(jobs_source) - if (job_no - 1) * chunksize > total_sentences: - logger.warning( - "terminating after %i sentences (set higher total_sentences if you want more).", - total_sentences - ) - job_no -= 1 - raise StopIteration() - logger.debug("putting job #%i in the queue", job_no) - job_queue.put(items) - except StopIteration: - logger.info("reached end of input; waiting to finish %i outstanding jobs", job_no - done_jobs + 1) - for _ in range(self.workers): - job_queue.put(None) # give the workers heads up that they can finish -- no more work! - push_done = True - try: - while done_jobs < (job_no + 1) or not push_done: - ns = progress_queue.get(push_done) # only block after all jobs pushed - sentence_count += ns - done_jobs += 1 - elapsed = default_timer() - start - if elapsed >= next_report: - logger.info( - "PROGRESS: at %.2f%% sentences, %.0f sentences/s", - 100.0 * sentence_count, sentence_count / elapsed - ) - next_report = elapsed + report_delay # don't flood log, wait report_delay seconds - else: - # loop ended by job count; really done - break - except Empty: - pass # already out of loop; continue to next push - - elapsed = default_timer() - start - self.clear_sims() - logger.info( - "scoring %i sentences took %.1fs, %.0f sentences/s", - sentence_count, elapsed, sentence_count / elapsed - ) - return sentence_scores[:sentence_count] - - def clear_sims(self): - """ - Removes all L2-normalized vectors for words from the model. - You will have to recompute them using init_sims method. - """ - - self.wv.syn0norm = None - - def update_weights(self): - """ - Copy all the existing weights, and reset the weights for the newly - added vocabulary. - """ - logger.info("updating layer weights") - gained_vocab = len(self.wv.vocab) - len(self.wv.syn0) - newsyn0 = empty((gained_vocab, self.vector_size), dtype=REAL) - - # randomize the remaining words - for i in range(len(self.wv.syn0), len(self.wv.vocab)): - # construct deterministic seed from word AND seed argument - newsyn0[i - len(self.wv.syn0)] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) - - # Raise an error if an online update is run before initial training on a corpus - if not len(self.wv.syn0): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." 
- ) - - self.wv.syn0 = vstack([self.wv.syn0, newsyn0]) - - if self.hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - if self.negative: - self.syn1neg = vstack([self.syn1neg, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - self.wv.syn0norm = None - - # do not suppress learning for already learned words - self.syn0_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning - - def reset_weights(self): - """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" - logger.info("resetting layer weights") - self.wv.syn0 = empty((len(self.wv.vocab), self.vector_size), dtype=REAL) - # randomize weights vector by vector, rather than materializing a huge random matrix in RAM at once - for i in range(len(self.wv.vocab)): - # construct deterministic seed from word AND seed argument - self.wv.syn0[i] = self.seeded_vector(self.wv.index2word[i] + str(self.seed)) - if self.hs: - self.syn1 = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) - if self.negative: - self.syn1neg = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) - self.wv.syn0norm = None - - self.syn0_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning - - def seeded_vector(self, seed_string): - """Create one 'random' vector (but deterministic by seed_string)""" - # Note: built-in hash() may vary by Python version or even (in Py3.x) per launch - once = random.RandomState(self.hashfxn(seed_string) & 0xffffffff) - return (once.rand(self.vector_size) - 0.5) / self.vector_size - - def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='utf8', unicode_errors='strict'): - """ - Merge the input-hidden weight matrix from the original C word2vec-tool format - given, where it intersects with the current vocabulary. (No words are added to the - existing vocabulary, but intersecting words adopt the file's weights, and - non-intersecting words are left alone.) - - `binary` is a boolean indicating whether the data is in binary word2vec format. - - `lockf` is a lock-factor value to be set for any imported word-vectors; the - default value of 0.0 prevents further updating of the vector during subsequent - training. Use 1.0 to allow further training updates of merged vectors. - """ - overlap_count = 0 - logger.info("loading projection weights from %s", fname) - with utils.open(fname, 'rb') as fin: - header = utils.to_unicode(fin.readline(), encoding=encoding) - vocab_size, vector_size = (int(x) for x in header.split()) # throws for invalid file format - if not vector_size == self.vector_size: - raise ValueError("incompatible vector size %d in file %s" % (vector_size, fname)) - # TOCONSIDER: maybe mismatched vectors still useful enough to merge (truncating/padding)? 
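        # The branch below handles the two on-disk word2vec formats. In the binary format,
        # each entry is the word's text terminated by a single space and followed by
        # `vector_size` float32 values (stray newlines before a word are skipped); in the
        # text format, each line is the word followed by `vector_size` ASCII floats.
        # In both cases only words already present in self.wv.vocab are overwritten, and
        # their lock-factor is set to `lockf` (e.g. lockf=1.0 keeps the merged vectors
        # trainable in subsequent passes, while the default 0.0 freezes them).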
- if binary: - binary_len = dtype(REAL).itemsize * vector_size - for _ in range(vocab_size): - # mixed text and binary: read text first, then binary - word = [] - while True: - ch = fin.read(1) - if ch == b' ': - break - if ch != b'\n': # ignore newlines in front of words (some binary files have) - word.append(ch) - word = utils.to_unicode(b''.join(word), encoding=encoding, errors=unicode_errors) - weights = fromstring(fin.read(binary_len), dtype=REAL) - if word in self.wv.vocab: - overlap_count += 1 - self.wv.syn0[self.wv.vocab[word].index] = weights - self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - else: - for line_no, line in enumerate(fin): - parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") - if len(parts) != vector_size + 1: - raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no) - word, weights = parts[0], [REAL(x) for x in parts[1:]] - if word in self.wv.vocab: - overlap_count += 1 - self.wv.syn0[self.wv.vocab[word].index] = weights - self.syn0_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0 stops further changes - logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.syn0.shape, fname) - - def most_similar(self, positive=None, negative=None, topn=10, restrict_vocab=None, indexer=None): - """ - Deprecated. Use self.wv.most_similar() instead. - Refer to the documentation for `gensim.models.KeyedVectors.most_similar` - """ - return self.wv.most_similar(positive, negative, topn, restrict_vocab, indexer) - - def wmdistance(self, document1, document2): - """ - Deprecated. Use self.wv.wmdistance() instead. - Refer to the documentation for `gensim.models.KeyedVectors.wmdistance` - """ - return self.wv.wmdistance(document1, document2) - - def most_similar_cosmul(self, positive=None, negative=None, topn=10): - """ - Deprecated. Use self.wv.most_similar_cosmul() instead. - Refer to the documentation for `gensim.models.KeyedVectors.most_similar_cosmul` - """ - return self.wv.most_similar_cosmul(positive, negative, topn) - - def similar_by_word(self, word, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_word() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similar_by_word` - """ - return self.wv.similar_by_word(word, topn, restrict_vocab) - - def similar_by_vector(self, vector, topn=10, restrict_vocab=None): - """ - Deprecated. Use self.wv.similar_by_vector() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similar_by_vector` - """ - return self.wv.similar_by_vector(vector, topn, restrict_vocab) - - def doesnt_match(self, words): - """ - Deprecated. Use self.wv.doesnt_match() instead. - Refer to the documentation for `gensim.models.KeyedVectors.doesnt_match` - """ - return self.wv.doesnt_match(words) - - def __getitem__(self, words): - """ - Deprecated. Use self.wv.__getitem__() instead. - Refer to the documentation for `gensim.models.KeyedVectors.__getitem__` - """ - return self.wv.__getitem__(words) - - def __contains__(self, word): - """ - Deprecated. Use self.wv.__contains__() instead. - Refer to the documentation for `gensim.models.KeyedVectors.__contains__` - """ - return self.wv.__contains__(word) - - def similarity(self, w1, w2): - """ - Deprecated. Use self.wv.similarity() instead. - Refer to the documentation for `gensim.models.KeyedVectors.similarity` - """ - return self.wv.similarity(w1, w2) - - def n_similarity(self, ws1, ws2): - """ - Deprecated. 
Use self.wv.n_similarity() instead. - Refer to the documentation for `gensim.models.KeyedVectors.n_similarity` - """ - return self.wv.n_similarity(ws1, ws2) - - def predict_output_word(self, context_words_list, topn=10): - """Report the probability distribution of the center word given the context words - as input to the trained model.""" - if not self.negative: - raise RuntimeError( - "We have currently only implemented predict_output_word for the negative sampling scheme, " - "so you need to have run word2vec with negative > 0 for this to work." - ) - - if not hasattr(self.wv, 'syn0') or not hasattr(self, 'syn1neg'): - raise RuntimeError("Parameters required for predicting the output words not found.") - - word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab] - if not word_vocabs: - warnings.warn("All the input context words are out-of-vocabulary for the current model.") - return None - - word2_indices = [word.index for word in word_vocabs] - - l1 = np_sum(self.wv.syn0[word2_indices], axis=0) - if word2_indices and self.cbow_mean: - l1 /= len(word2_indices) - - prob_values = exp(dot(l1, self.syn1neg.T)) # propagate hidden -> output and take softmax to get probabilities - prob_values /= sum(prob_values) - top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) - # returning the most probable output words with their probabilities - return [(self.wv.index2word[index1], prob_values[index1]) for index1 in top_indices] - - def init_sims(self, replace=False): - """ - init_sims() resides in KeyedVectors because it deals with syn0 mainly, but because syn1 is not an attribute - of KeyedVectors, it has to be deleted in this class, and the normalizing of syn0 happens inside of KeyedVectors - """ - if replace and hasattr(self, 'syn1'): - del self.syn1 - return self.wv.init_sims(replace) - - def estimate_memory(self, vocab_size=None, report=None): - """Estimate required memory for a model using current settings and provided vocabulary size.""" - vocab_size = vocab_size or len(self.wv.vocab) - report = report or {} - report['vocab'] = vocab_size * (700 if self.hs else 500) - report['syn0'] = vocab_size * self.vector_size * dtype(REAL).itemsize - if self.hs: - report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize - if self.negative: - report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize - report['total'] = sum(report.values()) - logger.info( - "estimated required memory for %i words and %i dimensions: %i bytes", - vocab_size, self.vector_size, report['total'] - ) - return report - - @staticmethod - def log_accuracy(section): - return KeyedVectors.log_accuracy(section) - - def accuracy(self, questions, restrict_vocab=30000, most_similar=None, case_insensitive=True): - most_similar = most_similar or KeyedVectors.most_similar - return self.wv.accuracy(questions, restrict_vocab, most_similar, case_insensitive) - - @staticmethod - def log_evaluate_word_pairs(pearson, spearman, oov, pairs): - """ - Deprecated. Use self.wv.log_evaluate_word_pairs() instead. - Refer to the documentation for `gensim.models.KeyedVectors.log_evaluate_word_pairs` - """ - return KeyedVectors.log_evaluate_word_pairs(pearson, spearman, oov, pairs) - - def evaluate_word_pairs(self, pairs, delimiter='\t', restrict_vocab=300000, - case_insensitive=True, dummy4unknown=False): - """ - Deprecated. Use self.wv.evaluate_word_pairs() instead. 
- Refer to the documentation for `gensim.models.KeyedVectors.evaluate_word_pairs` - """ - return self.wv.evaluate_word_pairs(pairs, delimiter, restrict_vocab, case_insensitive, dummy4unknown) - - def __str__(self): - return "%s(vocab=%s, size=%s, alpha=%s)" % ( - self.__class__.__name__, len(self.wv.index2word), self.vector_size, self.alpha - ) - - def _minimize_model(self, save_syn1=False, save_syn1neg=False, save_syn0_lockf=False): - warnings.warn( - "This method would be deprecated in the future. " - "Keep just_word_vectors = model.wv to retain just the KeyedVectors instance " - "for read-only querying of word vectors." - ) - if save_syn1 and save_syn1neg and save_syn0_lockf: - return - if hasattr(self, 'syn1') and not save_syn1: - del self.syn1 - if hasattr(self, 'syn1neg') and not save_syn1neg: - del self.syn1neg - if hasattr(self, 'syn0_lockf') and not save_syn0_lockf: - del self.syn0_lockf - self.model_trimmed_post_training = True - - def delete_temporary_training_data(self, replace_word_vectors_with_normalized=False): - """ - Discard parameters that are used in training and score. Use if you're sure you're done training a model. - If `replace_word_vectors_with_normalized` is set, forget the original vectors and only keep the normalized - ones = saves lots of memory! - """ - if replace_word_vectors_with_normalized: - self.init_sims(replace=True) - self._minimize_model() - - def save(self, *args, **kwargs): - # don't bother storing the cached normalized vectors, recalculable table - kwargs['ignore'] = kwargs.get('ignore', ['syn0norm', 'table', 'cum_table']) - - super(Word2Vec, self).save(*args, **kwargs) - - save.__doc__ = SaveLoad.save.__doc__ - - @classmethod - def load(cls, *args, **kwargs): - model = super(Word2Vec, cls).load(*args, **kwargs) - # update older models - if hasattr(model, 'table'): - delattr(model, 'table') # discard in favor of cum_table - if model.negative and hasattr(model.wv, 'index2word'): - model.make_cum_table() # rebuild cum_table from vocabulary - if not hasattr(model, 'corpus_count'): - model.corpus_count = None - if not hasattr(model, 'corpus_total_words'): - model.corpus_total_words = None - for v in model.wv.vocab.values(): - if hasattr(v, 'sample_int'): - break # already 0.12.0+ style int probabilities - elif hasattr(v, 'sample_probability'): - v.sample_int = int(round(v.sample_probability * 2**32)) - del v.sample_probability - if not hasattr(model, 'syn0_lockf') and hasattr(model, 'syn0'): - model.syn0_lockf = ones(len(model.wv.syn0), dtype=REAL) - if not hasattr(model, 'random'): - model.random = random.RandomState(model.seed) - if not hasattr(model, 'train_count'): - model.train_count = 0 - model.total_train_time = 0 - return model - - def _load_specials(self, *args, **kwargs): - super(Word2Vec, self)._load_specials(*args, **kwargs) - # loading from a pre-KeyedVectors word2vec model - if not hasattr(self, 'wv'): - wv = KeyedVectors() - wv.syn0 = self.__dict__.get('syn0', []) - wv.syn0norm = self.__dict__.get('syn0norm', None) - wv.vocab = self.__dict__.get('vocab', {}) - wv.index2word = self.__dict__.get('index2word', []) - self.wv = wv - - @classmethod - def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', unicode_errors='strict', - limit=None, datatype=REAL): - """Deprecated. Use gensim.models.KeyedVectors.load_word2vec_format instead.""" - raise DeprecationWarning("Deprecated. 
Use gensim.models.KeyedVectors.load_word2vec_format instead.") - - def save_word2vec_format(self, fname, fvocab=None, binary=False): - """Deprecated. Use model.wv.save_word2vec_format instead.""" - raise DeprecationWarning("Deprecated. Use model.wv.save_word2vec_format instead.") - - def get_latest_training_loss(self): - return self.running_training_loss - - -class BrownCorpus(object): - """Iterate over sentences from the Brown corpus (part of NLTK data).""" - - def __init__(self, dirname): - self.dirname = dirname - - def __iter__(self): - for fname in os.listdir(self.dirname): - fname = os.path.join(self.dirname, fname) - if not os.path.isfile(fname): - continue - with utils.open(fname, 'rb') as fin: - for line in fin: - line = utils.to_unicode(line) - # each file line is a single sentence in the Brown corpus - # each token is WORD/POS_TAG - token_tags = [t.split('/') for t in line.split() if len(t.split('/')) == 2] - # ignore words with non-alphabetic tags like ",", "!" etc (punctuation, weird stuff) - words = ["%s/%s" % (token.lower(), tag[:2]) for token, tag in token_tags if tag[:2].isalpha()] - if not words: # don't bother sending out empty sentences - continue - yield words - - -class Text8Corpus(object): - """Iterate over sentences from the "text8" corpus, unzipped from http://mattmahoney.net/dc/text8.zip .""" - - def __init__(self, fname, max_sentence_length=MAX_WORDS_IN_BATCH): - self.fname = fname - self.max_sentence_length = max_sentence_length - - def __iter__(self): - # the entire corpus is one gigantic line -- there are no sentence marks at all - # so just split the sequence of tokens arbitrarily: 1 sentence = 1000 tokens - sentence, rest = [], b'' - with utils.open(self.fname, 'rb') as fin: - while True: - text = rest + fin.read(8192) # avoid loading the entire file (=1 line) into RAM - if text == rest: # EOF - words = utils.to_unicode(text).split() - sentence.extend(words) # return the last chunk of words, too (may be shorter/longer) - if sentence: - yield sentence - break - last_token = text.rfind(b' ') # last token may have been split in two... keep for next iteration - words, rest = (utils.to_unicode(text[:last_token]).split(), - text[last_token:].strip()) if last_token >= 0 else ([], text) - sentence.extend(words) - while len(sentence) >= self.max_sentence_length: - yield sentence[:self.max_sentence_length] - sentence = sentence[self.max_sentence_length:] - - -class LineSentence(object): - """ - Simple format: one sentence = one line; words already preprocessed and separated by whitespace. - """ - - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ - `source` can be either a string or a file object. Clip the file to the first - `limit` lines (or not clipped if limit is None, the default). 
- - Example:: - - sentences = LineSentence('myfile.txt') - - Or for compressed files:: - - sentences = LineSentence('compressed_text.txt.bz2') - sentences = LineSentence('compressed_text.txt.gz') - - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - def __iter__(self): - """Iterate through the lines in the source.""" - try: - # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for line in itertools.islice(self.source, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - - -class PathLineSentences(object): - """ - - Works like word2vec.LineSentence, but will process all files in a directory in alphabetical order by filename. - The directory can only contain files that can be read by LineSentence: .bz2, .gz, and text files. - Any file not ending with .bz2 or .gz is assumed to be a text file. Does not work with subdirectories. - - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. - - """ - - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """ - `source` should be a path to a directory (as a string) where all files can be opened by the - LineSentence class. Each file will be read up to `limit` lines (or not clipped if limit is None, the default). - - Example:: - - sentences = PathLineSentences(os.getcwd() + '\\corpus\\') - - The files in the directory should be either text files, .bz2 files, or .gz files. 
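        For instance, a minimal end-to-end sketch (the directory name is illustrative, and the
        legacy `size`/`iter` parameter spellings match the rest of this module)::

            from gensim.models.word2vec import Word2Vec, PathLineSentences

            sentences = PathLineSentences('corpus_dir/')   # streams files in filename order
            model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4, iter=5)
            model.wv.save_word2vec_format('corpus_vectors.txt', binary=False)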
- - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - if os.path.isfile(self.source): - logger.debug('single file given as source, rather than a directory of files') - logger.debug('consider using models.word2vec.LineSentence for a single file') - self.input_files = [self.source] # force code compatibility with list of files - elif os.path.isdir(self.source): - self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logger.info('reading directory %s', self.source) - self.input_files = os.listdir(self.source) - self.input_files = [self.source + filename for filename in self.input_files] # make full paths - self.input_files.sort() # makes sure it happens in filename order - else: # not a file or a directory, then we can't do anything with it - raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) - - def __iter__(self): - """iterate through the files""" - for file_name in self.input_files: - logger.info('reading file %s', file_name) - with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i:i + self.max_sentence_length] - i += self.max_sentence_length - - -# Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ -# -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 -if __name__ == "__main__": - import argparse - logging.basicConfig( - format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s', - level=logging.INFO - ) - logger.info("running %s", " ".join(sys.argv)) - - # check and process cmdline input - program = os.path.basename(sys.argv[0]) - if len(sys.argv) < 2: - print(globals()['__doc__'] % locals()) - sys.exit(1) - - from gensim.models.word2vec import Word2Vec # noqa:F811 avoid referencing __main__ in pickle - - seterr(all='raise') # don't ignore numpy errors - - parser = argparse.ArgumentParser() - parser.add_argument("-train", help="Use text data from file TRAIN to train the model", required=True) - parser.add_argument("-output", help="Use file OUTPUT to save the resulting word vectors") - parser.add_argument("-window", help="Set max skip length WINDOW between words; default is 5", type=int, default=5) - parser.add_argument("-size", help="Set size of word vectors; default is 100", type=int, default=100) - parser.add_argument( - "-sample", - help="Set threshold for occurrence of words. 
" - "Those that appear with higher frequency in the training data will be randomly down-sampled;" - " default is 1e-3, useful range is (0, 1e-5)", - type=float, default=1e-3 - ) - parser.add_argument( - "-hs", help="Use Hierarchical Softmax; default is 0 (not used)", - type=int, default=0, choices=[0, 1] - ) - parser.add_argument( - "-negative", help="Number of negative examples; default is 5, common values are 3 - 10 (0 = not used)", - type=int, default=5 - ) - parser.add_argument("-threads", help="Use THREADS threads (default 12)", type=int, default=12) - parser.add_argument("-iter", help="Run more training iterations (default 5)", type=int, default=5) - parser.add_argument( - "-min_count", help="This will discard words that appear less than MIN_COUNT times; default is 5", - type=int, default=5 - ) - parser.add_argument( - "-cbow", help="Use the continuous bag of words model; default is 1 (use 0 for skip-gram model)", - type=int, default=1, choices=[0, 1] - ) - parser.add_argument( - "-binary", help="Save the resulting vectors in binary mode; default is 0 (off)", - type=int, default=0, choices=[0, 1] - ) - parser.add_argument("-accuracy", help="Use questions from file ACCURACY to evaluate the model") - - args = parser.parse_args() - - if args.cbow == 0: - skipgram = 1 - else: - skipgram = 0 - - corpus = LineSentence(args.train) - - model = Word2Vec( - corpus, size=args.size, min_count=args.min_count, workers=args.threads, - window=args.window, sample=args.sample, sg=skipgram, hs=args.hs, - negative=args.negative, cbow_mean=1, iter=args.iter - ) - - if args.output: - outfile = args.output - model.wv.save_word2vec_format(outfile, binary=args.binary) - else: - outfile = args.train - model.save(outfile + '.model') - if args.binary == 1: - model.wv.save_word2vec_format(outfile + '.model.bin', binary=True) - else: - model.wv.save_word2vec_format(outfile + '.model.txt', binary=False) - - if args.accuracy: - model.accuracy(args.accuracy) - - logger.info("finished running %s", program) diff --git a/gensim/models/doc2vec.py b/gensim/models/doc2vec.py index 624875bf18..4a2a1761ac 100644 --- a/gensim/models/doc2vec.py +++ b/gensim/models/doc2vec.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Shiva Manne +# Author: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. 
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -69,7 +69,8 @@ except ImportError: from Queue import Queue # noqa:F401 -from collections import namedtuple, defaultdict, Iterable +from collections import namedtuple, defaultdict +from collections.abc import Iterable from timeit import default_timer from dataclasses import dataclass @@ -77,14 +78,11 @@ memmap as np_memmap, vstack, integer, dtype import numpy as np -from gensim.utils import call_on_class_only from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc -from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables +from gensim.models import Word2Vec from six.moves import range from six import string_types, integer_types, itervalues -from gensim.models.base_any2vec import BaseWordEmbeddingsModel from gensim.models.keyedvectors import KeyedVectors, ConcatList, pseudorandom_weak_vector -from types import GeneratorType logger = logging.getLogger(__name__) @@ -170,10 +168,10 @@ def count(self, new_val): Doctag = DoctagVocab -class Doc2Vec(BaseWordEmbeddingsModel): - def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, +class Doc2Vec(Word2Vec): + def __init__(self, documents=None, corpus_file=None, vector_size=100, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, docvecs_mapfile=None, comment=None, trim_rule=None, callbacks=(), - **kwargs): + window=5, epochs=10, **kwargs): """Class for training, using and evaluating neural networks described in `Distributed Representations of Sentences and Documents `_. @@ -219,7 +217,7 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo workers : int, optional Use these many worker threads to train the model (=faster training with multicore machines). epochs : int, optional - Number of iterations (epochs) over the corpus. + Number of iterations (epochs) over the corpus. Defaults to 10 for Doc2Vec. hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. @@ -280,28 +278,8 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo .. sourcecode:: pycon >>> model.docvecs['doc003'] - - vocabulary : :class:`~gensim.models.doc2vec.Doc2VecVocab` - This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. - Besides keeping track of all unique words, this object provides extra functionality, such as - sorting words by frequency, or discarding extremely rare words. - - trainables : :class:`~gensim.models.doc2vec.Doc2VecTrainables` - This object represents the inner shallow neural network used to train the embeddings. The semantics - of the network differ slightly in the two available training modes (CBOW or SG) but you can think - of it as a NN with a single projection and hidden layer which we train on the corpus. The weights are - then used as our embeddings. The only addition to the underlying NN used in - :class:`~gensim.models.word2vec.Word2Vec` is that the input includes not only the word vectors of - each word in the context, but also the paragraph vector. 
- """ - super(Doc2Vec, self).__init__( - sg=(1 + dm) % 2, - null_word=dm_concat, - callbacks=callbacks, - **kwargs) - - self.load = call_on_class_only + corpus_iterable = documents if dm_mean is not None: self.cbow_mean = dm_mean @@ -309,34 +287,23 @@ def __init__(self, documents=None, corpus_file=None, dm_mean=None, dm=1, dbow_wo self.dbow_words = int(dbow_words) self.dm_concat = int(dm_concat) self.dm_tag_count = int(dm_tag_count) + if dm and dm_concat: + self.layer1_size = (dm_tag_count + (2 * window)) * vector_size + logger.info("using concatenative %d-dimensional layer1", self.layer1_size) - kwargs['null_word'] = dm_concat - vocabulary_keys = ['max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'ns_exponent'] - vocabulary_kwargs = dict((k, kwargs[k]) for k in vocabulary_keys if k in kwargs) - self.vocabulary = Doc2VecVocab(**vocabulary_kwargs) - - trainables_keys = ['seed', 'hashfxn', 'window'] - trainables_kwargs = dict((k, kwargs[k]) for k in trainables_keys if k in kwargs) - self.trainables = Doc2VecTrainables( - dm=dm, dm_concat=dm_concat, dm_tag_count=dm_tag_count, - vector_size=self.vector_size, **trainables_kwargs) - - self.wv = KeyedVectors(self.vector_size) + self.vector_size = vector_size self.docvecs = docvecs or KeyedVectors(self.vector_size, mapfile_path=docvecs_mapfile) - self.comment = comment - - if documents is not None or corpus_file is not None: - self._check_input_data_sanity(data_iterable=documents, corpus_file=corpus_file) - if corpus_file is not None and not isinstance(corpus_file, string_types): - raise TypeError("You must pass string as the corpus_file argument.") - elif isinstance(documents, GeneratorType): - raise TypeError("You can't pass a generator as the documents argument. Try a sequence.") - self.build_vocab(documents=documents, corpus_file=corpus_file, trim_rule=trim_rule) - self.train( - documents=documents, corpus_file=corpus_file, total_examples=self.corpus_count, - total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, - end_alpha=self.min_alpha, callbacks=callbacks) + super(Doc2Vec, self).__init__( + sentences=corpus_iterable, + corpus_file=corpus_file, + vector_size=self.vector_size, + sg=(1 + dm) % 2, + null_word=self.dm_concat, + callbacks=callbacks, + window=window, + epochs=epochs, + **kwargs) @property def dm(self): @@ -354,9 +321,6 @@ def dbow(self): """ return self.sg # same as SG - def _set_train_params(self, **kwargs): - pass - def _clear_post_train(self): """Alias for :meth:`~gensim.models.doc2vec.Doc2Vec.clear_sims`.""" self.clear_sims() @@ -366,6 +330,18 @@ def clear_sims(self): self.wv.vectors_norm = None self.docvecs.vectors_norm = None + def reset_weights(self): + super(Doc2Vec, self).reset_weights() + self.docvecs.resize_vectors() + self.docvecs.randomly_initialize_vectors() + if self.docvecs.mapfile_path: + self.docvecs.vectors_lockf = np_memmap( + self.docvecs.mapfile_path + '.vectors_lockf', dtype=REAL, mode='w+', shape=(len(self.docvecs.vectors),) + ) + self.docvecs.vectors_lockf.fill(1.0) + else: + self.docvecs.vectors_lockf = ones((len(self.docvecs.vectors),), dtype=REAL) # zeros suppress learning + def reset_from(self, other_model): """Copy shareable data structures from another (possibly pre-trained) model. 
@@ -377,17 +353,17 @@ def reset_from(self, other_model): """ self.wv.vocab = other_model.wv.vocab self.wv.index2key = other_model.wv.index2key - self.vocabulary.cum_table = other_model.vocabulary.cum_table + self.cum_table = other_model.cum_table self.corpus_count = other_model.corpus_count self.docvecs.vocab = other_model.docvecs.vocab self.docvecs.index2key = other_model.docvecs.index2key - self.trainables.reset_weights(self.hs, self.negative, self.wv, self.docvecs) + self.reset_weights() def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, total_examples=None, total_words=None, offsets=None, start_doctags=None, **kwargs): work, neu1 = thread_private_mem doctag_vectors = self.docvecs.vectors - doctag_locks = self.trainables.vectors_docs_lockf + doctag_locks = self.docvecs.vectors_lockf offset = offsets[thread_id] start_doctag = start_doctags[thread_id] @@ -433,7 +409,7 @@ def _do_train_job(self, job, alpha, inits): for doc in job: doctag_indexes = [self.docvecs.get_index(tag) for tag in doc.tags if tag in self.docvecs] doctag_vectors = self.docvecs.vectors - doctag_locks = self.trainables.vectors_docs_lockf + doctag_locks = self.docvecs.vectors_lockf if self.sg: tally += train_document_dbow( self, doc.words, doctag_indexes, alpha, work, train_words=self.dbow_words, @@ -451,9 +427,10 @@ def _do_train_job(self, job, alpha, inits): ) return tally, self._raw_word_count(job) - def train(self, documents=None, corpus_file=None, total_examples=None, total_words=None, + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, - word_count=0, queue_factor=2, report_delay=1.0, callbacks=()): + word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), + **kwargs): """Update the model's neural weights. To support linear learning-rate decay from (initial) `alpha` to `min_alpha`, and accurate @@ -469,7 +446,7 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor Parameters ---------- - documents : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional + corpus_iterable : iterable of list of :class:`~gensim.models.doc2vec.TaggedDocument`, optional Can be simply a list of elements, but for larger corpora,consider an iterable that streams the documents directly from disk/network. If you don't supply `documents` (or `corpus_file`), the model is left uninitialized -- use if you plan to initialize it in some other way. @@ -506,19 +483,17 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor List of callbacks that need to be executed/run at specific stages during training. 
""" - kwargs = {} - - if corpus_file is None and documents is None: + if corpus_file is None and corpus_iterable is None: raise TypeError("Either one of corpus_file or documents value must be provided") - if corpus_file is not None and documents is not None: - raise TypeError("Both corpus_file and documents must not be provided at the same time") + if corpus_file is not None and corpus_iterable is not None: + raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") - if documents is None and not os.path.isfile(corpus_file): + if corpus_iterable is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - if documents is not None and not isinstance(documents, Iterable): - raise TypeError("documents must be an iterable of list, got %r instead" % documents) + if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): + raise TypeError("corpus_iterable must be an iterable of TaggedDocument, got %r instead" % corpus_iterable) if corpus_file is not None: # Calculate offsets for each worker along with initial doctags (doctag ~ document/line number in a file) @@ -527,7 +502,8 @@ def train(self, documents=None, corpus_file=None, total_examples=None, total_wor kwargs['start_doctags'] = start_doctags super(Doc2Vec, self).train( - sentences=documents, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, + total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks, **kwargs) @@ -642,9 +618,9 @@ def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps doctag_locks = np.ones(1, dtype=REAL) doctag_indexes = [0] - work = zeros(self.trainables.layer1_size, dtype=REAL) + work = zeros(self.layer1_size, dtype=REAL) if not self.sg: - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) alpha_delta = (alpha - min_alpha) / max(epochs - 1, 1) @@ -721,10 +697,10 @@ def __str__(self): segments.append('hs') if not self.sg or (self.sg and self.dbow_words): segments.append('w%d' % self.window) # window size, when relevant - if self.vocabulary.min_count > 1: - segments.append('mc%d' % self.vocabulary.min_count) - if self.vocabulary.sample > 0: - segments.append('s%g' % self.vocabulary.sample) + if self.min_count > 1: + segments.append('mc%d' % self.min_count) + if self.sample > 0: + segments.append('s%g' % self.sample) if self.workers > 1: segments.append('t%d' % self.workers) return '%s(%s)' % (self.__class__.__name__, ','.join(segments)) @@ -788,9 +764,9 @@ def load(cls, *args, **kwargs): fname : str Path to the saved file. *args : object - Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + Additional arguments, see `~gensim.models.word2vec.Word2Vec.load`. **kwargs : object - Additional arguments, see `~gensim.models.base_any2vec.BaseWordEmbeddingsModel.load`. + Additional arguments, see `~gensim.models.word2vec.Word2Vec.load`. See Also -------- @@ -804,11 +780,13 @@ def load(cls, *args, **kwargs): """ try: - return super(Doc2Vec, cls).load(*args, **kwargs) - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. 
Re-loading old model in a compatible way.') - from gensim.models.deprecated.doc2vec import load_old_doc2vec - return load_old_doc2vec(*args, **kwargs) + return super(Doc2Vec, cls).load(*args, rethrow=True, **kwargs) + except AttributeError as ae: + logger.error( + "Model load error. Was model saved using code from an older Gensim Version? " + "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "compatibility with current code.") + raise ae def estimate_memory(self, vocab_size=None, report=None): """Estimate required memory for a model using current settings. @@ -834,8 +812,8 @@ def estimate_memory(self, vocab_size=None, report=None): report['doctag_syn0'] = len(self.docvecs) * self.vector_size * dtype(REAL).itemsize return super(Doc2Vec, self).estimate_memory(vocab_size, report=report) - def build_vocab(self, documents=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, **kwargs): + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of documents (can be a once-only generator stream). Parameters @@ -873,19 +851,16 @@ def build_vocab(self, documents=None, corpus_file=None, update=False, progress_p Additional key word arguments passed to the internal vocabulary construction. """ - total_words, corpus_count = self.vocabulary.scan_vocab( - documents=documents, corpus_file=corpus_file, docvecs=self.docvecs, + total_words, corpus_count = self.scan_vocab( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, docvecs=self.docvecs, progress_per=progress_per, trim_rule=trim_rule ) self.corpus_count = corpus_count self.corpus_total_words = total_words - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, - **kwargs) + report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, self.docvecs, update=update) + self.prepare_weights(update=update) def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): """Build vocabulary from a dictionary of word frequencies. @@ -930,80 +905,14 @@ def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=No # Since no documents are provided, this is to control the corpus_count self.corpus_count = corpus_count or 0 - self.vocabulary.raw_vocab = raw_vocab + self.raw_vocab = raw_vocab # trim by min_count & precalculate downsampling - report_values = self.vocabulary.prepare_vocab( - self.hs, self.negative, self.wv, keep_raw_vocab=keep_raw_vocab, - trim_rule=trim_rule, update=update) + report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) - self.trainables.prepare_weights( - self.hs, self.negative, self.wv, self.docvecs, update=update) - - def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=None, steps=None): - """Compute cosine similarity between two post-bulk out of training documents. - - Parameters - ---------- - model : :class:`~gensim.models.doc2vec.Doc2Vec` - An instance of a trained `Doc2Vec` model. 
- doc_words1 : list of str - Input document. - doc_words2 : list of str - Input document. - alpha : float, optional - The initial learning rate. - min_alpha : float, optional - Learning rate will linearly drop to `min_alpha` as training progresses. - steps : int, optional - Number of epoch to train the new document. - - Returns - ------- - float - The cosine similarity between `doc_words1` and `doc_words2`. - - """ - d1 = self.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) - d2 = self.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) - return np.dot(matutils.unitvec(d1), matutils.unitvec(d2)) - + self.prepare_weights(update=update) -class Doc2VecVocab(Word2VecVocab): - def __init__(self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, ns_exponent=0.75): - """Vocabulary used by :class:`~gensim.models.doc2vec.Doc2Vec`. - - This includes a mapping from words found in the corpus to their total frequency count. - - Parameters - ---------- - max_vocab_size : int, optional - Maximum number of words in the Vocabulary. Used to limit the RAM during vocabulary building; - if there are more unique words than this, then prune the infrequent ones. - Every 10 million word types need about 1GB of RAM, set to `None` for no limit. - min_count : int - Words with frequency lower than this limit will be discarded from the vocabulary. - sample : float, optional - The threshold for configuring which higher-frequency words are randomly downsampled, - useful range is (0, 1e-5). - sorted_vocab : bool - If True, sort the vocabulary by descending frequency before assigning word indexes. - null_word : {0, 1} - If True, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words). - This word is only ever input – never predicted – so count, huffman-point, etc doesn't matter. - ns_exponent : float, optional - The exponent used to shape the negative sampling distribution. A value of 1.0 samples exactly in proportion - to the frequencies, 0.0 samples all words equally, while a negative value samples low-frequency words more - than high-frequency words. The popular default value of 0.75 was chosen by the original Word2Vec paper. - More recently, in https://arxiv.org/abs/1804.04212, Caselles-Dupré, Lesaint, & Royo-Letelier suggest that - other values may perform better for recommendation applications. 
- - """ - super(Doc2VecVocab, self).__init__( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=sorted_vocab, null_word=null_word, ns_exponent=ns_exponent) - - def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): + def _scan_vocab(self, corpus_iterable, progress_per, trim_rule): document_no = -1 total_words = 0 min_reduce = 1 @@ -1014,7 +923,7 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): max_rawint = -1 # highest raw int tag seen (-1 for none) doctags_lookup = {} doctags_list = [] - for document_no, document in enumerate(documents): + for document_no, document in enumerate(corpus_iterable): if not checked_string_types: if isinstance(document.words, string_types): logger.warning( @@ -1027,7 +936,7 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): interval_rate = (total_words - interval_count) / (default_timer() - interval_start) logger.info( "PROGRESS: at example #%i, processed %i words (%i/s), %i word types, %i tags", - document_no, total_words, interval_rate, len(vocab), len(docvecs) + document_no, total_words, interval_rate, len(vocab), len(doctags_list) ) interval_start = default_timer() interval_count = total_words @@ -1053,21 +962,26 @@ def _scan_vocab(self, documents, docvecs, progress_per, trim_rule): utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) min_reduce += 1 + corpus_count = document_no + 1 + if len(doctags_list) > corpus_count: + logger.warning("More unique tags (%i) than documents (%i).", len(doctags_list), corpus_count) + if max_rawint > corpus_count: + logger.warning( + "Highest int doctag (%i) larger than count of documents (%i). This means " + "at least %i excess, unused slots (%i bytes) will be allocated for vectors.", + max_rawint, corpus_count, ((max_rawint - corpus_count) * self.vector_size * 4)) if max_rawint > -1: # adjust indexes/list to account for range of pure-int keyed doctags for key in doctags_list: doctags_lookup[key].index = doctags_lookup[key].index + max_rawint + 1 doctags_list = ConcatList([range(0, max_rawint + 1), doctags_list]) - docvecs.vocab = doctags_lookup - docvecs.index2key = doctags_list - corpus_count = document_no + 1 - if len(doctags_list) > corpus_count: - logger.warn("More unique tags (%i) than documents (%i).", len(doctags_list), corpus_count) + self.docvecs.map = doctags_lookup + self.docvecs.index2key = doctags_list self.raw_vocab = vocab return total_words, corpus_count - def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_per=10000, trim_rule=None): + def scan_vocab(self, corpus_iterable=None, corpus_file=None, docvecs=None, progress_per=10000, trim_rule=None): """Create the models Vocabulary: A mapping from unique words in the corpus to their frequency count. 
Parameters @@ -1104,49 +1018,54 @@ def scan_vocab(self, documents=None, corpus_file=None, docvecs=None, progress_pe """ logger.info("collecting all words and their counts") if corpus_file is not None: - documents = TaggedLineDocument(corpus_file) + corpus_iterable = TaggedLineDocument(corpus_file) - total_words, corpus_count = self._scan_vocab(documents, docvecs, progress_per, trim_rule) + total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) logger.info( "collected %i word types and %i unique tags from a corpus of %i examples and %i words", - len(self.raw_vocab), len(docvecs), corpus_count, total_words + len(self.raw_vocab), len(self.docvecs), corpus_count, total_words ) return total_words, corpus_count + def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=None, steps=None): + """Compute cosine similarity between two post-bulk out of training documents. -class Doc2VecTrainables(Word2VecTrainables): - def __init__(self, dm=1, dm_concat=0, dm_tag_count=1, vector_size=100, seed=1, hashfxn=hash, window=5): - """Represents the inner shallow neural network used to train :class:`~gensim.models.doc2vec.Doc2Vec`.""" - super(Doc2VecTrainables, self).__init__( - vector_size=vector_size, seed=seed, hashfxn=hashfxn) - if dm and dm_concat: - self.layer1_size = (dm_tag_count + (2 * window)) * vector_size - logger.info("using concatenative %d-dimensional layer1", self.layer1_size) + Parameters + ---------- + model : :class:`~gensim.models.doc2vec.Doc2Vec` + An instance of a trained `Doc2Vec` model. + doc_words1 : list of str + Input document. + doc_words2 : list of str + Input document. + alpha : float, optional + The initial learning rate. + min_alpha : float, optional + Learning rate will linearly drop to `min_alpha` as training progresses. + steps : int, optional + Number of epoch to train the new document. - def prepare_weights(self, hs, negative, wv, docvecs, update=False): - """Build tables and model weights based on final vocabulary settings.""" - # set initial input/projection and hidden weights - if not update: - self.reset_weights(hs, negative, wv, docvecs) - else: - self.update_weights(hs, negative, wv) - - def reset_weights(self, hs, negative, wv, docvecs, vocabulary=None): - super(Doc2VecTrainables, self).reset_weights(hs, negative, wv) - self.reset_doc_weights(docvecs) - - def reset_doc_weights(self, docvecs): - docvecs.resize_vectors() - docvecs.randomly_initialize_vectors() - if docvecs.mapfile_path: - self.vectors_docs_lockf = np_memmap( - docvecs.mapfile_path + '.vectors_docs_lockf', dtype=REAL, mode='w+', shape=(len(docvecs.vectors),) - ) - self.vectors_docs_lockf.fill(1.0) - else: - self.vectors_docs_lockf = ones((len(docvecs.vectors),), dtype=REAL) # zeros suppress learning + Returns + ------- + float + The cosine similarity between `doc_words1` and `doc_words2`. 
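A minimal usage sketch for `similarity_unseen_docs` as it appears in this patch, assuming a trained Doc2Vec `model` (the token lists and `steps` value are illustrative only):

.. sourcecode:: pycon

    >>> words1 = ['machine', 'learning', 'is', 'fun']
    >>> words2 = ['deep', 'learning', 'is', 'fun']
    >>> model.similarity_unseen_docs(words1, words2, steps=20)  # infers a vector per list, then cosine similarity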
+ + """ + d1 = self.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, steps=steps) + d2 = self.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, steps=steps) + return np.dot(matutils.unitvec(d1), matutils.unitvec(d2)) + + +class Doc2VecVocab(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass + + +class Doc2VecTrainables(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass class TaggedBrownCorpus(object): diff --git a/gensim/models/doc2vec_inner.pyx b/gensim/models/doc2vec_inner.pyx index 8d9ca4862f..e06aa00a35 100644 --- a/gensim/models/doc2vec_inner.pyx +++ b/gensim/models/doc2vec_inner.pyx @@ -225,14 +225,14 @@ cdef init_d2v_config(Doc2VecConfig *c, model, alpha, learn_doctags, learn_words, doctag_locks=None, docvecs_count=0): c[0].hs = model.hs c[0].negative = model.negative - c[0].sample = (model.vocabulary.sample != 0) + c[0].sample = (model.sample != 0) c[0].cbow_mean = model.cbow_mean c[0].train_words = train_words c[0].learn_doctags = learn_doctags c[0].learn_words = learn_words c[0].learn_hidden = learn_hidden c[0].alpha = alpha - c[0].layer1_size = model.trainables.layer1_size + c[0].layer1_size = model.layer1_size c[0].vector_size = model.docvecs.vector_size c[0].workers = model.workers c[0].docvecs_count = docvecs_count @@ -251,28 +251,28 @@ cdef init_d2v_config(Doc2VecConfig *c, model, alpha, learn_doctags, learn_words, doctag_vectors = model.docvecs.vectors_docs c[0].doctag_vectors = (np.PyArray_DATA(doctag_vectors)) if word_locks is None: - word_locks = model.trainables.vectors_lockf + word_locks = model.wv.vectors_lockf c[0].word_locks = (np.PyArray_DATA(word_locks)) if doctag_locks is None: - doctag_locks = model.trainables.vectors_docs_lockf + doctag_locks = model.docvecs.vectors_lockf c[0].doctag_locks = (np.PyArray_DATA(doctag_locks)) if c[0].hs: - c[0].syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c[0].syn1 = (np.PyArray_DATA(model.syn1)) if c[0].negative: - c[0].syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c[0].cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c[0].cum_table_len = len(model.vocabulary.cum_table) + c[0].syn1neg = (np.PyArray_DATA(model.syn1neg)) + c[0].cum_table = (np.PyArray_DATA(model.cum_table)) + c[0].cum_table_len = len(model.cum_table) if c[0].negative or c[0].sample: c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) # convert Python structures to primitive types, so we can release the GIL if work is None: - work = zeros(model.trainables.layer1_size, dtype=REAL) + work = zeros(model.layer1_size, dtype=REAL) c[0].work = np.PyArray_DATA(work) if neu1 is None: - neu1 = zeros(model.trainables.layer1_size, dtype=REAL) + neu1 = zeros(model.layer1_size, dtype=REAL) c[0].neu1 = np.PyArray_DATA(neu1) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 7f0c482362..dd299ec964 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Authors: Shiva Manne , Chinmaya Pancholi +# Authors: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. 
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -37,7 +37,7 @@ ['human', 'interface', 'computer'] >>> print(len(common_texts)) 9 - >>> model = FastText(size=4, window=3, min_count=1) # instantiate + >>> model = FastText(vector_size=4, window=3, min_count=1) # instantiate >>> model.build_vocab(sentences=common_texts) >>> model.train(sentences=common_texts, total_examples=len(common_texts), epochs=10) # train @@ -50,7 +50,7 @@ .. sourcecode:: pycon - >>> model2 = FastText(size=4, window=3, min_count=1, sentences=common_texts, iter=10) + >>> model2 = FastText(vector_size=4, window=3, min_count=1, sentences=common_texts, iter=10) .. Important:: This style of initialize-and-train in a single line is **deprecated**. We include it here @@ -84,7 +84,7 @@ >>> from gensim.test.utils import datapath >>> >>> corpus_file = datapath('lee_background.cor') # absolute path to corpus - >>> model3 = FastText(size=4, window=3, min_count=1) + >>> model3 = FastText(vector_size=4, window=3, min_count=1) >>> model3.build_vocab(corpus_file=corpus_file) # scan over corpus to build the vocabulary >>> >>> total_words = model3.corpus_total_words # number of words in the corpus @@ -116,7 +116,7 @@ ... yield list(tokenize(line)) >>> >>> - >>> model4 = FastText(size=4, window=3, min_count=1) + >>> model4 = FastText(vector_size=4, window=3, min_count=1) >>> model4.build_vocab(sentences=MyIter()) >>> total_examples = model4.corpus_count >>> model4.train(sentences=MyIter(), total_examples=total_examples, epochs=5) @@ -258,10 +258,7 @@ - :mod:`gensim.models.fasttext`: This module. Contains FastText-specific functionality only. - :mod:`gensim.models.keyedvectors`: Implements generic functionality. -- :mod:`gensim.models.word2vec`: Contains implementations for the vocabulary - and the trainables for FastText. -- :mod:`gensim.models.base_any2vec`: Contains implementations for the base. - classes, including functionality such as callbacks, logging. +- :mod:`gensim.models.word2vec`: Provides much of the basic scan & train framework. - :mod:`gensim.utils`: Implements model I/O (loading and saving). Our implementation relies heavily on inheritance. 
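As a quick orientation to the flattened attribute layout this patch introduces: values that previously lived on the intermediate `model.vocabulary` and `model.trainables` objects are now reached directly on the model or on its `wv`, mirroring the substitutions made throughout the hunks below. A sketch, assuming a trained FastText `model` with negative sampling enabled:

.. sourcecode:: pycon

    >>> model.min_count, model.sample        # formerly model.vocabulary.min_count / model.vocabulary.sample
    >>> model.syn1neg.shape                  # formerly model.trainables.syn1neg
    >>> model.wv.vectors_ngrams_lockf[:5]    # formerly model.trainables.vectors_ngrams_lockf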
@@ -288,9 +285,8 @@ import gensim.models._fasttext_bin -from gensim.models.word2vec import Word2VecVocab, Word2VecTrainables +from gensim.models.word2vec import Word2Vec from gensim.models.keyedvectors import KeyedVectors -from gensim.models.base_any2vec import BaseWordEmbeddingsModel from gensim.utils import deprecated, call_on_class_only, open, NO_CYTHON @@ -312,10 +308,11 @@ raise NO_CYTHON -class FastText(BaseWordEmbeddingsModel): - def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, +class FastText(Word2Vec): + def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, vector_size=100, alpha=0.025, + window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, max_n=6, + negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=MAX_WORDS_IN_BATCH, callbacks=(), compatible_hash=True): """Train, use and evaluate word representations learned using the method @@ -470,27 +467,62 @@ def __init__(self, sentences=None, corpus_file=None, sg=0, hs=0, size=100, alpha if self.word_ngrams <= 1 and max_n == 0: bucket = 0 - self.wv = FastTextKeyedVectors(size, min_n, max_n, bucket, compatible_hash) - self.vocabulary = FastTextVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, - sorted_vocab=bool(sorted_vocab), null_word=null_word, ns_exponent=ns_exponent) - self.trainables = FastTextTrainables(vector_size=size, seed=seed, bucket=bucket, hashfxn=hashfxn) - self.trainables.prepare_weights(hs, negative, self.wv, update=False, vocabulary=self.vocabulary) - self.wv.bucket = self.trainables.bucket + self.wv = FastTextKeyedVectors(vector_size, min_n, max_n, bucket, compatible_hash) + self.bucket = bucket + self.wv.bucket = bucket super(FastText, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, + sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=vector_size, epochs=epochs, callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, + max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=sorted_vocab, + null_word=null_word, ns_exponent=ns_exponent, hashfxn=hashfxn, seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha) - def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_per=10000, keep_raw_vocab=False, - trim_rule=None, **kwargs): + def prepare_weights(self, update=False): + """In addition to superclass allocations, compute ngrams of all words present in vocabulary. + + Parameters + ---------- + update : bool + If True, the new vocab words and their new ngrams word vectors are initialized + with random uniform distribution and updated/added to the existing vocab word and ngram vectors. 
+ """ + super(FastText, self).prepare_weights(update=update) + if not update: + self.wv.init_ngrams_weights(self.seed) + self.wv.vectors_vocab_lockf = ones(len(self.wv.vectors_vocab), dtype=REAL) + self.wv.vectors_ngrams_lockf = ones(len(self.wv.vectors_ngrams), dtype=REAL) + else: + self.wv.update_ngrams_weights(self.seed, self.old_vocab_len) + self.wv.vectors_vocab_lockf = _pad_ones(self.wv.vectors_vocab_lockf, len(self.wv.vectors_vocab)) + self.wv.vectors_ngrams_lockf = _pad_ones(self.wv.vectors_ngrams_lockf, len(self.wv.vectors_ngrams)) + + def init_post_load(self, hidden_output): + num_vectors = len(self.wv.vectors) + vocab_size = len(self.wv.vocab) + vector_size = self.wv.vector_size + + assert num_vectors > 0, 'expected num_vectors to be initialized already' + assert vocab_size > 0, 'expected vocab_size to be initialized already' + + self.wv.vectors_ngrams_lockf = ones(len(self.wv.vectors_ngrams), dtype=REAL) + self.wv.vectors_vocab_lockf = ones(len(self.wv.vectors_vocab.shape), dtype=REAL) + + if self.hs: + self.syn1 = hidden_output + if self.negative: + self.syn1neg = hidden_output + + self.layer1_size = vector_size + + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): """Build vocabulary from a sequence of sentences (can be a once-only generator stream). Each sentence must be a list of unicode strings. Parameters ---------- - sentences : iterable of list of str, optional + corpus_iterable : iterable of list of str, optional Can be simply a list of lists of tokens, but for larger corpora, consider an iterable that streams the sentences directly from disk/network. See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` @@ -521,7 +553,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p **kwargs Additional key word parameters passed to - :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.build_vocab`. + :meth:`~gensim.models.word2vec.Word2Vec.build_vocab`. Examples -------- @@ -542,7 +574,7 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p """ if not update: - self.wv.init_ngrams_weights(self.trainables.seed) + self.wv.init_ngrams_weights(self.seed) elif not len(self.wv.vocab): raise RuntimeError( "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " @@ -551,43 +583,30 @@ def build_vocab(self, sentences=None, corpus_file=None, update=False, progress_p "before doing an online update." ) else: - self.vocabulary.old_vocab_len = len(self.wv.vocab) + self.old_vocab_len = len(self.wv.vocab) retval = super(FastText, self).build_vocab( - sentences=sentences, corpus_file=corpus_file, update=update, progress_per=progress_per, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, update=update, progress_per=progress_per, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) if update: - self.wv.update_ngrams_weights(self.trainables.seed, self.vocabulary.old_vocab_len) + self.wv.update_ngrams_weights(self.seed, self.old_vocab_len) return retval - def _set_train_params(self, **kwargs): - # - # We need the wv.buckets_word member to be initialized in order to - # continue training. The _clear_post_train method destroys this - # variable, so we reinitialize it here, if needed. - # - # The .old_vocab_len member is set only to keep the init_ngrams_weights method happy. 
- # - if self.wv.buckets_word is None: - self.vocabulary.old_vocab_len = len(self.wv.vocab) - self.trainables.init_ngrams_weights(self.wv, update=True, vocabulary=self.vocabulary) - def _clear_post_train(self): """Clear the model's internal structures after training has finished to free up RAM.""" self.wv.vectors_norm = None - self.wv.buckets_word = None self.wv.adjust_vectors() # ensure composite-word vecs reflect latest training def estimate_memory(self, vocab_size=None, report=None): vocab_size = vocab_size or len(self.wv.vocab) vec_size = self.vector_size * np.dtype(np.float32).itemsize - l1_size = self.trainables.layer1_size * np.dtype(np.float32).itemsize + l1_size = self.layer1_size * np.dtype(np.float32).itemsize report = report or {} report['vocab'] = len(self.wv.vocab) * (700 if self.hs else 500) report['syn0_vocab'] = len(self.wv.vocab) * vec_size - num_buckets = self.trainables.bucket + num_buckets = self.bucket if self.hs: report['syn1'] = len(self.wv.vocab) * l1_size if self.negative: @@ -595,7 +614,7 @@ def estimate_memory(self, vocab_size=None, report=None): if self.word_ngrams > 0 and self.wv.vocab: num_buckets = num_ngrams = 0 - if self.trainables.bucket: + if self.bucket: buckets = set() num_ngrams = 0 for word in self.wv.vocab: @@ -603,7 +622,7 @@ def estimate_memory(self, vocab_size=None, report=None): word, self.wv.min_n, self.wv.max_n, - self.trainables.bucket, + self.bucket, self.wv.compatible_hash ) num_ngrams += len(hashes) @@ -669,7 +688,7 @@ def _do_train_job(self, sentences, alpha, inits): return tally, self._raw_word_count(sentences) - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, queue_factor=2, report_delay=1.0, callbacks=(), **kwargs): """Update the model's neural weights from a sequence of sentences (can be a once-only generator stream). 
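Under the renamed keyword arguments in this patch, the two-step FastText flow looks roughly as follows (a sketch reusing the `common_texts` toy corpus from the examples above; parameter values are illustrative):

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts
    >>> from gensim.models import FastText
    >>>
    >>> model = FastText(vector_size=4, window=3, min_count=1)
    >>> model.build_vocab(corpus_iterable=common_texts)
    >>> model.train(corpus_iterable=common_texts, total_examples=len(common_texts), epochs=10)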
@@ -736,20 +755,26 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor """ - if corpus_file is None and sentences is None: - raise TypeError("Either one of corpus_file or sentences value must be provided") + if corpus_file is None and corpus_iterable is None: + raise TypeError("Either one of corpus_file or corpus_iterable value must be provided") - if corpus_file is not None and sentences is not None: - raise TypeError("Both corpus_file and sentences must not be provided at the same time") + if corpus_file is not None and corpus_iterable is not None: + raise TypeError("Both corpus_file and corpus_iterable must not be provided at the same time") - if sentences is None and not os.path.isfile(corpus_file): + if corpus_iterable is None and not os.path.isfile(corpus_file): raise TypeError("Parameter corpus_file must be a valid path to a file, got %r instead" % corpus_file) - if sentences is not None and not isinstance(sentences, Iterable): - raise TypeError("sentences must be an iterable of list, got %r instead" % sentences) + if corpus_iterable is not None and not isinstance(corpus_iterable, Iterable): + raise TypeError("sentences must be an iterable of list, got %r instead" % corpus_iterable) + + if self.wv.buckets_word is None: + logger.warn("self.wv.buckets_word was None; fixing.") + self.old_vocab_len = len(self.wv.vocab) + self.wv.init_ngrams_weights(seed=self.seed) super(FastText, self).train( - sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, + corpus_iterable=corpus_iterable, corpus_file=corpus_file, + total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, queue_factor=queue_factor, report_delay=report_delay, callbacks=callbacks) self.wv.adjust_vectors() @@ -767,8 +792,8 @@ def init_sims(self, replace=False): # init_sims() resides in KeyedVectors because it deals with input layer mainly, but because the # hidden layer is not an attribute of KeyedVectors, it has to be deleted in this class. # The normalizing of input layer happens inside of KeyedVectors. - if replace and hasattr(self.trainables, 'syn1'): - del self.trainables.syn1 + if replace and hasattr(self, 'syn1'): + del self.syn1 self.wv.init_sims(replace) def clear_sims(self): @@ -850,141 +875,36 @@ def load(cls, *args, **kwargs): Save :class:`~gensim.models.fasttext.FastText` model. """ - try: - model = super(FastText, cls).load(*args, **kwargs) - - if not hasattr(model.trainables, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): - model.trainables.vectors_vocab_lockf = ones(len(model.wv.vectors_vocab), dtype=REAL) - if not hasattr(model.trainables, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): - model.trainables.vectors_ngrams_lockf = ones(len(model.wv.vectors_ngrams), dtype=REAL) - - # fixup mistakenly overdimensioned gensim-3.x lockf arrays - if len(model.trainables.vectors_vocab_lockf.shape) > 1: - model.trainables.vectors_vocab_lockf = model.trainables.vectors_vocab_lockf[:, 0] - if len(model.trainables.vectors_ngrams_lockf.shape) > 1: - model.trainables.vectors_ngrams_lockf = model.trainables.vectors_ngrams_lockf[:, 0] - - if not hasattr(model.wv, 'bucket'): - model.wv.bucket = model.trainables.bucket - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. 
Re-loading old model in a compatible way.') - from gensim.models.deprecated.fasttext import load_old_fasttext - model = load_old_fasttext(*args, **kwargs) + model = super(FastText, cls).load(*args, rethrow=True, **kwargs) + + if not hasattr(model.wv, 'vectors_vocab_lockf') and hasattr(model.wv, 'vectors_vocab'): + # TODO: try trainables-location + model.wv.vectors_vocab_lockf = ones(len(model.wv.vectors_vocab), dtype=REAL) + if not hasattr(model, 'vectors_ngrams_lockf') and hasattr(model.wv, 'vectors_ngrams'): + # TODO: try trainables-location + model.wv.vectors_ngrams_lockf = ones(len(model.wv.vectors_ngrams), dtype=REAL) + # fixup mistakenly overdimensioned gensim-3.x lockf arrays + if len(model.wv.vectors_vocab_lockf.shape) > 1: + model.wv.vectors_vocab_lockf = model.wv.vectors_vocab_lockf[:, 0] + if len(model.wv.vectors_ngrams_lockf.shape) > 1: + model.wv.vectors_ngrams_lockf = model.wv.vectors_ngrams_lockf[:, 0] + if not hasattr(model, 'bucket'): + model.bucket = model.wv.bucket _try_upgrade(model.wv) return model -class FastTextVocab(Word2VecVocab): +class FastTextVocab(utils.SaveLoad): """This is a redundant class. It exists only to maintain backwards compatibility with older gensim versions.""" pass -class FastTextTrainables(Word2VecTrainables): - """Represents the inner shallow neural network used to train :class:`~gensim.models.fasttext.FastText`. - - Mostly inherits from its parent (:class:`~gensim.models.word2vec.Word2VecTrainables`). - Adds logic for calculating and maintaining ngram weights. - - Attributes - ---------- - hashfxn : function - Used for randomly initializing weights. Defaults to the built-in hash() - layer1_size : int - The size of the inner layer of the NN. Equal to the vector dimensionality. - Set in the :class:`~gensim.models.word2vec.Word2VecTrainables` constructor. - seed : float - The random generator seed used in reset_weights and update_weights. - syn1 : numpy.array - The inner layer of the NN. Each row corresponds to a term in the vocabulary. - Columns correspond to weights of the inner layer. - There are layer1_size such weights. - Set in the reset_weights and update_weights methods, only if hierarchical sampling is used. - syn1neg : numpy.array - Similar to syn1, but only set if negative sampling is used. - vectors_lockf : numpy.array - A one-dimensional array with one element for each term in the vocab. Set in reset_weights to an array of ones. - vectors_vocab_lockf : numpy.array - Similar to vectors_vocab_lockf, ones(len(model.trainables.vectors), dtype=REAL) - vectors_ngrams_lockf : numpy.array - np.ones((self.bucket, wv.vector_size), dtype=REAL) - - """ - def __init__(self, vector_size=100, seed=1, hashfxn=hash, bucket=2000000): - super(FastTextTrainables, self).__init__( - vector_size=vector_size, seed=seed, hashfxn=hashfxn) - self.bucket = int(bucket) - - # - # There are also two "hidden" attributes that get initialized outside - # this constructor: - # - # 1. vectors_vocab_lockf - # 2. vectors_ngrams_lockf - # - # These are both 1D matrices of shapes equal to the lengths of - # wv.vectors_vocab and wv.vectors_ngrams. So, each row corresponds to - # a vector. - # - # Lockf stands for "lock factor": zero values suppress learning, one - # values enable it. The vectors_vocab_lockf and vectors_ngrams_lockf - # are used only by the Cython code in fasttext_inner.pyx. - # - # The word2vec implementation also uses vectors_lockf: in that case, - # it's a 1D array, with a real number for each vector. 
The FastText - # implementation inherits this vectors_lockf attribute but doesn't - # appear to use it. - # - - def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None): - super(FastTextTrainables, self).prepare_weights(hs, negative, wv, update=update, vocabulary=vocabulary) - self.init_ngrams_weights(wv, update=update, vocabulary=vocabulary) - - def init_ngrams_weights(self, wv, update=False, vocabulary=None): - """Compute ngrams of all words present in vocabulary and stores vectors for only those ngrams. - Vectors for other ngrams are initialized with a random uniform distribution in FastText. - - Parameters - ---------- - wv : :class:`~gensim.models.fasttext.FastTextKeyedVectors` - Contains the mapping between the words and embeddings. - The vectors for the computed ngrams will go here. - update : bool - If True, the new vocab words and their new ngrams word vectors are initialized - with random uniform distribution and updated/added to the existing vocab word and ngram vectors. - vocabulary : :class:`~gensim.models.fasttext.FastTextVocab` - This object represents the vocabulary of the model. - If update is True, then vocabulary may not be None. - - """ - if not update: - wv.init_ngrams_weights(self.seed) - self.vectors_vocab_lockf = ones(len(wv.vectors_vocab), dtype=REAL) - self.vectors_ngrams_lockf = ones(len(wv.vectors_ngrams), dtype=REAL) - else: - wv.update_ngrams_weights(self.seed, vocabulary.old_vocab_len) - self.vectors_vocab_lockf = _pad_ones(self.vectors_vocab_lockf, len(wv.vectors_vocab)) - self.vectors_ngrams_lockf = _pad_ones(self.vectors_ngrams_lockf, len(wv.vectors_ngrams)) - - def init_post_load(self, model, hidden_output): - num_vectors = len(model.wv.vectors) - vocab_size = len(model.wv.vocab) - vector_size = model.wv.vector_size - - assert num_vectors > 0, 'expected num_vectors to be initialized already' - assert vocab_size > 0, 'expected vocab_size to be initialized already' - - self.vectors_ngrams_lockf = ones(len(model.wv.vectors_ngrams), dtype=REAL) - self.vectors_vocab_lockf = ones(len(model.wv.vectors_vocab.shape), dtype=REAL) - - if model.hs: - self.syn1 = hidden_output - if model.negative: - self.syn1neg = hidden_output - - self.layer1_size = vector_size +class FastTextTrainables(utils.SaveLoad): + """Obsolete class retained for backward-compatible load()s""" + pass def _pad_ones(m, new_len): @@ -1113,8 +1033,8 @@ def load_facebook_vectors(path, encoding='utf-8'): model training. """ - model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False) - return model_wrapper.wv + full_model = _load_fasttext_format(path, encoding=encoding, full_model=False) + return full_model.wv def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): @@ -1140,9 +1060,9 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): m = gensim.models._fasttext_bin.load(fin, encoding=encoding, full_model=full_model) model = FastText( - size=m.dim, + vector_size=m.dim, window=m.ws, - iter=m.epoch, + epochs=m.epoch, negative=m.neg, hs=int(m.loss == 1), sg=int(m.model == 2), @@ -1153,9 +1073,9 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): max_n=m.maxn, ) model.corpus_total_words = m.ntokens - model.vocabulary.raw_vocab = m.raw_vocab - model.vocabulary.nwords = m.nwords - model.vocabulary.vocab_size = m.vocab_size + model.raw_vocab = m.raw_vocab + model.nwords = m.nwords + model.vocab_size = m.vocab_size # # This is here to fix https://github.com/RaRe-Technologies/gensim/pull/2373. 
@@ -1169,15 +1089,13 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): # Native models trained _without_ pretrained vectors already contain the # trimmed raw_vocab, so this change does not affect them. # - model.vocabulary.prepare_vocab( - model.hs, model.negative, model.wv, - update=True, min_count=1, - ) + model.prepare_vocab(update=True, min_count=1) model.num_original_vectors = m.vectors_ngrams.shape[0] model.wv.init_post_load(m.vectors_ngrams) - model.trainables.init_post_load(model, m.hidden_output) + model.init_post_load(m.hidden_output) + _check_model(model) logger.info("loaded %s weight matrix for fastText model from %s", m.vectors_ngrams.shape, fin.name) @@ -1192,28 +1110,22 @@ def _check_model(m): 'mismatch between vector size in model params ({}) and model vectors ({})' .format(m.wv.vector_size, m.wv.vectors_ngrams) ) - - try: - syn1neg = m.trainables.syn1neg - except AttributeError: - syn1neg = None - - if syn1neg is not None: - assert m.wv.vector_size == m.trainables.syn1neg.shape[1], ( + if m.syn1neg is not None: + assert m.wv.vector_size == m.syn1neg.shape[1], ( 'mismatch between vector size in model params ({}) and trainables ({})' .format(m.wv.vector_size, m.wv.vectors_ngrams) ) - assert len(m.wv.vocab) == m.vocabulary.nwords, ( + assert len(m.wv.vocab) == m.nwords, ( 'mismatch between final vocab size ({} words), ' - 'and expected number of words ({} words)'.format(len(m.wv.vocab), m.vocabulary.nwords) + 'and expected number of words ({} words)'.format(len(m.wv.vocab), m.nwords) ) - if len(m.wv.vocab) != m.vocabulary.vocab_size: + if len(m.wv.vocab) != m.vocab_size: # expecting to log this warning only for pretrained french vector, wiki.fr logger.warning( "mismatch between final vocab size (%s words), and expected vocab size (%s words)", - len(m.wv.vocab), m.vocabulary.vocab_size + len(m.wv.vocab), m.vocab_size ) @@ -1524,7 +1436,6 @@ def init_post_load(self, fb_vectors): self.vectors_vocab = np.array(fb_vectors[:vocab_words, :]) self.vectors_ngrams = np.array(fb_vectors[vocab_words:, :]) self.buckets_word = None # This can get initialized later - self.adjust_vectors() # calculate composite full-word vectors def adjust_vectors(self): diff --git a/gensim/models/fasttext_inner.pyx b/gensim/models/fasttext_inner.pyx index c2794d7d11..0702729c90 100644 --- a/gensim/models/fasttext_inner.pyx +++ b/gensim/models/fasttext_inner.pyx @@ -454,26 +454,26 @@ cdef void init_ft_config(FastTextConfig *c, model, alpha, _work, _neu1): """ c.hs = model.hs c.negative = model.negative - c.sample = (model.vocabulary.sample != 0) + c.sample = (model.sample != 0) c.cbow_mean = model.cbow_mean c.window = model.window c.workers = model.workers c.syn0_vocab = (np.PyArray_DATA(model.wv.vectors_vocab)) - c.word_locks_vocab = (np.PyArray_DATA(model.trainables.vectors_vocab_lockf)) + c.word_locks_vocab = (np.PyArray_DATA(model.wv.vectors_vocab_lockf)) c.syn0_ngrams = (np.PyArray_DATA(model.wv.vectors_ngrams)) - c.word_locks_ngrams = (np.PyArray_DATA(model.trainables.vectors_ngrams_lockf)) + c.word_locks_ngrams = (np.PyArray_DATA(model.wv.vectors_ngrams_lockf)) c.alpha = alpha c.size = model.wv.vector_size if c.hs: - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) if c.negative: - c.syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c.cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c.cum_table_len = len(model.vocabulary.cum_table) + c.syn1neg = (np.PyArray_DATA(model.syn1neg)) + c.cum_table = 
(np.PyArray_DATA(model.cum_table)) + c.cum_table_len = len(model.cum_table) if c.negative or c.sample: c.next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index d4882778d0..68fc62858e 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -187,9 +187,6 @@ from six.moves import zip, range from scipy import stats -# For backwards compatibility, see https://github.com/RaRe-Technologies/gensim/issues/2201 -# -from gensim.models.deprecated.keyedvectors import EuclideanKeyedVectors # noqa logger = logging.getLogger(__name__) @@ -220,6 +217,11 @@ def _load_specials(self, *args, **kwargs): # fixup rename/consolidation into index2key of older index2word, index2entity if not hasattr(self, 'index2key'): self.index2key = self.__dict__.pop('index2word', self.__dict__.pop('index2word', None)) + # fixup rename into vectors of older syn0 + if not hasattr(self, 'vectors'): + self.vectors = self.__dict__.pop('syn0', None) + self.vectors_norm = None + self.vector_size = self.vectors.shape[1] # fixup rename of vocab into map if 'map' not in self.__dict__: self.map = self.__dict__.pop('vocab', None) @@ -1383,6 +1385,7 @@ def similarity_unseen_docs(self, *args, **kwargs): # to help 3.8.1 & older pickles load properly Word2VecKeyedVectors = KeyedVectors Doc2VecKeyedVectors = KeyedVectors +EuclideanKeyedVectors = KeyedVectors def _l2_norm(m, replace=False): diff --git a/gensim/models/word2vec.py b/gensim/models/word2vec.py index b6a6c8c2d6..5432059ec4 100755 --- a/gensim/models/word2vec.py +++ b/gensim/models/word2vec.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Author: Shiva Manne +# Author: Gensim Contributors # Copyright (C) 2018 RaRe Technologies s.r.o. 
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html @@ -129,13 +129,13 @@ from collections import defaultdict, namedtuple from dataclasses import dataclass from typing import List +from types import GeneratorType import threading import itertools -import warnings +import copy from gensim.utils import keep_vocab_item, call_on_class_only, deprecated from gensim.models.keyedvectors import KeyedVectors, pseudorandom_weak_vector -from gensim.models.base_any2vec import BaseWordEmbeddingsModel try: from queue import Queue, Empty @@ -145,6 +145,7 @@ from numpy import exp, dot, zeros, dtype, float32 as REAL,\ uint32, seterr, array, uint8, vstack, fromstring, sqrt,\ sum as np_sum, ones, logaddexp +import numpy as np from gensim import utils, matutils # utility fnc for pickling, common scipy operations etc from six import iteritems, itervalues, string_types @@ -228,12 +229,12 @@ def score_cbow_pair(model, word, l1): return sum(lprob) -class Word2Vec(BaseWordEmbeddingsModel): - def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, window=5, min_count=5, +class Word2Vec(utils.SaveLoad): + def __init__(self, sentences=None, corpus_file=None, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, - sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + sg=0, hs=0, negative=5, ns_exponent=0.75, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=MAX_WORDS_IN_BATCH, compute_loss=False, callbacks=(), - max_final_vocab=None): + comment=None, max_final_vocab=None): """Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/. Once you're finished training a model (=no more updates, only querying) @@ -262,7 +263,7 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or `corpus_file` arguments need to be passed (or none of them, in that case, the model is left uninitialized). - size : int, optional + vector_size : int, optional Dimensionality of the word vectors. window : int, optional Maximum distance between the current and predicted word within a sentence. @@ -310,8 +311,8 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind useful range is (0, 1e-5). hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int, optional - Number of iterations (epochs) over the corpus. + epochs : int, optional + Number of iterations (epochs) over the corpus. (Formerly: `iter`) trim_rule : function, optional Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, be trimmed away, or handled using the default (discard if word count < min_count). @@ -342,48 +343,516 @@ def __init__(self, sentences=None, corpus_file=None, size=100, alpha=0.025, wind -------- Initialize and train a :class:`~gensim.models.word2vec.Word2Vec` model - .. sourcecode:: pycon + .. 
sourcecode:: pycon + + >>> from gensim.models import Word2Vec + >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] + >>> model = Word2Vec(sentences, min_count=1) + + Attributes + ---------- + wv : :class:`~gensim.models.keyedvectors.KeyedVectors` + This object essentially contains the mapping between words and embeddings. After training, it can be used + directly to query those embeddings in various ways. See the module level docstring for examples. + + """ + corpus_iterable = sentences + + self.vector_size = int(vector_size) + self.workers = int(workers) + self.epochs = epochs + self.train_count = 0 + self.total_train_time = 0 + self.batch_words = batch_words + + self.sg = int(sg) + self.alpha = float(alpha) + self.min_alpha = float(min_alpha) + + self.window = int(window) + self.random = np.random.RandomState(seed) + + self.hs = int(hs) + self.negative = int(negative) + self.ns_exponent = ns_exponent + self.cbow_mean = int(cbow_mean) + self.compute_loss = bool(compute_loss) + self.running_training_loss = 0 + self.min_alpha_yet_reached = float(alpha) + self.corpus_count = 0 + self.corpus_total_words = 0 + + self.max_final_vocab = max_final_vocab + self.max_vocab_size = max_vocab_size + self.min_count = min_count + self.sample = sample + self.sorted_vocab = sorted_vocab + self.null_word = null_word + self.cum_table = None # for negative sampling + self.raw_vocab = None + + if not hasattr(self, 'wv'): # set unless subclass already set (eg: FastText) + self.wv = KeyedVectors(vector_size) + + self.hashfxn = hashfxn + self.seed = seed + if not hasattr(self, 'layer1_size'): # set unless subclass already set (as for Doc2Vec dm_concat mode) + self.layer1_size = vector_size + + self.comment = comment + + self.load = call_on_class_only + + if corpus_iterable is not None or corpus_file is not None: + self.build_vocab_and_train(corpus_iterable=corpus_iterable, corpus_file=corpus_file, + trim_rule=trim_rule, callbacks=callbacks) + else: + if trim_rule is not None: + logger.warning( + "The rule, if given, is only used to prune vocabulary during build_vocab() " + "and is not stored as part of the model. Model initialized without sentences. " + "trim_rule provided, if any, will be ignored.") + if callbacks: + logger.warning( + "Callbacks are no longer retained by the model, so must be provided whenever " + "training is triggered, as in initialization with a corpus or calling `train()`. " + "The callbacks provided in this initialization without triggering train will " + "be ignored.") + + def build_vocab_and_train(self, corpus_iterable=None, corpus_file=None, trim_rule=None, callbacks=None): + if not (corpus_iterable is None) ^ (corpus_file is None): + raise ValueError("You must provide only one of corpus_iterable or corpus_file arguments.") + if corpus_file is not None and not isinstance(corpus_file, string_types): + raise TypeError("You must pass string as the corpus_file argument.") + elif isinstance(corpus_iterable, GeneratorType): + raise TypeError("You can't pass a generator as the sentences argument. Try a sequence.") + # TODO: test for restartable? 
+ self.build_vocab(corpus_iterable=corpus_iterable, corpus_file=corpus_file, trim_rule=trim_rule) + self.train( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, total_examples=self.corpus_count, + total_words=self.corpus_total_words, epochs=self.epochs, start_alpha=self.alpha, + end_alpha=self.min_alpha, compute_loss=self.compute_loss, callbacks=callbacks) + + def build_vocab(self, corpus_iterable=None, corpus_file=None, update=False, progress_per=10000, + keep_raw_vocab=False, trim_rule=None, **kwargs): + """Build vocabulary from a sequence of sentences (can be a once-only generator stream). + + Parameters + ---------- + corpus_iterable : iterable of list of str + Can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` module for such examples. + corpus_file : str, optional + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or + `corpus_file` arguments need to be passed (not both of them). + update : bool + If true, the new words in `sentences` will be added to model's vocab. + progress_per : int, optional + Indicates how many words to process before showing/updating the progress. + keep_raw_vocab : bool, optional + If False, the raw vocabulary will be deleted after the scaling is done to free up RAM. + trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + + **kwargs : object + Key word arguments propagated to `self.prepare_vocab` + + """ + total_words, corpus_count = self.scan_vocab( + corpus_iterable=corpus_iterable, corpus_file=corpus_file, progress_per=progress_per, trim_rule=trim_rule) + self.corpus_count = corpus_count + self.corpus_total_words = total_words + report_values = self.prepare_vocab(update=update, keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, **kwargs) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + self.prepare_weights(update=update) + + def build_vocab_from_freq(self, word_freq, keep_raw_vocab=False, corpus_count=None, trim_rule=None, update=False): + """Build vocabulary from a dictionary of word frequencies. + + Parameters + ---------- + word_freq : dict of (str, int) + A mapping from a word in the vocabulary to its frequency count. + keep_raw_vocab : bool, optional + If False, delete the raw vocabulary after the scaling is done to free up RAM. + corpus_count : int, optional + Even if no corpus is provided, this argument can set corpus_count explicitly. 
+ trim_rule : function, optional + Vocabulary trimming rule, specifies whether certain words should remain in the vocabulary, + be trimmed away, or handled using the default (discard if word count < min_count). + Can be None (min_count will be used, look to :func:`~gensim.utils.keep_vocab_item`), + or a callable that accepts parameters (word, count, min_count) and returns either + :attr:`gensim.utils.RULE_DISCARD`, :attr:`gensim.utils.RULE_KEEP` or :attr:`gensim.utils.RULE_DEFAULT`. + The rule, if given, is only used to prune vocabulary during current method call and is not stored as part + of the model. + + The input parameters are of the following types: + * `word` (str) - the word we are examining + * `count` (int) - the word's frequency count in the corpus + * `min_count` (int) - the minimum count threshold. + + update : bool, optional + If true, the new provided words in `word_freq` dict will be added to model's vocab. + + """ + logger.info("Processing provided word frequencies") + # Instead of scanning text, this will assign provided word frequencies dictionary(word_freq) + # to be directly the raw vocab + raw_vocab = word_freq + logger.info( + "collected %i different raw word, with total frequency of %i", + len(raw_vocab), sum(itervalues(raw_vocab)) + ) + + # Since no sentences are provided, this is to control the corpus_count. + self.corpus_count = corpus_count or 0 + self.raw_vocab = raw_vocab + + # trim by min_count & precalculate downsampling + report_values = self.prepare_vocab(keep_raw_vocab=keep_raw_vocab, trim_rule=trim_rule, update=update) + report_values['memory'] = self.estimate_memory(vocab_size=report_values['num_retained_words']) + self.prepare_weights(update=update) # build tables & arrays + + def _scan_vocab(self, sentences, progress_per, trim_rule): + sentence_no = -1 + total_words = 0 + min_reduce = 1 + vocab = defaultdict(int) + checked_string_types = 0 + for sentence_no, sentence in enumerate(sentences): + if not checked_string_types: + if isinstance(sentence, string_types): + logger.warning( + "Each 'sentences' item should be a list of words (usually unicode strings). 
" + "First item here is instead plain %s.", + type(sentence) + ) + checked_string_types += 1 + if sentence_no % progress_per == 0: + logger.info( + "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", + sentence_no, total_words, len(vocab) + ) + for word in sentence: + vocab[word] += 1 + total_words += len(sentence) + + if self.max_vocab_size and len(vocab) > self.max_vocab_size: + utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) + min_reduce += 1 + + corpus_count = sentence_no + 1 + self.raw_vocab = vocab + return total_words, corpus_count + + def scan_vocab(self, corpus_iterable=None, corpus_file=None, progress_per=10000, workers=None, trim_rule=None): + logger.info("collecting all words and their counts") + if corpus_file: + corpus_iterable = LineSentence(corpus_file) + + total_words, corpus_count = self._scan_vocab(corpus_iterable, progress_per, trim_rule) + + logger.info( + "collected %i word types from a corpus of %i raw words and %i sentences", + len(self.raw_vocab), total_words, corpus_count + ) + + return total_words, corpus_count + + def sort_vocab(self): + """Sort the vocabulary so the most frequent words have the lowest indexes.""" + if len(self.wv.vectors): + raise RuntimeError("cannot sort vocabulary after model weights already initialized.") + self.wv.index2key.sort(key=lambda word: self.wv.vocab[word].count, reverse=True) + for i, word in enumerate(self.wv.index2key): + self.wv.vocab[word].index = i + + def prepare_vocab( + self, update=False, keep_raw_vocab=False, trim_rule=None, + min_count=None, sample=None, dry_run=False): + """Apply vocabulary settings for `min_count` (discarding less-frequent words) + and `sample` (controlling the downsampling of more-frequent words). + + Calling with `dry_run=True` will only simulate the provided settings and + report the size of the retained vocabulary, effective corpus length, and + estimated memory requirements. Results are both printed via logging and + returned as a dict. + + Delete the raw vocabulary after the scaling is done to free up RAM, + unless `keep_raw_vocab` is set. 
+ + """ + min_count = min_count or self.min_count + sample = sample or self.sample + drop_total = drop_unique = 0 + + # set effective_min_count to min_count in case max_final_vocab isn't set + self.effective_min_count = min_count + + # if max_final_vocab is specified instead of min_count + # pick a min_count which satisfies max_final_vocab as well as possible + if self.max_final_vocab is not None: + sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) + calc_min_count = 1 + + if self.max_final_vocab < len(sorted_vocab): + calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 + + self.effective_min_count = max(calc_min_count, min_count) + logger.info( + "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", + self.max_final_vocab, min_count, calc_min_count, self.effective_min_count + ) + + if not update: + logger.info("Loading a fresh vocabulary") + retain_total, retain_words = 0, [] + # Discard words less-frequent than min_count + if not dry_run: + self.wv.index2key = [] + # make stored settings match these applied settings + self.min_count = min_count + self.sample = sample + self.wv.vocab = {} + + for word, v in iteritems(self.raw_vocab): + if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): + retain_words.append(word) + retain_total += v + if not dry_run: + self.wv.vocab[word] = W2VVocab(count=v, index=len(self.wv.index2key)) + self.wv.index2key.append(word) + else: + drop_unique += 1 + drop_total += v + original_unique_total = len(retain_words) + drop_unique + retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) + logger.info( + "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", + self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique + ) + original_total = retain_total + drop_total + retain_pct = retain_total * 100 / max(original_total, 1) + logger.info( + "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", + self.effective_min_count, retain_total, retain_pct, original_total, drop_total + ) + else: + logger.info("Updating model with new vocabulary") + new_total = pre_exist_total = 0 + new_words = pre_exist_words = [] + for word, v in iteritems(self.raw_vocab): + if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): + if word in self.wv.vocab: + pre_exist_words.append(word) + pre_exist_total += v + if not dry_run: + self.wv.vocab[word].count += v + else: + new_words.append(word) + new_total += v + if not dry_run: + self.wv.vocab[word] = W2VVocab(count=v, index=len(self.wv.index2key)) + self.wv.index2key.append(word) + else: + drop_unique += 1 + drop_total += v + original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique + pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) + new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) + logger.info( + "New added %i unique words (%i%% of original %i) " + "and increased the count of %i pre-existing words (%i%% of original %i)", + len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), + pre_exist_unique_pct, original_unique_total + ) + retain_words = new_words + pre_exist_words + retain_total = new_total + pre_exist_total + + # Precalculate each vocabulary item's threshold for sampling + if not sample: + # no words downsampled + threshold_count = retain_total + elif sample < 1.0: + # 
traditional meaning: set parameter as proportion of total + threshold_count = sample * retain_total + else: + # new shorthand: sample >= 1 means downsample all words with higher count than sample + threshold_count = int(sample * (3 + sqrt(5)) / 2) + + downsample_total, downsample_unique = 0, 0 + for w in retain_words: + v = self.raw_vocab[w] + word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) + if word_probability < 1.0: + downsample_unique += 1 + downsample_total += word_probability * v + else: + word_probability = 1.0 + downsample_total += v + if not dry_run: + self.wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + + if not dry_run and not keep_raw_vocab: + logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) + self.raw_vocab = defaultdict(int) + + logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) + logger.info( + "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", + downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total + ) + + # return from each step: words-affected, resulting-corpus-size, extra memory estimates + report_values = { + 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, + 'downsample_total': int(downsample_total), 'num_retained_words': len(retain_words) + } + + if self.null_word: + # create null pseudo-word for padding when using concatenative L1 (run-of-words) + # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter + self.add_null_word() + + if self.sorted_vocab and not update: + self.sort_vocab() + if self.hs: + # add info about each word's Huffman encoding + self.create_binary_tree() + if self.negative: + # build the table for drawing random words (for negative sampling) + self.make_cum_table() + + return report_values + + def estimate_memory(self, vocab_size=None, report=None): + """Estimate required memory for a model using current settings and provided vocabulary size. + + Parameters + ---------- + vocab_size : int, optional + Number of unique tokens in the vocabulary + report : dict of (str, int), optional + A dictionary from string representations of the model's memory consuming members to their size in bytes. + + Returns + ------- + dict of (str, int) + A dictionary from string representations of the model's memory consuming members to their size in bytes. + + """ + vocab_size = vocab_size or len(self.wv.vocab) + report = report or {} + report['vocab'] = vocab_size * (700 if self.hs else 500) + report['vectors'] = vocab_size * self.vector_size * dtype(REAL).itemsize + if self.hs: + report['syn1'] = vocab_size * self.layer1_size * dtype(REAL).itemsize + if self.negative: + report['syn1neg'] = vocab_size * self.layer1_size * dtype(REAL).itemsize + report['total'] = sum(report.values()) + logger.info( + "estimated required memory for %i words and %i dimensions: %i bytes", + vocab_size, self.vector_size, report['total'] + ) + return report + + def add_null_word(self): + word, v = '\0', W2VVocab(count=1, sample_int=0) + v.index = len(self.wv.vocab) + self.wv.index2key.append(word) + self.wv.vocab[word] = v + + def create_binary_tree(self): + """Create a `binary Huffman tree `_ using stored vocabulary + word counts. Frequent words will have shorter binary codes. + Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`. 
+ + """ + _assign_binary_codes(self.wv.vocab) + + def make_cum_table(self, domain=2**31 - 1): + """Create a cumulative-distribution table using stored vocabulary word counts for + drawing random words in the negative-sampling training routines. + + To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), + then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). + That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. - >>> from gensim.models import Word2Vec - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> model = Word2Vec(sentences, min_count=1) + """ + vocab_size = len(self.wv.index2key) + self.cum_table = zeros(vocab_size, dtype=uint32) + # compute sum of all power (Z in paper) + train_words_pow = 0.0 + for word_index in range(vocab_size): + train_words_pow += self.wv.vocab[self.wv.index2key[word_index]].count**self.ns_exponent + cumulative = 0.0 + for word_index in range(vocab_size): + cumulative += self.wv.vocab[self.wv.index2key[word_index]].count**self.ns_exponent + self.cum_table[word_index] = round(cumulative / train_words_pow * domain) + if len(self.cum_table) > 0: + assert self.cum_table[-1] == domain - Some important attributes are the following: + def prepare_weights(self, update=False): + """Build tables and model weights based on final vocabulary settings.""" + # set initial input/projection and hidden weights + if not update: + self.reset_weights() + else: + self.update_weights() - Attributes - ---------- - wv : :class:`~gensim.models.keyedvectors.KeyedVectors` - This object essentially contains the mapping between words and embeddings. After training, it can be used - directly to query those embeddings in various ways. See the module level docstring for examples. + @deprecated("Use gensim.models.keyedvectors.pseudorandom_weak_vector() directly") + def seeded_vector(self, seed_string, vector_size): + return pseudorandom_weak_vector(vector_size, seed_string=seed_string, hashfxn=self.hashfxn) - vocabulary : :class:`~gensim.models.word2vec.Word2VecVocab` - This object represents the vocabulary (sometimes called Dictionary in gensim) of the model. - Besides keeping track of all unique words, this object provides extra functionality, such as - constructing a huffman tree (frequent words are closer to the root), or discarding extremely rare words. + def reset_weights(self): + """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" + logger.info("resetting layer weights") + self.wv.resize_vectors() + self.wv.randomly_initialize_vectors(seed=self.seed) + if self.hs: + self.syn1 = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) + if self.negative: + self.syn1neg = zeros((len(self.wv.vocab), self.layer1_size), dtype=REAL) - trainables : :class:`~gensim.models.word2vec.Word2VecTrainables` - This object represents the inner shallow neural network used to train the embeddings. The semantics - of the network differ slightly in the two available training modes (CBOW or SG) but you can think of it - as a NN with single projection and hidden layer which we train on the corpus. The weights are then used - as our embeddings (which means that the size of the hidden layer is equal to the number of features - `self.size`). 
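+        # vectors_lockf holds one lock-factor per word: 1.0 allows normal training updates, while 0.0
+        # freezes a word's vector (intersect_word2vec_format() sets these factors for imported vectors)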
+ self.wv.vectors_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning - """ - self.max_final_vocab = max_final_vocab + def update_weights(self): + """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" + logger.info("updating layer weights") + new_range = self.wv.resize_vectors() + gained_vocab = len(new_range) + self.wv.randomly_initialize_vectors(indexes=new_range) - self.callbacks = callbacks - self.load = call_on_class_only + # Raise an error if an online update is run before initial training on a corpus + if not len(self.wv.vectors): + raise RuntimeError( + "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " + "First build the vocabulary of your model with a corpus before doing an online update." + ) - self.wv = KeyedVectors(size) - self.vocabulary = Word2VecVocab( - max_vocab_size=max_vocab_size, min_count=min_count, sample=sample, sorted_vocab=bool(sorted_vocab), - null_word=null_word, max_final_vocab=max_final_vocab, ns_exponent=ns_exponent) - self.trainables = Word2VecTrainables(seed=seed, vector_size=size, hashfxn=hashfxn) + if self.hs: + self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) + if self.negative: + pad = zeros((gained_vocab, self.layer1_size), dtype=REAL) + self.syn1neg = vstack([self.syn1neg, pad]) + self.wv.vectors_norm = None - super(Word2Vec, self).__init__( - sentences=sentences, corpus_file=corpus_file, workers=workers, vector_size=size, epochs=iter, - callbacks=callbacks, batch_words=batch_words, trim_rule=trim_rule, sg=sg, alpha=alpha, window=window, - seed=seed, hs=hs, negative=negative, cbow_mean=cbow_mean, min_alpha=min_alpha, compute_loss=compute_loss) + # do not suppress learning for already learned words + self.wv.vectors_lockf = ones(len(self.wv.vocab), dtype=REAL) # zeros suppress learning def _do_train_epoch(self, corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, total_examples=None, total_words=None, **kwargs): @@ -428,14 +897,10 @@ def _clear_post_train(self): """Remove all L2-normalized word vectors from the model.""" self.wv.vectors_norm = None - def _set_train_params(self, **kwargs): - if 'compute_loss' in kwargs: - self.compute_loss = kwargs['compute_loss'] - self.running_training_loss = 0 - - def train(self, sentences=None, corpus_file=None, total_examples=None, total_words=None, + def train(self, corpus_iterable=None, corpus_file=None, total_examples=None, total_words=None, epochs=None, start_alpha=None, end_alpha=None, word_count=0, - queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=()): + queue_factor=2, report_delay=1.0, compute_loss=False, callbacks=(), + **kwargs): """Update the model's neural weights from a sequence of sentences. Notes @@ -454,63 +919,699 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor Parameters ---------- - sentences : iterable of list of str - The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, - consider an iterable that streams the sentences directly from disk/network. - See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` - or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. - See also the `tutorial on data streaming in Python - `_. - corpus_file : str, optional - Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. 
- You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or - `corpus_file` arguments need to be passed (not both of them). + corpus_iterable : iterable of list of str + The `sentences` iterable can be simply a list of lists of tokens, but for larger corpora, + consider an iterable that streams the sentences directly from disk/network. + See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus` + or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples. + See also the `tutorial on data streaming in Python + `_. + corpus_file : str, optional + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + You may use this argument instead of `sentences` to get performance boost. Only one of `sentences` or + `corpus_file` arguments need to be passed (not both of them). + total_examples : int + Count of sentences. + total_words : int + Count of raw words in sentences. + epochs : int + Number of iterations (epochs) over the corpus. + start_alpha : float, optional + Initial learning rate. If supplied, replaces the starting `alpha` from the constructor, + for this one call to`train()`. + Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself + (not recommended). + end_alpha : float, optional + Final learning rate. Drops linearly from `start_alpha`. + If supplied, this replaces the final `min_alpha` from the constructor, for this one call to `train()`. + Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself + (not recommended). + word_count : int, optional + Count of words already trained. Set this to 0 for the usual + case of training on all words in sentences. + queue_factor : int, optional + Multiplier for size of queue (number of workers * queue_factor). + report_delay : float, optional + Seconds to wait before reporting progress. + compute_loss: bool, optional + If True, computes and stores loss value which can be retrieved using + :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. + callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional + Sequence of callbacks to be executed at specific stages during training. + + Examples + -------- + .. 
sourcecode:: pycon
+
+            >>> from gensim.models import Word2Vec
+            >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
+            >>>
+            >>> model = Word2Vec(min_count=1)
+            >>> model.build_vocab(sentences)  # prepare the model vocabulary
+            >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)  # train word vectors
+            (1, 30)
+
+        """
+        self.alpha = start_alpha or self.alpha
+        self.min_alpha = end_alpha or self.min_alpha
+        self.epochs = epochs
+
+        self._check_training_sanity(
+            epochs=epochs,
+            total_examples=total_examples,
+            total_words=total_words)
+
+        self.compute_loss = compute_loss
+        self.running_training_loss = 0.0
+
+        for callback in callbacks:
+            callback.on_train_begin(self)
+
+        trained_word_count = 0
+        raw_word_count = 0
+        start = default_timer() - 0.00001
+        job_tally = 0
+
+        for cur_epoch in range(self.epochs):
+            for callback in callbacks:
+                callback.on_epoch_begin(self)
+
+            if corpus_iterable is not None:
+                trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch(
+                    corpus_iterable, cur_epoch=cur_epoch, total_examples=total_examples,
+                    total_words=total_words, queue_factor=queue_factor, report_delay=report_delay,
+                    callbacks=callbacks, **kwargs)
+            else:
+                trained_word_count_epoch, raw_word_count_epoch, job_tally_epoch = self._train_epoch_corpusfile(
+                    corpus_file, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words,
+                    callbacks=callbacks, **kwargs)
+
+            trained_word_count += trained_word_count_epoch
+            raw_word_count += raw_word_count_epoch
+            job_tally += job_tally_epoch
+
+            for callback in callbacks:
+                callback.on_epoch_end(self)
+
+        # Log overall time
+        total_elapsed = default_timer() - start
+        self._log_train_end(raw_word_count, trained_word_count, total_elapsed, job_tally)
+
+        self.train_count += 1  # number of times train() has been called
+        self._clear_post_train()
+
+        for callback in callbacks:
+            callback.on_train_end(self)
+        return trained_word_count, raw_word_count
+
+    def _worker_loop_corpusfile(self, corpus_file, thread_id, offset, cython_vocab, progress_queue, cur_epoch=0,
+                                total_examples=None, total_words=None, **kwargs):
+        """Train the model on a `corpus_file` in LineSentence format.
+
+        This function will be called in parallel by multiple workers (threads or processes) to make
+        optimal use of multicore machines.
+
+        Parameters
+        ----------
+        corpus_file : str
+            Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format.
+        thread_id : int
+            Thread index starting from 0 to `number of workers - 1`.
+        offset : int
+            Offset (in bytes) in the `corpus_file` for a particular worker.
+        cython_vocab : :class:`~gensim.models.word2vec_inner.CythonVocab`
+            Copy of the vocabulary in order to access it without GIL.
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of data chunk processed, for example number of sentences in the corpus chunk.
+                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+                * Total word count used in training.
+        **kwargs : object
+            Additional keyword parameters for the specific model inheriting from this class.
+ + """ + thread_private_mem = self._get_thread_working_mem() + + examples, tally, raw_tally = self._do_train_epoch( + corpus_file, thread_id, offset, cython_vocab, thread_private_mem, cur_epoch, + total_examples=total_examples, total_words=total_words, **kwargs) + + progress_queue.put((examples, tally, raw_tally)) + progress_queue.put(None) + + def _worker_loop(self, job_queue, progress_queue): + """Train the model, lifting batches of data from the queue. + + This function will be called in parallel by multiple workers (threads or processes) to make + optimal use of multicore machines. + + Parameters + ---------- + job_queue : Queue of (list of objects, (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + + """ + thread_private_mem = self._get_thread_working_mem() + jobs_processed = 0 + callbacks = progress_queue.callbacks + while True: + job = job_queue.get() + if job is None: + progress_queue.put(None) + break # no more jobs => quit this worker + data_iterable, job_parameters = job + + for callback in callbacks: + callback.on_batch_begin(self) + + tally, raw_tally = self._do_train_job(data_iterable, job_parameters, thread_private_mem) + + for callback in callbacks: + callback.on_batch_end(self) + + progress_queue.put((len(data_iterable), tally, raw_tally)) # report back progress + jobs_processed += 1 + logger.debug("worker exiting, processed %i jobs", jobs_processed) + + def _job_producer(self, data_iterator, job_queue, cur_epoch=0, total_examples=None, total_words=None): + """Fill the jobs queue using the data found in the input stream. + + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is a dictionary of parameters. + + Parameters + ---------- + data_iterator : iterable of list of objects + The input dataset. This will be split in chunks and these chunks will be pushed to the queue. + job_queue : Queue of (list of object, dict of (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. 
+ + """ + job_batch, batch_size = [], 0 + pushed_words, pushed_examples = 0, 0 + next_job_params = self._get_job_params(cur_epoch) + job_no = 0 + + for data_idx, data in enumerate(data_iterator): + data_length = self._raw_word_count([data]) + + # can we fit this sentence into the existing job batch? + if batch_size + data_length <= self.batch_words: + # yes => add it to the current job + job_batch.append(data) + batch_size += data_length + else: + job_no += 1 + job_queue.put((job_batch, next_job_params)) + + # update the learning rate for the next job + if total_examples: + # examples-based decay + pushed_examples += len(job_batch) + epoch_progress = 1.0 * pushed_examples / total_examples + else: + # words-based decay + pushed_words += self._raw_word_count(job_batch) + epoch_progress = 1.0 * pushed_words / total_words + next_job_params = self._update_job_params(next_job_params, epoch_progress, cur_epoch) + + # add the sentence that didn't fit as the first item of a new job + job_batch, batch_size = [data], data_length + # add the last job too (may be significantly smaller than batch_words) + if job_batch: + job_no += 1 + job_queue.put((job_batch, next_job_params)) + + if job_no == 0 and self.train_count == 0: + logger.warning( + "train() called with an empty iterator (if not intended, " + "be sure to provide a corpus that offers restartable iteration = an iterable)." + ) + + # give the workers heads up that they can finish -- no more work! + for _ in range(self.workers): + job_queue.put(None) + logger.debug("job loop exiting, total %i jobs", job_no) + + def _log_epoch_progress(self, progress_queue=None, job_queue=None, cur_epoch=0, total_examples=None, + total_words=None, report_delay=1.0, is_corpus_file_mode=None): + """Get the progress report for a single training epoch. + + Parameters + ---------- + progress_queue : Queue of (int, int, int) + A queue of progress reports. Each report is represented as a tuple of these 3 elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + job_queue : Queue of (list of object, dict of (str, int)) + A queue of jobs still to be processed. The worker will take up jobs from this queue. + Each job is represented by a tuple where the first element is the corpus chunk to be processed and + the second is the dictionary of parameters. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus. Used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus. Used to log progress. + report_delay : float, optional + Number of seconds between two consecutive progress report messages in the logger. + is_corpus_file_mode : bool, optional + Whether training is file-based (corpus_file argument) or not. + + Returns + ------- + (int, int, int) + The epoch report consisting of three elements: + * size of data chunk processed, for example number of sentences in the corpus chunk. 
+ * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. + + """ + example_count, trained_word_count, raw_word_count = 0, 0, 0 + start, next_report = default_timer() - 0.00001, 1.0 + job_tally = 0 + unfinished_worker_count = self.workers + + while unfinished_worker_count > 0: + report = progress_queue.get() # blocks if workers too slow + if report is None: # a thread reporting that it finished + unfinished_worker_count -= 1 + logger.info("worker thread finished; awaiting finish of %i more threads", unfinished_worker_count) + continue + examples, trained_words, raw_words = report + job_tally += 1 + + # update progress stats + example_count += examples + trained_word_count += trained_words # only words in vocab & sampled + raw_word_count += raw_words + + # log progress once every report_delay seconds + elapsed = default_timer() - start + if elapsed >= next_report: + self._log_progress( + job_queue, progress_queue, cur_epoch, example_count, total_examples, + raw_word_count, total_words, trained_word_count, elapsed) + next_report = elapsed + report_delay + # all done; report the final stats + elapsed = default_timer() - start + self._log_epoch_end( + cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode) + self.total_train_time += elapsed + return trained_word_count, raw_word_count, job_tally + + def _train_epoch_corpusfile( + self, corpus_file, cur_epoch=0, total_examples=None, total_words=None, callbacks=(), **kwargs): + """Train the model for a single epoch. + + Parameters + ---------- + corpus_file : str + Path to a corpus file in :class:`~gensim.models.word2vec.LineSentence` format. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus, used to log progress. + total_words : int + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus, used to log progress. Must be provided in order to seek in `corpus_file`. + **kwargs : object + Additional key word parameters for the specific model inheriting from this class. + + Returns + ------- + (int, int, int) + The training report for this epoch consisting of three elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. 
+ + """ + if not total_words: + raise ValueError("total_words must be provided alongside corpus_file argument.") + + from gensim.models.word2vec_corpusfile import CythonVocab + from gensim.models.fasttext import FastText + cython_vocab = CythonVocab(self.wv, hs=self.hs, fasttext=isinstance(self, FastText)) + + progress_queue = Queue() + + corpus_file_size = os.path.getsize(corpus_file) + + thread_kwargs = copy.copy(kwargs) + thread_kwargs['cur_epoch'] = cur_epoch + thread_kwargs['total_examples'] = total_examples + thread_kwargs['total_words'] = total_words + workers = [ + threading.Thread( + target=self._worker_loop_corpusfile, + args=( + corpus_file, thread_id, corpus_file_size / self.workers * thread_id, cython_vocab, progress_queue + ), + kwargs=thread_kwargs + ) for thread_id in range(self.workers) + ] + + for thread in workers: + thread.daemon = True + thread.start() + + trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( + progress_queue=progress_queue, job_queue=None, cur_epoch=cur_epoch, + total_examples=total_examples, total_words=total_words, is_corpus_file_mode=True) + + return trained_word_count, raw_word_count, job_tally + + def _train_epoch(self, data_iterable, cur_epoch=0, total_examples=None, total_words=None, + queue_factor=2, report_delay=1.0, callbacks=()): + """Train the model for a single epoch. + + Parameters + ---------- + data_iterable : iterable of list of object + The input corpus. This will be split in chunks and these chunks will be pushed to the queue. + cur_epoch : int, optional + The current training epoch, needed to compute the training parameters for each job. + For example in many implementations the learning rate would be dropping with the number of epochs. + total_examples : int, optional + Count of objects in the `data_iterator`. In the usual case this would correspond to the number of sentences + in a corpus, used to log progress. + total_words : int, optional + Count of total objects in `data_iterator`. In the usual case this would correspond to the number of raw + words in a corpus, used to log progress. + queue_factor : int, optional + Multiplier for size of queue -> size = number of workers * queue_factor. + report_delay : float, optional + Number of seconds between two consecutive progress report messages in the logger. + + Returns + ------- + (int, int, int) + The training report for this epoch consisting of three elements: + * Size of data chunk processed, for example number of sentences in the corpus chunk. + * Effective word count used in training (after ignoring unknown words and trimming the sentence length). + * Total word count used in training. 
+ + """ + job_queue = Queue(maxsize=queue_factor * self.workers) + progress_queue = Queue(maxsize=(queue_factor + 1) * self.workers) + progress_queue.callbacks = callbacks # messy way to pass along for just this session + + workers = [ + threading.Thread( + target=self._worker_loop, + args=(job_queue, progress_queue,)) + for _ in range(self.workers) + ] + + workers.append(threading.Thread( + target=self._job_producer, + args=(data_iterable, job_queue), + kwargs={'cur_epoch': cur_epoch, 'total_examples': total_examples, 'total_words': total_words})) + + for thread in workers: + thread.daemon = True # make interrupting the process with ctrl+c easier + thread.start() + + trained_word_count, raw_word_count, job_tally = self._log_epoch_progress( + progress_queue, job_queue, cur_epoch=cur_epoch, total_examples=total_examples, total_words=total_words, + report_delay=report_delay, is_corpus_file_mode=False) + + return trained_word_count, raw_word_count, job_tally + + def _get_job_params(self, cur_epoch): + """Get the learning rate used in the current epoch. + + Parameters + ---------- + cur_epoch : int + Current iteration through the corpus + + Returns + ------- + float + The learning rate for this epoch (it is linearly reduced with epochs from `self.alpha` to `self.min_alpha`). + + """ + alpha = self.alpha - ((self.alpha - self.min_alpha) * float(cur_epoch) / self.epochs) + return alpha + + def _update_job_params(self, job_params, epoch_progress, cur_epoch): + """Get the correct learning rate for the next iteration. + + Parameters + ---------- + job_params : dict of (str, obj) + UNUSED. + epoch_progress : float + Ratio of finished work in the current epoch. + cur_epoch : int + Number of current iteration. + + Returns + ------- + float + The learning rate to be used in the next training epoch. + + """ + start_alpha = self.alpha + end_alpha = self.min_alpha + progress = (cur_epoch + epoch_progress) / self.epochs + next_alpha = start_alpha - (start_alpha - end_alpha) * progress + next_alpha = max(end_alpha, next_alpha) + self.min_alpha_yet_reached = next_alpha + return next_alpha + + def _get_thread_working_mem(self): + """Computes the memory used per worker thread. + + Returns + ------- + (np.ndarray, np.ndarray) + Each worker threads private work memory. + + """ + work = matutils.zeros_aligned(self.layer1_size, dtype=REAL) # per-thread private work memory + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) + return work, neu1 + + def _raw_word_count(self, job): + """Get the number of words in a given job. + + Parameters + ---------- + job: iterable of list of str + The corpus chunk processed in a single batch. + + Returns + ------- + int + Number of raw words in the corpus chunk. + + """ + return sum(len(sentence) for sentence in job) + + def _check_training_sanity(self, epochs=None, total_examples=None, total_words=None, **kwargs): + """Checks whether the training parameters make sense. + + Called right before training starts in :meth:`~gensim.models.base_any2vec.BaseWordEmbeddingsModel.train` + and raises warning or errors depending on the severity of the issue in case an inconsistent parameter + combination is detected. + + Parameters + ---------- + epochs : int, optional + Number of training epochs. Must have a (non None) value. + total_examples : int, optional + Number of documents in the corpus. Either `total_examples` or `total_words` **must** be supplied. + total_words : int, optional + Number of words in the corpus. Either `total_examples` or `total_words` **must** be supplied. 
+        **kwargs : object
+            Unused. Present to preserve signature among base and inherited implementations.
+
+        Raises
+        ------
+        RuntimeError
+            If one of the required training pre/post processing steps has not been performed.
+        ValueError
+            If the combination of input parameters is inconsistent.
+
+        """
+        if self.alpha > self.min_alpha_yet_reached:
+            logger.warning("Effective 'alpha' higher than previous training cycles")
+
+        if not self.wv.vocab:  # should be set by `build_vocab`
+            raise RuntimeError("you must first build vocabulary before training the model")
+        if not len(self.wv.vectors):
+            raise RuntimeError("you must initialize vectors before training the model")
+
+        if not hasattr(self, 'corpus_count'):
+            raise ValueError(
+                "The number of examples in the training corpus is missing. "
+                "Please make sure this is set inside `build_vocab` function. "
+                "Call the `build_vocab` function before calling `train`."
+            )
+
+        if total_words is None and total_examples is None:
+            raise ValueError(
+                "You must specify either total_examples or total_words, for proper learning-rate "
+                "and progress calculations. "
+                "The usual value is total_examples=model.corpus_count."
+            )
+        if epochs is None:
+            raise ValueError("You must specify an explicit epochs count. The usual value is epochs=model.epochs.")
+        logger.info(
+            "training model with %i workers on %i vocabulary and %i features, "
+            "using sg=%s hs=%s sample=%s negative=%s window=%s",
+            self.workers, len(self.wv.vocab), self.layer1_size, self.sg,
+            self.hs, self.sample, self.negative, self.window
+        )
+
+    def _log_progress(self, job_queue, progress_queue, cur_epoch, example_count, total_examples,
+                      raw_word_count, total_words, trained_word_count, elapsed):
+        """Callback used to log progress for long-running jobs.
+
+        Parameters
+        ----------
+        job_queue : Queue of (list of object, dict of (str, float))
+            The queue of jobs still to be performed by workers. Each job is represented as a tuple containing
+            the batch of data to be processed and the parameters to be used for the processing as a dict.
+        progress_queue : Queue of (int, int, int)
+            A queue of progress reports. Each report is represented as a tuple of these 3 elements:
+                * Size of data chunk processed, for example number of sentences in the corpus chunk.
+                * Effective word count used in training (after ignoring unknown words and trimming the sentence length).
+                * Total word count used in training.
+        cur_epoch : int
+            The current training iteration through the corpus.
+        example_count : int
+            Number of examples (could be sentences, for example) processed until now.
+        total_examples : int
+            Number of all examples present in the input corpus.
+        raw_word_count : int
+            Number of words used in training until now.
+        total_words : int
+            Number of all words in the input corpus.
+        trained_word_count : int
+            Number of effective words used in training until now (after ignoring unknown words and trimming
+            the sentence length).
+        elapsed : int
+            Elapsed time since the beginning of training in seconds.
+
+        Notes
+        -----
+        If you train the model via the `corpus_file` argument, there is no job_queue, so the reported job_queue
+        size will always be equal to -1.
+ + """ + if total_examples: + # examples-based progress % + logger.info( + "EPOCH %i - PROGRESS: at %.2f%% examples, %.0f words/s, in_qsize %i, out_qsize %i", + cur_epoch + 1, 100.0 * example_count / total_examples, trained_word_count / elapsed, + -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) + ) + else: + # words-based progress % + logger.info( + "EPOCH %i - PROGRESS: at %.2f%% words, %.0f words/s, in_qsize %i, out_qsize %i", + cur_epoch + 1, 100.0 * raw_word_count / total_words, trained_word_count / elapsed, + -1 if job_queue is None else utils.qsize(job_queue), utils.qsize(progress_queue) + ) + + def _log_epoch_end(self, cur_epoch, example_count, total_examples, raw_word_count, total_words, + trained_word_count, elapsed, is_corpus_file_mode): + """Callback used to log the end of a training epoch. + + Parameters + ---------- + cur_epoch : int + The current training iteration through the corpus. + example_count : int + Number of examples (could be sentences for example) processed until now. total_examples : int - Count of sentences. + Number of all examples present in the input corpus. + raw_word_count : int + Number of words used in training until now. total_words : int - Count of raw words in sentences. - epochs : int - Number of iterations (epochs) over the corpus. - start_alpha : float, optional - Initial learning rate. If supplied, replaces the starting `alpha` from the constructor, - for this one call to`train()`. - Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself - (not recommended). - end_alpha : float, optional - Final learning rate. Drops linearly from `start_alpha`. - If supplied, this replaces the final `min_alpha` from the constructor, for this one call to `train()`. - Use only if making multiple calls to `train()`, when you want to manage the alpha learning-rate yourself - (not recommended). - word_count : int, optional - Count of words already trained. Set this to 0 for the usual - case of training on all words in sentences. - queue_factor : int, optional - Multiplier for size of queue (number of workers * queue_factor). - report_delay : float, optional - Seconds to wait before reporting progress. - compute_loss: bool, optional - If True, computes and stores loss value which can be retrieved using - :meth:`~gensim.models.word2vec.Word2Vec.get_latest_training_loss`. - callbacks : iterable of :class:`~gensim.models.callbacks.CallbackAny2Vec`, optional - Sequence of callbacks to be executed at specific stages during training. + Number of all words in the input corpus. + trained_word_count : int + Number of effective words used in training until now (after ignoring unknown words and trimming + the sentence length). + elapsed : int + Elapsed time since the beginning of training in seconds. + is_corpus_file_mode : bool + Whether training is file-based (corpus_file argument) or not. - Examples + Warnings -------- - .. sourcecode:: pycon + In case the corpus is changed while the epoch was running. 
- >>> from gensim.models import Word2Vec - >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]] - >>> - >>> model = Word2Vec(min_count=1) - >>> model.build_vocab(sentences) # prepare the model vocabulary - >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.iter) # train word vectors - (1, 30) + """ + logger.info( + "EPOCH - %i : training on %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + cur_epoch + 1, raw_word_count, trained_word_count, elapsed, trained_word_count / elapsed + ) + + # don't warn if training in file-based mode, because it's expected behavior + if is_corpus_file_mode: + return + + # check that the input corpus hasn't changed during iteration + if total_examples and total_examples != example_count: + logger.warning( + "EPOCH - %i : supplied example count (%i) did not equal expected count (%i)", cur_epoch + 1, + example_count, total_examples + ) + if total_words and total_words != raw_word_count: + logger.warning( + "EPOCH - %i : supplied raw word count (%i) did not equal expected count (%i)", cur_epoch + 1, + raw_word_count, total_words + ) + + def _log_train_end(self, raw_word_count, trained_word_count, total_elapsed, job_tally): + """Callback to log the end of training. + + Parameters + ---------- + raw_word_count : int + Number of words used in the whole training. + trained_word_count : int + Number of effective words used in training (after ignoring unknown words and trimming the sentence length). + total_elapsed : int + Total time spent during training in seconds. + job_tally : int + Total number of jobs processed during training. """ - return super(Word2Vec, self).train( - sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, - epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, - queue_factor=queue_factor, report_delay=report_delay, compute_loss=compute_loss, callbacks=callbacks) + logger.info( + "training on a %i raw words (%i effective words) took %.1fs, %.0f effective words/s", + raw_word_count, trained_word_count, total_elapsed, trained_word_count / total_elapsed + ) def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor=2, report_delay=1): """Score the log probability for a sequence of sentences. 
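The training loop in the hunk above fires per-stage callback hooks (`on_train_begin`, `on_epoch_begin`/`on_epoch_end`, `on_batch_begin`/`on_batch_end`, `on_train_end`). A minimal sketch of hooking into them, assuming gensim's :class:`~gensim.models.callbacks.CallbackAny2Vec` base class (the `EpochLogger` name and the toy corpus are illustrative, not part of this patch):

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> from gensim.models.callbacks import CallbackAny2Vec
    >>>
    >>> class EpochLogger(CallbackAny2Vec):
    ...     """Count and report finished epochs."""
    ...     def __init__(self):
    ...         self.epoch = 0
    ...     def on_epoch_end(self, model):
    ...         self.epoch += 1
    ...         print("finished epoch %d" % self.epoch)
    >>>
    >>> sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
    >>> model = Word2Vec(min_count=1)
    >>> model.build_vocab(sentences)
    >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs, callbacks=[EpochLogger()])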
@@ -547,8 +1648,8 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor logger.info( "scoring sentences with %i workers on %i vocabulary and %i features, " "using sg=%s hs=%s sample=%s and negative=%s", - self.workers, len(self.wv.vocab), self.trainables.layer1_size, self.sg, self.hs, - self.vocabulary.sample, self.negative + self.workers, len(self.wv.vocab), self.layer1_size, self.sg, self.hs, + self.sample, self.negative ) if not self.wv.vocab: @@ -563,7 +1664,7 @@ def score(self, sentences, total_sentences=int(1e6), chunksize=100, queue_factor def worker_loop(): """Compute log probability for each sentence, lifting lists of sentences from the jobs queue.""" work = zeros(1, dtype=REAL) # for sg hs, we actually only need one memory loc (running sum) - neu1 = matutils.zeros_aligned(self.trainables.layer1_size, dtype=REAL) + neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL) while True: job = job_queue.get() if job is None: # signal to finish @@ -696,7 +1797,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut if word in self.wv.vocab: overlap_count += 1 self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes + self.wv.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes else: for line_no, line in enumerate(fin): parts = utils.to_unicode(line.rstrip(), encoding=encoding, errors=unicode_errors).split(" ") @@ -706,7 +1807,7 @@ def intersect_word2vec_format(self, fname, lockf=0.0, binary=False, encoding='ut if word in self.wv.vocab: overlap_count += 1 self.wv.vectors[self.wv.vocab[word].index] = weights - self.trainables.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes + self.wv.vectors_lockf[self.wv.vocab[word].index] = lockf # lock-factor: 0.0=no changes logger.info("merged %d vectors into %s matrix from %s", overlap_count, self.wv.vectors.shape, fname) def predict_output_word(self, context_words_list, topn=10): @@ -731,12 +1832,12 @@ def predict_output_word(self, context_words_list, topn=10): "so you need to have run word2vec with negative > 0 for this to work." 
) - if not hasattr(self.wv, 'vectors') or not hasattr(self.trainables, 'syn1neg'): + if not hasattr(self.wv, 'vectors') or not hasattr(self, 'syn1neg'): raise RuntimeError("Parameters required for predicting the output words not found.") word_vocabs = [self.wv.vocab[w] for w in context_words_list if w in self.wv.vocab] if not word_vocabs: - warnings.warn("All the input context words are out-of-vocabulary for the current model.") + logger.warning("All the input context words are out-of-vocabulary for the current model.") return None word2_indices = [word.index for word in word_vocabs] @@ -746,7 +1847,7 @@ def predict_output_word(self, context_words_list, topn=10): l1 /= len(word2_indices) # propagate hidden -> output and take softmax to get probabilities - prob_values = exp(dot(l1, self.trainables.syn1neg.T)) + prob_values = exp(dot(l1, self.syn1neg.T)) prob_values /= sum(prob_values) top_indices = matutils.argsort(prob_values, topn=topn, reverse=True) # returning the most probable output words with their probabilities @@ -771,9 +1872,9 @@ def reset_from(self, other_model): """ self.wv.vocab = other_model.wv.vocab self.wv.index2key = other_model.wv.index2key - self.vocabulary.cum_table = other_model.vocabulary.cum_table + self.cum_table = other_model.cum_table self.corpus_count = other_model.corpus_count - self.trainables.reset_weights(self.hs, self.negative, self.wv) + self.reset_weights() def __str__(self): """Human readable representation of the model's state. @@ -816,7 +1917,7 @@ def get_latest_training_loss(self): return self.running_training_loss @classmethod - def load(cls, *args, **kwargs): + def load(cls, *args, rethrow=False, **kwargs): """Load a previously saved :class:`~gensim.models.word2vec.Word2Vec` model. See Also @@ -837,17 +1938,51 @@ def load(cls, *args, **kwargs): """ try: model = super(Word2Vec, cls).load(*args, **kwargs) - - # for backward compatibility for `max_final_vocab` feature + if not isinstance(model, Word2Vec): + rethrow = True + raise AttributeError("Model of type %s can't be loaded by %s" % (type(model), str(cls))) + # for backward compatibility + if not hasattr(model, 'ns_exponent'): + model.ns_exponent = 0.75 + if model.negative and hasattr(model.wv, 'index2word'): + model.make_cum_table() # rebuild cum_table from vocabulary ## TODO: ??? 
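+            # models saved by older gensim versions may lack corpus statistics; default the missing attributes to None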
+ if not hasattr(model, 'corpus_count'): + model.corpus_count = None + if not hasattr(model, 'corpus_total_words'): + model.corpus_total_words = None + if not hasattr(model.wv, 'vectors_lockf') and hasattr(model.wv, 'vectors'): + model.wv.vectors_lockf = getattr(model, 'vectors_lockf', ones(len(model.wv.vectors), dtype=REAL)) + if not hasattr(model, 'random'): + model.random = np.random.RandomState(model.seed) + if not hasattr(model, 'train_count'): + model.train_count = 0 + model.total_train_time = 0 + if not hasattr(model, 'epochs'): + model.epochs = model.iter + del model.iter if not hasattr(model, 'max_final_vocab'): model.max_final_vocab = None - model.vocabulary.max_final_vocab = None - + if hasattr(model, 'vocabulary'): # re-integrate state that had been moved + for a in ('max_vocab_size', 'min_count', 'sample', 'sorted_vocab', 'null_word', 'raw_vocab'): + setattr(model, a, getattr(model.vocabulary, a)) + del model.vocabulary + if hasattr(model, 'trainables'): # re-integrate state that had been moved + for a in ('hashfxn', 'layer1_size', 'seed', 'syn1neg', 'syn1'): + if hasattr(model.trainables, a): + setattr(model, a, getattr(model.trainables, a)) + if hasattr(model, 'syn1'): + model.syn1 = model.syn1 + del model.syn1 + del model.trainables return model - except AttributeError: - logger.info('Model saved using code from earlier Gensim Version. Re-loading old model in a compatible way.') - from gensim.models.deprecated.word2vec import load_old_word2vec - return load_old_word2vec(*args, **kwargs) + except AttributeError as ae: + if rethrow: + raise ae + logger.error( + "Model load error. Was model saved using code from an older Gensim Version? " + "Try loading older model using gensim-3.8.1, then re-saving, to restore " + "compatibility with current code.") + raise ae class BrownCorpus(object): @@ -934,412 +2069,122 @@ def __iter__(self): """Iterate through the lines in the source.""" try: # Assume it is a file-like object and try treating it as such - # Things that don't have seek will trigger an exception - self.source.seek(0) - for line in itertools.islice(self.source, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - except AttributeError: - # If it didn't work like a file, use it as a string filename - with utils.open(self.source, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i: i + self.max_sentence_length] - i += self.max_sentence_length - - -class PathLineSentences(object): - def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): - """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory - in alphabetical order by filename. - - The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: - .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. - - The format of files (either text, or compressed text files) in the path is one sentence = one line, - with words already preprocessed and separated by whitespace. - - Warnings - -------- - Does **not recurse** into subdirectories. - - Parameters - ---------- - source : str - Path to the directory. - limit : int or None - Read only the first `limit` lines from each file. Read all if limit is None (the default). 
- - """ - self.source = source - self.max_sentence_length = max_sentence_length - self.limit = limit - - if os.path.isfile(self.source): - logger.debug('single file given as source, rather than a directory of files') - logger.debug('consider using models.word2vec.LineSentence for a single file') - self.input_files = [self.source] # force code compatibility with list of files - elif os.path.isdir(self.source): - self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path - logger.info('reading directory %s', self.source) - self.input_files = os.listdir(self.source) - self.input_files = [self.source + filename for filename in self.input_files] # make full paths - self.input_files.sort() # makes sure it happens in filename order - else: # not a file or a directory, then we can't do anything with it - raise ValueError('input is neither a file nor a path') - logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) - - def __iter__(self): - """iterate through the files""" - for file_name in self.input_files: - logger.info('reading file %s', file_name) - with utils.open(file_name, 'rb') as fin: - for line in itertools.islice(fin, self.limit): - line = utils.to_unicode(line).split() - i = 0 - while i < len(line): - yield line[i:i + self.max_sentence_length] - i += self.max_sentence_length - - -def _scan_vocab_worker(stream, progress_queue, max_vocab_size=None, trim_rule=None): - """Do an initial scan of all words appearing in stream. - - Note: This function can not be Word2VecVocab's method because - of multiprocessing synchronization specifics in Python. - """ - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - sentence_no = -1 - total_words = 0 - for sentence_no, sentence in enumerate(stream): - if not checked_string_types: - if isinstance(sentence, string_types): - log_msg = "Each 'sentences' item should be a list of words (usually unicode strings). " \ - "First item here is instead plain %s." 
% type(sentence) - progress_queue.put(log_msg) - - checked_string_types += 1 - - for word in sentence: - vocab[word] += 1 - - if max_vocab_size and len(vocab) > max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - total_words += len(sentence) - - progress_queue.put((total_words, sentence_no + 1)) - progress_queue.put(None) - return vocab - - -@dataclass -class W2VVocab: - """A dataclass shape-compatible with keyedvectors.SimpleVocab, extended with the - `sample_int` property needed by `Word2Vec` models.""" - __slots__ = ('count', 'index', 'sample_int') - count: int - index: int - sample_int: int - - def __init__(self, count=0, index=0, sample_int=2**32): - self.count, self.index, self.sample_int = count, index, sample_int - - def __lt__(self, other): - return self.count < other.count - - -@dataclass -class W2VHSVocab: - """A dataclass shape-compatible with W2VVocab, extended with the `code` and - `point` properties needed by hierarchical-sampling (`hs=1`) `Word2Vec` models.""" - __slots__ = ('count', 'index', 'sample_int', 'code', 'point') - count: int - index: int - sample_int: int - code: List[int] - point: List[int] - - def __init__(self, count=0, index=0, sample_int=2**32, code=None, point=None): - self.count, self.index, self.sample_int, self.code, self.point = \ - count, index, sample_int, code, point - - def __lt__(self, other): - return self.count < other.count - - -class Word2VecVocab(utils.SaveLoad): - def __init__( - self, max_vocab_size=None, min_count=5, sample=1e-3, sorted_vocab=True, null_word=0, - max_final_vocab=None, ns_exponent=0.75): - """Vocabulary used by :class:`~gensim.models.word2vec.Word2Vec`.""" - self.max_vocab_size = max_vocab_size - self.min_count = min_count - self.sample = sample - self.sorted_vocab = sorted_vocab - self.null_word = null_word - self.cum_table = None # for negative sampling - self.raw_vocab = None - self.max_final_vocab = max_final_vocab - self.ns_exponent = ns_exponent - - def _scan_vocab(self, sentences, progress_per, trim_rule): - sentence_no = -1 - total_words = 0 - min_reduce = 1 - vocab = defaultdict(int) - checked_string_types = 0 - for sentence_no, sentence in enumerate(sentences): - if not checked_string_types: - if isinstance(sentence, string_types): - logger.warning( - "Each 'sentences' item should be a list of words (usually unicode strings). 
" - "First item here is instead plain %s.", - type(sentence) - ) - checked_string_types += 1 - if sentence_no % progress_per == 0: - logger.info( - "PROGRESS: at sentence #%i, processed %i words, keeping %i word types", - sentence_no, total_words, len(vocab) - ) - for word in sentence: - vocab[word] += 1 - total_words += len(sentence) - - if self.max_vocab_size and len(vocab) > self.max_vocab_size: - utils.prune_vocab(vocab, min_reduce, trim_rule=trim_rule) - min_reduce += 1 - - corpus_count = sentence_no + 1 - self.raw_vocab = vocab - return total_words, corpus_count - - def scan_vocab(self, sentences=None, corpus_file=None, progress_per=10000, workers=None, trim_rule=None): - logger.info("collecting all words and their counts") - if corpus_file: - sentences = LineSentence(corpus_file) - - total_words, corpus_count = self._scan_vocab(sentences, progress_per, trim_rule) - - logger.info( - "collected %i word types from a corpus of %i raw words and %i sentences", - len(self.raw_vocab), total_words, corpus_count - ) - - return total_words, corpus_count - - def sort_vocab(self, wv): - """Sort the vocabulary so the most frequent words have the lowest indexes.""" - if len(wv.vectors): - raise RuntimeError("cannot sort vocabulary after model weights already initialized.") - wv.index2key.sort(key=lambda word: wv.vocab[word].count, reverse=True) - for i, word in enumerate(wv.index2key): - wv.vocab[word].index = i - - def prepare_vocab( - self, hs, negative, wv, update=False, keep_raw_vocab=False, trim_rule=None, - min_count=None, sample=None, dry_run=False): - """Apply vocabulary settings for `min_count` (discarding less-frequent words) - and `sample` (controlling the downsampling of more-frequent words). - - Calling with `dry_run=True` will only simulate the provided settings and - report the size of the retained vocabulary, effective corpus length, and - estimated memory requirements. Results are both printed via logging and - returned as a dict. - - Delete the raw vocabulary after the scaling is done to free up RAM, - unless `keep_raw_vocab` is set. - - """ - min_count = min_count or self.min_count - sample = sample or self.sample - drop_total = drop_unique = 0 + # Things that don't have seek will trigger an exception + self.source.seek(0) + for line in itertools.islice(self.source, self.limit): + line = utils.to_unicode(line).split() + i = 0 + while i < len(line): + yield line[i: i + self.max_sentence_length] + i += self.max_sentence_length + except AttributeError: + # If it didn't work like a file, use it as a string filename + with utils.open(self.source, 'rb') as fin: + for line in itertools.islice(fin, self.limit): + line = utils.to_unicode(line).split() + i = 0 + while i < len(line): + yield line[i: i + self.max_sentence_length] + i += self.max_sentence_length - # set effective_min_count to min_count in case max_final_vocab isn't set - self.effective_min_count = min_count - # if max_final_vocab is specified instead of min_count - # pick a min_count which satisfies max_final_vocab as well as possible - if self.max_final_vocab is not None: - sorted_vocab = sorted(self.raw_vocab.keys(), key=lambda word: self.raw_vocab[word], reverse=True) - calc_min_count = 1 +class PathLineSentences(object): + def __init__(self, source, max_sentence_length=MAX_WORDS_IN_BATCH, limit=None): + """Like :class:`~gensim.models.word2vec.LineSentence`, but process all files in a directory + in alphabetical order by filename. 
- if self.max_final_vocab < len(sorted_vocab): - calc_min_count = self.raw_vocab[sorted_vocab[self.max_final_vocab]] + 1 + The directory must only contain files that can be read by :class:`gensim.models.word2vec.LineSentence`: + .bz2, .gz, and text files. Any file not ending with .bz2 or .gz is assumed to be a text file. - self.effective_min_count = max(calc_min_count, min_count) - logger.info( - "max_final_vocab=%d and min_count=%d resulted in calc_min_count=%d, effective_min_count=%d", - self.max_final_vocab, min_count, calc_min_count, self.effective_min_count - ) + The format of files (either text, or compressed text files) in the path is one sentence = one line, + with words already preprocessed and separated by whitespace. - if not update: - logger.info("Loading a fresh vocabulary") - retain_total, retain_words = 0, [] - # Discard words less-frequent than min_count - if not dry_run: - wv.index2key = [] - # make stored settings match these applied settings - self.min_count = min_count - self.sample = sample - wv.vocab = {} + Warnings + -------- + Does **not recurse** into subdirectories. - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): - retain_words.append(word) - retain_total += v - if not dry_run: - wv.vocab[word] = W2VVocab(count=v, index=len(wv.index2key)) - wv.index2key.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(retain_words) + drop_unique - retain_unique_pct = len(retain_words) * 100 / max(original_unique_total, 1) - logger.info( - "effective_min_count=%d retains %i unique words (%i%% of original %i, drops %i)", - self.effective_min_count, len(retain_words), retain_unique_pct, original_unique_total, drop_unique - ) - original_total = retain_total + drop_total - retain_pct = retain_total * 100 / max(original_total, 1) - logger.info( - "effective_min_count=%d leaves %i word corpus (%i%% of original %i, drops %i)", - self.effective_min_count, retain_total, retain_pct, original_total, drop_total - ) - else: - logger.info("Updating model with new vocabulary") - new_total = pre_exist_total = 0 - new_words = pre_exist_words = [] - for word, v in iteritems(self.raw_vocab): - if keep_vocab_item(word, v, self.effective_min_count, trim_rule=trim_rule): - if word in wv.vocab: - pre_exist_words.append(word) - pre_exist_total += v - if not dry_run: - wv.vocab[word].count += v - else: - new_words.append(word) - new_total += v - if not dry_run: - wv.vocab[word] = W2VVocab(count=v, index=len(wv.index2key)) - wv.index2key.append(word) - else: - drop_unique += 1 - drop_total += v - original_unique_total = len(pre_exist_words) + len(new_words) + drop_unique - pre_exist_unique_pct = len(pre_exist_words) * 100 / max(original_unique_total, 1) - new_unique_pct = len(new_words) * 100 / max(original_unique_total, 1) - logger.info( - "New added %i unique words (%i%% of original %i) " - "and increased the count of %i pre-existing words (%i%% of original %i)", - len(new_words), new_unique_pct, original_unique_total, len(pre_exist_words), - pre_exist_unique_pct, original_unique_total - ) - retain_words = new_words + pre_exist_words - retain_total = new_total + pre_exist_total + Parameters + ---------- + source : str + Path to the directory. + limit : int or None + Read only the first `limit` lines from each file. Read all if limit is None (the default). 
- # Precalculate each vocabulary item's threshold for sampling - if not sample: - # no words downsampled - threshold_count = retain_total - elif sample < 1.0: - # traditional meaning: set parameter as proportion of total - threshold_count = sample * retain_total - else: - # new shorthand: sample >= 1 means downsample all words with higher count than sample - threshold_count = int(sample * (3 + sqrt(5)) / 2) + """ + self.source = source + self.max_sentence_length = max_sentence_length + self.limit = limit - downsample_total, downsample_unique = 0, 0 - for w in retain_words: - v = self.raw_vocab[w] - word_probability = (sqrt(v / threshold_count) + 1) * (threshold_count / v) - if word_probability < 1.0: - downsample_unique += 1 - downsample_total += word_probability * v - else: - word_probability = 1.0 - downsample_total += v - if not dry_run: - wv.vocab[w].sample_int = int(round(word_probability * 2**32)) + if os.path.isfile(self.source): + logger.debug('single file given as source, rather than a directory of files') + logger.debug('consider using models.word2vec.LineSentence for a single file') + self.input_files = [self.source] # force code compatibility with list of files + elif os.path.isdir(self.source): + self.source = os.path.join(self.source, '') # ensures os-specific slash at end of path + logger.info('reading directory %s', self.source) + self.input_files = os.listdir(self.source) + self.input_files = [self.source + filename for filename in self.input_files] # make full paths + self.input_files.sort() # makes sure it happens in filename order + else: # not a file or a directory, then we can't do anything with it + raise ValueError('input is neither a file nor a path') + logger.info('files read into PathLineSentences:%s', '\n'.join(self.input_files)) - if not dry_run and not keep_raw_vocab: - logger.info("deleting the raw counts dictionary of %i items", len(self.raw_vocab)) - self.raw_vocab = defaultdict(int) + def __iter__(self): + """iterate through the files""" + for file_name in self.input_files: + logger.info('reading file %s', file_name) + with utils.open(file_name, 'rb') as fin: + for line in itertools.islice(fin, self.limit): + line = utils.to_unicode(line).split() + i = 0 + while i < len(line): + yield line[i:i + self.max_sentence_length] + i += self.max_sentence_length - logger.info("sample=%g downsamples %i most-common words", sample, downsample_unique) - logger.info( - "downsampling leaves estimated %i word corpus (%.1f%% of prior %i)", - downsample_total, downsample_total * 100.0 / max(retain_total, 1), retain_total - ) - # return from each step: words-affected, resulting-corpus-size, extra memory estimates - report_values = { - 'drop_unique': drop_unique, 'retain_total': retain_total, 'downsample_unique': downsample_unique, - 'downsample_total': int(downsample_total), 'num_retained_words': len(retain_words) - } +@dataclass +class W2VVocab: + """A dataclass shape-compatible with keyedvectors.SimpleVocab, extended with the + `sample_int` property needed by `Word2Vec` models.""" + __slots__ = ('count', 'index', 'sample_int') + count: int + index: int + sample_int: int - if self.null_word: - # create null pseudo-word for padding when using concatenative L1 (run-of-words) - # this word is only ever input – never predicted – so count, huffman-point, etc doesn't matter - self.add_null_word(wv) + def __init__(self, count=0, index=0, sample_int=2**32): + self.count, self.index, self.sample_int = count, index, sample_int - if self.sorted_vocab and not update: - 
self.sort_vocab(wv) - if hs: - # add info about each word's Huffman encoding - self.create_binary_tree(wv) - if negative: - # build the table for drawing random words (for negative sampling) - self.make_cum_table(wv) + def __lt__(self, other): + return self.count < other.count - return report_values - def add_null_word(self, wv): - word, v = '\0', W2VVocab(count=1, sample_int=0) - v.index = len(wv.vocab) - wv.index2key.append(word) - wv.vocab[word] = v +@dataclass +class W2VHSVocab: + """A dataclass shape-compatible with W2VVocab, extended with the `code` and + `point` properties needed by hierarchical-softmax (`hs=1`) `Word2Vec` models.""" + __slots__ = ('count', 'index', 'sample_int', 'code', 'point') + count: int + index: int + sample_int: int + code: List[int] + point: List[int] - def create_binary_tree(self, wv): - """Create a `binary Huffman tree `_ using stored vocabulary - word counts. Frequent words will have shorter binary codes. - Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`. + def __init__(self, count=0, index=0, sample_int=2**32, code=None, point=None): + self.count, self.index, self.sample_int, self.code, self.point = \ + count, index, sample_int, code, point - """ - _assign_binary_codes(wv.vocab) - def make_cum_table(self, wv, domain=2**31 - 1): - """Create a cumulative-distribution table using stored vocabulary word counts for - drawing random words in the negative-sampling training routines. - To draw a word index, choose a random integer up to the maximum value in the table (cum_table[-1]), - then finding that integer's sorted insertion point (as if by `bisect_left` or `ndarray.searchsorted()`). - That insertion point is the drawn index, coming up in proportion equal to the increment at that slot. +class Word2VecVocab(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass - Called internally from :meth:`~gensim.models.word2vec.Word2VecVocab.build_vocab`.
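To make the cumulative-table draw described in the removed docstring above concrete, here is a hedged sketch with a hypothetical three-word table; `numpy.searchsorted` plays the role of `bisect_left`, mirroring how the negative-sampling routines consume `cum_table`:

.. sourcecode:: pycon

    >>> import numpy as np
    >>>
    >>> cum_table = np.array([600, 900, 1000], dtype=np.uint32)  # hypothetical counts**ns_exponent, cumulated and scaled to `domain`
    >>> draw = np.random.randint(cum_table[-1])             # random integer up to the table maximum
    >>> negative_index = int(cum_table.searchsorted(draw))  # insertion point = drawn word index, in proportion to its increment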
- """ - vocab_size = len(wv.index2key) - self.cum_table = zeros(vocab_size, dtype=uint32) - # compute sum of all power (Z in paper) - train_words_pow = 0.0 - for word_index in range(vocab_size): - train_words_pow += wv.vocab[wv.index2key[word_index]].count**self.ns_exponent - cumulative = 0.0 - for word_index in range(vocab_size): - cumulative += wv.vocab[wv.index2key[word_index]].count**self.ns_exponent - self.cum_table[word_index] = round(cumulative / train_words_pow * domain) - if len(self.cum_table) > 0: - assert self.cum_table[-1] == domain +class Word2VecTrainables(utils.SaveLoad): + """Obsolete class retained for now as load-compatibility state capture""" + pass class Heapitem(namedtuple('Heapitem', 'count, index, left, right')): @@ -1409,62 +2254,6 @@ def _assign_binary_codes(vocab): logger.info("built huffman tree with maximum node depth %i", max_depth) -class Word2VecTrainables(utils.SaveLoad): - def __init__(self, vector_size=100, seed=1, hashfxn=hash): - """Represents the inner shallow neural network used to train :class:`~gensim.models.word2vec.Word2Vec`.""" - self.hashfxn = hashfxn - self.layer1_size = vector_size - self.seed = seed - - def prepare_weights(self, hs, negative, wv, update=False, vocabulary=None): - """Build tables and model weights based on final vocabulary settings.""" - # set initial input/projection and hidden weights - if not update: - self.reset_weights(hs, negative, wv) - else: - self.update_weights(hs, negative, wv) - - @deprecated("Use gensim.models.keyedvectors.pseudorandom_weak_vector() directly") - def seeded_vector(self, seed_string, vector_size): - return pseudorandom_weak_vector(vector_size, seed_string=seed_string, hashfxn=self.hashfxn) - - def reset_weights(self, hs, negative, wv): - """Reset all projection weights to an initial (untrained) state, but keep the existing vocabulary.""" - logger.info("resetting layer weights") - wv.resize_vectors() - wv.randomly_initialize_vectors(seed=self.seed) - if hs: - self.syn1 = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - if negative: - self.syn1neg = zeros((len(wv.vocab), self.layer1_size), dtype=REAL) - - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - def update_weights(self, hs, negative, wv): - """Copy all the existing weights, and reset the weights for the newly added vocabulary.""" - logger.info("updating layer weights") - new_range = wv.resize_vectors() - gained_vocab = len(new_range) - wv.randomly_initialize_vectors(indexes=new_range) - - # Raise an error if an online update is run before initial training on a corpus - if not len(wv.vectors): - raise RuntimeError( - "You cannot do an online vocabulary-update of a model which has no prior vocabulary. " - "First build the vocabulary of your model with a corpus before doing an online update." 
- ) - - if hs: - self.syn1 = vstack([self.syn1, zeros((gained_vocab, self.layer1_size), dtype=REAL)]) - if negative: - pad = zeros((gained_vocab, self.layer1_size), dtype=REAL) - self.syn1neg = vstack([self.syn1neg, pad]) - wv.vectors_norm = None - - # do not suppress learning for already learned words - self.vectors_lockf = ones(len(wv.vocab), dtype=REAL) # zeros suppress learning - - # Example: ./word2vec.py -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 \ # -negative 5 -hs 0 -binary 0 -cbow 1 -iter 3 if __name__ == "__main__": diff --git a/gensim/models/word2vec_inner.pyx b/gensim/models/word2vec_inner.pyx index 0576773bd5..076ff54b1c 100755 --- a/gensim/models/word2vec_inner.pyx +++ b/gensim/models/word2vec_inner.pyx @@ -467,7 +467,7 @@ cdef unsigned long long w2v_fast_sentence_cbow_neg( cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1=None): c[0].hs = model.hs c[0].negative = model.negative - c[0].sample = (model.vocabulary.sample != 0) + c[0].sample = (model.sample != 0) c[0].cbow_mean = model.cbow_mean c[0].window = model.window c[0].workers = model.workers @@ -476,17 +476,17 @@ cdef init_w2v_config(Word2VecConfig *c, model, alpha, compute_loss, _work, _neu1 c[0].running_training_loss = model.running_training_loss c[0].syn0 = (np.PyArray_DATA(model.wv.vectors)) - c[0].word_locks = (np.PyArray_DATA(model.trainables.vectors_lockf)) + c[0].word_locks = (np.PyArray_DATA(model.wv.vectors_lockf)) c[0].alpha = alpha c[0].size = model.wv.vector_size if c[0].hs: - c[0].syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c[0].syn1 = (np.PyArray_DATA(model.syn1)) if c[0].negative: - c[0].syn1neg = (np.PyArray_DATA(model.trainables.syn1neg)) - c[0].cum_table = (np.PyArray_DATA(model.vocabulary.cum_table)) - c[0].cum_table_len = len(model.vocabulary.cum_table) + c[0].syn1neg = (np.PyArray_DATA(model.syn1neg)) + c[0].cum_table = (np.PyArray_DATA(model.cum_table)) + c[0].cum_table_len = len(model.cum_table) if c[0].negative or c[0].sample: c[0].next_random = (2**24) * model.random.randint(0, 2**24) + model.random.randint(0, 2**24) @@ -709,7 +709,7 @@ def score_sentence_sg(model, sentence, _work): cdef long result = 0 cdef int sentence_len - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) @@ -804,7 +804,7 @@ def score_sentence_cbow(model, sentence, _work, _neu1): cdef int i, j, k cdef long result = 0 - c.syn1 = (np.PyArray_DATA(model.trainables.syn1)) + c.syn1 = (np.PyArray_DATA(model.syn1)) # convert Python structures to primitive types, so we can release the GIL c.work = np.PyArray_DATA(_work) diff --git a/gensim/models/wrappers/__init__.py b/gensim/models/wrappers/__init__.py index 9cd14ea8e7..330abce500 100644 --- a/gensim/models/wrappers/__init__.py +++ b/gensim/models/wrappers/__init__.py @@ -5,6 +5,5 @@ from .ldamallet import LdaMallet # noqa:F401 from .dtmmodel import DtmModel # noqa:F401 from .ldavowpalwabbit import LdaVowpalWabbit # noqa:F401 -from .fasttext import FastText # noqa:F401 from .wordrank import Wordrank # noqa:F401 from .varembed import VarEmbed # noqa:F401 diff --git a/gensim/models/wrappers/fasttext.py b/gensim/models/wrappers/fasttext.py deleted file mode 100644 index bca36c7cb9..0000000000 --- a/gensim/models/wrappers/fasttext.py +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Author: Jayant Jain -# Copyright (C) 2017 Radim Rehurek -# 
Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - - -""" -Warnings --------- -.. deprecated:: 3.2.0 - Use :mod:`gensim.models.fasttext` instead. - - - -Python wrapper around word representation learning from FastText, a library for efficient learning -of word representations and sentence classification [1]. - -This module allows training a word embedding from a training corpus with the additional ability -to obtain word vectors for out-of-vocabulary words, using the fastText C implementation. - -The wrapped model can NOT be updated with new documents for online training -- use gensim's -`Word2Vec` for that. - -Example: - -.. sourcecode:: pycon - - >>> from gensim.models.wrappers import FastText - >>> model = FastText.train('/Users/kofola/fastText/fasttext', corpus_file='text8') - >>> print(model['forests']) # prints vector for given out-of-vocabulary word - -.. [1] https://github.com/facebookresearch/fastText#enriching-word-vectors-with-subword-information - - - -""" -from gensim.models.deprecated.fasttext_wrapper import FastText, FastTextKeyedVectors # noqa:F401 -from gensim.models.deprecated.fasttext_wrapper import ft_hash, compute_ngrams # noqa:F401 diff --git a/gensim/sklearn_api/d2vmodel.py b/gensim/sklearn_api/d2vmodel.py index fa154a2497..c49d1b2baf 100644 --- a/gensim/sklearn_api/d2vmodel.py +++ b/gensim/sklearn_api/d2vmodel.py @@ -36,9 +36,10 @@ class D2VTransformer(TransformerMixin, BaseEstimator): """ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1, docvecs=None, - docvecs_mapfile=None, comment=None, trim_rule=None, size=100, alpha=0.025, window=5, min_count=5, - max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, hs=0, negative=5, cbow_mean=1, - hashfxn=hash, iter=5, sorted_vocab=1, batch_words=10000): + docvecs_mapfile=None, comment=None, trim_rule=None, vector_size=100, alpha=0.025, window=5, + min_count=5, max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, + hs=0, negative=5, cbow_mean=1, + hashfxn=hash, epochs=5, sorted_vocab=1, batch_words=10000): """ Parameters @@ -72,7 +73,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 be trimmed away (:attr:`gensim.utils.RULE_DISCARD`), or handled using the default (:attr:`gensim.utils.RULE_DEFAULT`). If None, then :func:`gensim.utils.keep_vocab_item` will be used. - size : int, optional + vector_size : int, optional Dimensionality of the feature vectors. alpha : float, optional The initial learning rate. @@ -108,7 +109,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 Same as `dm_mean`, **unused**. hashfxn : function (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. - iter : int, optional + epochs : int, optional Number of epochs to iterate through the corpus. sorted_vocab : bool, optional Whether the vocabulary should be sorted internally. 
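As with the `FTTransformer` and `W2VTransformer` examples elsewhere in this patch, a hedged sketch of the renamed `vector_size`/`epochs` keywords in use; `common_texts` is the toy corpus from `gensim.test.utils`, and the resulting vectors are not asserted here:

.. sourcecode:: pycon

    >>> from gensim.test.utils import common_texts
    >>> from gensim.sklearn_api import D2VTransformer
    >>>
    >>> model = D2VTransformer(vector_size=10, min_count=1, epochs=5, seed=1)
    >>> docvecs = model.fit(common_texts).transform(common_texts)  # 2D array of shape (len(common_texts), vector_size)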
@@ -128,7 +129,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.trim_rule = trim_rule # attributes associated with gensim.models.Word2Vec - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -141,7 +142,7 @@ def __init__(self, dm_mean=None, dm=1, dbow_words=0, dm_concat=0, dm_tag_count=1 self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.sorted_vocab = sorted_vocab self.batch_words = batch_words @@ -167,11 +168,11 @@ def fit(self, X, y=None): documents=d2v_sentences, dm_mean=self.dm_mean, dm=self.dm, dbow_words=self.dbow_words, dm_concat=self.dm_concat, dm_tag_count=self.dm_tag_count, docvecs=self.docvecs, docvecs_mapfile=self.docvecs_mapfile, comment=self.comment, - trim_rule=self.trim_rule, vector_size=self.size, alpha=self.alpha, window=self.window, + trim_rule=self.trim_rule, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, hashfxn=self.hashfxn, - epochs=self.iter, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words + epochs=self.epochs, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self diff --git a/gensim/sklearn_api/ftmodel.py b/gensim/sklearn_api/ftmodel.py index a1edd6c338..7acd22cfc2 100644 --- a/gensim/sklearn_api/ftmodel.py +++ b/gensim/sklearn_api/ftmodel.py @@ -18,7 +18,7 @@ >>> from gensim.sklearn_api import FTTransformer >>> >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = FTTransformer(size=10, min_count=1, seed=1) + >>> model = FTTransformer(vector_size=10, min_count=1, seed=1) >>> >>> # What is the vector representations of the word 'graph' and 'system'? >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) @@ -56,10 +56,10 @@ class FTTransformer(TransformerMixin, BaseEstimator): Information `_. """ - def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, + def __init__(self, sg=0, hs=0, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, word_ngrams=1, sample=1e-3, seed=1, workers=3, min_alpha=0.0001, negative=5, ns_exponent=0.75, - cbow_mean=1, hashfxn=hash, iter=5, null_word=0, min_n=3, + cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, min_n=3, max_n=6, sorted_vocab=1, bucket=2000000, trim_rule=None, batch_words=10000): """ @@ -71,7 +71,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, hs : {1,0}, optional If 1, hierarchical softmax will be used for model training. If set to 0, and `negative` is non-zero, negative sampling will be used. - size : int, optional + vector_size : int, optional Dimensionality of the word vectors. alpha : float, optional The initial learning rate. @@ -113,7 +113,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : function, optional Hash function to use to randomly initialize weights, for increased training reproducibility. - iter : int, optional + epochs : int, optional Number of iterations (epochs) over the corpus. min_n : int, optional Minimum length of char n-grams to be used for training word representations. 
@@ -148,7 +148,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, self.gensim_model = None self.sg = sg self.hs = hs - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -162,7 +162,7 @@ def __init__(self, sg=0, hs=0, size=100, alpha=0.025, window=5, min_count=5, self.ns_exponent = ns_exponent self.cbow_mean = cbow_mean self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.null_word = null_word self.min_n = min_n self.max_n = max_n @@ -189,13 +189,13 @@ def fit(self, X, y=None): """ self.gensim_model = models.FastText( - sentences=X, sg=self.sg, hs=self.hs, size=self.size, + sentences=X, sg=self.sg, hs=self.hs, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, word_ngrams=self.word_ngrams, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, negative=self.negative, ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, min_n=self.min_n, max_n=self.max_n, sorted_vocab=self.sorted_vocab, bucket=self.bucket, trim_rule=self.trim_rule, batch_words=self.batch_words @@ -212,7 +212,7 @@ def transform(self, words): Returns ------- - np.ndarray of shape [`len(words)`, `size`] + np.ndarray of shape [`len(words)`, `vector_size`] A 2D array where each row is the vector of one word. """ @@ -225,4 +225,4 @@ def transform(self, words): if isinstance(words, six.string_types): words = [words] vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.size)) + return np.reshape(np.array(vectors), (len(words), self.vector_size)) diff --git a/gensim/sklearn_api/w2vmodel.py b/gensim/sklearn_api/w2vmodel.py index 07091c2dde..ae64b56e3e 100644 --- a/gensim/sklearn_api/w2vmodel.py +++ b/gensim/sklearn_api/w2vmodel.py @@ -18,7 +18,7 @@ >>> from gensim.sklearn_api import W2VTransformer >>> >>> # Create a model to represent each word by a 10 dimensional vector. - >>> model = W2VTransformer(size=10, min_count=1, seed=1) + >>> model = W2VTransformer(vector_size=10, min_count=1, seed=1) >>> >>> # What is the vector representation of the word 'graph'? >>> wordvecs = model.fit(common_texts).transform(['graph', 'system']) @@ -40,14 +40,14 @@ class W2VTransformer(TransformerMixin, BaseEstimator): Estimation of Word Representations in Vector Space" `_. """ - def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, - workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0, + def __init__(self, vector_size=100, alpha=0.025, window=5, min_count=5, max_vocab_size=None, sample=1e-3, seed=1, + workers=3, min_alpha=0.0001, sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, epochs=5, null_word=0, trim_rule=None, sorted_vocab=1, batch_words=10000): """ Parameters ---------- - size : int + vector_size : int Dimensionality of the feature vectors. alpha : float The initial learning rate. @@ -85,7 +85,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= If 0, use the sum of the context word vectors. If 1, use the mean, only applies when cbow is used. hashfxn : callable (object -> int), optional A hashing function. Used to create an initial random reproducible vector by hashing the random seed. 
- iter : int + epochs : int Number of iterations (epochs) over the corpus. null_word : int {1, 0} If 1, a null pseudo-word will be created for padding when using concatenative L1 (run-of-words) @@ -106,7 +106,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= """ self.gensim_model = None - self.size = size + self.vector_size = vector_size self.alpha = alpha self.window = window self.min_count = min_count @@ -120,7 +120,7 @@ def __init__(self, size=100, alpha=0.025, window=5, min_count=5, max_vocab_size= self.negative = negative self.cbow_mean = int(cbow_mean) self.hashfxn = hashfxn - self.iter = iter + self.epochs = epochs self.null_word = null_word self.trim_rule = trim_rule self.sorted_vocab = sorted_vocab @@ -144,11 +144,11 @@ def fit(self, X, y=None): """ self.gensim_model = models.Word2Vec( - sentences=X, size=self.size, alpha=self.alpha, + sentences=X, vector_size=self.vector_size, alpha=self.alpha, window=self.window, min_count=self.min_count, max_vocab_size=self.max_vocab_size, sample=self.sample, seed=self.seed, workers=self.workers, min_alpha=self.min_alpha, sg=self.sg, hs=self.hs, negative=self.negative, cbow_mean=self.cbow_mean, - hashfxn=self.hashfxn, iter=self.iter, null_word=self.null_word, trim_rule=self.trim_rule, + hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word, trim_rule=self.trim_rule, sorted_vocab=self.sorted_vocab, batch_words=self.batch_words ) return self @@ -163,7 +163,7 @@ def transform(self, words): Returns ------- - np.ndarray of shape [`len(words)`, `size`] + np.ndarray of shape [`len(words)`, `vector_size`] A 2D array where each row is the vector of one word. """ @@ -176,7 +176,7 @@ def transform(self, words): if isinstance(words, six.string_types): words = [words] vectors = [self.gensim_model.wv[word] for word in words] - return np.reshape(np.array(vectors), (len(words), self.size)) + return np.reshape(np.array(vectors), (len(words), self.vector_size)) def partial_fit(self, X): raise NotImplementedError( diff --git a/gensim/test/test_doc2vec.py b/gensim/test/test_doc2vec.py index 0dbd70e5a4..6eb09671de 100644 --- a/gensim/test/test_doc2vec.py +++ b/gensim/test/test_doc2vec.py @@ -75,7 +75,7 @@ def test_persistence(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_persistence_fromfile(self): """Test storing/loading the entire model.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) tmpf = get_tmpfile('gensim_doc2vec.tst') @@ -102,7 +102,7 @@ def testPersistenceWord2VecFormat(self): binary_model_dv = keyedvectors.KeyedVectors.load_word2vec_format(test_word, binary=True) self.assertEqual(len(model.wv.vocab), len(binary_model_dv.vocab)) - def testLoadOldModel(self): + def obsolete_testLoadOldModel(self): """Test loading an old doc2vec model from indeterminate version""" model_file = 'doc2vec_old' # which version?!? 
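The attribute relocations applied to word2vec_inner.pyx earlier in this patch, and asserted throughout the updated tests that follow, put the former `vocabulary`/`trainables` state directly on the model and its KeyedVectors. A hedged end-to-end sketch on the toy corpus, with hypothetical hyperparameters:

.. sourcecode:: pycon

    >>> from gensim.models import Word2Vec
    >>> from gensim.test.utils import common_texts
    >>>
    >>> model = Word2Vec(common_texts, vector_size=12, min_count=1, hs=1, negative=5, epochs=2, seed=42, workers=1)
    >>> model.min_count, model.sample           # formerly model.vocabulary.min_count / .sample
    >>> model.cum_table.shape                   # formerly model.vocabulary.cum_table
    >>> model.syn1.shape, model.syn1neg.shape   # formerly model.trainables.syn1 / .syn1neg
    >>> model.wv.vectors_lockf.shape            # formerly model.trainables.vectors_lockf
    >>> model.train(corpus_iterable=common_texts, total_examples=model.corpus_count, epochs=model.epochs)  # 'sentences' kwarg is now 'corpus_iterable'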
@@ -111,17 +111,17 @@ def testLoadOldModel(self): self.assertTrue(len(model.wv.vocab) == 3955) self.assertTrue(len(model.wv.index2word) == 3955) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (3955, )) - self.assertTrue(model.vocabulary.cum_table.shape == (3955, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) + self.assertTrue(model.cum_table.shape == (3955, )) self.assertTrue(model.docvecs.vectors.shape == (300, 100)) - self.assertTrue(model.trainables.vectors_docs_lockf.shape == (300, )) + self.assertTrue(model.docvecs.vectors_lockf.shape == (300, )) self.assertTrue(len(model.docvecs) == 300) self.model_sanity(model) - def testLoadOldModelSeparates(self): + def obsolete_testLoadOldModelSeparates(self): """Test loading an old doc2vec model from indeterminate version""" # Model stored in multiple files @@ -131,16 +131,16 @@ def testLoadOldModelSeparates(self): self.assertTrue(len(model.wv.vocab) == 3955) self.assertTrue(len(model.wv.index2word) == 3955) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (3955, )) - self.assertTrue(model.vocabulary.cum_table.shape == (3955, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (3955, )) + self.assertTrue(model.cum_table.shape == (3955, )) self.assertTrue(model.docvecs.vectors.shape == (300, 100)) - self.assertTrue(model.trainables.vectors_docs_lockf.shape == (300, )) + self.assertTrue(model.docvecs.vectors_lockf.shape == (300, )) self.assertTrue(len(model.docvecs) == 300) self.model_sanity(model) - def test_load_old_models_pre_1_0(self): + def obsolete_test_load_old_models_pre_1_0(self): """Test loading pre-1.0 models""" model_file = 'd2v-lee-v0.13.0' model = doc2vec.Doc2Vec.load(datapath(model_file)) @@ -153,7 +153,7 @@ def test_load_old_models_pre_1_0(self): for old_version in old_versions: self._check_old_version(old_version) - def test_load_old_models_1_x(self): + def obsolete_test_load_old_models_1_x(self): """Test loading 1.x models""" old_versions = [ '1.0.0', '1.0.1', @@ -161,7 +161,7 @@ def test_load_old_models_1_x(self): for old_version in old_versions: self._check_old_version(old_version) - def test_load_old_models_2_x(self): + def obsolete_test_load_old_models_2_x(self): """Test loading 2.x models""" old_versions = [ '2.0.0', '2.1.0', '2.2.0', '2.3.0', @@ -169,10 +169,18 @@ def test_load_old_models_2_x(self): for old_version in old_versions: self._check_old_version(old_version) - def test_load_old_models_3_x(self): + def obsolete_test_load_old_models_pre_3_3(self): """Test loading 3.x models""" old_versions = [ - '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' + '3.2.0', '3.1.0', '3.0.0' + ] + for old_version in old_versions: + self._check_old_version(old_version) + + def obsolete_test_load_old_models_post_3_2(self): + """Test loading 3.x models""" + old_versions = [ + '3.4.0', '3.3.0', ] for old_version in old_versions: self._check_old_version(old_version) @@ -201,12 +209,12 @@ def _check_old_version(self, old_version): def testDoc2vecTrainParameters(self): model = doc2vec.Doc2Vec(vector_size=50) - model.build_vocab(documents=list_corpus) + 
model.build_vocab(corpus_iterable=list_corpus) self.assertRaises(TypeError, model.train, corpus_file=11111) - self.assertRaises(TypeError, model.train, documents=11111) - self.assertRaises(TypeError, model.train, documents=sentences, corpus_file='test') - self.assertRaises(TypeError, model.train, documents=None, corpus_file=None) + self.assertRaises(TypeError, model.train, corpus_iterable=11111) + self.assertRaises(TypeError, model.train, corpus_iterable=sentences, corpus_file='test') + self.assertRaises(TypeError, model.train, corpus_iterable=None, corpus_file=None) self.assertRaises(TypeError, model.train, corpus_file=sentences) @unittest.skipIf(os.name == 'nt', "See another test for Windows below") @@ -418,10 +426,10 @@ def model_sanity(self, model, keep_training=True): # keep training after save if keep_training: - tmpf = get_tmpfile('gensim_doc2vec.tst') + tmpf = get_tmpfile('gensim_doc2vec_resave.tst') model.save(tmpf) loaded = doc2vec.Doc2Vec.load(tmpf) - loaded.train(documents=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) + loaded.train(corpus_iterable=sentences, total_examples=loaded.corpus_count, epochs=loaded.epochs) def test_training(self): """Test doc2vec training.""" @@ -440,7 +448,7 @@ def test_training(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_training_fromfile(self): """Test doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20, workers=1) @@ -461,7 +469,7 @@ def test_dbow_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dbow_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec(corpus_file=corpus_file, dm=0, hs=1, negative=0, min_count=2, epochs=20) self.model_sanity(model) @@ -477,7 +485,7 @@ def test_dmm_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmm_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, @@ -496,7 +504,7 @@ def test_dms_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dms_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=0, vector_size=24, window=4, hs=1, @@ -515,7 +523,7 @@ def test_dmc_hs(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmc_hs_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with 
temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_concat=1, vector_size=24, window=4, @@ -531,7 +539,7 @@ def test_dbow_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dbow_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec(list_corpus, dm=0, hs=0, negative=10, min_count=2, epochs=20) self.model_sanity(model) @@ -547,7 +555,7 @@ def test_dmm_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmm_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=1, vector_size=24, window=4, hs=0, @@ -566,7 +574,7 @@ def test_dms_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dms_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_mean=0, vector_size=24, window=4, hs=0, @@ -585,7 +593,7 @@ def test_dmc_neg(self): @unittest.skipIf(os.name == 'nt' and six.PY2, "corpus_file training is not supported on Windows + Py27") def test_dmc_neg_fromfile(self): """Test DBOW doc2vec training.""" - with temporary_file(get_tmpfile('gensim_word2vec.tst')) as corpus_file: + with temporary_file(get_tmpfile('gensim_doc2vec.tst')) as corpus_file: save_lee_corpus_as_line_sentence(corpus_file) model = doc2vec.Doc2Vec( list_corpus, dm=1, dm_concat=1, vector_size=24, window=4, hs=0, @@ -641,9 +649,9 @@ def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) # check docvecs self.assertEqual(len(model.docvecs.map), len(model2.docvecs.map)) self.assertEqual(len(model.docvecs.index2key), len(model2.docvecs.index2key)) diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 791386eb8d..1e4b431e88 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -16,14 +16,11 @@ from gensim import utils from gensim.models.word2vec import LineSentence -from gensim.models.fasttext import FastText as FT_gensim, _unpack, _unpack_copy -from gensim.models.wrappers.fasttext import FastTextKeyedVectors -from gensim.models.wrappers.fasttext import FastText as FT_wrapper +from gensim.models.fasttext import FastText as FT_gensim, FastTextKeyedVectors, _unpack, _unpack_copy from gensim.models.keyedvectors import KeyedVectors from gensim.test.utils import datapath, 
get_tmpfile, temporary_file, common_texts as sentences import gensim.models._fasttext_bin from gensim.models.fasttext_inner import compute_ngrams, compute_ngrams_bytes, ft_hash_broken, ft_hash_bytes -from gensim.models.fasttext import _unpack, _unpack_copy import gensim.models.fasttext @@ -70,7 +67,7 @@ def setUp(self): self.test_new_model_file = datapath('lee_fasttext_new.bin') def test_training(self): - model = FT_gensim(size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences) self.model_sanity(model) @@ -90,7 +87,7 @@ def test_training(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = FT_gensim(sentences, size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) + model2 = FT_gensim(sentences, vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) self.models_equal(model, model2) # verify oov-word vector retrieval @@ -102,20 +99,20 @@ def test_training(self): def testFastTextTrainParameters(self): - model = FT_gensim(size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) - model.build_vocab(sentences=sentences) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) + model.build_vocab(corpus_iterable=sentences) self.assertRaises(TypeError, model.train, corpus_file=11111) - self.assertRaises(TypeError, model.train, sentences=11111) - self.assertRaises(TypeError, model.train, sentences=sentences, corpus_file='test') - self.assertRaises(TypeError, model.train, sentences=None, corpus_file=None) + self.assertRaises(TypeError, model.train, corpus_iterable=11111) + self.assertRaises(TypeError, model.train, corpus_iterable=sentences, corpus_file='test') + self.assertRaises(TypeError, model.train, corpus_iterable=None, corpus_file=None) self.assertRaises(TypeError, model.train, corpus_file=sentences) def test_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: utils.save_as_line_sentence(sentences, corpus_file) - model = FT_gensim(size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) + model = FT_gensim(vector_size=12, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(corpus_file=corpus_file) self.model_sanity(model) @@ -148,9 +145,9 @@ def models_equal(self, model, model2): self.assertTrue(np.allclose(model.wv.vectors_ngrams, model2.wv.vectors_ngrams)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word])) @@ -243,12 +240,12 @@ def test_load_fasttext_format(self): actual_vec_oov = model.wv["rejection"] self.assertTrue(np.allclose(actual_vec_oov, expected_vec_oov, atol=1e-4)) - self.assertEqual(model.vocabulary.min_count, 5) + self.assertEqual(model.min_count, 5) self.assertEqual(model.window, 5) self.assertEqual(model.epochs, 5) self.assertEqual(model.negative, 5) - self.assertEqual(model.vocabulary.sample, 0.0001) - self.assertEqual(model.trainables.bucket, 1000) + self.assertEqual(model.sample, 0.0001) + 
self.assertEqual(model.bucket, 1000) self.assertEqual(model.wv.max_n, 6) self.assertEqual(model.wv.min_n, 3) self.assertEqual(model.wv.vectors.shape, (len(model.wv.vocab), model.vector_size)) @@ -296,12 +293,12 @@ def test_load_fasttext_new_format(self): actual_vec_oov = new_model.wv["rejection"] self.assertTrue(np.allclose(actual_vec_oov, expected_vec_oov, atol=1e-4)) - self.assertEqual(new_model.vocabulary.min_count, 5) + self.assertEqual(new_model.min_count, 5) self.assertEqual(new_model.window, 5) self.assertEqual(new_model.epochs, 5) self.assertEqual(new_model.negative, 5) - self.assertEqual(new_model.vocabulary.sample, 0.0001) - self.assertEqual(new_model.trainables.bucket, 1000) + self.assertEqual(new_model.sample, 0.0001) + self.assertEqual(new_model.bucket, 1000) self.assertEqual(new_model.wv.max_n, 6) self.assertEqual(new_model.wv.min_n, 3) self.assertEqual(new_model.wv.vectors.shape, (len(new_model.wv.vocab), new_model.vector_size)) @@ -396,8 +393,8 @@ def test_wm_distance(self): def test_cbow_hs_training(self): model_gensim = FT_gensim( - size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -425,8 +422,8 @@ def test_cbow_hs_training(self): def test_cbow_hs_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -458,8 +455,8 @@ def test_cbow_hs_training_fromfile(self): def test_sg_hs_training(self): model_gensim = FT_gensim( - size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -487,8 +484,8 @@ def test_sg_hs_training(self): def test_sg_hs_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=1, negative=0, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -520,8 +517,8 @@ def test_sg_hs_training_fromfile(self): def test_cbow_neg_training(self): model_gensim = FT_gensim( - size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, 
cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -549,8 +546,8 @@ def test_cbow_neg_training(self): def test_cbow_neg_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=5, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -582,8 +579,8 @@ def test_cbow_neg_training_fromfile(self): def test_sg_neg_training(self): model_gensim = FT_gensim( - size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -611,8 +608,8 @@ def test_sg_neg_training(self): def test_sg_neg_training_fromfile(self): with temporary_file(get_tmpfile('gensim_fasttext.tst')) as corpus_file: model_gensim = FT_gensim( - size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, - min_count=5, iter=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, + vector_size=48, sg=1, cbow_mean=1, alpha=0.025, window=5, hs=0, negative=5, + min_count=5, epochs=5, batch_words=1000, word_ngrams=1, sample=1e-3, min_n=3, max_n=6, sorted_vocab=1, workers=1, min_alpha=0.0) lee_data = LineSentence(datapath('lee_background.cor')) @@ -642,7 +639,7 @@ def test_sg_neg_training_fromfile(self): self.assertGreaterEqual(overlap_count, 2) def test_online_learning(self): - model_hs = FT_gensim(sentences, size=12, min_count=1, seed=42, hs=1, negative=0) + model_hs = FT_gensim(sentences, vector_size=12, min_count=1, seed=42, hs=1, negative=0) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(new_sentences, update=True) # update vocab @@ -656,7 +653,7 @@ def test_online_learning_fromfile(self): utils.save_as_line_sentence(sentences, corpus_file) utils.save_as_line_sentence(new_sentences, new_corpus_file) - model_hs = FT_gensim(corpus_file=corpus_file, size=12, min_count=1, seed=42, hs=1, negative=0) + model_hs = FT_gensim(corpus_file=corpus_file, vector_size=12, min_count=1, seed=42, hs=1, negative=0) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(corpus_file=new_corpus_file, update=True) # update vocab @@ -666,7 +663,7 @@ def test_online_learning_fromfile(self): def test_online_learning_after_save(self): tmpf = get_tmpfile('gensim_fasttext.tst') - model_neg = FT_gensim(sentences, size=12, min_count=0, seed=42, hs=0, negative=5) + model_neg = FT_gensim(sentences, vector_size=12, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = FT_gensim.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -681,7 +678,7 @@ def test_online_learning_after_save_fromfile(self): 
utils.save_as_line_sentence(new_sentences, new_corpus_file) tmpf = get_tmpfile('gensim_fasttext.tst') - model_neg = FT_gensim(corpus_file=corpus_file, size=12, min_count=0, seed=42, hs=0, negative=5) + model_neg = FT_gensim(corpus_file=corpus_file, vector_size=12, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = FT_gensim.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -717,18 +714,18 @@ def online_sanity(self, model): @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_sg_hs_online(self): - model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1) + model = FT_gensim(sg=1, window=2, hs=1, negative=0, min_count=3, epochs=1, seed=42, workers=1) self.online_sanity(model) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_sg_neg_online(self): - model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, iter=1, seed=42, workers=1) + model = FT_gensim(sg=1, window=2, hs=0, negative=5, min_count=3, epochs=1, seed=42, workers=1) self.online_sanity(model) @unittest.skipIf(IS_WIN32, "avoid memory error with Appveyor x32") def test_cbow_hs_online(self): model = FT_gensim( - sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, iter=1, seed=42, workers=1 + sg=0, cbow_mean=1, alpha=0.05, window=2, hs=1, negative=0, min_count=3, epochs=1, seed=42, workers=1 ) self.online_sanity(model) @@ -736,12 +733,12 @@ def test_cbow_hs_online(self): def test_cbow_neg_online(self): model = FT_gensim( sg=0, cbow_mean=1, alpha=0.05, window=2, hs=0, negative=5, - min_count=5, iter=1, seed=42, workers=1, sample=0 + min_count=5, epochs=1, seed=42, workers=1, sample=0 ) self.online_sanity(model) def test_get_vocab_word_vecs(self): - model = FT_gensim(size=12, min_count=1, seed=42) + model = FT_gensim(vector_size=12, min_count=1, seed=42) model.build_vocab(sentences) original_syn0_vocab = np.copy(model.wv.vectors_vocab) model.wv.adjust_vectors() @@ -750,21 +747,21 @@ def test_get_vocab_word_vecs(self): def test_persistence_word2vec_format(self): """Test storing/loading the model in word2vec format.""" tmpf = get_tmpfile('gensim_fasttext_w2v_format.tst') - model = FT_gensim(sentences, min_count=1, size=12) + model = FT_gensim(sentences, min_count=1, vector_size=12) model.wv.save_word2vec_format(tmpf, binary=True) loaded_model_kv = KeyedVectors.load_word2vec_format(tmpf, binary=True) self.assertEqual(len(model.wv.vocab), len(loaded_model_kv.vocab)) self.assertTrue(np.allclose(model.wv['human'], loaded_model_kv['human'])) def test_bucket_ngrams(self): - model = FT_gensim(size=12, min_count=1, bucket=20) + model = FT_gensim(vector_size=12, min_count=1, bucket=20) model.build_vocab(sentences) self.assertEqual(model.wv.vectors_ngrams.shape, (20, 12)) model.build_vocab(new_sentences, update=True) self.assertEqual(model.wv.vectors_ngrams.shape, (20, 12)) def test_estimate_memory(self): - model = FT_gensim(sg=1, hs=1, size=12, negative=5, min_count=3) + model = FT_gensim(sg=1, hs=1, vector_size=12, negative=5, min_count=3) model.build_vocab(sentences) report = model.estimate_memory() self.assertEqual(report['vocab'], 2800) @@ -775,7 +772,7 @@ def test_estimate_memory(self): self.assertEqual(report['buckets_word'], 640) self.assertEqual(report['total'], 6704) - def testLoadOldModel(self): + def obsolete_testLoadOldModel(self): """Test loading fasttext models from previous version""" model_file = 'fasttext_old' @@ -784,9 +781,9 @@ def testLoadOldModel(self): self.assertTrue(len(model.wv.vocab) 
== 12) self.assertTrue(len(model.wv.index2word) == 12) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12, )) - self.assertTrue(model.vocabulary.cum_table.shape == (12, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12, )) + self.assertTrue(model.cum_table.shape == (12, )) self.assertEqual(model.wv.vectors_vocab.shape, (12, 100)) self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100)) @@ -798,9 +795,9 @@ def testLoadOldModel(self): self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) self.assertIsNone(model.corpus_total_words) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12, )) - self.assertTrue(model.vocabulary.cum_table.shape == (12, )) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12, )) + self.assertTrue(model.cum_table.shape == (12, )) self.assertEqual(model.wv.vectors_vocab.shape, (12, 100)) self.assertEqual(model.wv.vectors_ngrams.shape, (2000000, 100)) @@ -869,7 +866,7 @@ def train_gensim(bucket=100, min_count=5): # # Set parameters to match those in the load_native function # - model = FT_gensim(bucket=bucket, size=5, alpha=0.05, workers=1, sample=0.0001, min_count=min_count) + model = FT_gensim(bucket=bucket, vector_size=5, alpha=0.05, workers=1, sample=0.0001, min_count=min_count) model.build_vocab(TOY_SENTENCES) model.train(TOY_SENTENCES, total_examples=len(TOY_SENTENCES), epochs=model.epochs) return model @@ -1025,8 +1022,8 @@ def test_sanity(self): # self.assertEqual(trained.bucket, native.bucket) compare_wv(trained.wv, native.wv, self) - compare_vocabulary(trained.vocabulary, native.vocabulary, self) - compare_nn(trained.trainables, native.trainables, self) + compare_vocabulary(trained, native, self) + compare_nn(trained, native, self) def test_continuation_native(self): """Ensure that training has had a measurable effect.""" @@ -1149,7 +1146,7 @@ class HashCompatibilityTest(unittest.TestCase): def test_compatibility_true(self): m = FT_gensim.load(datapath('compatible-hash-true.model')) self.assertTrue(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) + self.assertEqual(m.bucket, m.wv.bucket) def test_compatibility_false(self): # @@ -1157,12 +1154,12 @@ def test_compatibility_false(self): # m = FT_gensim.load(datapath('compatible-hash-false.model')) self.assertFalse(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) + self.assertEqual(m.bucket, m.wv.bucket) def test_hash_native(self): m = load_native() self.assertTrue(m.wv.compatible_hash) - self.assertEqual(m.trainables.bucket, m.wv.bucket) + self.assertEqual(m.bucket, m.wv.bucket) class FTHashResultsTest(unittest.TestCase): diff --git a/gensim/test/test_fasttext_wrapper.py b/gensim/test/test_fasttext_wrapper.py deleted file mode 100644 index 66dd7b47c5..0000000000 --- a/gensim/test/test_fasttext_wrapper.py +++ /dev/null @@ -1,382 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# -# Copyright (C) 2010 Radim Rehurek -# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html - -""" -Automated tests for checking transformation algorithms (the models package). 
-""" - -import logging -import unittest -import os - -import numpy - -from gensim.models.wrappers import fasttext -from gensim.models import keyedvectors -from gensim.test.utils import datapath, get_tmpfile - - -try: - from pyemd import emd # noqa:F401 - PYEMD_EXT = True -except (ImportError, ValueError): - PYEMD_EXT = False - - -logger = logging.getLogger(__name__) - - -class TestFastText(unittest.TestCase): - def setUp(self): - ft_home = os.environ.get('FT_HOME', None) - self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None - self.corpus_file = datapath('lee_background.cor') - self.test_model_file = datapath('lee_fasttext') - self.test_new_model_file = datapath('lee_fasttext_new') - # Load pre-trained model to perform tests in case FastText binary isn't available in test environment - self.test_model = fasttext.FastText.load_fasttext_format(self.test_model_file) - - def model_sanity(self, model): - """Even tiny models trained on any corpus should pass these sanity checks""" - self.assertEqual(model.wv.syn0.shape, (len(model.wv.vocab), model.vector_size)) - self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model.vector_size)) - - def models_equal(self, model1, model2): - self.assertEqual(len(model1.wv.vocab), len(model2.wv.vocab)) - self.assertEqual(set(model1.wv.vocab.keys()), set(model2.wv.vocab.keys())) - self.assertTrue(numpy.allclose(model1.wv.syn0, model2.wv.syn0)) - self.assertTrue(numpy.allclose(model1.wv.syn0_ngrams, model2.wv.syn0_ngrams)) - - def testTraining(self): - """Test self.test_model successfully trained, parameters and weights correctly loaded""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - vocab_size, model_size = 1763, 10 - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - trained_model = fasttext.FastText.train( - self.ft_path, self.corpus_file, size=model_size, output_file=tmpf - ) - - self.assertEqual(trained_model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(trained_model.wv.vocab), vocab_size) - self.assertEqual(trained_model.wv.syn0_ngrams.shape[1], model_size) - self.model_sanity(trained_model) - - # Tests temporary training files deleted - self.assertFalse(os.path.exists('%s.bin' % tmpf)) - - def testMinCount(self): - """Tests words with frequency less than `min_count` absent from vocab""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - test_model_min_count_5 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=5 - ) - self.assertTrue('forests' not in test_model_min_count_5.wv.vocab) - - test_model_min_count_1 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=10, min_count=1 - ) - self.assertTrue('forests' in test_model_min_count_1.wv.vocab) - - def testModelSize(self): - """Tests output vector dimensions are the same as the value for `size` param""" - if self.ft_path is None: - logger.info("FT_HOME env variable not set, skipping test") - return # Use self.skipTest once python < 2.7 is no longer supported - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - test_model_size_20 = fasttext.FastText.train( - self.ft_path, self.corpus_file, output_file=tmpf, size=20 - ) - self.assertEqual(test_model_size_20.vector_size, 20) - 
self.assertEqual(test_model_size_20.wv.syn0.shape[1], 20) - self.assertEqual(test_model_size_20.wv.syn0_ngrams.shape[1], 20) - - def testPersistence(self): - """Test storing/loading the entire model.""" - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - self.test_model.save(tmpf) - loaded = fasttext.FastText.load(tmpf) - self.models_equal(self.test_model, loaded) - - self.test_model.save(tmpf, sep_limit=0) - self.models_equal(self.test_model, fasttext.FastText.load(tmpf)) - - def testNormalizedVectorsNotSaved(self): - """Test syn0norm/syn0_ngrams_norm aren't saved in model file""" - tmpf = get_tmpfile('gensim_fasttext_wrapper.tst') - self.test_model.init_sims() - self.test_model.save(tmpf) - loaded = fasttext.FastText.load(tmpf) - self.assertTrue(loaded.wv.syn0norm is None) - self.assertTrue(loaded.wv.syn0_ngrams_norm is None) - - wv = self.test_model.wv - wv.save(tmpf) - loaded_kv = keyedvectors.KeyedVectors.load(tmpf) - self.assertTrue(loaded_kv.syn0norm is None) - self.assertTrue(loaded_kv.syn0_ngrams_norm is None) - - def testLoadFastTextFormat(self): - """Test model successfully loaded from fastText .bin file""" - try: - model = fasttext.FastText.load_fasttext_format(self.test_model_file) - except Exception as exc: - self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) - vocab_size, model_size = 1762, 10 - self.assertEqual(model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(model.wv.vocab), vocab_size, model_size) - self.assertEqual(model.wv.syn0_ngrams.shape, (model.num_ngram_vectors, model_size)) - - expected_vec = [ - -0.57144, - -0.0085561, - 0.15748, - -0.67855, - -0.25459, - -0.58077, - -0.09913, - 1.1447, - 0.23418, - 0.060007 - ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin - self.assertTrue(numpy.allclose(model["hundred"], expected_vec, atol=1e-4)) - - # vector for oov words are slightly different from original FastText due to discarding unused ngrams - # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin - expected_vec_oov = [ - -0.23825, - -0.58482, - -0.22276, - -0.41215, - 0.91015, - -1.6786, - -0.26724, - 0.58818, - 0.57828, - 0.75801 - ] - self.assertTrue(numpy.allclose(model["rejection"], expected_vec_oov, atol=1e-4)) - - self.assertEqual(model.min_count, 5) - self.assertEqual(model.window, 5) - self.assertEqual(model.iter, 5) - self.assertEqual(model.negative, 5) - self.assertEqual(model.sample, 0.0001) - self.assertEqual(model.bucket, 1000) - self.assertEqual(model.wv.max_n, 6) - self.assertEqual(model.wv.min_n, 3) - self.model_sanity(model) - - def testLoadFastTextNewFormat(self): - """ Test model successfully loaded from fastText (new format) .bin file """ - try: - new_model = fasttext.FastText.load_fasttext_format(self.test_new_model_file) - except Exception as exc: - self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) - vocab_size, model_size = 1763, 10 - self.assertEqual(new_model.wv.syn0.shape, (vocab_size, model_size)) - self.assertEqual(len(new_model.wv.vocab), vocab_size, model_size) - self.assertEqual(new_model.wv.syn0_ngrams.shape, (new_model.num_ngram_vectors, model_size)) - - expected_vec = [ - -0.025627, - -0.11448, - 0.18116, - -0.96779, - 0.2532, - -0.93224, - 0.3929, - 0.12679, - -0.19685, - -0.13179 - ] # obtained using ./fasttext print-word-vectors lee_fasttext_new.bin - self.assertTrue(numpy.allclose(new_model["hundred"], expected_vec, atol=1e-4)) - - # vector for oov words are slightly 
different from original FastText due to discarding unused ngrams - # obtained using a modified version of ./fasttext print-word-vectors lee_fasttext_new.bin - expected_vec_oov = [ - -0.53378, - -0.19, - 0.013482, - -0.86767, - -0.21684, - -0.89928, - 0.45124, - 0.18025, - -0.14128, - 0.22508 - ] - self.assertTrue(numpy.allclose(new_model["rejection"], expected_vec_oov, atol=1e-4)) - - self.assertEqual(new_model.min_count, 5) - self.assertEqual(new_model.window, 5) - self.assertEqual(new_model.iter, 5) - self.assertEqual(new_model.negative, 5) - self.assertEqual(new_model.sample, 0.0001) - self.assertEqual(new_model.bucket, 1000) - self.assertEqual(new_model.wv.max_n, 6) - self.assertEqual(new_model.wv.min_n, 3) - self.model_sanity(new_model) - - def testLoadFileName(self): - """ Test model accepts input as both `/path/to/model` or `/path/to/model.bin` """ - self.assertTrue(fasttext.FastText.load_fasttext_format(datapath('lee_fasttext_new'))) - self.assertTrue(fasttext.FastText.load_fasttext_format(datapath('lee_fasttext_new.bin'))) - - def testLoadModelSupervised(self): - """Test loading model with supervised learning labels""" - with self.assertRaises(NotImplementedError): - fasttext.FastText.load_fasttext_format(datapath('pang_lee_polarity_fasttext')) - - def testLoadModelWithNonAsciiVocab(self): - """Test loading model with non-ascii words in vocab""" - model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext')) - self.assertTrue(u'který' in model) - try: - vector = model[u'který'] # noqa:F841 - except UnicodeDecodeError: - self.fail('Unable to access vector for utf8 encoded non-ascii word') - - def testLoadModelNonUtf8Encoding(self): - """Test loading model with words in user-specified encoding""" - model = fasttext.FastText.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') - self.assertTrue(u'který' in model) - try: - vector = model[u'který'] # noqa:F841 - except KeyError: - self.fail('Unable to access vector for cp-852 word') - - def testNSimilarity(self): - """Test n_similarity for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue(numpy.allclose(self.test_model.n_similarity(['the', 'and'], ['and', 'the']), 1.0)) - self.assertEqual(self.test_model.n_similarity(['the'], ['and']), self.test_model.n_similarity(['and'], ['the'])) - # Out of vocab check - self.assertTrue(numpy.allclose(self.test_model.n_similarity(['night', 'nights'], ['nights', 'night']), 1.0)) - self.assertEqual( - self.test_model.n_similarity(['night'], ['nights']), - self.test_model.n_similarity(['nights'], ['night']) - ) - - def testSimilarity(self): - """Test similarity for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue(numpy.allclose(self.test_model.similarity('the', 'the'), 1.0)) - self.assertEqual(self.test_model.similarity('the', 'and'), self.test_model.similarity('and', 'the')) - # Out of vocab check - self.assertTrue(numpy.allclose(self.test_model.similarity('nights', 'nights'), 1.0)) - self.assertEqual(self.test_model.similarity('night', 'nights'), self.test_model.similarity('nights', 'night')) - - def testMostSimilar(self): - """Test most_similar for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertEqual(len(self.test_model.most_similar(positive=['the', 'and'], topn=5)), 5) - self.assertEqual(self.test_model.most_similar('the'), self.test_model.most_similar(positive=['the'])) - # Out of vocab check - self.assertEqual(len(self.test_model.most_similar(['night', 'nights'], topn=5)), 5) - 
self.assertEqual(self.test_model.most_similar('nights'), self.test_model.most_similar(positive=['nights'])) - - def testMostSimilarCosmul(self): - """Test most_similar_cosmul for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertEqual(len(self.test_model.most_similar_cosmul(positive=['the', 'and'], topn=5)), 5) - self.assertEqual( - self.test_model.most_similar_cosmul('the'), - self.test_model.most_similar_cosmul(positive=['the'])) - # Out of vocab check - self.assertEqual(len(self.test_model.most_similar_cosmul(['night', 'nights'], topn=5)), 5) - self.assertEqual( - self.test_model.most_similar_cosmul('nights'), - self.test_model.most_similar_cosmul(positive=['nights'])) - - def testLookup(self): - """Tests word vector lookup for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) - self.assertTrue(numpy.allclose(self.test_model['night'], self.test_model[['night']])) - # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) - self.assertTrue(numpy.allclose(self.test_model['nights'], self.test_model[['nights']])) - # Word with no ngrams in model - self.assertRaises(KeyError, lambda: self.test_model['a!@']) - - def testContains(self): - """Tests __contains__ for in-vocab and out-of-vocab words""" - # In vocab, sanity check - self.assertTrue('night' in self.test_model.wv.vocab) - self.assertTrue('night' in self.test_model) - # Out of vocab check - self.assertFalse('nights' in self.test_model.wv.vocab) - self.assertTrue('nights' in self.test_model) - # Word with no ngrams in model - self.assertFalse('a!@' in self.test_model.wv.vocab) - self.assertFalse('a!@' in self.test_model) - - @unittest.skipIf(PYEMD_EXT is False, "pyemd not installed or have some issues") - def testWmdistance(self): - """Tests wmdistance for docs with in-vocab and out-of-vocab words""" - doc = ['night', 'payment'] - oov_doc = ['nights', 'forests', 'payments'] - ngrams_absent_doc = ['a!@', 'b#$'] - - dist = self.test_model.wmdistance(doc, oov_doc) - self.assertNotEqual(float('inf'), dist) - dist = self.test_model.wmdistance(doc, ngrams_absent_doc) - self.assertEqual(float('inf'), dist) - - def testDoesntMatch(self): - """Tests doesnt_match for list of out-of-vocab words""" - oov_words = ['nights', 'forests', 'payments'] - # Out of vocab check - for word in oov_words: - self.assertFalse(word in self.test_model.wv.vocab) - try: - self.test_model.doesnt_match(oov_words) - except Exception: - self.fail('model.doesnt_match raises exception for oov words') - - def testHash(self): - # Tests FastText.ft_hash method return values to those obtained from original C implementation - ft_hash = fasttext.ft_hash('test') - self.assertEqual(ft_hash, 2949673445) - ft_hash = fasttext.ft_hash('word') - self.assertEqual(ft_hash, 1788406269) - - def testConsistentDtype(self): - """Test that the same dtype is returned for OOV words as for words in the vocabulary""" - vocab_word = 'night' - oov_word = 'wordnotpresentinvocabulary' - self.assertIn(vocab_word, self.test_model.wv.vocab) - self.assertNotIn(oov_word, self.test_model.wv.vocab) - - vocab_embedding = self.test_model[vocab_word] - oov_embedding = self.test_model[oov_word] - self.assertEqual(vocab_embedding.dtype, oov_embedding.dtype) - - def testPersistenceForOldVersions(self): - """Test backward compatibility for models saved with versions < 3.0.0""" - old_model_path = datapath('ft_model_2.3.0') - loaded_model = fasttext.FastText.load(old_model_path) - 
self.assertEqual(loaded_model.vector_size, 10) - self.assertEqual(loaded_model.wv.syn0.shape[1], 10) - self.assertEqual(loaded_model.wv.syn0_ngrams.shape[1], 10) - # in-vocab word - in_expected_vec = numpy.array([-2.44566941, -1.54802394, -2.61103821, -1.88549316, 1.02860415, - 1.19031894, 2.01627707, 1.98942184, -1.39095843, -0.65036952]) - self.assertTrue(numpy.allclose(loaded_model["the"], in_expected_vec, atol=1e-4)) - # out-of-vocab word - out_expected_vec = numpy.array([-1.34948218, -0.8686831, -1.51483142, -1.0164026, 0.56272298, - 0.66228276, 1.06477463, 1.1355902, -0.80972326, -0.39845538]) - self.assertTrue(numpy.allclose(loaded_model["random_word"], out_expected_vec, atol=1e-4)) - - -if __name__ == '__main__': - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) - unittest.main() diff --git a/gensim/test/test_keras_integration.py b/gensim/test/test_keras_integration.py index bad0bb8b95..3eb2841f58 100644 --- a/gensim/test/test_keras_integration.py +++ b/gensim/test/test_keras_integration.py @@ -25,7 +25,7 @@ class TestKerasWord2VecWrapper(unittest.TestCase): def setUp(self): - self.model_cos_sim = word2vec.Word2Vec(common_texts, size=100, min_count=1, hs=1) + self.model_cos_sim = word2vec.Word2Vec(common_texts, vector_size=100, min_count=1, hs=1) self.model_twenty_ng = word2vec.Word2Vec(min_count=1) def testWord2VecTraining(self): @@ -34,7 +34,7 @@ def testWord2VecTraining(self): """ model = self.model_cos_sim self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 100)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 100)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 100)) sims = model.wv.most_similar('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 0b46afec5e..ba759fde5f 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -142,11 +142,11 @@ def test_similarity(self): self.assertTrue(np.allclose(self.vectors.similarity('war', 'war'), 1)) self.assertTrue(np.allclose(self.vectors.similarity('war', 'conflict'), 0.93305397)) - def test_words_closer_than(self): + def test_closer_than(self): """Test words_closer_than returns expected value for distinct and identical nodes.""" - self.assertEqual(self.vectors.words_closer_than('war', 'war'), []) + self.assertEqual(self.vectors.closer_than('war', 'war'), []) expected = set(['conflict', 'administration']) - self.assertEqual(set(self.vectors.words_closer_than('war', 'terrorism')), expected) + self.assertEqual(set(self.vectors.closer_than('war', 'terrorism')), expected) def test_rank(self): """Test rank returns expected value for distinct and identical nodes.""" diff --git a/gensim/test/test_poincare.py b/gensim/test/test_poincare.py index c4fe8af433..f0520d0a7f 100644 --- a/gensim/test/test_poincare.py +++ b/gensim/test/test_poincare.py @@ -383,11 +383,11 @@ def test_difference_in_hierarchy(self): self.assertTrue(np.allclose(self.vectors.difference_in_hierarchy('mammal.n.01', 'dog.n.01'), 0.9384287)) self.assertTrue(np.allclose(self.vectors.difference_in_hierarchy('dog.n.01', 'mammal.n.01'), -0.9384287)) - def test_words_closer_than(self): - """Test words_closer_than returns expected value for distinct and identical nodes.""" - self.assertEqual(self.vectors.words_closer_than('dog.n.01', 'dog.n.01'), []) + def test_closer_than(self): + """Test closer_than returns expected value for distinct and 
identical nodes.""" + self.assertEqual(self.vectors.closer_than('dog.n.01', 'dog.n.01'), []) expected = set(['canine.n.02', 'hunting_dog.n.01']) - self.assertEqual(set(self.vectors.words_closer_than('dog.n.01', 'carnivore.n.01')), expected) + self.assertEqual(set(self.vectors.closer_than('dog.n.01', 'carnivore.n.01')), expected) def test_rank(self): """Test rank returns expected value for distinct and identical nodes.""" diff --git a/gensim/test/test_sklearn_api.py b/gensim/test/test_sklearn_api.py index e325910b48..a8d9e3e6eb 100644 --- a/gensim/test/test_sklearn_api.py +++ b/gensim/test/test_sklearn_api.py @@ -655,7 +655,7 @@ def testModelNotFitted(self): class TestWord2VecWrapper(unittest.TestCase): def setUp(self): numpy.random.seed(0) - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -664,21 +664,21 @@ def testTransform(self): words = words + texts[0] matrix = self.model.transform(words) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one word word = texts[0][0] matrix = self.model.transform(word) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) def testConsistencyWithGensimModel(self): # training a W2VTransformer - self.model = W2VTransformer(size=10, min_count=0, seed=42) + self.model = W2VTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) # training a Gensim Word2Vec model with the same params - gensim_w2vmodel = models.Word2Vec(texts, size=10, min_count=0, seed=42) + gensim_w2vmodel = models.Word2Vec(texts, vector_size=10, min_count=0, seed=42) word = texts[0][0] vec_transformer_api = self.model.transform(word) # vector returned by W2VTransformer @@ -688,7 +688,7 @@ def testConsistencyWithGensimModel(self): def testPipeline(self): numpy.random.seed(0) # set fixed seed to get similar values everytime - model = W2VTransformer(size=10, min_count=1) + model = W2VTransformer(vector_size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -725,7 +725,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(word) @@ -733,7 +733,7 @@ def testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - w2vmodel_wrapper = W2VTransformer(size=10, min_count=0, seed=42) + w2vmodel_wrapper = W2VTransformer(vector_size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, w2vmodel_wrapper.transform, word) @@ -832,13 +832,13 @@ def testTransform(self): docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] matrix = self.model.transform(docs) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one document doc = w2v_texts[0] matrix = self.model.transform(doc) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) def testFitTransform(self): model = 
D2VTransformer(min_count=1) @@ -847,13 +847,13 @@ def testFitTransform(self): docs = [w2v_texts[0], w2v_texts[1], w2v_texts[2]] matrix = model.fit_transform(docs) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], model.size) + self.assertEqual(matrix.shape[1], model.vector_size) # fit and transform one document doc = w2v_texts[0] matrix = model.fit_transform(doc) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], model.size) + self.assertEqual(matrix.shape[1], model.vector_size) def testSetGetParams(self): # updating only one param @@ -893,7 +893,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], 1) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(doc) @@ -1297,9 +1297,9 @@ def testModelNotFitted(self): self.assertRaises(NotFittedError, phrases_transformer.transform, phrases_sentences[0]) -class TestFastTextWrapper(unittest.TestCase): +class TestFTTransformer(unittest.TestCase): def setUp(self): - self.model = FTTransformer(size=10, min_count=0, seed=42) + self.model = FTTransformer(vector_size=10, min_count=0, seed=42) self.model.fit(texts) def testTransform(self): @@ -1308,30 +1308,30 @@ def testTransform(self): words = words + texts[0] matrix = self.model.transform(words) self.assertEqual(matrix.shape[0], 3) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # tranform one word word = texts[0][0] matrix = self.model.transform(word) self.assertEqual(matrix.shape[0], 1) - self.assertEqual(matrix.shape[1], self.model.size) + self.assertEqual(matrix.shape[1], self.model.vector_size) # verify oov-word vector retrieval invocab_vec = self.model.transform("computer") # invocab word self.assertEqual(invocab_vec.shape[0], 1) - self.assertEqual(invocab_vec.shape[1], self.model.size) + self.assertEqual(invocab_vec.shape[1], self.model.vector_size) oov_vec = self.model.transform('compute') # oov word self.assertEqual(oov_vec.shape[0], 1) - self.assertEqual(oov_vec.shape[1], self.model.size) + self.assertEqual(oov_vec.shape[1], self.model.vector_size) def testConsistencyWithGensimModel(self): # training a FTTransformer - self.model = FTTransformer(size=10, min_count=0, seed=42, workers=1) + self.model = FTTransformer(vector_size=10, min_count=0, seed=42, workers=1) self.model.fit(texts) # training a Gensim FastText model with the same params - gensim_ftmodel = models.FastText(texts, size=10, min_count=0, seed=42, + gensim_ftmodel = models.FastText(texts, vector_size=10, min_count=0, seed=42, workers=1) # vectors returned by FTTransformer @@ -1350,7 +1350,7 @@ def testConsistencyWithGensimModel(self): self.assertTrue(passed) def testPipeline(self): - model = FTTransformer(size=10, min_count=1) + model = FTTransformer(vector_size=10, min_count=1) model.fit(w2v_texts) class_dict = {'mathematics': 1, 'physics': 0} @@ -1388,7 +1388,7 @@ def testPersistence(self): # sanity check for transformation operation self.assertEqual(loaded_transformed_vecs.shape[0], len(words)) - self.assertEqual(loaded_transformed_vecs.shape[1], model_load.size) + self.assertEqual(loaded_transformed_vecs.shape[1], model_load.vector_size) # comparing the original and loaded models original_transformed_vecs = self.model.transform(words) @@ -1396,7 +1396,7 @@ def 
testPersistence(self): self.assertTrue(passed) def testModelNotFitted(self): - ftmodel_wrapper = FTTransformer(size=10, min_count=0, seed=42) + ftmodel_wrapper = FTTransformer(vector_size=10, min_count=0, seed=42) word = texts[0][0] self.assertRaises(NotFittedError, ftmodel_wrapper.transform, word) diff --git a/gensim/test/test_translation_matrix.py b/gensim/test/test_translation_matrix.py index f6798ac9cc..2841845e6c 100644 --- a/gensim/test/test_translation_matrix.py +++ b/gensim/test/test_translation_matrix.py @@ -2,7 +2,6 @@ # encoding: utf-8 from collections import namedtuple import unittest -import math import logging import numpy as np @@ -92,31 +91,33 @@ def setUp(self): filename = datapath("alldata-id-10.txt") train_docs = read_sentiment_docs(filename) self.train_docs = train_docs - self.source_doc_vec_file = datapath("small_tag_doc_5_iter50") - self.target_doc_vec_file = datapath("large_tag_doc_10_iter50") - - self.source_doc_vec = Doc2Vec.load(self.source_doc_vec_file) - self.target_doc_vec = Doc2Vec.load(self.target_doc_vec_file) + self.source_doc_vec = Doc2Vec(documents=train_docs[:5], vector_size=8, epochs=50, seed=1) + self.target_doc_vec = Doc2Vec(documents=train_docs, vector_size=8, epochs=50, seed=2) def test_translation_matrix(self): model = translation_matrix.BackMappingTranslationMatrix( self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] ) transmat = model.train(self.train_docs[:5]) - self.assertEqual(transmat.shape, (100, 100)) + self.assertEqual(transmat.shape, (8, 8)) def test_infer_vector(self): + """Test that translation gives similar results to traditional inference. + + This may not be completely sensible/salient with such tiny data, but + replaces a nonsensical test. + """ model = translation_matrix.BackMappingTranslationMatrix( self.source_doc_vec, self.target_doc_vec, self.train_docs[:5] ) model.train(self.train_docs[:5]) - infered_vec = model.infer_vector(self.target_doc_vec.docvecs[self.train_docs[5].tags]) - self.assertEqual(infered_vec.shape, (100, )) + backmapped_vec = model.infer_vector(self.target_doc_vec.docvecs[self.train_docs[5].tags]) + self.assertEqual(backmapped_vec.shape, (8, )) + + d2v_inferred_vector = self.source_doc_vec.infer_vector(self.train_docs[5].words) - expected = 0.6453547135 - eps = 1e-6 - caculated = cosine(self.target_doc_vec.docvecs[self.train_docs[5].tags], infered_vec) - self.assertLessEqual(math.fabs(caculated - expected), eps) + distance = cosine(backmapped_vec, d2v_inferred_vector) + self.assertLessEqual(distance, 0.1) if __name__ == '__main__': diff --git a/gensim/test/test_word2vec.py b/gensim/test/test_word2vec.py index b610047a84..9e0c83c946 100644 --- a/gensim/test/test_word2vec.py +++ b/gensim/test/test_word2vec.py @@ -75,8 +75,8 @@ def testBuildVocabFromFreq(self): 'survey': 2, 'user': 3, 'human': 2, 'time': 2, 'interface': 2, 'response': 2 } - model_hs = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_hs.build_vocab_from_freq(freq_dict) model_neg.build_vocab_from_freq(freq_dict) self.assertEqual(len(model_hs.wv.vocab), 12) @@ -123,7 +123,7 @@ def testPruneVocab(self): ["system", "eps"], ["graph", "system"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) + model = 
word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 2) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) @@ -135,43 +135,43 @@ def testPruneVocab(self): ["graph", "system"], ["minors", "survey", "minors", "survey", "minors"] ] - model = word2vec.Word2Vec(sentences, size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, max_vocab_size=2, seed=42, hs=1, negative=0) self.assertEqual(len(model.wv.vocab), 3) self.assertEqual(model.wv.vocab['graph'].count, 3) self.assertEqual(model.wv.vocab['minors'].count, 3) self.assertEqual(model.wv.vocab['system'].count, 4) def testTotalWordCount(self): - model = word2vec.Word2Vec(size=10, min_count=0, seed=42) - total_words = model.vocabulary.scan_vocab(sentences)[0] + model = word2vec.Word2Vec(vector_size=10, min_count=0, seed=42) + total_words = model.scan_vocab(sentences)[0] self.assertEqual(total_words, 29) def testMaxFinalVocab(self): # Test for less restricting effect of max_final_vocab # max_final_vocab is specified but has no effect - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=4, sample=0) - model.vocabulary.scan_vocab(sentences) - reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=4, sample=0) + model.scan_vocab(sentences) + reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 11) self.assertEqual(reported_values['retain_total'], 4) self.assertEqual(reported_values['num_retained_words'], 1) - self.assertEqual(model.vocabulary.effective_min_count, 4) + self.assertEqual(model.effective_min_count, 4) # Test for more restricting effect of max_final_vocab # results in setting a min_count more restricting than specified min_count - model = word2vec.Word2Vec(size=10, max_final_vocab=4, min_count=2, sample=0) - model.vocabulary.scan_vocab(sentences) - reported_values = model.vocabulary.prepare_vocab(wv=model.wv, hs=0, negative=0) + model = word2vec.Word2Vec(vector_size=10, max_final_vocab=4, min_count=2, sample=0) + model.scan_vocab(sentences) + reported_values = model.prepare_vocab() self.assertEqual(reported_values['drop_unique'], 8) self.assertEqual(reported_values['retain_total'], 13) self.assertEqual(reported_values['num_retained_words'], 4) - self.assertEqual(model.vocabulary.effective_min_count, 3) + self.assertEqual(model.effective_min_count, 3) def testOnlineLearning(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" - model_hs = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=1, negative=0) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(new_sentences, update=True) @@ -185,7 +185,7 @@ def testOnlineLearningAfterSave(self): """Test that the algorithm is able to add new words to the vocabulary and to a trained model when using a sorted vocabulary""" tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = 
word2vec.Word2Vec(sentences, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(sentences, vector_size=10, min_count=0, seed=42, hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -202,8 +202,10 @@ def testOnlineLearningFromFile(self): utils.save_as_line_sentence(sentences, corpus_file) utils.save_as_line_sentence(new_sentences, new_corpus_file) - model_hs = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=1, negative=0) - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) + model_hs = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=1, negative=0) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) self.assertTrue(len(model_hs.wv.vocab), 12) self.assertTrue(model_hs.wv.vocab['graph'].count, 3) model_hs.build_vocab(corpus_file=new_corpus_file, update=True) @@ -227,7 +229,8 @@ def testOnlineLearningAfterSaveFromFile(self): utils.save_as_line_sentence(new_sentences, new_corpus_file) tmpf = get_tmpfile('gensim_word2vec.tst') - model_neg = word2vec.Word2Vec(corpus_file=corpus_file, size=10, min_count=0, seed=42, hs=0, negative=5) + model_neg = word2vec.Word2Vec(corpus_file=corpus_file, vector_size=10, min_count=0, seed=42, + hs=0, negative=5) model_neg.save(tmpf) model_neg = word2vec.Word2Vec.load(tmpf) self.assertTrue(len(model_neg.wv.vocab), 12) @@ -260,19 +263,19 @@ def onlineSanity(self, model, trained_model=False): def test_sg_hs_online(self): """Test skipgram w/ hierarchical softmax""" - model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, iter=10, seed=42, workers=2) + model = word2vec.Word2Vec(sg=1, window=5, hs=1, negative=0, min_count=3, epochs=10, seed=42, workers=2) self.onlineSanity(model) def test_sg_neg_online(self): """Test skipgram w/ negative sampling""" - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, iter=10, seed=42, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=3, epochs=10, seed=42, workers=2) self.onlineSanity(model) def test_cbow_hs_online(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=1, negative=0, - min_count=3, iter=10, seed=42, workers=2 + min_count=3, epochs=10, seed=42, workers=2 ) self.onlineSanity(model) @@ -280,7 +283,7 @@ def test_cbow_neg_online(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, seed=42, workers=2, sample=0 + min_count=5, epochs=10, seed=42, workers=2, sample=0 ) self.onlineSanity(model) @@ -356,7 +359,7 @@ def testVectorsNormNotSaved(self): loaded_kv = keyedvectors.KeyedVectors.load(tmpf) self.assertTrue(loaded_kv.vectors_norm is None) - def testLoadPreKeyedVectorModel(self): + def obsolete_testLoadPreKeyedVectorModel(self): """Test loading pre-KeyedVectors word2vec model""" if sys.version_info[:2] == (3, 4): @@ -370,13 +373,13 @@ def testLoadPreKeyedVectorModel(self): model_file = 'word2vec_pre_kv%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.syn1neg.shape == 
(len(model.wv.vocab), model.vector_size)) # Model stored in multiple files model_file = 'word2vec_pre_kv_sep%s' % model_file_suffix model = word2vec.Word2Vec.load(datapath(model_file)) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)) def testLoadPreKeyedVectorModelCFormat(self): """Test loading pre-KeyedVectors word2vec model saved in word2vec format""" @@ -479,6 +482,8 @@ def testPersistenceWord2VecFormatCombinationWithStandardPersistence(self): testvocab = get_tmpfile('gensim_word2vec.vocab') model.wv.save_word2vec_format(tmpf, testvocab, binary=True) binary_model_with_vocab_kv = keyedvectors.KeyedVectors.load_word2vec_format(tmpf, testvocab, binary=True) + print("BIN") + print(binary_model_with_vocab_kv) binary_model_with_vocab_kv.save(tmpf) self.assertRaises(AttributeError, word2vec.Word2Vec.load, tmpf) @@ -524,11 +529,11 @@ def testVocab(self): def testTraining(self): """Test word2vec training.""" # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -541,7 +546,7 @@ def testTraining(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) self.models_equal(model, model2) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") @@ -551,11 +556,11 @@ def testTrainingFromFile(self): with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: utils.save_as_line_sentence(sentences, tf) - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(corpus_file=tf) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(corpus_file=tf, total_words=model.corpus_total_words, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -569,7 +574,7 @@ def testTrainingFromFile(self): def testScoring(self): """Test word2vec scoring.""" - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) # just score and make sure they exist scores = model.score(sentences, len(sentences)) @@ -580,14 +585,14 @@ def testLocking(self): corpus = LeeCorpus() # build vocabulary, don't train yet for sg in range(2): # test both cbow and sg - model = word2vec.Word2Vec(size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) + model = word2vec.Word2Vec(vector_size=4, hs=1, negative=5, min_count=1, sg=sg, window=5) model.build_vocab(corpus) # remember two vectors locked0 = np.copy(model.wv.vectors[0]) unlocked1 = 
np.copy(model.wv.vectors[1]) # lock the vector in slot 0 against change - model.trainables.vectors_lockf[0] = 0.0 + model.wv.vectors_lockf[0] = 0.0 model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs) self.assertFalse((unlocked1 == model.wv.vectors[1]).all()) # unlocked vector should vary @@ -609,7 +614,7 @@ def testEvaluateWordAnalogies(self): def testEvaluateWordPairs(self): """Test Spearman and Pearson correlation coefficients give sane results on similarity datasets""" corpus = word2vec.LineSentence(datapath('head500.noblanks.cor.bz2')) - model = word2vec.Word2Vec(corpus, min_count=3, iter=10) + model = word2vec.Word2Vec(corpus, min_count=3, epochs=10) correlation = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) pearson = correlation[0][0] spearman = correlation[1][0] @@ -624,7 +629,7 @@ def testEvaluateWordPairsFromFile(self): with temporary_file(get_tmpfile('gensim_word2vec.tst')) as tf: utils.save_as_line_sentence(word2vec.LineSentence(datapath('head500.noblanks.cor.bz2')), tf) - model = word2vec.Word2Vec(corpus_file=tf, min_count=3, iter=10) + model = word2vec.Word2Vec(corpus_file=tf, min_count=3, epochs=10) correlation = model.wv.evaluate_word_pairs(datapath('wordsim353.tsv')) pearson = correlation[0][0] spearman = correlation[1][0] @@ -658,29 +663,29 @@ def model_sanity(self, model, train=True, with_corpus_file=False): def test_sg_hs(self): """Test skipgram w/ hierarchical softmax""" - model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_hs_fromfile(self): - model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=1, negative=0, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) def test_sg_neg(self): """Test skipgram w/ negative sampling""" - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model) @unittest.skipIf(os.name == 'nt' and six.PY2, "CythonLineSentence is not supported on Windows + Py27") def test_sg_neg_fromfile(self): - model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, iter=10, workers=2) + model = word2vec.Word2Vec(sg=1, window=4, hs=0, negative=15, min_count=5, epochs=10, workers=2) self.model_sanity(model, with_corpus_file=True) def test_cbow_hs(self): """Test CBOW w/ hierarchical softmax""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=10, workers=2, batch_words=1000 + min_count=5, epochs=10, workers=2, batch_words=1000 ) self.model_sanity(model) @@ -688,7 +693,7 @@ def test_cbow_hs(self): def test_cbow_hs_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=8, hs=1, negative=0, - min_count=5, iter=10, workers=2, batch_words=1000 + min_count=5, epochs=10, workers=2, batch_words=1000 ) self.model_sanity(model, with_corpus_file=True) @@ -696,7 +701,7 @@ def test_cbow_neg(self): """Test CBOW w/ negative sampling""" model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, workers=2, sample=0 + min_count=5, epochs=10, workers=2, 
sample=0 ) self.model_sanity(model) @@ -704,12 +709,12 @@ def test_cbow_neg(self): def test_cbow_neg_fromfile(self): model = word2vec.Word2Vec( sg=0, cbow_mean=1, alpha=0.05, window=5, hs=0, negative=15, - min_count=5, iter=10, workers=2, sample=0 + min_count=5, epochs=10, workers=2, sample=0 ) self.model_sanity(model, with_corpus_file=True) def test_cosmul(self): - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) sims = model.wv.most_similar_cosmul('graph', topn=10) # self.assertTrue(sims[0][0] == 'trees', sims) # most similar @@ -723,10 +728,10 @@ def testTrainingCbow(self): """Test CBOW word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=1, negative=0) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -739,17 +744,17 @@ def testTrainingCbow(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=1, negative=0) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=1, negative=0) self.models_equal(model, model2) def testTrainingSgNegative(self): """Test skip-gram (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=1, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=1, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ -762,17 +767,17 @@ def testTrainingSgNegative(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=1, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=1, hs=0, negative=2) self.models_equal(model, model2) def testTrainingCbowNegative(self): """Test CBOW (negative sampling) word2vec training.""" # to test training, make the corpus larger by repeating its sentences over and over # build vocabulary, don't train yet - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) self.assertTrue(model.wv.vectors.shape == (len(model.wv.vocab), 2)) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), 2)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), 2)) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) sims = model.wv.most_similar('graph', topn=10) @@ 
-785,13 +790,13 @@ def testTrainingCbowNegative(self): self.assertEqual(sims, sims2) # build vocab and train in one step; must be the same as above - model2 = word2vec.Word2Vec(sentences, size=2, min_count=1, sg=0, hs=0, negative=2) + model2 = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, sg=0, hs=0, negative=2) self.models_equal(model, model2) def testSimilarities(self): """Test similarity and n_similarity methods.""" # The model is trained using CBOW - model = word2vec.Word2Vec(size=2, min_count=1, sg=0, hs=0, negative=2) + model = word2vec.Word2Vec(vector_size=2, min_count=1, sg=0, hs=0, negative=2) model.build_vocab(sentences) model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) @@ -803,7 +808,7 @@ def testSimilarities(self): def testSimilarBy(self): """Test word2vec similar_by_word and similar_by_vector.""" - model = word2vec.Word2Vec(sentences, size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(sentences, vector_size=2, min_count=1, hs=1, negative=0) wordsims = model.wv.similar_by_word('graph', topn=10) wordsims2 = model.wv.most_similar(positive='graph', topn=10) vectorsims = model.wv.similar_by_vector(model.wv['graph'], topn=10) @@ -833,9 +838,9 @@ def models_equal(self, model, model2): self.assertEqual(len(model.wv.vocab), len(model2.wv.vocab)) self.assertTrue(np.allclose(model.wv.vectors, model2.wv.vectors)) if model.hs: - self.assertTrue(np.allclose(model.trainables.syn1, model2.trainables.syn1)) + self.assertTrue(np.allclose(model.syn1, model2.syn1)) if model.negative: - self.assertTrue(np.allclose(model.trainables.syn1neg, model2.trainables.syn1neg)) + self.assertTrue(np.allclose(model.syn1neg, model2.syn1neg)) most_common_word = max(model.wv.vocab.items(), key=lambda item: item[1].count)[0] self.assertTrue(np.allclose(model.wv[most_common_word], model2.wv[most_common_word])) @@ -871,9 +876,9 @@ def testLoadOldModel(self): self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) - self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12,)) + self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) @@ -886,13 +891,13 @@ def testLoadOldModelSeparates(self): self.assertTrue(model.wv.vectors.shape == (12, 100)) self.assertTrue(len(model.wv.vocab) == 12) self.assertTrue(len(model.wv.index2word) == 12) - self.assertTrue(model.trainables.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) - self.assertTrue(model.trainables.vectors_lockf.shape == (12,)) - self.assertTrue(model.vocabulary.cum_table.shape == (12,)) + self.assertTrue(model.syn1neg.shape == (len(model.wv.vocab), model.wv.vector_size)) + self.assertTrue(model.wv.vectors_lockf.shape == (12,)) + self.assertTrue(model.cum_table.shape == (12,)) self.onlineSanity(model, trained_model=True) - def test_load_old_models_pre_1_0(self): + def obsolete_test_load_old_models_pre_1_0(self): """Test loading pre-1.0 models""" # load really old model model_file = 'w2v-lee-v0.12.0' @@ -934,7 +939,7 @@ def test_load_old_models_3_x(self): model_file = 'word2vec_3.3' model = word2vec.Word2Vec.load(datapath(model_file)) self.assertEqual(model.max_final_vocab, None) 
- self.assertEqual(model.vocabulary.max_final_vocab, None) + self.assertEqual(model.max_final_vocab, None) old_versions = [ '3.0.0', '3.1.0', '3.2.0', '3.3.0', '3.4.0' @@ -949,7 +954,14 @@ def _check_old_version(self, old_version): model = word2vec.Word2Vec.load(saved_models_dir.format(old_version)) self.assertIsNone(model.corpus_total_words) self.assertTrue(len(model.wv.vocab) == 3) - self.assertTrue(model.wv.vectors.shape == (3, 4)) + try: + self.assertTrue(model.wv.vectors.shape == (3, 4)) + except AttributeError as ae: + print("WV") + print(model.wv) + print(dir(model.wv)) + print(model.wv.syn0) + raise ae # check if similarity search and online training works. self.assertTrue(len(model.wv.most_similar('sentence')) == 2) model.build_vocab(list_corpus, update=True) @@ -989,7 +1001,7 @@ def testTrainWarning(self, l): self.assertTrue(warning in str(l)) def test_train_with_explicit_param(self): - model = word2vec.Word2Vec(size=2, min_count=1, hs=1, negative=0) + model = word2vec.Word2Vec(vector_size=2, min_count=1, hs=1, negative=0) model.build_vocab(sentences) with self.assertRaises(ValueError): model.train(sentences, total_examples=model.corpus_count)
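
The test updates above consistently exercise the renamed Word2Vec constructor parameters (`size` becomes `vector_size`, `iter` becomes `epochs`) and the flattened internals (`model.syn1`, `model.syn1neg`, `model.wv.vectors_lockf`, `model.cum_table` in place of the former `model.trainables.*` and `model.vocabulary.*` attributes). The sketch below is not part of the patch; it is a minimal illustration of that post-change usage, assuming a gensim build that already includes these renames. The toy corpus and parameter values are invented for illustration (the real tests use the bundled Lee corpus and `common_texts`):

    from gensim.models import Word2Vec

    # Invented toy corpus for illustration only.
    sentences = [
        ["human", "interface", "computer"],
        ["graph", "minors", "trees"],
        ["graph", "system", "survey"],
    ]

    # Renamed parameters: `vector_size` (was `size`) and `epochs` (was `iter`).
    model = Word2Vec(sentences, vector_size=10, min_count=1, epochs=10, seed=42, hs=0, negative=5)

    # Internal weights now live directly on the model and its KeyedVectors
    # (e.g. `model.syn1neg`, `model.wv.vectors_lockf`) rather than under
    # `model.trainables` / `model.vocabulary` helper objects.
    assert model.wv.vectors.shape == (len(model.wv.vocab), model.vector_size)
    assert model.syn1neg.shape == (len(model.wv.vocab), model.vector_size)

Collapsing the helper objects means test code (and user code) addresses trained weights and vocabulary statistics directly on the model, which is exactly what the rewritten assertions in the hunks above check.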