piskvorky · mpenkov · Mar 7, 2019 · Feb 6, 2019 · Feb 6, 2019 · Feb 15, 2019
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
@@ -171,27 +171,43 @@
 .. sourcecode:: pycon
 
     >>> cap_path = datapath("crime-and-punishment.bin")
-    >>> # Partial model: loads quickly, uses less RAM, but cannot continue training
-    >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
-    >>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
-    >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
+    >>> fb_model = load_facebook_model(cap_path)
 
 Once loaded, such models behave identically to those trained from scratch.
 You may continue training them on new data:
 
 .. sourcecode:: pycon
 
-    >>> 'computer' in fb_full.wv.vocab  # New word, currently out of vocab
+    >>> 'computer' in fb_model.wv.vocab  # New word, currently out of vocab
     False
-    >>> old_computer = np.copy(fb_full.wv['computer'])  # Calculate current vectors
-    >>> fb_full.build_vocab(new_sentences, update=True)
-    >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
-    >>> new_computer = fb_full.wv['computer']
+    >>> old_computer = np.copy(fb_model.wv['computer'])  # Calculate current vectors
+    >>> fb_model.build_vocab(new_sentences, update=True)
+    >>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
+    >>> new_computer = fb_model.wv['computer']
     >>> np.allclose(old_computer, new_computer, atol=1e-4)  # Vector has changed, model has learnt something
     False
-    >>> 'computer' in fb_full.wv.vocab  # New word is now in the vocabulary
+    >>> 'computer' in fb_model.wv.vocab  # New word is now in the vocabulary
     True
 
+If you do not intend to continue training the model, consider using the
+:func:`gensim.models.fasttext.load_facebook_vectors` function instead.
+That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM:
+
+.. sourcecode:: pycon
+
+    >>> from gensim.test.utils import datapath
+    >>>
+    >>> cap_path = datapath("crime-and-punishment.bin")
+    >>> wv = load_facebook_vectors(cap_path)
+    >>>
+    >>> 'landlord' in wv.vocab  # Word is out of vocabulary
+    False
+    >>> oov_vector = wv['landlord']
+    >>>
+    >>> 'landlady' in wv.vocab  # Word is in the vocabulary
+    True
+    >>> iv_vector = wv['landlady']
+
 Retrieve word-vector for vocab and out-of-vocab word:
 
 .. sourcecode:: pycon
@@ -879,14 +895,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
             >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)
 
         """
-        cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
-        if cant_train:
-            raise ValueError(
-                'this model cannot be trained any further, '
-                'if this is a native model, try loading it with '
-                'FastText.load_fasttext_format(path, full_model=True)'
-            )
-
         super(FastText, self).train(
             sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
             epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
@@ -937,82 +945,17 @@ def __contains__(self, word):
         return self.wv.__contains__(word)
 
     @classmethod
-    def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
-        """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
-
-        By default, this function loads the full model.  A full model allows
-        continuing training with more data, but also consumes more RAM and
-        takes longer to load.  If you do not need to continue training and only
-        wish the work with the already-trained embeddings, use `full_model=False`
-        for faster loading and to save RAM.
-
-        Notes
-        ------
-        Facebook provides both `.vec` and `.bin` files with their modules.
-        The former contains human-readable vectors.
-        The latter contains machine-readable vectors along with other model parameters.
-        This function effectively ignores `.vec` output file, since that file is redundant.
-        It only needs the `.bin` file.
-
-        Parameters
-        ----------
-        model_file : str
-            Path to the FastText output files.
-            FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
-            Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
-            as Gensim requires only `.bin` file to the load entire fastText model.
-        encoding : str, optional
-            Specifies the file encoding.
-        full_model : boolean, optional
-            If False, skips loading the hidden output matrix. This saves a fair bit
-            of CPU time and RAM, but **prevents training continuation**.
-
-        Examples
-        --------
-
-        Load, infer, continue training:
-
-        .. sourcecode:: pycon
-
-            >>> from gensim.test.utils import datapath
-            >>>
-            >>> cap_path = datapath("crime-and-punishment.bin")
-            >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
-            >>>
-            >>> 'landlord' in fb_full.wv.vocab  # Word is out of vocabulary
-            False
-            >>> oov_term = fb_full.wv['landlord']
-            >>>
-            >>> 'landlady' in fb_full.wv.vocab  # Word is in the vocabulary
-            True
-            >>> iv_term = fb_full.wv['landlady']
-            >>>
-            >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
-            >>> fb_full.build_vocab(new_sent, update=True)
-            >>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
-
-        Load quickly, infer (forego training continuation):
+    @deprecated('use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead')
+    def load_fasttext_format(cls, model_file, encoding='utf8'):
+        """Deprecated.
 
-        .. sourcecode:: pycon
-
-            >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
-            >>>
-            >>> 'landlord' in fb_partial.wv.vocab  # Word is out of vocabulary
-            False
-            >>> oov_term = fb_partial.wv['landlord']
-            >>>
-            >>> 'landlady' in fb_partial.wv.vocab  # Word is in the vocabulary
-            True
-            >>> iv_term = fb_partial.wv['landlady']
-
-        Returns
-        -------
-        gensim.models.fasttext.FastText
-            The loaded model.
+        Use :func:`gensim.models.fasttext.load_facebook_model` or
+        :func:`gensim.models.fasttext.load_facebook_vectors` instead.
 
         """
-        return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
+        return load_facebook_model(model_file, encoding=encoding)
 
+    @deprecated('use load_facebook_model or load_facebook_vectors instead')
     def load_binary_data(self, encoding='utf8'):
         """Load data from a binary file created by Facebook's native FastText.
 
@@ -1220,6 +1163,105 @@ def _pad_ones(m, new_shape):
     return vstack([m, suffix])
 
 
+def load_facebook_model(path, encoding='utf-8'):
+    """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file.
+
+    Notes
+    ------
+    Facebook provides both `.vec` and `.bin` files with their models.
+    The former contains human-readable vectors.
+    The latter contains machine-readable vectors along with other model parameters.
+    This function effectively ignores `.vec` output file, since that file is redundant.
+    It only needs the `.bin` file.
+
+    Parameters
+    ----------
+    path : str
+        Path to the FastText output files.
+        FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
+        Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
+        as Gensim requires only the `.bin` file to load the entire fastText model.
+    encoding : str, optional
+        Specifies the file encoding.
+
+    Examples
+    --------
+
+    Load, infer, continue training:
+
+    .. sourcecode:: pycon
+
+        >>> from gensim.test.utils import datapath
+        >>>
+        >>> cap_path = datapath("crime-and-punishment.bin")
+        >>> fb_model = load_facebook_model(cap_path)
+        >>>
+        >>> 'landlord' in fb_model.wv.vocab  # Word is out of vocabulary
+        False
+        >>> oov_term = fb_model.wv['landlord']
+        >>>
+        >>> 'landlady' in fb_model.wv.vocab  # Word is in the vocabulary
+        True
+        >>> iv_term = fb_model.wv['landlady']
+        >>>
+        >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
+        >>> fb_model.build_vocab(new_sent, update=True)
+        >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)
+
+    Returns
+    -------
+    gensim.models.fasttext.FastText
+        The loaded model.
+
+    """
+    return _load_fasttext_format(path, encoding=encoding, full_model=True)
+
+
+def load_facebook_vectors(path, encoding='utf-8'):
+    """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format.
+
+    Parameters
+    ----------
+    path : str
+        The location of the model file.
+    encoding : str, optional
+        Specifies the file encoding.
+
+    Returns
+    -------
+    gensim.models.keyedvectors.FastTextKeyedVectors
+        The word embeddings.
+
+    Examples
+    --------
+
+    Load and infer:
+
+        >>> from gensim.test.utils import datapath
+        >>>
+        >>> cap_path = datapath("crime-and-punishment.bin")
+        >>> fbkv = load_facebook_vectors(cap_path)
+        >>>
+        >>> 'landlord' in fbkv.vocab  # Word is out of vocabulary
+        False
+        >>> oov_vector = fbkv['landlord']
+        >>>
+        >>> 'landlady' in fbkv.vocab  # Word is in the vocabulary
+        True
+        >>> iv_vector = fbkv['landlady']
+
+    See Also
+    --------
+
+    :func:`gensim.models.fasttext.load_facebook_model` loads
+    the full model, not just word embeddings, and enables you to continue
+    model training.
+
+    """
+    model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False)
+    return model_wrapper.wv
+
+
 def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
     """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.
 

diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py
@@ -20,6 +20,8 @@
 from gensim.models.keyedvectors import Word2VecKeyedVectors
 from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences
 
+import gensim.models.fasttext
+
 try:
     from pyemd import emd  # noqa:F401
     PYEMD_EXT = True
@@ -56,24 +58,9 @@ def setUp(self):
         ft_home = os.environ.get('FT_HOME', None)
         self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
         self.test_model_file = datapath('lee_fasttext')
-        self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
+        self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
         self.test_new_model_file = datapath('lee_fasttext_new')
 
-    def test_native_partial_model(self):
-        """Can we skip loading the NN and still get a working model?"""
-        model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)
-
-        #
-        # Training continuation should be impossible
-        #
-        self.assertIsNone(model.trainables.syn1neg)
-        self.assertRaises(ValueError, model.train, sentences,
-                          total_examples=model.corpus_count, epochs=model.epochs)
-
-        model.wv['green']
-        model.wv['foobar']
-        model.wv['thisworddoesnotexist']
-
     def test_training(self):
         model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
         model.build_vocab(sentences)
@@ -202,7 +189,7 @@ def model_sanity(self, model):
 
     def test_load_fasttext_format(self):
         try:
-            model = FT_gensim.load_fasttext_format(self.test_model_file)
+            model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
         except Exception as exc:
             self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
         vocab_size, model_size = 1762, 10
@@ -255,7 +242,7 @@ def test_load_fasttext_format(self):
 
     def test_load_fasttext_new_format(self):
         try:
-            new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
+            new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file)
         except Exception as exc:
             self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
         vocab_size, model_size = 1763, 10
@@ -308,18 +295,18 @@ def test_load_fasttext_new_format(self):
 
     def test_load_model_supervised(self):
         with self.assertRaises(NotImplementedError):
-            FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
+            gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext'))
 
     def test_load_model_with_non_ascii_vocab(self):
-        model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
+        model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext'))
         self.assertTrue(u'který' in model.wv)
         try:
             model.wv[u'který']
         except UnicodeDecodeError:
             self.fail('Unable to access vector for utf8 encoded non-ascii word')
 
     def test_load_model_non_utf8_encoding(self):
-        model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
+        model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852')
         self.assertTrue(u'který' in model.wv)
         try:
             model.wv[u'který']
@@ -891,7 +878,7 @@ def load_native():
     # ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5
     #
     path = datapath('toy-model.bin')
-    model = FT_gensim.load_fasttext_format(path)
+    model = gensim.models.fasttext.load_facebook_model(path)
     return model
 
 
@@ -1115,11 +1102,20 @@ def test_save_load_native(self):
             model.save(model_name)
 
     def test_load_native_pretrained(self):
-        model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin'))
+        model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin'))
         actual = model['monarchist']
         expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508])
         self.assertTrue(np.allclose(expected, actual, atol=10e-4))
 
+    def test_load_native_vectors(self):
+        cap_path = datapath("crime-and-punishment.bin")
+        fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path)
+        self.assertFalse('landlord' in fbkv.vocab)
+        self.assertTrue('landlady' in fbkv.vocab)
+        oov_vector = fbkv['landlord']
+        iv_vector = fbkv['landlady']
+        self.assertFalse(np.allclose(oov_vector, iv_vector))
+
 
 def _train_model_with_pretrained_vectors():
     """Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
@@ -1170,7 +1166,7 @@ def setUp(self):
         #
         # ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5  # noqa: E501
         #
-        self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin'))
+        self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))
         with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin:
             self.expected = dict(load_vec(fin))