implement separate functions to load FT embeddings and models #2376

Merged: 7 commits, Mar 7, 2019 (diff shows changes from 2 commits)
67 changes: 22 additions & 45 deletions gensim/models/fasttext.py
@@ -171,27 +171,28 @@
.. sourcecode:: pycon

>>> cap_path = datapath("crime-and-punishment.bin")
>>> # Partial model: loads quickly, uses less RAM, but cannot continue training
>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>> fb_model = FastText.load_fasttext_format(cap_path)

Once loaded, such models behave identically to those trained from scratch.
You may continue training them on new data:

.. sourcecode:: pycon

>>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab
>>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab
False
>>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors
>>> fb_full.build_vocab(new_sentences, update=True)
>>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_full.wv['computer']
>>> old_computer = np.copy(fb_model.wv['computer'])  # Save the current vector for comparison
>>> fb_model.build_vocab(new_sentences, update=True)
>>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=fb_model.epochs)
>>> new_computer = fb_model.wv['computer']
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary
>>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary
True

If you do not intend to continue training the model, consider using the
:meth:`gensim.models.KeyedVectors.load_fasttext_format` method instead.
That method loads only the word embeddings, consuming much less CPU and RAM.
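
For example, a minimal sketch of loading just the embeddings (reusing `cap_path` from above):

.. sourcecode:: pycon

>>> from gensim.models import KeyedVectors
>>> fbkv = KeyedVectors.load_fasttext_format(cap_path)
>>> 'landlady' in fbkv.vocab  # In-vocabulary lookup works as usual
True
>>> oov_vec = fbkv['landlord']  # Out-of-vocab lookup also works, via char n-grams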

Retrieve word vectors for in-vocabulary and out-of-vocabulary words:

.. sourcecode:: pycon
@@ -879,14 +880,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
>>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

"""
cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
if cant_train:
raise ValueError(
'this model cannot be trained any further, '
'if this is a native model, try loading it with '
'FastText.load_fasttext_format(path, full_model=True)'
)

super(FastText, self).train(
sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
@@ -937,13 +930,14 @@ def __contains__(self, word):
return self.wv.__contains__(word)

@classmethod
def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
def load_fasttext_format(cls, model_file, encoding='utf8'):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.

This function loads the full model. A full model allows
continuing training with more data, but also consumes more RAM and
takes longer to load. If you do not need to continue training and only
wish the work with the already-trained embeddings, use `full_model=False`
wish to work with the already-trained embeddings, use
:meth:`gensim.models.KeyedVectors.load_fasttext_format`
for faster loading and to save RAM.

Notes
@@ -963,9 +957,6 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
as Gensim requires only the `.bin` file to load the entire fastText model.
encoding : str, optional
Specifies the file encoding.
full_model : boolean, optional
If False, skips loading the hidden output matrix. This saves a fair bit
of CPU time and RAM, but **prevents training continuation**.

Examples
--------
@@ -977,41 +968,27 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>> fb_model = FastText.load_fasttext_format(cap_path)
>>>
>>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary
>>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_full.wv['landlord']
>>> oov_term = fb_model.wv['landlord']
>>>
>>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary
>>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_full.wv['landlady']
>>> iv_term = fb_model.wv['landlady']
>>>
>>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
>>> fb_full.build_vocab(new_sent, update=True)
>>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)

Load quickly, infer (forego training continuation):

.. sourcecode:: pycon

>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>>
>>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_partial.wv['landlord']
>>>
>>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_partial.wv['landlady']
>>> fb_model.build_vocab(new_sent, update=True)
>>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)

Returns
-------
gensim.models.fasttext.FastText
The loaded model.

"""
return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
return _load_fasttext_format(model_file, encoding=encoding, full_model=True)

def load_binary_data(self, encoding='utf8'):
"""Load data from a binary file created by Facebook's native FastText.
63 changes: 63 additions & 0 deletions gensim/models/keyedvectors.py
@@ -86,6 +86,23 @@
>>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format
>>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format

You can also load vectors from Facebook's fastText binary format:

.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fbkv = KeyedVectors.load_fasttext_format(cap_path)
>>>
>>> 'landlord' in fbkv.vocab # Word is out of vocabulary
False
>>> oov_vector = fbkv['landlord']
>>>
>>> 'landlady' in fbkv.vocab # Word is in the vocabulary
True
>>> iv_vector = fbkv['landlady']

What can I do with word vectors?
================================

@@ -1475,6 +1492,52 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8',
cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors,
limit=limit, datatype=datatype)

@staticmethod
def load_fasttext_format(path, encoding='utf-8'):
"""Load word embeddings from a model saved in Facebook's native fasttext `.bin` format.

Parameters
----------
path : str
The location of the model file.
encoding : str, optional
Specifies the file encoding.

Returns
-------
gensim.models.keyedvectors.FastTextKeyedVectors
The word embeddings.

Examples
--------

Load and infer:

>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fbkv = KeyedVectors.load_fasttext_format(cap_path)
>>>
>>> 'landlord' in fbkv.vocab # Word is out of vocabulary
False
>>> oov_vector = fbkv['landlord']
>>>
>>> 'landlady' in fbkv.vocab # Word is in the vocabulary
True
>>> iv_vector = fbkv['landlady']

See Also
--------

:meth:`gensim.models.fasttext.FastText.load_fasttext_format` loads
the full model, not just word embeddings, and enables you to continue
model training.

"""
from gensim.models.fasttext import _load_fasttext_format
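# full_model=False skips the hidden output matrix: loading is faster and uses
# less RAM, but the resulting embeddings cannot be trained further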
model_wrapper = _load_fasttext_format(path, full_model=False, encoding=encoding)
return model_wrapper.wv

def get_keras_embedding(self, train_embeddings=False):
"""Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings.

15 changes: 0 additions & 15 deletions gensim/test/test_fasttext.py
@@ -59,21 +59,6 @@ def setUp(self):
self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
self.test_new_model_file = datapath('lee_fasttext_new')

def test_native_partial_model(self):
"""Can we skip loading the NN and still get a working model?"""
model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)

#
# Training continuation should be impossible
#
self.assertIsNone(model.trainables.syn1neg)
self.assertRaises(ValueError, model.train, sentences,
total_examples=model.corpus_count, epochs=model.epochs)

model.wv['green']
model.wv['foobar']
model.wv['thisworddoesnotexist']

def test_training(self):
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model.build_vocab(sentences)
11 changes: 11 additions & 0 deletions gensim/test/test_keyedvectors.py
@@ -306,6 +306,17 @@ def test(self):
self.assertTrue(np.allclose(m, norm))


class LoadFastTextFormatTest(unittest.TestCase):
def test(self):
cap_path = datapath("crime-and-punishment.bin")
fbkv = gensim.models.keyedvectors.KeyedVectors.load_fasttext_format(cap_path)
self.assertFalse('landlord' in fbkv.vocab)
self.assertTrue('landlady' in fbkv.vocab)
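# fastText composes OOV vectors from character n-grams, so the OOV
# vector should differ from the in-vocabulary one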
oov_vector = fbkv['landlord']
iv_vector = fbkv['landlady']
self.assertFalse(np.allclose(oov_vector, iv_vector))


if __name__ == '__main__':
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG)
unittest.main()