Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement separate functions to load FT embeddings and models #2376

Merged
merged 7 commits into from
Mar 7, 2019
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
222 changes: 132 additions & 90 deletions gensim/models/fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,27 +171,43 @@
.. sourcecode:: pycon

>>> cap_path = datapath("crime-and-punishment.bin")
>>> # Partial model: loads quickly, uses less RAM, but cannot continue training
>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>> # Full model: loads slowly, consumes RAM, but can continue training (see below)
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>> fb_model = load_facebook_model(cap_path)

Once loaded, such models behave identically to those trained from scratch.
You may continue training them on new data:

.. sourcecode:: pycon

>>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab
>>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab
False
>>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors
>>> fb_full.build_vocab(new_sentences, update=True)
>>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_full.wv['computer']
>>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors
>>> fb_model.build_vocab(new_sentences, update=True)
>>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs)
>>> new_computer = fb_model.wv['computer']
>>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something
False
>>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary
>>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary
True

If you do not intend to continue training the model, consider using the
:func:`gensim.models.fasttext.load_facebook_vectors` function instead.
That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM:

.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> wv = load_facebook_vectors(cap_path)
>>>
>>> 'landlord' in wv.vocab # Word is out of vocabulary
False
>>> oov_vector = wv['landlord']
>>>
>>> 'landlady' in wv.vocab # Word is in the vocabulary
True
>>> iv_vector = wv['landlady']

Retrieve word-vector for vocab and out-of-vocab word:

.. sourcecode:: pycon
Expand Down Expand Up @@ -879,14 +895,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor
>>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

"""
cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None
if cant_train:
raise ValueError(
'this model cannot be trained any further, '
'if this is a native model, try loading it with '
'FastText.load_fasttext_format(path, full_model=True)'
)

super(FastText, self).train(
sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words,
epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count,
Expand Down Expand Up @@ -937,82 +945,17 @@ def __contains__(self, word):
return self.wv.__contains__(word)

@classmethod
def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.

By default, this function loads the full model. A full model allows
continuing training with more data, but also consumes more RAM and
takes longer to load. If you do not need to continue training and only
wish the work with the already-trained embeddings, use `full_model=False`
for faster loading and to save RAM.

Notes
------
Facebook provides both `.vec` and `.bin` files with their modules.
The former contains human-readable vectors.
The latter contains machine-readable vectors along with other model parameters.
This function effectively ignores `.vec` output file, since that file is redundant.
It only needs the `.bin` file.

Parameters
----------
model_file : str
Path to the FastText output files.
FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
as Gensim requires only `.bin` file to the load entire fastText model.
encoding : str, optional
Specifies the file encoding.
full_model : boolean, optional
If False, skips loading the hidden output matrix. This saves a fair bit
of CPU time and RAM, but **prevents training continuation**.

Examples
--------

Load, infer, continue training:

.. sourcecode:: pycon

>>> from gensim.test.utils import datapath
>>>
>>> cap_path = datapath("crime-and-punishment.bin")
>>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True)
>>>
>>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_full.wv['landlord']
>>>
>>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_full.wv['landlady']
>>>
>>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
>>> fb_full.build_vocab(new_sent, update=True)
>>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)

Load quickly, infer (forego training continuation):
@deprecated('use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead')
def load_fasttext_format(cls, model_file, encoding='utf8'):
"""Deprecated.

.. sourcecode:: pycon

>>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False)
>>>
>>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary
False
>>> oov_term = fb_partial.wv['landlord']
>>>
>>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary
True
>>> iv_term = fb_partial.wv['landlady']

Returns
-------
gensim.models.fasttext.FastText
The loaded model.
Use :func:`gensim.models.fasttext.load_facebook_model` or
:func:`gensim.models.fasttext.load_facebook_vectors` instead.

"""
return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model)
return load_facebook_model(model_file, encoding=encoding)

@deprecated('use load_facebook_model or load_facebook_vectors instead')
piskvorky marked this conversation as resolved.
Show resolved Hide resolved
def load_binary_data(self, encoding='utf8'):
"""Load data from a binary file created by Facebook's native FastText.

Expand Down Expand Up @@ -1220,6 +1163,105 @@ def _pad_ones(m, new_shape):
return vstack([m, suffix])


def load_facebook_model(path, encoding='utf-8'):
    """Load a full model from a file in Facebook's native fasttext `.bin` format.

    Notes
    ------
    Facebook provides both `.vec` and `.bin` files with their models.
    The `.vec` file contains human-readable vectors.
    The `.bin` file contains machine-readable vectors along with other model parameters.
    This function only needs the `.bin` file; the `.vec` file is redundant and is effectively ignored.

    Parameters
    ----------
    path : str
        Path to the FastText output files.
        FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin`
        Expected value for this example: `/path/to/model` or `/path/to/model.bin`,
        as Gensim requires only the `.bin` file to load the entire fastText model.
    encoding : str, optional
        Specifies the file encoding.

    Examples
    --------

    Load, infer, continue training:

    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>>
        >>> cap_path = datapath("crime-and-punishment.bin")
        >>> fb_model = load_facebook_model(cap_path)
        >>>
        >>> 'landlord' in fb_model.wv.vocab  # Word is out of vocabulary
        False
        >>> oov_term = fb_model.wv['landlord']
        >>>
        >>> 'landlady' in fb_model.wv.vocab  # Word is in the vocabulary
        True
        >>> iv_term = fb_model.wv['landlady']
        >>>
        >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']]
        >>> fb_model.build_vocab(new_sent, update=True)
        >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5)

    Returns
    -------
    gensim.models.fasttext.FastText
        The loaded model.

    """
    # full_model=True asks the shared loader for the complete model rather than
    # the embeddings-only variant used by load_facebook_vectors.
    loaded_model = _load_fasttext_format(path, encoding=encoding, full_model=True)
    return loaded_model


def load_facebook_vectors(path, encoding='utf-8'):
    """Load only the word embeddings from a model saved in Facebook's native fasttext `.bin` format.

    Parameters
    ----------
    path : str
        The location of the model file.
    encoding : str, optional
        Specifies the file encoding.

    Returns
    -------
    gensim.models.keyedvectors.FastTextKeyedVectors
        The word embeddings.

    Examples
    --------

    Load and infer:

    .. sourcecode:: pycon

        >>> from gensim.test.utils import datapath
        >>>
        >>> cap_path = datapath("crime-and-punishment.bin")
        >>> fbkv = load_facebook_vectors(cap_path)
        >>>
        >>> 'landlord' in fbkv.vocab  # Word is out of vocabulary
        False
        >>> oov_vector = fbkv['landlord']
        >>>
        >>> 'landlady' in fbkv.vocab  # Word is in the vocabulary
        True
        >>> iv_vector = fbkv['landlady']

    See Also
    --------

    :func:`gensim.models.fasttext.load_facebook_model` loads
    the full model, not just word embeddings, and enables you to continue
    model training.

    """
    # The loader is called with full_model=False; only the `.wv` attribute
    # (the keyed vectors) of the resulting wrapper is handed back to the caller.
    return _load_fasttext_format(path, encoding=encoding, full_model=False).wv


def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
"""Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files.

Expand Down
44 changes: 20 additions & 24 deletions gensim/test/test_fasttext.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences

import gensim.models.fasttext

try:
from pyemd import emd # noqa:F401
PYEMD_EXT = True
Expand Down Expand Up @@ -56,24 +58,9 @@ def setUp(self):
ft_home = os.environ.get('FT_HOME', None)
self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None
self.test_model_file = datapath('lee_fasttext')
self.test_model = FT_gensim.load_fasttext_format(self.test_model_file)
self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
self.test_new_model_file = datapath('lee_fasttext_new')

def test_native_partial_model(self):
"""Can we skip loading the NN and still get a working model?"""
model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False)

#
# Training continuation should be impossible
#
self.assertIsNone(model.trainables.syn1neg)
self.assertRaises(ValueError, model.train, sentences,
total_examples=model.corpus_count, epochs=model.epochs)

model.wv['green']
model.wv['foobar']
model.wv['thisworddoesnotexist']

def test_training(self):
model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1)
model.build_vocab(sentences)
Expand Down Expand Up @@ -202,7 +189,7 @@ def model_sanity(self, model):

def test_load_fasttext_format(self):
try:
model = FT_gensim.load_fasttext_format(self.test_model_file)
model = gensim.models.fasttext.load_facebook_model(self.test_model_file)
except Exception as exc:
self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc))
vocab_size, model_size = 1762, 10
Expand Down Expand Up @@ -255,7 +242,7 @@ def test_load_fasttext_format(self):

def test_load_fasttext_new_format(self):
try:
new_model = FT_gensim.load_fasttext_format(self.test_new_model_file)
new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file)
except Exception as exc:
self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc))
vocab_size, model_size = 1763, 10
Expand Down Expand Up @@ -308,18 +295,18 @@ def test_load_fasttext_new_format(self):

def test_load_model_supervised(self):
with self.assertRaises(NotImplementedError):
FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext'))
gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext'))

def test_load_model_with_non_ascii_vocab(self):
model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext'))
model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext'))
self.assertTrue(u'který' in model.wv)
try:
model.wv[u'který']
except UnicodeDecodeError:
self.fail('Unable to access vector for utf8 encoded non-ascii word')

def test_load_model_non_utf8_encoding(self):
model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852')
self.assertTrue(u'který' in model.wv)
try:
model.wv[u'který']
Expand Down Expand Up @@ -891,7 +878,7 @@ def load_native():
# ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5
#
path = datapath('toy-model.bin')
model = FT_gensim.load_fasttext_format(path)
model = gensim.models.fasttext.load_facebook_model(path)
return model


Expand Down Expand Up @@ -1115,11 +1102,20 @@ def test_save_load_native(self):
model.save(model_name)

def test_load_native_pretrained(self):
model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin'))
model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin'))
actual = model['monarchist']
expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508])
self.assertTrue(np.allclose(expected, actual, atol=10e-4))

def test_load_native_vectors(self):
    """Vectors loaded via load_facebook_vectors answer both in-vocab and out-of-vocab lookups."""
    vectors = gensim.models.fasttext.load_facebook_vectors(datapath("crime-and-punishment.bin"))

    # 'landlord' is absent from the stored vocabulary, 'landlady' is present.
    self.assertFalse('landlord' in vectors.vocab)
    self.assertTrue('landlady' in vectors.vocab)

    # Both lookups must succeed, and the two words must not map to the same vector.
    landlord_vector = vectors['landlord']
    landlady_vector = vectors['landlady']
    self.assertFalse(np.allclose(landlord_vector, landlady_vector))


def _train_model_with_pretrained_vectors():
"""Generate toy-model-pretrained.bin for use in test_load_native_pretrained.
Expand Down Expand Up @@ -1170,7 +1166,7 @@ def setUp(self):
#
# ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5 # noqa: E501
#
self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin'))
self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin'))
with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin:
self.expected = dict(load_vec(fin))

Expand Down