From e5951d0335443a842937d7a182ab630aa2104553 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Feb 2019 10:12:18 +1100 Subject: [PATCH 1/7] implement KeyedVectors.load_fasttext_format method --- gensim/models/fasttext.py | 67 +++++++++++--------------------- gensim/models/keyedvectors.py | 62 +++++++++++++++++++++++++++++ gensim/test/test_fasttext.py | 15 ------- gensim/test/test_keyedvectors.py | 11 ++++++ 4 files changed, 95 insertions(+), 60 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 7fd1db6dbc..e9ca471311 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -171,27 +171,28 @@ .. sourcecode:: pycon >>> cap_path = datapath("crime-and-punishment.bin") - >>> # Partial model: loads quickly, uses less RAM, but cannot continue training - >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) - >>> # Full model: loads slowly, consumes RAM, but can continue training (see below) - >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) + >>> fb_model = FastText.load_fasttext_format(cap_path) Once loaded, such models behave identically to those trained from scratch. You may continue training them on new data: .. 
sourcecode:: pycon - >>> 'computer' in fb_full.wv.vocab # New word, currently out of vocab + >>> 'computer' in fb_model.wv.vocab # New word, currently out of vocab False - >>> old_computer = np.copy(fb_full.wv['computer']) # Calculate current vectors - >>> fb_full.build_vocab(new_sentences, update=True) - >>> fb_full.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) - >>> new_computer = fb_full.wv['computer'] + >>> old_computer = np.copy(fb_model.wv['computer']) # Calculate current vectors + >>> fb_model.build_vocab(new_sentences, update=True) + >>> fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=model.epochs) + >>> new_computer = fb_model.wv['computer'] >>> np.allclose(old_computer, new_computer, atol=1e-4) # Vector has changed, model has learnt something False - >>> 'computer' in fb_full.wv.vocab # New word is now in the vocabulary + >>> 'computer' in fb_model.wv.vocab # New word is now in the vocabulary True +If you do not intend to continue training the model, consider using the +:meth:`gensim.models.KeyedVectors.from_fasttext_format` method instead. +That method only loads the word embeddings, consuming much less CPU and RAM. + Retrieve word-vector for vocab and out-of-vocab word: .. 
sourcecode:: pycon @@ -879,14 +880,6 @@ def train(self, sentences=None, corpus_file=None, total_examples=None, total_wor >>> model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs) """ - cant_train = hasattr(self.trainables, 'syn1neg') and self.trainables.syn1neg is None - if cant_train: - raise ValueError( - 'this model cannot be trained any further, ' - 'if this is a native model, try loading it with ' - 'FastText.load_fasttext_format(path, full_model=True)' - ) - super(FastText, self).train( sentences=sentences, corpus_file=corpus_file, total_examples=total_examples, total_words=total_words, epochs=epochs, start_alpha=start_alpha, end_alpha=end_alpha, word_count=word_count, @@ -937,13 +930,14 @@ def __contains__(self, word): return self.wv.__contains__(word) @classmethod - def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): + def load_fasttext_format(cls, model_file, encoding='utf8'): """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. By default, this function loads the full model. A full model allows continuing training with more data, but also consumes more RAM and takes longer to load. If you do not need to continue training and only - wish the work with the already-trained embeddings, use `full_model=False` + wish the work with the already-trained embeddings, use + :meth:`gensim.models.KeyedVectors.from_fasttext_format` for faster loading and to save RAM. Notes @@ -963,9 +957,6 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): as Gensim requires only `.bin` file to the load entire fastText model. encoding : str, optional Specifies the file encoding. - full_model : boolean, optional - If False, skips loading the hidden output matrix. This saves a fair bit - of CPU time and RAM, but **prevents training continuation**. 
Examples -------- @@ -977,33 +968,19 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): >>> from gensim.test.utils import datapath >>> >>> cap_path = datapath("crime-and-punishment.bin") - >>> fb_full = FastText.load_fasttext_format(cap_path, full_model=True) + >>> fb_model = FastText.load_fasttext_format(cap_path) >>> - >>> 'landlord' in fb_full.wv.vocab # Word is out of vocabulary + >>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary False - >>> oov_term = fb_full.wv['landlord'] + >>> oov_term = fb_model.wv['landlord'] >>> - >>> 'landlady' in fb_full.wv.vocab # Word is in the vocabulary + >>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary True - >>> iv_term = fb_full.wv['landlady'] + >>> iv_term = fb_model.wv['landlady'] >>> >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] - >>> fb_full.build_vocab(new_sent, update=True) - >>> fb_full.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) - - Load quickly, infer (forego training continuation): - - .. sourcecode:: pycon - - >>> fb_partial = FastText.load_fasttext_format(cap_path, full_model=False) - >>> - >>> 'landlord' in fb_partial.wv.vocab # Word is out of vocabulary - False - >>> oov_term = fb_partial.wv['landlord'] - >>> - >>> 'landlady' in fb_partial.wv.vocab # Word is in the vocabulary - True - >>> iv_term = fb_partial.wv['landlady'] + >>> fb_model.build_vocab(new_sent, update=True) + >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) Returns ------- @@ -1011,7 +988,7 @@ def load_fasttext_format(cls, model_file, encoding='utf8', full_model=True): The loaded model. """ - return _load_fasttext_format(model_file, encoding=encoding, full_model=full_model) + return _load_fasttext_format(model_file, encoding=encoding, full_model=True) def load_binary_data(self, encoding='utf8'): """Load data from a binary file created by Facebook's native FastText. 
diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 1428503c8a..c53966dd7d 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -86,6 +86,22 @@ >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format +You can also load vectors from Facebook's fastText binary format: + +.. sourcecode:: pycon + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fbkv = KeyedVectors.load_fasttext_format(cap_path) + >>> + >>> 'landlord' in fbkv.vocab # Word is out of vocabulary + False + >>> oov_vector = fbkv['landlord'] + >>> + >>> 'landlady' in fbkv.vocab # Word is in the vocabulary + True + >>> iv_vector = fbkv['landlady'] + What can I do with word vectors? ================================ @@ -1475,6 +1491,52 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors, limit=limit, datatype=datatype) + @staticmethod + def load_fasttext_format(path, encoding='utf-8'): + """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format. + + Parameters + ---------- + path : str + The location of the model file. + encoding : str, optional + Specifies the file encoding. + + Returns + ------- + gensim.models.keyedvectors.FastTextKeyedVectors + The word embeddings. 
+ + Examples + -------- + + Load and infer: + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fbkv = KeyedVectors.load_fasttext_format(cap_path) + >>> + >>> 'landlord' in fbkv.vocab # Word is out of vocabulary + False + >>> oov_vector = fbkv['landlord'] + >>> + >>> 'landlady' in fbkv.vocab # Word is in the vocabulary + True + >>> iv_vector = fbkv['landlady'] + + See Also + -------- + + :method:`gensim.models.fasttext.FastText.load_fasttext_format` loads + the full model, not just word embeddings, and enables you to continue + model training. + + """ + from gensim.models.fasttext import _load_fasttext_format + model_wrapper = _load_fasttext_format(path, full_model=False, encoding=encoding) + return model_wrapper.wv + def get_keras_embedding(self, train_embeddings=False): """Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings. diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 71c6d55feb..3abf6495fa 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -59,21 +59,6 @@ def setUp(self): self.test_model = FT_gensim.load_fasttext_format(self.test_model_file) self.test_new_model_file = datapath('lee_fasttext_new') - def test_native_partial_model(self): - """Can we skip loading the NN and still get a working model?""" - model = FT_gensim.load_fasttext_format(self.test_model_file, full_model=False) - - # - # Training continuation should be impossible - # - self.assertIsNone(model.trainables.syn1neg) - self.assertRaises(ValueError, model.train, sentences, - total_examples=model.corpus_count, epochs=model.epochs) - - model.wv['green'] - model.wv['foobar'] - model.wv['thisworddoesnotexist'] - def test_training(self): model = FT_gensim(size=10, min_count=1, hs=1, negative=0, seed=42, workers=1) model.build_vocab(sentences) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index 
59e361cc6c..d1988facf0 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -306,6 +306,17 @@ def test(self): self.assertTrue(np.allclose(m, norm)) +class LoadFastTextFormatTest(unittest.TestCase): + def test(self): + cap_path = datapath("crime-and-punishment.bin") + fbkv = gensim.models.keyedvectors.KeyedVectors.load_fasttext_format(cap_path) + self.assertFalse('landlord' in fbkv.vocab) + self.assertTrue('landlady' in fbkv.vocab) + oov_vector = fbkv['landlord'] + iv_vector = fbkv['landlady'] + self.assertFalse(np.allclose(oov_vector, iv_vector)) + + if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From 5c7d4542d0d4b12a1687ac545f5d2493f3382354 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Feb 2019 10:33:00 +1100 Subject: [PATCH 2/7] minor doco fixup --- gensim/models/keyedvectors.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index c53966dd7d..7b731889f5 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -89,6 +89,7 @@ You can also load vectors from Facebook's fastText binary format: .. sourcecode:: pycon + >>> from gensim.test.utils import datapath >>> >>> cap_path = datapath("crime-and-punishment.bin") @@ -1528,7 +1529,7 @@ def load_fasttext_format(path, encoding='utf-8'): See Also -------- - :method:`gensim.models.fasttext.FastText.load_fasttext_format` loads + :meth:`gensim.models.fasttext.FastText.load_fasttext_format` loads the full model, not just word embeddings, and enables you to continue model training. 
From d7160c8f930daa8f45d1a15af2dd8952e4d4fd76 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 15 Feb 2019 09:47:51 +0900 Subject: [PATCH 3/7] introduced pure functions for facebook model/vector I/O --- gensim/models/fasttext.py | 181 +++++++++++++++++++++---------- gensim/models/keyedvectors.py | 63 ----------- gensim/test/test_fasttext.py | 29 +++-- gensim/test/test_keyedvectors.py | 11 -- 4 files changed, 143 insertions(+), 141 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index e9ca471311..ab047f2437 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -171,7 +171,7 @@ .. sourcecode:: pycon >>> cap_path = datapath("crime-and-punishment.bin") - >>> fb_model = FastText.load_fasttext_format(cap_path) + >>> fb_model = load_facebook_model(cap_path) Once loaded, such models behave identically to those trained from scratch. You may continue training them on new data: @@ -190,8 +190,23 @@ True If you do not intend to continue training the model, consider using the -:meth:`gensim.models.KeyedVectors.from_fasttext_format` method instead. -That method only loads the word embeddings, consuming much less CPU and RAM. +:func:`gensim.models.fasttext.load_facebook_vectors` function instead. +That function only loads the word embeddings (keyed vectors), consuming much less CPU and RAM: + +.. 
sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> wv = load_facebook_vectors(cap_path) + >>> + >>> 'landlord' in wv.vocab # Word is out of vocabulary + False + >>> oov_vector = wv['landlord'] + >>> + >>> 'landlady' in wv.vocab # Word is in the vocabulary + True + >>> iv_vector = wv['landlady'] Retrieve word-vector for vocab and out-of-vocab word: @@ -929,67 +944,18 @@ def __contains__(self, word): """ return self.wv.__contains__(word) + @deprecated('use load_facebook_model or load_facebook_vectors instead') @classmethod def load_fasttext_format(cls, model_file, encoding='utf8'): - """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. - - By default, this function loads the full model. A full model allows - continuing training with more data, but also consumes more RAM and - takes longer to load. If you do not need to continue training and only - wish the work with the already-trained embeddings, use - :meth:`gensim.models.KeyedVectors.from_fasttext_format` - for faster loading and to save RAM. - - Notes - ------ - Facebook provides both `.vec` and `.bin` files with their modules. - The former contains human-readable vectors. - The latter contains machine-readable vectors along with other model parameters. - This function effectively ignores `.vec` output file, since that file is redundant. - It only needs the `.bin` file. + """Deprecated. - Parameters - ---------- - model_file : str - Path to the FastText output files. - FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` - Expected value for this example: `/path/to/model` or `/path/to/model.bin`, - as Gensim requires only `.bin` file to the load entire fastText model. - encoding : str, optional - Specifies the file encoding. - - Examples - -------- - - Load, infer, continue training: - - .. 
sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> - >>> cap_path = datapath("crime-and-punishment.bin") - >>> fb_model = FastText.load_fasttext_format(cap_path) - >>> - >>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary - False - >>> oov_term = fb_model.wv['landlord'] - >>> - >>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary - True - >>> iv_term = fb_model.wv['landlady'] - >>> - >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] - >>> fb_model.build_vocab(new_sent, update=True) - >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) - - Returns - ------- - gensim.models.fasttext.FastText - The loaded model. + Use :func:`gensim.models.fasttext.load_facebook_model` or + :func:`gensim.models.fasttext.load_facebook_vectors` instead. """ - return _load_fasttext_format(model_file, encoding=encoding, full_model=True) + return load_facebook_model(model_file, encoding=encoding) + @deprecated('use load_facebook_model or load_facebook_vectors instead') def load_binary_data(self, encoding='utf8'): """Load data from a binary file created by Facebook's native FastText. @@ -1197,6 +1163,105 @@ def _pad_ones(m, new_shape): return vstack([m, suffix]) +def load_facebook_model(path, encoding='utf-8'): + """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` output file. + + Notes + ------ + Facebook provides both `.vec` and `.bin` files with their modules. + The former contains human-readable vectors. + The latter contains machine-readable vectors along with other model parameters. + This function effectively ignores `.vec` output file, since that file is redundant. + It only needs the `.bin` file. + + Parameters + ---------- + path : str + Path to the FastText output files. 
+ FastText outputs two model files - `/path/to/model.vec` and `/path/to/model.bin` + Expected value for this example: `/path/to/model` or `/path/to/model.bin`, + as Gensim requires only the `.bin` file to load the entire fastText model. + encoding : str, optional + Specifies the file encoding. + + Examples + -------- + + Load, infer, continue training: + + .. sourcecode:: pycon + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fb_model = load_facebook_model(cap_path) + >>> + >>> 'landlord' in fb_model.wv.vocab # Word is out of vocabulary + False + >>> oov_term = fb_model.wv['landlord'] + >>> + >>> 'landlady' in fb_model.wv.vocab # Word is in the vocabulary + True + >>> iv_term = fb_model.wv['landlady'] + >>> + >>> new_sent = [['lord', 'of', 'the', 'rings'], ['lord', 'of', 'the', 'flies']] + >>> fb_model.build_vocab(new_sent, update=True) + >>> fb_model.train(sentences=new_sent, total_examples=len(new_sent), epochs=5) + + Returns + ------- + gensim.models.fasttext.FastText + The loaded model. + + """ + return _load_fasttext_format(path, encoding=encoding, full_model=True) + + +def load_facebook_vectors(path, encoding='utf-8'): + """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format. + + Parameters + ---------- + path : str + The location of the model file. + encoding : str, optional + Specifies the file encoding. + + Returns + ------- + gensim.models.keyedvectors.FastTextKeyedVectors + The word embeddings. 
+ + Examples + -------- + + Load and infer: + + >>> from gensim.test.utils import datapath + >>> + >>> cap_path = datapath("crime-and-punishment.bin") + >>> fbkv = load_facebook_vectors(cap_path) + >>> + >>> 'landlord' in fbkv.vocab # Word is out of vocabulary + False + >>> oov_vector = fbkv['landlord'] + >>> + >>> 'landlady' in fbkv.vocab # Word is in the vocabulary + True + >>> iv_vector = fbkv['landlady'] + + See Also + -------- + + :func:`gensim.models.fasttext.load_facebook_model` loads + the full model, not just word embeddings, and enables you to continue + model training. + + """ + model_wrapper = _load_fasttext_format(path, encoding=encoding, full_model=False) + return model_wrapper.wv + + def _load_fasttext_format(model_file, encoding='utf-8', full_model=True): """Load the input-hidden weight matrix from Facebook's native fasttext `.bin` and `.vec` output files. diff --git a/gensim/models/keyedvectors.py b/gensim/models/keyedvectors.py index 7b731889f5..1428503c8a 100644 --- a/gensim/models/keyedvectors.py +++ b/gensim/models/keyedvectors.py @@ -86,23 +86,6 @@ >>> wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False) # C text format >>> wv_from_bin = KeyedVectors.load_word2vec_format(datapath("euclidean_vectors.bin"), binary=True) # C bin format -You can also load vectors from Facebook's fastText binary format: - -.. sourcecode:: pycon - - >>> from gensim.test.utils import datapath - >>> - >>> cap_path = datapath("crime-and-punishment.bin") - >>> fbkv = KeyedVectors.load_fasttext_format(cap_path) - >>> - >>> 'landlord' in fbkv.vocab # Word is out of vocabulary - False - >>> oov_vector = fbkv['landlord'] - >>> - >>> 'landlady' in fbkv.vocab # Word is in the vocabulary - True - >>> iv_vector = fbkv['landlady'] - What can I do with word vectors? 
================================ @@ -1492,52 +1475,6 @@ def load_word2vec_format(cls, fname, fvocab=None, binary=False, encoding='utf8', cls, fname, fvocab=fvocab, binary=binary, encoding=encoding, unicode_errors=unicode_errors, limit=limit, datatype=datatype) - @staticmethod - def load_fasttext_format(path, encoding='utf-8'): - """Load word embeddings from a model saved in Facebook's native fasttext `.bin` format. - - Parameters - ---------- - path : str - The location of the model file. - encoding : str, optional - Specifies the file encoding. - - Returns - ------- - gensim.models.keyedvectors.FastTextKeyedVectors - The word embeddings. - - Examples - -------- - - Load and infer: - - >>> from gensim.test.utils import datapath - >>> - >>> cap_path = datapath("crime-and-punishment.bin") - >>> fbkv = KeyedVectors.load_fasttext_format(cap_path) - >>> - >>> 'landlord' in fbkv.vocab # Word is out of vocabulary - False - >>> oov_vector = fbkv['landlord'] - >>> - >>> 'landlady' in fbkv.vocab # Word is in the vocabulary - True - >>> iv_vector = fbkv['landlady'] - - See Also - -------- - - :meth:`gensim.models.fasttext.FastText.load_fasttext_format` loads - the full model, not just word embeddings, and enables you to continue - model training. - - """ - from gensim.models.fasttext import _load_fasttext_format - model_wrapper = _load_fasttext_format(path, full_model=False, encoding=encoding) - return model_wrapper.wv - def get_keras_embedding(self, train_embeddings=False): """Get a Keras 'Embedding' layer with weights set as the Word2Vec model's learned word embeddings. 
diff --git a/gensim/test/test_fasttext.py b/gensim/test/test_fasttext.py index 3abf6495fa..98fa30387e 100644 --- a/gensim/test/test_fasttext.py +++ b/gensim/test/test_fasttext.py @@ -20,6 +20,8 @@ from gensim.models.keyedvectors import Word2VecKeyedVectors from gensim.test.utils import datapath, get_tmpfile, temporary_file, common_texts as sentences +import gensim.models.fasttext + try: from pyemd import emd # noqa:F401 PYEMD_EXT = True @@ -56,7 +58,7 @@ def setUp(self): ft_home = os.environ.get('FT_HOME', None) self.ft_path = os.path.join(ft_home, 'fasttext') if ft_home else None self.test_model_file = datapath('lee_fasttext') - self.test_model = FT_gensim.load_fasttext_format(self.test_model_file) + self.test_model = gensim.models.fasttext.load_facebook_model(self.test_model_file) self.test_new_model_file = datapath('lee_fasttext_new') def test_training(self): @@ -187,7 +189,7 @@ def model_sanity(self, model): def test_load_fasttext_format(self): try: - model = FT_gensim.load_fasttext_format(self.test_model_file) + model = gensim.models.fasttext.load_facebook_model(self.test_model_file) except Exception as exc: self.fail('Unable to load FastText model from file %s: %s' % (self.test_model_file, exc)) vocab_size, model_size = 1762, 10 @@ -240,7 +242,7 @@ def test_load_fasttext_format(self): def test_load_fasttext_new_format(self): try: - new_model = FT_gensim.load_fasttext_format(self.test_new_model_file) + new_model = gensim.models.fasttext.load_facebook_model(self.test_new_model_file) except Exception as exc: self.fail('Unable to load FastText model from file %s: %s' % (self.test_new_model_file, exc)) vocab_size, model_size = 1763, 10 @@ -293,10 +295,10 @@ def test_load_fasttext_new_format(self): def test_load_model_supervised(self): with self.assertRaises(NotImplementedError): - FT_gensim.load_fasttext_format(datapath('pang_lee_polarity_fasttext')) + gensim.models.fasttext.load_facebook_model(datapath('pang_lee_polarity_fasttext')) def 
test_load_model_with_non_ascii_vocab(self): - model = FT_gensim.load_fasttext_format(datapath('non_ascii_fasttext')) + model = gensim.models.fasttext.load_facebook_model(datapath('non_ascii_fasttext')) self.assertTrue(u'který' in model.wv) try: model.wv[u'který'] @@ -304,7 +306,7 @@ def test_load_model_with_non_ascii_vocab(self): self.fail('Unable to access vector for utf8 encoded non-ascii word') def test_load_model_non_utf8_encoding(self): - model = FT_gensim.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852') + model = gensim.models.fasttext.load_facebook_model(datapath('cp852_fasttext'), encoding='cp852') self.assertTrue(u'který' in model.wv) try: model.wv[u'který'] @@ -876,7 +878,7 @@ def load_native(): # ./fasttext cbow -input toy-data.txt -output toy-model -bucket 100 -dim 5 # path = datapath('toy-model.bin') - model = FT_gensim.load_fasttext_format(path) + model = gensim.models.fasttext.load_facebook_model(path) return model @@ -1100,11 +1102,20 @@ def test_save_load_native(self): model.save(model_name) def test_load_native_pretrained(self): - model = FT_gensim.load_fasttext_format(datapath('toy-model-pretrained.bin')) + model = gensim.models.fasttext.load_facebook_model(datapath('toy-model-pretrained.bin')) actual = model['monarchist'] expected = np.array([0.76222, 1.0669, 0.7055, -0.090969, -0.53508]) self.assertTrue(np.allclose(expected, actual, atol=10e-4)) + def test_load_native_vectors(self): + cap_path = datapath("crime-and-punishment.bin") + fbkv = gensim.models.fasttext.load_facebook_vectors(cap_path) + self.assertFalse('landlord' in fbkv.vocab) + self.assertTrue('landlady' in fbkv.vocab) + oov_vector = fbkv['landlord'] + iv_vector = fbkv['landlady'] + self.assertFalse(np.allclose(oov_vector, iv_vector)) + def _train_model_with_pretrained_vectors(): """Generate toy-model-pretrained.bin for use in test_load_native_pretrained. 
@@ -1155,7 +1166,7 @@ def setUp(self): # # ./fasttext skipgram -minCount 0 -bucket 100 -input crime-and-punishment.txt -output crime-and-punishment -dim 5 # noqa: E501 # - self.model = FT_gensim.load_fasttext_format(datapath('crime-and-punishment.bin')) + self.model = gensim.models.fasttext.load_facebook_model(datapath('crime-and-punishment.bin')) with smart_open.smart_open(datapath('crime-and-punishment.vec'), 'r', encoding='utf-8') as fin: self.expected = dict(load_vec(fin)) diff --git a/gensim/test/test_keyedvectors.py b/gensim/test/test_keyedvectors.py index d1988facf0..59e361cc6c 100644 --- a/gensim/test/test_keyedvectors.py +++ b/gensim/test/test_keyedvectors.py @@ -306,17 +306,6 @@ def test(self): self.assertTrue(np.allclose(m, norm)) -class LoadFastTextFormatTest(unittest.TestCase): - def test(self): - cap_path = datapath("crime-and-punishment.bin") - fbkv = gensim.models.keyedvectors.KeyedVectors.load_fasttext_format(cap_path) - self.assertFalse('landlord' in fbkv.vocab) - self.assertTrue('landlady' in fbkv.vocab) - oov_vector = fbkv['landlord'] - iv_vector = fbkv['landlady'] - self.assertFalse(np.allclose(oov_vector, iv_vector)) - - if __name__ == '__main__': logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.DEBUG) unittest.main() From be89ebdc8a7f7681c8de36d81a533f8fc6cfad54 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 15 Feb 2019 10:18:44 +0900 Subject: [PATCH 4/7] adjust decorators --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index ab047f2437..7885a4e1c7 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -944,8 +944,8 @@ def __contains__(self, word): """ return self.wv.__contains__(word) - @deprecated('use load_facebook_model or load_facebook_vectors instead') @classmethod + @deprecated('use load_facebook_model or load_facebook_vectors instead') def load_fasttext_format(cls, 
model_file, encoding='utf8'): """Deprecated. From d637cfed1e4b7237a0aed6989308867a61b4213d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Radim=20=C5=98eh=C5=AF=C5=99ek?= Date: Fri, 15 Feb 2019 18:01:00 +0900 Subject: [PATCH 5/7] Update gensim/models/fasttext.py Co-Authored-By: mpenkov --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 7885a4e1c7..6407313faf 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -945,7 +945,7 @@ def __contains__(self, word): return self.wv.__contains__(word) @classmethod - @deprecated('use load_facebook_model or load_facebook_vectors instead') + @deprecated('use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead') def load_fasttext_format(cls, model_file, encoding='utf8'): """Deprecated. From cc3068dca8e86966c6783ccf5d4a38e6b2a6dca2 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Fri, 15 Feb 2019 20:20:30 +0900 Subject: [PATCH 6/7] split long deprecation warning across multiple lines --- gensim/models/fasttext.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index 6407313faf..e19f117288 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -945,7 +945,10 @@ def __contains__(self, word): return self.wv.__contains__(word) @classmethod - @deprecated('use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model (to continue training with the loaded full model, more RAM) instead') + @deprecated( + 'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model ' + '(to continue training with the loaded full model, more RAM) instead' + ) def load_fasttext_format(cls, model_file, encoding='utf8'): """Deprecated. 
@@ -955,7 +958,10 @@ def load_fasttext_format(cls, model_file, encoding='utf8'): """ return load_facebook_model(model_file, encoding=encoding) - @deprecated('use load_facebook_model or load_facebook_vectors instead') + @deprecated( + 'use load_facebook_vectors (to use pretrained embeddings) or load_facebook_model ' + '(to continue training with the loaded full model, more RAM) instead' + ) def load_binary_data(self, encoding='utf8'): """Load data from a binary file created by Facebook's native FastText. From c19b4982da05d68d209d4a92b7d5a42def917905 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Thu, 7 Mar 2019 13:31:21 +0900 Subject: [PATCH 7/7] adjust comment --- gensim/models/fasttext.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py index e19f117288..2a8a861f9e 100644 --- a/gensim/models/fasttext.py +++ b/gensim/models/fasttext.py @@ -433,7 +433,7 @@ class FastText(BaseWordEmbeddingsModel): The model can be stored/loaded via its :meth:`~gensim.models.fasttext.FastText.save` and :meth:`~gensim.models.fasttext.FastText.load` methods, or loaded from a format compatible with the original - Fasttext implementation via :meth:`~gensim.models.fasttext.FastText.load_fasttext_format`. + Fasttext implementation via :func:`~gensim.models.fasttext.load_facebook_model`. Attributes ----------