[MRG] Load FastText models with specified encoding (piskvorky#1189)
* Fixes the fasttext wrapper file header

* Allows a user-specified encoding when loading fasttext models, with corresponding tests
jayantj authored and Pranaydeep Singh committed Mar 21, 2017
1 parent 1474c30 commit b285dab
Showing 4 changed files with 192 additions and 9 deletions.
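
For a quick sense of the API change, here is a minimal usage sketch (not part of the commit; it assumes the cp852 test fixture added below, and `load_fasttext_format` takes the basename shared by the `.bin`/`.vec` pair):

    from gensim.models.wrappers import fasttext

    # gensim appends '.bin' and '.vec' to the basename itself.
    # Before this change the vocab was always decoded as utf8, so a
    # cp852-encoded model could not be loaded at all.
    model = fasttext.FastText.load_fasttext_format('cp852_fasttext', encoding='cp852')
    print(u'který' in model)   # True
    print(model[u'který'])     # the word's vector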
17 changes: 9 additions & 8 deletions gensim/models/wrappers/fasttext.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2013 Radim Rehurek <me@radimrehurek.com>
+# Author: Jayant Jain <jayantjain1992@gmail.com>
+# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
@@ -221,7 +222,7 @@ def load_word2vec_format(cls, *args, **kwargs):
         return FastTextKeyedVectors.load_word2vec_format(*args, **kwargs)
 
     @classmethod
-    def load_fasttext_format(cls, model_file):
+    def load_fasttext_format(cls, model_file, encoding='utf8'):
         """
         Load the input-hidden weight matrix from the fast text output files.
 
@@ -234,8 +235,8 @@ def load_fasttext_format(cls, model_file):
         """
         model = cls()
-        model.wv = cls.load_word2vec_format('%s.vec' % model_file)
-        model.load_binary_data('%s.bin' % model_file)
+        model.wv = cls.load_word2vec_format('%s.vec' % model_file, encoding=encoding)
+        model.load_binary_data('%s.bin' % model_file, encoding=encoding)
         return model
 
     @classmethod
@@ -248,11 +249,11 @@ def delete_training_files(cls, model_file):
             logger.debug('Training files %s not found when attempting to delete', model_file)
             pass
 
-    def load_binary_data(self, model_binary_file):
+    def load_binary_data(self, model_binary_file, encoding='utf8'):
         """Loads data from the output binary file created by FastText training"""
         with utils.smart_open(model_binary_file, 'rb') as f:
             self.load_model_params(f)
-            self.load_dict(f)
+            self.load_dict(f, encoding=encoding)
             self.load_vectors(f)
 
     def load_model_params(self, file_handle):
@@ -270,7 +271,7 @@ def load_model_params(self, file_handle):
         self.wv.max_n = maxn
         self.sample = t
 
-    def load_dict(self, file_handle):
+    def load_dict(self, file_handle, encoding='utf8'):
         (vocab_size, nwords, _) = self.struct_unpack(file_handle, '@3i')
         # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
         assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
@@ -283,7 +284,7 @@ def load_dict(self, file_handle):
             while char_byte != b'\x00':
                 word_bytes += char_byte
                 char_byte = file_handle.read(1)
-            word = word_bytes.decode('utf8')
+            word = word_bytes.decode(encoding)
             count, _ = self.struct_unpack(file_handle, '@ib')
             _ = self.struct_unpack(file_handle, '@i')
             assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
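
The substance of the change is in `load_dict`: each vocabulary word is stored in the `.bin` file as a null-terminated byte string, and the decode step now honours the caller's encoding instead of hard-coding utf8. A standalone sketch of that read loop (`read_word` is an illustrative helper, not a gensim function):

    import io

    def read_word(file_handle, encoding='utf8'):
        """Read one null-terminated word from a fastText .bin file handle."""
        word_bytes = b''
        char_byte = file_handle.read(1)
        while char_byte != b'\x00':
            word_bytes += char_byte
            char_byte = file_handle.read(1)
        return word_bytes.decode(encoding)

    # b'kter\xec' is 'který' in cp852 (0xEC = 'ý'); decoding it as utf8 fails.
    handle = io.BytesIO(b'kter\xec\x00')
    assert read_word(handle, encoding='cp852') == u'který'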
Binary file added gensim/test/test_data/cp852_fasttext.bin (not shown)
172 changes: 172 additions & 0 deletions gensim/test/test_data/cp852_fasttext.vec
@@ -0,0 +1,172 @@
171 2
ji -0.79132 1.9605
který -0.90811 1.6411
jen -0.91547 2.0157
podle -0.64689 1.6221
zde -0.79732 2.4019
už -0.69159 1.7167
být -0.455 1.3266
více -0.75901 1.688
bude -0.71114 2.0771
již -0.73027 1.267
než -0.97888 1.8332
vás -0.72803 1.6653
by -0.75761 1.9683
která -0.68791 1.6069
co -1.0059 1.6869
nebo -0.94393 1.9611
ten -0.71975 2.124
tak -0.80566 2.0783
má -0.83065 1.3732
při -0.62158 1.8313
od -0.44113 1.7755
po -0.7059 2.2615
tipy -0.60682 1.7247
ještě -0.68854 1.7517
až -0.63201 1.4618
bez -0.52021 1.4513
také -0.67762 1.8138
pouze -0.62611 1.82
první -0.42235 1.6216
vaše -0.7407 1.5659
které -0.70914 1.7359
nás -0.38286 1.6016
nový -0.83421 1.7609
jsou -0.82699 1.9694
pokud -0.35516 1.5075
může -0.78928 1.6357
strana -0.57276 1.4149
jeho -0.78568 2.0226
své -0.44488 1.459
jiné -0.90751 1.9602
zprávy -0.90152 1.9703
nové -0.78853 1.8593
není -0.63949 1.5191
tomu -0.68126 1.8729
ona -0.74442 1.825
ono -0.78171 1.9268
oni -0.64023 2.0525
ony -0.78142 1.7097
my -0.61062 1.8857
vy -0.9356 1.8875
já -0.44615 0.92715
mě -0.73676 1.4089
mne -0.71006 1.7072
jemu -0.92237 2.1452
on -0.71417 1.9224
těm -0.65242 1.8779
těmu -0.83376 2.054
němu -0.79287 1.8645
němuž -0.51786 1.7297
jehož -0.88721 1.7431
jíž -0.12627 0.68014
jelikož -0.61809 1.7576
jež -0.8843 1.6723
jakož -0.94336 1.827
načež -0.76919 1.8106
ze -0.8277 2.0542
jak -0.97146 1.9164
další -0.5719 1.5148
ale -0.79733 1.8867
si -0.61439 1.7134
se -0.80843 1.8957
ve -0.7186 1.7891
to -0.84494 2.3933
jako -1.1045 2.2656
za -0.7136 1.9602
zpět -0.79965 1.6329
jejich -0.49038 1.6366
do -0.69806 1.8364
pro -0.7878 2.2066
je -1.1291 3.0005
na -1.0203 2.4399
atd -0.70418 1.7405
atp -0.69278 1.5772
jakmile -0.87231 1.6896
přičemž -0.64617 1.4417
jí -0.7135 1.5517
nám -0.42164 1.7603
jej -0.77603 1.9544
zda -0.76742 2.0163
proč -0.47241 1.7053
máte -0.75963 1.9814
tato -0.64318 2.0382
kam -0.45101 1.498
tohoto -0.73702 1.8305
kdo -0.80535 1.8551
kteří -0.72498 1.6669
mi -0.46791 1.7784
tyto -0.50319 1.7659
tom -0.59138 1.8657
tomuto -0.74312 1.7725
mít -0.27199 1.1315
nic -0.56441 1.8591
proto -0.6649 1.946
kterou -0.84109 1.7498
byla -0.58737 1.941
toho -0.76081 1.8002
protože -0.55749 1.6686
asi -0.51689 1.7079
budeš -0.55392 1.6052
s -0.74207 1.8989
k -0.61082 2.079
o -0.76465 1.8956
i -0.85412 1.6611
u -0.68535 1.5332
v -0.73033 1.3855
z -0.60751 1.9108
dnes -0.6001 1.7531
cz -0.59754 1.4239
tímto -0.69011 1.6643
ho -0.55961 1.6968
budem -0.54027 1.7894
byli -0.60956 1.793
jseš -0.63127 1.5972
můj -0.48904 1.2814
svým -0.48494 1.8751
ta -0.78131 2.4286
tomto -0.60948 1.7083
tohle -0.74747 1.7907
tuto -0.74687 1.9464
neg -0.60997 1.7777
pod -0.49619 1.914
téma -0.55525 1.6668
mezi -0.46979 1.3583
přes -0.5712 1.9908
ty -0.78637 2.2804
pak -0.60084 1.7026
vám -0.48545 1.4611
ani -0.65672 1.7897
když -0.42318 1.4884
však -0.60908 1.6867
či -0.36843 1.7586
jsem -0.54047 1.827
tento -0.64813 1.9799
článku -0.65578 1.9129
články -0.55868 1.8642
aby -0.80989 1.8384
jsme -0.60673 1.843
před -0.53861 2.0502
pta -0.49464 1.714
a -0.63056 2.2477
aj -0.62546 1.6357
naši -0.5915 1.6066
napište -0.50964 1.777
re -0.95733 1.9544
což -0.54673 1.6466
tím -0.70952 1.8565
takže -0.55439 1.8013
svých -0.36878 1.4883
její -0.7694 1.6612
svými -0.63149 2.1581
jste -0.68444 2.0978
byl -0.57205 1.7836
tu -0.88384 2.2256
tedy -0.62474 2.0469
teto -0.63187 1.884
bylo -0.56362 2.0282
kde -0.7308 2.0316
ke -0.60918 1.9317
pravé -0.52626 1.9058
nad -0.54689 1.8666
nejsou -0.66814 1.8323
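
A note on the fixture: the `.vec` file is the standard word2vec text format, whose header line ('171 2' above) gives the vocabulary size and vector dimensionality, followed by one word and its components per line; the words are cp852-encoded Czech, so they appear as replacement characters when the raw bytes are viewed as utf8. A rough reader sketch under those assumptions (`read_vec` is a hypothetical helper, not gensim code):

    def read_vec(path, encoding='cp852'):
        """Parse a word2vec-format text file into a {word: [float, ...]} dict."""
        with open(path, 'r', encoding=encoding) as f:
            vocab_size, dim = (int(x) for x in f.readline().split())
            vectors = {}
            for line in f:
                parts = line.split()
                vectors[parts[0]] = [float(x) for x in parts[1:1 + dim]]
        assert len(vectors) == vocab_size, 'header disagrees with body'
        return vectors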
12 changes: 11 additions & 1 deletion gensim/test/test_fasttext_wrapper.py
@@ -121,12 +121,22 @@ def testLoadFastTextFormat(self):
         self.model_sanity(model)
 
     def testLoadModelWithNonAsciiVocab(self):
+        """Test loading model with non-ascii words in vocab"""
         model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext'))
         self.assertTrue(u'který' in model)
         try:
             vector = model[u'který']
         except UnicodeDecodeError:
-            self.fail('Unable to access vector for non-ascii word')
+            self.fail('Unable to access vector for utf8 encoded non-ascii word')
+
+    def testLoadModelNonUtf8Encoding(self):
+        """Test loading model with words in user-specified encoding"""
+        model = fasttext.FastText.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
+        self.assertTrue(u'který' in model)
+        try:
+            vector = model[u'který']
+        except KeyError:
+            self.fail('Unable to access vector for cp-852 word')
 
     def testNSimilarity(self):
         """Test n_similarity for in-vocab and out-of-vocab words"""
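
The reason the new test needs its own fixture rather than reusing `non_ascii_fasttext`: bytes produced by a non-utf8 codec are generally not valid utf8, so the old hard-coded decode could not read a cp852 vocab at all. A quick round-trip illustration:

    word = u'který'
    raw = word.encode('cp852')          # b'kter\xec' -- 0xEC is 'ý' in cp852

    assert raw.decode('cp852') == word  # the right codec round-trips cleanly
    try:
        raw.decode('utf8')              # 0xEC opens a 3-byte sequence that never completes
    except UnicodeDecodeError:
        print('cp852 bytes are not valid utf8')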
