[MRG] Load FastText models with specified encoding (piskvorky#1189)
* Fixes the fasttext wrapper file header

* Allows a user-specified encoding when loading fasttext models, with corresponding tests
jayantj authored and Pranaydeep Singh committed Mar 21, 2017
1 parent 1474c30 commit b285dab
Showing 4 changed files with 192 additions and 9 deletions.
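
For a quick sense of the API change, here is a minimal usage sketch (not part of the commit; it assumes the cp852 test fixture added below, and `load_fasttext_format` takes the basename shared by the `.bin`/`.vec` pair):

    from gensim.models.wrappers import fasttext

    # gensim appends '.bin' and '.vec' to the basename itself.
    # Before this change the vocab was always decoded as utf8, so a
    # cp852-encoded model could not be loaded at all.
    model = fasttext.FastText.load_fasttext_format('cp852_fasttext', encoding='cp852')
    print(u'který' in model)   # True
    print(model[u'který'])     # the word's vector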
17 changes: 9 additions & 8 deletions gensim/models/wrappers/fasttext.py
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
 # -*- coding: utf-8 -*-
 #
-# Copyright (C) 2013 Radim Rehurek <me@radimrehurek.com>
+# Author: Jayant Jain <jayantjain1992@gmail.com>
+# Copyright (C) 2017 Radim Rehurek <me@radimrehurek.com>
 # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
 
 
@@ -221,7 +222,7 @@ def load_word2vec_format(cls, *args, **kwargs):
         return FastTextKeyedVectors.load_word2vec_format(*args, **kwargs)
 
     @classmethod
-    def load_fasttext_format(cls, model_file):
+    def load_fasttext_format(cls, model_file, encoding='utf8'):
         """
         Load the input-hidden weight matrix from the fast text output files.
 
@@ -234,8 +235,8 @@ def load_fasttext_format(cls, model_file):
         """
         model = cls()
-        model.wv = cls.load_word2vec_format('%s.vec' % model_file)
-        model.load_binary_data('%s.bin' % model_file)
+        model.wv = cls.load_word2vec_format('%s.vec' % model_file, encoding=encoding)
+        model.load_binary_data('%s.bin' % model_file, encoding=encoding)
         return model
 
     @classmethod
@@ -248,11 +249,11 @@ def delete_training_files(cls, model_file):
             logger.debug('Training files %s not found when attempting to delete', model_file)
             pass
 
-    def load_binary_data(self, model_binary_file):
+    def load_binary_data(self, model_binary_file, encoding='utf8'):
         """Loads data from the output binary file created by FastText training"""
         with utils.smart_open(model_binary_file, 'rb') as f:
             self.load_model_params(f)
-            self.load_dict(f)
+            self.load_dict(f, encoding=encoding)
             self.load_vectors(f)
 
     def load_model_params(self, file_handle):
@@ -270,7 +271,7 @@ def load_model_params(self, file_handle):
         self.wv.max_n = maxn
         self.sample = t
 
-    def load_dict(self, file_handle):
+    def load_dict(self, file_handle, encoding='utf8'):
         (vocab_size, nwords, _) = self.struct_unpack(file_handle, '@3i')
         # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
         assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
@@ -283,7 +284,7 @@ def load_dict(self, file_handle):
             while char_byte != b'\x00':
                 word_bytes += char_byte
                 char_byte = file_handle.read(1)
-            word = word_bytes.decode('utf8')
+            word = word_bytes.decode(encoding)
             count, _ = self.struct_unpack(file_handle, '@ib')
             _ = self.struct_unpack(file_handle, '@i')
             assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
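
The substance of the change is in `load_dict`: each vocabulary word is stored in the `.bin` file as a null-terminated byte string, and the decode step now honours the caller's encoding instead of hard-coding utf8. A standalone sketch of that read loop (`read_word` is an illustrative helper, not a gensim function):

    import io

    def read_word(file_handle, encoding='utf8'):
        """Read one null-terminated word from a fastText .bin file handle."""
        word_bytes = b''
        char_byte = file_handle.read(1)
        while char_byte != b'\x00':
            word_bytes += char_byte
            char_byte = file_handle.read(1)
        return word_bytes.decode(encoding)

    # b'kter\xec' is 'který' in cp852 (0xEC = 'ý'); decoding it as utf8 fails.
    handle = io.BytesIO(b'kter\xec\x00')
    assert read_word(handle, encoding='cp852') == u'který'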
Binary file added gensim/test/test_data/cp852_fasttext.bin (not shown)
172 changes: 172 additions & 0 deletions gensim/test/test_data/cp852_fasttext.vec
@@ -0,0 +1,172 @@
171 2
ji -0.79132 1.9605
který -0.90811 1.6411
jen -0.91547 2.0157
podle -0.64689 1.6221
zde -0.79732 2.4019
už -0.69159 1.7167
být -0.455 1.3266
více -0.75901 1.688
bude -0.71114 2.0771
již -0.73027 1.267
než -0.97888 1.8332
vás -0.72803 1.6653
by -0.75761 1.9683
která -0.68791 1.6069
co -1.0059 1.6869
nebo -0.94393 1.9611
ten -0.71975 2.124
tak -0.80566 2.0783
má -0.83065 1.3732
při -0.62158 1.8313
od -0.44113 1.7755
po -0.7059 2.2615
tipy -0.60682 1.7247
ještě -0.68854 1.7517
až -0.63201 1.4618
bez -0.52021 1.4513
také -0.67762 1.8138
pouze -0.62611 1.82
první -0.42235 1.6216
vaše -0.7407 1.5659
které -0.70914 1.7359
nás -0.38286 1.6016
nový -0.83421 1.7609
jsou -0.82699 1.9694
pokud -0.35516 1.5075
může -0.78928 1.6357
strana -0.57276 1.4149
jeho -0.78568 2.0226
své -0.44488 1.459
jiné -0.90751 1.9602
zprávy -0.90152 1.9703
nové -0.78853 1.8593
není -0.63949 1.5191
tomu -0.68126 1.8729
ona -0.74442 1.825
ono -0.78171 1.9268
oni -0.64023 2.0525
ony -0.78142 1.7097
my -0.61062 1.8857
vy -0.9356 1.8875
já -0.44615 0.92715
mě -0.73676 1.4089
mne -0.71006 1.7072
jemu -0.92237 2.1452
on -0.71417 1.9224
těm -0.65242 1.8779
těmu -0.83376 2.054
němu -0.79287 1.8645
němuž -0.51786 1.7297
jehož -0.88721 1.7431
jíž -0.12627 0.68014
jelikož -0.61809 1.7576
jež -0.8843 1.6723
jakož -0.94336 1.827
načež -0.76919 1.8106
ze -0.8277 2.0542
jak -0.97146 1.9164
další -0.5719 1.5148
ale -0.79733 1.8867
si -0.61439 1.7134
se -0.80843 1.8957
ve -0.7186 1.7891
to -0.84494 2.3933
jako -1.1045 2.2656
za -0.7136 1.9602
zpět -0.79965 1.6329
jejich -0.49038 1.6366
do -0.69806 1.8364
pro -0.7878 2.2066
je -1.1291 3.0005
na -1.0203 2.4399
atd -0.70418 1.7405
atp -0.69278 1.5772
jakmile -0.87231 1.6896
přičemž -0.64617 1.4417
jí -0.7135 1.5517
nám -0.42164 1.7603
jej -0.77603 1.9544
zda -0.76742 2.0163
proč -0.47241 1.7053
máte -0.75963 1.9814
tato -0.64318 2.0382
kam -0.45101 1.498
tohoto -0.73702 1.8305
kdo -0.80535 1.8551
kteří -0.72498 1.6669
mi -0.46791 1.7784
tyto -0.50319 1.7659
tom -0.59138 1.8657
tomuto -0.74312 1.7725
mít -0.27199 1.1315
nic -0.56441 1.8591
proto -0.6649 1.946
kterou -0.84109 1.7498
byla -0.58737 1.941
toho -0.76081 1.8002
protože -0.55749 1.6686
asi -0.51689 1.7079
budeš -0.55392 1.6052
s -0.74207 1.8989
k -0.61082 2.079
o -0.76465 1.8956
i -0.85412 1.6611
u -0.68535 1.5332
v -0.73033 1.3855
z -0.60751 1.9108
dnes -0.6001 1.7531
cz -0.59754 1.4239
tímto -0.69011 1.6643
ho -0.55961 1.6968
budem -0.54027 1.7894
byli -0.60956 1.793
jseš -0.63127 1.5972
můj -0.48904 1.2814
svým -0.48494 1.8751
ta -0.78131 2.4286
tomto -0.60948 1.7083
tohle -0.74747 1.7907
tuto -0.74687 1.9464
neg -0.60997 1.7777
pod -0.49619 1.914
téma -0.55525 1.6668
mezi -0.46979 1.3583
přes -0.5712 1.9908
ty -0.78637 2.2804
pak -0.60084 1.7026
vám -0.48545 1.4611
ani -0.65672 1.7897
když -0.42318 1.4884
však -0.60908 1.6867
či -0.36843 1.7586
jsem -0.54047 1.827
tento -0.64813 1.9799
článku -0.65578 1.9129
články -0.55868 1.8642
aby -0.80989 1.8384
jsme -0.60673 1.843
před -0.53861 2.0502
pta -0.49464 1.714
a -0.63056 2.2477
aj -0.62546 1.6357
naši -0.5915 1.6066
napište -0.50964 1.777
re -0.95733 1.9544
což -0.54673 1.6466
tím -0.70952 1.8565
takže -0.55439 1.8013
svých -0.36878 1.4883
její -0.7694 1.6612
svými -0.63149 2.1581
jste -0.68444 2.0978
byl -0.57205 1.7836
tu -0.88384 2.2256
tedy -0.62474 2.0469
teto -0.63187 1.884
bylo -0.56362 2.0282
kde -0.7308 2.0316
ke -0.60918 1.9317
pravé -0.52626 1.9058
nad -0.54689 1.8666
nejsou -0.66814 1.8323
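
A note on the fixture: the `.vec` file is the standard word2vec text format, whose header line ('171 2' above) gives the vocabulary size and vector dimensionality, followed by one word and its components per line; the words are cp852-encoded Czech, so they appear as replacement characters when the raw bytes are viewed as utf8. A rough reader sketch under those assumptions (`read_vec` is a hypothetical helper, not gensim code):

    def read_vec(path, encoding='cp852'):
        """Parse a word2vec-format text file into a {word: [float, ...]} dict."""
        with open(path, 'r', encoding=encoding) as f:
            vocab_size, dim = (int(x) for x in f.readline().split())
            vectors = {}
            for line in f:
                parts = line.split()
                vectors[parts[0]] = [float(x) for x in parts[1:1 + dim]]
        assert len(vectors) == vocab_size, 'header disagrees with body'
        return vectors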
12 changes: 11 additions & 1 deletion gensim/test/test_fasttext_wrapper.py
@@ -121,12 +121,22 @@ def testLoadFastTextFormat(self):
         self.model_sanity(model)
 
     def testLoadModelWithNonAsciiVocab(self):
+        """Test loading model with non-ascii words in vocab"""
         model = fasttext.FastText.load_fasttext_format(datapath('non_ascii_fasttext'))
         self.assertTrue(u'který' in model)
         try:
             vector = model[u'který']
         except UnicodeDecodeError:
-            self.fail('Unable to access vector for non-ascii word')
+            self.fail('Unable to access vector for utf8 encoded non-ascii word')
+
+    def testLoadModelNonUtf8Encoding(self):
+        """Test loading model with words in user-specified encoding"""
+        model = fasttext.FastText.load_fasttext_format(datapath('cp852_fasttext'), encoding='cp852')
+        self.assertTrue(u'který' in model)
+        try:
+            vector = model[u'který']
+        except KeyError:
+            self.fail('Unable to access vector for cp-852 word')
 
     def testNSimilarity(self):
         """Test n_similarity for in-vocab and out-of-vocab words"""
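
The reason the new test needs its own fixture rather than reusing `non_ascii_fasttext`: bytes produced by a non-utf8 codec are generally not valid utf8, so the old hard-coded decode could not read a cp852 vocab at all. A quick round-trip illustration:

    word = u'který'
    raw = word.encode('cp852')          # b'kter\xec' -- 0xEC is 'ý' in cp852

    assert raw.decode('cp852') == word  # the right codec round-trips cleanly
    try:
        raw.decode('utf8')              # 0xEC opens a 3-byte sequence that never completes
    except UnicodeDecodeError:
        print('cp852 bytes are not valid utf8')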
