Commit
Merge pull request #1319 from prakhar2b/mismatch
support both old and new fastText models
menshikh-iv authored May 25, 2017
2 parents b2f15ff + 092ef86 commit 7b6afc0
Showing 5 changed files with 1,819 additions and 6 deletions.
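Context for the diff below: newer fastText releases write a two-int32 header at the start of the .bin file, a magic number (793712314, mirrored in the wrapper as FASTTEXT_FILEFORMAT_MAGIC) and a format version, while older releases start directly with the model parameters, so the first int32 is already dim. A minimal sketch of that detection idea, assuming only the header layout visible in the diff; sniff_fasttext_format is a hypothetical helper, not part of this change:

import struct

FASTTEXT_FILEFORMAT_MAGIC = 793712314  # same constant added in fasttext.py below

def sniff_fasttext_format(path):
    """Peek at the first two int32s of a fastText .bin file and guess its format."""
    with open(path, 'rb') as f:
        first, second = struct.unpack('@2i', f.read(struct.calcsize('@2i')))
    if first == FASTTEXT_FILEFORMAT_MAGIC:
        # New format: 'second' is the version; the real params follow as '@12i1d'.
        return 'new'
    # Old format: no header, so 'first' is already dim and 'second' is the window size.
    return 'old'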
continuous_integration/travis/flake8_diff.sh (2 changes: 1 addition & 1 deletion)
@@ -133,6 +133,6 @@ check_files() {
if [[ "$MODIFIED_FILES" == "no_match" ]]; then
    echo "No file has been modified"
else
    check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,Dockerfile*"
    check_files "$(echo "$MODIFIED_FILES" )" "--ignore=E501,E731,E12,W503 --exclude=*.sh,*.md,*.yml,*.rst,*.ipynb,*.vec,Dockerfile*"
fi
echo -e "No problem detected by flake8\n"
gensim/models/wrappers/fasttext.py (27 changes: 22 additions & 5 deletions)
@@ -42,6 +42,8 @@

logger = logging.getLogger(__name__)

FASTTEXT_FILEFORMAT_MAGIC = 793712314


class FastTextKeyedVectors(KeyedVectors):
"""
@@ -257,7 +259,15 @@ def load_binary_data(self, model_binary_file, encoding='utf8'):
            self.load_vectors(f)

    def load_model_params(self, file_handle):
        (dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t) = self.struct_unpack(file_handle, '@12i1d')
        magic, version = self.struct_unpack(file_handle, '@2i')
        if magic == FASTTEXT_FILEFORMAT_MAGIC: # newer format
            self.new_format = True
            dim, ws, epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@12i1d')
        else: # older format
            self.new_format = False
            dim = magic
            ws = version
            epoch, minCount, neg, _, loss, model, bucket, minn, maxn, _, t = self.struct_unpack(file_handle, '@10i1d')
        # Parameters stored by [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
        self.vector_size = dim
        self.window = ws
@@ -272,11 +282,13 @@ def load_model_params(self, file_handle):
        self.sample = t

    def load_dict(self, file_handle, encoding='utf8'):
        (vocab_size, nwords, _) = self.struct_unpack(file_handle, '@3i')
        vocab_size, nwords, _ = self.struct_unpack(file_handle, '@3i')
        # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
        assert len(self.wv.vocab) == nwords, 'mismatch between vocab sizes'
        assert len(self.wv.vocab) == vocab_size, 'mismatch between vocab sizes'
        ntokens, = self.struct_unpack(file_handle, '@q')
        self.struct_unpack(file_handle, '@1q') # number of tokens
        if self.new_format:
            pruneidx_size, = self.struct_unpack(file_handle, '@q')
        for i in range(nwords):
            word_bytes = b''
            char_byte = file_handle.read(1)
@@ -285,12 +297,17 @@ def load_dict(self, file_handle, encoding='utf8'):
                word_bytes += char_byte
                char_byte = file_handle.read(1)
            word = word_bytes.decode(encoding)
            count, _ = self.struct_unpack(file_handle, '@ib')
            _ = self.struct_unpack(file_handle, '@i')
            count, _ = self.struct_unpack(file_handle, '@qb')
            assert self.wv.vocab[word].index == i, 'mismatch between gensim word index and fastText word index'
            self.wv.vocab[word].count = count

        if self.new_format:
            for j in range(pruneidx_size):
                self.struct_unpack(file_handle, '@2i')

    def load_vectors(self, file_handle):
        if self.new_format:
            self.struct_unpack(file_handle, '@?') # bool quant_input in fasttext.cc
        num_vectors, dim = self.struct_unpack(file_handle, '@2q')
        # Vectors stored by [Matrix::save](https://github.com/facebookresearch/fastText/blob/master/src/matrix.cc)
        assert self.vector_size == dim, 'mismatch between model sizes'
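With this change both kinds of .bin file go through the same entry point: load_model_params sets new_format from the magic number, and load_dict / load_vectors branch on it to read or skip the extra fields. A hedged usage sketch; the paths are placeholders, and the convention that load_fasttext_format takes the model path without the .bin/.vec suffix is how the wrapper documented it at the time:

from gensim.models.wrappers.fasttext import FastText

# Binary written by an older fastText build: no magic header, first int32 is dim.
old_model = FastText.load_fasttext_format('/path/to/old_model')

# Binary written by a newer build: starts with FASTTEXT_FILEFORMAT_MAGIC and a version.
new_model = FastText.load_fasttext_format('/path/to/new_model')

# Either way the wrapper ends up with the same attributes populated.
print(old_model.vector_size, new_model.vector_size)

Loading a pretrained model this way only parses the files; the external fastText executable is needed only when training through the wrapper.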
Binary file added gensim/test/test_data/lee_fasttext_new.bin
Binary file not shown.
Remaining 2 changed files not shown.
