From 41afcfa8068a0178d092bde1703978b4d40b0c96 Mon Sep 17 00:00:00 2001
From: Michal Lopuszynski <lopusz@users.noreply.github.com>
Date: Sat, 7 Dec 2019 00:17:07 +0100
Subject: [PATCH] Adding writing vocabulary, vectors, output layer for FB
 format (#2611)

---
 gensim/models/_fasttext_bin.py | 185 ++++++++++++++++++++++++++++-----
 gensim/models/fasttext.py      |   6 +-
 2 files changed, 162 insertions(+), 29 deletions(-)

diff --git a/gensim/models/_fasttext_bin.py b/gensim/models/_fasttext_bin.py
index 7004bbd8d1..9897343b03 100644
--- a/gensim/models/_fasttext_bin.py
+++ b/gensim/models/_fasttext_bin.py
@@ -41,6 +41,14 @@
 
 _END_OF_WORD_MARKER = b'\x00'
 
+# FastText dictionary data structure holds elements of type `entry` which can have `entry_type`
+# either `word` (0 :: int8) or `label` (1 :: int8). Here we deal with unsupervised case only
+# so we want `word` type.
+# See https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h
+
+_DICT_WORD_ENTRY_TYPE_MARKER = b'\x00'
+
+
 logger = logging.getLogger(__name__)
 
 # Constants for FastText vesrion and FastText file format magic (both int32)
@@ -101,6 +109,7 @@ def _yield_field_names():
     yield 'nwords'
     yield 'vectors_ngrams'
     yield 'hidden_output'
+    yield 'ntokens'
 
 
 _FIELD_NAMES = sorted(set(_yield_field_names()))
@@ -184,7 +193,8 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
         raise NotImplementedError("Supervised fastText models are not supported")
     logger.info("loading %s words for fastText model from %s", vocab_size, fin.name)
 
-    _struct_unpack(fin, '@1q')  # number of tokens
+    ntokens, = _struct_unpack(fin, '@q')  # number of tokens
+
     if new_format:
         pruneidx_size, = _struct_unpack(fin, '@q')
 
@@ -213,7 +223,7 @@ def _load_vocab(fin, new_format, encoding='utf-8'):
         for j in range(pruneidx_size):
             _struct_unpack(fin, '@2i')
 
-    return raw_vocab, vocab_size, nwords
+    return raw_vocab, vocab_size, nwords, ntokens
 
 
 def _load_matrix(fin, new_format=True):
@@ -319,16 +329,17 @@ def load(fin, encoding='utf-8', full_model=True):
         fin = open(fin, 'rb')
 
     magic, version = _struct_unpack(fin, '@2i')
-    print(version)
     new_format = magic == _FASTTEXT_FILEFORMAT_MAGIC
 
     header_spec = _NEW_HEADER_FORMAT if new_format else _OLD_HEADER_FORMAT
     model = {name: _struct_unpack(fin, fmt)[0] for (name, fmt) in header_spec}
+
+
     if not new_format:
         model.update(dim=magic, ws=version)
 
-    raw_vocab, vocab_size, nwords = _load_vocab(fin, new_format, encoding=encoding)
-    model.update(raw_vocab=raw_vocab, vocab_size=vocab_size, nwords=nwords)
+    raw_vocab, vocab_size, nwords, ntokens = _load_vocab(fin, new_format, encoding=encoding)
+    model.update(raw_vocab=raw_vocab, vocab_size=vocab_size, nwords=nwords, ntokens=ntokens)
 
     vectors_ngrams = _load_matrix(fin, new_format=new_format)
 
@@ -376,8 +387,7 @@ def _backslashreplace_backport(ex):
 
 
 def _sign_model(fout):
-    # Reimplementation of the FastText::signModel function, see
-    # https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc
+    # Reimplementation of the [FastText::signModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)
     fout.write(_FASTTEXT_FILEFORMAT_MAGIC.tobytes())
     fout.write(_FASTTEXT_VERSION.tobytes())
 
@@ -423,7 +433,7 @@ def _get_field(model, field, field_type):
         # cbow = continous bag of words (default)
         # sg = skip-gram
         # sup = supervised
-        res = 1 if model.sg == 1 else 2
+        res = 2 if model.sg == 1 else 1
     elif field == 'neg':
         res = model.negative
     elif field == 't':
@@ -444,34 +454,88 @@ def _get_field(model, field, field_type):
 
 def _args_save(fout, model):
 
-    # Reimplementation of the Args::save method, see
-    # https://github.com/facebookresearch/fastText/blob/master/src/args.cc
+    # Reimplementation of the [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)
 
     for field, field_type in _NEW_HEADER_FORMAT:
         fout.write(_get_field(model, field, field_type))
 
 
-def _dict_save(fout, model):
-    pass
+def _dict_save(fout, model, encoding):
+    # Reimplementation of the [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
+
+    # out.write((char*)&size_, sizeof(int32_t));
+    # out.write((char*)&nwords_, sizeof(int32_t));
+    # out.write((char*)&nlabels_, sizeof(int32_t));
+    # out.write((char*)&ntokens_, sizeof(int64_t));
+    # out.write((char*)&pruneidx_size_, sizeof(int64_t));
+    # for (int32_t i = 0; i < size_; i++) {
+    #   entry e = words_[i];
+    #   out.write(e.word.data(), e.word.size() * sizeof(char));
+    #   out.put(0);
+    #   out.write((char*)&(e.count), sizeof(int64_t));
+    #   out.write((char*)&(e.type), sizeof(entry_type));
+    # }
+    # for (const auto pair : pruneidx_) {
+    #   out.write((char*)&(pair.first), sizeof(int32_t));
+    #       out.write((char*)&(pair.second), sizeof(int32_t));
+    # }
+
+    # TODO Check what is the difference between `size` and `nwords`
+
+    fout.write(np.int32(len(model.wv.vocab)).tobytes())
+
+    fout.write(np.int32(len(model.wv.vocab)).tobytes())
+
+    # nlabels=0 <- no labels  we are in unsupervised mode
+    fout.write(np.int32(0).tobytes())
 
+    fout.write(np.int64(model.corpus_total_words).tobytes())
 
-def _save_vocab(fout, model):
-    pass
+    # prunedidx_size_=-1, -1 value denotes no prunning index (prunning is only supported in supervised mode)
+    fout.write(np.int64(-1))
 
+    for word, vocab_entry in model.wv.vocab.items():
+        fout.write(word.encode(encoding))
+        fout.write(_END_OF_WORD_MARKER)
+        fout.write(np.int64(vocab_entry.count).tobytes())
+        fout.write(_DICT_WORD_ENTRY_TYPE_MARKER)
 
-def _save_vector_ngrams(fout, model):
-    pass
+    # We are in unsupervised case, therefore pruned_idx is empty. so we do not need to write anything else
 
 
-def _save_hidden_outputs(fout, model):
-    pass
+def _input_save(fout, model):
+    vocab_n, vocab_dim = model.wv.vectors_vocab.shape
+    ngrams_n, ngrams_dim = model.wv.vectors_ngrams.shape
 
+    assert vocab_dim == ngrams_dim
+    assert vocab_n == len(model.wv.vocab)
+    assert ngrams_n == model.wv.bucket
 
-def _save(fout, model):
+    fout.write(struct.pack('@2q', vocab_n + ngrams_n, vocab_dim))
+    fout.write(model.wv.vectors_vocab.tobytes())
+    fout.write(model.wv.vectors_ngrams.tobytes())
+
+
+def _output_save(fout, model):
+
+    # TODO Can model.hs and model.negative be both False?
+    # TODO Can model.hs and model.negative be both True?
+
+    if model.hs:
+        hidden_output = model.trainables.syn1
+    if model.negative:
+        hidden_output = model.trainables.syn1neg
+
+    hidden_n, hidden_dim = hidden_output.shape
+    fout.write(struct.pack('@2q', hidden_n, hidden_dim))
+    fout.write(hidden_output.tobytes())
+
+
+def _save(fout, model, encoding):
 
     # Unfortunatelly there is no documentation of the FB binary format
-    # This is just reimplementation of FastText::saveModel method
-    # See https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc
+    # This is just reimplementation of
+    # [FastText::saveModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)
 
     #   As of writing this (12.2019) the C++ code looks as follows
     #
@@ -487,18 +551,87 @@ def _save(fout, model):
 
     _sign_model(fout)
     _args_save(fout, model)
-    _dict_save(fout, model)
-    _save_vector_ngrams(fout, model)
-    _save_hidden_outputs(fout, model)
+    _dict_save(fout, model, encoding)
+    fout.write(struct.pack('@?', False))  # TODO Check if quantization works for unsupervised models
+
+    # Save words and ngrams vectors
+    _input_save(fout, model)
+    fout.write(struct.pack('@?', False))  # TODO Check if quantization works for unsupervised models
 
+    # Save output layers of the model
+    _output_save(fout, model)
 
-def save(fout, model):
+
+def save(fout, model, encoding='utf-8'):
     if isinstance(fout, str):
         with open(fout, "wb") as fout_stream:
-            _save(fout_stream, model)
+            _save(fout_stream, model, encoding)
     else:
         _save(fout, model)
 
 
+# COMPARING FUNCTIONALITY
+
+
+def _sign_load(fin):
+    keys = ['fileformat_magic', 'version']
+    vals = _struct_unpack(fin, '@2i')
+    return dict(zip(keys, vals))
+
+
+def _load_key_fmt_list_to_dict(fin, key_fmt_list):
+    res = {}
+    for key, fmt in key_fmt_list:
+        res[key] = _struct_unpack(fin, fmt)[0]
+    return res
+
+
+def _args_load(fin):
+    return _load_key_fmt_list_to_dict(fin, _NEW_HEADER_FORMAT)
+
+
+def _dict_header_load(fin):
+    DICT_HEADER_FORMAT = [('size', 'i'),
+                          ('nwords', 'i'),
+                          ('nlabels', 'i'),
+                          ('ntokens', 'i'),
+                          ('pruneidx_size', '@q')]
+    return _load_key_fmt_list_to_dict(fin, DICT_HEADER_FORMAT)
+
+
+def _yield_differing_keys(d1, d2):
+    assert set(d1.keys()) == set(d2.keys())
+
+    for k in d1.keys():
+        v1, v2 = d1[k], d2[k]
+
+        if v1 != v2:
+            yield k, v1, v2
+
+    return None
+
+
+def _print_differences_between_dicts(d1, d2):
+    if d1 != d2:
+        for k, v1, v2 in _yield_differing_keys(d1, d2):
+            print('Key "%s" differs -> %s != %s' % (k, str(v1), str(v2)))
+
+
+def compare_fasttext_files(fname1, fname2):
+
+    with open(fname1, 'rb') as fin1, open(fname2, 'rb') as fin2:
+        sign1 = _sign_load(fin1)
+        sign2 = _sign_load(fin2)
+        _print_differences_between_dicts(sign1, sign2)
+
+        args1 = _args_load(fin1)
+        args2 = _args_load(fin2)
+        _print_differences_between_dicts(args1, args2)
+
+        dict_header1 = _dict_header_load(fin1)
+        dict_header2 = _dict_header_load(fin2)
+        _print_differences_between_dicts(dict_header1, dict_header2)
+
+
 if six.PY2:
     codecs.register_error('backslashreplace', _backslashreplace_backport)
diff --git a/gensim/models/fasttext.py b/gensim/models/fasttext.py
index 82af368508..8c97e0ddcd 100644
--- a/gensim/models/fasttext.py
+++ b/gensim/models/fasttext.py
@@ -1216,15 +1216,15 @@ def _load_fasttext_format(model_file, encoding='utf-8', full_model=True):
         window=m.ws,
         iter=m.epoch,
         negative=m.neg,
-        hs=(m.loss == 1),
-        sg=(m.model == 2),
+        hs=int(m.loss == 2),
+        sg=int(m.model == 2),
         bucket=m.bucket,
         min_count=m.min_count,
         sample=m.t,
         min_n=m.minn,
         max_n=m.maxn,
     )
-
+    model.corpus_total_words = m.ntokens
     model.vocabulary.raw_vocab = m.raw_vocab
     model.vocabulary.nwords = m.nwords
     model.vocabulary.vocab_size = m.vocab_size