#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Authors: Michael Penkov
# Copyright (C) 2019 RaRe Technologies s.r.o.
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Load models from the native binary format released by Facebook.

The main entry point is the :func:`~gensim.models._fasttext_bin.load` function.
It returns a :class:`~gensim.models._fasttext_bin.Model` namedtuple containing everything loaded from the binary.

Examples
--------

Load a model from a binary file:

.. sourcecode:: pycon

    >>> from gensim.test.utils import datapath
    >>> from gensim.models._fasttext_bin import load
    >>> with open(datapath('crime-and-punishment.bin'), 'rb') as fin:
    ...     model = load(fin)
    >>> model.nwords
    291
    >>> model.vectors_ngrams.shape
    (391, 5)
    >>> sorted(model.raw_vocab, key=lambda w: len(w), reverse=True)[:5]
    ['останавливаться', 'изворачиваться,', 'раздражительном', 'exceptionally', 'проскользнуть']

See Also
--------

`FB Implementation <https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc>`_.

"""

import collections
import gzip
import io
import logging
import struct

import numpy as np

_END_OF_WORD_MARKER = b'\x00'

# The FastText dictionary data structure holds elements of type `entry`, whose
# `entry_type` is either `word` (0 :: int8) or `label` (1 :: int8).  Here we
# deal with the unsupervised case only, so we want the `word` type.
# See https://github.com/facebookresearch/fastText/blob/master/src/dictionary.h
_DICT_WORD_ENTRY_TYPE_MARKER = b'\x00'

logger = logging.getLogger(__name__)

# Constants for the FastText version and the FastText file format magic (both int32)
# https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc#L25
_FASTTEXT_VERSION = np.int32(12)
_FASTTEXT_FILEFORMAT_MAGIC = np.int32(793712314)

# _NEW_HEADER_FORMAT is constructed on the basis of the Args::save method, see
# https://github.com/facebookresearch/fastText/blob/master/src/args.cc
_NEW_HEADER_FORMAT = [
    ('dim', 'i'),
    ('ws', 'i'),
    ('epoch', 'i'),
    ('min_count', 'i'),
    ('neg', 'i'),
    ('word_ngrams', 'i'),  # Unused in loading
    ('loss', 'i'),
    ('model', 'i'),
    ('bucket', 'i'),
    ('minn', 'i'),
    ('maxn', 'i'),
    ('lr_update_rate', 'i'),  # Unused in loading
    ('t', 'd'),
]

_OLD_HEADER_FORMAT = [
    ('epoch', 'i'),
    ('min_count', 'i'),
    ('neg', 'i'),
    ('word_ngrams', 'i'),  # Unused in loading
    ('loss', 'i'),
    ('model', 'i'),
    ('bucket', 'i'),
    ('minn', 'i'),
    ('maxn', 'i'),
    ('lr_update_rate', 'i'),  # Unused in loading
    ('t', 'd'),
]

_FLOAT_SIZE = struct.calcsize('@f')
if _FLOAT_SIZE == 4:
    _FLOAT_DTYPE = np.dtype(np.float32)
elif _FLOAT_SIZE == 8:
    _FLOAT_DTYPE = np.dtype(np.float64)
else:
    _FLOAT_DTYPE = None


def _yield_field_names():
    for name, _ in _OLD_HEADER_FORMAT + _NEW_HEADER_FORMAT:
        if not name.startswith('_'):
            yield name
    yield 'raw_vocab'
    yield 'vocab_size'
    yield 'nwords'
    yield 'vectors_ngrams'
    yield 'hidden_output'
    yield 'ntokens'


_FIELD_NAMES = sorted(set(_yield_field_names()))
Model = collections.namedtuple('Model', _FIELD_NAMES)
"""Holds data loaded from the Facebook binary.

Parameters
----------
dim : int
    The dimensionality of the vectors.
ws : int
    The window size.
epoch : int
    The number of training epochs.
neg : int
    If non-zero, indicates that the model uses negative sampling.
loss : int
    If equal to 1, indicates that the model uses hierarchical sampling.
model : int
    If equal to 2, indicates that the model uses skip-grams.
bucket : int
    The number of buckets.
min_count : int
    The threshold below which the model ignores terms.
t : float
    The sample threshold.
minn : int
    The minimum ngram length.
maxn : int
    The maximum ngram length.
raw_vocab : collections.OrderedDict
    A map from words (str) to their frequency (int).  The order in the dict
    corresponds to the order of the words in the Facebook binary.
nwords : int
    The number of words.
ntokens : int
    The number of tokens.
vocab_size : int
    The size of the vocabulary.
vectors_ngrams : numpy.array
    This is a matrix that contains vectors learned by the model.
    Each row corresponds to a vector.
    The number of vectors is equal to the number of words plus the number of buckets.
    The number of columns is equal to the vector dimensionality.
hidden_output : numpy.array
    This is a matrix that contains the shallow neural network output.
    This array has the same dimensions as vectors_ngrams.
    May be None - in that case, it is impossible to continue training the model.
"""
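
# An illustrative sketch (not part of the module API) of how the Model fields
# hang together after loading.  Here 'model.bin' is a hypothetical path to a
# native fastText binary, and `load` is defined further below:
#
#     >>> m = load('model.bin')
#     >>> m.vectors_ngrams.shape == (m.nwords + m.bucket, m.dim)
#     True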


def _struct_unpack(fin, fmt):
    num_bytes = struct.calcsize(fmt)
    return struct.unpack(fmt, fin.read(num_bytes))


def _load_vocab(fin, new_format, encoding='utf-8'):
    """Load a vocabulary from a FB binary.

    Before the vocab is ready for use, call the prepare_vocab function and pass
    in the relevant parameters from the model.

    Parameters
    ----------
    fin : file
        An open file pointer to the binary.
    new_format : boolean
        True if the binary is of the newer format.
    encoding : str
        The encoding to use when decoding binary data into words.

    Returns
    -------
    tuple
        The loaded vocabulary.  Keys are words, values are counts.
        The vocabulary size.
        The number of words.
        The number of tokens.
    """
    vocab_size, nwords, nlabels = _struct_unpack(fin, '@3i')

    # Vocab stored by [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)
    if nlabels > 0:
        raise NotImplementedError("Supervised fastText models are not supported")
    logger.info("loading %s words for fastText model from %s", vocab_size, fin.name)

    ntokens = _struct_unpack(fin, '@q')[0]  # number of tokens

    if new_format:
        pruneidx_size, = _struct_unpack(fin, '@q')

    raw_vocab = collections.OrderedDict()
    for i in range(vocab_size):
        word_bytes = io.BytesIO()
        char_byte = fin.read(1)

        while char_byte != _END_OF_WORD_MARKER:
            word_bytes.write(char_byte)
            char_byte = fin.read(1)

        word_bytes = word_bytes.getvalue()
        try:
            word = word_bytes.decode(encoding)
        except UnicodeDecodeError:
            word = word_bytes.decode(encoding, errors='backslashreplace')
            logger.error(
                'failed to decode invalid unicode bytes %r; replacing invalid characters, using %r',
                word_bytes, word
            )
        count, _ = _struct_unpack(fin, '@qb')
        raw_vocab[word] = count

    if new_format:
        for j in range(pruneidx_size):
            _struct_unpack(fin, '@2i')

    return raw_vocab, vocab_size, nwords, ntokens
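
# For reference, a sketch of the on-disk layout of one dictionary entry, as
# consumed by the loop in _load_vocab: the encoded word, a NUL terminator, an
# int64 count, and an int8 entry type.  The byte values shown assume a
# little-endian machine:
#
#     >>> entry = 'hello'.encode('utf-8') + _END_OF_WORD_MARKER + struct.pack('@qb', 42, 0)
#     >>> entry
#     b'hello\x00*\x00\x00\x00\x00\x00\x00\x00\x00'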


def _load_matrix(fin, new_format=True):
    """Load a matrix from the fastText native format.

    Interprets the matrix dimensions and type from the file stream.

    Parameters
    ----------
    fin : file
        A file handle opened for reading.
    new_format : bool, optional
        True if the quant_input variable precedes the matrix declaration.
        Should be True for newer versions of fastText.

    Returns
    -------
    :class:`numpy.array`
        The vectors as an array.
        Each vector will be a row in the array.
        The number of columns of the array will correspond to the vector size.

    """
    if _FLOAT_DTYPE is None:
        raise ValueError('bad _FLOAT_SIZE: %r' % _FLOAT_SIZE)

    if new_format:
        _struct_unpack(fin, '@?')  # bool quant_input in fasttext.cc

    num_vectors, dim = _struct_unpack(fin, '@2q')
    count = num_vectors * dim

    #
    # numpy.fromfile doesn't play well with gzip.GzipFile as input:
    #
    # - https://github.com/RaRe-Technologies/gensim/pull/2476
    # - https://github.com/numpy/numpy/issues/13470
    #
    # Until they fix it, we have to apply a workaround.  We only apply the
    # workaround when it's necessary, because np.fromfile is heavily optimized
    # and very efficient (when it works).
    #
    if isinstance(fin, gzip.GzipFile):
        logger.warning(
            'Loading model from a compressed .gz file.  This can be slow. '
            'This is a work-around for a bug in NumPy: https://github.com/numpy/numpy/issues/13470. '
            'Consider decompressing your model file for a faster load. '
        )
        matrix = _fromfile(fin, _FLOAT_DTYPE, count)
    else:
        matrix = np.fromfile(fin, _FLOAT_DTYPE, count)

    assert matrix.shape == (count,), 'expected (%r,), got %r' % (count, matrix.shape)
    matrix = matrix.reshape((num_vectors, dim))
    return matrix


def _batched_generator(fin, count, batch_size=10 ** 6):
    """Read `count` floats from `fin`.

    Batches up read calls to avoid I/O overhead.  Keeps no more than
    `batch_size` floats in memory at once.

    Yields floats.

    """
    while count > batch_size:
        batch = _struct_unpack(fin, '@%df' % batch_size)
        for f in batch:
            yield f
        count -= batch_size

    batch = _struct_unpack(fin, '@%df' % count)
    for f in batch:
        yield f


def _fromfile(fin, dtype, count):
    """Reimplementation of numpy.fromfile that works on any file-like object."""
    return np.fromiter(_batched_generator(fin, count), dtype=dtype)


def load(fin, encoding='utf-8', full_model=True):
    """Load a model from a binary stream.

    Parameters
    ----------
    fin : file
        The readable binary stream.
    encoding : str, optional
        The encoding to use for decoding text.
    full_model : boolean, optional
        If False, skips loading the hidden output matrix.  This saves a fair
        bit of CPU time and RAM, but prevents training continuation.

    Returns
    -------
    :class:`~gensim.models._fasttext_bin.Model`
        The loaded model.

    """
    if isinstance(fin, str):
        fin = open(fin, 'rb')

    magic, version = _struct_unpack(fin, '@2i')
    new_format = magic == _FASTTEXT_FILEFORMAT_MAGIC

    header_spec = _NEW_HEADER_FORMAT if new_format else _OLD_HEADER_FORMAT
    model = {name: _struct_unpack(fin, fmt)[0] for (name, fmt) in header_spec}

    if not new_format:
        model.update(dim=magic, ws=version)

    raw_vocab, vocab_size, nwords, ntokens = _load_vocab(fin, new_format, encoding=encoding)
    model.update(raw_vocab=raw_vocab, vocab_size=vocab_size, nwords=nwords, ntokens=ntokens)

    vectors_ngrams = _load_matrix(fin, new_format=new_format)

    if not full_model:
        hidden_output = None
    else:
        hidden_output = _load_matrix(fin, new_format=new_format)
        assert fin.read() == b'', 'expected to reach EOF'

    model.update(vectors_ngrams=vectors_ngrams, hidden_output=hidden_output)
    model = {k: v for k, v in model.items() if k in _FIELD_NAMES}
    return Model(**model)


def _backslashreplace_backport(ex):
    """Replace byte sequences that failed to decode with character escapes.

    Does the same thing as errors="backslashreplace" from Python 3.  Python 2
    lacks this functionality out of the box, so we need to backport it.

    Parameters
    ----------
    ex : UnicodeDecodeError
        Contains arguments of the string and start/end indexes of the bad portion.

    Returns
    -------
    text : unicode
        The Unicode string corresponding to the decoding of the bad section.
    end : int
        The index from which to continue decoding.

    Note
    ----
    Works on Py2 only.  Py3 already has backslashreplace built-in.

    """
    #
    # Based on:
    # https://stackoverflow.com/questions/42860186/exact-equivalent-of-b-decodeutf-8-backslashreplace-in-python-2
    #
    bstr, start, end = ex.object, ex.start, ex.end
    text = u''.join('\\x{:02x}'.format(ord(c)) for c in bstr[start:end])
    return text, end
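
# A minimal round-trip sketch for the workaround path above: _fromfile reads
# raw floats from any file-like object, not just real files.  Illustrative
# only; it uses an in-memory buffer in place of a GzipFile, and assumes a
# platform where the C float read by struct '@f' is 4 bytes:
#
#     >>> arr = np.arange(6, dtype=np.float32)
#     >>> buf = io.BytesIO(arr.tobytes())
#     >>> np.array_equal(_fromfile(buf, np.float32, 6), arr)
#     True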


def _sign_model(fout):
    """
    Write the signature of a file in Facebook's native fastText `.bin` format
    to the binary output stream `fout`.  The signature includes the magic
    bytes and the version.

    Name mimics the original C++ implementation, see
    [FastText::signModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)

    Parameters
    ----------
    fout: writeable binary stream
    """
    fout.write(_FASTTEXT_FILEFORMAT_MAGIC.tobytes())
    fout.write(_FASTTEXT_VERSION.tobytes())


def _conv_field_to_bytes(field_value, field_type):
    """
    Auxiliary function that converts `field_value` to bytes based on the
    requested `field_type`, for saving to the binary file.

    Parameters
    ----------
    field_value: numerical
        the header field value to convert
    field_type: str
        currently supported `field_types` are `i` for 32-bit integer and `d` for 64-bit float
    """
    if field_type == 'i':
        return np.int32(field_value).tobytes()
    elif field_type == 'd':
        return np.float64(field_value).tobytes()
    else:
        raise NotImplementedError('Currently conversion to "%s" type is not implemented.' % field_type)


def _get_field_from_model(model, field):
    """
    Extract `field` from `model`.

    Parameters
    ----------
    model: gensim.models.fasttext.FastText
        model from which `field` is extracted
    field: str
        requested field name, fields are listed in the `_NEW_HEADER_FORMAT` list
    """
    if field == 'bucket':
        return model.wv.bucket
    elif field == 'dim':
        return model.vector_size
    elif field == 'epoch':
        return model.epochs
    elif field == 'loss':
        # `loss` => hs: 1, ns: 2, softmax: 3, one-vs-all: 4
        # ns = negative sampling loss (default)
        # hs = hierarchical softmax loss
        # softmax = softmax loss
        # one-vs-all = one vs all loss (supervised)
        if model.hs == 1:
            return 1
        elif model.hs == 0 and model.negative == 0:
            # Check the more specific condition first: testing `model.hs == 0`
            # alone would shadow it and make this branch unreachable.
            return 1
        elif model.hs == 0:
            return 2
    elif field == 'maxn':
        return model.wv.max_n
    elif field == 'minn':
        return model.wv.min_n
    elif field == 'min_count':
        return model.min_count
    elif field == 'model':
        # `model` => cbow: 1, sg: 2, sup: 3
        # cbow = continuous bag of words (default)
        # sg = skip-gram
        # sup = supervised
        return 2 if model.sg == 1 else 1
    elif field == 'neg':
        return model.negative
    elif field == 't':
        return model.sample
    elif field == 'word_ngrams':
        # This is skipped in the gensim loading setting; use the default from the FB C++ code
        return 1
    elif field == 'ws':
        return model.window
    elif field == 'lr_update_rate':
        # This is skipped in the gensim loading setting; use the default from the FB C++ code
        return 100
    else:
        msg = 'Extraction of header field "' + field + '" from Gensim FastText object not implemented.'
        raise NotImplementedError(msg)


def _args_save(fout, model, fb_fasttext_parameters):
    """
    Saves the header with `model` parameters to the binary stream `fout`,
    containing a model in Facebook's native fastText `.bin` format.

    Name mimics the original C++ implementation, see
    [Args::save](https://github.com/facebookresearch/fastText/blob/master/src/args.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the model is saved
    model: gensim.models.fasttext.FastText
        the model that is saved
    fb_fasttext_parameters: dictionary
        dictionary containing the parameters `lr_update_rate` and `word_ngrams`,
        which are unused by the gensim implementation and so have to be provided externally
    """
    for field, field_type in _NEW_HEADER_FORMAT:
        if field in fb_fasttext_parameters:
            field_value = fb_fasttext_parameters[field]
        else:
            field_value = _get_field_from_model(model, field)
        fout.write(_conv_field_to_bytes(field_value, field_type))
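
# A sketch of what _args_save emits for the new header format: twelve int32
# fields followed by one float64, in the order given by _NEW_HEADER_FORMAT,
# i.e. 56 bytes in total.  Illustrative arithmetic only (on typical platforms
# no alignment padding is added):
#
#     >>> struct.calcsize('@12id')
#     56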


def _dict_save(fout, model, encoding):
    """
    Saves the dictionary from `model` to the binary stream `fout`, containing
    a model in Facebook's native fastText `.bin` format.

    Name mimics the original C++ implementation
    [Dictionary::save](https://github.com/facebookresearch/fastText/blob/master/src/dictionary.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the dictionary from the model is saved
    model: gensim.models.fasttext.FastText
        the model that contains the dictionary to save
    encoding: str
        string encoding used in the output
    """
    # In the FB format the dictionary can contain two types of entries, i.e.
    # words and labels.  The first two fields of the dictionary contain
    # the dictionary size (size_) and the number of words (nwords_).
    # In the unsupervised case we have only words (no labels), hence both
    # fields are equal.
    fout.write(np.int32(len(model.wv)).tobytes())
    fout.write(np.int32(len(model.wv)).tobytes())

    # nlabels=0 <- no labels, we are in unsupervised mode
    fout.write(np.int32(0).tobytes())

    fout.write(np.int64(model.corpus_total_words).tobytes())

    # prunedidx_size_=-1, a value of -1 denotes no pruning index (pruning is only supported in supervised mode)
    fout.write(np.int64(-1).tobytes())

    for word in model.wv.index_to_key:
        word_count = model.wv.get_vecattr(word, 'count')
        fout.write(word.encode(encoding))
        fout.write(_END_OF_WORD_MARKER)
        fout.write(np.int64(word_count).tobytes())
        fout.write(_DICT_WORD_ENTRY_TYPE_MARKER)

    # We are in the unsupervised case, therefore pruned_idx is empty, so we do not need to write anything else


def _input_save(fout, model):
    """
    Saves word and ngram vectors from `model` to the binary stream `fout`,
    containing a model in Facebook's native fastText `.bin` format.

    Corresponding C++ fastText code:
    [DenseMatrix::save](https://github.com/facebookresearch/fastText/blob/master/src/densematrix.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the vectors are saved
    model: gensim.models.fasttext.FastText
        the model that contains the vectors to save
    """
    vocab_n, vocab_dim = model.wv.vectors_vocab.shape
    ngrams_n, ngrams_dim = model.wv.vectors_ngrams.shape

    assert vocab_dim == ngrams_dim
    assert vocab_n == len(model.wv)
    assert ngrams_n == model.wv.bucket

    fout.write(struct.pack('@2q', vocab_n + ngrams_n, vocab_dim))
    fout.write(model.wv.vectors_vocab.tobytes())
    fout.write(model.wv.vectors_ngrams.tobytes())


def _output_save(fout, model):
    """
    Saves the output layer of `model` to the binary stream `fout`, containing
    a model in Facebook's native fastText `.bin` format.

    Corresponding C++ fastText code:
    [DenseMatrix::save](https://github.com/facebookresearch/fastText/blob/master/src/densematrix.cc)

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the output layer is saved
    model: gensim.models.fasttext.FastText
        the model that contains the output layer to save
    """
    if model.hs:
        hidden_output = model.syn1
    if model.negative:
        hidden_output = model.syn1neg

    hidden_n, hidden_dim = hidden_output.shape
    fout.write(struct.pack('@2q', hidden_n, hidden_dim))
    fout.write(hidden_output.tobytes())
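
# A minimal sketch of the output-layer serialization above, using a stand-in
# object instead of a trained gensim FastText model.  The attribute names
# follow what _output_save reads; everything else here is hypothetical:
#
#     >>> from types import SimpleNamespace
#     >>> fake = SimpleNamespace(hs=0, negative=5, syn1neg=np.zeros((4, 3), dtype=np.float32))
#     >>> buf = io.BytesIO()
#     >>> _output_save(buf, fake)
#     >>> len(buf.getvalue())  # 16-byte (rows, cols) header + 4 * 3 float32 values
#     64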


def _save_to_stream(model, fout, fb_fasttext_parameters, encoding):
    """
    Saves word embeddings to the binary stream `fout` using Facebook's native
    fasttext `.bin` format.

    Parameters
    ----------
    fout: writeable binary stream
        stream to which the word embeddings are saved
    model: gensim.models.fasttext.FastText
        the model that contains the word embeddings to save
    fb_fasttext_parameters: dictionary
        dictionary containing the parameters `lr_update_rate` and `word_ngrams`,
        which are unused by the gensim implementation and so have to be provided externally
    encoding: str
        encoding used in the output file
    """
    _sign_model(fout)
    _args_save(fout, model, fb_fasttext_parameters)
    _dict_save(fout, model, encoding)
    fout.write(struct.pack('@?', False))  # Save 'quant_', which is False for unsupervised models

    # Save words and ngrams vectors
    _input_save(fout, model)
    fout.write(struct.pack('@?', False))  # Save 'qout_', which is False for unsupervised models

    # Save output layers of the model
    _output_save(fout, model)


def save(model, fout, fb_fasttext_parameters, encoding):
    """
    Saves word embeddings to Facebook's native fasttext `.bin` format.

    Parameters
    ----------
    fout: file name or writeable binary stream
        stream to which the model is saved
    model: gensim.models.fasttext.FastText
        the model to save
    fb_fasttext_parameters: dictionary
        dictionary containing the parameters `lr_update_rate` and `word_ngrams`,
        which are unused by the gensim implementation and so have to be provided externally
    encoding: str
        encoding used in the output file

    Notes
    -----
    Unfortunately, there is no documentation of Facebook's native fasttext `.bin` format.
    This is just a reimplementation of
    [FastText::saveModel](https://github.com/facebookresearch/fastText/blob/master/src/fasttext.cc)

    Based on v0.9.1, more precisely commit da2745fcccb848c7a225a7d558218ee4c64d5333.

    Code follows the original C++ code naming.
    """
    if isinstance(fout, str):
        with open(fout, "wb") as fout_stream:
            _save_to_stream(model, fout_stream, fb_fasttext_parameters, encoding)
    else:
        _save_to_stream(model, fout, fb_fasttext_parameters, encoding)
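

# End-to-end usage sketch (hypothetical: `model` stands for a trained
# gensim.models.fasttext.FastText instance and 'model.bin' for an output path):
#
#     >>> params = {'lr_update_rate': 100, 'word_ngrams': 1}
#     >>> save(model, 'model.bin', params, 'utf-8')
#     >>> with open('model.bin', 'rb') as fin:
#     ...     roundtrip = load(fin)
#     >>> roundtrip.nwords == len(model.wv)
#     True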