#!/usr/bin/env python # -*- coding: utf-8 -*- # # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Corpus in `Mallet format `_.""" from __future__ import with_statement import logging from gensim import utils from gensim.corpora import LowCorpus logger = logging.getLogger(__name__) class MalletCorpus(LowCorpus): """Corpus handles input in `Mallet format `_. **Format description** One file, one instance per line, assume the data is in the following format :: [URL] [language] [text of the page...] Or, more generally, :: [document #1 id] [label] [text of the document...] [document #2 id] [label] [text of the document...] ... [document #N id] [label] [text of the document...] Note that language/label is *not* considered in Gensim, used `__unknown__` as default value. Examples -------- .. sourcecode:: pycon >>> from gensim.test.utils import get_tmpfile, common_texts >>> from gensim.corpora import MalletCorpus >>> from gensim.corpora import Dictionary >>> >>> # Prepare needed data >>> dictionary = Dictionary(common_texts) >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts] >>> >>> # Write corpus in Mallet format to disk >>> output_fname = get_tmpfile("corpus.mallet") >>> MalletCorpus.serialize(output_fname, corpus, dictionary) >>> >>> # Read corpus >>> loaded_corpus = MalletCorpus(output_fname) """ def __init__(self, fname, id2word=None, metadata=False): """ Parameters ---------- fname : str Path to file in Mallet format. id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional Mapping between word_ids (integers) and words (strings). If not provided, the mapping is constructed directly from `fname`. metadata : bool, optional If True, return additional information ("document id" and "lang" when you call :meth:`~gensim.corpora.malletcorpus.MalletCorpus.line2doc`, :meth:`~gensim.corpora.malletcorpus.MalletCorpus.__iter__` or :meth:`~gensim.corpora.malletcorpus.MalletCorpus.docbyoffset` """ self.metadata = metadata LowCorpus.__init__(self, fname, id2word) def _calculate_num_docs(self): """Get number of documents. Returns ------- int Number of documents in file. """ with utils.open(self.fname, 'rb') as fin: result = sum(1 for _ in fin) return result def __iter__(self): """Iterate over the corpus. Yields ------ list of (int, int) Document in BoW format (+"document_id" and "lang" if metadata=True). """ with utils.open(self.fname, 'rb') as f: for line in f: yield self.line2doc(line) def line2doc(self, line): """Covert line into document in BoW format. Parameters ---------- line : str Line from input file. Returns ------- list of (int, int) Document in BoW format (+"document_id" and "lang" if metadata=True). Examples -------- .. sourcecode:: pycon >>> from gensim.test.utils import datapath >>> from gensim.corpora import MalletCorpus >>> >>> corpus = MalletCorpus(datapath("testcorpus.mallet")) >>> corpus.line2doc("en computer human interface") [(3, 1), (4, 1)] """ split_line = utils.to_unicode(line).strip().split(None, 2) docid, doclang = split_line[0], split_line[1] words = split_line[2] if len(split_line) >= 3 else '' doc = super(MalletCorpus, self).line2doc(words) if self.metadata: return doc, (docid, doclang) else: return doc @staticmethod def save_corpus(fname, corpus, id2word=None, metadata=False): """Save a corpus in the Mallet format. Warnings -------- This function is automatically called by :meth:`gensim.corpora.malletcorpus.MalletCorpus.serialize`, don't call it directly, call :meth:`gensim.corpora.lowcorpus.malletcorpus.MalletCorpus.serialize` instead. Parameters ---------- fname : str Path to output file. corpus : iterable of iterable of (int, int) Corpus in BoW format. id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional Mapping between word_ids (integers) and words (strings). If not provided, the mapping is constructed directly from `corpus`. metadata : bool, optional If True - ???? Return ------ list of int List of offsets in resulting file for each document (in bytes), can be used for :meth:`~gensim.corpora.malletcorpus.Malletcorpus.docbyoffset`. Notes ----- The document id will be generated by enumerating the corpus. That is, it will range between 0 and number of documents in the corpus. Since Mallet has a language field in the format, this defaults to the string '__unknown__'. If the language needs to be saved, post-processing will be required. """ if id2word is None: logger.info("no word id mapping provided; initializing from corpus") id2word = utils.dict_from_corpus(corpus) logger.info("storing corpus in Mallet format into %s", fname) truncated = 0 offsets = [] with utils.open(fname, 'wb') as fout: for doc_id, doc in enumerate(corpus): if metadata: doc_id, doc_lang = doc[1] doc = doc[0] else: doc_lang = '__unknown__' words = [] for wordid, value in doc: if abs(int(value) - value) > 1e-6: truncated += 1 words.extend([utils.to_unicode(id2word[wordid])] * int(value)) offsets.append(fout.tell()) fout.write(utils.to_utf8('%s %s %s\n' % (doc_id, doc_lang, ' '.join(words)))) if truncated: logger.warning( "Mallet format can only save vectors with integer elements; " "%i float entries were truncated to integer value", truncated ) return offsets def docbyoffset(self, offset): """Get the document stored in file by `offset` position. Parameters ---------- offset : int Offset (in bytes) to begin of document. Returns ------- list of (int, int) Document in BoW format (+"document_id" and "lang" if metadata=True). Examples -------- .. sourcecode:: pycon >>> from gensim.test.utils import datapath >>> from gensim.corpora import MalletCorpus >>> >>> data = MalletCorpus(datapath("testcorpus.mallet")) >>> data.docbyoffset(1) # end of first line [(3, 1), (4, 1)] >>> data.docbyoffset(4) # start of second line [(4, 1)] """ with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline())