#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2010 Radim Rehurek
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""Corpus in ``GibbsLda++`` format."""

import logging
from collections import Counter

from gensim import utils
from gensim.corpora import IndexedCorpus
from gensim.parsing.preprocessing import split_on_space

logger = logging.getLogger(__name__)


class LowCorpus(IndexedCorpus):
    """Corpus handles input in ``GibbsLda++`` format.

    **Format description**

    Both data for training/estimating the model and new data (i.e., previously unseen data) have the same format
    as follows ::

        [M]
        [document1]
        [document2]
        ...
        [documentM]

    in which the first line is the total number for documents [M]. Each line after that is one document.
    [documenti] is the ith document of the dataset that consists of a list of Ni words/terms ::

        [documenti] = [wordi1] [wordi2] ... [wordiNi]

    in which all [wordij] (i=1..M, j=1..Ni) are text strings and they are separated by the blank character.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.test.utils import get_tmpfile, common_texts
        >>> from gensim.corpora import LowCorpus
        >>> from gensim.corpora import Dictionary
        >>>
        >>> # Prepare needed data
        >>> dictionary = Dictionary(common_texts)
        >>> corpus = [dictionary.doc2bow(doc) for doc in common_texts]
        >>>
        >>> # Write corpus in GibbsLda++ format to disk
        >>> output_fname = get_tmpfile("corpus.low")
        >>> LowCorpus.serialize(output_fname, corpus, dictionary)
        >>>
        >>> # Read corpus
        >>> loaded_corpus = LowCorpus(output_fname)

    """

    def __init__(self, fname, id2word=None, line2words=split_on_space):
        """

        Parameters
        ----------
        fname : str
            Path to file in GibbsLda++ format.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided (or empty), the mapping is constructed directly from `fname`.
        line2words : callable, optional
            Function which converts lines(str) into tokens(list of str),
            using :func:`~gensim.parsing.preprocessing.split_on_space` as default.

        """
        IndexedCorpus.__init__(self, fname)
        logger.info("loading corpus from %s", fname)

        self.fname = fname  # input file, see class doc for format
        self.line2words = line2words  # how to translate lines into words (simply split on space by default)
        self.num_docs = self._calculate_num_docs()

        if not id2word:
            # build a list of all word types in the corpus (distinct words)
            logger.info("extracting vocabulary from the corpus")
            all_terms = set()
            # while the vocabulary is unknown, documents must come back as (word, wordCount) 2-tuples
            self.use_wordids = False
            for doc in self:
                all_terms.update(word for word, _ in doc)
            all_terms = sorted(all_terms)  # sort the list of all words; rank in that list = word's integer id
            # build a mapping of word id(int) -> word (string)
            self.id2word = dict(enumerate(all_terms))
        else:
            logger.info("using provided word mapping (%i ids)", len(id2word))
            self.id2word = id2word

        # `word2id` is (re)built by the `id2word` property setter
        self.num_terms = len(self.word2id)
        self.use_wordids = True  # return documents as (wordIndex, wordCount) 2-tuples

        logger.info(
            "loaded corpus with %i documents and %i terms from %s",
            self.num_docs, self.num_terms, fname
        )

    def _calculate_num_docs(self):
        """Get number of documents in file.

        Returns
        -------
        int
            Number of documents; 0 if the file is empty.

        Raises
        ------
        ValueError
            If the first line of the file cannot be parsed as an integer.

        """
        # the first line in input data is the number of documents (integer). throws exception on bad input.
        with utils.open(self.fname, 'rb') as fin:
            try:
                result = int(next(fin))
            except StopIteration:
                # completely empty file: no header line at all
                result = 0

        return result

    def __len__(self):
        """Get the number of documents, as declared on the first line of the input file."""
        return self.num_docs

    def line2doc(self, line):
        """Convert line into document in BoW format.

        Parameters
        ----------
        line : str
            Line from input file.

        Returns
        -------
        list of (int, int) or list of (str, int)
            Document in BoW format. Pairs are `(word_id, count)` when `self.use_wordids` is set,
            otherwise `(word, count)`.

        """
        words = self.line2words(line)

        if self.use_wordids:
            # the following creates a unique list of words *in the same order*
            # as they were in the input. when iterating over the documents,
            # the (word, count) pairs will appear in the same order as they
            # were in the input (bar duplicates), which looks better.
            # if this was not needed, we might as well have used use_words = set(words)
            use_words, counts = [], Counter()
            for word in words:
                if word not in self.word2id:
                    # words missing from the vocabulary are silently dropped
                    continue
                if word not in counts:
                    use_words.append(word)
                counts[word] += 1
            # construct a list of (wordIndex, wordFrequency) 2-tuples
            doc = [(self.word2id[w], counts[w]) for w in use_words]
        else:
            word_freqs = Counter(words)
            # construct a list of (word, wordFrequency) 2-tuples
            doc = list(word_freqs.items())

        # return the document, then forget it and move on to the next one
        # note that this way, only one doc is stored in memory at a time, not the whole corpus
        return doc

    def __iter__(self):
        """Iterate over the corpus.

        Yields
        ------
        list of (int, int) or list of (str, int)
            Document in BoW format, as produced by :meth:`line2doc`.

        """
        with utils.open(self.fname, 'rb') as fin:
            for lineno, line in enumerate(fin):
                if lineno > 0:  # ignore the first line = number of documents
                    yield self.line2doc(line)

    @staticmethod
    def save_corpus(fname, corpus, id2word=None, metadata=False):
        """Save a corpus in the GibbsLda++ format.

        Warnings
        --------
        This function is automatically called by :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize`,
        don't call it directly, call :meth:`gensim.corpora.lowcorpus.LowCorpus.serialize` instead.

        Parameters
        ----------
        fname : str
            Path to output file.
        corpus : iterable of iterable of (int, int)
            Corpus in BoW format. Must support `len()`, because the document count is written first.
        id2word : {dict of (int, str), :class:`~gensim.corpora.dictionary.Dictionary`}, optional
            Mapping between word_ids (integers) and words (strings).
            If not provided, the mapping is constructed directly from `corpus`.
        metadata : bool, optional
            THIS PARAMETER WILL BE IGNORED.

        Returns
        -------
        list of int
            List of offsets in resulting file for each document (in bytes),
            can be used for :meth:`~gensim.corpora.lowcorpus.LowCorpus.docbyoffset`

        """
        if id2word is None:
            logger.info("no word id mapping provided; initializing from corpus")
            id2word = utils.dict_from_corpus(corpus)

        # pass `fname` as a logger argument so formatting is deferred until the record is emitted
        logger.info("storing corpus in List-Of-Words format into %s", fname)
        truncated = 0
        offsets = []
        with utils.open(fname, 'wb') as fout:
            # header line: total number of documents
            fout.write(utils.to_utf8('%i\n' % len(corpus)))
            for doc in corpus:
                words = []
                for wordid, value in doc:
                    count = int(value)
                    if abs(count - value) > 1e-6:
                        # the format stores repeated words only, so fractional weights are lost
                        truncated += 1
                    words.extend([utils.to_unicode(id2word[wordid])] * count)
                # remember where this document starts, before writing it
                offsets.append(fout.tell())
                fout.write(utils.to_utf8('%s\n' % ' '.join(words)))

        if truncated:
            logger.warning(
                "List-of-words format can only save vectors with integer elements; "
                "%i float entries were truncated to integer value", truncated
            )
        return offsets

    def docbyoffset(self, offset):
        """Get the document stored in file by `offset` position.

        Parameters
        ----------
        offset : int
            Offset (in bytes) to the beginning of the document.

        Returns
        -------
        list of (int, int)
            Document in BoW format.

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.corpora import LowCorpus
            >>>
            >>> data = LowCorpus(datapath("testcorpus.low"))
            >>> data.docbyoffset(1)  # end of first line
            []
            >>> data.docbyoffset(2)  # start of second line
            [(0, 1), (3, 1), (4, 1)]

        """
        with utils.open(self.fname, 'rb') as f:
            f.seek(offset)
            return self.line2doc(f.readline())

    @property
    def id2word(self):
        """Get mapping between words and their ids."""
        return self._id2word

    @id2word.setter
    def id2word(self, val):
        """Set the id -> word mapping and rebuild the reverse `word2id` mapping from it."""
        self._id2word = val
        self.word2id = utils.revdict(val)