#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Corpus in the `Matrix Market format `_.""" import logging from gensim import matutils from gensim.corpora import IndexedCorpus logger = logging.getLogger(__name__) class MmCorpus(matutils.MmReader, IndexedCorpus): """Corpus serialized using the `sparse coordinate Matrix Market format `_. Wrap a term-document matrix on disk (in matrix-market format), and present it as an object which supports iteration over the matrix rows (~documents). Notes ----- The file is read into memory one document at a time, not the whole matrix at once, unlike e.g. `scipy.io.mmread` and other implementations. This allows you to **process corpora which are larger than the available RAM**, in a streamed manner. Example -------- .. sourcecode:: pycon >>> from gensim.corpora.mmcorpus import MmCorpus >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm')) >>> for document in corpus: ... pass """ def __init__(self, fname): """ Parameters ---------- fname : {str, file-like object} Path to file in MM format or a file-like object that supports `seek()` (e.g. a compressed file opened by `smart_open `_). """ # avoid calling super(), too confusing IndexedCorpus.__init__(self, fname) matutils.MmReader.__init__(self, fname) def __iter__(self): """Iterate through all documents. Yields ------ list of (int, numeric) Document in the `sparse Gensim bag-of-words format `__. Notes ------ The total number of vectors returned is always equal to the number of rows specified in the header. Empty documents are inserted and yielded where appropriate, even if they are not explicitly stored in the (sparse) Matrix Market file. """ for doc_id, doc in super(MmCorpus, self).__iter__(): yield doc # get rid of doc id, return the sparse vector only @staticmethod def save_corpus(fname, corpus, id2word=None, progress_cnt=1000, metadata=False): """Save a corpus to disk in the sparse coordinate Matrix Market format. Parameters ---------- fname : str Path to file. corpus : iterable of list of (int, number) Corpus in Bow format. id2word : dict of (int, str), optional Mapping between word_id -> word. Used to retrieve the total vocabulary size if provided. Otherwise, the total vocabulary size is estimated based on the highest feature id encountered in `corpus`. progress_cnt : int, optional How often to report (log) progress. metadata : bool, optional Writes out additional metadata? Warnings -------- This function is automatically called by :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize`, don't call it directly, call :class:`~gensim.corpora.mmcorpus.MmCorpus.serialize` instead. Example ------- .. sourcecode:: pycon >>> from gensim.corpora.mmcorpus import MmCorpus >>> from gensim.test.utils import datapath >>> >>> corpus = MmCorpus(datapath('test_mmcorpus_with_index.mm')) >>> >>> MmCorpus.save_corpus("random", corpus) # Do not do it, use `serialize` instead. [97, 121, 169, 201, 225, 249, 258, 276, 303] """ logger.info("storing corpus in Matrix Market format to %s", fname) num_terms = len(id2word) if id2word is not None else None return matutils.MmWriter.write_corpus( fname, corpus, num_terms=num_terms, index=True, progress_cnt=progress_cnt, metadata=metadata )