#!/usr/bin/env python # -*- coding: utf-8 -*- # # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html """Corpus in SVMlight format.""" from __future__ import with_statement import logging from gensim import utils from gensim.corpora import IndexedCorpus logger = logging.getLogger(__name__) class SvmLightCorpus(IndexedCorpus): """Corpus in SVMlight format. Quoting http://svmlight.joachims.org/: The input file contains the training examples. The first lines may contain comments and are ignored if they start with #. Each of the following lines represents one training example and is of the following format:: .=. : : ... : # .=. +1 | -1 | 0 | .=. | "qid" .=. .=. The "qid" feature (used for SVMlight ranking), if present, is ignored. Notes ----- Although not mentioned in the specification above, SVMlight also expect its feature ids to be 1-based (counting starts at 1). We convert features to 0-base internally by decrementing all ids when loading a SVMlight input file, and increment them again when saving as SVMlight. """ def __init__(self, fname, store_labels=True): """ Parameters ---------- fname: str Path to corpus. store_labels : bool, optional Whether to store labels (~SVM target class). They currently have no application but stored in `self.labels` for convenience by default. """ IndexedCorpus.__init__(self, fname) logger.info("loading corpus from %s", fname) self.fname = fname # input file, see class doc for format self.length = None self.store_labels = store_labels self.labels = [] def __iter__(self): """ Iterate over the corpus, returning one sparse (BoW) vector at a time. Yields ------ list of (int, float) Document in BoW format. """ lineno = -1 self.labels = [] with utils.open(self.fname, 'rb') as fin: for lineno, line in enumerate(fin): doc = self.line2doc(line) if doc is not None: if self.store_labels: self.labels.append(doc[1]) yield doc[0] self.length = lineno + 1 @staticmethod def save_corpus(fname, corpus, id2word=None, labels=False, metadata=False): """Save a corpus in the SVMlight format. The SVMlight `` class tag is taken from the `labels` array, or set to 0 for all documents if `labels` is not supplied. Parameters ---------- fname : str Path to output file. corpus : iterable of iterable of (int, float) Corpus in BoW format. id2word : dict of (str, str), optional Mapping id -> word. labels : list or False An SVMlight `` class tags or False if not present. metadata : bool ARGUMENT WILL BE IGNORED. Returns ------- list of int Offsets for each line in file (in bytes). """ logger.info("converting corpus to SVMlight format: %s", fname) if labels is not False: # Cast any sequence (incl. a numpy array) to a list, to simplify the processing below. labels = list(labels) offsets = [] with utils.open(fname, 'wb') as fout: for docno, doc in enumerate(corpus): label = labels[docno] if labels else 0 # target class is 0 by default offsets.append(fout.tell()) fout.write(utils.to_utf8(SvmLightCorpus.doc2line(doc, label))) return offsets def docbyoffset(self, offset): """Get the document stored at file position `offset`. Parameters ---------- offset : int Document's position. Returns ------- tuple of (int, float) """ with utils.open(self.fname, 'rb') as f: f.seek(offset) return self.line2doc(f.readline())[0] # TODO: it brakes if gets None from line2doc def line2doc(self, line): """Get a document from a single line in SVMlight format. This method inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.doc2line`. Parameters ---------- line : str Line in SVMLight format. Returns ------- (list of (int, float), str) Document in BoW format and target class label. """ line = utils.to_unicode(line) line = line[: line.find('#')].strip() if not line: return None # ignore comments and empty lines parts = line.split() if not parts: raise ValueError('invalid line format in %s' % self.fname) target, fields = parts[0], [part.rsplit(':', 1) for part in parts[1:]] # ignore 'qid' features, convert 1-based feature ids to 0-based doc = [(int(p1) - 1, float(p2)) for p1, p2 in fields if p1 != 'qid'] return doc, target @staticmethod def doc2line(doc, label=0): """Convert BoW representation of document in SVMlight format. This method inverse of :meth:`~gensim.corpora.svmlightcorpus.SvmLightCorpus.line2doc`. Parameters ---------- doc : list of (int, float) Document in BoW format. label : int, optional Document label (if provided). Returns ------- str `doc` in SVMlight format. """ pairs = ' '.join("%i:%s" % (termid + 1, termval) for termid, termval in doc) # +1 to convert 0-base to 1-base return "%s %s\n" % (label, pairs)