""" This package contains algorithms for extracting document representations from their raw bag-of-word counts. """ # bring model classes directly into package namespace, to save some typing from .coherencemodel import CoherenceModel # noqa:F401 from .hdpmodel import HdpModel # noqa:F401 from .ldamodel import LdaModel # noqa:F401 from .lsimodel import LsiModel # noqa:F401 from .tfidfmodel import TfidfModel # noqa:F401 from .rpmodel import RpModel # noqa:F401 from .logentropy_model import LogEntropyModel # noqa:F401 from .word2vec import Word2Vec, FAST_VERSION # noqa:F401 from .doc2vec import Doc2Vec # noqa:F401 from .keyedvectors import KeyedVectors # noqa:F401 from .ldamulticore import LdaMulticore # noqa:F401 from .phrases import Phrases # noqa:F401 from .normmodel import NormModel # noqa:F401 from .atmodel import AuthorTopicModel # noqa:F401 from .ldaseqmodel import LdaSeqModel # noqa:F401 from .fasttext import FastText # noqa:F401 from .translation_matrix import TranslationMatrix, BackMappingTranslationMatrix # noqa:F401 from .ensemblelda import EnsembleLda # noqa:F401 from .nmf import Nmf # noqa:F401 from gensim import interfaces, utils class VocabTransform(interfaces.TransformationABC): """ Remap feature ids to new values. Given a mapping between old ids and new ids (some old ids may be missing = these features are to be discarded), this will wrap a corpus so that iterating over `VocabTransform[corpus]` returns the same vectors but with the new ids. Old features that have no counterpart in the new ids are discarded. This can be used to filter vocabulary of a corpus "online": .. sourcecode:: pycon >>> old2new = {oldid: newid for newid, oldid in enumerate(ids_you_want_to_keep)} >>> vt = VocabTransform(old2new) >>> for vec_with_new_ids in vt[corpus_with_old_ids]: >>> pass """ def __init__(self, old2new, id2token=None): self.old2new = old2new self.id2token = id2token def __getitem__(self, bow): """ Return representation with the ids transformed. """ # if the input vector is in fact a corpus, return a transformed corpus as a result is_corpus, bow = utils.is_corpus(bow) if is_corpus: return self._apply(bow) return sorted((self.old2new[oldid], weight) for oldid, weight in bow if oldid in self.old2new)