#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains methods for parsing and preprocessing strings.

Examples
--------

.. sourcecode:: pycon

    >>> from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
    >>> remove_stopwords("Better late than never, but better never late.")
    u'Better late never, better late.'
    >>>
    >>> preprocess_string("Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?")
    [u'hel', u'rld', u'weather', u'todai', u'isn']

"""

import re
import string
import glob

from gensim import utils
from gensim.parsing.porter import PorterStemmer


# Default stopword set; a frozenset so membership tests are O(1) and the
# shared default cannot be mutated by callers.
STOPWORDS = frozenset([
    'all', 'six', 'just', 'less', 'being', 'indeed', 'over', 'move', 'anyway', 'four', 'not', 'own', 'through',
    'using', 'fifty', 'where', 'mill', 'only', 'find', 'before', 'one', 'whose', 'system', 'how', 'somewhere',
    'much', 'thick', 'show', 'had', 'enough', 'should', 'to', 'must', 'whom', 'seeming', 'yourselves', 'under',
    'ours', 'two', 'has', 'might', 'thereafter', 'latterly', 'do', 'them', 'his', 'around', 'than', 'get', 'very',
    'de', 'none', 'cannot', 'every', 'un', 'they', 'front', 'during', 'thus', 'now', 'him', 'nor', 'name',
    'regarding', 'several', 'hereafter', 'did', 'always', 'who', 'didn', 'whither', 'this', 'someone', 'either',
    'each', 'become', 'thereupon', 'sometime', 'side', 'towards', 'therein', 'twelve', 'because', 'often', 'ten',
    'our', 'doing', 'km', 'eg', 'some', 'back', 'used', 'up', 'go', 'namely', 'computer', 'are', 'further',
    'beyond', 'ourselves', 'yet', 'out', 'even', 'will', 'what', 'still', 'for', 'bottom', 'mine', 'since',
    'please', 'forty', 'per', 'its', 'everything', 'behind', 'does', 'various', 'above', 'between', 'it',
    'neither', 'seemed', 'ever', 'across', 'she', 'somehow', 'be', 'we', 'full', 'never', 'sixty', 'however',
    'here', 'otherwise', 'were', 'whereupon', 'nowhere', 'although', 'found', 'alone', 're', 'along', 'quite',
    'fifteen', 'by', 'both', 'about', 'last', 'would', 'anything', 'via', 'many', 'could', 'thence', 'put',
    'against', 'keep', 'etc', 'amount', 'became', 'ltd', 'hence', 'onto', 'or', 'con', 'among', 'already', 'co',
    'afterwards', 'formerly', 'within', 'seems', 'into', 'others', 'while', 'whatever', 'except', 'down', 'hers',
    'everyone', 'done', 'least', 'another', 'whoever', 'moreover', 'couldnt', 'throughout', 'anyhow', 'yourself',
    'three', 'from', 'her', 'few', 'together', 'top', 'there', 'due', 'been', 'next', 'anyone', 'eleven', 'cry',
    'call', 'therefore', 'interest', 'then', 'thru', 'themselves', 'hundred', 'really', 'sincere', 'empty', 'more',
    'himself', 'elsewhere', 'mostly', 'on', 'fire', 'am', 'becoming', 'hereby', 'amongst', 'else', 'part',
    'everywhere', 'too', 'kg', 'herself', 'former', 'those', 'he', 'me', 'myself', 'made', 'twenty', 'these',
    'was', 'bill', 'cant', 'us', 'until', 'besides', 'nevertheless', 'below', 'anywhere', 'nine', 'can', 'whether',
    'of', 'your', 'toward', 'my', 'say', 'something', 'and', 'whereafter', 'whenever', 'give', 'almost',
    'wherever', 'is', 'describe', 'beforehand', 'herein', 'doesn', 'an', 'as', 'itself', 'at', 'have', 'in',
    'seem', 'whence', 'ie', 'any', 'fill', 'again', 'hasnt', 'inc', 'thereby', 'thin', 'no', 'perhaps', 'latter',
    'meanwhile', 'when', 'detail', 'same', 'wherein', 'beside', 'also', 'that', 'other', 'take', 'which',
    'becomes', 'you', 'if', 'nobody', 'unless', 'whereas', 'see', 'though', 'may', 'after', 'upon', 'most',
    'hereupon', 'eight', 'but', 'serious', 'nothing', 'such', 'why', 'off', 'a', 'don', 'whereby', 'third', 'i',
    'whole', 'noone', 'sometimes', 'well', 'amoungst', 'yours', 'their', 'rather', 'without', 'so', 'five', 'the',
    'first', 'with', 'make', 'once'
])


# Patterns are compiled once at import time and shared by the filters below.
RE_PUNCT = re.compile(r'([%s])+' % re.escape(string.punctuation), re.UNICODE)
RE_TAGS = re.compile(r"<([^>]+)>", re.UNICODE)
RE_NUMERIC = re.compile(r"[0-9]+", re.UNICODE)
RE_NONALPHA = re.compile(r"\W", re.UNICODE)
# NOTE: both alpha/digit boundary patterns match lowercase ASCII letters only.
RE_AL_NUM = re.compile(r"([a-z]+)([0-9]+)", flags=re.UNICODE)
RE_NUM_AL = re.compile(r"([0-9]+)([a-z]+)", flags=re.UNICODE)
RE_WHITESPACE = re.compile(r"(\s)+", re.UNICODE)


def remove_stopwords(s, stopwords=None):
    """Remove :const:`~gensim.parsing.preprocessing.STOPWORDS` from `s`.

    Parameters
    ----------
    s : str
    stopwords : iterable of str, optional
        Sequence of stopwords
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`

    Returns
    -------
    str
        Unicode string without `stopwords`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import remove_stopwords
        >>> remove_stopwords("Better late than never, but better never late.")
        u'Better late never, better late.'

    """
    s = utils.to_unicode(s)
    return " ".join(remove_stopword_tokens(s.split(), stopwords))


def remove_stopword_tokens(tokens, stopwords=None):
    """Remove stopword tokens using list `stopwords`.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    stopwords : iterable of str, optional
        Sequence of stopwords
        If None - using :const:`~gensim.parsing.preprocessing.STOPWORDS`

    Returns
    -------
    list of str
        List of tokens without `stopwords`.

    Notes
    -----
    Matching is an exact, case-sensitive membership test against `stopwords`.

    """
    if stopwords is None:
        stopwords = STOPWORDS
    return [token for token in tokens if token not in stopwords]


def strip_punctuation(s):
    """Replace ASCII punctuation characters with spaces in `s` using
    :const:`~gensim.parsing.preprocessing.RE_PUNCT`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without punctuation characters.

    Notes
    -----
    Each run of consecutive punctuation characters is replaced by a single space.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_punctuation
        >>> strip_punctuation("A semicolon is a stronger break than a comma, but not as much as a full stop!")
        u'A semicolon is a stronger break than a comma  but not as much as a full stop '

    """
    s = utils.to_unicode(s)
    # For unicode enhancement options see https://github.com/RaRe-Technologies/gensim/issues/2962
    return RE_PUNCT.sub(" ", s)


def strip_tags(s):
    """Remove tags from `s` using :const:`~gensim.parsing.preprocessing.RE_TAGS`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without tags.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_tags
        >>> strip_tags("<i>Hello</i> <b>World</b>!")
        u'Hello World!'

    """
    s = utils.to_unicode(s)
    return RE_TAGS.sub("", s)


def strip_short(s, minsize=3):
    """Remove words with length lesser than `minsize` from `s`.

    Parameters
    ----------
    s : str
    minsize : int, optional

    Returns
    -------
    str
        Unicode string without short words.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_short
        >>> strip_short("salut les amis du 59")
        u'salut les amis'
        >>>
        >>> strip_short("one two three four five six seven eight nine ten", minsize=5)
        u'three seven eight'

    """
    s = utils.to_unicode(s)
    return " ".join(remove_short_tokens(s.split(), minsize))


def remove_short_tokens(tokens, minsize=3):
    """Remove tokens shorter than `minsize` chars.

    Parameters
    ----------
    tokens : iterable of str
        Sequence of tokens.
    minsize : int, optional
        Minimal length of token (inclusive): tokens of exactly `minsize` chars are kept.

    Returns
    -------
    list of str
        List of tokens without short tokens.

    """
    return [token for token in tokens if len(token) >= minsize]


def strip_numeric(s):
    """Remove digits from `s` using :const:`~gensim.parsing.preprocessing.RE_NUMERIC`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without digits.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_numeric
        >>> strip_numeric("0text24gensim365test")
        u'textgensimtest'

    """
    s = utils.to_unicode(s)
    return RE_NUMERIC.sub("", s)


def strip_non_alphanum(s):
    """Replace non-word characters in `s` with spaces using
    :const:`~gensim.parsing.preprocessing.RE_NONALPHA`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string in which every non-word character has been replaced by a space.

    Notes
    -----
    Word characters - alphanumeric & underscore.
    Each non-word character is replaced individually, so runs are not collapsed.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_non_alphanum
        >>> strip_non_alphanum("if-you#can%read$this&then@this#method^works")
        u'if you can read this then this method works'

    """
    s = utils.to_unicode(s)
    return RE_NONALPHA.sub(" ", s)


def strip_multiple_whitespaces(s):
    r"""Remove repeating whitespace characters (spaces, tabs, line breaks) from `s`
    and turns tabs & line breaks into spaces using :const:`~gensim.parsing.preprocessing.RE_WHITESPACE`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string without repeating in a row whitespace characters.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import strip_multiple_whitespaces
        >>> strip_multiple_whitespaces("salut" + '\r' + " les" + '\n' + "         loulous!")
        u'salut les loulous!'

    """
    s = utils.to_unicode(s)
    return RE_WHITESPACE.sub(" ", s)


def split_alphanum(s):
    """Add spaces between digits & letters in `s` using :const:`~gensim.parsing.preprocessing.RE_AL_NUM`
    and :const:`~gensim.parsing.preprocessing.RE_NUM_AL`.

    Parameters
    ----------
    s : str

    Returns
    -------
    str
        Unicode string with spaces between digits & letters.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import split_alphanum
        >>> split_alphanum("24.0hours7 days365 a1b2c3")
        u'24.0 hours 7 days 365 a 1 b 2 c 3'

    """
    s = utils.to_unicode(s)
    # First split letter->digit boundaries, then digit->letter boundaries.
    s = RE_AL_NUM.sub(r"\1 \2", s)
    return RE_NUM_AL.sub(r"\1 \2", s)


def stem_text(text):
    """Transform `text` into lowercase and stem it.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        Unicode lowercased and porter-stemmed version of string `text`.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import stem_text
        >>> stem_text("While it is quite useful to be able to search a large collection of documents almost instantly.")
        u'while it is quit us to be abl to search a larg collect of document almost instantly.'

    """
    text = utils.to_unicode(text)
    p = PorterStemmer()
    # Words are whitespace-delimited; each one is stemmed independently.
    return ' '.join(p.stem(word) for word in text.split())


# Short alias for stem_text.
stem = stem_text


def lower_to_unicode(text, encoding='utf8', errors='strict'):
    """Lowercase `text` and convert to unicode, using :func:`gensim.utils.to_unicode`.

    Parameters
    ----------
    text : str
        Input text.
    encoding : str, optional
        Encoding that will be used for conversion.
    errors : str, optional
        Error handling behaviour, passed through to :func:`gensim.utils.to_unicode`.

    Returns
    -------
    str
        Lowercased unicode version of `text`.

    See Also
    --------
    :func:`gensim.utils.to_unicode`
        Convert any string to unicode-string.

    """
    # Decode first, then lowercase: lowercasing raw bytes only affects ASCII
    # letters (so e.g. UTF-8 encoded 'É' would stay uppercase) and can corrupt
    # code units in multi-byte encodings such as UTF-16.
    return utils.to_unicode(text, encoding, errors).lower()


def split_on_space(s):
    """Split line by spaces, used in :class:`gensim.corpora.lowcorpus.LowCorpus`.

    Parameters
    ----------
    s : str
        Some line.

    Returns
    -------
    list of str
        List of tokens from `s`.

    Notes
    -----
    Only the space character is a separator; empty tokens produced by
    consecutive spaces are dropped.

    """
    return [word for word in utils.to_unicode(s).strip().split(' ') if word]


# Filters applied, in this order, by preprocess_string when none are given.
DEFAULT_FILTERS = [
    lambda x: x.lower(), strip_tags, strip_punctuation,
    strip_multiple_whitespaces, strip_numeric,
    remove_stopwords, strip_short, stem_text
]


def preprocess_string(s, filters=DEFAULT_FILTERS):
    """Apply list of chosen filters to `s`.

    Default list of filters:

    * lowercasing (``lambda x: x.lower()``),
    * :func:`~gensim.parsing.preprocessing.strip_tags`,
    * :func:`~gensim.parsing.preprocessing.strip_punctuation`,
    * :func:`~gensim.parsing.preprocessing.strip_multiple_whitespaces`,
    * :func:`~gensim.parsing.preprocessing.strip_numeric`,
    * :func:`~gensim.parsing.preprocessing.remove_stopwords`,
    * :func:`~gensim.parsing.preprocessing.strip_short`,
    * :func:`~gensim.parsing.preprocessing.stem_text`.

    Parameters
    ----------
    s : str
    filters: list of functions, optional
        Callables applied to the string one after another, in list order.

    Returns
    -------
    list of str
        Processed strings (cleaned).

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import preprocess_string
        >>> preprocess_string("Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?")
        [u'hel', u'rld', u'weather', u'todai', u'isn']
        >>>
        >>> s = "Hel 9lo Wo9 rld! Th3 weather_is really g00d today, isn't it?"
        >>> CUSTOM_FILTERS = [lambda x: x.lower(), strip_tags, strip_punctuation]
        >>> preprocess_string(s, CUSTOM_FILTERS)
        [u'hel', u'9lo', u'wo9', u'rld', u'th3', u'weather', u'is', u'really', u'g00d', u'today', u'isn', u't', u'it']

    """
    s = utils.to_unicode(s)
    # The default list is only iterated here, never mutated.
    for f in filters:
        s = f(s)
    return s.split()


def preprocess_documents(docs):
    """Apply :const:`~gensim.parsing.preprocessing.DEFAULT_FILTERS` to the documents strings.

    Parameters
    ----------
    docs : list of str

    Returns
    -------
    list of list of str
        Processed documents split by whitespace.

    Examples
    --------
    .. sourcecode:: pycon

        >>> from gensim.parsing.preprocessing import preprocess_documents
        >>> preprocess_documents(["Hel 9lo Wo9 rld!", "Th3 weather_is really g00d today, isn't it?"])
        [[u'hel', u'rld'], [u'weather', u'todai', u'isn']]

    """
    return [preprocess_string(d) for d in docs]


def read_file(path):
    """Read the whole content of the file at `path`.

    Parameters
    ----------
    path : str
        Path to the file, opened with :func:`gensim.utils.open` in binary mode (``'rb'``).

    Returns
    -------
    bytes
        Raw content of the file (presumably bytes, since the file is opened in binary mode).

    """
    with utils.open(path, 'rb') as fin:
        return fin.read()


def read_files(pattern):
    """Read the content of every file whose path matches the glob `pattern`.

    Parameters
    ----------
    pattern : str
        Pattern passed to :func:`glob.glob`.

    Returns
    -------
    list
        Content of each matching file as returned by
        :func:`~gensim.parsing.preprocessing.read_file`, in the order yielded by :func:`glob.glob`.

    """
    return [read_file(fname) for fname in glob.glob(pattern)]