#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Automated tests for the parsing module.
"""

import logging
import unittest

import mock
import numpy as np

from gensim.parsing.preprocessing import (
    remove_short_tokens,
    remove_stopword_tokens,
    remove_stopwords,
    stem_text,
    split_alphanum,
    split_on_space,
    strip_multiple_whitespaces,
    strip_non_alphanum,
    strip_numeric,
    strip_punctuation,
    strip_short,
    strip_tags,
)

# several documents
doc1 = """C'est un trou de verdure où chante une rivière,
Accrochant follement aux herbes des haillons
D'argent ; où le soleil, de la montagne fière,
Luit : c'est un petit val qui mousse de rayons."""

doc2 = """Un soldat jeune, bouche ouverte, tête nue,
Et la nuque baignant dans le frais cresson bleu,
Dort ; il est étendu dans l'herbe, sous la nue,
Pâle dans son lit vert où la lumière pleut."""

doc3 = """Les pieds dans les glaïeuls, il dort. Souriant comme
Sourirait un enfant malade, il fait un somme :
Nature, berce-le chaudement : il a froid."""

doc4 = """Les parfums ne font pas frissonner sa narine ;
Il dort dans le soleil, la main sur sa poitrine,
Tranquille. Il a deux trous rouges au côté droit."""

doc5 = """While it is quite useful to be able to search a
large collection of documents almost instantly for a joint
occurrence of a collection of exact words,
for many searching purposes, a little fuzziness would help. """

# Lower-cased, punctuation-stripped versions of the four French documents.
dataset = [strip_punctuation(x.lower()) for x in [doc1, doc2, doc3, doc4]]
# doc1 and doc2 have class 0, doc3 and doc4 have class 1
classes = np.array([[1, 0], [1, 0], [0, 1], [0, 1]])


class TestPreprocessing(unittest.TestCase):
    """Input/output checks for the `gensim.parsing.preprocessing` helpers.

    Each test feeds one short literal input to a single helper and compares
    the result against a hard-coded expected value.
    """

    def test_strip_numeric(self):
        self.assertEqual(strip_numeric("salut les amis du 59"), "salut les amis du ")

    def test_strip_short(self):
        self.assertEqual(strip_short("salut les amis du 59", 3), "salut les amis")

    def test_strip_tags(self):
        # The input has to contain real markup; a tag-free string would make
        # this assertion pass even if `strip_tags` did nothing at all.
        self.assertEqual(strip_tags("<i>Hello</i> <b>World</b>!"), "Hello World!")

    def test_strip_multiple_whitespaces(self):
        self.assertEqual(strip_multiple_whitespaces("salut les\r\nloulous!"), "salut les loulous!")

    def test_strip_non_alphanum(self):
        self.assertEqual(strip_non_alphanum("toto nf-kappa titi"), "toto nf kappa titi")

    def test_split_alphanum(self):
        # Digits are separated from letters whichever side they appear on.
        self.assertEqual(split_alphanum("toto diet1 titi"), "toto diet 1 titi")
        self.assertEqual(split_alphanum("toto 1diet titi"), "toto 1 diet titi")

    def test_strip_stopwords(self):
        self.assertEqual(remove_stopwords("the world is square"), "world square")

        # confirm that redefining the global `STOPWORDS` takes effect
        with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])):
            self.assertEqual(remove_stopwords("the world is square"), "world is square")

    def test_strip_stopword_tokens(self):
        self.assertEqual(remove_stopword_tokens(["the", "world", "is", "sphere"]), ["world", "sphere"])

        # confirm that redefining the global `STOPWORDS` takes effect
        with mock.patch('gensim.parsing.preprocessing.STOPWORDS', frozenset(["the"])):
            self.assertEqual(
                remove_stopword_tokens(["the", "world", "is", "sphere"]),
                ["world", "is", "sphere"]
            )

    def test_strip_short_tokens(self):
        self.assertEqual(remove_short_tokens(["salut", "les", "amis", "du", "59"], 3), ["salut", "les", "amis"])

    def test_split_on_space(self):
        self.assertEqual(split_on_space(" salut les amis du 59 "), ["salut", "les", "amis", "du", "59"])

    def test_stem_text(self):
        # Expected stemmed form of `doc5`, written as adjacent literals so the
        # long sentence stays readable without backslash continuations.
        target = (
            "while it is quit us to be abl to search a larg "
            "collect of document almost instantli for a joint occurr "
            "of a collect of exact words, for mani search purposes, "
            "a littl fuzzi would help."
        )
        self.assertEqual(stem_text(doc5), target)


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    unittest.main()