""" UDHR corpus reader. It mostly deals with encodings. """ from nltk.corpus.reader.plaintext import PlaintextCorpusReader from nltk.corpus.reader.util import find_corpus_fileids class UdhrCorpusReader(PlaintextCorpusReader): ENCODINGS = [ (".*-Latin1$", "latin-1"), (".*-Hebrew$", "hebrew"), (".*-Arabic$", "cp1256"), ("Czech_Cesky-UTF8", "cp1250"), # yeah (".*-Cyrillic$", "cyrillic"), (".*-SJIS$", "SJIS"), (".*-GB2312$", "GB2312"), (".*-Latin2$", "ISO-8859-2"), (".*-Greek$", "greek"), (".*-UTF8$", "utf-8"), ("Hungarian_Magyar-Unicode", "utf-16-le"), ("Amahuaca", "latin1"), ("Turkish_Turkce-Turkish", "latin5"), ("Lithuanian_Lietuviskai-Baltic", "latin4"), ("Japanese_Nihongo-EUC", "EUC-JP"), ("Japanese_Nihongo-JIS", "iso2022_jp"), ("Chinese_Mandarin-HZ", "hz"), (r"Abkhaz\-Cyrillic\+Abkh", "cp1251"), ] SKIP = { # The following files are not fully decodable because they # were truncated at wrong bytes: "Burmese_Myanmar-UTF8", "Japanese_Nihongo-JIS", "Chinese_Mandarin-HZ", "Chinese_Mandarin-UTF8", "Gujarati-UTF8", "Hungarian_Magyar-Unicode", "Lao-UTF8", "Magahi-UTF8", "Marathi-UTF8", "Tamil-UTF8", # Unfortunately, encodings required for reading # the following files are not supported by Python: "Vietnamese-VPS", "Vietnamese-VIQR", "Vietnamese-TCVN", "Magahi-Agra", "Bhojpuri-Agra", "Esperanto-T61", # latin3 raises an exception # The following files are encoded for specific fonts: "Burmese_Myanmar-WinResearcher", "Armenian-DallakHelv", "Tigrinya_Tigrigna-VG2Main", "Amharic-Afenegus6..60375", # ? "Navaho_Dine-Navajo-Navaho-font", # What are these? "Azeri_Azerbaijani_Cyrillic-Az.Times.Cyr.Normal0117", "Azeri_Azerbaijani_Latin-Az.Times.Lat0117", # The following files are unintended: "Czech-Latin2-err", "Russian_Russky-UTF8~", } def __init__(self, root="udhr"): fileids = find_corpus_fileids(root, r"(?!README|\.).*") super().__init__( root, [fileid for fileid in fileids if fileid not in self.SKIP], encoding=self.ENCODINGS, )