# Natural Language Toolkit: Language ID module using TextCat algorithm
#
# Copyright (C) 2001-2022 NLTK Project
# Author: Avital Pekker
#
# URL:
# For license information, see LICENSE.TXT

"""
A module for language identification using the TextCat algorithm.
An implementation of the text categorization algorithm
presented in Cavnar, W. B. and J. M. Trenkle,
"N-Gram-Based Text Categorization".

The algorithm takes advantage of Zipf's law and uses
n-gram frequencies to profile both the known languages and the
text yet to be identified, then compares the profiles using a
distance measure.

Language n-grams are provided by the "An Crubadan"
project. A corpus reader was created separately to read
those files.

For details regarding the algorithm, see:
https://www.let.rug.nl/~vannoord/TextCat/textcat.pdf

For details about An Crubadan, see:
https://borel.slu.edu/crubadan/index.html
"""

from sys import maxsize

from nltk.util import trigrams

# Note: this is NOT "re" you're likely used to. The regex module
# is an alternative to the standard re module that supports
# Unicode codepoint properties with the \p{} syntax.
# You may have to "pip install regex"
try:
    import regex as re
except ImportError:
    re = None

######################################################################
##  Language identification using TextCat
######################################################################


class TextCat:
    """Guess the language of a text by comparing trigram rank profiles.

    The text is reduced to a frequency distribution of character
    trigrams (see :meth:`profile`).  Every trigram's rank in that
    distribution is compared with its rank in each language profile
    supplied by the ``crubadan`` corpus reader; the language with the
    smallest summed rank difference wins (see :meth:`guess_language`).
    """

    _corpus = None
    fingerprints = {}
    # Markers wrapped around each token so that trigrams also capture
    # word-initial and word-final character sequences.
    _START_CHAR = "<"
    _END_CHAR = ">"

    # Distances computed by the most recent guess_language() call;
    # rebound on the instance each time, so the shared default is
    # never mutated.
    last_distances = {}

    # Per-instance cache {lang: {trigram: rank}}; created lazily so that
    # instances built without running __init__ still work.
    _lang_ranks = None

    def __init__(self):
        """Bind the crubadan corpus and pre-load every language profile.

        :raises OSError: if the third-party ``regex`` module could not
            be imported.
        """
        if not re:
            raise OSError(
                "classify.textcat requires the regex module that "
                "supports unicode. Try '$ pip install regex' and "
                "see https://pypi.python.org/pypi/regex for "
                "further details."
            )

        from nltk.corpus import crubadan

        self._corpus = crubadan
        self._lang_ranks = {}
        # Load all language ngrams into cache
        for lang in self._corpus.langs():
            self._corpus.lang_freq(lang)

    def remove_punctuation(self, text):
        """Get rid of punctuation except apostrophes"""
        return re.sub(r"[^\P{P}\']+", "", text)

    def profile(self, text):
        """Create FreqDist of trigrams within text

        Punctuation (other than apostrophes) is stripped, the text is
        tokenized, and each token is wrapped in the start/end markers
        before its character trigrams are counted.

        :param text: the raw text to profile
        :return: a ``FreqDist`` mapping trigram strings to their counts
        """
        from nltk import FreqDist, word_tokenize

        clean_text = self.remove_punctuation(text)

        fingerprint = FreqDist()
        for token in word_tokenize(clean_text):
            padded = self._START_CHAR + token + self._END_CHAR
            for tri in trigrams(padded):
                # Missing keys count as 0, so no membership test is needed.
                fingerprint["".join(tri)] += 1

        return fingerprint

    @staticmethod
    def _rank_map(freqs):
        """Map each n-gram in ``freqs`` to its rank by descending count."""
        # sorted() is stable even with reverse=True, so n-grams with equal
        # counts keep their original relative order; a profile that is
        # already stored in frequency order keeps exactly its positions.
        ordered = sorted(freqs, key=freqs.get, reverse=True)
        return {ngram: rank for rank, ngram in enumerate(ordered)}

    def _lang_rank(self, lang):
        """Return the cached rank table for ``lang``, building it on first use."""
        if self._lang_ranks is None:
            self._lang_ranks = {}
        ranks = self._lang_ranks.get(lang)
        if ranks is None:
            ranks = self._rank_map(self._corpus.lang_freq(lang))
            self._lang_ranks[lang] = ranks
        return ranks

    @staticmethod
    def _out_of_place(lang_rank, text_rank, trigram):
        """Return the rank difference of ``trigram`` between two rank tables."""
        lang_pos = lang_rank.get(trigram)
        if lang_pos is None:
            # Arbitrary but should be larger than
            # any possible trigram file length
            # in terms of total lines
            return maxsize
        try:
            text_pos = text_rank[trigram]
        except KeyError:
            # Keep raising ValueError, as the earlier list.index()-based
            # lookup did, for callers that catch it.
            raise ValueError(f"{trigram!r} is not in the text profile") from None
        return abs(lang_pos - text_pos)

    def calc_dist(self, lang, trigram, text_profile):
        """Calculate the "out-of-place" measure between the
        text and language profile for a single trigram

        Ranks are positions in the profiles ordered by descending
        frequency, so the measure does not depend on the order in
        which trigrams happened to be inserted into ``text_profile``.

        :param lang: language code understood by the corpus reader
        :param trigram: the trigram string to compare
        :param text_profile: mapping of trigram to count for the text
        :return: the absolute rank difference, or ``sys.maxsize`` when
            the trigram does not occur in the language profile
        :raises ValueError: if the trigram occurs in the language
            profile but not in ``text_profile``
        """
        return self._out_of_place(
            self._lang_rank(lang), self._rank_map(text_profile), trigram
        )

    def lang_dists(self, text):
        """Calculate the "out-of-place" measure between
        the text and all languages

        :param text: the raw text to compare
        :return: dict mapping each cached language code to the sum of
            the per-trigram distances over the text's profile
        """
        profile = self.profile(text)
        # Rank the text once instead of once per (language, trigram) pair.
        text_rank = self._rank_map(profile)

        distances = {}
        # For all the languages
        for lang in self._corpus._all_lang_freq.keys():
            lang_rank = self._lang_rank(lang)
            # Calculate distance metric for every trigram in
            # input text to be identified
            distances[lang] = sum(
                self._out_of_place(lang_rank, text_rank, trigram)
                for trigram in profile
            )

        return distances

    def guess_language(self, text):
        """Find the language with the min distance
        to the text and return its ISO 639-3 code

        The full distance table is kept in ``self.last_distances``.
        """
        self.last_distances = self.lang_dists(text)

        return min(self.last_distances, key=self.last_distances.get)


######################################################################


def demo():
    """Run language detection on samples from the UDHR corpus and print results."""
    from nltk.corpus import udhr

    langs = [
        "Kurdish-UTF8",
        "Abkhaz-UTF8",
        "Farsi_Persian-UTF8",
        "Hindi-UTF8",
        "Hawaiian-UTF8",
        "Russian-UTF8",
        "Vietnamese-UTF8",
        "Serbian_Srpski-UTF8",
        "Esperanto-UTF8",
    ]

    friendly = {
        "kmr": "Northern Kurdish",
        "abk": "Abkhazian",
        "pes": "Iranian Persian",
        "hin": "Hindi",
        "haw": "Hawaiian",
        "rus": "Russian",
        "vie": "Vietnamese",
        "srp": "Serbian",
        "epo": "Esperanto",
    }

    tc = TextCat()

    for cur_lang in langs:
        # Get raw data from UDHR corpus
        raw_sentences = udhr.sents(cur_lang)

        # Generate a sample text of the language: every word of every
        # sentence but the last, each preceded by a single space.
        sample = "".join(
            " " + word
            for i in range(len(raw_sentences) - 1)
            for word in raw_sentences[i]
        )

        # Try to detect what it is
        print("Language snippet: " + sample[0:140] + "...")
        guess = tc.guess_language(sample)
        # A wrong guess may fall outside the demo's table; report it
        # rather than aborting the demo with a KeyError.
        print(f"Language detection: {guess} ({friendly.get(guess, 'unknown')})")
        print("#" * 140)


if __name__ == "__main__":
    demo()