# Natural Language Toolkit: ChrF score
#
# Copyright (C) 2001-2022 NLTK Project
# Authors: Maja Popovic
# Contributors: Liling Tan, Aleš Tamchyna (Memsource)
# URL:
# For license information, see LICENSE.TXT

""" ChrF score implementation """
import re
from collections import Counter, defaultdict

from nltk.util import ngrams


def sentence_chrf(
    reference, hypothesis, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the sentence level CHRF (Character n-gram F-score) described in
     - Maja Popovic. 2015. CHRF: Character n-gram F-score for Automatic MT Evaluation.
       In Proceedings of the 10th Workshop on Machine Translation.
       https://www.statmt.org/wmt15/pdf/WMT49.pdf
     - Maja Popovic. 2016. CHRF Deconstructed: β Parameters and n-gram Weights.
       In Proceedings of the 1st Conference on Machine Translation.
       https://www.statmt.org/wmt16/pdf/W16-2341.pdf

    This implementation of CHRF only supports a single reference at the moment.

    For details not reported in the paper, consult Maja Popovic's original
    implementation: https://github.com/m-popovic/chrF

    The code should output results equivalent to running CHRF++ with the
    following options: -nw 0 -b 3

    An example from the original BLEU paper
    https://www.aclweb.org/anthology/P02-1040.pdf

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct').split()
        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
        0.6349...
        >>> sentence_chrf(ref1, hyp2) # doctest: +ELLIPSIS
        0.3330...

    The infamous "the the the ... " example

        >>> ref = 'the cat is on the mat'.split()
        >>> hyp = 'the the the the the the the'.split()
        >>> sentence_chrf(ref, hyp) # doctest: +ELLIPSIS
        0.1468...

    An example to show that this function allows users to use strings instead of
    tokens, i.e. list(str) as inputs.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands')
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party')
        >>> sentence_chrf(ref1, hyp1) # doctest: +ELLIPSIS
        0.6349...
        >>> type(ref1) == type(hyp1) == str
        True
        >>> sentence_chrf(ref1.split(), hyp1.split()) # doctest: +ELLIPSIS
        0.6349...

    To skip the unigrams and only use 2- to 3-grams:

        >>> sentence_chrf(ref1, hyp1, min_len=2, max_len=3) # doctest: +ELLIPSIS
        0.6617...

    :param reference: reference sentence
    :type reference: list(str) / str
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str) / str
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the sentence level CHRF score.
    :rtype: float
    """
    # A sentence is scored as a corpus that contains exactly one segment.
    return corpus_chrf(
        [reference],
        [hypothesis],
        min_len,
        max_len,
        beta=beta,
        ignore_whitespace=ignore_whitespace,
    )


def _preprocess(sent, ignore_whitespace):
    """
    Turn a sentence into the single string whose characters get scored.

    :param sent: a sentence, given either as one string or as a list of tokens
    :type sent: list(str) / str
    :param ignore_whitespace: remove every run of whitespace from the sentence
    :type ignore_whitespace: bool
    :return: the sentence as a single string
    :rtype: str
    """
    # isinstance() rather than an exact type comparison: a ``str`` subclass is
    # still a sentence string, and joining it would insert a space between
    # every character.
    if not isinstance(sent, str):
        # turn list of tokens into a string
        sent = " ".join(sent)

    if ignore_whitespace:
        sent = re.sub(r"\s+", "", sent)
    return sent


def chrf_precision_recall_fscore_support(
    reference, hypothesis, n, beta=3.0, epsilon=1e-16
):
    """
    This function computes the precision, recall and fscore from the ngram
    overlaps. It returns the `support` which is the true positive score.

    By underspecifying the input type, the function will be agnostic as to how
    it computes the ngrams and simply take the whichever element in the list;
    it could be either token or character.

    :param reference: The reference sentence.
    :type reference: list
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: list
    :param n: The order of the ngrams to extract and compare.
    :type n: int
    :param beta: The parameter to assign more importance to recall over precision.
    :type beta: float
    :param epsilon: The fallback value assigned to precision, recall and
        f-score whenever the computation divides by zero, i.e. when the
        hypothesis or the reference yields no ngrams of order ``n``, or when
        the two share no ngram at all.
    :type epsilon: float
    :return: Returns the precision, recall and f-score and support (true positive).
    :rtype: tuple(float)
    """
    ref_ngrams = Counter(ngrams(reference, n))
    hyp_ngrams = Counter(ngrams(hypothesis, n))

    # calculate the number of ngram matches; Counter intersection keeps, for
    # every ngram, the smaller of its two counts.
    overlap_ngrams = ref_ngrams & hyp_ngrams
    tp = sum(overlap_ngrams.values())  # True positives.
    tpfp = sum(hyp_ngrams.values())  # True positives + False positives.
    tpfn = sum(ref_ngrams.values())  # True positives + False negatives.

    try:
        prec = tp / tpfp  # precision
        rec = tp / tpfn  # recall
        factor = beta ** 2
        # With no matching ngram both prec and rec are 0, so the denominator
        # below is 0 as well and this case also ends up in the fallback.
        fscore = (1 + factor) * (prec * rec) / (factor * prec + rec)
    except ZeroDivisionError:
        prec = rec = fscore = epsilon
    return prec, rec, fscore, tp


def corpus_chrf(
    references, hypotheses, min_len=1, max_len=6, beta=3.0, ignore_whitespace=True
):
    """
    Calculates the corpus level CHRF (Character n-gram F-score), it is the
    macro-averaged value of the sentence/segment level CHRF score.

    This implementation of CHRF only supports a single reference at the moment.

        >>> ref1 = str('It is a guide to action that ensures that the military '
        ...            'will forever heed Party commands').split()
        >>> ref2 = str('It is the guiding principle which guarantees the military '
        ...            'forces always being under the command of the Party').split()
        >>>
        >>> hyp1 = str('It is a guide to action which ensures that the military '
        ...            'always obeys the commands of the party').split()
        >>> hyp2 = str('It is to insure the troops forever hearing the activity '
        ...            'guidebook that party direct')
        >>> corpus_chrf([ref1, ref2, ref1, ref2], [hyp1, hyp2, hyp2, hyp1]) # doctest: +ELLIPSIS
        0.3910...

    :param references: a corpus of reference sentences, one per hypothesis
    :type references: list(list(str) / str)
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str) / str)
    :param min_len: The minimum order of n-gram this function should extract.
    :type min_len: int
    :param max_len: The maximum order of n-gram this function should extract.
    :type max_len: int
    :param beta: the parameter to assign more importance to recall over precision
    :type beta: float
    :param ignore_whitespace: ignore whitespace characters in scoring
    :type ignore_whitespace: bool
    :return: the corpus level CHRF score.
    :rtype: float
    :raises AssertionError: if ``references`` and ``hypotheses`` differ in length.
    :raises ZeroDivisionError: if the corpus is empty or ``min_len`` is greater
        than ``max_len``, since there is then nothing to average over.
    """
    assert len(references) == len(
        hypotheses
    ), "The number of hypotheses and their references should be the same"
    num_sents = len(hypotheses)

    # Keep f-scores for each n-gram order separate
    ngram_fscores = defaultdict(list)

    # Iterate through each hypothesis and their corresponding references.
    for reference, hypothesis in zip(references, hypotheses):

        # preprocess both reference and hypothesis
        reference = _preprocess(reference, ignore_whitespace)
        hypothesis = _preprocess(hypothesis, ignore_whitespace)

        # Calculate f-scores for each sentence and for each n-gram order
        # separately.
        for n in range(min_len, max_len + 1):
            # Only the f-score is aggregated; precision, recall and support
            # are not needed here.
            _, _, fscore, _ = chrf_precision_recall_fscore_support(
                reference, hypothesis, n, beta=beta
            )
            ngram_fscores[n].append(fscore)

    # how many n-gram sizes
    num_ngram_sizes = len(ngram_fscores)

    # sum of f-scores over all sentences for each n-gram order
    total_scores = [sum(fscores) for fscores in ngram_fscores.values()]

    # macro-average over n-gram orders and over all sentences
    return (sum(total_scores) / num_ngram_sizes) / num_sents