"""
This is a NLTK port of the tokenizer used in the NIST BLEU evaluation script,
https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/mteval-v14.pl#L926
which was also ported into Python in
https://github.com/lium-lst/nmtpy/blob/master/nmtpy/metrics/mtevalbleu.py#L162
    N)perluniprops)
TokenizerI)xml_unescapec                   @   sf  e Zd ZdZeddfZeddfZeddfZedd	fZ	ed
dfZ
edd	fZee	e
egZedeedZedeedZedeedZeddeZeddeZeddeZeddfZede de dd	fZede de ddfZede ddfZeeeegZdd ZdddZd ddZ dS )!NISTTokenizeruT  
    This NIST tokenizer is sentence-based instead of the original
    paragraph-based tokenization from mteval-v14.pl; the sentence-based
    tokenization is consistent with the other tokenizers available in NLTK.

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()
    >>> s = "Good muffins cost $3.88 in New York."
    >>> expected_lower = [u'good', u'muffins', u'cost', u'$', u'3.88', u'in', u'new', u'york', u'.']
    >>> expected_cased = [u'Good', u'muffins', u'cost', u'$', u'3.88', u'in', u'New', u'York', u'.']
    >>> nist.tokenize(s, lowercase=False) == expected_cased
    True
    >>> nist.tokenize(s, lowercase=True) == expected_lower  # Lowercased.
    True
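
    Passing return_str=True returns the tokenized string itself, which is
    simply the tokens joined by single spaces:

    >>> nist.tokenize(s, return_str=True) == ' '.join(expected_cased)
    True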

    international_tokenize() is the preferred method when tokenizing
    non-European text, e.g.:

    >>> from nltk.tokenize.nist import NISTTokenizer
    >>> nist = NISTTokenizer()

    # Input strings.
    >>> albb = u'Alibaba Group Holding Limited (Chinese: 阿里巴巴集团控股 有限公司) is a Chinese e-commerce company...'
    >>> amz = u'Amazon.com, Inc. (/ˈæməzɒn/) is an American electronic commerce...'
    >>> rkt = u'Rakuten, Inc. (楽天株式会社 Rakuten Kabushiki-gaisha) is a Japanese electronic commerce and Internet company based in Tokyo.'

    # Expected tokens.
    >>> expected_albb = [u'Alibaba', u'Group', u'Holding', u'Limited', u'(', u'Chinese', u':', u'阿里巴巴集团控股', u'有限公司', u')']
    >>> expected_amz = [u'Amazon', u'.', u'com', u',', u'Inc', u'.', u'(', u'/', u'ˈæ', u'm']
    >>> expected_rkt = [u'Rakuten', u',', u'Inc', u'.', u'(', u'楽天株式会社', u'Rakuten', u'Kabushiki', u'-', u'gaisha']

    >>> nist.international_tokenize(albb)[:10] == expected_albb
    True
    >>> nist.international_tokenize(amz)[:10] == expected_amz
    True
    >>> nist.international_tokenize(rkt)[:10] == expected_rkt
    True

    # Doctest for the patch to issue #1926.
    >>> sent = u'this is a foo☄sentence.'
    >>> expected_sent = [u'this', u'is', u'a', u'foo', u'☄', u'sentence', u'.']
    >>> nist.international_tokenize(sent) == expected_sent
    True
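
    Lowercasing is applied before the tokenization regexes, so it works with
    international_tokenize() too (CJK characters and punctuation are left
    unchanged by str.lower()):

    >>> nist.international_tokenize(albb, lowercase=True)[:10] == [t.lower() for t in expected_albb]
    True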
    """

    # Strip "skipped" tags.
    STRIP_SKIP = re.compile('<skipped>'), ''
    # Strip end-of-line hyphenation and join lines.
    STRIP_EOL_HYPHEN = re.compile('\u2028'), ' '
    # Tokenize punctuation.
    PUNCT = re.compile('([\{-\~\[-\` -\&\(-\+\:-\@\/])'), ' \\1 '
    # Tokenize period and comma unless preceded by a digit.
    PERIOD_COMMA_PRECEED = re.compile('([^0-9])([\.,])'), '\\1 \\2 '
    # Tokenize period and comma unless followed by a digit.
    PERIOD_COMMA_FOLLOW = re.compile('([\.,])([^0-9])'), ' \\1 \\2'
    # Tokenize dash when preceded by a digit.
    DASH_PRECEED_DIGIT = re.compile('([0-9])(-)'), '\\1 \\2 '

    LANG_DEPENDENT_REGEXES = [
        PUNCT,
        PERIOD_COMMA_PRECEED,
        PERIOD_COMMA_FOLLOW,
        DASH_PRECEED_DIGIT,
    ]

    # Perluniprops character classes used by the NIST tokenizer.
    pup_number = str(''.join(set(perluniprops.chars('Number'))))  # i.e. \p{N}
    pup_punct = str(''.join(set(perluniprops.chars('Punctuation'))))  # i.e. \p{P}
    pup_symbol = str(''.join(set(perluniprops.chars('Symbol'))))  # i.e. \p{S}

    # Python regexes need to escape some special symbols inside a
    # character class, i.e. ']', '^', '\' and '-'.
    number_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_number)
    punct_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_punct)
    symbol_regex = re.sub(r'[]^\\-]', r'\\\g<0>', pup_symbol)

    # Pads non-ASCII strings with space.
    NONASCII = re.compile('([\x00-\x7f]+)'), ' \\1 '
    # Tokenize any punctuation unless followed AND preceded by a digit.
    PUNCT_1 = re.compile(f'([{number_regex}])([{punct_regex}])'), '\\1 \\2 '
    PUNCT_2 = re.compile(f'([{punct_regex}])([{number_regex}])'), ' \\1 \\2'
    # Tokenize symbols.
    SYMBOLS = re.compile(f'([{symbol_regex}])'), ' \\1 '

    INTERNATIONAL_REGEXES = [NONASCII, PUNCT_1, PUNCT_2, SYMBOLS]

    def lang_independent_sub(self, text):
        """Performs the language independent string substitutions."""
        # It's a strange order of regexes;
        # it would be better to unescape after STRIP_EOL_HYPHEN,
        # but let's keep it close to the original NIST implementation.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        return text

    def tokenize(self, text, lowercase=False, western_lang=True, return_str=False):
        text = str(text)
        # Language independent regexes.
        text = self.lang_independent_sub(text)
        # Language dependent regexes.
        if western_lang:
            # Pad string with whitespace.
            text = ' ' + text + ' '
            if lowercase:
                text = text.lower()
            for regexp, substitution in self.LANG_DEPENDENT_REGEXES:
                text = regexp.sub(substitution, text)
        # Remove contiguous whitespaces.
        text = ' '.join(text.split())
        # Finally, strip the leading and trailing spaces.
        text = text.strip()
        return text if return_str else text.split()

    def international_tokenize(
        self, text, lowercase=False, split_non_ascii=True, return_str=False
    ):
        text = str(text)
        # Different from the 'normal' tokenize(), STRIP_EOL_HYPHEN is applied
        # first, before unescaping.
        regexp, substitution = self.STRIP_SKIP
        text = regexp.sub(substitution, text)
        regexp, substitution = self.STRIP_EOL_HYPHEN
        text = regexp.sub(substitution, text)
        text = xml_unescape(text)

        if lowercase:
            text = text.lower()

        for regexp, substitution in self.INTERNATIONAL_REGEXES:
            text = regexp.sub(substitution, text)

        # Make sure that there's only one space between words and
        # strip the leading and trailing spaces.
        text = ' '.join(text.strip().split())
        return text if return_str else text.split()