a
    ±KbX%  ã                   @   s^   d dl Z d dlZd dlmZmZmZ d dlmZ d dlm	Z	 G dd„ dƒZ
G dd„ deƒZdS )	é    N)ÚIteratorÚListÚTuple)Ú
TokenizerI)Úalign_tokensc                   @   s(   e Zd ZdZg d¢ZddgZddgZdS )ÚMacIntyreContractionszI
    List of contractions adapted from Robert MacIntyre's tokenizer.
    )z(?i)\b(can)(?#X)(not)\bz(?i)\b(d)(?#X)('ye)\bz(?i)\b(gim)(?#X)(me)\bz(?i)\b(gon)(?#X)(na)\bz(?i)\b(got)(?#X)(ta)\bz(?i)\b(lem)(?#X)(me)\bz(?i)\b(more)(?#X)('n)\bz(?i)\b(wan)(?#X)(na)(?=\s)z(?i) ('t)(?#X)(is)\bz(?i) ('t)(?#X)(was)\bz(?i)\b(whad)(dd)(ya)\bz(?i)\b(wha)(t)(cha)\bN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚCONTRACTIONS2ÚCONTRACTIONS3ZCONTRACTIONS4© r   r   ú8lib/python3.9/site-packages/nltk/tokenize/destructive.pyr      s   
r   c                   @   sà  e Zd ZdZe dej¡dfe d¡dfe d¡dfe d¡dfe d	ej¡d
fgZe dej¡dfe d¡dfe d¡dfe d¡dfe d¡dfgZe dej¡dfe d¡dfe d¡dfe dej¡dfe d¡dfe d¡dfe d¡dfe d¡dfe dej¡dfg	Z	e d ¡dfZ
e d!¡d"fe d#¡d$fe d%¡d&fe d'¡d(fe d)¡d*fe d+¡d,fgZe d-¡d.fZeƒ ZeeejejƒƒZeeejejƒƒZd7eeeee d0œd1d2„Zeeeeef  d3œd4d5„Zd6S )8ÚNLTKWordTokenizeraE  
    The NLTK tokenizer that has improved upon the TreebankWordTokenizer.

    This is the method that is invoked by ``word_tokenize()``.  It assumes that the
    text has already been segmented into sentences, e.g. using ``sent_tokenize()``.

    The tokenizer is "destructive" such that the regexes applied will munge the
    input string to a state beyond re-construction. It is possible to apply
    `TreebankWordDetokenizer.detokenize` to the tokenized outputs of
    `NLTKDestructiveWordTokenizer.tokenize` but there's no guarantees to
    revert to the original string.
    u   ([Â«â€œâ€˜â€ž]|[`]+)z \1 z^\"ú``z(``)z([ \(\[{<])(\"|\'{2})z\1 `` z$(?i)(\')(?!re|ve|ll|m|t|s|d|n)(\w)\bz\1 \2u   ([Â»â€â€™])ú''z '' ú"z([^' ])('[sS]|'[mM]|'[dD]|') z\1 \2 z)([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T) u&   ([^\.])(\.)([\]\)}>"\'Â»â€â€™ ]*)\s*$z	\1 \2 \3 z([:,])([^\d])z \1 \2z([:,])$z\.{2,}z \g<0> z[;@#$%&]z([^\.])(\.)([\]\)}>"\']*)\s*$z\1 \2\3 z[?!]z([^'])' z\1 ' z[*]z[\]\[\(\)\{\}\<\>]z\(z-LRB-z\)z-RRB-z\[z-LSB-z\]z-RSB-z\{z-LCB-z\}z-RCB-z--z -- F)ÚtextÚconvert_parenthesesÚ
return_strÚreturnc                 C   sø   |rt jdtdd | jD ]\}}| ||¡}q| jD ]\}}| ||¡}q6| j\}}| ||¡}|r‚| jD ]\}}| ||¡}ql| j\}}| ||¡}d| d }| j	D ]\}}| ||¡}qª| j
D ]}| d|¡}qÆ| jD ]}| d|¡}qÞ| ¡ S )ab  Return a tokenized copy of `text`.

        >>> from nltk.tokenize import NLTKWordTokenizer
        >>> s = '''Good muffins cost $3.88 (roughly 3,36 euros)\nin New York.  Please buy me\ntwo of them.\nThanks.'''
        >>> NLTKWordTokenizer().tokenize(s)
        ['Good', 'muffins', 'cost', '$', '3.88', '(', 'roughly', '3,36',
        'euros', ')', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, convert_parentheses=True)
        ['Good', 'muffins', 'cost', '$', '3.88', '-LRB-', 'roughly', '3,36',
        'euros', '-RRB-', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two',
        'of', 'them.', 'Thanks', '.']
        >>> NLTKWordTokenizer().tokenize(s, return_str=True)
        ' Good muffins cost  $ 3.88  ( roughly 3,36 euros ) \nin New York.  Please buy me\ntwo of them.\nThanks  .  '

        :param text: A string with a sentence or sentences.
        :type text: str
        :param convert_parentheses: if True, replace parentheses to PTB symbols,
            e.g. `(` to `-LRB-`. Defaults to False.
        :type convert_parentheses: bool, optional
        :param return_str: If True, return tokens as space-separated string,
            defaults to False.
        :type return_str: bool, optional
        :return: List of tokens from `text`.
        :rtype: List[str]
        zHParameter 'return_str' has been deprecated and should no longer be used.é   )ÚcategoryÚ
stacklevelú z \1 \2 )ÚwarningsÚwarnÚDeprecationWarningÚSTARTING_QUOTESÚsubÚPUNCTUATIONÚPARENS_BRACKETSÚCONVERT_PARENTHESESÚDOUBLE_DASHESÚENDING_QUOTESr   r   Úsplit)Úselfr   r   r   ZregexpZsubstitutionr   r   r   Útokenizex   s2    ü



zNLTKWordTokenizer.tokenize)r   r   c                 #   s\   |   |¡}d|v sd|v rDdd„ t d|¡D ƒ‰ ‡ fdd„|D ƒ}n|}t||ƒE dH  dS )a}  
        Returns the spans of the tokens in ``text``.
        Uses the post-hoc nltk.tokens.align_tokens to return the offset spans.

            >>> from nltk.tokenize import NLTKWordTokenizer
            >>> s = '''Good muffins cost $3.88\nin New (York).  Please (buy) me\ntwo of them.\n(Thanks).'''
            >>> expected = [(0, 4), (5, 12), (13, 17), (18, 19), (19, 23),
            ... (24, 26), (27, 30), (31, 32), (32, 36), (36, 37), (37, 38),
            ... (40, 46), (47, 48), (48, 51), (51, 52), (53, 55), (56, 59),
            ... (60, 62), (63, 68), (69, 70), (70, 76), (76, 77), (77, 78)]
            >>> list(NLTKWordTokenizer().span_tokenize(s)) == expected
            True
            >>> expected = ['Good', 'muffins', 'cost', '$', '3.88', 'in',
            ... 'New', '(', 'York', ')', '.', 'Please', '(', 'buy', ')',
            ... 'me', 'two', 'of', 'them.', '(', 'Thanks', ')', '.']
            >>> [s[start:end] for start, end in NLTKWordTokenizer().span_tokenize(s)] == expected
            True

        :param text: A string with a sentence or sentences.
        :type text: str
        :yield: Tuple[int, int]
        r   r   c                 S   s   g | ]}|  ¡ ‘qS r   )Úgroup)Ú.0Úmr   r   r   Ú
<listcomp>à   ó    z3NLTKWordTokenizer.span_tokenize.<locals>.<listcomp>z
``|'{2}|\"c                    s"   g | ]}|d v rˆ   d¡n|‘qS ))r   r   r   r   )Úpop)r*   Útok©Zmatchedr   r   r,   ã   s   ÿN)r(   ÚreÚfinditerr   )r'   r   Z
raw_tokensÚtokensr   r0   r   Úspan_tokenizeÁ   s    

þzNLTKWordTokenizer.span_tokenizeN)FF)r   r	   r
   r   r1   ÚcompileÚUr   r%   r!   r"   r#   r$   r   Z_contractionsÚlistÚmapr   r   ÚstrÚboolr   r(   r   r   Úintr4   r   r   r   r   r   %   s^   û
ûþþþñú	 ÿþIr   )r1   r   Útypingr   r   r   Znltk.tokenize.apir   Znltk.tokenize.utilr   r   r   r   r   r   r   Ú<module>
   s   