a
    :'a?                     @   s4  d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd deZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZee ee e dddZ!eddd"ee"e e"ddd Z#d!S )#    )	lru_cache)OptionalList)UNICODE_SECONDARY_RANGE_KEYWORD)is_punctuation	is_symbolunicode_rangeis_accentuatedis_latinremove_accentis_separatoris_cjkis_case_variable	is_hangulis_katakanais_hiraganais_asciiis_thaic                   @   sP   e Zd ZdZeedddZeddddZddd	d
Ze	e
dddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C   s   t dS )z@
        Determine if given character should be fed in.
        NNotImplementedErrorselfr    r   4lib/python3.9/site-packages/charset_normalizer/md.pyeligible   s    zMessDetectorPlugin.eligibleNc                 C   s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        Nr   r   r   r   r   feed   s    zMessDetectorPlugin.feedr   c                 C   s   t dS )zB
        Permit to reset the plugin to the initial state.
        Nr   r   r   r   r   reset   s    zMessDetectorPlugin.resetc                 C   s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        Nr   r!   r   r   r   ratio"   s    zMessDetectorPlugin.ratio)__name__
__module____qualname____doc__strboolr   r   r"   propertyfloatr#   r   r   r   r   r   	   s   r   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS ) TooManySymbolOrPunctuationPluginc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr!   r   r   r   __init__-   s
    z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C   s   |  S Nisprintabler   r   r   r   r   5   s    z)TooManySymbolOrPunctuationPlugin.eligibleNc                 C   sd   |  j d7  _ || jkrZ|dvrZt|r8|  jd7  _n"| du rZt|rZ|  jd7  _|| _d S )N   )<>=:/&;{}[],|"F   )r/   r0   r   r-   isdigitr   r.   r   r   r   r   r   8   s    z%TooManySymbolOrPunctuationPlugin.feedr    c                 C   s   d| _ d| _d| _d S Nr   )r-   r/   r.   r!   r   r   r   r"   C   s    z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr           333333?)r/   r-   r.   )r   Zratio_of_punctuationr   r   r   r#   H   s    
z&TooManySymbolOrPunctuationPlugin.ratior$   r%   r&   r1   r(   r)   r   r   r"   r*   r+   r#   r   r   r   r   r,   +   s   r,   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )TooManyAccentuatedPluginc                 C   s   d| _ d| _d S rF   r/   _accentuated_countr!   r   r   r   r1   T   s    z!TooManyAccentuatedPlugin.__init__r   c                 C   s   |  S r2   )isalphar   r   r   r   r   X   s    z!TooManyAccentuatedPlugin.eligibleNc                 C   s(   |  j d7  _ t|r$|  jd7  _d S Nr5   )r/   r	   rL   r   r   r   r   r   [   s    zTooManyAccentuatedPlugin.feedr    c                 C   s   d| _ d| _d S rF   rK   r!   r   r   r   r"   a   s    zTooManyAccentuatedPlugin.resetc                 C   s*   | j dkrdS | j| j  }|dkr&|S dS )Nr   rG   gffffff?rK   )r   Zratio_of_accentuationr   r   r   r#   e   s    
zTooManyAccentuatedPlugin.ratiorI   r   r   r   r   rJ   R   s   rJ   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )UnprintablePluginc                 C   s   d| _ d| _d S rF   )_unprintable_countr/   r!   r   r   r   r1   o   s    zUnprintablePlugin.__init__r   c                 C   s   dS NTr   r   r   r   r   r   s   s    zUnprintablePlugin.eligibleNc                 C   s4   |dvr"|  du r"|  jd7  _|  jd7  _d S )N>   
	Fr5   )r4   rP   r/   r   r   r   r   r   v   s    zUnprintablePlugin.feedr    c                 C   s
   d| _ d S rF   )rP   r!   r   r   r   r"   {   s    zUnprintablePlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   rG      )r/   rP   r!   r   r   r   r#   ~   s    
zUnprintablePlugin.ratiorI   r   r   r   r   rO   m   s   rO   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )SuspiciousDuplicateAccentPluginc                 C   s   d| _ d| _d | _d S rF   _successive_countr/   _last_latin_characterr!   r   r   r   r1      s    z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r2   )rM   r
   r   r   r   r   r      s    z(SuspiciousDuplicateAccentPlugin.eligibleNc                 C   st   |  j d7  _ | jd urjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S rN   )r/   rZ   r	   isupperrY   r   r   r   r   r   r      s    
z$SuspiciousDuplicateAccentPlugin.feedr    c                 C   s   d| _ d| _d | _d S rF   rX   r!   r   r   r   r"      s    z%SuspiciousDuplicateAccentPlugin.resetc                 C   s   | j dkrdS | jd | j  S )Nr   rG   rD   )r/   rY   r!   r   r   r   r#      s    
z%SuspiciousDuplicateAccentPlugin.ratiorI   r   r   r   r   rW      s   rW   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )SuspiciousRangec                 C   s   d| _ d| _d | _d S rF   )"_suspicious_successive_range_countr/   _last_printable_seenr!   r   r   r   r1      s    zSuspiciousRange.__init__r   c                 C   s   |  S r2   r3   r   r   r   r   r      s    zSuspiciousRange.eligibleNc                 C   sp   |  j d7  _ | st|r(d | _d S | jd u r<|| _d S t| j}t|}t||rf|  jd7  _|| _d S rN   )r/   isspacer   r^   r    is_suspiciously_successive_ranger]   )r   r   unicode_range_aunicode_range_br   r   r   r      s    


zSuspiciousRange.feedr    c                 C   s   d| _ d| _d | _d S rF   )r/   r]   r^   r!   r   r   r   r"      s    zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk r*dS |S )Nr   rG   rD   g?)r/   r]   )r   Zratio_of_suspicious_range_usager   r   r   r#      s    
zSuspiciousRange.ratiorI   r   r   r   r   r\      s   r\   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )SuperWeirdWordPluginc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )_word_count_bad_word_count_is_current_word_bad_foreign_long_watchr/   _bad_character_count_buffer_buffer_accent_countr!   r   r   r   r1      s    zSuperWeirdWordPlugin.__init__r   c                 C   s   dS rQ   r   r   r   r   r   r      s    zSuperWeirdWordPlugin.eligibleNc                 C   s  |  rd| j|g| _t|r0|  jd7  _| jdu rt|du rt|du rt|du rt	|du rt
|du rt|du rd| _d S | jsd S | st|st|rV| jrV|  jd7  _t| j}|  j|7  _|dkr| j| dkrd| _|dkr| jrd| _| jrB|  jd7  _|  jt| j7  _d| _d| _d| _d| _n6|d	vr| du rt|rd| _|  j|7  _d S )
Nrd   r5   FT   rH      r   >   r7   r6   r8   -)rM   joinrj   r	   rk   rh   r
   r   r   r   r   r   r_   r   r   re   lenr/   rg   rf   ri   rE   r   )r   r   Zbuffer_lengthr   r   r   r      s6    R"
"zSuperWeirdWordPlugin.feedr    c                 C   s.   d| _ d| _d| _d| _d| _d| _d| _d S )Nrd   Fr   )rj   rg   rh   rf   re   r/   ri   r!   r   r   r   r"   
  s    zSuperWeirdWordPlugin.resetc                 C   s   | j dkrdS | j| j S )N
   rG   )re   ri   r/   r!   r   r   r   r#     s    
zSuperWeirdWordPlugin.ratiorI   r   r   r   r   rc      s   !	rc   c                   @   sX   e Zd ZdZdd ZeedddZedddd	Zdd
ddZ	e
ed
ddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected.
    Searching for the overuse of '丅' and '丄'.
    c                 C   s   d| _ d| _d S rF   _wrong_stop_count_cjk_character_countr!   r   r   r   r1   !  s    zCjkInvalidStopPlugin.__init__r   c                 C   s   dS rQ   r   r   r   r   r   r   %  s    zCjkInvalidStopPlugin.eligibleNc                 C   s4   |dv r|  j d7  _ d S t|r0|  jd7  _d S )N)u   丅u   丄r5   )rt   r   ru   r   r   r   r   r   (  s
    zCjkInvalidStopPlugin.feedr    c                 C   s   d| _ d| _d S rF   rs   r!   r   r   r   r"   /  s    zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rG   )ru   rt   r!   r   r   r   r#   3  s    
zCjkInvalidStopPlugin.ratio)r$   r%   r&   r'   r1   r(   r)   r   r   r"   r*   r+   r#   r   r   r   r   rr     s   rr   c                   @   sT   e Zd Zdd ZeedddZeddddZdd	d
dZe	e
d	ddZdS )ArchaicUpperLowerPluginc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr/   _last_alpha_seen_current_ascii_onlyr!   r   r   r   r1   <  s    z ArchaicUpperLowerPlugin.__init__r   c                 C   s   dS rQ   r   r   r   r   r   r   I  s    z ArchaicUpperLowerPlugin.eligibleNc                 C   s$  |  ot|}|du }|r| jdkr| jdkrV| du rV| jdu rV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rt
|du rd| _| jd ur| r| j s| r| j r| jdu r|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r5   TrD   )rM   r   ry   rE   r}   r{   rz   r|   rx   r/   r   r[   islower)r   r   Zis_concernedZ	chunk_sepr   r   r   r   L  s0     
$
zArchaicUpperLowerPlugin.feedr    c                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r/   ry   rz   r{   r|   rx   r}   r!   r   r   r   r"   n  s    zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rG   )r/   r{   r!   r   r   r   r#   w  s    
zArchaicUpperLowerPlugin.ratiorI   r   r   r   r   rw   :  s   "	rw   )ra   rb   r   c                 C   sL  | du s|du rdS | |kr dS d| v r4d|v r4dS d| v sDd|v rHdS |  d| d }}|D ]}|tv rpqb||v rb dS qb| dv r|dv rdS | dv s|dv rd| v sd|v rdS d	| v sd	|v rd| v sd|v rdS | d
ks|d
krdS d| v sd|v s| dv rH|dv rHd| v s,d|v r0dS d| v sDd|v rHdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	Emoticons )ZKatakanaZHiraganaZCJKZHangulzBasic LatinZPunctuationZForms)splitr   )ra   rb   Zkeywords_range_aZkeywords_range_belr   r   r   r`     s<    (r`   i   )maxsize皙?F)decoded_sequencemaximum_thresholddebugr   c                 C   s   g }t  D ]}||  qt| }d}|dk r8d}n|dkrFd}nd}t| td|D ]d\}}	|D ]}
|
|rf|
| qf|	dkr|	| dks|	|d krZtd	d
 |D }||krZ qqZ|r|D ]}t	|j
|j qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rG   i       i   r~      r   r5   c                 S   s   g | ]
}|j qS r   )r#   ).0dtr   r   r   
<listcomp>  s   zmess_ratio.<locals>.<listcomp>   )r   __subclasses__appendrp   zipranger   r   sumprint	__class__r#   round)r   r   r   Z	detectorsZmd_classlengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexZdetectorr   r   r   r   
mess_ratio  sD    
 r   N)r   F)$	functoolsr   typingr   r   Zcharset_normalizer.constantr   Zcharset_normalizer.utilsr   r   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r,   rJ   rO   rW   r\   rc   rr   rw   r(   r)   r`   r+   r   r   r   r   r   <module>   s   @"'#/CE0