
    :'a?              
          d dl mZ d dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZ  G d d      Z G d de      Z G d	 d
e      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Z G d de      Zdee   dee   de fdZ! ed      d"dede"de de"fd        Z#y!)#    )	lru_cache)OptionalList)UNICODE_SECONDARY_RANGE_KEYWORD)is_punctuation	is_symbolunicode_rangeis_accentuatedis_latinremove_accentis_separatoris_cjkis_case_variable	is_hangulis_katakanais_hiraganais_asciiis_thaic                   N    e Zd ZdZdedefdZdeddfdZd	dZe	de
fd       Zy)
MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                     t         )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr   s     5lib/python3.12/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible   
     "!    Nc                     t         )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   s     r   feedzMessDetectorPlugin.feed   s
    
 "!r!   c                     t         )zB
        Permit to reset the plugin to the initial state.
        r   r   s    r   resetzMessDetectorPlugin.reset   r    r!   c                     t         )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r%   s    r   ratiozMessDetectorPlugin.ratio"   s
     "!r!   r   N)__name__
__module____qualname____doc__strboolr   r#   r&   propertyfloatr(    r!   r   r   r   	   sM    
"# "$ ""c "d "" "u " "r!   r   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
 TooManySymbolOrPunctuationPluginc                 J    d| _         d| _        d| _        d | _        d| _        y )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr%   s    r   __init__z)TooManySymbolOrPunctuationPlugin.__init__-   s*    "# !$(!&+#r!   r   r   c                 "    |j                         S Nisprintabler   s     r   r   z)TooManySymbolOrPunctuationPlugin.eligible5       $$&&r!   Nc                    | xj                   dz  c_         || j                  k7  r^|dvrZt        |      r| xj                  dz  c_        || _        y |j	                         du r t        |      r| xj                  dz  c_        || _        y )N   )<>=:/&;{}[],|"F   )r8   r9   r   r6   isdigitr   r7   r   s     r   r#   z%TooManySymbolOrPunctuationPlugin.feed8   s    "111i  HN  7Ni(''1,' %.! ""$-)I2F""a'"$-!r!   c                 .    d| _         d| _        d| _        y Nr   )r6   r8   r7   r%   s    r   r&   z&TooManySymbolOrPunctuationPlugin.resetC   s    "# !r!   c                     | j                   dk(  ry| j                  | j                  z   | j                   z  }|dk\  r|S dS )Nr           333333?)r8   r6   r7   )r   ratio_of_punctuations     r   r(   z&TooManySymbolOrPunctuationPlugin.ratioH   sI      A% $ 7 7$:L:L LPTPePee';s'B#JJr!   r)   r*   r+   r,   r;   r.   r/   r   r#   r&   r0   r1   r(   r2   r!   r   r4   r4   +   sP    ,'# '$ '	.c 	.d 	.
 Ku K Kr!   r4   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
TooManyAccentuatedPluginc                      d| _         d| _        y rT   r8   _accentuated_countr%   s    r   r;   z!TooManyAccentuatedPlugin.__init__T        !"#r!   r   r   c                 "    |j                         S r=   )isalphar   s     r   r   z!TooManyAccentuatedPlugin.eligibleX   s      ""r!   Nc                 p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y NrB   )r8   r
   r^   r   s     r   r#   zTooManyAccentuatedPlugin.feed[   s1    ")$##q(# %r!   c                      d| _         d| _        y rT   r]   r%   s    r   r&   zTooManyAccentuatedPlugin.reseta   r_   r!   c                 f    | j                   dk(  ry| j                  | j                   z  }|dk\  r|S dS )Nr   rV   gffffff?r]   )r   ratio_of_accentuations     r   r(   zTooManyAccentuatedPlugin.ratioe   s=      A% $ 7 7$:O:O O(=(E$M2Mr!   r)   rY   r2   r!   r   r[   r[   R   sP    $## #$ #)c )d )$ Nu N Nr!   r[   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
UnprintablePluginc                      d| _         d| _        y rT   )_unprintable_countr8   r%   s    r   r;   zUnprintablePlugin.__init__o   s    "# !r!   r   r   c                      yNTr2   r   s     r   r   zUnprintablePlugin.eligibles       r!   Nc                     |dvr'|j                         du r| xj                  dz  c_        | xj                  dz  c_        y )N>   	
FrB   )r?   rj   r8   r   s     r   r#   zUnprintablePlugin.feedv   s>    449N9N9PTY9Y##q(#"r!   c                     d| _         y rT   )rj   r%   s    r   r&   zUnprintablePlugin.reset{   s
    "#r!   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rV      )r8   rj   r%   s    r   r(   zUnprintablePlugin.ratio~   s/      A%''!+t/D/DDDr!   r)   rY   r2   r!   r   rh   rh   m   sP    "# $ #c #d #
$ Eu E Er!   rh   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousDuplicateAccentPluginc                 .    d| _         d| _        d | _        y rT   _successive_countr8   _last_latin_characterr%   s    r   r;   z(SuspiciousDuplicateAccentPlugin.__init__   s    !" !%)"r!   r   r   c                 <    |j                         xr t        |      S r=   )ra   r   r   s     r   r   z(SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r!   Nc                 ~   | xj                   dz  c_         | j                  t        |      rt        | j                        ru|j                         r/| j                  j                         r| xj                  dz  c_        t        |      t        | j                        k(  r| xj                  dz  c_        || _        y rc   )r8   r{   r
   isupperrz   r   r   s     r   r#   z$SuspiciousDuplicateAccentPlugin.feed   s    "%%1i(^D<V<V-W$$&4+E+E+M+M+O**a/* +}T=W=W/XX**a/*%."r!   c                 .    d| _         d| _        d | _        y rT   ry   r%   s    r   r&   z%SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r!   c                 Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rV   rQ   )r8   rz   r%   s    r   r(   z%SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr!   r)   rY   r2   r!   r   rw   rw      sP    *;# ;$ ;	/c 	/d 	/*
 Du D Dr!   rw   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuspiciousRangec                 .    d| _         d| _        d | _        y rT   )"_suspicious_successive_range_countr8   _last_printable_seenr%   s    r   r;   zSuspiciousRange.__init__   s    23/ !$(!r!   r   r   c                 "    |j                         S r=   r>   r   s     r   r   zSuspiciousRange.eligible   r@   r!   Nc                 ,   | xj                   dz  c_         |j                         st        |      rd | _        y | j                  || _        y t	        | j                        }t	        |      }t        ||      r| xj                  dz  c_        || _        y rc   )r8   isspacer   r   r	    is_suspiciously_successive_ranger   )r   r   unicode_range_aunicode_range_bs       r   r#   zSuspiciousRange.feed   s    ".";(,D%$$,(1D%'(A(AB'	2+O_M33q83$-!r!   c                 .    d| _         d| _        d | _        y rT   )r8   r   r   r%   s    r   r&   zSuspiciousRange.reset   s     !23/$(!r!   c                 j    | j                   dk(  ry| j                  dz  | j                   z  }|dk  ry|S )Nr   rV   rQ   g?)r8   r   )r   ratio_of_suspicious_range_usages     r   r(   zSuspiciousRange.ratio   sB      A%+/+R+RUV+VZ^ZoZo*o'*S0..r!   r)   rY   r2   r!   r   r   r      sM    )
'# '$ '.c .d .&)
 	/u 	/ 	/r!   r   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
SuperWeirdWordPluginc                 t    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   F )_word_count_bad_word_count_is_current_word_bad_foreign_long_watchr8   _bad_character_count_buffer_buffer_accent_countr%   s    r   r;   zSuperWeirdWordPlugin.__init__   sA     $)!#(  !$%!$%!r!   r   r   c                      yrl   r2   r   s     r   r   zSuperWeirdWordPlugin.eligible   rm   r!   Nc                 $   |j                         rdj                  | j                  |g      | _        t        |      r| xj                  dz  c_        | j
                  du rUt        |      du rHt        |      du r;t        |      du r.t        |      du r!t        |      du rt        |      du rd| _        y | j                  sy |j                         st        |      st        |      r| j                  r| xj                  dz  c_        t!        | j                        }| xj"                  |z  c_        |dk\  r| j                  |z  dk\  rd| _        |dk\  r| j
                  rd| _        | j$                  rD| xj&                  dz  c_        | xj(                  t!        | j                        z  c_        d| _        d| _        d| _        d| _        y |d	vr<|j+                         du r)t-        |      rd| _        | xj                  |z  c_        y y y y )
Nr   rB   FT   rW      r   >   -rC   rE   rD   )ra   joinr   r
   r   r   r   r   r   r   r   r   r   r   r   r   lenr8   r   r   r   rR   r   )r   r   buffer_lengths      r   r#   zSuperWeirdWordPlugin.feed   s,   77DLL)#<=DLi())Q.)''50Xi5HE5QV\]fVgkpVpu~  @I  vJ  NS  vS  Xc  dm  Xn  rw  Xw  |G  HQ  |R  V[  |[  `g  hq  `r  v{  `{+/(||>)#<Y@W]a]i]i!-M!!]2!!d&?&?-&OSV&V,0)"t'?'?,0)(($$)$))S->>),1)',D$DL()D%22y7H7H7Je7SXabkXl(,D%LLI%L Ym7S2r!   c                 f    d| _         d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   Fr   )r   r   r   r   r   r8   r   r%   s    r   r&   zSuperWeirdWordPlugin.reset
  s9    $)!#(   !$%!r!   c                 T    | j                   dk  ry| j                  | j                  z  S )N
   rV   )r   r   r8   r%   s    r   r(   zSuperWeirdWordPlugin.ratio  s*    r!((4+@+@@@r!   r)   rY   r2   r!   r   r   r      sQ    
&# $ &c &d &B& Au A Ar!   r   c                   T    e Zd ZdZd ZdedefdZdeddfdZd
dZ	e
defd	       Zy)CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and can be easily detected.
    Searching for the overuse of '丅' and '丄'.
    c                      d| _         d| _        y rT   _wrong_stop_count_cjk_character_countr%   s    r   r;   zCjkInvalidStopPlugin.__init__!      !"$%!r!   r   r   c                      yrl   r2   r   s     r   r   zCjkInvalidStopPlugin.eligible%  rm   r!   Nc                 z    |dv r| xj                   dz  c_         y t        |      r| xj                  dz  c_        y y )N)u   丅u   丄rB   )r   r   r   r   s     r   r#   zCjkInvalidStopPlugin.feed(  s<    &""a'")%%*% r!   c                      d| _         d| _        y rT   r   r%   s    r   r&   zCjkInvalidStopPlugin.reset/  r   r!   c                 T    | j                   dk  ry| j                  | j                   z  S )N   rV   )r   r   r%   s    r   r(   zCjkInvalidStopPlugin.ratio3  s*    $$r)%%(A(AAAr!   r)   )r*   r+   r,   r-   r;   r.   r/   r   r#   r&   r0   r1   r(   r2   r!   r   r   r     sU    
&# $ +c +d +& Bu B Br!   r   c                   P    e Zd Zd ZdedefdZdeddfdZd	dZe	de
fd       Zy)
ArchaicUpperLowerPluginc                 f    d| _         d| _        d| _        d| _        d| _        d | _        d| _        y )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr8   _last_alpha_seen_current_ascii_onlyr%   s    r   r;   z ArchaicUpperLowerPlugin.__init__<  s9    	/0,-.*340 ! $#' r!   r   r   c                      yrl   r2   r   s     r   r   z ArchaicUpperLowerPlugin.eligibleI  rm   r!   Nc                 P   |j                         xr t        |      }|du }|r| j                  dkD  r| j                  dk  r?|j                         du r-| j                  du r| xj
                  | j                  z  c_        d| _        d| _        d | _        d| _        | xj                  dz  c_	        d| _        y | j                  du rt        |      du rd| _        | j                  |j                         r| j                  j                         s*|j                         rM| j                  j                         r3| j                  du r| xj                  dz  c_        d| _        nd| _        nd| _        | xj                  dz  c_	        | xj                  dz  c_        || _        y )NFr   @   rB   TrQ   )ra   r   r   rR   r   r   r   r   r   r8   r   r~   islower)r   r   is_concerned	chunk_seps       r   r#   zArchaicUpperLowerPlugin.feedL  s    ((*J/?	/J E)	==A33r9i>O>O>QUZ>Z_c_w_w  |A  `A88D<^<^^812D.34D0$(D!DI!!Q&!'+D$##t+0Cu0L',D$  ,!!#(=(=(E(E(GYM^M^M`eiezez  fC  fC  fE99$66!;6 %DI $DI!	",,1, )r!   c                 f    d| _         d| _        d| _        d| _        d | _        d| _        d| _        y )Nr   FT)r8   r   r   r   r   r   r   r%   s    r   r&   zArchaicUpperLowerPlugin.resetn  s9     !/0,-.*340 $	#' r!   c                 T    | j                   dk(  ry| j                  | j                   z  S )Nr   rV   )r8   r   r%   s    r   r(   zArchaicUpperLowerPlugin.ratiow  s*      A%77$:O:OOOr!   r)   rY   r2   r!   r   r   r   :  sQ    (# $  *c  *d  *D( Pu P Pr!   r   r   r   r   c                 ^   | |y| |k(  ryd| v rd|v ryd| v sd|v ry| j                  d      |j                  d      }}|D ]  }|t        v r||v s y | dv r|dv ry| dv s|dv r	d| v sd|v ryd| v sd|v rd| v sd|v ry| d	k(  s|d	k(  ryd| v sd|v s| dv r|dv rd
| v sd
|v ryd| v sd|v ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TFLatin	Emoticons )KatakanaHiraganaCJKHangulzBasic LatinPunctuationForms)splitr   )r   r   keywords_range_akeywords_range_bels        r   r   r     sK    /"9/)/!g&@o%)G)8)>)>s)C_EZEZ[^E_& 00!!	 22Jb7b22oIa6aO#u'??"h/&AO#u'?m+-/O 	 E_$</UmBm  sB  F^  s^O+}/Oo%O)Cr!   i   )maxsizedecoded_sequencemaximum_thresholddebugc                 &   g }t         j                         D ]  }|j                   |               t        |       }d}|dk  rd}n
|dk  rd}nd}t	        | t        d|            D ]o  \  }}	|D ]%  }
|
j                  |      s|
j                  |       ' |	dkD  r|	|z  dk(  s	|	|dz
  k(  sFt        |D cg c]  }|j                   c}      }||k\  so n |r'|D ]"  }t        |j                  |j                         $ t        |d	      S c c}w )
zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    rV   i       i   r      r   rB      )r   __subclasses__appendr   zipranger   r#   sumr(   print	__class__round)r   r   r   	detectorsmd_classlengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectordts               r   
mess_ratior     sI   
 I&557 
J	


 !"FO|,.)	4,.),/) 0%62BC 	5! 	)H  +i(	) AI%"CCqHUV\]^V^M^!'0!#BHHO "33  	B	 	 s   6D
N)g?F)$	functoolsr   typingr   r   charset_normalizer.constantr   charset_normalizer.utilsr   r   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r4   r[   rh   rw   r   r   r   r   r.   r/   r   r1   r   r2   r!   r   <module>r      s    ! Gr r r r" "D$K'9 $KNN1 N6E* E2 D&8  DF,/( ,/^@A- @AFB- B>BP0 BPJ-hsm -V^_bVc -hl -` 4/ / /T /^c / /r!   