
    :'a0                         d dl Z d dlmZ d dlmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlmZ d dlmZmZ d dlmZ d d	lmZ d d
lmZmZmZ  G d d      Z G d d      Ze
eef   Ze	e   Z G d d      Z eZ!y)    N)aliases)sha256)dumps)OptionalListTupleSet)Counter)subcompile)TOO_BIG_SEQUENCE)
mess_ratio)	iana_nameis_multi_byte_encodingunicode_rangec                   P   e Zd Z	 d'dededededddee   fd	Zd
efdZ	d
efdZ
ed
efd       Zed
efd       Zed
efd       Zd
efdZd
efdZd(dZed
efd       Zed
ee   fd       Zed
efd       Zed
efd       Zed
ee   fd       Zed
efd       Zed
efd       Zed
efd       Zed
efd       Zed
efd       Zed
efd       Zed
ed    fd       Zed
efd       Z ed
ee   fd        Z!ed
ee   fd!       Z"d)d"Z#d)d#Z$d*d$ed
efd%Z%ed
efd&       Z&y)+CharsetMatchNpayloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesCoherenceMatchesdecoded_payloadc                     || _         || _        || _        || _        || _        d | _        g | _        d| _        d | _        d | _	        || _
        y )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leaves_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r   s          9lib/python3.12/site-packages/charset_normalizer/models.py__init__zCharsetMatch.__init__   sW      ) /#-#%'"# $&    returnc                    t        |t              sAt        dj                  t	        |j
                        t	        | j
                                    | j                  |j                  k(  xr | j                  |j                  k(  S )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprintr(   others     r)   __eq__zCharsetMatch.__eq__(   sg    %.DKKCPUP_P_L`befjftftbuvww}}.X43C3CuGXGX3XXr+   c                     t        |t              st        t        | j                  |j                  z
        }|dk  r| j
                  |j
                  kD  S | j                  |j                  k  S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?)r.   r   
ValueErrorabschaos	coherence)r(   r6   chaos_differences      r)   __lt__zCharsetMatch.__lt__-   s\     %.tzzEKK78 d">>EOO33zzEKK''r+   c                 `    t        j                  dt               t        t	        |       d      S )z
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        z=chaos_secondary_pass is deprecated and will be removed in 3.0g      ?)warningswarnDeprecationWarningr   r1   r(   s    r)   chaos_secondary_passz!CharsetMatch.chaos_secondary_pass<   s+     	UWijI
 	
r+   c                 8    t        j                  dt               y)zy
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        z<coherence_non_latin is deprecated and will be removed in 3.0r   )r@   rA   rB   rC   s    r)   coherence_non_latinz CharsetMatch.coherence_non_latinI   s     	TVhir+   c                     t        j                  dt               t        d      }t	        |dt        |       j                               }t        |j                               S )z_
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        z2w_counter is deprecated and will be removed in 3.0z[0-9\W\n\r\t]+ )	r@   rA   rB   
re_compiler   r1   lowerr
   split)r(   not_printable_patternstring_printable_onlys      r)   	w_counterzCharsetMatch.w_counterR   sP     	JL^_ *+< = #$93D	@Q R,22455r+   c                 ~    | j                   &t        | j                  | j                  d      | _         | j                   S )Nstrict)r'   r1   r   r   rC   s    r)   __str__zCharsetMatch.__str__^   s.    <<t}}dnnhGDL||r+   c                 N    dj                  | j                  | j                        S )Nz<CharsetMatch '{}' bytes({})>)r0   r3   r4   rC   s    r)   __repr__zCharsetMatch.__repr__d   s    .55dmmTEUEUVVr+   c                     t        |t              r|| k(  r$t        dj                  |j                              d |_        | j                  j                  |       y )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r.   r   r9   r0   r2   r'   r#   appendr5   s     r)   add_submatchzCharsetMatch.add_submatchg   sI    %.%4-ZaabgbqbqrssE"r+   c                     | j                   S N)r   rC   s    r)   r3   zCharsetMatch.encodingn   s    ~~r+   c                     g }t        j                         D ]G  \  }}| j                  |k(  r|j                  |       '| j                  |k(  s7|j                  |       I |S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr3   rU   )r(   also_known_asups       r)   encoding_aliaseszCharsetMatch.encoding_aliasesr   s^    
 MMO 	(DAq}}!$$Q'!#$$Q'		(
 r+   c                     | j                   S rX   r!   rC   s    r)   bomzCharsetMatch.bom       ###r+   c                     | j                   S rX   r`   rC   s    r)   byte_order_markzCharsetMatch.byte_order_mark   rb   r+   c                 F    | j                   D cg c]  }|d   	 c}S c c}w )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        r   r    )r(   es     r)   r   zCharsetMatch.languages   s      #oo.!...s   c                    | j                   shd| j                  v ryddlm}m} t        | j                        r || j                        n || j                        }t        |      dk(  sd|v ry|d   S | j                   d   d   S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiEnglishr   )mb_encoding_languagesencoding_languageszLatin BasedUnknown)r    could_be_from_charsetcharset_normalizer.cdrk   rl   r   r3   len)r(   rk   rl   r   s       r)   languagezCharsetMatch.language   s      $444  X@VW[WdWd@e-dmm<k}  C  L  L  lMI9~"my&@ Q<q!!$$r+   c                     | j                   S rX   )r   rC   s    r)   r;   zCharsetMatch.chaos   s    $$$r+   c                 @    | j                   sy| j                   d   d   S )Nr   r      rf   rC   s    r)   r<   zCharsetMatch.coherence   s     q!!$$r+   c                 6    t        | j                  dz  d      S Nd      )ndigits)roundr;   rC   s    r)   percent_chaoszCharsetMatch.percent_chaos   s    TZZ#%q11r+   c                 6    t        | j                  dz  d      S rv   )rz   r<   rC   s    r)   percent_coherencezCharsetMatch.percent_coherence   s    T^^c)155r+   c                     | j                   S )z+
        Original untouched bytes.
        )r   rC   s    r)   rawzCharsetMatch.raw   s    
 }}r+   c                     | j                   S rX   )r#   rC   s    r)   submatchzCharsetMatch.submatch   s    ||r+   c                 2    t        | j                        dkD  S )Nr   )rp   r#   rC   s    r)   has_submatchzCharsetMatch.has_submatch   s    4<< 1$$r+   c                     | j                   | j                   S t               }t        |       D ]*  }t        |      }|s|j	                  t        |             , t        t        |            | _         | j                   S rX   )r"   setr1   r   addsortedlist)r(   detected_ranges	characterdetected_ranges       r)   	alphabetszCharsetMatch.alphabets   sw    +'''%T 	I*95N##!),	  &d?&;<###r+   c                 p    | j                   g| j                  D cg c]  }|j                   c}z   S c c}w )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        )r   r#   r3   )r(   ms     r)   rn   z"CharsetMatch.could_be_from_charset   s,     t||"D!1::"DDD"Ds   3c                     | S z>
        Kept for BC reasons. Will be removed in 3.0.
         rC   s    r)   firstzCharsetMatch.first   	     r+   c                     | S r   r   rC   s    r)   bestzCharsetMatch.best   r   r+   r3   c                     | j                   | j                   |k7  r'|| _         t        |       j                  |d      | _        | j                  S )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        replace)r&   r1   encoder%   )r(   r3   s     r)   outputzCharsetMatch.output   sJ    
   (D,A,AX,M$,D!#&t9#3#3Hi#HD ###r+   c                 P    t        | j                               j                         S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   r   	hexdigestrC   s    r)   r4   zCharsetMatch.fingerprint   s    
 dkkm$..00r+   rX   )r6   r   r,   N)r,   r   )utf_8)'__name__
__module____qualname__bytesr1   floatboolr   r*   r7   r>   propertyrD   rF   r
   rN   rQ   rS   rV   r3   r   r^   ra   rd   r   rq   r;   r<   r{   r}   r   r   r   r   rn   r   r   r   r4   r   r+   r)   r   r      s    .2'' "' #	'
 !' *' &c]'2Yt Y
(t ( 

e 

 

 U   	67 	6 	6 W# W# #   
$s) 
 
 $T $ $ $ $ $ /49 / / %# % %. %u % % %5 % %
 2u 2 2 65 6 6 U   $~.   %d % % $49 $ $ EtCy E E	$s 	$ 	$ 1S 1 1r+   r   c                   v    e Zd ZdZddee   fdZd ZdefdZde	fdZ
d	eddfd
Zded   fdZded   fdZy)CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    Nresultsc                 8    |rt        |      | _        y g | _        y rX   )r   _results)r(   r   s     r)   r*   zCharsetMatches.__init__  s    +2wr+   c              #   6   K   | j                   D ]  }|  y wrX   r   )r(   results     r)   __iter__zCharsetMatches.__iter__  s     mm 	FL	s   r,   c                     t        |t              r| j                  |   S t        |t              r/t	        |d      }| j                  D ]  }||j
                  v s|c S  t        )z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        F)r.   intr   r1   r   rn   KeyError)r(   itemr   s      r)   __getitem__zCharsetMatches.__getitem__
  s`    
 dC ==&&dC T5)D-- "6777!M" r+   c                 ,    t        | j                        S rX   )rp   r   rC   s    r)   __len__zCharsetMatches.__len__  s    4==!!r+   r   c                    t        |t              s-t        dj                  t	        |j
                                    t        |j                        t        k  rW| j                  D ]H  }|j                  |j                  k(  s|j                  |j                  k(  s7|j                  |        y | j                  j                  |       t        | j                        | _	        y)z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r.   r   r9   r0   r1   r2   rp   r   r   r   r4   r;   rV   rU   r   )r(   r   matchs      r)   rU   zCharsetMatches.append  s    
 $-LSSTWX\XfXfTghiitxx=,, $$(8(88U[[DJJ=V&&t, 	T"t}}-r+   r   c                 :    | j                   sy| j                   d   S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   rC   s    r)   r   zCharsetMatches.best+  s     }}}}Qr+   c                 "    | j                         S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r   rC   s    r)   r   zCharsetMatches.first3  s     yy{r+   rX   )r   r   r   __doc__r   r   r*   r   r   r   r   rU   r   r   r   r   r+   r)   r   r      sj    ;\ 2 ;< " ".< .D .  h~.  x/ r+   r   c                   t    e Zd Zdededee   dee   dedee   deded	ed
ee   defdZe	d        Z
defdZy)CliDetectionResultpathr3   r^   alternative_encodingsrq   r   r   r;   r<   unicode_pathis_preferredc                     || _         |
| _        || _        || _        || _        || _        || _        || _        || _        |	| _	        || _
        y rX   )r   r   r3   r^   r   rq   r   r   r;   r<   r   )r(   r   r3   r^   r   rq   r   r   r;   r<   r   r   s               r)   r*   zCliDetectionResult.__init__@  sT    	(  0%:" ",
"(r+   c                     | j                   | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  dS )Nr   r3   r^   r   rq   r   r   r;   r<   r   r   r   rC   s    r)   __dict__zCliDetectionResult.__dict__M  se     II $ 5 5%)%?%?"11ZZ -- --
 	
r+   r,   c                 2    t        | j                  dd      S )NT   )ensure_asciiindent)r   r   rC   s    r)   to_jsonzCliDetectionResult.to_json]  s    MM
 	
r+   N)r   r   r   r1   r   r   r   r   r*   r   r   r   r   r+   r)   r   r   >  s    )S )C )49 )eijmen )z} )  KO  PS  KT )  fj )  sx )  EJ )  Zb  cf  Zg )  w{ ) 
 

 
r+   r   )"r@   encodings.aliasesr   hashlibr   jsonr   typingr   r   r   r	   collectionsr
   rer   r   rI   charset_normalizer.constantr   charset_normalizer.mdr   charset_normalizer.utilsr   r   r   r   r   r1   r   CoherenceMatchr   r   CharsetNormalizerMatchr   r+   r)   <module>r      so     %   - -  ) 8 , U Um1 m1`9 9x sEz"' $
 $
N & r+   