
    :'aE=                        d dl mZmZ d dlmZmZmZmZmZ 	 d dl	m
Z
 d dlmZmZmZ d dlmZ d dlmZmZ d dlmZ d d	lZd d
lmZmZmZmZmZmZ d dl m!Z!m"Z"m#Z#m$Z$  ejJ                  d      Z&e&jO                  ejP                          ejR                         Z*e*jW                   ejX                  d             e&j[                  e*       	 	 	 	 	 	 	 dde.de/de/de0dee   dee   de1de1defdZ2	 	 	 	 	 	 	 ddede/de/de0dee   dee   de1de1defdZ3	 	 	 	 	 	 	 dde
de/de/de0dee   dee   de1de1defdZ4dde
de/de/de0dee   dee   de1defdZ5y	# e$ r eedf   Z
Y Ew xY w)    )splitextbasename)ListBinaryIOOptionalSetUnion)PathLikezos.PathLike[str])TOO_SMALL_SEQUENCETOO_BIG_SEQUENCEIANA_SUPPORTED)
mess_ratio)CharsetMatchesCharsetMatch)warnN)any_specified_encodingis_multi_byte_encodingidentify_sig_or_bomshould_strip_sig_or_bomis_cp_similar	iana_name)coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratioscharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainreturnc                    |s$t         j                  t        j                         n#t         j                  t        j                         t        |       }|dk(  r/t         j                  d       t        t        | dddg d      g      S |?t         j                  dd	j                  |             |D 	cg c]  }	t        |	d       }}	ng }|?t         j                  d
d	j                  |             |D 	cg c]  }	t        |	d       }}	ng }|||z  k  rt         j                  d|||       d}|}|dkD  r||z  |k  rt        ||z        }t        |       t        k  }
t        |       t        k\  }|
rt        dj                  |             g }|du rt!        |       nd}|'|j#                  |       t         j%                  d|       t'               }g }g }d}d}d}d}d}t               }t)        |       \  }}|1|j#                  |       t         j%                  dt        |      |       |j#                  d       d|vr|j#                  d       |t*        z   D ]  }|r||vr|r||v r||v r|j-                  |       d}||k(  }|xr t/        |      }|dv r|du rt         j%                  d|       ]	 t1        |      }	 |r9|du r5t9        |du r| dt        d       n| t        |      t        d       |       nt9        |du r| n| t        |      d |      }d}|D ]  } t?        ||       sd} n |rt         j                  d|        tA        |du rdn
t        |      |t        ||z              }!|xr |duxr t        |      |k  }"|"rt         j%                  d|       t        t        |!      dz        }#|#dk  rd}#d}$g }%g }&|!D ]o  }'| |'|'|z    }(|r	|du r||(z   }(|(jC                  |d      })|%j#                  |)       |&j#                  tE        |)|             |&d   |k\  r|$dz  }$|$|#k\  s|sj|du so n |&rtG        |&      t        |&      z  }*nd}*|*|k\  s|$|#k\  ri|j#                  |       |s|dz  }t         j                  d||$tI        |*d z  d!"             |dd|fv r"t        | ||dg |      }+||k(  r|+}n
|dk(  r|+}n|+}vt         j%                  d#|tI        |*d z  d!"             |stK        |      },ntM        |      },|,r.t         j%                  d$j                  |t9        |,                   g }-|%D ]3  })tO        |)d%|,rd&j                  |,      nd      }.|-j#                  |.       5 tQ        |-      }/|/r%t         j%                  d'j                  |/|             |j#                  t        | ||*||/|             ||ddfv r,|*d%k  r't         j%                  d(|       t        ||   g      c S ||k(  r't         j%                  d)|       t        ||   g      c S |d   jR                  st         j%                  d*|||   jT                          t        |      dk(  r|s|s|rt         j                  d+       |r3t         j                  d,|jV                         |j#                  |       |S |r||rA|jX                  |jX                  k7  r(t         j                  d-       |j#                  |       |S |r&t         j                  d.       |j#                  |       |S c c}	w c c}	w # t2        t4        f$ r t         j7                  d|       Y w xY w# t:        $ rC}t         j                  d|t9        |             |j#                  |       |s|dz  }Y d}~Rd}~wt<        $ r |j#                  |       |s|dz  }Y yw xY w)/aD  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocs of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    r   zXGiven content is empty, stopping the process very early, returning empty utf_8 str matchutf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.   z>Trying to detect encoding from a tiny portion of ({}) byte(s).Tz@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z[Encoding %s wont be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %szW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      ignore)errorszc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigitsz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {}g?,z We detected language {} using {}z0%s is most likely the one. Stopping the process.z[%s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.z:Using %s code page we detected the following languages: %szONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z#%s will be used as a fallback matchz&utf_8 will be used as a fallback matchz&ascii will be used as a fallback match)-loggersetLevelloggingCRITICALINFOlenwarningr   r   joinr   intr   r   r   formatr   appendinfosetr   r   addr   r   ModuleNotFoundErrorImportErrordebugstrUnicodeDecodeErrorLookupErrorr   rangedecoder   sumroundr   r   r   r   	languages
_languagesr-   fingerprint)0r   r   r   r    r!   r"   r#   r$   lengthcpis_too_small_sequenceis_too_large_sequenceprioritized_encodingsspecified_encodingtestedtested_but_hard_failuretested_but_soft_failurefallback_asciifallback_u8fallback_specifiedsingle_byte_hard_failure_countsingle_byte_soft_failure_countresultssig_encodingsig_payloadencoding_ianadecoded_payloadbom_or_sig_availablestrip_sig_or_bomis_multi_byte_decoderesimilar_soft_failure_testencoding_soft_failedr_multi_byte_bonusmax_chunk_gave_upearly_stop_count	md_chunks	md_ratiosicut_sequencechunkmean_mess_ratiofallback_entrytarget_languages	cd_ratioschunk_languagescd_ratios_mergeds0                                                   6lib/python3.12/site-packages/charset_normalizer/api.py
from_bytesr{      s   2 (()%^F{qr	
 	
  @yy.	0 8DD	"e,DD6IIl#	% 8DD	"e,DD*u$%l:v	' 
qyVe^j0%(
	N-??	N.>>MTTU[\]>RVZ>Z/	:`d%$$%78VXjkUF  NK%&"%&"G 3I >L+$$\2_adepaqs  	A  )++$$W-.~= DM=M\9F"

=!+}</Z4KM4Z005IU5RKKu  xE  F	$:=$I!
	$)>%)G-=-FIjs4y)IVYZeVfgjkogpLq*
 #&!1U!:I	#kJZJ[@\*#  %*!$; 	 ],@A,0)	
 %NNt  wD  FZ  [%.AC4D
 1r_D5PrUXYhUilrUrKK  K  MZ  [B!,q  !		 	A$Qq:~6L#(8E(A*<7 ''h'GEU# }	) A%  $55;OTdhmTm+	. !)ns9~=O Oi'+;?P+P#**=9(.!3.NN ;(+ 3!6B	D '3E FF!-!#" !$66)7&"g-%3N"0KK/C'3	
 %1-@4]CKK@GGWZ[kWlmn	 	E-eSXh#((CS:TnrsO	 2)<KK:AABRTabc$ 		
 /'BBY\G\KKJMZ!'(  L(KKm "'(  2;  KKL&11ADL 7|q.,>NNlmNN@BTB]B]^NN-. N n4++JaJaeseeJNNCDNN;'
 N	 NNCDNN>*Nc E EV $[1 	LLM}]	 " 	NNVXegjklgmn#**=9(.!3. 	#**=9(.!3.		s=   4\17\6:\;A]';%]$#]$'	_08^..$__fpc           
      B    t        | j                         |||||||      S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )r{   read)r|   r   r   r    r!   r"   r#   r$   s           rz   from_fpr   b  s/     
		 	    pathc                 j    t        | d      5 }t        ||||||||      cddd       S # 1 sw Y   yxY w)z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openr   )	r   r   r   r    r!   r"   r#   r$   r|   s	            rz   	from_pathr   |  sA     
dD	 tRr5*i|Uikrst t ts   )2c           
         t        | ||||||      }t        |       }t        t        |            }	t	        |      dk(  rt        dj                  |            |j                         }
|	dxx   d|
j                  z   z  cc<   t        dj                  | j                  |dj                  |	                  d      5 }|j                  |
j                                ddd       |
S # 1 sw Y   |
S xY w)zi
    Take a (text-based) file path and try to create another file next to it, this time using UTF-8.
    r   z;Unable to normalize "{}", no encoding charset seems to fit.-z{}r(   wbN)r   r   listr   r<   IOErrorr@   bestr-   r   replacer>   writeoutput)r   r   r   r    r!   r"   r#   r`   filenametarget_extensionsresultr|   s               rz   	normalizer     s     G ~HXh/0
7|qSZZ[cdee\\^FaC&//11	dkk$,,x9J1KLMt	T 
XZ
MMO	


 M

 Ms   = C''C1)      皙?NNTF)r   r   r   NNT)6os.pathr   r   typingr   r   r   r   r	   osr
   rF   rH   charset_normalizer.constantr   r   r   charset_normalizer.mdr   charset_normalizer.modelsr   r   warningsr   r9   charset_normalizer.utilsr   r   r   r   r   r   charset_normalizer.cdr   r   r   r   	getLoggerr7   r8   DEBUGStreamHandlerhandlersetFormatter	Formatter
addHandlerbytesr?   floatboolr{   r   r   r    r   rz   <module>r      s   & 7 7. ] \ , B  6 6 t t			/	0  
'


!   &W&&'RS T   ' 
 "&"&%)DDD D 	D
 3iD 3iD #D D DR
 "&"&%)  	
 3i 3i #  8 "&"&%)ttt t 	t
 3it 3it #t t t$H S # PU lpqtlu   MQ  RU  MV   uy   EQ Q  .S,,-H.s   E E$#E$