a
    v5`x                     @   sj  d dl Z d dlZd dlmZ d dlmZ d dlmZmZ ddl	m
Z zd dlZW n eyf   dZY n0 g dZzeefZW n ey   efZY n0 G dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZe Ze Ze Ze Z e Z!e Z"e Z#e Z$dS )    N)Counter)Fraction)groupbypermutations   )Base)ArithNCDLZMANCDBZ2NCDRLENCD	BWTRLENCDZLIBNCDSqrtNCD
EntropyNCDbz2_ncdlzma_ncd	arith_ncdrle_ncd
bwtrle_ncdzlib_ncdsqrt_ncdentropy_ncdc                   @   s6   e Zd ZdZdZdddZdd Zdd Zd	d
 ZdS )_NCDBasezNormalized compression distance (NCD)

    https://articles.orsinium.dev/other/ncd/
    https://en.wikipedia.org/wiki/Normalized_compression_distance#Normalized_compression_distance
    r   c                 C   s
   || _ d S Nqvalselfr    r   Hlib/python3.9/site-packages/textdistance/algorithms/compression_based.py__init__)   s    z_NCDBase.__init__c                 G   s   dS )Nr   r   r   	sequencesr   r   r   maximum,   s    z_NCDBase.maximumc                 C   s   t | |S r   )len	_compressr   datar   r   r   	_get_size/   s    z_NCDBase._get_sizec                    s   |sdS  j | }td}t|d  }t|D ]8}t|ttfrN||}n
t||}t	| 
|}q0 fdd|D }t|}|dkrdS |t	|t|d   | S )Nr   ZInfc                    s   g | ]}  |qS r   )r(   .0sr   r   r   
<listcomp>@       z%_NCDBase.__call__.<locals>.<listcomp>r   )Z_get_sequencesfloattyper   
isinstancestrbytesjoinsumminr(   maxr$   )r   r"   Z
concat_lenemptyr'   Zcompressed_lensZmax_lenr   r,   r   __call__2   s    

z_NCDBase.__call__N)r   )	__name__
__module____qualname____doc__r   r    r#   r(   r9   r   r   r   r   r   !   s   
r   c                       s$   e Zd Zdd Z fddZ  ZS )_BinaryNCDBasec                 C   s   d S r   r   r,   r   r   r   r    I   s    z_BinaryNCDBase.__init__c                    s0   |sdS t |d tr$dd |D }t j| S )Nr   c                 S   s   g | ]}| d qS )zutf-8)encoder)   r   r   r   r-   P   r.   z+_BinaryNCDBase.__call__.<locals>.<listcomp>)r1   string_typessuperr9   r!   	__class__r   r   r9   L   s
    z_BinaryNCDBase.__call__)r:   r;   r<   r    r9   __classcell__r   r   rB   r   r>   G   s   r>   c                   @   s:   e Zd ZdZdddZdd Zd	d
 Zdd Zdd ZdS )r   zArithmetic coding

    https://github.com/gw-c/arith
    http://www.drdobbs.com/cpp/data-compression-with-arithmetic-encodin/240169251
    https://en.wikipedia.org/wiki/Arithmetic_coding
       Nr   c                 C   s   || _ || _|| _d S r   )base
terminatorr   )r   rF   rG   r   r   r   r   r    \   s    zArithNCD.__init__c                 G   s   | j | }| j| }| jdur(d|| j< t| }i }d}t| dd dd}|D ](\}}t||t||f||< ||7 }qV||ksJ |S )zD
        https://github.com/gw-c/arith/blob/master/arith.py
        Nr   r   c                 S   s   | d | d fS )Nr   r   r   )xr   r   r   <lambda>m   r.   z&ArithNCD._make_probs.<locals>.<lambda>T)keyreverse)Z_get_countersZ_sum_countersrG   r5   valuessorteditemsr   )r   r"   ZcountsZtotal_lettersZ
prob_pairsZcumulative_countcharZcurrent_countr   r   r   _make_probsa   s    




zArithNCD._make_probsc                 C   sv   | j d ur,| j |v r"|| j d}|| j 7 }tdd}tdd}|D ]$}|| \}}||| 7 }||9 }qD||| fS )N r   r   )rG   replacer   )r   r'   probsstartwidthrO   Z
prob_startZ
prob_widthr   r   r   
_get_rangew   s    





zArithNCD._get_rangec                 C   sl   |  |}| j||d\}}tdd}d}||  kr>|k shn d|j| |j  }t||}|d9 }q*|S )N)r'   rS   r   r   rE   )rP   rV   r   	numeratordenominator)r   r'   rS   rT   endZoutput_fractionZoutput_denominatorZoutput_numeratorr   r   r   r%      s    



zArithNCD._compressc                 C   s,   |  |j}|dkrdS tt|| jS )Nr   )r%   rW   mathZceillogrF   )r   r'   rW   r   r   r   r(      s    zArithNCD._get_size)rE   Nr   )	r:   r;   r<   r=   r    rP   rV   r%   r(   r   r   r   r   r   T   s   
r   c                   @   s   e Zd ZdZdd ZdS )r   zORun-length encoding

    https://en.wikipedia.org/wiki/Run-length_encoding
    c                 C   sj   g }t |D ]R\}}tt|}|dkr<|t||  q|dkrP|| q|d|  qd|S )NrE   r   rQ   )r   r$   listappendr2   r4   )r   r'   Znew_datakgnr   r   r   r%      s    zRLENCD._compressNr:   r;   r<   r=   r%   r   r   r   r   r      s   r   c                       s*   e Zd ZdZdddZ fddZ  ZS )r   z
    https://en.wikipedia.org/wiki/Burrows%E2%80%93Wheeler_transform
    https://en.wikipedia.org/wiki/Run-length_encoding
     c                 C   s
   || _ d S r   )rG   )r   rG   r   r   r   r       s    zBWTRLENCD.__init__c                    s^    s| j  nF| j  vrR | j 7  t fddtt D }ddd |D  t  S )Nc                 3   s&   | ]} |d   d |  V  qd S r   r   )r*   ir'   r   r   	<genexpr>   r.   z&BWTRLENCD._compress.<locals>.<genexpr>rQ   c                 S   s   g | ]}|d  qS )r   )r*   Zsubdatar   r   r   r-      r.   z'BWTRLENCD._compress.<locals>.<listcomp>)rG   rM   ranger$   r4   rA   r%   )r   r'   ZmodifiedrB   rd   r   r%      s    

zBWTRLENCD._compress)rb   )r:   r;   r<   r=   r    r%   rD   r   r   rB   r   r      s   
r   c                   @   s*   e Zd ZdZd
ddZdd Zdd Zd	S )r   zSquare Root based NCD

    Size of compressed data equals to sum of square roots of counts of every
    element in the input sequence.
    r   c                 C   s
   || _ d S r   r   r   r   r   r   r       s    zSqrtNCD.__init__c                 C   s   dd t | D S )Nc                 S   s   i | ]\}}|t |qS r   )rZ   Zsqrt)r*   elementcountr   r   r   
<dictcomp>   r.   z%SqrtNCD._compress.<locals>.<dictcomp>)r   rN   r&   r   r   r   r%      s    zSqrtNCD._compressc                 C   s   t | | S r   )r5   r%   rL   r&   r   r   r   r(      s    zSqrtNCD._get_sizeN)r   r:   r;   r<   r=   r    r%   r(   r   r   r   r   r      s   
r   c                   @   s*   e Zd ZdZdddZdd Zdd	 Zd
S )r   zEntropy based NCD

    Get Entropy of input secueance as a size of compressed data.

    https://en.wikipedia.org/wiki/Entropy_(information_theory)
    https://en.wikipedia.org/wiki/Entropy_encoding
    r   rE   c                 C   s   || _ || _|| _d S r   )r   coefrF   )r   r   rl   rF   r   r   r   r       s    zEntropyNCD.__init__c                 C   sL   t |}d}t| D ]"}|| }||t|| j 8 }q|dksHJ |S )Ng        r   )r$   r   rL   rZ   r[   rF   )r   r'   Ztotal_countZentropyZelement_countpr   r   r   r%      s    zEntropyNCD._compressc                 C   s   | j | | S r   )rl   r%   r&   r   r   r   r(      s    zEntropyNCD._get_sizeN)r   r   rE   rk   r   r   r   r   r      s   
r   c                   @   s   e Zd ZdZdd ZdS )r
   z-
    https://en.wikipedia.org/wiki/Bzip2
    c                 C   s   t |ddd  S )N	bz2_codec   codecsr?   r&   r   r   r   r%      s    zBZ2NCD._compressNra   r   r   r   r   r
      s   r
   c                   @   s   e Zd ZdZdd ZdS )r	   z,
    https://en.wikipedia.org/wiki/LZMA
    c                 C   s   t stdt |dd  S )Nz$Please, install the PylibLZMA module   )lzmaImportErrorcompressr&   r   r   r   r%      s    zLZMANCD._compressNra   r   r   r   r   r	      s   r	   c                   @   s   e Zd ZdZdd ZdS )r   z,
    https://en.wikipedia.org/wiki/Zlib
    c                 C   s   t |ddd  S )N
zlib_codecrE   rp   r&   r   r   r   r%     s    zZLIBNCD._compressNra   r   r   r   r   r     s   r   )%rq   rZ   collectionsr   Z	fractionsr   	itertoolsr   r   rF   r   Z_Basers   rt   __all__r2   Zunicoder@   	NameErrorr   r>   r   r   r   r   r   r
   r	   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s@   
	&C"
