a
    v5`n^                     @   s8  d dl mZ d dlmZ ddlmZmZ zd dl	Z	W n e
yJ   dZ	Y n0 g dZG dd deZG d	d
 d
eZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZe Ze Ze  ZZe Ze Ze Ze Ze Ze Ze Z dS )    )defaultdict)zip_longest   )BaseBaseSimilarityN)HammingMLIPNSLevenshteinDamerauLevenshteinJaroJaroWinklerStrCmp95NeedlemanWunschGotohSmithWatermanhammingmlipnslevenshteindamerau_levenshteinjarojaro_winklerstrcmp95needleman_wunschgotohsmith_watermanc                   @   s"   e Zd ZdZd
ddZdd	 ZdS )r   z
    Compute the Hamming distance between the two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    https://en.wikipedia.org/wiki/Hamming_distance
    r   NFTc                 C   s"   || _ |p| j| _|| _|| _d S N)qval_ident	test_functruncateexternal)selfr   r   r   r     r"   Alib/python3.9/site-packages/textdistance/algorithms/edit_based.py__init__#   s    zHamming.__init__c                    sH    j | } j| }|d ur |S  jr*tnt}t fdd|| D S )Nc                    s   g | ]} j |  qS r"   )r   ).0esr!   r"   r#   
<listcomp>1       z$Hamming.__call__.<locals>.<listcomp>)_get_sequencesquick_answerr   zipr   sum)r!   	sequencesresult_zipr"   r'   r#   __call__)   s    

zHamming.__call__)r   NFT)__name__
__module____qualname____doc__r$   r1   r"   r"   r"   r#   r      s   
r   c                   @   s2   e Zd ZdZdddZdd Zd	d
 Zdd ZdS )r	   a  
    Compute the absolute Levenshtein distance between the two sequences.
    The Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion:     ABC -> BC, AC, AB
        * insertion:    ABC -> ABCD, EABC, AEBC..
        * substitution: ABC -> ABE, ADC, FBC..

    https://en.wikipedia.org/wiki/Levenshtein_distance
    TODO: https://gist.github.com/kylebgorman/1081951/9b38b7743a3cb5167ab2c6608ac8eea7fc629dca
    r   NTc                 C   s   || _ |p| j| _|| _d S r   r   r   r   r    r!   r   r   r    r"   r"   r#   r$   A   s    zLevenshtein.__init__c                 C   s   |r|st |t | S | |d |d rF| |d d |d d S t| |d d || ||d d }| |d d |d d }t||d S )Nr   )lenr   min)r!   s1s2dsr"   r"   r#   
_recursiveF   s    zLevenshtein._recursivec                 C   s   t |d }t |d }d}tr,t|}nt|}td|D ]}||gdg|d    }}td|D ]^}|| d }	||d  d }
| ||d  ||d  }||d  |  }t||	|
||< qfq>|d S )zp
        source:
        https://github.com/jamesturk/jellyfish/blob/master/jellyfish/_jellyfish.py#L18
        r   Nr   r8   )r9   numpyZarangeranger   r:   )r!   r;   r<   ZrowsZcolsprevZcurrcZdeletionZ	insertionZdistZeditr"   r"   r#   _cicledW   s    zLevenshtein._cicledc                 C   s4   |  ||\}}| ||}|d ur(|S | ||S r   )r*   r+   rE   r!   r;   r<   r/   r"   r"   r#   r1   n   s
    zLevenshtein.__call__)r   NT)r2   r3   r4   r5   r$   r?   rE   r1   r"   r"   r"   r#   r	   4   s
   
r	   c                   @   s2   e Zd ZdZdddZdd Zd	d
 Zdd ZdS )r
   a  
    Compute the absolute Damerau-Levenshtein distance between the two sequences.
    The Damerau-Levenshtein distance is the minimum number of edit operations necessary
    for transforming one sequence into the other. The edit operations allowed are:

        * deletion:      ABC -> BC, AC, AB
        * insertion:     ABC -> ABCD, EABC, AEBC..
        * substitution:  ABC -> ABE, ADC, FBC..
        * transposition: ABC -> ACB, BAC

    https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance
    r   NTc                 C   s   || _ |p| j| _|| _d S r   r6   r7   r"   r"   r#   r$      s    zDamerauLevenshtein.__init__c           	      C   s^  t jt|d t|d gt jd}tdt|d D ]}|d || d< q6tdt|d D ]}|d |d |< q^t|D ]\}}t|D ]\}}t| || }t||d  | d || |d  d ||d  |d  | || |< |r|sq| |||d  sqt|| | ||d  |d  | || |< qq||t|d  t|d  S )Nr   Zdtyper8      )r@   zerosr9   intrA   	enumerater   r:   	r!   r;   r<   r=   ijZcs1Zcs2Zcostr"   r"   r#   _numpy   s,    $
zDamerauLevenshtein._numpyc           	   	   C   sT  i }t dt|d D ]}|d ||df< qt dt|d D ]}|d |d|f< q>t|D ]\}}t|D ]\}}t| || }t||d |f d |||d f d ||d |d f | |||f< |rl|sql| |||d  sql| ||d  |sqlt|||f ||d |d f | |||f< qlq\|t|d t|d f S )za
        https://www.guyrutenberg.com/2008/12/15/damerau-levenshtein-distance-in-python/
        r8   r   rH   )rA   r9   rK   rJ   r   r:   rL   r"   r"   r#   _pure_python   s0    
zDamerauLevenshtein._pure_pythonc                 C   s4   |  ||\}}| ||}|d ur(|S | ||S r   )r*   r+   rP   rF   r"   r"   r#   r1      s
    zDamerauLevenshtein.__call__)r   NT)r2   r3   r4   r5   r$   rO   rP   r1   r"   r"   r"   r#   r
   x   s
   
!%r
   c                   @   s,   e Zd ZdZdddZdd Zdd
dZdS )r   a  
    Computes the Jaro-Winkler measure between two strings.
    The Jaro-Winkler measure is designed to capture cases where two strings
    have a low Jaro score, but share a prefix.
    and thus are likely to match.

    https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaro.js
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/jaro-winkler.js
    FTr   c                 C   s   || _ || _|| _|| _d S r   )r   long_tolerance
winklerizer    )r!   rQ   rR   r   r    r"   r"   r#   r$      s    zJaroWinkler.__init__c                 G   s   dS Nr   r"   r!   r.   r"   r"   r#   maximum   s    zJaroWinkler.maximum皙?c                 C   s  |  ||\}}| ||}|d ur(|S t|}t|}|r@|sDdS t||}|d d }|dk rfd}dg| }	dg| }
d}t|D ]l\}}td|| }t|| |d }t||d D ]4}|
| s|| |krd |	|< |
|< |d7 } qqq|sdS d }}t|	D ]T\}}|rt||D ]}|
| r$|d } qDq$|| || kr|d7 }q|d }|| ||  }||| | 7 }|d }| js|S |dks|dks|dkr|S t|d	}d}||k r || || kr || r |d7 }q|r||| d
|  7 }| jr,|d	kr0|S ||d ksPd| || k rT|S || d || |d  d  }|d
| | 7 }|S )N        rH   r   r   FT   ffffff?         ?)	r*   r+   r9   maxrK   r:   rA   rR   rQ   )r!   r;   r<   Zprefix_weightr/   Zs1_lenZs2_lenZmin_lensearch_rangeZs1_flagsZs2_flagsZcommon_charsrM   Zs1_chZlowhirN   kZtrans_countZs1_fweighttmpr"   r"   r#   r1      sn    





&  zJaroWinkler.__call__N)FTr   T)rV   r2   r3   r4   r5   r$   rU   r1   r"   r"   r"   r#   r      s   

r   c                       s   e Zd Zd fdd	Z  ZS )r   Fr   Tc                    s   t  j|d||d d S )NF)rQ   rR   r   r    )superr$   )r!   rQ   r   r    	__class__r"   r#   r$   B  s    zJaro.__init__)Fr   T)r2   r3   r4   r$   __classcell__r"   r"   rd   r#   r   A  s   r   c                   @   sN   e Zd ZdZdZdddZd	d
 Zdd Zdd Zdd Z	dd Z
dd ZdS )r   a0  
    Computes the Needleman-Wunsch measure between two strings.
    The Needleman-Wunsch generalizes the Levenshtein distance and considers global
    alignment between two strings. Specifically, it is computed by assigning
    a score to each alignment between two input strings and choosing the
    score of the best alignment, that is, the maximal score.
    An alignment between two strings is a set of correspondences between the
    characters of between them, allowing for gaps.

    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    Fr[   Nr   Tc                 C   s*   || _ || _|r|| _n| j| _|| _d S r   )r   gap_costsim_funcr   r    r!   rg   rh   r   r    r"   r"   r#   r$   Y  s    zNeedlemanWunsch.__init__c                 G   s   t tt| | j S r   )r\   mapr9   rg   rT   r"   r"   r#   minimumb  s    zNeedlemanWunsch.minimumc                 G   s   t tt|S r   )r\   rj   r9   rT   r"   r"   r#   rU   e  s    zNeedlemanWunsch.maximumc                 G   s   d| j |  S )z'Get distance between sequences
        r8   )
similarityrT   r"   r"   r#   distanceh  s    zNeedlemanWunsch.distancec                 G   s6   | j | }| j| }|dkr dS | j| | ||  S )!Get distance from 0 to 1
        r   )rk   rU   rm   r!   r.   rk   rU   r"   r"   r#   normalized_distancem  s
    

z#NeedlemanWunsch.normalized_distancec                 G   s6   | j | }| j| }|dkr dS | j| | |d  S )rn   r   r   rH   )rk   rU   rl   ro   r"   r"   r#   normalized_similarityv  s
    

z%NeedlemanWunsch.normalized_similarityc                 C   s<  t std| ||\}}t jt|d t|d ft jd}tt|d D ]}|| j  ||df< qPtt|d D ]}|| j  |d|f< qzt|dD ]|\}}t|dD ]h\}}||d |d f | 	|| }||d |f | j }	|||d f | j }
t
||	|
|||f< qq||jd d |jd d f S )Nz2Please, install numpy for Needleman-Wunsch measurer   rG   r   )r@   ImportErrorr*   rI   r9   floatrA   rg   rK   rh   r\   shape)r!   r;   r<   dist_matrM   rN   c1c2matchdeleteinsertr"   r"   r#   r1     s$     zNeedlemanWunsch.__call__)r[   Nr   T)r2   r3   r4   r5   Zpositiver$   rk   rU   rm   rp   rq   r1   r"   r"   r"   r#   r   K  s   
			r   c                   @   s*   e Zd ZdZdddZdd	 Zd
d ZdS )r   a  
    Computes the Smith-Waterman measure between two strings.
    The Smith-Waterman algorithm performs local sequence alignment;
    that is, for determining similar regions between two strings.
    Instead of looking at the total sequence, the Smith-Waterman algorithm compares
    segments of all possible lengths and optimizes the similarity measure.

    https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/smith-waterman.js
    r[   Nr   Tc                 C   s"   || _ || _|p| j| _|| _d S r   )r   rg   r   rh   r    ri   r"   r"   r#   r$     s    zSmithWaterman.__init__c                 G   s   t tt|S r   r:   rj   r9   rT   r"   r"   r#   rU     s    zSmithWaterman.maximumc                 C   s  t std| ||\}}| ||}|d ur4|S t jt|d t|d ft jd}t|ddD ]\}}t|ddD ]j\}}||d |d f | || }	||d |f | j	 }
|||d f | j	 }t
d|	|
||||f< qxqd||jd d |jd d f S )Nz0Please, install numpy for Smith-Waterman measurer   rG   startr   )r@   rr   r*   r+   rI   r9   rs   rK   rh   rg   r\   rt   )r!   r;   r<   r/   ru   rM   sc1rN   sc2rx   ry   rz   r"   r"   r#   r1     s"     zSmithWaterman.__call__)r[   Nr   Trb   r"   r"   r"   r#   r     s   

r   c                   @   s2   e Zd ZdZdddZdd	 Zd
d Zdd ZdS )r   zGotoh score
    Gotoh's algorithm is essentially Needleman-Wunsch with affine gap
    penalties:
    https://www.cs.umd.edu/class/spring2003/cmsc838t/papers/gotoh1982.pdf
    r   皙?NTc                 C   s0   || _ || _|| _|r|| _n| j| _|| _d S r   )r   gap_opengap_extrh   r   r    )r!   r   r   rh   r   r    r"   r"   r#   r$     s    zGotoh.__init__c                 G   s   t tt| S r   r{   rT   r"   r"   r#   rk     s    zGotoh.minimumc                 G   s   t tt|S r   r{   rT   r"   r"   r#   rU     s    zGotoh.maximumc              	   C   s  t std| ||\}}t|}t|}t j|d |d ft jd}t j|d |d ft jd}t j|d |d ft jd}d|d< td|d< td|d< td|d D ]R}td||df< | j | j|d   ||df< td||df< | j ||df< qtd|d D ]T}	td|d|	f< td|d|	f< | j |d|	f< | j | j|	d   |d|	f< qt	|ddD ]\}}
t	|ddD ]\}	}| 
|
|}t||d |	d f | ||d |	d f | ||d |	d f | |||	f< t||d |	f | j ||d |	f | j |||	f< t|||	d f | j |||	d f | j |||	f< qqrdd	 |jD \}}	t|||	f |||	f |||	f S )
Nz'Please, install numpy for Gotoh measurer   rG   r   )r   r   z-infr|   c                 s   s   | ]}|d  V  qdS )r   Nr"   )r%   nr"   r"   r#   	<genexpr>  r)   z!Gotoh.__call__.<locals>.<genexpr>)r@   rr   r*   r9   rI   rs   rA   r   r   rK   rh   r\   rt   )r!   r;   r<   len_s1len_s2Zd_matZp_matZq_matrM   rN   r~   r   Zsim_valr"   r"   r#   r1     sN    "zGotoh.__call__)r   r   Nr   T)r2   r3   r4   r5   r$   rk   rU   r1   r"   r"   r"   r#   r     s
   

r   c                   @   s:   e Zd ZdZdZdddZdd Zed	d
 Zdd Z	dS )r   z`strcmp95 similarity

    http://cpansearch.perl.org/src/SCW/Text-JaroWinkler-0.1/strcmp95.c
    )$)AE)r   I)r   O)r   U)BV)r   r   )r   r   )r   r   )r   r   )r   r   )r   r   )r   Y)r   r   )CG)r   F)Wr   )r   r   )XK)SZ)r   r   )Qr   )r   r   )MN)Lr   )r   r   )PR)r   J)2r   )5r   )8r   )1r   )r   r   )0r   )r   r   )r   r   )r   r   FTc                 C   s   || _ || _d S r   )long_stringsr    )r!   r   r    r"   r"   r#   r$   #  s    zStrCmp95.__init__c                 G   s   dS rS   r"   rT   r"   r"   r#   rU   '  s    zStrCmp95.maximumc                 C   s   dt |   k odk S   S )Nr   [   )ord)charr"   r"   r#   	_in_range*  s    zStrCmp95._in_rangec                 C   s  |   }|   }| ||}|d ur0|S t|}t|}tt}| jD ] \}}d|||f< d|||f< qN||kr|}	|}
n|}	|}
dg|	 }dg|	 }td|	d d }	d}|d }t|D ]l\}}t||	 d}t	||	 |}t
||d D ]8}|| dkr|| |krd||< d||< |d7 } qqq|dkr@dS d }}t|D ]\\}}|| sfqPt
||D ]"}|| dkrp|d } qqp||| krP|d7 }qP|d }d}|
|krpt
|D ]}|| dkrq| || sqt
|D ]l}|| dkrq| || s(q|| || f|vrBq|||| || f 7 }d||<  q̐qq|d | }|| ||  }||| | 7 }|d }|dkr|S t	|
d	}d}t||D ]B\}}||kr q||kr q| r q|d7 }q|r(||d
 d|  7 }| js4|S |
d	krB|S ||d ksbd| |
| k rf|S |d  rx|S || d || |d  d  }|d| | 7 }|S )NrX   r   rH   r   rW   g      $@g      @rY   rZ   rV   r[   )stripupperr+   r9   r   rJ   sp_mxr\   rK   r:   rA   r   r,   isdigitr   )r!   r;   r<   r/   r   r   Zadjwtrv   rw   r]   ZminvZs1_flagZs2_flagZnum_comZyl1rM   r~   ZlowlimZhilimrN   r_   Zn_transZn_simiZnum_simr`   r   resr"   r"   r#   r1   .  s    











  zStrCmp95.__call__N)FT)
r2   r3   r4   r5   r   r$   rU   staticmethodr   r1   r"   r"   r"   r#   r     s   	

r   c                   @   s*   e Zd ZdZdddZdd	 Zd
d ZdS )r   a*  
    Compute the Hamming distance between the two or more sequences.
    The Hamming distance is the number of differing items in ordered sequences.

    http://www.sial.iias.spb.su/files/386-386-1-PB.pdf
    https://github.com/Yomguithereal/talisman/blob/master/src/metrics/distance/mlipns.js
          ?rH   r   Tc                 C   s   || _ || _|| _|| _d S r   )r   	thresholdmaxmismatchesr    )r!   r   r   r   r    r"   r"   r#   r$     s    zMLIPNS.__init__c                 G   s   dS rS   r"   rT   r"   r"   r#   rU     s    zMLIPNS.maximumc                 G   s   | j | }| j| }|d ur |S d}t | }ttt|}t|r|| jkr|sVdS d|| |  | jkrpdS |d7 }|d8 }|d8 }q<|sdS dS )Nr   r   )	r*   r+   r   r\   rj   r9   allr   r   )r!   r.   r/   Z
mismatchesZhammaxlenr"   r"   r#   r1     s$    



zMLIPNS.__call__N)r   rH   r   Trb   r"   r"   r"   r#   r     s   
r   )!collectionsr   	itertoolsr   baser   Z_Baser   Z_BaseSimilarityr@   rr   __all__r   r	   r
   r   r   r   r   r   r   r   r   r   Zdameraur   r   r   r   r   r   r   r   r"   r"   r"   r#   <module>   s8   
Ded
R/I )
