a
    ±Kb„+  ã                   @   s  d Z ddlZddlZddlZddlmZ ddlmZm	Z	 zddl
mZ W n eyZ   Y n0 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd„ deƒZG dd„ deƒZdd„ Zdd„ Zdd„ Zd"dd„Zdd„ Zdd„ Zd#dd„Ze dkrdd l!mZ edƒ ed!ƒ dS )$z
Named entity chunker
é    N)ÚElementTree)ÚClassifierBasedTaggerÚpos_tag)ÚMaxentClassifier)ÚChunkParserI)Ú
ChunkScore)Úfind)Úword_tokenize)ÚTreec                   @   s0   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zd
S )ÚNEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    c                 C   s   t j| || jd d S )N)ÚtrainZclassifier_builder)r   Ú__init__Ú_classifier_builder©Úselfr   © r   ú6lib/python3.9/site-packages/nltk/chunk/named_entity.pyr   $   s    ÿzNEChunkParserTagger.__init__c                 C   s   t j|ddddS )NZmegamé   é   )Ú	algorithmZgaussian_prior_sigmaZtrace)r   r   r   r   r   r   r   )   s    ÿz'NEChunkParserTagger._classifier_builderc                 C   sD   z
| j }W n4 ty>   ddlm} t| d¡ƒ| _ | j }Y n0 |S )Nr   )Úwordszen-basic)Z_en_wordlistÚAttributeErrorZnltk.corpusr   Úset)r   Zwlr   r   r   r   Ú_english_wordlist.   s    
z%NEChunkParserTagger._english_wordlistc                 C   s0  || d }t || d ƒ}|dkrBd  }}d  }}	d  }
 }}nÂ|dkr”||d  d  ¡ }d }t ||d  d ƒ}d }	||d  d }d  }
}np||d  d  ¡ }||d  d  ¡ }t ||d  d ƒ}t ||d  d ƒ}	||d  }||d  }t|ƒ}
|t|ƒd kr(d  }}d  }}n”|t|ƒd krl||d  d  ¡ }||d  d  ¡ }d }d }nP||d  d  ¡ }||d  d  ¡ }||d  d  ¡ }||d  d  ¡ }dt|ƒt|ƒ|d d…  ¡ |dd …  ¡ ||||  ¡ v |||||| ¡ › d|› |› d|› |
› d|› dœ}|S )	Nr   r   r   Té   éýÿÿÿú+)ZbiasÚshapeZwordlenZprefix3Zsuffix3ÚposÚwordzen-wordlistÚprevtagÚprevposÚnextposÚprevwordÚnextwordzword+nextposzpos+prevtagzshape+prevtag)Úsimplify_posÚlowerr   Úlenr   )r   ÚtokensÚindexÚhistoryr   r   r#   Zprevprevwordr!   ZprevprevposZ	prevshaper    Zprevprevtagr$   Znextnextwordr"   ZnextnextposZfeaturesr   r   r   Ú_feature_detector8   sd    


ðz%NEChunkParserTagger._feature_detectorN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__r   r   r   r+   r   r   r   r   r      s
   
r   c                   @   s<   e Zd ZdZdd„ Zdd„ Zdd„ Zdd	„ Zed
d„ ƒZ	dS )ÚNEChunkParserz2
    Expected input: list of pos-tagged words
    c                 C   s   |   |¡ d S ©N)Ú_trainr   r   r   r   r   x   s    zNEChunkParser.__init__c                 C   s   | j  |¡}|  |¡}|S )z8
        Each token should be a pos-tagged word
        )Ú_taggerÚtagÚ_tagged_to_parse)r   r(   ZtaggedÚtreer   r   r   Úparse{   s    
zNEChunkParser.parsec                    s"   ‡ fdd„|D ƒ}t |dˆ _d S )Nc                    s   g | ]}ˆ   |¡‘qS r   )Ú_parse_to_tagged)Ú.0Ús©r   r   r   Ú
<listcomp>…   ó    z(NEChunkParser._train.<locals>.<listcomp>)r   )r   r3   )r   Zcorpusr   r;   r   r2   ƒ   s    zNEChunkParser._trainc                 C   s´   t dg ƒ}|D ] \}}|dkr*| |¡ q| d¡rP| t |dd… |gƒ¡ q| d¡r|r”t|d t ƒr”|d  ¡ |dd… kr”|d  |¡ q| t |dd… |gƒ¡ q|S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        ÚSÚOúB-r   NúI-éÿÿÿÿ)r
   ÚappendÚ
startswithÚ
isinstanceÚlabel)r   Ztagged_tokensÚsentÚtokr4   r   r   r   r5   ‰   s    


*zNEChunkParser._tagged_to_parsec                 C   sˆ   g }| D ]z}t |tƒrtt|ƒdkr,tdƒ q| |d d| ¡ › f¡ |dd… D ]}| |d| ¡ › f¡ qTq| |df¡ q|S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencer@   r   NrA   r?   )rE   r
   r'   ÚprintrC   rF   )rG   ÚtoksÚchildrH   r   r   r   r8   ›   s    
zNEChunkParser._parse_to_taggedN)
r,   r-   r.   r/   r   r7   r2   r5   Ústaticmethodr8   r   r   r   r   r0   s   s   r0   c                 C   s^   t  d| t j¡rdS t  d| t j¡r(dS t  d| t j¡rV|  ¡ rDdS |  ¡ rPdS dS nd	S d S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$Znumberz\W+$Úpunctz\w+$ZupcaseZdowncaseZ	mixedcaseÚother)ÚreÚmatchÚUNICODEÚistitleÚislower)r   r   r   r   r   ®   s    r   c                 C   s    |   d¡rdS |  d¡d S d S )NÚVú-r   )rD   Úsplit)r:   r   r   r   r%   ¾   s    
r%   c                 C   s„   |   ¡ }dd„ t|ƒD ƒ}tdg ƒ}| D ]V}t|tƒrl| t| ¡ g ƒ¡ |D ]}|d  |t|ƒf¡ qNq(| |t|ƒf¡ q(|S )Nc                 s   s   | ]\}}|V  qd S r1   r   )r9   r   r   r   r   r   Ú	<genexpr>È   r=   zpostag_tree.<locals>.<genexpr>r>   rB   )Úleavesr   r
   rE   rC   rF   Únext)r6   r   Ztag_iterZnewtreerK   Zsubchildr   r   r   Úpostag_treeÅ   s    

rZ   ÚbinaryTc                 c   sb   | D ]X}t  |¡D ]H\}}}| d¡r,|r,q|D ](}| d¡r0tt j ||¡|ƒE d H  q0qqd S )NZbnewsz.sgm)ÚosÚwalkÚendswithÚload_ace_fileÚpathÚjoin)ÚrootsÚfmtZ
skip_bnewsÚrootÚdirsÚfilesÚfr   r   r   Úload_ace_dataÔ   s    
rh   c                 c   s   t dtj | ¡d › ƒ | d }g }t|ƒ}t |¡ ¡ }W d   ƒ n1 sR0    Y  | d¡D ]d}| 	d¡j
}| d¡D ]H}| d¡dkr”q€t| 	d	¡j
ƒ}	t| 	d
¡j
ƒd }
| |	|
|f¡ q€qft| ƒ}| ¡ }W d   ƒ n1 sò0    Y  t dd|¡}dd„ }t d||¡}t dd|¡}t dd|¡}t dd|¡}dd„ |D ƒ}|dkröd}tdg ƒ}t|ƒD ]^\}	}
}|	|k r|}	|
|	kržqx| t|||	… ƒ¡ | td||	|
…  ¡ ƒ¡ |
}qx| t||d … ƒ¡ |V  n¦|dkr”d}tdg ƒ}t|ƒD ]^\}	}
}|	|k r.|}	|
|	kr<q| t|||	… ƒ¡ | t|||	|
…  ¡ ƒ¡ |
}q| t||d … ƒ¡ |V  ntdƒ‚d S )Nz  - r   z.tmx.rdc.xmlzdocument/entityZentity_typeZentity_mentionZTYPEÚNAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+>Ú c                 S   s   d|   ¡ |  ¡  d  S )Nú é   )ÚendÚstart)Úmr   r   r   Úsubfunc÷   s    zload_ace_file.<locals>.subfuncz[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c                 S   s   h | ]\}}}|’qS r   r   )r9   r:   ÚeÚtypr   r   r   Ú	<setcomp>  r=   z load_ace_file.<locals>.<setcomp>r[   r   r>   ZNEÚ
multiclasszbad fmt value)rI   r\   r`   rV   ÚopenÚETr7   ZgetrootÚfindallr   ÚtextÚgetÚintrC   ÚreadrO   Úsubr
   ÚsortedÚextendr	   Ú
ValueError)Ztextfilerc   ZannfileZentitiesZinfileZxmlZentityrr   Zmentionr:   rq   rx   rp   Zentity_typesÚirJ   r   r   r   r_   Þ   sb    
,
&







r_   c                 C   s¬   t  | ¡} t  |¡}d}t| |ƒD ]„\\}}\}}||  krFdkr„n n:|s¦td|d›d|d›d|› ƒ td ddd¡ƒ d}q"d}td|d›d|d›d|› ƒ q"d S )	NFr?   z  Z15rk   z  {:15} {:15} {2}ú...T)r0   r8   ÚziprI   Úformat)ÚcorrectZguessedZellipsisÚwZctÚgtr   r   r   Ú
cmp_chunks'  s    

r‡   c                 C   s&  t dƒ tdƒtdƒtdƒtdƒg}t|| ƒ}dd„ |D ƒ}t dƒ t|ƒ}~t d	ƒ td
ƒg}t|| ƒ}dd„ |D ƒ}t dƒ tƒ }t|ƒD ]4\}	}
| |
 ¡ ¡}| |
|¡ |	dk rŽt	|
|ƒ qŽt |ƒ d| › d}t d|› dƒ t
|dƒ}t ||d¡ W d   ƒ n1 s0    Y  |S )NzLoading training data...zcorpora/ace_data/ace.devzcorpora/ace_data/ace.heldoutzcorpora/ace_data/bbn.devzcorpora/ace_data/muc.devc                 S   s   g | ]}t |ƒ‘qS r   ©rZ   ©r9   Útr   r   r   r<   ?  r=   zbuild_model.<locals>.<listcomp>zTraining...zLoading eval data...zcorpora/ace_data/ace.evalc                 S   s   g | ]}t |ƒ‘qS r   rˆ   r‰   r   r   r   r<   G  r=   zEvaluating...r   z/tmp/ne_chunker_z.picklezSaving chunker to r   ÚwbrB   )rI   r   rh   r0   r   Ú	enumerater7   rX   Zscorer‡   ru   ÚpickleÚdump)rc   Ztrain_pathsZtrain_treesZ
train_dataZcpZ
eval_pathsZ
eval_treesZ	eval_dataZ
chunkscorer€   r„   ZguessZoutfilenameZoutfiler   r   r   Úbuild_model6  s8    ü


.r   Ú__main__)r   rt   )r[   T)r[   )"r/   r\   r   rO   Z	xml.etreer   rv   Znltk.tagr   r   Znltk.classifyr   ÚImportErrorZnltk.chunk.apir   Znltk.chunk.utilr   Z	nltk.datar   Znltk.tokenizer	   Z	nltk.treer
   r   r0   r   r%   rZ   rh   r_   r‡   r   r,   Znltk.chunk.named_entityr   r   r   r   Ú<module>   s6   T;

I
%
