a
    I_B                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlZd dlmZm	Z	m
Z
mZ e eZedddgZedg d	Zh d
ZdhZh dZh dZddhZddhZdhZddhZedZddgZdd Zdd ZG dd deZG dd deZ G dd deZ!dS )     N)
namedtuple)time)ParseResultquoteurlparse
urlunparseZRulefieldvalueRequestRate)requestsseconds
start_timeend_time>   Z	dissallowZdisalowZdissalowdisallowZdiasllowZdisallawallow>   Z	useragentz
user-agentz
user agent>   zsite-mapZsitemapsitemapszcrawl-delayzcrawl delayzrequest-ratezrequest ratehost*$0123456789ABCDEFabcdefProtegoc              	   C   s2   t | tv | tv | tv | tv | tv | tv | tv gS N)any_DISALLOW_DIRECTIVE_ALLOW_DIRECTIVE_USER_AGENT_DIRECTIVE_SITEMAP_DIRECTIVE_CRAWL_DELAY_DIRECTIVE_REQUEST_RATE_DIRECTIVE_HOST_DIRECTIVE)r    r    &lib/python3.9/site-packages/protego.py_is_valid_directive_field   s    r"   c                 C   s   |  dr| S d|  S )N/)
startswith)patternr    r    r!   _enforce_path)   s    
r&   c                   @   s(   e Zd ZdZdd Zdd Zdd ZdS )	_URLPatternz.Internal class which represents a URL pattern.c                 C   sj   || _ t|| _d| j v | _| j d| _| jrJ| j d | j d | _n| jr`| j d d | _d| _	d S )Nr   r   F)
_patternlenpriority_contains_asteriskendswith_contains_dollarfind_pattern_before_asterisk_pattern_before_dollar_pattern_compiledselfr%   r    r    r!   __init__3   s    
z_URLPattern.__init__c                 C   sr   | j r| j|S | js4| js*|| jS || jkS || jsDdS | | j| _t	
| j| _d| _ | j|S )zDRetun True if pattern matches the given URL, otherwise return False.FT)r2   r)   matchr,   r.   r$   r1   r0   _prepare_pattern_for_regexrecompile)r4   urlr    r    r!   r6   @   s    
z_URLPattern.matchc                 C   sf   t dd|}t d|}t|D ]4\}}|tvrBt |||< q"|| dkr"d||< q"d|}|S )z:Return equivalent regex pattern for the given URL pattern.z\*+r   z(\*|\$$)z.*? )r8   subsplit	enumerate
_WILDCARDSescapejoin)r4   r%   sindexZsubstrr    r    r!   r7   V   s    

z&_URLPattern._prepare_pattern_for_regexN)__name__
__module____qualname____doc__r5   r6   r7   r    r    r    r!   r'   0   s   r'   c                   @   s   e Zd ZdZdd Zdd Zddd	Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zedd Zejdd Zedd Zejdd ZdS ) _RuleSetz3Internal class which stores rules for a user agent.c                 C   s"   d | _ g | _d | _d | _|| _d S r   )
user_agent_rules_crawl_delay	_req_rate_parser_instance)r4   Zparser_instancer    r    r!   r5   f   s
    z_RuleSet.__init__c                 C   s2   |   }| jdkrdS | j|v r.t| jS dS )zReturn matching score.r      r   )striplowerrI   r*   )r4   Z	robotnamer    r    r!   
applies_tom   s    


z_RuleSet.applies_tor;   replacec           	      C   s   d|vr|S dd }dd |D }| d}|d d|d< tdt|D ]}t|| d	krt|| d
d	 tr|| d
d	  }|| d	d
 }||vr|||d ||< qLn|| ||< d|| d ||< qLd|	d|S )z9Replace %xy escapes by their single-character equivalent.%c                 S   s   t jrtt| dS t| S )z6Replaces a %xx escape with equivalent binary sequence.   )sixPY2chrintbytesfromhex)hr    r    r!   hex_to_byte{   s    z&_RuleSet._unquote.<locals>.hex_to_bytec                 S   s   h | ]}d  t|qS )z{:02X})formatord).0cr    r    r!   	<setcomp>       z$_RuleSet._unquote.<locals>.<setcomp>r   utf-8rN      N   %rb   )
r=   encoderanger*   setissubset_HEX_DIGITSupperrA   decode)	r4   r:   ignoreerrorsr\   partsiZhexcodeZleftoverr    r    r!   _unquotev   s"    
z_RuleSet._unquotec                 C   s4   t t|dd  }t|dkr,d| }d| S )z!Escape char as RFC 2396 specifiesrd   NrN   z0%srS   )hexr^   rk   r*   )r4   charZhex_reprr    r    r!   	hexescape   s    z_RuleSet.hexescapec                 C   sb   t |}| j|jdd}tjr2t|ddd}nt|dd}tdd||j|j	|j
}t|}|S )zReturn percent encoded path.z/%rm   rc   safer;   r   rq   pathrU   rV   r   rf   r   paramsqueryfragmentr   )r4   ry   ro   r    r    r!   _quote_path   s    z_RuleSet._quote_pathc                 C   s   d}|d dks(|d dks(|d dkr<|d }|d d }t |}| j|jdd}tjrnt|dd	d
}nt|d	d
}tdd|| |j|j	|j
}t|}|S )Nr;   r(   ?;r   z/*$%ru   rc   z/*%rv   rx   )r4   r%   Z	last_charro   r    r    r!   _quote_pattern   s    $z_RuleSet._quote_patternc                 C   sn   d|v r |  |d| d | |}|s2d S | jtdt|d |drj|  |d d d  d S )Nr   r   r   r	   z/index.htmli)	r   rR   rt   r   rJ   append_Ruler'   r-   r3   r    r    r!   r      s    

z_RuleSet.allowc                 C   sN   d|v r |  |d| d | |}|s2d S | jtdt|d d S )Nr   r   r   )r   rR   rt   r   rJ   r   r   r'   r3   r    r    r!   r      s    
z_RuleSet.disallowc                 C   s   | j jdd dd d S )Nc                 S   s   | j j| jdkfS )Nr   )r	   r+   r   )rr    r    r!   <lambda>   rb   z)_RuleSet.finalize_rules.<locals>.<lambda>T)keyreverse)rJ   sortr4   r    r    r!   finalize_rules   s    z_RuleSet.finalize_rulesc                 C   s<   |  |}d}| jD ]"}|j|r|jdkr2d} q8q|S )z!Return if the url can be fetched.Tr   F)r}   rJ   r	   r6   r   )r4   r:   allowedZruler    r    r!   	can_fetch   s    


z_RuleSet.can_fetchc                 C   s   | j S )z'Get & set crawl delay for the rule set.)rK   r   r    r    r!   crawl_delay   s    z_RuleSet.crawl_delayc              	   C   sB   zt |}W n* ty6   td| jj| Y d S 0 || _d S )NzOMalformed rule at line {} : cannot set crawl delay to '{}'. Ignoring this rule.)float
ValueErrorloggerdebugr]   rM   _total_line_seenrK   )r4   Zdelayr    r    r!   r      s    c                 C   s   | j S )z(Get & set request rate for the rule set.)rL   r   r    r    r!   request_rate   s    z_RuleSet.request_ratec           
   	   C   s8  z|  }t|dkr |\}}n|d d }}| d\}}|d  }t|t|d d  }}|dkrt|d9 }n"|dkr|d	9 }n|d
kr|d9 }d }d }	|r| d\}}	tt|d d t|dd  }tt|	d d t|	dd  }	W n, ty"   td| j	j
| Y d S 0 t||||	| _d S )Nrd   r   r;   r#   r(   m<   r[   i  diQ -zSMalformed rule at line {} : cannot set request rate using '{}'. Ignoring this rule.)r=   r*   rP   rX   r   	Exceptionr   r   r]   rM   r   r
   rL   )
r4   r	   ro   ZrateZtime_periodr   r   Z	time_unitr   r   r    r    r!   r      s4    


"&N)r;   rR   )rD   rE   rF   rG   r5   rQ   rq   rt   r}   r   r   r   r   r   propertyr   setterr   r    r    r    r!   rH   c   s&   	
$	


rH   c                   @   sl   e Zd Zdd Zedd Zdd Zdd Zd	d
 Zdd Z	dd Z
edd Zedd Zedd ZdS )r   c                 C   s.   i | _ d | _g | _i | _d| _d| _d| _d S Nr   )_user_agents_host_sitemap_list_matched_rule_setr   _invalid_directive_seen_total_directive_seenr   r    r    r!   r5   "  s    zProtego.__init__c                 C   s   |  }| | |S r   )_parse_robotstxt)clscontentor    r    r!   parse3  s    
zProtego.parsec                 C   s  |  }g }d }|D ]}|  jd7  _|d}|dkrJ|d|  }| }|sXq|ddkrx|dd\}}nj|d}	t|	dk rq|	d }
tdt|	D ]8}t|
r|
d|	|d   }} q|
d|	|  7 }
qq| 	 }| }|s|}q|s&|t
vr&td| j q|  jd7  _|t
v r|rR|t
vrRg }| 	 }d }|d	krd	|v r|d	d
}||fD ]`}|sq| j|d }|r||vr|| |st| }||_|| j|< || qn|tv r|D ]}|t| qn|tv r:|D ]}|t| q"nt|tv rR| j| n\|tv rp|D ]}||_q`n>|tv r|D ]}||_q~n |t v r|| _!n|  j"d7  _"|}q| j# D ]}|$  qd S )NrN   #r(   r   : rd   z8Rule at line {} without any user agent to enforce it on.r   r;   )%
splitlinesr   r/   rO   r=   r*   rg   r"   rA   rP   r   r   r   r]   r   rR   r   getr   rH   rI   r   r   r&   r   r   r   r   r   r   r   r   r   r   r   valuesr   )r4   r   linesZcurrent_rule_setsZprevious_rule_fieldlineZhash_posr   r	   ro   Zpossible_filedrp   rI   Zuser_agent_without_asteriskZrule_setr    r    r!   r   9  s    











zProtego._parse_robotstxtc                    sj   | j s
dS  | jv r| j  S  fdd| j  D }t|dd d\}}|s\d| j < dS || j < |S )z0Return the rule set with highest matching score.Nc                 3   s   | ]}|  |fV  qd S r   )rQ   )r_   ZrsrI   r    r!   	<genexpr>  rb   z1Protego._get_matching_rule_set.<locals>.<genexpr>c                 S   s   | d S r   r    )pr    r    r!   r     rb   z0Protego._get_matching_rule_set.<locals>.<lambda>)r   )r   r   r   max)r4   rI   Zscore_rule_set_pairsZmatch_scorematched_rule_setr    r   r!   _get_matching_rule_set  s    



zProtego._get_matching_rule_setc                 C   s   |  |}|sdS ||S )zHReturn True if the user agent can fetch the URL, otherwise return False.T)r   r   )r4   r:   rI   r   r    r    r!   r     s    
zProtego.can_fetchc                 C   s   |  |}|sdS |jS )zvReturn the crawl delay specified for the user agent as a float.
        If nothing is specified, return None.
        N)r   r   r4   rI   r   r    r    r!   r     s    
zProtego.crawl_delayc                 C   s   |  |}|sdS |jS )zReturn the request rate specified for the user agent as a named tuple
        RequestRate(requests, seconds, start_time, end_time). If nothing is
        specified, return None.
        N)r   r   r   r    r    r!   r     s    
zProtego.request_ratec                 C   s
   t | jS )z7Get an iterator containing links to sitemaps specified.)iterr   r   r    r    r!   r     s    zProtego.sitemapsc                 C   s   | j S )zGet the preferred host.)r   r   r    r    r!   preferred_host  s    zProtego.preferred_hostc                 C   s   | j | j S r   )r   r   r   r    r    r!   _valid_directive_seen  s    zProtego._valid_directive_seenN)rD   rE   rF   r5   classmethodr   r   r   r   r   r   r   r   r   r   r    r    r    r!   r      s   
o	


)"loggingr8   collectionsr   Zdatetimer   rU   Zsix.moves.urllib.parser   r   r   r   Z	getLoggerrD   r   r   r
   r   r   r   r   r   r   r   r?   rh   rj   __all__r"   r&   objectr'   rH   r   r    r    r    r!   <module>   s4   

3 >