a
    |%bœn  ã                   @   s¶  d Z ddlmZ ddlZddlZddlZzddlmZ ddlm	Z	 W n" e
yf   ddlmZm	Z	 Y n0 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ze W n ey¾   eZY n0 ze W n eyÞ   eZY n0 ze W n ey   eefZY n0 g d¢Ze dejejB ¡jZ e dej¡jZ!ejdgej"d dkrTej#fnd¢R Ž j$Z%e dej¡j&Z'e dej¡j&Z(e dej¡j$Z)dd„ Z*e d¡jZ+e dejejB ¡Z,e -d¡Z.ej-ddeidZ/G dd„ de0ƒZ1e1ƒ Z2e2j3Z3e dej¡e dej¡gZ4g d ¢Z5e d!ej¡e d"ej¡e d#¡gZ6d$gZ7e4e5e6e7fd%d&„Z8d'd(„ Z9d)d*„ Z:e8j e:_ g d+¢Z;d,gZ<d-e;e<ed.ƒfd/d0„Z=d1d2„ Z>d3d4„ Z?e d5ej¡Z@d6d7„ ZAdS )8zcA cleanup tool for HTML.

Removes unwanted tags and content.  See the `Cleaner` class for
details.
é    )Úabsolute_importN)Úurlsplit)Úunquote_plus)r   r   )Úetree)Údefs)Ú
fromstringÚXHTML_NAMESPACE)Úxhtml_to_htmlÚ_transform_result)Ú
clean_htmlÚcleanÚCleanerÚautolinkÚautolink_htmlÚ
word_breakÚword_break_htmlzexpression\s*\(.*?\)z
@\s*importz</?[a-zA-Z]+|\son[a-zA-Z]+\s*=é   © zdata:image/(.+);base64,z:(javascript|jscript|livescript|vbscript|data|about|mocha):z	(xml|svg)c                 C   s8   d}t | ƒD ]}t|ƒr dS |d7 }qtt| ƒƒ|kS )Nr   Té   )Ú_find_image_dataurlsÚ_is_unsafe_image_typeÚlenÚ_possibly_malicious_schemes)ÚsZsafe_image_urlsZ
image_typer   r   ú.lib/python3.9/site-packages/lxml/html/clean.pyÚ_has_javascript_schemeV   s    
r   z[\s\x00-\x08\x0B\x0C\x0E-\x19]+z\[if[\s\n\r]+.*?][\s\n\r]*>zdescendant-or-self::*[@style]zÂdescendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']Úx)Z
namespacesc                	   @   sê   e Zd ZdZdZdZdZdZdZdZ	dZ
dZdZdZdZdZdZdZdZdZdZdZejZdZdZddhZdd	„ Zed
dddgd
d
d
ddZdd„ Zdd„ Zdd„ Z dd„ Z!dd„ Z"d"dd„Z#dd„ Z$e% &de%j'¡j(Z)dd„ Z*d d!„ Z+dS )#r   a  
    Instances cleans the document of each of the possible offending
    elements.  The cleaning is controlled by attributes; you can
    override attributes in a subclass, or set them in the constructor.

    ``scripts``:
        Removes any ``<script>`` tags.

    ``javascript``:
        Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
        as they could contain Javascript.

    ``comments``:
        Removes any comments.

    ``style``:
        Removes any style tags.

    ``inline_style``
        Removes any style attributes.  Defaults to the value of the ``style`` option.

    ``links``:
        Removes any ``<link>`` tags

    ``meta``:
        Removes any ``<meta>`` tags

    ``page_structure``:
        Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.

    ``processing_instructions``:
        Removes any processing instructions.

    ``embedded``:
        Removes any embedded objects (flash, iframes)

    ``frames``:
        Removes any frame-related tags

    ``forms``:
        Removes any form tags

    ``annoying_tags``:
        Tags that aren't *wrong*, but are annoying.  ``<blink>`` and ``<marquee>``

    ``remove_tags``:
        A list of tags to remove.  Only the tags will be removed,
        their content will get pulled up into the parent tag.

    ``kill_tags``:
        A list of tags to kill.  Killing also removes the tag's content,
        i.e. the whole subtree, not just the tag itself.

    ``allow_tags``:
        A list of tags to include (default include all).

    ``remove_unknown_tags``:
        Remove any tags that aren't standard parts of HTML.

    ``safe_attrs_only``:
        If true, only include 'safe' attributes (specifically the list
        from the feedparser HTML sanitisation web site).

    ``safe_attrs``:
        A set of attribute names to override the default list of attributes
        considered 'safe' (when safe_attrs_only=True).

    ``add_nofollow``:
        If true, then any <a> tags will have ``rel="nofollow"`` added to them.

    ``host_whitelist``:
        A list or set of hosts that you can use for embedded content
        (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
        You can also implement/override the method
        ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
        implement more complex rules for what can be embedded.
        Anything that passes this test will be shown, regardless of
        the value of (for instance) ``embedded``.

        Note that this parameter might not work as intended if you do not
        make the links absolute before doing the cleaning.

        Note that you may also need to set ``whitelist_tags``.

    ``whitelist_tags``:
        A set of tags that can be included with ``host_whitelist``.
        The default is ``iframe`` and ``embed``; you may wish to
        include other tags like ``script``, or you may want to
        implement ``allow_embedded_url`` for more control.  Set to None to
        include all tags.

    This modifies the document *in place*.
    TFNr   ÚiframeÚembedc                 K   sª   t ƒ }| ¡ D ]Z\}}t| ||ƒ}|d ur\|dur\|dur\t|ttttfƒs\td||f ƒ‚t	| ||ƒ q| j
d u r„d|vr„| j| _
| d¡r¦| d¡r tdƒ‚d| _d S )NTFzUnknown parameter: %s=%rÚinline_styleÚ
allow_tagsÚremove_unknown_tagsúIIt does not make sense to pass in both allow_tags and remove_unknown_tags)ÚobjectÚitemsÚgetattrÚ
isinstanceÚ	frozensetÚsetÚtupleÚlistÚ	TypeErrorÚsetattrr   ÚstyleÚgetÚ
ValueErrorr!   )ÚselfÚkwZnot_an_attributeÚnameÚvalueÚdefaultr   r   r   Ú__init__ã   s     ÿ
ÿ

zCleaner.__init__ÚsrcÚhrefÚcoder#   )ÚscriptÚlinkÚappletr   r   ÚlayerÚac                 C   sÌ  z
|j }W n ty   Y n0 |ƒ }t|ƒ | d¡D ]
}d|_q6| jsR|  |¡ t| jp\dƒ}t| j	pjdƒ}t| j
pxdƒ}| jrŒ| d¡ | jrÐt| jƒ}| tj¡D ]&}|j}| ¡ D ]}	|	|vrº||	= qºq¨| jr | jrì| jtjks&| tj¡D ],}|j}| ¡ D ]}	|	 d¡r
||	= q
qø|j| jdd | js˜t|ƒD ]P}| d¡}
td	|
ƒ}td	|ƒ}|  |¡r~|jd= n||
krF| d|¡ qF| js t| d¡ƒD ]p}| d
d	¡  ¡  !¡ dkrØ| "¡  q®|j#pâd	}
td	|
ƒ}td	|ƒ}|  |¡rd|_#n||
kr®||_#q®| jr4| tj$¡ | j%rH| tj&¡ | jrZ| d¡ | jrnt '|d¡ | j(r‚| d¡ nP| js’| jrÒt| d¡ƒD ]0}d| dd	¡  ¡ v r |  )|¡s | "¡  q | j*rä| d¡ | j+rö| ,d¡ | j-rdt| d¡ƒD ]B}| .¡ }|dur:|jdvr:| .¡ }q|du r| "¡  q| ,d¡ | ,d¡ | j/rx| ,tj0¡ | j1r”| d¡ | ,d¡ | j2r¦| ,d¡ g }g }| ¡ D ]T}|j|v râ|  )|¡rÖq¶| 3|¡ n&|j|v r¶|  )|¡rþq¶| 3|¡ q¶|r<|d |kr<| 4d¡}d|_|j 5¡  n8|rt|d |krt| 4d¡}|jdkrld|_| 5¡  | 6¡  |D ]}| "¡  q€|D ]}| 7¡  q”| j8rÄ|rºt9dƒ‚ttj:ƒ}|r^| jsÞ| tj$¡ | j%sò| tj&¡ g }| ¡ D ]}|j|vrþ| 3|¡ qþ|r^|d |u rJ| 4d¡}d|_|j 5¡  |D ]}| 7¡  qN| j;rÈt<|ƒD ]X}|  =|¡sn| d¡}|r´d|v rªdd | v rªqnd!| }nd}| d|¡ qndS )"z&
        Cleans the document.
        ZimageZimgr   r9   ZonF)Zresolve_base_hrefr-   Ú Útypeztext/javascriptz/* deleted */r:   Z
stylesheetÚrelÚmeta)ÚheadÚhtmlÚtitleÚparamN)r;   r#   )r;   )r   r   r<   r#   rE   Zform)ZbuttonÚinputÚselectÚtextarea)ZblinkZmarqueer   ZdivrC   r"   Znofollowz
 nofollow z %s z%s nofollow)>ÚgetrootÚAttributeErrorr	   ÚiterÚtagÚcommentsÚkill_conditional_commentsr(   Ú	kill_tagsÚremove_tagsr    ÚscriptsÚaddÚsafe_attrs_onlyÚ
safe_attrsr   ZElementÚattribÚkeysÚ
javascriptr   Ú
startswithZrewrite_linksÚ_remove_javascript_linkr   Ú_find_styled_elementsr.   Ú_replace_css_javascriptÚ_replace_css_importÚ_has_sneaky_javascriptr-   r*   ÚlowerÚstripÚ	drop_treeÚtextÚCommentÚprocessing_instructionsZProcessingInstructionZstrip_attributesÚlinksÚallow_elementrA   Úpage_structureÚupdateÚembeddedZ	getparentÚframesZ
frame_tagsÚformsÚannoying_tagsÚappendÚpopÚclearÚreverseZdrop_tagr!   r/   ZtagsÚadd_nofollowÚ_find_external_linksÚallow_follow)r0   ÚdocrI   ÚelrO   rP   r    rT   rU   ZanameÚoldÚnewÚparentÚ_removeZ_killÚbadr@   r   r   r   Ú__call__  s*   





ÿÿ



















ÿ





ÿ
zCleaner.__call__c                 C   s   dS )zF
        Override to suppress rel="nofollow" on some anchors.
        Fr   )r0   Úanchorr   r   r   rr   Ä  s    zCleaner.allow_followc                 C   s€   |j | jvrdS | j|j  }t|ttfƒr^|D ]*}| |¡}|sF dS |  ||¡s. dS q.dS | |¡}|spdS |  ||¡S dS )zÀ
        Decide whether an element is configured to be accepted or rejected.

        :param el: an element.
        :return: true to accept the element or false to reject/discard it.
        FTN)rL   Ú_tag_link_attrsr&   r*   r)   r.   Úallow_embedded_url)r0   rt   ÚattrZone_attrÚurlr   r   r   re   Ê  s    

zCleaner.allow_elementc                 C   s^   | j dur|j| j vrdS t|ƒ\}}}}}| ¡  dd¡d }|dvrLdS || jv rZdS dS )a  
        Decide whether a URL that was found in an element's attributes or text
        if configured to be accepted or rejected.

        :param el: an element.
        :param url: a URL found on the element.
        :return: true to accept the URL and false to reject it.
        NFú:r   r   )ZhttpZhttpsT)Úwhitelist_tagsrL   r   r^   ÚsplitÚhost_whitelist)r0   rt   r   ZschemeZnetlocÚpathZqueryZfragmentr   r   r   r}   â  s    	
zCleaner.allow_embedded_urlc                    s"   t j‰ |  |‡ fdd„tj¡ dS )zÎ
        IE conditional comments basically embed HTML that the parser
        doesn't normally see.  We can't allow anything like that, so
        we'll kill any comments that could be conditional.
        c                    s
   ˆ | j ƒS ©N)ra   )rt   ©Zhas_conditional_commentr   r   Ú<lambda>ý  ó    z3Cleaner.kill_conditional_comments.<locals>.<lambda>N)Ú_conditional_comment_reÚsearchÚ_kill_elementsr   rb   )r0   rs   r   r†   r   rN   õ  s
    þz!Cleaner.kill_conditional_commentsc                 C   s<   g }|  |¡D ]}||ƒr| |¡ q|D ]}| ¡  q*d S r…   )rK   rl   r`   )r0   rs   Z	conditionZiteratery   rt   r   r   r   r‹      s    zCleaner._kill_elementsc                 C   s   t dt|ƒƒ}t|ƒrdS |S )Nr>   )Ú_substitute_whitespacer   r   )r0   r:   rv   r   r   r   rY     s    zCleaner._remove_javascript_linkz	/\*.*?\*/c                 C   sj   |   d|¡}| dd¡}td|ƒ}| ¡ }t|ƒr6dS d|v rBdS d|v rNdS d|v rZdS t|ƒrfdS dS )aÆ  
        Depending on the browser, stuff like ``e x p r e s s i o n(...)``
        can get interpreted, or ``expre/* stuff */ssion(...)``.  This
        checks for attempt to do stuff like this.

        Typically the response will be to kill the entire style; if you
        have just a bit of Javascript in the style another rule will catch
        that and remove only the Javascript from the style; this catches
        more sneaky attempts.
        r>   ú\Tzexpression(z@importz
</noscriptF)Ú_substitute_commentsÚreplacerŒ   r^   r   Ú_looks_like_tag_content)r0   r-   r   r   r   r]     s    
zCleaner._has_sneaky_javascriptc                 C   s8   t |ƒ}t|tƒrt|ƒ}n
t |¡}| |ƒ t||ƒS r…   )r?   r&   Ú
basestringr   ÚcopyÚdeepcopyr
   )r0   rC   Úresult_typers   r   r   r   r   /  s    


zCleaner.clean_html)N),Ú__name__Ú
__module__Ú__qualname__Ú__doc__rQ   rW   rM   r-   r   rd   rA   rf   rc   rh   ri   rj   rk   rP   r    rO   r!   rS   r   rT   rp   rƒ   r   r5   Údictr|   rz   rr   re   r}   rN   r‹   rY   ÚreÚcompileÚSÚsubrŽ   r]   r   r   r   r   r   r   m   sX   ^	ï :
r   zb(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)z9mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z])))rH   Úprer8   rB   rG   r=   z
^localhostz\bexample\.(?:com|org|net)$z^127\.0\.0\.1$Znolinkc                 C   sÚ   | j |v rdS |  d¡}|r<| ¡ }|D ]}||v r( dS q(t| ƒD ]\}t|||||d |jrDt|j||| jd\}}	|	rD||_|  |¡}
|	| |
d |
d …< qD| j	rÖt| j	||| jd\}}|rÖ|| _	|| dd…< dS )a  
    Turn any URLs into links.

    It will search for links identified by the given regular
    expressions (by default mailto and http(s) links).

    It won't link text in an element in avoid_elements, or an element
    with a class in avoid_classes.  It won't link to anything with a
    host that matches one of the regular expressions in avoid_hosts
    (default localhost and 127.0.0.1).

    If you pass in an element, the element's tail will not be
    substituted, only the contents of the element.
    NÚclass)Úlink_regexesÚavoid_elementsÚavoid_hostsÚavoid_classes)Úfactoryr   r   )
rL   r.   r‚   r*   r   ÚtailÚ
_link_textZmakeelementÚindexra   )rt   r    r¡   r¢   r£   Ú
class_nameZmatch_classÚchildra   Ztail_childrenr§   Zpre_childrenr   r   r   r   O  s:    

ýÿ

ÿ
r   c                 C   s¶  d}g }d}d\}}|D ]x}	|}
|	j | |
d}|d u r8qf| d¡}|D ]}|  |¡rF| ¡ }
 q qFqfq |d u rpq|d u s„| ¡ |k r|}| ¡ }q|d u rÈ|r¸|d jr¬J ‚| |d _n|rÀJ ‚| }q®| d¡}| ¡ }| d¡sð| d¡r|d	8 }|d d… }| d | ¡ … }|r6|d jr*J ‚||d _n|r@J ‚|}|d
ƒ}| d|¡ | d¡}|sl|}| d¡s„| d¡r|d d… }||_| |¡ | |d … } q||fS )Nr>   r   )NN)ÚposÚhostéÿÿÿÿÚ.ú,r   r=   r7   Úbody)	rŠ   ÚgroupÚendÚstartr¥   Úendswithr(   ra   rl   )ra   r    r¢   r¤   Zleading_textrd   Zlast_posZ
best_matchZbest_posZregexZ	regex_posÚmatchr«   Z
host_regexr:   r±   Z	prev_textr{   r¯   r   r   r   r¦   |  sb    






r¦   c                 O   sF   t | ƒ}t| tƒrt| ƒ}n
t | ¡}t|g|¢R i |¤Ž t||ƒS r…   )r?   r&   r‘   r   r’   r“   r   r
   ©rC   Úargsr1   r”   rs   r   r   r   r   ¶  s    


r   )rž   rH   r8   Znobreaké(   i   c           	      C   s–   | j tv rdS |  d¡}|rJd}| ¡ }|D ]}||v r,d} qBq,|rJdS | jr`t| j||ƒ| _| D ],}t|||||d |jrdt|j||ƒ|_qddS )aç  
    Breaks any long words found in the body of the text (not attributes).

    Doesn't effect any of the tags in avoid_elements, by default
    ``<textarea>`` and ``<pre>``

    Breaks words by inserting &#8203;, which is a unicode character
    for Zero Width Space character.  This generally takes up no space
    in rendering, but does copy as a space, and in monospace contexts
    usually takes up space.

    See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
    NrŸ   FT)Ú	max_widthr¡   r£   Úbreak_character)rL   Ú_avoid_word_break_elementsr.   r‚   ra   Ú_break_textr   r¥   )	rt   r¸   r¡   r£   r¹   r¨   Z
dont_breakZavoidr©   r   r   r   r   È  s,    

ýr   c                 O   s0   t | ƒ}t| ƒ}t|g|¢R i |¤Ž t||ƒS r…   )r?   r   r   r
   rµ   r   r   r   r   ñ  s    r   c                 C   s:   |   ¡ }|D ](}t|ƒ|krt|||ƒ}|  ||¡} q| S r…   )r‚   r   Ú_insert_breakr   )ra   r¸   r¹   ZwordsÚwordZreplacementr   r   r   r»   ÷  s    r»   z[^a-z]c                 C   s„   | }d}t | ƒ|krx| d |… }tt |¡ƒ}|rZ|d }| ¡ |d krZ| d | ¡ … }||| 7 }| t |ƒd … } q|| 7 }|S )Nr>   r¬   é
   )r   r*   Ú_break_prefer_reÚfinditerr±   )r½   Úwidthr¹   Z	orig_wordÚresultr²   ZbreaksZ
last_breakr   r   r   r¼     s    r¼   )Br˜   Z
__future__r   r’   rš   ÚsysZurlparser   Zurllibr   ÚImportErrorZurllib.parseZlxmlr   Z	lxml.htmlr   r   r   r	   r
   ZunichrÚ	NameErrorÚchrZunicodeÚstrr‘   ÚbytesÚ__all__r›   rœ   ÚIr   r[   r\   Úversion_infoÚASCIIrŠ   r   Úfindallr   r   r   r   rŒ   r‰   ZXPathrZ   rq   r#   r   r   r   Z_link_regexesZ_avoid_elementsZ_avoid_hostsZ_avoid_classesr   r¦   r   rº   Z_avoid_word_break_classesr   r   r»   r¿   r¼   r   r   r   r   Ú<module>   s¶   

ÿÿÿþ
ÿþÿÿý   Nýýý
-:	ý
)