# Copyright (c) 2004 Ian Bicking. All rights reserved. # # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions are # met: # # 1. Redistributions of source code must retain the above copyright # notice, this list of conditions and the following disclaimer. # # 2. Redistributions in binary form must reproduce the above copyright # notice, this list of conditions and the following disclaimer in # the documentation and/or other materials provided with the # distribution. # # 3. Neither the name of Ian Bicking nor the names of its contributors may # be used to endorse or promote products derived from this software # without specific prior written permission. # # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL IAN BICKING OR # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. """The ``lxml.html`` tool set for HTML handling. """ __all__ = [ 'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring', 'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form', 'find_rel_links', 'find_class', 'make_links_absolute', 'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse'] import copy import re from collections.abc import MutableMapping, MutableSet from functools import partial from urllib.parse import urljoin from .. 
from .. import etree
from . import defs
from ._setmixin import SetMixin


def __fix_docstring(s):
    """Drop the legacy ``u`` prefix from string reprs that start a line.

    Every line of *s* that begins (after optional whitespace) with ``u'``
    is rewritten to begin with ``'``.  Falsy input (``None`` or an empty
    string) is returned unchanged.
    """
    # TODO: remove and clean up doctests
    if not s:
        return s
    return re.sub(r"^(\s*)u'", r"\1'", s, flags=re.M)


XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches the plain HTML tag as
# well as its XHTML-namespaced twin (bound to the ``x`` prefix).
_rel_links_xpath = etree.XPath(
    "descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
    namespaces={'x': XHTML_NAMESPACE})
_options_xpath = etree.XPath(
    "descendant-or-self::option|descendant-or-self::x:option",
    namespaces={'x': XHTML_NAMESPACE})
_forms_xpath = etree.XPath(
    "descendant-or-self::form|descendant-or-self::x:form",
    namespaces={'x': XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
_class_xpath = etree.XPath(
    "descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")

# CSS helpers: ``url(...)`` references (double-quoted, single-quoted or
# bare) and double-quoted ``@import "..."`` statements.
_iter_css_urls = re.compile(
    r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I).finditer
_iter_css_imports = re.compile(r'@import "(.*?)"').finditer

_label_xpath = etree.XPath(
    "//label[@for=$id]|//x:label[@for=$id]",
    namespaces={'x': XHTML_NAMESPACE})
_archive_re = re.compile(r'[^ ]+')

# Splits a meta-refresh ``content`` value ("<delay>; url=<target>") and
# captures everything after the optional ``url=`` marker.
# NOTE(review): the group name had been lost from the pattern (``(?P.*)``
# is not a valid regex and fails at import time).  ``url`` is assumed here;
# confirm against the code that reads the match object.
_parse_meta_refresh_url = re.compile(
    r'[^;=]*;\s*(?:url\s*=\s*)?(?P<url>.*)$', re.I).search


def _unquote_match(s, pos):
    """Strip one pair of matching quotes from *s*.

    Returns a ``(text, position)`` tuple.  When *s* both starts and ends
    with the same quote character (``"`` or ``'``), the outer characters
    are removed and *pos* is advanced by one to account for the dropped
    opening quote.  Otherwise *s* and *pos* are returned unchanged.
    """
    for quote in ('"', "'"):
        if s[:1] == quote and s[-1:] == quote:
            return s[1:-1], pos + 1
    return s, pos
def _transform_result(typ, result):
    """Convert the result back into the input type.

    If *typ* is a ``bytes`` subclass, *result* is passed through
    ``tostring(..., encoding='utf-8')``; if it is a ``str`` subclass,
    through ``tostring(..., encoding='unicode')``.  For any other type
    *result* is returned as is.
    """
    if issubclass(typ, bytes):
        return tostring(result, encoding='utf-8')
    if issubclass(typ, str):
        return tostring(result, encoding='unicode')
    return result


def _nons(tag):
    """Return *tag* with a leading XHTML namespace removed.

    Non-string tags, and tags that are not in the XHTML namespace, are
    returned unchanged.
    """
    # Match the complete "{namespace}" prefix.  This is safe for an empty
    # string and does not mistake a longer namespace URI that merely starts
    # with the XHTML one for XHTML.
    if isinstance(tag, str) and tag.startswith('{%s}' % XHTML_NAMESPACE):
        return tag.split('}')[-1]
    return tag


class Classes(MutableSet):
    """Provides access to an element's class attribute as a set-like collection.

    The wrapped *attributes* mapping is read and written directly, so every
    change is immediately visible on the element.

    Usage::

        >>> el = fromstring('<p class="hidden large">text</p>')
        >>> classes = el.classes  # or: classes = Classes(el.attrib)
        >>> classes |= ['block', 'paragraph']
        >>> el.get('class')
        'hidden large block paragraph'
        >>> classes.toggle('hidden')
        False
        >>> el.get('class')
        'large block paragraph'
        >>> classes -= ('some', 'classes', 'block')
        >>> el.get('class')
        'large paragraph'
    """

    def __init__(self, attributes):
        self._attributes = attributes
        # Bound getter that yields '' when there is no 'class' attribute.
        self._get_class_value = partial(attributes.get, 'class', '')

    @staticmethod
    def _check_name(value):
        """Raise ValueError unless *value* is a non-empty name without whitespace."""
        if not value or re.search(r'\s', value):
            raise ValueError("Invalid class name: %r" % value)

    def _store(self, names):
        """Write *names* back to 'class'; drop the attribute when empty."""
        if names:
            self._attributes['class'] = ' '.join(names)
        elif 'class' in self._attributes:
            del self._attributes['class']

    def add(self, value):
        """
        Add a class.

        This has no effect if the class is already present.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        if value in names:
            return
        names.append(value)
        self._store(names)

    def discard(self, value):
        """
        Remove a class if it is currently present.

        If the class is not present, do nothing (apart from rewriting the
        attribute with single-space separators).  The attribute is deleted
        when no class is left.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        self._store([name for name in self._get_class_value().split()
                     if name != value])

    def remove(self, value):
        """
        Remove a class; it must currently be present.

        If the class is not present, raise a KeyError.
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        super().remove(value)

    def __contains__(self, name):
        class_value = self._get_class_value()
        # Cheap substring test first; only split when it could be a match.
        return name in class_value and name in class_value.split()

    def __iter__(self):
        return iter(self._get_class_value().split())

    def __len__(self):
        return len(self._get_class_value().split())

    # non-standard methods

    def update(self, values):
        """
        Add all names from 'values'.

        Names already present are skipped; the attribute is only written
        when at least one new name was added.
        """
        names = self._get_class_value().split()
        extended = False
        for value in values:
            if value not in names:
                names.append(value)
                extended = True
        if extended:
            self._store(names)

    def toggle(self, value):
        """
        Add a class name if it isn't there yet, or remove it if it exists.

        Returns true if the class was added (and is now enabled) and
        false if it was removed (and is now disabled).
        Raises ValueError for an empty name or one containing whitespace.
        """
        self._check_name(value)
        names = self._get_class_value().split()
        try:
            names.remove(value)
        except ValueError:
            names.append(value)
            enabled = True
        else:
            enabled = False
        self._store(names)
        return enabled
class HtmlMixin:

    def set(self, key, value=None):
        """set(self, key, value=None)

        Sets an element attribute.  If no value is provided, or if the
        value is None, creates a 'boolean' attribute without value, e.g.
        "<form novalidate></form>" for ``form.set('novalidate')``.
        """
        super().set(key, value)

    @property
    def classes(self):
        """
        A set-like wrapper around the 'class' attribute.
        """
        return Classes(self.attrib)

    @classes.setter
    def classes(self, classes):
        # The setter exists so that augmented assignments such as
        # "el.classes |= ..." work; only Classes instances are accepted.
        assert isinstance(classes, Classes)  # only allow "el.classes |= ..." etc.
        class_value = classes._get_class_value()
        if not class_value:
            # Nothing left to store: remove the attribute if it is set.
            if self.get('class') is not None:
                del self.attrib['class']
            return
        self.set('class', class_value)

    @property
    def base_url(self):
        """
        Returns the base URL, given when the page was parsed.

        Use with ``urllib.parse.urljoin(el.base_url, href)`` to get
        absolute URLs.
        """
        docinfo = self.getroottree().docinfo
        return docinfo.URL

    @property
    def forms(self):
        """
        Return a list of all the forms
        """
        return _forms_xpath(self)

    @property
    def body(self):
        """
        Return the <body> element.  Can be called from a child element
        to get the document's body.

        The first match of the XPath query is taken unconditionally.
        """
        matches = self.xpath('//body|//x:body',
                             namespaces={'x': XHTML_NAMESPACE})
        return matches[0]

    @property
    def head(self):
        """
        Returns the <head> element.  Can be called from a child
        element to get the document's head.

        The first match of the XPath query is taken unconditionally.
        """
        matches = self.xpath('//head|//x:head',
                             namespaces={'x': XHTML_NAMESPACE})
        return matches[0]