# -*- coding: utf-8 -*-
"""
Functions for dealing with markup text
"""

import warnings
import re
import six
from six import moves

from w3lib.util import to_bytes, to_unicode
from w3lib.url import safe_url_string

# Character reference: named, decimal (``#nnn``) or hexadecimal (``#xhhh``),
# followed by an optional semicolon.
_ent_re = re.compile(r'&((?P<named>[a-z\d]+)|#(?P<dec>\d+)|#x(?P<hex>[a-f\d]+))(?P<semicolon>;?)', re.IGNORECASE)
# Anything that looks like an opening, closing or declaration tag.
_tag_re = re.compile(r'<[a-zA-Z\/!].*?>', re.DOTALL)
# Quoted ``href`` value of a ``<base>`` element (group 1 is the URL).
_baseurl_re = re.compile(six.u(r'<base\s[^>]*href\s*=\s*[\"\']\s*([^\"\'\s]+)\s*[\"\']'), re.I)
# ``<meta http-equiv="refresh" content="<int>; url=<url>">``
_meta_refresh_re = re.compile(six.u(r'<meta\s[^>]*http-equiv[^>]*refresh[^>]*content\s*=\s*(?P<quote>["\'])(?P<int>(\d*\.)?\d+)\s*;\s*url=\s*(?P<url>.*?)(?P=quote)'), re.DOTALL | re.IGNORECASE)
# A whole CDATA section (group 1); ``cdata_d`` is the text between the markers.
_cdata_re = re.compile(r'((?P<cdata_s><!\[CDATA\[)(?P<cdata_d>.*?)(?P<cdata_e>\]\]>))', re.DOTALL)

HTML5_WHITESPACE = ' \t\n\r\x0c'


def remove_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    r"""

    .. warning::

        This function is deprecated and will be removed in future.
        Please use :func:`replace_entities` instead.
    """

    warnings.warn(
        "`w3lib.html.remove_entities` function is deprecated and "
        "will be removed in future releases. Please use "
        "`w3lib.html.replace_entities` instead.",
        DeprecationWarning
    )

    return replace_entities(text, keep, remove_illegal, encoding)


def replace_entities(text, keep=(), remove_illegal=True, encoding='utf-8'):
    u"""Remove entities from the given `text` by converting them to their
    corresponding unicode character.

    `text` can be a unicode string or a byte string encoded in the given
    `encoding` (which defaults to 'utf-8').

    If `keep` is passed (with a list of entity names) those entities will
    be kept (they won't be removed). Names in `keep` are compared against
    the lower-cased entity name.

    It supports both numeric entities (``&#nnnn;`` and ``&#xhhhh;``)
    and named entities (such as ``&nbsp;`` or ``&gt;``).

    If `remove_illegal` is ``True``, entities that can't be converted are
    removed (only when they end with a semicolon). If `remove_illegal` is
    ``False``, entities that can't be converted are kept "as is".

    Always returns a unicode string (with the entities removed).

    >>> import w3lib.html
    >>> w3lib.html.replace_entities(b'Price: &pound;100')
    u'Price: \\xa3100'
    >>> print(w3lib.html.replace_entities(b'Price: &pound;100'))
    Price: £100
    >>>

    """

    def convert_entity(m):
        groups = m.groupdict()
        number = None
        if groups.get('dec'):
            number = int(groups['dec'], 10)
        elif groups.get('hex'):
            number = int(groups['hex'], 16)
        elif groups.get('named'):
            entity_name = groups['named']
            if entity_name.lower() in keep:
                return m.group(0)
            # Try the exact spelling first, then fall back to lower case.
            number = (moves.html_entities.name2codepoint.get(entity_name) or
                      moves.html_entities.name2codepoint.get(entity_name.lower()))
        if number is not None:
            # Numeric character references in the 80-9F range are typically
            # interpreted by browsers as representing the characters mapped
            # to bytes 80-9F in the Windows-1252 encoding. For more info
            # see: http://en.wikipedia.org/wiki/Character_encodings_in_HTML
            try:
                if 0x80 <= number <= 0x9f:
                    return six.int2byte(number).decode('cp1252')
                return six.unichr(number)
            except (ValueError, OverflowError):
                # ValueError: code point out of range or byte undefined in
                # cp1252. OverflowError: number too large for unichr/chr.
                pass

        return u'' if remove_illegal and groups.get('semicolon') else m.group(0)

    return _ent_re.sub(convert_entity, to_unicode(text, encoding))


def has_entities(text, encoding=None):
    """Return ``True`` if `text` contains at least one character reference
    (named or numeric), ``False`` otherwise.

    `text` is converted with ``to_unicode(text, encoding)`` before searching.
    """
    return bool(_ent_re.search(to_unicode(text, encoding)))


def replace_tags(text, token='', encoding=None):
    """Replace all markup tags found in the given `text` by the given token.
    By default `token` is an empty string so it just removes all tags.

    `text` can be a unicode string or a regular string encoded as `encoding`
    (or ``'utf-8'`` if `encoding` is not given.)

    Always returns a unicode string.

    Examples:

    >>> import w3lib.html
    >>> w3lib.html.replace_tags(u'This text contains <a>some tag</a>')
    u'This text contains some tag'
    >>> w3lib.html.replace_tags('<p>Je ne parle pas <b>fran\\xe7ais</b></p>', ' -- ', 'latin-1')
    u' -- Je ne parle pas  -- fran\\xe7ais --  -- '
    >>>

    """

    return _tag_re.sub(token, to_unicode(text, encoding))


# An HTML comment; an unterminated comment runs to the end of the text.
_REMOVECOMMENTS_RE = re.compile(u'<!--.*?(?:-->|$)', re.DOTALL)


def remove_comments(text, encoding=None):
    """ Remove HTML Comments.

    >>> import w3lib.html
    >>> w3lib.html.remove_comments(b"test <!--textcoment--> whatever")
    u'test  whatever'
    >>>

    """

    text = to_unicode(text, encoding)
    return _REMOVECOMMENTS_RE.sub(u'', text)


def remove_tags(text, which_ones=(), keep=(), encoding=None):
    """ Remove HTML Tags only.

    `which_ones` and `keep` are both tuples, there are four cases:

    ==============  ============= ==========================================
    ``which_ones``  ``keep``      what it does
    ==============  ============= ==========================================
    **not empty**   empty         remove all tags in ``which_ones``
    empty           **not empty** remove all tags except the ones in ``keep``
    empty           empty         remove all tags
    **not empty**   **not empty** not allowed
    ==============  ============= ==========================================

    Tag names are compared case-insensitively.

    Remove all tags:

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags(doc)
    u'This is a link: example'
    >>>

    Keep only some tags:

    >>> w3lib.html.remove_tags(doc, keep=('div',))
    u'<div>This is a link: example</div>'
    >>>

    Remove only specific tags:

    >>> w3lib.html.remove_tags(doc, which_ones=('a','b'))
    u'<div><p>This is a link: example</p></div>'
    >>>

    You can't remove some and keep some:

    >>> w3lib.html.remove_tags(doc, which_ones=('a',), keep=('p',))
    Traceback (most recent call last):
      File "<stdin>", line 1, in <module>
      File "/usr/local/lib/python2.7/dist-packages/w3lib/html.py", line 101, in remove_tags
        assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'
    AssertionError: which_ones and keep can not be given at the same time
    >>>

    """

    # NOTE: kept as an assert because AssertionError is the documented
    # behaviour; be aware that it is skipped when Python runs with -O.
    assert not (which_ones and keep), 'which_ones and keep can not be given at the same time'

    which_ones = {tag.lower() for tag in which_ones}
    keep = {tag.lower() for tag in keep}

    def will_remove(tag):
        tag = tag.lower()
        if which_ones:
            return tag in which_ones
        else:
            return tag not in keep

    def remove_tag(m):
        tag = m.group(1)
        return u'' if will_remove(tag) else m.group(0)

    # Group 1 captures the tag name of an opening or closing tag.
    regex = '</?([^ >/]+).*?>'
    retags = re.compile(regex, re.DOTALL | re.IGNORECASE)

    return retags.sub(remove_tag, to_unicode(text, encoding))


def remove_tags_with_content(text, which_ones=(), encoding=None):
    """Remove tags and their content.

    `which_ones` is a tuple of which tags to remove including their content.
    If is empty, returns the string unmodified.

    >>> import w3lib.html
    >>> doc = '<div><p><b>This is a link:</b> <a href="http://www.example.com">example</a></p></div>'
    >>> w3lib.html.remove_tags_with_content(doc, which_ones=('b',))
    u'<div><p> <a href="http://www.example.com">example</a></p></div>'
    >>>

    """

    text = to_unicode(text, encoding)
    if which_ones:
        # For every tag: either ``<tag ...>...</tag>`` or self-closing ``<tag/>``.
        tags = '|'.join([r'<%s\b.*?</%s>|<%s\s*/>' % (tag, tag, tag) for tag in which_ones])
        retags = re.compile(tags, re.DOTALL | re.IGNORECASE)
        text = retags.sub(u'', text)
    return text


def replace_escape_chars(text, which_ones=('\n', '\t', '\r'), replace_by=u'',
                         encoding=None):
    """Remove escape characters.

    `which_ones` is a tuple of which escape characters we want to remove.
    By default removes ``\\n``, ``\\t``, ``\\r``.

    `replace_by` is the string to replace the escape characters by.
    It defaults to ``''``, meaning the escape characters are removed.

    Always returns a unicode string.

    """

    text = to_unicode(text, encoding)
    if not which_ones:
        return text
    # The replacement does not depend on the character: convert it once.
    replacement = to_unicode(replace_by, encoding)
    for ec in which_ones:
        text = text.replace(ec, replacement)
    return text


def unquote_markup(text, keep=(), remove_illegal=True, encoding=None):
    """
    This function receives markup as a text (a unicode string, or a byte
    string that is decoded with `encoding`) and does the following:

    1. removes entities (except the ones in `keep`) from any part of it
       that is not inside a CDATA
    2. searches for CDATAs and extracts their text (if any) without modifying it.
    3. removes the found CDATAs

    """

    def _get_fragments(txt, pattern):
        # Alternately yield plain-text slices (strings) and CDATA matches
        # (match objects), in document order.
        offset = 0
        for match in pattern.finditer(txt):
            match_s, match_e = match.span(1)
            yield txt[offset:match_s]
            yield match
            offset = match_e
        yield txt[offset:]

    text = to_unicode(text, encoding)
    pieces = []
    for fragment in _get_fragments(text, _cdata_re):
        if isinstance(fragment, six.string_types):
            # it's not a CDATA (so we try to remove its entities)
            pieces.append(
                replace_entities(fragment, keep=keep, remove_illegal=remove_illegal))
        else:
            # it's a CDATA (so we just extract its content)
            pieces.append(fragment.group('cdata_d'))
    return u''.join(pieces)


def get_base_url(text, baseurl='', encoding='utf-8'):
    """Return the base url if declared in the given HTML `text`,
    relative to the given base url.

    If no base url is found, the given `baseurl` is returned (after being
    passed through ``safe_url_string``).

    """

    text = to_unicode(text, encoding)
    m = _baseurl_re.search(text)
    if m:
        return moves.urllib.parse.urljoin(
            safe_url_string(baseurl),
            safe_url_string(m.group(1), encoding=encoding)
        )
    else:
        return safe_url_string(baseurl)


def get_meta_refresh(text, baseurl='', encoding='utf-8', ignore_tags=('script', 'noscript')):
    """Return the http-equiv parameter of the HTML meta element from the given
    HTML text and return a tuple ``(interval, url)`` where interval is a float
    containing the delay in seconds and url is a string with the absolute url
    to redirect, joined against `baseurl`.

    Elements listed in `ignore_tags` are dropped together with their content
    before searching, and entities and comments are removed as well.

    If no meta redirect is found, ``(None, None)`` is returned.

    Raises ``UnicodeDecodeError`` if `text` cannot be decoded with `encoding`.

    """

    if six.PY2:
        baseurl = to_bytes(baseurl, encoding)
    text = to_unicode(text, encoding)
    text = remove_tags_with_content(text, ignore_tags)
    text = remove_comments(replace_entities(text))
    m = _meta_refresh_re.search(text)
    if m:
        interval = float(m.group('int'))
        # The captured url may still carry its own quotes/blanks.
        url = safe_url_string(m.group('url').strip(' "\''), encoding)
        url = moves.urllib.parse.urljoin(baseurl, url)
        return interval, url
    else:
        return None, None


def strip_html5_whitespace(text):
    r"""
    Strip all leading and trailing space characters (as defined in
    https://www.w3.org/TR/html5/infrastructure.html#space-character).

    Such stripping is useful e.g. for processing HTML element attributes which
    contain URLs, like ``href``, ``src`` or form ``action`` - HTML5 standard
    defines them as "valid URL potentially surrounded by spaces"
    or "valid non-empty URL potentially surrounded by spaces".

    >>> strip_html5_whitespace(' hello\n')
    'hello'
    """
    return text.strip(HTML5_WHITESPACE)