# Licensed under a 3-clause BSD style license - see LICENSE.rst
"""
A collection of functions for checking various XML-related strings for
standards compliance.
"""

import re
import urllib.parse

# Shape of an ID accepted by `check_id`: a letter or underscore followed by
# any number of letters, digits, underscores, periods or hyphens.  Always used
# with ``fullmatch`` so that a trailing newline cannot slip past the way it
# does with ``re.match(... "$")``.
_ID_REGEX = re.compile(r"[A-Za-z_][A-Za-z0-9_\.\-]*")

# Characters that may not start an ID / may not appear after the first
# character of an ID.  Used by `fix_id` to replace offenders.
_ID_BAD_FIRST_CHAR = re.compile(r"[^A-Za-z_]")
_ID_BAD_REST_CHAR = re.compile(r"[^A-Za-z0-9_\.\-]")

# Non-empty XML Schema token: runs of characters that are none of CR, LF, TAB
# or space, separated by exactly one space, with no leading/trailing space.
_TOKEN_REGEX = re.compile(r"[^\r\n\t ]+( [^\r\n\t ]+)*")

# RFC 2045 ``token``: one or more characters that are not a space, a control
# character (0x00-0x1F, 0x7F) or one of the ``tspecials``.
_MIME_TSPECIALS = '()<>@,;:\\"/[]?='
_MIME_EXCLUDED = (
    _MIME_TSPECIALS + ' ' + ''.join(chr(x) for x in range(0x20)) + '\x7f')
_MIME_TOKEN = f'[^{re.escape(_MIME_EXCLUDED)}]+'
_MIME_CONTENT_TYPE_REGEX = re.compile(
    fr'(?P<type>{_MIME_TOKEN})/(?P<subtype>{_MIME_TOKEN})')

# NOTE(review): every group in this pattern is optional and there is no end
# anchor, so ``match`` succeeds (possibly on the empty prefix) for any string.
# It is kept as-is to avoid rejecting URIs that are accepted today; confirm
# whether a full-string match against RFC 2396 was intended.
_ANYURI_REGEX = re.compile(
    r"(([a-zA-Z][0-9a-zA-Z+\-\.]*:)?/{0,2}[0-9a-zA-Z;" +
    r"/?:@&=+$\.\-_!~*'()%]+)?(#[0-9a-zA-Z;/?:@&=+$\.\-_!~*'()%]+)?")


def check_id(ID):
    """
    Returns `True` if *ID* is a valid XML ID.

    An ID is considered valid when the whole string consists of a letter
    or underscore followed by letters, digits, underscores, periods or
    hyphens.  The empty string is not valid.
    """
    return _ID_REGEX.fullmatch(ID) is not None


def fix_id(ID):
    """
    Given an arbitrary string, create one that can be used as an xml id.

    This is rather simplistic at the moment, since it just replaces
    non-valid characters with underscores.

    If the first character cannot start an ID, an underscore is prepended
    (the original first character is then sanitized like any other
    non-leading character).  The empty string is returned unchanged.
    """
    if not ID:
        return ''

    if _ID_BAD_FIRST_CHAR.match(ID[0]):
        # Prepend rather than replace, so e.g. a leading digit is preserved.
        ID = '_' + ID

    # The first character is now known to be valid; only the remainder can
    # still contain characters that need replacing.
    return ID[0] + _ID_BAD_REST_CHAR.sub('_', ID[1:])


# NOTE: not referenced by any function visible in this module (`check_token`
# uses its own pattern).  ``\l`` is not a valid `re` escape, so this string
# cannot be compiled as written; it is kept only so the module-level name
# continues to exist.
_token_regex = r"(?![\r\l\t ])[^\r\l\t]*(?![\r\l\t ])"


def check_token(token):
    """
    Returns `True` if *token* is a valid XML token, as defined by XML
    Schema Part 2.

    A token contains no carriage return, line feed or tab, has no leading
    or trailing space, and has no run of two or more spaces.  The empty
    string is a valid token.
    """
    return token == '' or _TOKEN_REGEX.fullmatch(token) is not None


def check_mime_content_type(content_type):
    """
    Returns `True` if *content_type* is a valid MIME content type
    (syntactically at least), as defined by RFC 2045.

    Only the bare ``type/subtype`` form is accepted; parameters such as
    ``; charset=utf-8`` make the string invalid.
    """
    return _MIME_CONTENT_TYPE_REGEX.fullmatch(content_type) is not None


def check_anyuri(uri):
    """
    Returns `True` if *uri* is a valid URI as defined in RFC 2396.

    In practice the regular expression used here accepts any string (see
    the note on ``_ANYURI_REGEX``), so the result is `False` only when
    `urllib.parse.urlparse` raises while parsing *uri*.
    """
    if _ANYURI_REGEX.match(uri) is None:
        return False
    try:
        urllib.parse.urlparse(uri)
    except Exception:
        # urlparse signals malformed input (e.g. an unbalanced IPv6
        # bracket) by raising; treat any such failure as "not a URI".
        return False
    return True