"tldextract helpers for testing and fetching remote resources." import logging import pkgutil import re import requests from requests_file import FileAdapter # type: ignore[import] LOG = logging.getLogger("tldextract") PUBLIC_SUFFIX_RE = re.compile(r"^(?P[.*!]*\w[\S]*)", re.UNICODE | re.MULTILINE) PUBLIC_PRIVATE_SUFFIX_SEPARATOR = "// ===BEGIN PRIVATE DOMAINS===" class SuffixListNotFound(LookupError): """A recoverable error while looking up a suffix list. Recoverable because you can specify backups, or use this library's bundled snapshot.""" def find_first_response(cache, urls, cache_fetch_timeout=None): """Decode the first successfully fetched URL, from UTF-8 encoding to Python unicode. """ with requests.Session() as session: session.mount("file://", FileAdapter()) for url in urls: try: return cache.cached_fetch_url( session=session, url=url, timeout=cache_fetch_timeout ) except requests.exceptions.RequestException: LOG.exception("Exception reading Public Suffix List url %s", url) raise SuffixListNotFound( "No Public Suffix List found. Consider using a mirror or constructing " "your TLDExtract with `suffix_list_urls=None`." ) def extract_tlds_from_suffix_list(suffix_list_text): """Parse the raw suffix list text for its different designations of suffixes.""" public_text, _, private_text = suffix_list_text.partition( PUBLIC_PRIVATE_SUFFIX_SEPARATOR ) public_tlds = [m.group("suffix") for m in PUBLIC_SUFFIX_RE.finditer(public_text)] private_tlds = [m.group("suffix") for m in PUBLIC_SUFFIX_RE.finditer(private_text)] return public_tlds, private_tlds def get_suffix_lists(cache, urls, cache_fetch_timeout, fallback_to_snapshot): """Fetch, parse, and cache the suffix lists""" return cache.run_and_cache( func=_get_suffix_lists, namespace="publicsuffix.org-tlds", kwargs={ "cache": cache, "urls": urls, "cache_fetch_timeout": cache_fetch_timeout, "fallback_to_snapshot": fallback_to_snapshot, }, hashed_argnames=["urls", "fallback_to_snapshot"], ) def _get_suffix_lists(cache, urls, cache_fetch_timeout, fallback_to_snapshot): """Fetch, parse, and cache the suffix lists""" try: text = find_first_response(cache, urls, cache_fetch_timeout=cache_fetch_timeout) except SuffixListNotFound as exc: if fallback_to_snapshot: text = pkgutil.get_data("tldextract", ".tld_set_snapshot") if not isinstance(text, str): text = str(text, "utf-8") else: raise exc public_tlds, private_tlds = extract_tlds_from_suffix_list(text) return public_tlds, private_tlds