# Licensed under a 3-clause BSD style license - see LICENSE.rst """ This file contains routines to verify the correctness of UCD strings. """ # STDLIB import re # LOCAL from astropy.utils import data __all__ = ['parse_ucd', 'check_ucd'] class UCDWords: """ Manages a list of acceptable UCD words. Works by reading in a data file exactly as provided by IVOA. This file resides in data/ucd1p-words.txt. """ def __init__(self): self._primary = set() self._secondary = set() self._descriptions = {} self._capitalization = {} with data.get_pkg_data_fileobj( "data/ucd1p-words.txt", encoding='ascii') as fd: for line in fd.readlines(): type, name, descr = [ x.strip() for x in line.split('|')] name_lower = name.lower() if type in 'QPEVC': self._primary.add(name_lower) if type in 'QSEVC': self._secondary.add(name_lower) self._descriptions[name_lower] = descr self._capitalization[name_lower] = name def is_primary(self, name): """ Returns True if *name* is a valid primary name. """ return name.lower() in self._primary def is_secondary(self, name): """ Returns True if *name* is a valid secondary name. """ return name.lower() in self._secondary def get_description(self, name): """ Returns the official English description of the given UCD *name*. """ return self._descriptions[name.lower()] def normalize_capitalization(self, name): """ Returns the standard capitalization form of the given name. """ return self._capitalization[name.lower()] _ucd_singleton = None def parse_ucd(ucd, check_controlled_vocabulary=False, has_colon=False): """ Parse the UCD into its component parts. Parameters ---------- ucd : str The UCD string check_controlled_vocabulary : bool, optional If `True`, then each word in the UCD will be verified against the UCD1+ controlled vocabulary, (as required by the VOTable specification version 1.2), otherwise not. has_colon : bool, optional If `True`, the UCD may contain a colon (as defined in earlier versions of the standard). Returns ------- parts : list The result is a list of tuples of the form: (*namespace*, *word*) If no namespace was explicitly specified, *namespace* will be returned as ``'ivoa'`` (i.e., the default namespace). Raises ------ ValueError if *ucd* is invalid """ global _ucd_singleton if _ucd_singleton is None: _ucd_singleton = UCDWords() if has_colon: m = re.search(r'[^A-Za-z0-9_.:;\-]', ucd) else: m = re.search(r'[^A-Za-z0-9_.;\-]', ucd) if m is not None: raise ValueError(f"UCD has invalid character '{m.group(0)}' in '{ucd}'") word_component_re = r'[A-Za-z0-9][A-Za-z0-9\-_]*' word_re = fr'{word_component_re}(\.{word_component_re})*' parts = ucd.split(';') words = [] for i, word in enumerate(parts): colon_count = word.count(':') if colon_count == 1: ns, word = word.split(':', 1) if not re.match(word_component_re, ns): raise ValueError(f"Invalid namespace '{ns}'") ns = ns.lower() elif colon_count > 1: raise ValueError(f"Too many colons in '{word}'") else: ns = 'ivoa' if not re.match(word_re, word): raise ValueError(f"Invalid word '{word}'") if ns == 'ivoa' and check_controlled_vocabulary: if i == 0: if not _ucd_singleton.is_primary(word): if _ucd_singleton.is_secondary(word): raise ValueError( f"Secondary word '{word}' is not valid as a primary word") else: raise ValueError(f"Unknown word '{word}'") else: if not _ucd_singleton.is_secondary(word): if _ucd_singleton.is_primary(word): raise ValueError( f"Primary word '{word}' is not valid as a secondary word") else: raise ValueError(f"Unknown word '{word}'") try: normalized_word = _ucd_singleton.normalize_capitalization(word) except KeyError: normalized_word = word words.append((ns, normalized_word)) return words def check_ucd(ucd, check_controlled_vocabulary=False, has_colon=False): """ Returns False if *ucd* is not a valid `unified content descriptor`_. Parameters ---------- ucd : str The UCD string check_controlled_vocabulary : bool, optional If `True`, then each word in the UCD will be verified against the UCD1+ controlled vocabulary, (as required by the VOTable specification version 1.2), otherwise not. has_colon : bool, optional If `True`, the UCD may contain a colon (as defined in earlier versions of the standard). Returns ------- valid : bool """ if ucd is None: return True try: parse_ucd(ucd, check_controlled_vocabulary=check_controlled_vocabulary, has_colon=has_colon) except ValueError: return False return True