""" Parser for parsing a regular expression. Take a string representing a regular expression and return the root node of its parse tree. usage:: root_node = parse_regex('(hello|world)') Remarks: - The regex parser processes multiline, it ignores all whitespace and supports multiple named groups with the same name and #-style comments. Limitations: - Lookahead is not supported. """ import re from typing import List, Optional __all__ = [ "Repeat", "Variable", "Regex", "Lookahead", "tokenize_regex", "parse_regex", ] class Node: """ Base class for all the grammar nodes. (You don't initialize this one.) """ def __add__(self, other_node: "Node") -> "NodeSequence": return NodeSequence([self, other_node]) def __or__(self, other_node: "Node") -> "AnyNode": return AnyNode([self, other_node]) class AnyNode(Node): """ Union operation (OR operation) between several grammars. You don't initialize this yourself, but it's a result of a "Grammar1 | Grammar2" operation. """ def __init__(self, children: List[Node]) -> None: self.children = children def __or__(self, other_node: Node) -> "AnyNode": return AnyNode(self.children + [other_node]) def __repr__(self) -> str: return "%s(%r)" % (self.__class__.__name__, self.children) class NodeSequence(Node): """ Concatenation operation of several grammars. You don't initialize this yourself, but it's a result of a "Grammar1 + Grammar2" operation. """ def __init__(self, children: List[Node]) -> None: self.children = children def __add__(self, other_node: Node) -> "NodeSequence": return NodeSequence(self.children + [other_node]) def __repr__(self) -> str: return "%s(%r)" % (self.__class__.__name__, self.children) class Regex(Node): """ Regular expression. """ def __init__(self, regex: str) -> None: re.compile(regex) # Validate self.regex = regex def __repr__(self) -> str: return "%s(/%s/)" % (self.__class__.__name__, self.regex) class Lookahead(Node): """ Lookahead expression. """ def __init__(self, childnode: Node, negative: bool = False) -> None: self.childnode = childnode self.negative = negative def __repr__(self) -> str: return "%s(%r)" % (self.__class__.__name__, self.childnode) class Variable(Node): """ Mark a variable in the regular grammar. This will be translated into a named group. Each variable can have his own completer, validator, etc.. :param childnode: The grammar which is wrapped inside this variable. :param varname: String. """ def __init__(self, childnode: Node, varname: str = "") -> None: self.childnode = childnode self.varname = varname def __repr__(self) -> str: return "%s(childnode=%r, varname=%r)" % ( self.__class__.__name__, self.childnode, self.varname, ) class Repeat(Node): def __init__( self, childnode: Node, min_repeat: int = 0, max_repeat: Optional[int] = None, greedy: bool = True, ) -> None: self.childnode = childnode self.min_repeat = min_repeat self.max_repeat = max_repeat self.greedy = greedy def __repr__(self) -> str: return "%s(childnode=%r)" % (self.__class__.__name__, self.childnode) def tokenize_regex(input: str) -> List[str]: """ Takes a string, representing a regular expression as input, and tokenizes it. :param input: string, representing a regular expression. :returns: List of tokens. """ # Regular expression for tokenizing other regular expressions. p = re.compile( r"""^( \(\?P\<[a-zA-Z0-9_-]+\> | # Start of named group. \(\?#[^)]*\) | # Comment \(\?= | # Start of lookahead assertion \(\?! | # Start of negative lookahead assertion \(\?<= | # If preceded by. \(\?< | # If not preceded by. \(?: | # Start of group. (non capturing.) \( | # Start of group. \(?[iLmsux] | # Flags. \(?P=[a-zA-Z]+\) | # Back reference to named group \) | # End of group. \{[^{}]*\} | # Repetition \*\? | \+\? | \?\?\ | # Non greedy repetition. \* | \+ | \? | # Repetition \#.*\n | # Comment \\. | # Character group. \[ ( [^\]\\] | \\.)* \] | [^(){}] | . )""", re.VERBOSE, ) tokens = [] while input: m = p.match(input) if m: token, input = input[: m.end()], input[m.end() :] if not token.isspace(): tokens.append(token) else: raise Exception("Could not tokenize input regex.") return tokens def parse_regex(regex_tokens: List[str]) -> Node: """ Takes a list of tokens from the tokenizer, and returns a parse tree. """ # We add a closing brace because that represents the final pop of the stack. tokens: List[str] = [")"] + regex_tokens[::-1] def wrap(lst: List[Node]) -> Node: """Turn list into sequence when it contains several items.""" if len(lst) == 1: return lst[0] else: return NodeSequence(lst) def _parse() -> Node: or_list: List[List[Node]] = [] result: List[Node] = [] def wrapped_result() -> Node: if or_list == []: return wrap(result) else: or_list.append(result) return AnyNode([wrap(i) for i in or_list]) while tokens: t = tokens.pop() if t.startswith("(?P<"): variable = Variable(_parse(), varname=t[4:-1]) result.append(variable) elif t in ("*", "*?"): greedy = t == "*" result[-1] = Repeat(result[-1], greedy=greedy) elif t in ("+", "+?"): greedy = t == "+" result[-1] = Repeat(result[-1], min_repeat=1, greedy=greedy) elif t in ("?", "??"): if result == []: raise Exception("Nothing to repeat." + repr(tokens)) else: greedy = t == "?" result[-1] = Repeat( result[-1], min_repeat=0, max_repeat=1, greedy=greedy ) elif t == "|": or_list.append(result) result = [] elif t in ("(", "(?:"): result.append(_parse()) elif t == "(?!": result.append(Lookahead(_parse(), negative=True)) elif t == "(?=": result.append(Lookahead(_parse(), negative=False)) elif t == ")": return wrapped_result() elif t.startswith("#"): pass elif t.startswith("{"): # TODO: implement! raise Exception("{}-style repetition not yet supported".format(t)) elif t.startswith("(?"): raise Exception("%r not supported" % t) elif t.isspace(): pass else: result.append(Regex(t)) raise Exception("Expecting ')' token") result = _parse() if len(tokens) != 0: raise Exception("Unmatched parentheses.") else: return result