Source code for match.regex
"""
This module provides functions for compiling a regular expression into a
non-deterministic finite automaton (NFA) and checking if that NFA matches
a given search string.
Supported Operators
===================
The operators supported by this module are outlined below:
* ``.`` - Concatenation. Note that this operator denotes *explicit*
concatenation (e.g. The regular expression "h.e.l.l.o" is required
in order to match the string "hello").
* ``|`` - A vertical bar represents alternation/union.
* ``?`` - Indicates an optional character (zero or one occurrences).
* ``+`` - The plus symbol indicates one or more occurrences of the
preceding character.
* ``*`` - The "Kleene star" indicates zero or more occurrences of the
preceding character.
"""
from .shunting_yard import shunt
from .states import (
EPSILON,
Fragment,
State
)
[docs]class InvalidRegexError(ValueError):
"""
Raised to indicate a string is not a valid regular expression,
and is therefore unable to be compiled into a NFA.
"""
# Make this exception always have (an overridable) default message
# Ref: illagrenan - https://stackoverflow.com/a/56967197
def __init__(self, msg='Invalid regular expression', *args):
super().__init__(msg, *args)
[docs]def compile_regex(regex: str) -> Fragment:
"""
Compiles a given regular expression (in infix notation) to a NFA
Fragment and returns it.
The compilation process works by first converting `regex` to postfix
notation and then scanning through the postfix expression while
maintaining a stack of computed NFA fragments. When a literal character
is read, a new NFA fragment representing that character is pushed onto
the stack. If an operator is read, fragments are popped off the stack
(one fragment for unary operators & two for binary operators) and then
a new fragment representing the result of the operation is pushed back
onto the stack. At the end, a single NFA :class:`Fragment` is returned
which represents the compiled regular expression.
**References & Further Info:**
* Cox, Russ - `Regular Expression Matching Can Be Simple And Fast -
Implementation: Compiling to NFA
<https://swtch.com/~rsc/regexp/regexp1.html#compiling>`_
:param regex: The regular expression to compile.
:return: A :class:`Fragment` that represents the compiled regular
expression.
:raises InvalidRegexError: If `regex` is determined to be an invalid infix
regular expression.
"""
# Turn the (infix) regular expression into postfix/RPN, since it's easier
# to turn a postfix expression into a NFA.
try:
postfix = shunt(regex)
postfix = list(postfix)[::-1]
except ValueError as err:
raise InvalidRegexError("Unbalanced parentheses") from err
nfa_stack = []
while postfix:
c = postfix.pop()
if c in ('.', '|'):
# If a binary operator is read, there should be at least two
# Fragments on the `nfa_stack`.
if len(nfa_stack) < 2:
raise InvalidRegexError(f"The `{c}` operator requires two operands")
# Pop two fragments off the stack
frag1 = nfa_stack.pop()
frag2 = nfa_stack.pop()
if c == '.':
# Point frag2's accept state at frag1's start state.
frag2.accept.edges.append(frag1.start)
# Make frag2's start state the new start state & frag1's
# accept state the new accept state.
start = frag2.start
accept = frag1.accept
else:
# Create new start & accept states, with the new start
# state pointing to the start state of each fragment.
accept = State()
start = State(edges=[frag2.start, frag1.start])
# Point both the old accept states at the new one.
frag2.accept.edges.append(accept)
frag1.accept.edges.append(accept)
elif c in ('*', '+', '?'):
# If a unary operator is read, there should be at least one
# Fragment on the `nfa_stack`.
if len(nfa_stack) < 1:
raise InvalidRegexError(f"The `{c}` operator requires an operand")
frag = nfa_stack.pop()
accept = State()
if c == '*':
# Create the new start state, pointing to both the old
# start state and the new accept state.
start = State(edges=[frag.start, accept])
# Point the old fragment's acceptor to both the new accept
# state, and the old start state.
frag.accept.edges = [frag.start, accept]
elif c == '+':
# The new start state points to the old start state.
start = State(edges=[frag.start])
# Point the old fragment's acceptor to both the new accept
# state, and the old start state.
frag.accept.edges = [frag.start, accept]
else:
# Create the new start state, which points to both the old
# start state and the new accept state.
start = State(edges=[frag.start, accept])
# Point the old fragment's acceptor to the new accept state.
frag.accept.edges = [accept]
else:
# For literal characters, make a new accept state, assign a label,
# and then point the start state to the accept state.
accept = State()
start = State(label=c, edges=[accept])
# Push the new Fragment to the NFA stack.
new_frag = Fragment(start, accept)
nfa_stack.append(new_frag)
# The stack should now only have one NFA Fragment on it.
return nfa_stack.pop()
[docs]def match(regex: str, s: str) -> bool:
"""
This function will return `True` if the regular expression `regex`
(fully) matches the string `s`, and `False` otherwise.
:param regex: The regular expression to match.
:param s: The string to check against the regular expression.
:return: `True` if the string `s` matches the regular expression, and
`False` otherwise.
:raises InvalidRegexError: If there is an error compiling the regular
expression.
:raises ValueError: If the regular expression is an empty string.
"""
if not regex:
raise ValueError("`regex` cannot be an empty string")
current = set() # The current set of visited states
# Compile the regular expression into an NFA
nfa = compile_regex(regex)
# Add the first state, and follow all arrows labelled by
# epsilon (the empty string).
_follow_eps(nfa.start, current)
for c in s:
previous = current # Keeps track of where we were
current = set() # Empty set for states about to be visited
for state in previous:
# Only follow arrows labeled by `c`, where c != EPSILON
if state.label == c and c is not EPSILON:
# Add the state at the end of the arrow to `current`
_follow_eps(state.edges[0], current)
return nfa.accept in current
def _follow_eps(state: State, current: set):
"""
Adds the given :class:`State` to a set, and then follows all of that
state's ``EPSILON`` labelled edges.
:param state: A state to follow.
:param current: A `set` of the currently visited states.
"""
# If `state` is already in `current` there is no need to follow
if state in current:
return
current.add(state)
if state.label is EPSILON:
# Loop through the states pointed to by this state & follow
# all their EPSILON-labelled edges too
for node in state.edges:
_follow_eps(node, current)