Improve expressionmatch docstrings.

master
voussoir 2021-05-05 19:18:37 -07:00
parent af5ec0b52e
commit 9e0509ae17
No known key found for this signature in database
GPG Key ID: 5F7554F8C26DACCB
1 changed files with 64 additions and 24 deletions

View File

@ -1,17 +1,39 @@
ESCAPE_SEQUENCES = {
'\\': '\\',
'"': '"',
}
'''
This module provides the ExpressionTree class, which parses a query expression
like "a AND (b OR c)" and then evaluates whether an input satisfies the query.
Basic usage:
tree = expressionmatch.ExpressionTree.parse('a AND (b OR c)')
tree.evaluate('a b')
tree.evaluate('a c')
tree.evaluate('b c')
The available operators are:
a AND b
a OR b
a XOR b
NOT a
where a and b can be single tokens or a parenthesized group of tokens.
The operators must be capitalized as seen and can be enclosed in quotes if you
need to literally match the word "AND", etc.
If the tokens contain spaces, they must be enclosed in quotation marks:
tree = expressionmatch.ExpressionTree.parse('"mark hamill" OR "harrison ford"')
'''
from voussoirkit import sentinel
BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
UNARY_OPERATORS = {'NOT'}
PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS
# Sentinel values used for breaking up the tokens, so we dont' have to use
# strings '(' and ')' which can get confused with user input.
PAREN_OPEN = object()
PAREN_CLOSE = object()
# These sentinels help the parser distinguish between parens used for token
# grouping and parens that have been escaped by the user and should remain
# as strings.
PAREN_OPEN = sentinel.Sentinel('PAREN_OPEN')
PAREN_CLOSE = sentinel.Sentinel('PAREN_CLOSE')
DEFAULT_MATCH_FUNCTION = str.__contains__
@ -51,6 +73,11 @@ class NoTokens(Exception):
class ExpressionTree:
def __init__(self, token, parent=None):
'''
This constructor is for each individual node of the tree.
End-users should probably call ExpressionTree.parse instead of this
constructor.
'''
self.children = []
self.parent = parent
self.token = token
@ -92,7 +119,10 @@ class ExpressionTree:
return s
@classmethod
def parse(cls, tokens, spaces=0):
def parse(cls, tokens):
'''
Create an ExpressionTree from the given query string or list of tokens.
'''
if isinstance(tokens, str):
tokens = tokenize(tokens)
@ -100,13 +130,13 @@ class ExpressionTree:
raise NoTokens()
if isinstance(tokens[0], list):
current = cls.parse(tokens[0], spaces=spaces+1)
current = cls.parse(tokens[0])
else:
current = cls(token=tokens[0])
for token in tokens[1:]:
if isinstance(token, list):
new = cls.parse(token, spaces=spaces+1)
new = cls.parse(token)
else:
new = cls(token=token)
@ -223,6 +253,10 @@ class ExpressionTree:
def is_leaf(self):
return self.token not in OPERATORS
@property
def is_root(self):
return self.parent is None
def map(self, function):
'''
Apply this function to all of the operands.
@ -260,11 +294,15 @@ class ExpressionTree:
if node.is_leaf:
yield node
def implied_tokens(tokens):
'''
1. If two operands are directly next to each other, or an operand is followed
by a unary operator, it is implied that there is an AND between them.
This function returns a new list of tokens which has all of the implied
tokens added explicitly and meaningless tokens removed, by the
following rules:
1. If two operands are directly next to each other, or an operand is
followed by a unary operator, it is implied that there is an AND
between them.
'1 2' -> '1 AND 2'
'1 NOT 2' -> '1 AND NOT 2'
@ -433,10 +471,11 @@ def sublist_tokens(tokens, _from_index=0, depth=0):
def tokenize(expression):
'''
Break the string into a list of tokens. Spaces are the delimiter unless
Break the string into a list of tokens. Spaces are the delimiter unless
they are inside quotation marks.
Quotation marks and parentheses can be escaped by preceeding with a backslash '\\'
Quotation marks and parentheses can be escaped by preceeding with a
backslash '\\'.
Opening and closing parentheses are put into their own token unless
escaped / quoted.
@ -498,7 +537,7 @@ if __name__ == '__main__':
'[sci-fi] OR [pg-13]',
'([sci-fi] OR [war]) AND [r]',
'[r] XOR [sci-fi]',
'"mark hamill" "harrison ford"',
'"[mark hamill]" "[harrison ford]"',
]
teststrings = {
'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
@ -506,12 +545,13 @@ if __name__ == '__main__':
'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
}
for token in tests:
print('start:', token)
token = tokenize(token)
print('implied:', token)
e = ExpressionTree.parse(token)
print('tree:', e)
for test in tests:
print('start:', test)
tokens = tokenize(test)
print('implied:', tokens)
etree = ExpressionTree.parse(tokens)
print('tree:', etree)
print(etree.diagram())
for (name, teststring) in teststrings.items():
print('Matches', name, ':', e.evaluate(teststring))
print('Matches', name, ':', etree.evaluate(teststring))
print()