Improve expressionmatch docstrings.

master
voussoir 2021-05-05 19:18:37 -07:00
parent af5ec0b52e
commit 9e0509ae17
No known key found for this signature in database
GPG Key ID: 5F7554F8C26DACCB
1 changed files with 64 additions and 24 deletions

View File

@ -1,17 +1,39 @@
ESCAPE_SEQUENCES = { '''
'\\': '\\', This module provides the ExpressionTree class, which parses a query expression
'"': '"', like "a AND (b OR c)" and then evaluates whether an input satisfies the query.
}
Basic usage:
tree = expressionmatch.ExpressionTree.parse('a AND (b OR c)')
tree.evaluate('a b')
tree.evaluate('a c')
tree.evaluate('b c')
The available operators are:
a AND b
a OR b
a XOR b
NOT a
where a and b can be single tokens or a parenthesized group of tokens.
The operators must be capitalized as seen and can be enclosed in quotes if you
need to literally match the word "AND", etc.
If the tokens contain spaces, they must be enclosed in quotation marks:
tree = expressionmatch.ExpressionTree.parse('"mark hamill" OR "harrison ford"')
'''
from voussoirkit import sentinel
BINARY_OPERATORS = {'AND', 'OR', 'XOR'} BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
UNARY_OPERATORS = {'NOT'} UNARY_OPERATORS = {'NOT'}
PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR'] PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS
# Sentinel values used for breaking up the tokens, so we dont' have to use # These sentinels help the parser distinguish between parens used for token
# strings '(' and ')' which can get confused with user input. # grouping and parens that have been escaped by the user and should remain
PAREN_OPEN = object() # as strings.
PAREN_CLOSE = object() PAREN_OPEN = sentinel.Sentinel('PAREN_OPEN')
PAREN_CLOSE = sentinel.Sentinel('PAREN_CLOSE')
DEFAULT_MATCH_FUNCTION = str.__contains__ DEFAULT_MATCH_FUNCTION = str.__contains__
@ -51,6 +73,11 @@ class NoTokens(Exception):
class ExpressionTree: class ExpressionTree:
def __init__(self, token, parent=None): def __init__(self, token, parent=None):
'''
This constructor is for each individual node of the tree.
End-users should probably call ExpressionTree.parse instead of this
constructor.
'''
self.children = [] self.children = []
self.parent = parent self.parent = parent
self.token = token self.token = token
@ -92,7 +119,10 @@ class ExpressionTree:
return s return s
@classmethod @classmethod
def parse(cls, tokens, spaces=0): def parse(cls, tokens):
'''
Create an ExpressionTree from the given query string or list of tokens.
'''
if isinstance(tokens, str): if isinstance(tokens, str):
tokens = tokenize(tokens) tokens = tokenize(tokens)
@ -100,13 +130,13 @@ class ExpressionTree:
raise NoTokens() raise NoTokens()
if isinstance(tokens[0], list): if isinstance(tokens[0], list):
current = cls.parse(tokens[0], spaces=spaces+1) current = cls.parse(tokens[0])
else: else:
current = cls(token=tokens[0]) current = cls(token=tokens[0])
for token in tokens[1:]: for token in tokens[1:]:
if isinstance(token, list): if isinstance(token, list):
new = cls.parse(token, spaces=spaces+1) new = cls.parse(token)
else: else:
new = cls(token=token) new = cls(token=token)
@ -223,6 +253,10 @@ class ExpressionTree:
def is_leaf(self): def is_leaf(self):
return self.token not in OPERATORS return self.token not in OPERATORS
@property
def is_root(self):
return self.parent is None
def map(self, function): def map(self, function):
''' '''
Apply this function to all of the operands. Apply this function to all of the operands.
@ -260,11 +294,15 @@ class ExpressionTree:
if node.is_leaf: if node.is_leaf:
yield node yield node
def implied_tokens(tokens): def implied_tokens(tokens):
''' '''
1. If two operands are directly next to each other, or an operand is followed This function returns a new list of tokens which has all of the implied
by a unary operator, it is implied that there is an AND between them. tokens added explicitly and meaningless tokens removed, by the
following rules:
1. If two operands are directly next to each other, or an operand is
followed by a unary operator, it is implied that there is an AND
between them.
'1 2' -> '1 AND 2' '1 2' -> '1 AND 2'
'1 NOT 2' -> '1 AND NOT 2' '1 NOT 2' -> '1 AND NOT 2'
@ -433,10 +471,11 @@ def sublist_tokens(tokens, _from_index=0, depth=0):
def tokenize(expression): def tokenize(expression):
''' '''
Break the string into a list of tokens. Spaces are the delimiter unless Break the string into a list of tokens. Spaces are the delimiter unless
they are inside quotation marks. they are inside quotation marks.
Quotation marks and parentheses can be escaped by preceeding with a backslash '\\' Quotation marks and parentheses can be escaped by preceeding with a
backslash '\\'.
Opening and closing parentheses are put into their own token unless Opening and closing parentheses are put into their own token unless
escaped / quoted. escaped / quoted.
@ -498,7 +537,7 @@ if __name__ == '__main__':
'[sci-fi] OR [pg-13]', '[sci-fi] OR [pg-13]',
'([sci-fi] OR [war]) AND [r]', '([sci-fi] OR [war]) AND [r]',
'[r] XOR [sci-fi]', '[r] XOR [sci-fi]',
'"mark hamill" "harrison ford"', '"[mark hamill]" "[harrison ford]"',
] ]
teststrings = { teststrings = {
'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]', 'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
@ -506,12 +545,13 @@ if __name__ == '__main__':
'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]', 'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]' 'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
} }
for token in tests: for test in tests:
print('start:', token) print('start:', test)
token = tokenize(token) tokens = tokenize(test)
print('implied:', token) print('implied:', tokens)
e = ExpressionTree.parse(token) etree = ExpressionTree.parse(tokens)
print('tree:', e) print('tree:', etree)
print(etree.diagram())
for (name, teststring) in teststrings.items(): for (name, teststring) in teststrings.items():
print('Matches', name, ':', e.evaluate(teststring)) print('Matches', name, ':', etree.evaluate(teststring))
print() print()