Improve expressionmatch docstrings.

2021-05-05 19:18:37 -07:00 · 2021-05-05 19:18:37 -07:00 · 9e0509ae17
commit 9e0509ae17
parent af5ec0b52e
1 changed files with 64 additions and 24 deletions
--- a/voussoirkit/expressionmatch.py
+++ b/voussoirkit/expressionmatch.py
@ -1,17 +1,39 @@
-ESCAPE_SEQUENCES = {
+'''
-    '\\': '\\',
+This module provides the ExpressionTree class, which parses a query expression
-    '"': '"',
+like "a AND (b OR c)" and then evaluates whether an input satisfies the query.
-}
+
 Basic usage:
 tree = expressionmatch.ExpressionTree.parse('a AND (b OR c)')
 tree.evaluate('a b')
 tree.evaluate('a c')
 tree.evaluate('b c')
 The available operators are:
 a AND b
 a OR b
 a XOR b
 NOT a
 where a and b can be single tokens or a parenthesized group of tokens.
 The operators must be capitalized as shown. Enclose an operator in quotes if
 you need to literally match the word "AND", etc.
 If the tokens contain spaces, they must be enclosed in quotation marks:
 tree = expressionmatch.ExpressionTree.parse('"mark hamill" OR "harrison ford"')
 '''
 from voussoirkit import sentinel
 BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
 UNARY_OPERATORS = {'NOT'}
 PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
 OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS
-# Sentinel values used for breaking up the tokens, so we dont' have to use
+# These sentinels help the parser distinguish between parens used for token
-# strings '(' and ')' which can get confused with user input.
+# grouping and parens that have been escaped by the user and should remain
-PAREN_OPEN = object()
+# as strings.
-PAREN_CLOSE = object()
+PAREN_OPEN = sentinel.Sentinel('PAREN_OPEN')
 PAREN_CLOSE = sentinel.Sentinel('PAREN_CLOSE')
 DEFAULT_MATCH_FUNCTION = str.__contains__
@ -51,6 +73,11 @@ class NoTokens(Exception):
 class ExpressionTree:
    def __init__(self, token, parent=None):
        '''
        This constructor is for each individual node of the tree.
        End-users should probably call ExpressionTree.parse instead of this
        constructor.
        '''
        self.children = []
        self.parent = parent
        self.token = token
@ -92,7 +119,10 @@ class ExpressionTree:
        return s
    @classmethod
-    def parse(cls, tokens, spaces=0):
+    def parse(cls, tokens):
        '''
        Create an ExpressionTree from the given query string or list of tokens.
        '''
        if isinstance(tokens, str):
            tokens = tokenize(tokens)
@ -100,13 +130,13 @@ class ExpressionTree:
            raise NoTokens()
        if isinstance(tokens[0], list):
-            current = cls.parse(tokens[0], spaces=spaces+1)
+            current = cls.parse(tokens[0])
        else:
            current = cls(token=tokens[0])
        for token in tokens[1:]:
            if isinstance(token, list):
-                new = cls.parse(token, spaces=spaces+1)
+                new = cls.parse(token)
            else:
                new = cls(token=token)
@ -223,6 +253,10 @@ class ExpressionTree:
    def is_leaf(self):
        return self.token not in OPERATORS
    @property
    def is_root(self):
        '''
        True if this node has no parent (self.parent is None), which is the
        default for a node constructed without a parent argument.
        '''
        return self.parent is None
    def map(self, function):
        '''
        Apply this function to all of the operands.
@ -260,11 +294,15 @@ class ExpressionTree:
            if node.is_leaf:
                yield node
 def implied_tokens(tokens):
    '''
-    1. If two operands are directly next to each other, or an operand is followed
+    This function returns a new list of tokens which has all of the implied
-        by a unary operator, it is implied that there is an AND between them.
+    tokens added explicitly and meaningless tokens removed, by the
    following rules:
    1. If two operands are directly next to each other, or an operand is
        followed by a unary operator, it is implied that there is an AND
        between them.
        '1 2' -> '1 AND 2'
        '1 NOT 2' -> '1 AND NOT 2'
@ -433,10 +471,11 @@ def sublist_tokens(tokens, _from_index=0, depth=0):
 def tokenize(expression):
    '''
-    Break the string into a list of  tokens. Spaces are the delimiter unless
+    Break the string into a list of tokens. Spaces are the delimiter unless
    they are inside quotation marks.
-    Quotation marks and parentheses can be escaped by preceeding with a backslash '\\'
+    Quotation marks and parentheses can be escaped by preceding with a
    backslash '\\'.
    Opening and closing parentheses are put into their own token unless
    escaped / quoted.
@ -498,7 +537,7 @@ if __name__ == '__main__':
        '[sci-fi] OR [pg-13]',
        '([sci-fi] OR [war]) AND [r]',
        '[r] XOR [sci-fi]',
-        '"mark hamill" "harrison ford"',
+        '"[mark hamill]" "[harrison ford]"',
    ]
    teststrings = {
        'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
@ -506,12 +545,13 @@ if __name__ == '__main__':
        'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
        'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
    }
-    for token in tests:
+    for test in tests:
-        print('start:', token)
+        print('start:', test)
-        token = tokenize(token)
+        tokens = tokenize(test)
-        print('implied:', token)
+        print('implied:', tokens)
-        e = ExpressionTree.parse(token)
+        etree = ExpressionTree.parse(tokens)
-        print('tree:', e)
+        print('tree:', etree)
        print(etree.diagram())
        for (name, teststring) in teststrings.items():
-            print('Matches', name, ':', e.evaluate(teststring))
+            print('Matches', name, ':', etree.evaluate(teststring))
        print()