Improve expressionmatch docstrings.

2021-05-05 19:18:37 -07:00 · 2021-05-05 19:18:37 -07:00 · 9e0509ae17
commit 9e0509ae17
parent af5ec0b52e
1 changed files with 64 additions and 24 deletions
--- a/voussoirkit/expressionmatch.py
+++ b/voussoirkit/expressionmatch.py
@ -1,17 +1,39 @@
-ESCAPE_SEQUENCES = {
-    '\\': '\\',
-    '"': '"',
-}
+'''
+This module provides the ExpressionTree class, which parses a query expression
+like "a AND (b OR c)" and then evaluates whether an input satisfies the query.
+
+Basic usage:
+tree = expressionmatch.ExpressionTree.parse('a AND (b OR c)')
+tree.evaluate('a b')
+tree.evaluate('a c')
+tree.evaluate('b c')
+
+The available operators are:
+a AND b
+a OR b
+a XOR b
+NOT a
+
+where a and b can be single tokens or a parenthesized group of tokens.
+
+The operators must be capitalized as seen and can be enclosed in quotes if you
+need to literally match the word "AND", etc.
+
+If the tokens contain spaces, they must be enclosed in quotation marks:
+tree = expressionmatch.ExpressionTree.parse('"mark hamill" OR "harrison ford"')
+'''
+from voussoirkit import sentinel

 BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
 UNARY_OPERATORS = {'NOT'}
 PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
 OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS

-# Sentinel values used for breaking up the tokens, so we dont' have to use
-# strings '(' and ')' which can get confused with user input.
-PAREN_OPEN = object()
-PAREN_CLOSE = object()
+# These sentinels help the parser distinguish between parens used for token
+# grouping and parens that have been escaped by the user and should remain
+# as strings.
+PAREN_OPEN = sentinel.Sentinel('PAREN_OPEN')
+PAREN_CLOSE = sentinel.Sentinel('PAREN_CLOSE')

 DEFAULT_MATCH_FUNCTION = str.__contains__

@ -51,6 +73,11 @@ class NoTokens(Exception):

 class ExpressionTree:
    def __init__(self, token, parent=None):
+        '''
+        This constructor is for each individual node of the tree.
+        End-users should probably call ExpressionTree.parse instead of this
+        constructor.
+        '''
        self.children = []
        self.parent = parent
        self.token = token
@ -92,7 +119,10 @@ class ExpressionTree:
        return s

    @classmethod
-    def parse(cls, tokens, spaces=0):
+    def parse(cls, tokens):
+        '''
+        Create an ExpressionTree from the given query string or list of tokens.
+        '''
        if isinstance(tokens, str):
            tokens = tokenize(tokens)

@ -100,13 +130,13 @@ class ExpressionTree:
            raise NoTokens()

        if isinstance(tokens[0], list):
-            current = cls.parse(tokens[0], spaces=spaces+1)
+            current = cls.parse(tokens[0])
        else:
            current = cls(token=tokens[0])

        for token in tokens[1:]:
            if isinstance(token, list):
-                new = cls.parse(token, spaces=spaces+1)
+                new = cls.parse(token)
            else:
                new = cls(token=token)

@ -223,6 +253,10 @@ class ExpressionTree:
    def is_leaf(self):
        return self.token not in OPERATORS

+    @property
+    def is_root(self):
+        return self.parent is None
+
    def map(self, function):
        '''
        Apply this function to all of the operands.
@ -260,11 +294,15 @@ class ExpressionTree:
            if node.is_leaf:
                yield node

-
 def implied_tokens(tokens):
    '''
-    1. If two operands are directly next to each other, or an operand is followed
-        by a unary operator, it is implied that there is an AND between them.
+    This function returns a new list of tokens which has all of the implied
+    tokens added explicitly and meaningless tokens removed, by the
+    following rules:
+
+    1. If two operands are directly next to each other, or an operand is
+        followed by a unary operator, it is implied that there is an AND
+        between them.
        '1 2' -> '1 AND 2'
        '1 NOT 2' -> '1 AND NOT 2'

@ -433,10 +471,11 @@ def sublist_tokens(tokens, _from_index=0, depth=0):

 def tokenize(expression):
    '''
-    Break the string into a list of  tokens. Spaces are the delimiter unless
+    Break the string into a list of tokens. Spaces are the delimiter unless
    they are inside quotation marks.

-    Quotation marks and parentheses can be escaped by preceeding with a backslash '\\'
+    Quotation marks and parentheses can be escaped by preceeding with a
+    backslash '\\'.

    Opening and closing parentheses are put into their own token unless
    escaped / quoted.
@ -498,7 +537,7 @@ if __name__ == '__main__':
        '[sci-fi] OR [pg-13]',
        '([sci-fi] OR [war]) AND [r]',
        '[r] XOR [sci-fi]',
-        '"mark hamill" "harrison ford"',
+        '"[mark hamill]" "[harrison ford]"',
    ]
    teststrings = {
        'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
@ -506,12 +545,13 @@ if __name__ == '__main__':
        'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
        'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
    }
-    for token in tests:
-        print('start:', token)
-        token = tokenize(token)
-        print('implied:', token)
-        e = ExpressionTree.parse(token)
-        print('tree:', e)
+    for test in tests:
+        print('start:', test)
+        tokens = tokenize(test)
+        print('implied:', tokens)
+        etree = ExpressionTree.parse(tokens)
+        print('tree:', etree)
+        print(etree.diagram())
        for (name, teststring) in teststrings.items():
-            print('Matches', name, ':', e.evaluate(teststring))
+            print('Matches', name, ':', etree.evaluate(teststring))
        print()