From 9e0509ae170d356c7e9ed2a8415621da107d0c0a Mon Sep 17 00:00:00 2001
From: Ethan Dalool <git@voussoir.net>
Date: Wed, 5 May 2021 19:18:37 -0700
Subject: [PATCH] Improve expressionmatch docstrings.

---
 voussoirkit/expressionmatch.py | 88 ++++++++++++++++++++++++----------
 1 file changed, 64 insertions(+), 24 deletions(-)

diff --git a/voussoirkit/expressionmatch.py b/voussoirkit/expressionmatch.py
index d5e351f..41ed3f6 100644
--- a/voussoirkit/expressionmatch.py
+++ b/voussoirkit/expressionmatch.py
@@ -1,17 +1,39 @@
-ESCAPE_SEQUENCES = {
-    '\\': '\\',
-    '"': '"',
-}
+'''
+This module provides the ExpressionTree class, which parses a query expression
+like "a AND (b OR c)" and then evaluates whether an input satisfies the query.
+
+Basic usage:
+tree = expressionmatch.ExpressionTree.parse('a AND (b OR c)')
+tree.evaluate('a b')
+tree.evaluate('a c')
+tree.evaluate('b c')
+
+The available operators are:
+a AND b
+a OR b
+a XOR b
+NOT a
+
+where a and b can be single tokens or a parenthesized group of tokens.
+
+The operators must be capitalized as seen and can be enclosed in quotes if you
+need to literally match the word "AND", etc.
+
+If the tokens contain spaces, they must be enclosed in quotation marks:
+tree = expressionmatch.ExpressionTree.parse('"mark hamill" OR "harrison ford"')
+'''
+from voussoirkit import sentinel
 
 BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
 UNARY_OPERATORS = {'NOT'}
 PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
 OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS
 
-# Sentinel values used for breaking up the tokens, so we dont' have to use
-# strings '(' and ')' which can get confused with user input.
-PAREN_OPEN = object()
-PAREN_CLOSE = object()
+# These sentinels help the parser distinguish between parens used for token
+# grouping and parens that have been escaped by the user and should remain
+# as strings.
+PAREN_OPEN = sentinel.Sentinel('PAREN_OPEN')
+PAREN_CLOSE = sentinel.Sentinel('PAREN_CLOSE')
 
 DEFAULT_MATCH_FUNCTION = str.__contains__
 
@@ -51,6 +73,11 @@ class NoTokens(Exception):
 
 class ExpressionTree:
     def __init__(self, token, parent=None):
+        '''
+        This constructor is for each individual node of the tree.
+        End-users should probably call ExpressionTree.parse instead of this
+        constructor.
+        '''
         self.children = []
         self.parent = parent
         self.token = token
@@ -92,7 +119,10 @@ class ExpressionTree:
         return s
 
     @classmethod
-    def parse(cls, tokens, spaces=0):
+    def parse(cls, tokens):
+        '''
+        Create an ExpressionTree from the given query string or list of tokens.
+        '''
         if isinstance(tokens, str):
             tokens = tokenize(tokens)
 
@@ -100,13 +130,13 @@ class ExpressionTree:
             raise NoTokens()
 
         if isinstance(tokens[0], list):
-            current = cls.parse(tokens[0], spaces=spaces+1)
+            current = cls.parse(tokens[0])
         else:
             current = cls(token=tokens[0])
 
         for token in tokens[1:]:
             if isinstance(token, list):
-                new = cls.parse(token, spaces=spaces+1)
+                new = cls.parse(token)
             else:
                 new = cls(token=token)
 
@@ -223,6 +253,10 @@ class ExpressionTree:
     def is_leaf(self):
         return self.token not in OPERATORS
 
+    @property
+    def is_root(self):
+        return self.parent is None
+
     def map(self, function):
         '''
         Apply this function to all of the operands.
@@ -260,11 +294,15 @@ class ExpressionTree:
             if node.is_leaf:
                 yield node
 
-
 def implied_tokens(tokens):
     '''
-    1. If two operands are directly next to each other, or an operand is followed
-        by a unary operator, it is implied that there is an AND between them.
+    This function returns a new list of tokens which has all of the implied
+    tokens added explicitly and meaningless tokens removed, by the
+    following rules:
+
+    1. If two operands are directly next to each other, or an operand is
+        followed by a unary operator, it is implied that there is an AND
+        between them.
         '1 2' -> '1 AND 2'
         '1 NOT 2' -> '1 AND NOT 2'
 
@@ -433,10 +471,11 @@ def sublist_tokens(tokens, _from_index=0, depth=0):
 
 def tokenize(expression):
     '''
-    Break the string into a list of  tokens. Spaces are the delimiter unless
+    Break the string into a list of tokens. Spaces are the delimiter unless
     they are inside quotation marks.
 
-    Quotation marks and parentheses can be escaped by preceeding with a backslash '\\'
+    Quotation marks and parentheses can be escaped by preceeding with a
+    backslash '\\'.
 
     Opening and closing parentheses are put into their own token unless
     escaped / quoted.
@@ -498,7 +537,7 @@ if __name__ == '__main__':
         '[sci-fi] OR [pg-13]',
         '([sci-fi] OR [war]) AND [r]',
         '[r] XOR [sci-fi]',
-        '"mark hamill" "harrison ford"',
+        '"[mark hamill]" "[harrison ford]"',
     ]
     teststrings = {
         'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
@@ -506,12 +545,13 @@ if __name__ == '__main__':
         'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
         'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
     }
-    for token in tests:
-        print('start:', token)
-        token = tokenize(token)
-        print('implied:', token)
-        e = ExpressionTree.parse(token)
-        print('tree:', e)
+    for test in tests:
+        print('start:', test)
+        tokens = tokenize(test)
+        print('implied:', tokens)
+        etree = ExpressionTree.parse(tokens)
+        print('tree:', etree)
+        print(etree.diagram())
         for (name, teststring) in teststrings.items():
-            print('Matches', name, ':', e.evaluate(teststring))
+            print('Matches', name, ':', etree.evaluate(teststring))
         print()