import time

ESCAPE_SEQUENCES = {
    '\\': '\\',
    '"': '"',
}

BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
UNARY_OPERATORS = {'NOT'}
PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS
# Sentinel values used for breaking up the tokens, so we don't have to use
# the strings '(' and ')', which could be confused with user input.
PAREN_OPEN = object()
PAREN_CLOSE = object()
DEFAULT_MATCH_FUNCTION = str.__contains__
MESSAGE_WRITE_YOUR_OWN_MATCHER = '''
The default match function is {function}.
Consider passing your own `match_function`, which accepts two
positional arguments:
1. The object being tested.
2. The Expression token, a string.
'''.strip()
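
# For reference, a custom match function satisfying that contract might look
# like this (an illustrative sketch, not part of this module):
#
#     def tag_match(text, token):
#         return token.lower() in text.lower()
#
#     tree = ExpressionTree.parse('[sci-fi] AND [r]')
#     tree.evaluate('[Sci-Fi] [R]', match_function=tag_match)  ->  True
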
def func_and(values):
    return all(values)

def func_or(values):
    return any(values)

def func_xor(values):
    values = list(values)
    return values.count(True) % 2 == 1

def func_not(value):
    value = list(value)
    if len(value) != 1:
        raise ValueError('NOT only takes 1 value')
    return not value[0]

OPERATOR_FUNCTIONS = {
    'AND': func_and,
    'OR': func_or,
    'XOR': func_xor,
    'NOT': func_not,
}

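# Each operator function receives an iterable of booleans, one per child of
# the operator's node. For example (illustrative):
#
#     OPERATOR_FUNCTIONS['AND']([True, False])       ->  False
#     OPERATOR_FUNCTIONS['XOR']([True, True, True])  ->  True (odd parity)
#     OPERATOR_FUNCTIONS['NOT']([True])              ->  False
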
class NoTokens(Exception):
    pass

class ExpressionTree:
    def __init__(self, token, parent=None):
        self.children = []
        self.parent = parent
        self.token = token

    def __str__(self):
        if self.token is None:
            return '""'

        if self.token not in OPERATORS:
            t = self.token
            t = t.replace('"', '\\"')
            t = t.replace('(', '\\(')
            t = t.replace(')', '\\)')
            if ' ' in t:
                t = '"%s"' % t
            return t

        if len(self.children) == 1:
            child = self.children[0]
            childstring = str(child)
            if child.token in OPERATORS:
                childstring = '(%s)' % childstring
                return '%s%s' % (self.token, childstring)
            return '%s %s' % (self.token, childstring)

        children = []
        for child in self.children:
            childstring = str(child)
            if child.token in OPERATORS:
                childstring = '(%s)' % childstring
            children.append(childstring)
        if len(children) == 1:
            return '%s %s' % (self.token, children[0])
        s = ' %s ' % self.token
        s = s.join(children)
        return s

    @classmethod
    def parse(cls, tokens, spaces=0):
        if isinstance(tokens, str):
            tokens = tokenize(tokens)

        if tokens == []:
            raise NoTokens()

        if isinstance(tokens[0], list):
            current = cls.parse(tokens[0], spaces=spaces+1)
        else:
            current = cls(token=tokens[0])

        for token in tokens[1:]:
            ##print(' '*spaces, 'cur', current, current.token)
            if isinstance(token, list):
                new = cls.parse(token, spaces=spaces+1)
            else:
                new = cls(token=token)
            ##print(' '*spaces, 'new', new)

            if current.token not in OPERATORS:
                if new.token in BINARY_OPERATORS:
                    if len(new.children) == 0:
                        new.children.append(current)
                        current.parent = new
                        current = new
                else:
                    raise Exception('Expected binary operator, got %s.' % new.token)

            elif current.token in BINARY_OPERATORS:
                if new.token in BINARY_OPERATORS:
                    if new.token == current.token:
                        for child in new.children:
                            child.parent = current
                        current.children.extend(new.children)
                    else:
                        if len(new.children) == 0:
                            new.children.append(current)
                            current.parent = new
                            current = new
                        else:
                            current.children.append(new)
                            new.parent = current

                elif new.token in UNARY_OPERATORS:
                    if len(new.children) == 0:
                        current.children.append(new)
                        new.parent = current
                        current = new
                    else:
                        current.children.append(new)
                        new.parent = current

                elif new.token not in OPERATORS:
                    if len(current.children) > 0:
                        current.children.append(new)
                        new.parent = current
                    else:
                        raise Exception('Expected current children > 0.')

            elif current.token in UNARY_OPERATORS:
                if len(current.children) == 0:
                    current.children.append(new)
                    new.parent = current
                    if current.parent is not None:
                        current = current.parent

                elif new.token in BINARY_OPERATORS:
                    if len(new.children) == 0:
                        new.children.append(current)
                        current.parent = new
                        current = new
                    else:
                        current.children.append(new)
                        new.parent = current
                        if current.parent is not None:
                            current = current.parent

                else:
                    raise Exception('Expected new to be my operand or parent binary.')

            ##print(' '*spaces, 'fin:', current.rootmost(), '\n')

        current = current.rootmost()
        ##print('---', current)
        return current

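    # An illustrative example of what parse produces (see also the test
    # expressions at the bottom of this file). AND binds tighter than OR,
    # per PRECEDENCE:
    #
    #     tree = ExpressionTree.parse('1 OR 2 AND 3')
    #     str(tree)  ->  '1 OR (2 AND 3)'
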
    def _evaluate(self, text, match_function=None):
        if self.token not in OPERATORS:
            if match_function is None:
                match_function = DEFAULT_MATCH_FUNCTION
            value = match_function(text, self.token)
            #print(self.token, value)
            return value
        operator_function = OPERATOR_FUNCTIONS[self.token]
        children = (child.evaluate(text, match_function=match_function) for child in self.children)
        return operator_function(children)

    def diagram(self):
        if self.token is None:
            return '""'

        t = self.token
        if ' ' in t:
            t = '"%s"' % t
        output = t
        indent = 1
        for child in self.children:
            child = child.diagram()
            for line in child.splitlines():
                output += (' ' * indent)
                output += line + '\n'
                indent = len(t) + 1
        output = output.strip()
        return output

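    # For the same example expression, diagram() renders one node per line,
    # with children indented beneath their operator (illustrative output):
    #
    #     ExpressionTree.parse('1 OR 2 AND 3').diagram()
    #
    #     OR 1
    #        AND 2
    #            3
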
    def evaluate(self, text, match_function=None):
        if match_function is None:
            match_function = DEFAULT_MATCH_FUNCTION
        try:
            return self._evaluate(text, match_function)
        except Exception as e:
            if match_function is DEFAULT_MATCH_FUNCTION:
                message = MESSAGE_WRITE_YOUR_OWN_MATCHER.format(function=DEFAULT_MATCH_FUNCTION)
                override = Exception(message)
                raise override from e
            raise e

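    # With no match_function given, evaluation falls back to substring
    # containment via str.__contains__ (illustrative):
    #
    #     tree = ExpressionTree.parse('"mark hamill" OR "harrison ford"')
    #     tree.evaluate('mark hamill, carrie fisher')  ->  True
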
    @property
    def is_leaf(self):
        return self.token not in OPERATORS

    def map(self, function):
        '''
        Apply this function to all of the operands.
        '''
        for node in self.walk_leaves():
            node.token = function(node.token)

    def prune(self):
        '''
        Remove any nodes where `token` is None.
        '''
        self.children = [child for child in self.children if child.token is not None]
        for child in self.children:
            child.prune()
        if self.token in OPERATORS and len(self.children) == 0:
            self.token = None
            if self.parent is not None:
                self.parent.children.remove(self)

    def rootmost(self):
        current = self
        while current.parent is not None:
            current = current.parent
        return current

    def walk(self):
        yield self
        for child in self.children:
            yield from child.walk()

    def walk_leaves(self):
        for node in self.walk():
            if node.is_leaf:
                yield node

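# A small sketch of map (illustrative): rewrite every operand in place while
# leaving the operators alone.
#
#     tree = ExpressionTree.parse('A AND (B OR C)')
#     tree.map(str.lower)
#     str(tree)  ->  'a AND (b OR c)'
#
# prune complements map: after a map sets unwanted tokens to None, prune
# removes those leaves, along with any operator node left with no children.
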
def implied_tokens(tokens):
    '''
    1. If two operands are directly next to each other, or an operand is
       followed by a unary operator, it is implied that there is an AND
       between them.
       '1 2' -> '1 AND 2'
       '1 NOT 2' -> '1 AND NOT 2'

    2. If an expression begins or ends with an invalid operator, remove it.
       'AND 2' -> '2'
       '2 AND' -> '2'

    3. If a parenthetical term contains only 1 item, the parentheses can be
       removed.
       '(a)' -> 'a'
       '(NOT a)' -> 'NOT a'
       '(a OR)' -> '(a)' (by rule 2) -> 'a'

    4. If two operators are next to each other, except for binary-unary,
       keep only the first.
       '1 OR AND 2' -> '1 OR 2'
       '1 NOT AND 2' -> '1 AND NOT AND 2' (by rule 1) -> '1 AND NOT 2'
       'NOT NOT 1' -> 'NOT 1'
       '1 AND NOT NOT 2' -> '1 AND NOT 2'
    '''
    final_tokens = []
    has_operand = False
    has_binary_operator = False
    has_unary_operator = False

    if len(tokens) == 1 and not isinstance(tokens[0], str):
        # [['A', 'AND', 'B']] -> ['A', 'AND', 'B']
        tokens = tokens[0]

    for token in tokens:
        skip_this = False
        while isinstance(token, (list, tuple)):
            if len(token) == 0:
                # Delete empty parentheses.
                skip_this = True
                break
            if len(token) == 1:
                # Take singular terms out of their parentheses.
                token = token[0]
            else:
                previous = token
                token = implied_tokens(token)
                if previous == token:
                    break
        if skip_this:
            continue

        #print('tk:', token, 'hu:', has_unary_operator, 'hb:', has_binary_operator, 'ho:', has_operand)
        if isinstance(token, str) and token in OPERATORS:
            this_binary = token in BINARY_OPERATORS
            this_unary = not this_binary

            # 'NOT AND' and 'AND AND' are malformed...
            if this_binary and (has_binary_operator or has_unary_operator):
                continue
            # ...'NOT NOT' is malformed...
            if this_unary and has_unary_operator:
                continue
            # ...but 'AND NOT' is okay.

            # 'AND test' is malformed.
            if this_binary and not has_operand:
                continue

            if this_unary and has_operand:
                final_tokens.append('AND')

            has_unary_operator = this_unary
            has_binary_operator = this_binary
            has_operand = False
        else:
            if has_operand:
                final_tokens.append('AND')
            has_unary_operator = False
            has_binary_operator = False
            has_operand = True

        final_tokens.append(token)

    if has_binary_operator or has_unary_operator:
        final_tokens.pop(-1)

    return final_tokens

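# Illustrative examples of the rules above:
#
#     implied_tokens(['1', '2'])           ->  ['1', 'AND', '2']   (rule 1)
#     implied_tokens(['AND', '2'])         ->  ['2']               (rule 2)
#     implied_tokens(['NOT', 'NOT', '1'])  ->  ['NOT', '1']        (rule 4)
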
def order_operations(tokens):
    for (index, token) in enumerate(tokens):
        if isinstance(token, list):
            tokens[index] = order_operations(token)

    if len(tokens) < 5:
        return tokens

    index = 0
    slice_start = None
    slice_end = None
    precedence_stack = []
    while index < len(tokens):
        #time.sleep(0.1)
        token = tokens[index]
        try:
            precedence = PRECEDENCE.index(token)
        except ValueError:
            precedence = None

        if precedence is None:
            index += 1
            continue

        precedence_stack.append(precedence)
        if token in UNARY_OPERATORS:
            slice_start = index
            slice_end = index + 2
        elif len(precedence_stack) > 1:
            if precedence_stack[-1] < precedence_stack[-2]:
                slice_start = index - 1
                slice_end = None
            elif precedence_stack[-2] < precedence_stack[-1]:
                slice_end = index

        #print(tokens, index, token, precedence_stack, slice_start, slice_end, sep=' || ')
        if slice_start is None or slice_end is None:
            index += 1
            continue

        tokens[slice_start:slice_end] = [tokens[slice_start:slice_end]]
        slice_start = None
        slice_end = None
        for x in range(2):
            if not precedence_stack:
                break
            delete = precedence_stack[-1]
            while precedence_stack and precedence_stack[-1] == delete:
                index -= 1
                precedence_stack.pop(-1)
        index += 1

    if slice_start is not None:
        slice_end = len(tokens)
        tokens[slice_start:slice_end] = [tokens[slice_start:slice_end]]

    return tokens

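# An illustrative example: AND outranks OR in PRECEDENCE, so the AND operands
# are wrapped into a sublist, which ExpressionTree.parse turns into a subtree.
#
#     order_operations(['1', 'OR', '2', 'AND', '3'])
#     ->  ['1', 'OR', ['2', 'AND', '3']]
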
def sublist_tokens(tokens, _from_index=0, depth=0):
    '''
    Given a list of tokens, replace parenthesized groups with actual sublists.
    The parentheses are the PAREN_OPEN / PAREN_CLOSE sentinels produced by
    `tokenize`, shown here as '(' and ')' for readability:

    ['1', 'AND', '(', '3', 'OR', '4', ')'] ->
    ['1', 'AND', ['3', 'OR', '4']]

    Unclosed parentheses are automatically closed at the end.
    '''
    final_tokens = []
    index = _from_index
    while index < len(tokens):
        token = tokens[index]
        #print(index, token)
        index += 1
        if token is PAREN_OPEN:
            (token, index) = sublist_tokens(tokens, _from_index=index, depth=depth+1)
        if token is PAREN_CLOSE:
            break
        final_tokens.append(token)
    if _from_index == 0:
        return final_tokens
    else:
        return (final_tokens, index)

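# An illustrative example, using the real sentinel objects:
#
#     sublist_tokens(['1', 'AND', PAREN_OPEN, '3', 'OR', '4', PAREN_CLOSE])
#     ->  ['1', 'AND', ['3', 'OR', '4']]
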
def tokenize(expression):
    '''
    Break the string into a list of tokens. Spaces are the delimiter unless
    they are inside quotation marks.

    Quotation marks and parentheses can be escaped by preceding them with a
    backslash '\\'. The backslash is consumed, so the escaped character
    appears literally in the resulting token.

    Opening and closing parentheses become the PAREN_OPEN / PAREN_CLOSE
    sentinels unless escaped / quoted, and sublist_tokens then converts them
    into nested lists.

    Extraneous closing parentheses are ignored completely.

    '1 AND(4 OR "5 6") OR \\(test\\)' ->
    ['1', 'AND', ['4', 'OR', '5 6'], 'OR', '(test)']
    '''
    current_word = []
    in_escape = False
    in_quotes = False
    paren_depth = 0
    tokens = []
    for character in expression:
        if in_escape:
            #character = ESCAPE_SEQUENCES.get(character, '\\'+character)
            in_escape = False

        elif character in {'(', ')'} and not in_quotes:
            if character == '(':
                sentinel = PAREN_OPEN
                paren_depth += 1
            elif character == ')':
                sentinel = PAREN_CLOSE
                paren_depth -= 1

            if paren_depth >= 0:
                tokens.append(''.join(current_word))
                tokens.append(sentinel)
                current_word.clear()
            else:
                # An extraneous closing paren. Reset the depth so that later,
                # properly balanced parentheses are still recognized.
                paren_depth = 0
            continue

        elif character == '\\':
            in_escape = True
            continue

        elif character == '"':
            in_quotes = not in_quotes
            continue

        elif character.isspace() and not in_quotes:
            tokens.append(''.join(current_word))
            current_word.clear()
            continue

        current_word.append(character)

    tokens.append(''.join(current_word))
    tokens = [w for w in tokens if w != '']
    tokens = sublist_tokens(tokens)
    tokens = implied_tokens(tokens)
    tokens = order_operations(tokens)
    return tokens

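# An illustrative end-to-end example. The parenthesized group comes back as a
# nested list because tokenize finishes with sublist_tokens, implied_tokens,
# and order_operations:
#
#     tokenize('1 AND(4 OR "5 6")')
#     ->  ['1', 'AND', ['4', 'OR', '5 6']]
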
if __name__ == '__main__':
    tests = [
        #'test you AND(1 OR "harrison ford") AND (where are you) AND pg',
        #'(you OR "AND ME")',
        #'(3 XOR 2 OR 4',
        #'1 NOT OR AND (2 OR (3 OR 4) OR (5 OR 6)))',
        #'3 OR (5 OR)',
        #'1 AND(4 OR "5 6")OR \\(test) 2',
        #'1 2 AND (3 OR 4)',
        #'AND 2',
        #'1 AND 2 AND ("3 7" OR 6)AND (4 OR 5)',
        #'NOT 1 AND NOT (2 OR 3)',
        #'1 AND 2 AND 3 AND 4',
        #'NOT 1 AND 2 OR 3 OR (5 AND 6)',
        #'5 OR 6 AND 7 OR 8',
        #'1 OR 2 AND 3 AND 4 OR 5 AND 6 OR 7 OR 8 AND 9',
        #'2 XOR 3 AND 4',
        #'1 OR (2 OR 3 AND 4)',
        #'NOT XOR 4 7'
        '[sci-fi] OR [pg-13]',
        '([sci-fi] OR [war]) AND [r]',
        '[r] XOR [sci-fi]',
        '"mark hamill" "harrison ford"',
    ]
    teststrings = {
        'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
        'Blade Runner': '[harrison ford] [ridley scott] [neo-noir] [dystopian] [sci-fi] [r]',
        'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
        'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]',
    }
    for test in tests:
        print('start:', test)
        tokens = tokenize(test)
        print('tokens:', tokens)
        e = ExpressionTree.parse(tokens)
        print('tree:', e)
        for (name, teststring) in teststrings.items():
            print('Matches', name, ':', e.evaluate(teststring))
        print()