'''
Boolean expression matching.

An expression such as `a AND (b OR "c d") AND NOT e` is split into tokens by
`tokenize`, turned into an `ExpressionTree` by `ExpressionTree.parse`, and
tested against an object with `ExpressionTree.evaluate`.
'''
import time

# Characters which, when preceded by a backslash, are replaced by the mapped
# value. Any other escaped character keeps its backslash.
ESCAPE_SEQUENCES = {
    '\\': '\\',
    '"': '"',
}

BINARY_OPERATORS = {'AND', 'OR', 'XOR'}
UNARY_OPERATORS = {'NOT'}
# Ordered from tightest binding to loosest binding; `order_operations`
# compares positions in this list.
PRECEDENCE = ['NOT', 'AND', 'XOR', 'OR']
OPERATORS = BINARY_OPERATORS | UNARY_OPERATORS

DEFAULT_MATCH_FUNCTION = str.__contains__

MESSAGE_WRITE_YOUR_OWN_MATCHER = '''
The default match function is {function}.
Consider passing your own `match_function`, which accepts two
positional arguments:
1. The object being tested.
2. The Expression token, a string.
'''.strip()


def func_and(values):
    '''
    Return True if every value is truthy.
    '''
    return all(values)


def func_or(values):
    '''
    Return True if at least one value is truthy.
    '''
    return any(values)


def func_xor(values):
    '''
    Return True if an odd number of the values are truthy.

    Values are judged by truthiness, the same way `func_and` and `func_or`
    judge them, so match functions that return truthy non-bool objects
    behave consistently across all operators.
    '''
    truthy_count = sum(1 for value in values if value)
    return truthy_count % 2 == 1


def func_not(value):
    '''
    Return the negation of the single item in the iterable `value`.

    Raises ValueError if the iterable does not contain exactly one item.
    '''
    value = list(value)
    if len(value) != 1:
        raise ValueError('NOT only takes 1 value')
    return not value[0]


OPERATOR_FUNCTIONS = {
    'AND': func_and,
    'OR': func_or,
    'XOR': func_xor,
    'NOT': func_not,
}


class NoTokens(Exception):
    '''
    Raised by `ExpressionTree.parse` when there is nothing to parse.
    '''
    pass


class ExpressionTree:
    '''
    A node in a boolean expression tree.

    `token` is either an operator name from OPERATORS, in which case
    `children` holds the operands, or any other value, in which case the node
    is a leaf that is handed to the match function during evaluation.
    '''

    def __init__(self, token, parent=None):
        self.children = []
        self.parent = parent
        self.token = token

    def __str__(self):
        if self.token is None:
            return '""'

        if self.token not in OPERATORS:
            # Leaf: escape quotation marks, and quote the token if it
            # contains a space so that it tokenizes as a single word.
            text = self.token.replace('"', '\\"')
            if ' ' in text:
                text = '"%s"' % text
            return text

        if len(self.children) == 1:
            child = self.children[0]
            childstring = str(child)
            if child.token in OPERATORS:
                return '%s(%s)' % (self.token, childstring)
            return '%s %s' % (self.token, childstring)

        # Operator children are parenthesized so grouping is unambiguous.
        childstrings = []
        for child in self.children:
            childstring = str(child)
            if child.token in OPERATORS:
                childstring = '(%s)' % childstring
            childstrings.append(childstring)

        separator = ' %s ' % self.token
        return separator.join(childstrings)

    @classmethod
    def parse(cls, tokens, spaces=0):
        '''
        Build an ExpressionTree from an expression string or a token list.

        tokens:
            Either a string, which is passed through `tokenize` first, or a
            list of tokens in which parenthesized groups are nested lists.
        spaces:
            Recursion depth. It does not affect the result; it is kept for
            compatibility with existing callers.

        Returns the root node of the tree.

        Raises NoTokens if there are no tokens, and Exception if the token
        sequence is malformed (for example an operand that is not followed
        by a bare binary operator).
        '''
        if isinstance(tokens, str):
            tokens = tokenize(tokens)

        if not tokens:
            raise NoTokens()

        if isinstance(tokens[0], list):
            current = cls.parse(tokens[0], spaces=spaces + 1)
        else:
            current = cls(token=tokens[0])

        for token in tokens[1:]:
            if isinstance(token, list):
                new = cls.parse(token, spaces=spaces + 1)
            else:
                new = cls(token=token)

            if current.token not in OPERATORS:
                # An operand may only be followed by a bare binary operator,
                # which adopts the operand as its first child. Anything else
                # would otherwise be dropped from the tree without notice.
                if new.token not in BINARY_OPERATORS or new.children:
                    raise Exception('Expected binary operator, got %s.' % new.token)
                new.children.append(current)
                current.parent = new
                current = new

            elif current.token in BINARY_OPERATORS:
                if new.token in BINARY_OPERATORS:
                    if new.token == current.token:
                        # Same operator: merge instead of nesting.
                        for child in new.children:
                            child.parent = current
                        current.children.extend(new.children)
                    elif not new.children:
                        # A different bare operator becomes the new parent.
                        new.children.append(current)
                        current.parent = new
                        current = new
                    else:
                        # A different, already-built subtree is an operand.
                        current.children.append(new)
                        new.parent = current

                elif new.token in UNARY_OPERATORS:
                    current.children.append(new)
                    new.parent = current
                    if not new.children:
                        # A bare NOT still needs its operand, so descend
                        # into it to receive the next token.
                        current = new

                else:
                    if not current.children:
                        raise Exception('Expected current children > 0.')
                    current.children.append(new)
                    new.parent = current

            elif current.token in UNARY_OPERATORS:
                if not current.children:
                    # This token is the operand of the unary operator.
                    # Afterwards, climb back to the operator which holds it.
                    current.children.append(new)
                    new.parent = current
                    if current.parent is not None:
                        current = current.parent
                elif new.token in BINARY_OPERATORS:
                    if not new.children:
                        new.children.append(current)
                        current.parent = new
                        current = new
                    else:
                        current.children.append(new)
                        new.parent = current
                        if current.parent is not None:
                            current = current.parent
                else:
                    raise Exception('Expected new to be my operand or parent binary.')

        return current.rootmost()

    def _evaluate(self, text, match_function=None):
        '''
        Evaluate this subtree against `text` without any error wrapping.
        '''
        if match_function is None:
            match_function = DEFAULT_MATCH_FUNCTION

        if self.token not in OPERATORS:
            return match_function(text, self.token)

        operator_function = OPERATOR_FUNCTIONS[self.token]
        # Recurse through _evaluate rather than evaluate so that an error is
        # wrapped once by the outermost evaluate call instead of once per
        # tree level. The generator lets AND / OR short-circuit.
        children = (
            child._evaluate(text, match_function=match_function)
            for child in self.children
        )
        return operator_function(children)

    def evaluate(self, text, match_function=None):
        '''
        Return the result of testing `text` against this expression.

        text:
            The object being tested. It is passed as the first argument to
            the match function.
        match_function:
            A callable taking (text, token) whose result is interpreted as
            whether the token matches. Defaults to DEFAULT_MATCH_FUNCTION.

        If the default match function raises, an Exception suggesting a
        custom match function is raised from the original error. Errors
        from a custom match function propagate unchanged.
        '''
        if match_function is None:
            match_function = DEFAULT_MATCH_FUNCTION

        try:
            return self._evaluate(text, match_function)
        except Exception as exc:
            if match_function is DEFAULT_MATCH_FUNCTION:
                message = MESSAGE_WRITE_YOUR_OWN_MATCHER.format(function=DEFAULT_MATCH_FUNCTION)
                raise Exception(message) from exc
            raise

    @property
    def is_leaf(self):
        return self.token not in OPERATORS

    def map(self, function):
        '''
        Apply this function to all of the operands.
        '''
        for node in self.walk_leaves():
            node.token = function(node.token)

    def prune(self):
        '''
        Remove any nodes where `token` is None.

        An operator which is left without children has its own token set to
        None and is removed from its parent as well.
        '''
        self.children = [child for child in self.children if child.token is not None]

        # Iterate over a snapshot: a child that ends up empty removes itself
        # from self.children, which would make a live iteration skip the
        # sibling that follows it.
        for child in list(self.children):
            child.prune()

        if self.token in OPERATORS and not self.children:
            self.token = None
            if self.parent is not None:
                self.parent.children.remove(self)

    def rootmost(self):
        '''
        Return the topmost ancestor of this node, or the node itself.
        '''
        current = self
        while current.parent is not None:
            current = current.parent
        return current

    def walk(self):
        '''
        Yield this node and then all of its descendants, depth first.
        '''
        yield self
        for child in self.children:
            yield from child.walk()

    def walk_leaves(self):
        '''
        Yield the nodes of this subtree whose token is not an operator.
        '''
        for node in self.walk():
            if node.is_leaf:
                yield node


def implied_tokens(tokens):
    '''
    1. If two operands are directly next to each other, or an operand is
       followed by a unary operator, it is implied that there is an AND
       between them.
       '1 2' -> '1 AND 2'
       '1 NOT 2' -> '1 AND NOT 2'

    2. If an expression begins or ends with an invalid operator, remove it.
       'AND 2' -> '2'
       '2 AND' -> '2'
       '1 NOT' -> '1'

    3. If a parenthetical term contains only 1 item, the parentheses can be
       removed.
       '(a)' -> 'a'
       '(NOT a)' -> 'NOT a'
       '(a OR)' -> '(a)' (by rule 2) -> 'a'

    4. If two operators are next to each other, except for binary-unary,
       keep only the first.
       '1 OR AND 2' -> '1 OR 2'
       '1 NOT AND 2' -> '1 AND NOT AND 2' (by rule 1) -> '1 AND NOT 2'
       'NOT NOT 1' -> 'NOT 1'
       '1 AND NOT NOT 2' -> '1 AND NOT 2'

    Returns a new list; nested groups are processed recursively.
    '''
    final_tokens = []
    has_operand = False
    has_binary_operator = False
    has_unary_operator = False

    if len(tokens) == 1 and isinstance(tokens[0], (list, tuple)):
        # [['A' 'AND' 'B']] -> ['A' 'AND' 'B']
        tokens = tokens[0]

    for token in tokens:
        skip_this = False
        while isinstance(token, (list, tuple)):
            if len(token) == 0:
                # Delete empty parentheses.
                skip_this = True
                break
            if len(token) == 1:
                # Take singular terms out of their parentheses.
                token = token[0]
            else:
                previous = token
                token = implied_tokens(token)
                if previous == token:
                    # Nothing left to simplify in this group.
                    break

        if skip_this:
            continue

        if isinstance(token, str) and token in OPERATORS:
            this_binary = token in BINARY_OPERATORS
            this_unary = not this_binary

            # 'NOT AND' and 'AND AND' are malformed...
            if this_binary and (has_binary_operator or has_unary_operator):
                continue
            # ...'NOT NOT' is malformed...
            if this_unary and has_unary_operator:
                continue
            # ...but AND NOT is okay.

            # 'AND test' is malformed
            if this_binary and not has_operand:
                continue

            if this_unary and has_operand:
                final_tokens.append('AND')

            has_unary_operator = this_unary
            has_binary_operator = this_binary
            has_operand = False

        else:
            if has_operand:
                final_tokens.append('AND')
            has_unary_operator = False
            has_binary_operator = False
            has_operand = True

        final_tokens.append(token)

    # Strip every dangling operator from the end. An expression such as
    # '1 NOT' expands to '1 AND NOT', so removing only the last token would
    # leave a trailing 'AND' behind.
    while (
        final_tokens
        and isinstance(final_tokens[-1], str)
        and final_tokens[-1] in OPERATORS
    ):
        final_tokens.pop(-1)

    return final_tokens


def order_operations(tokens):
    '''
    Wrap runs of tokens in sublists so that operators with tighter
    precedence (earlier in PRECEDENCE) are grouped before looser ones.

    ['5', 'OR', '6', 'AND', '7', 'OR', '8']
    -> ['5', 'OR', ['6', 'AND', '7'], 'OR', '8']

    The list is modified in place and also returned. Lists with fewer than
    5 tokens are returned as they are, after their sublists are processed.
    '''
    for (index, token) in enumerate(tokens):
        if isinstance(token, list):
            tokens[index] = order_operations(token)

    if len(tokens) < 5:
        return tokens

    index = 0
    slice_start = None
    slice_end = None
    precedence_stack = []
    while index < len(tokens):
        token = tokens[index]
        try:
            precedence = PRECEDENCE.index(token)
        except ValueError:
            precedence = None

        if precedence is None:
            # Operands and sublists do not affect grouping.
            index += 1
            continue

        precedence_stack.append(precedence)

        if token in UNARY_OPERATORS:
            # A unary operator is grouped with the token that follows it.
            slice_start = index
            slice_end = index + 2

        elif len(precedence_stack) > 1:
            if precedence_stack[-1] < precedence_stack[-2]:
                # Tighter than the previous operator: a group starts at the
                # operand just before this operator.
                slice_start = index - 1
                slice_end = None
            elif precedence_stack[-2] < precedence_stack[-1]:
                # Looser than the previous operator: any open group ends
                # just before this operator.
                slice_end = index

        if slice_start is None or slice_end is None:
            index += 1
            continue

        tokens[slice_start:slice_end] = [tokens[slice_start:slice_end]]
        slice_start = None
        slice_end = None

        # Pop the two most recent runs of equal precedence from the stack,
        # moving the index back by one for each popped entry, and then
        # advance by one to resume scanning in the shortened list.
        for x in range(2):
            if not precedence_stack:
                break

            delete = precedence_stack[-1]
            while precedence_stack and precedence_stack[-1] == delete:
                index -= 1
                precedence_stack.pop(-1)

        index += 1

    if slice_start is not None:
        # A group was still open when the tokens ran out.
        slice_end = len(tokens)
        tokens[slice_start:slice_end] = [tokens[slice_start:slice_end]]

    return tokens


def sublist_tokens(tokens, _from_index=0, depth=0):
    '''
    Given a list of tokens, replace parentheses with actual sublists.
    ['1', 'AND', '(', '3', 'OR', '4', ')'] ->
    ['1', 'AND', ['3', 'OR', '4']]

    Unclosed parentheses are automatically closed at the end.

    `_from_index` and `depth` are used by the recursion. The outermost call
    returns the list; recursive calls return a tuple of (sublist, index of
    the next unread token).
    '''
    final_tokens = []
    index = _from_index
    while index < len(tokens):
        token = tokens[index]
        index += 1
        if token == '(':
            (token, index) = sublist_tokens(tokens, _from_index=index, depth=depth + 1)
        if token == ')':
            break
        final_tokens.append(token)

    # Recursive calls always start after a '(' and so never start at 0.
    if _from_index == 0:
        return final_tokens
    else:
        return (final_tokens, index)


def tokenize(expression):
    '''
    Break the string into a list of tokens. Spaces are the delimiter unless
    they are inside quotation marks.

    Quotation marks and parentheses can be escaped by preceding with a
    backslash '\\'

    Opening and closing parentheses are put into their own token unless
    escaped / quoted.

    Extraneous closing parentheses are ignored completely.

    The raw split looks like this:
    '1 AND(4 OR "5 6") OR \\(test\\)' ->
    ['1', 'AND', '(', '4', 'OR', '5 6', ')', 'OR', '\\(test\\)']

    The returned list has additionally been passed through `sublist_tokens`,
    `implied_tokens`, and `order_operations`, so parenthesized groups are
    nested lists.
    '''
    current_word = []
    in_escape = False
    in_quotes = False
    paren_depth = 0
    tokens = []
    for character in expression:
        if in_escape:
            character = ESCAPE_SEQUENCES.get(character, '\\' + character)
            in_escape = False

        elif character in {'(', ')'} and not in_quotes:
            if character == '(':
                paren_depth += 1
            elif paren_depth == 0:
                # Extraneous closing parenthesis. The depth is not allowed
                # to go negative, otherwise the closing parenthesis of a
                # later, well-formed group would be ignored too.
                continue
            else:
                paren_depth -= 1

            tokens.append(''.join(current_word))
            tokens.append(character)
            current_word.clear()
            continue

        elif character == '\\':
            in_escape = True
            continue

        elif character == '"':
            in_quotes = not in_quotes
            continue

        elif character.isspace() and not in_quotes:
            tokens.append(''.join(current_word))
            current_word.clear()
            continue

        current_word.append(character)

    tokens.append(''.join(current_word))
    tokens = [word for word in tokens if word != '']
    tokens = sublist_tokens(tokens)
    tokens = implied_tokens(tokens)
    tokens = order_operations(tokens)
    return tokens


if __name__ == '__main__':
    tests = [
        #'test you AND(1 OR "harrison ford") AND (where are you) AND pg',
        #'(you OR "AND ME")',
        #'(3 XOR 2 OR 4',
        #'1 NOT OR AND (2 OR (3 OR 4) OR (5 OR 6)))',
        #'3 OR (5 OR)',
        #'1 AND(4 OR "5 6")OR \\(test) 2',
        #'1 2 AND (3 OR 4)',
        #'AND 2',
        #'1 AND 2 AND ("3 7" OR 6)AND (4 OR 5)',
        #'NOT 1 AND NOT (2 OR 3)',
        #'1 AND 2 AND 3 AND 4',
        #'NOT 1 AND 2 OR 3 OR (5 AND 6)',
        #'5 OR 6 AND 7 OR 8',
        #'1 OR 2 AND 3 AND 4 OR 5 AND 6 OR 7 OR 8 AND 9',
        #'2 XOR 3 AND 4',
        #'1 OR (2 OR 3 AND 4)',
        #'NOT XOR 4 7'
        '[sci-fi] OR [pg-13]',
        '([sci-fi] OR [war]) AND [r]',
        '[r] XOR [sci-fi]',
        '"mark hamill" "harrison ford"',
    ]
    teststrings = {
        'Star Wars': '[harrison ford] [george lucas] [sci-fi] [pg] [carrie fisher] [mark hamill] [space]',
        'Blade Runner': '[harrison ford] [ridley scott] [neo-noir] [dystopian] [sci-fi] [r]',
        'Indiana Jones': '[harrison ford] [steven spielberg] [adventure] [pg-13]',
        'Apocalypse Now': '[harrison ford] [francis coppola] [r] [war] [drama]'
    }
    for token in tests:
        print('start:', token)
        token = tokenize(token)
        print('implied:', token)
        e = ExpressionTree.parse(token)
        print('tree:', e)
        for (name, teststring) in teststrings.items():
            print('Matches', name, ':', e.evaluate(teststring))
        print()