Much better search

2017-05-06 18:29:42 -07:00 · 2017-05-06 18:29:42 -07:00 · 24dcbfb658
commit 24dcbfb658
parent ad6f4b0d01
1 changed files with 160 additions and 41 deletions
--- a/Toolbox/search.py
+++ b/Toolbox/search.py
@ -1,93 +1,212 @@
 import argparse
 import fnmatch
+import itertools
 import os
 import re
+import stat
 import sys
+import traceback

 from voussoirkit import clipext
 from voussoirkit import expressionmatch
+from voussoirkit import pathclass
 from voussoirkit import safeprint
 from voussoirkit import spinal

+# Thanks georg
+# http://stackoverflow.com/a/13443424
+STDIN_MODE = os.fstat(sys.stdin.fileno()).st_mode
+if stat.S_ISFIFO(STDIN_MODE):
+    STDIN_MODE = 'pipe'
+else:
+    STDIN_MODE = 'terminal'
+
+def all_terms_match(search_text, terms, match_function):
+    matches = (
+        (not terms['yes_all'] or all(match_function(search_text, term) for term in terms['yes_all'])) and
+        (not terms['yes_any'] or any(match_function(search_text, term) for term in terms['yes_any'])) and
+        (not terms['not_all'] or not all(match_function(search_text, term) for term in terms['not_all'])) and
+        (not terms['not_any'] or not any(match_function(search_text, term) for term in terms['not_any']))
+    )
+    return matches
+
 def search(
-        terms,
        *,
+        yes_all=None,
+        yes_any=None,
+        not_all=None,
+        not_any=None,
        case_sensitive=False,
+        content_args=None,
        do_expression=False,
        do_glob=False,
        do_regex=False,
-        inverse=False,
+        line_numbers=False,
        local_only=False,
-        match_any=False,
        text=None,
    ):
-    def term_matches(text, term):
+    terms = {
+        'yes_all': yes_all,
+        'yes_any': yes_any,
+        'not_all': not_all,
+        'not_any': not_any
+    }
+    terms = {k: (v or []) for (k, v) in terms.items()}
+    #print(terms, content_args)
+
+    if all(v == [] for v in terms.values()) and not content_args:
+        raise ValueError('No terms supplied')
+
+    def term_matches(line, term):
        if not case_sensitive:
-            text = text.lower()
+            line = line.lower()
+
+        if do_expression:
+            return term.evaluate(line)

        return (
-            (term in text) or
-            (do_regex and re.search(term, text)) or
-            (do_glob and fnmatch.fnmatch(text, term)) or
-            (do_expression and term.evaluate(text))
+            (term in line) or
+            (do_regex and re.search(term, line)) or
+            (do_glob and fnmatch.fnmatch(line, term))
        )

-    if not case_sensitive:
-        terms = [term.lower() for term in terms]
-
    if do_expression:
-        terms = ' '.join(terms)
-        terms = [expressionmatch.ExpressionTree.parse(terms)]
+        # The value still needs to be a list so the upcoming any() / all()
+        # receives an iterable as it expects. It just happens to be 1 tree.
+        trees = {}
+        for (key, value) in terms.items():
+            if value == []:
+                trees[key] = []
+                continue
+            tree = ' '.join(value)
+            tree = expressionmatch.ExpressionTree.parse(tree)
+            if not case_sensitive:
+                tree.map(str.lower)
+            trees[key] = [tree]
+        terms = trees

-    anyall = any if match_any else all
+    elif not case_sensitive:
+        terms = {k: [x.lower() for x in v] for (k, v) in terms.items()}

    if text is None:
-        walk = spinal.walk_generator(
+        search_objects = spinal.walk_generator(
            depth_first=False,
            recurse=not local_only,
            yield_directories=True,
        )
-        lines = ((filepath.basename, filepath.absolute_path) for filepath in walk)
    else:
-        lines = text.splitlines()
+        search_objects = text.splitlines()

-    for line in lines:
-        if isinstance(line, tuple):
-            (line, printout) = line
+    for (index, search_object) in enumerate(search_objects):
+        if isinstance(search_object, pathclass.Path):
+            search_text = search_object.basename
+            result_text = search_object.absolute_path
        else:
-            printout = line
-        matches = anyall(term_matches(line, term) for term in terms)
-        if matches ^ inverse:
-            safeprint.safeprint(printout)
+            search_text = search_object
+            result_text = search_object
+        if line_numbers:
+            result_text = '%d | %s' % (index+1, result_text)

+        if all_terms_match(search_text, terms, term_matches):
+            if not content_args:
+                yield result_text
+            else:
+                filepath = pathclass.Path(search_object)
+                if not filepath.is_file:
+                    continue
+                try:
+                    with open(filepath.absolute_path, 'r', encoding='utf-8') as handle:
+                        text = handle.read()
+                except:
+                    safeprint.safeprint(filepath.absolute_path)
+                    traceback.print_exc()
+                    continue
+
+                content_args['text'] = text
+                content_args['line_numbers'] = True
+                results = search(**content_args)
+                results = list(results)
+                if not results:
+                    continue
+
+                yield filepath.absolute_path
+                yield from results
+                yield ''
+
+def argparse_to_dict(args):
+    text = args.text
+    if text is not None:
+        text = clipext.resolve(text)
+    elif STDIN_MODE == 'pipe':
+        text = clipext.resolve('!i')
+
+    if hasattr(args, 'content_args') and args.content_args is not None:
+        content_args = argparse_to_dict(args.content_args)
+    else:
+        content_args = None
+
+    return {
+        'yes_all': args.yes_all,
+        'yes_any': args.yes_any,
+        'not_all': args.not_all,
+        'not_any': args.not_any,
+        'case_sensitive': args.case_sensitive,
+        'content_args': content_args,
+        'do_expression': args.do_expression,
+        'do_glob': args.do_glob,
+        'do_regex': args.do_regex,
+        'local_only': args.local_only,
+        'line_numbers': args.line_numbers,
+        'text': text,
+    }

 def search_argparse(args):
-    return search(
-        terms=args.search_terms,
-        case_sensitive=args.case_sensitive,
-        do_glob=args.do_glob,
-        do_regex=args.do_regex,
-        inverse=args.inverse,
-        local_only=args.local_only,
-        match_any=args.match_any,
-        text=args.text if args.text is None else clipext.resolve(args.text),
-    )
+    generator = search(**argparse_to_dict(args))
+    result_count = 0
+    for result in generator:
+        safeprint.safeprint(result)
+        result_count += 1
+    if args.show_count:
+        print('%d items.' % result_count)

 def main(argv):
    parser = argparse.ArgumentParser()

-    parser.add_argument('search_terms', nargs='+', default=None)
-    parser.add_argument('--any', dest='match_any', action='store_true')
+    # The padding is inserted to guarantee that --content is not the first
+    # argument. Because if it were, we wouldn't know if we have
+    # [pre, '--content'] or ['--content', post], etc. and I don't want to
+    # actually check the values.
+    argv.insert(0, 'padding')
+    grouper = itertools.groupby(argv, lambda x: x == '--content')
+    halves = [list(group) for (key, group) in grouper]
+    # halves looks like [pre, '--content', post]
+    name_args = halves[0]
+    # Pop the padding
+    name_args.pop(0)
+    content_args = [item for chunk in halves[2:] for item in chunk]
+
+    parser.add_argument('yes_all', nargs='*', default=None)
+    parser.add_argument('--all', dest='yes_all', nargs='+')
+    parser.add_argument('--any', dest='yes_any', nargs='+')
+    parser.add_argument('--not_all', dest='not_all', nargs='+')
+    parser.add_argument('--not_any', dest='not_any', nargs='+')
+
    parser.add_argument('--case', dest='case_sensitive', action='store_true')
-    parser.add_argument('--regex', dest='do_regex', action='store_true')
-    parser.add_argument('--glob', dest='do_glob', action='store_true')
+    parser.add_argument('--content', dest='do_content', action='store_true')
+    parser.add_argument('--count', dest='show_count', action='store_true')
    parser.add_argument('--expression', dest='do_expression', action='store_true')
+    parser.add_argument('--glob', dest='do_glob', action='store_true')
+    parser.add_argument('--line_numbers', dest='line_numbers', action='store_true')
    parser.add_argument('--local', dest='local_only', action='store_true')
-    parser.add_argument('--inverse', dest='inverse', action='store_true')
+    parser.add_argument('--regex', dest='do_regex', action='store_true')
    parser.add_argument('--text', dest='text', default=None)
    parser.set_defaults(func=search_argparse)

-    args = parser.parse_args(argv)
+    args = parser.parse_args(name_args)
+    if content_args:
+        args.content_args = parser.parse_args(content_args)
+    else:
+        args.content_args = None
    args.func(args)

 if __name__ == '__main__':