# cmd/search.py

import argparse
import itertools
import os
import re
import stat
import sys
import traceback
try:
    import winshell
except ImportError:
    winshell = None

from voussoirkit import clipext
from voussoirkit import expressionmatch
from voussoirkit import pathclass
from voussoirkit import safeprint
from voussoirkit import spinal
from voussoirkit import winglob

# Thanks georg
# http://stackoverflow.com/a/13443424
# Classify stdin once at import time: 'pipe' when stdin is a FIFO (something
# is being piped into us), otherwise 'terminal'.
try:
    STDIN_MODE = os.fstat(sys.stdin.fileno()).st_mode
except (AttributeError, OSError, ValueError):
    # sys.stdin can be None, or a replacement object with no real file
    # descriptor (io.UnsupportedOperation is both an OSError and a
    # ValueError). Nothing can be piped in that case, so treat it like a
    # terminal instead of failing the import.
    STDIN_MODE = None

if STDIN_MODE is not None and stat.S_ISFIFO(STDIN_MODE):
    STDIN_MODE = 'pipe'
else:
    STDIN_MODE = 'terminal'

class HeaderedText:
    '''
    A piece of text paired with a label.

    search() matches terms against `text` only, and reports hits using
    `with_header`, so the label itself never causes a match.
    '''
    def __init__(self, header, text):
        # header: the label shown in front of the text in results.
        # text: the value that actually gets searched.
        self.text = text
        self.header = header

    @property
    def with_header(self):
        '''
        The display form, "<header>: <text>".
        '''
        label = self.header
        body = self.text
        return f'{label}: {body}'


def all_terms_match(search_text, terms, match_function):
    '''
    Decide whether search_text satisfies all four term groups.

    terms is a dict with the keys 'yes_all', 'yes_any', 'not_all' and
    'not_any', each mapping to a collection of terms. An empty group imposes
    no restriction. match_function(search_text, term) is called for
    individual terms and its result is used for truthiness only.

    Returns True when:
    - every yes_all term matches,
    - at least one yes_any term matches,
    - the not_all terms do not all match,
    - none of the not_any terms match.
    '''
    def outcomes(group):
        # Lazy, so all() / any() stop calling match_function as soon as the
        # answer is known.
        return (match_function(search_text, term) for term in terms[group])

    if terms['yes_all'] and not all(outcomes('yes_all')):
        return False

    if terms['yes_any'] and not any(outcomes('yes_any')):
        return False

    if terms['not_all'] and all(outcomes('not_all')):
        return False

    if terms['not_any'] and any(outcomes('not_any')):
        return False

    return True

def search_contents_generic(filepath, content_args):
    '''
    Search the lines of a single text file and yield a result group.

    filepath: an object with an `absolute_path` attribute naming the file.
    content_args: dict of keyword arguments for search(). It is not modified;
        a copy is made with `text` set to the file's contents and
        `line_numbers` forced to True.

    Yields nothing when the file cannot be read or decoded, or when no line
    matches. Otherwise yields the file's absolute path, then each matching
    line as produced by search(), then an empty string as a separator.
    '''
    text = None
    # Try the platform default encoding first, then fall back to UTF-8.
    for encoding in (None, 'utf-8'):
        try:
            with open(filepath.absolute_path, 'r', encoding=encoding) as handle:
                text = handle.read()
        except UnicodeDecodeError:
            continue
        except Exception:
            # Any other failure (permissions, vanished file, ...) is reported
            # and the file is skipped, whichever attempt it happened on.
            safeprint.safeprint(filepath.absolute_path)
            traceback.print_exc()
            return
        else:
            break

    if text is None:
        # Neither encoding could decode the file. Skip it quietly.
        return

    # Build per-call kwargs instead of writing into the caller's dict, which
    # is shared across every file of the outer search. Otherwise the forced
    # line_numbers=True would leak into later searches using the same dict.
    search_kwargs = dict(content_args, text=text, line_numbers=True)

    results = list(search(**search_kwargs))
    if not results:
        return

    yield filepath.absolute_path
    yield from results
    yield ''

def search_contents_windows_lnk(filepath, content_args):
    '''
    Search the fields of a Windows .lnk shortcut and yield a result group.

    The shortcut's path, arguments, working directory and description are
    wrapped in HeaderedText so that only the field values are searched while
    the field names still appear in the results.

    filepath: an object with an `absolute_path` attribute naming the shortcut.
    content_args: dict of keyword arguments for search(). It is not modified;
        a copy is made with `text` set to the list of shortcut fields.

    Yields nothing when the shortcut cannot be opened or no field matches.
    Otherwise yields the file's absolute path, then each matching field as
    produced by search(), then an empty string as a separator.
    '''
    try:
        shortcut = winshell.Shortcut(filepath.absolute_path)
    except Exception:
        # Best effort: a shortcut that cannot be opened is skipped silently.
        return

    fields = [
        HeaderedText('Target', shortcut.path),
        HeaderedText('Arguments', shortcut.arguments),
        HeaderedText('Start In', shortcut.working_directory),
        HeaderedText('Comment', shortcut.description),
    ]

    # Copy rather than assign into content_args: the caller reuses that dict
    # for every file, and should not be left holding this shortcut's fields.
    search_kwargs = dict(content_args, text=fields)

    results = list(search(**search_kwargs))
    if not results:
        return

    yield filepath.absolute_path
    yield from results
    yield ''

def search(
        *,
        yes_all=None,
        yes_any=None,
        not_all=None,
        not_any=None,
        case_sensitive=False,
        content_args=None,
        do_expression=False,
        do_glob=False,
        do_regex=False,
        line_numbers=False,
        local_only=False,
        text=None,
    ):
    '''
    Generator that yields the items matching the given terms.

    yes_all, yes_any, not_all, not_any: a single string, a collection of
        strings, or None. See all_terms_match for how the groups combine.
    case_sensitive: when False, terms and searched text are compared
        case-insensitively.
    content_args: optional dict of keyword arguments for a second search()
        run over the contents of each file whose name matched. When given,
        the name terms may all be empty.
    do_expression: each non-empty term group is joined with spaces and parsed
        by expressionmatch.ExpressionTree.parse, then evaluated against the
        text.
    do_glob: also accept a term when winglob.fnmatch(text, term) succeeds.
    do_regex: also accept a term when re.search(term, text) succeeds.
        A plain substring hit still counts in glob and regex mode, because
        the substring test is tried first.
    line_numbers: prefix each yielded item with its 1-based position.
    local_only: passed to spinal.walk_generator as recurse=not local_only.
    text: None to search paths produced by spinal.walk_generator, a
        list / tuple of items to search as-is, or a string that is split
        into lines.

    pathclass.Path items are matched on their basename and reported by
    absolute path. HeaderedText items are matched on their text and reported
    with their header. Anything else is matched and reported as itself.

    Raises ValueError if no terms and no content_args were supplied. Because
    this is a generator, that happens when iteration starts.
    '''
    terms = {
        'yes_all': yes_all,
        'yes_any': yes_any,
        'not_all': not_all,
        'not_any': not_any
    }
    terms = {k: ([v] if isinstance(v, str) else v or []) for (k, v) in terms.items()}

    if all(v == [] for v in terms.values()) and not content_args:
        raise ValueError('No terms supplied')

    # Case-insensitive regex matching is done with a flag on the original
    # pattern. Lowercasing the pattern itself would change its meaning, for
    # example \D would become \d and \S would become \s.
    regex_flags = 0 if case_sensitive else re.IGNORECASE

    def term_matches(line, term):
        folded = line if case_sensitive else line.lower()

        if do_expression:
            return term.evaluate(folded)

        # needle is the (possibly lowercased) term for substring and glob
        # matching. pattern is the term exactly as the caller wrote it.
        (needle, pattern) = term
        return (
            (needle in folded) or
            (do_regex and re.search(pattern, line, regex_flags)) or
            (do_glob and winglob.fnmatch(folded, needle))
        )

    if do_expression:
        # The value still needs to be a list so the upcoming any() / all()
        # receives an iterable as it expects. It just happens to be 1 tree.
        trees = {}
        for (term_type, term_expression) in terms.items():
            if term_expression == []:
                trees[term_type] = []
                continue
            tree = ' '.join(term_expression)
            tree = expressionmatch.ExpressionTree.parse(tree)
            if not case_sensitive:
                tree.map(str.lower)
            trees[term_type] = [tree]
        terms = trees

    else:
        terms = {
            k: [(x if case_sensitive else x.lower(), x) for x in v]
            for (k, v) in terms.items()
        }

    if text is None:
        search_objects = spinal.walk_generator(
            recurse=not local_only,
            yield_directories=True,
        )
    elif isinstance(text, (list, tuple)):
        search_objects = text
    else:
        search_objects = text.splitlines()

    for (index, search_object) in enumerate(search_objects):
        if isinstance(search_object, pathclass.Path):
            search_text = search_object.basename
            result_text = search_object.absolute_path
        elif isinstance(search_object, HeaderedText):
            search_text = search_object.text
            result_text = search_object.with_header
        else:
            search_text = search_object
            result_text = search_object

        if not all_terms_match(search_text, terms, term_matches):
            continue

        if not content_args:
            # Only format the line number for items that are actually
            # yielded.
            if line_numbers:
                result_text = f'{index+1:>4} | {result_text}'
            yield result_text
            continue

        # Content mode: the item names a file whose contents get searched
        # with content_args.
        filepath = pathclass.Path(search_object)
        if not filepath.is_file:
            continue

        if filepath.extension == 'lnk' and winshell:
            yield from search_contents_windows_lnk(filepath, content_args)
        else:
            yield from search_contents_generic(filepath, content_args)

def argparse_to_dict(args):
    '''
    Convert a parsed argparse namespace into keyword arguments for search().

    The text to search is args.text passed through clipext.resolve when it
    was given. Otherwise, if stdin is a pipe, it is clipext.resolve('!i')
    (presumably the piped stdin contents -- confirm against clipext).
    Otherwise it is None.

    If the namespace carries a non-None `content_args` namespace, it is
    converted recursively and stored under the 'content_args' key.
    '''
    raw_text = args.text
    if raw_text is not None:
        resolved_text = clipext.resolve(raw_text)
    elif STDIN_MODE == 'pipe':
        resolved_text = clipext.resolve('!i')
    else:
        resolved_text = None

    # The nested namespaces do not themselves have a content_args attribute.
    nested_namespace = getattr(args, 'content_args', None)
    if nested_namespace is None:
        nested_kwargs = None
    else:
        nested_kwargs = argparse_to_dict(nested_namespace)

    kwargs = {}
    for name in ('yes_all', 'yes_any', 'not_all', 'not_any', 'case_sensitive'):
        kwargs[name] = getattr(args, name)
    kwargs['content_args'] = nested_kwargs
    for name in ('do_expression', 'do_glob', 'do_regex', 'local_only', 'line_numbers'):
        kwargs[name] = getattr(args, name)
    kwargs['text'] = resolved_text
    return kwargs

def search_argparse(args):
    '''
    Run a search described by the parsed arguments and print every result.

    When args.show_count is set, the number of printed results is reported
    at the end.
    '''
    search_kwargs = argparse_to_dict(args)

    # Stays 0 if the search yields nothing.
    result_count = 0
    for (result_count, result) in enumerate(search(**search_kwargs), start=1):
        safeprint.safeprint(result)

    if args.show_count:
        print('%d items.' % result_count)

def main(argv):
    '''
    Command-line entry point.

    argv: the argument strings, without the program name. It is not modified.

    Arguments before the first `--content` describe the name search. The
    arguments after it describe the search to run on the contents of each
    matching file. Both halves are parsed by the same parser.
    '''
    parser = argparse.ArgumentParser()

    # The padding is inserted to guarantee that --content is not the first
    # argument. Because if it were, we wouldn't know if we have
    # [pre, '--content'] or ['--content', post], etc. and I don't want to
    # actually check the values.
    # It goes onto a copy so the caller's list is left untouched.
    padded = ['padding', *argv]
    grouper = itertools.groupby(padded, lambda x: x == '--content')
    halves = [list(group) for (_is_content_flag, group) in grouper]
    # halves looks like [pre, '--content', post]
    # Slice off the padding.
    name_args = halves[0][1:]
    content_args = [item for chunk in halves[2:] for item in chunk]

    parser.add_argument('yes_all', nargs='*', default=None)
    parser.add_argument('--all', dest='yes_all', nargs='+')
    parser.add_argument('--any', dest='yes_any', nargs='+')
    parser.add_argument('--not_all', dest='not_all', nargs='+')
    parser.add_argument('--not_any', dest='not_any', nargs='+')

    parser.add_argument('--case', dest='case_sensitive', action='store_true')
    parser.add_argument('--content', dest='do_content', action='store_true')
    parser.add_argument('--count', dest='show_count', action='store_true')
    parser.add_argument('--expression', dest='do_expression', action='store_true')
    parser.add_argument('--glob', dest='do_glob', action='store_true')
    parser.add_argument('--line_numbers', dest='line_numbers', action='store_true')
    parser.add_argument('--local', dest='local_only', action='store_true')
    parser.add_argument('--regex', dest='do_regex', action='store_true')
    parser.add_argument('--text', dest='text', default=None)
    parser.set_defaults(func=search_argparse)

    args = parser.parse_args(name_args)
    if content_args:
        args.content_args = parser.parse_args(content_args)
    else:
        args.content_args = None
    args.func(args)

# Script entry point: run main on the command-line arguments (program name
# excluded) and exit with whatever it returns.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
Initial commit. 2019-06-12 05:41:31 +00:00			`import argparse`
			`import itertools`
			`import os`
			`import re`
			`import stat`
			`import sys`
			`import traceback`
Support searching of windows .lnk file fields. I'm currently migrating my shortcuts that point to Py 3.7 to 3.8 so this saves a lot of time! 2020-01-25 08:39:18 +00:00			`try:`
			`import winshell`
			`except ImportError:`
			`winshell = None`
Initial commit. 2019-06-12 05:41:31 +00:00
			`from voussoirkit import clipext`
			`from voussoirkit import expressionmatch`
			`from voussoirkit import pathclass`
			`from voussoirkit import safeprint`
			`from voussoirkit import spinal`
Use winglob.fnmatch instead of python fnmatch. 2020-01-29 01:43:47 +00:00			`from voussoirkit import winglob`
Initial commit. 2019-06-12 05:41:31 +00:00
			`# Thanks georg`
			`# http://stackoverflow.com/a/13443424`
			`STDIN_MODE = os.fstat(sys.stdin.fileno()).st_mode`
			`if stat.S_ISFIFO(STDIN_MODE):`
			`STDIN_MODE = 'pipe'`
			`else:`
			`STDIN_MODE = 'terminal'`

Improve lnk searching by separating field names from content. Previously, if you searched for the word target then every lnk matched because "Target: xxx" was literally the text being searched. Now we can use the existing separation between search_text and result_text to show those headers separately. 2020-01-25 09:18:03 +00:00			`class HeaderedText:`
			`def __init__(self, header, text):`
			`self.header = header`
			`self.text = text`

			`@property`
			`def with_header(self):`
			`return f'{self.header}: {self.text}'`


Initial commit. 2019-06-12 05:41:31 +00:00			`def all_terms_match(search_text, terms, match_function):`
			`matches = (`
			`(not terms['yes_all'] or all(match_function(search_text, term) for term in terms['yes_all'])) and`
			`(not terms['yes_any'] or any(match_function(search_text, term) for term in terms['yes_any'])) and`
			`(not terms['not_all'] or not all(match_function(search_text, term) for term in terms['not_all'])) and`
			`(not terms['not_any'] or not any(match_function(search_text, term) for term in terms['not_any']))`
			`)`
			`return matches`

Extract code to function search_contents_generic. 2020-01-25 08:36:50 +00:00			`def search_contents_generic(filepath, content_args):`
			`try:`
			`with open(filepath.absolute_path, 'r') as handle:`
			`text = handle.read()`
			`except UnicodeDecodeError:`
			`try:`
			`with open(filepath.absolute_path, 'r', encoding='utf-8') as handle:`
			`text = handle.read()`
			`except UnicodeDecodeError:`
			`#safeprint.safeprint(filepath.absolute_path)`
			`#traceback.print_exc()`
			`return`
			`except Exception:`
			`safeprint.safeprint(filepath.absolute_path)`
			`traceback.print_exc()`
			`return`

			`content_args['text'] = text`
			`content_args['line_numbers'] = True`

			`results = search(**content_args)`
			`results = list(results)`
			`if not results:`
			`return`

			`yield filepath.absolute_path`
			`yield from results`
			`yield ''`

Support searching of windows .lnk file fields. I'm currently migrating my shortcuts that point to Py 3.7 to 3.8 so this saves a lot of time! 2020-01-25 08:39:18 +00:00			`def search_contents_windows_lnk(filepath, content_args):`
			`try:`
			`shortcut = winshell.Shortcut(filepath.absolute_path)`
			`except Exception:`
			`return`

			`text = [`
Improve lnk searching by separating field names from content. Previously, if you searched for the word target then every lnk matched because "Target: xxx" was literally the text being searched. Now we can use the existing separation between search_text and result_text to show those headers separately. 2020-01-25 09:18:03 +00:00			`HeaderedText('Target', shortcut.path),`
Also search .lnk arguments. 2020-01-26 02:27:55 +00:00			`HeaderedText('Arguments', shortcut.arguments),`
Improve lnk searching by separating field names from content. Previously, if you searched for the word target then every lnk matched because "Target: xxx" was literally the text being searched. Now we can use the existing separation between search_text and result_text to show those headers separately. 2020-01-25 09:18:03 +00:00			`HeaderedText('Start In', shortcut.working_directory),`
			`HeaderedText('Comment', shortcut.description),`
Support searching of windows .lnk file fields. I'm currently migrating my shortcuts that point to Py 3.7 to 3.8 so this saves a lot of time! 2020-01-25 08:39:18 +00:00			`]`
			`content_args['text'] = text`

			`results = search(**content_args)`
			`results = list(results)`
			`if not results:`
			`return`

			`yield filepath.absolute_path`
			`yield from results`
			`yield ''`

Initial commit. 2019-06-12 05:41:31 +00:00			`def search(`
			`*,`
			`yes_all=None,`
			`yes_any=None,`
			`not_all=None,`
			`not_any=None,`
			`case_sensitive=False,`
			`content_args=None,`
			`do_expression=False,`
			`do_glob=False,`
			`do_regex=False,`
			`line_numbers=False,`
			`local_only=False,`
			`text=None,`
			`):`
			`terms = {`
			`'yes_all': yes_all,`
			`'yes_any': yes_any,`
			`'not_all': not_all,`
			`'not_any': not_any`
			`}`
			`terms = {k: ([v] if isinstance(v, str) else v or []) for (k, v) in terms.items()}`
			`#print(terms, content_args)`

			`if all(v == [] for v in terms.values()) and not content_args:`
			`raise ValueError('No terms supplied')`

			`def term_matches(line, term):`
			`if not case_sensitive:`
			`line = line.lower()`

			`if do_expression:`
			`return term.evaluate(line)`

			`return (`
			`(term in line) or`
			`(do_regex and re.search(term, line)) or`
Use winglob.fnmatch instead of python fnmatch. 2020-01-29 01:43:47 +00:00			`(do_glob and winglob.fnmatch(line, term))`
Initial commit. 2019-06-12 05:41:31 +00:00			`)`

			`if do_expression:`
			`# The value still needs to be a list so the upcoming any() / all()`
			`# receives an iterable as it expects. It just happens to be 1 tree.`
			`trees = {}`
Rename these key, value variables to give better context. 2020-01-25 09:08:50 +00:00			`for (term_type, term_expression) in terms.items():`
			`if term_expression == []:`
			`trees[term_type] = []`
Initial commit. 2019-06-12 05:41:31 +00:00			`continue`
Rename these key, value variables to give better context. 2020-01-25 09:08:50 +00:00			`tree = ' '.join(term_expression)`
Initial commit. 2019-06-12 05:41:31 +00:00			`tree = expressionmatch.ExpressionTree.parse(tree)`
			`if not case_sensitive:`
			`tree.map(str.lower)`
Rename these key, value variables to give better context. 2020-01-25 09:08:50 +00:00			`trees[term_type] = [tree]`
Initial commit. 2019-06-12 05:41:31 +00:00			`terms = trees`

			`elif not case_sensitive:`
			`terms = {k: [x.lower() for x in v] for (k, v) in terms.items()}`

			`if text is None:`
			`search_objects = spinal.walk_generator(`
			`recurse=not local_only,`
			`yield_directories=True,`
			`)`
Allow passing a pre-formed list as text. 2020-01-25 09:11:11 +00:00			`elif isinstance(text, (list, tuple)):`
			`search_objects = text`
Initial commit. 2019-06-12 05:41:31 +00:00			`else:`
			`search_objects = text.splitlines()`

			`for (index, search_object) in enumerate(search_objects):`
Comment this whole if and not just the inner part. 2020-01-25 08:52:39 +00:00			`# if index % 10 == 0:`
			`# print(index, end='\r', flush=True)`
Initial commit. 2019-06-12 05:41:31 +00:00			`if isinstance(search_object, pathclass.Path):`
			`search_text = search_object.basename`
			`result_text = search_object.absolute_path`
Improve lnk searching by separating field names from content. Previously, if you searched for the word target then every lnk matched because "Target: xxx" was literally the text being searched. Now we can use the existing separation between search_text and result_text to show those headers separately. 2020-01-25 09:18:03 +00:00			`elif isinstance(search_object, HeaderedText):`
			`search_text = search_object.text`
			`result_text = search_object.with_header`
Initial commit. 2019-06-12 05:41:31 +00:00			`else:`
			`search_text = search_object`
			`result_text = search_object`
Dedent even more. Thanks continue! 2020-01-25 08:58:06 +00:00
Initial commit. 2019-06-12 05:41:31 +00:00			`if line_numbers:`
Switch this formatter to fstring. 2020-01-25 08:48:57 +00:00			`result_text = f'{index+1:>4} \| {result_text}'`
Initial commit. 2019-06-12 05:41:31 +00:00
Dedent this code by reversing condition and continue. 2020-01-25 08:46:52 +00:00			`if not all_terms_match(search_text, terms, term_matches):`
			`continue`

			`if not content_args:`
			`yield result_text`
Dedent even more. Thanks continue! 2020-01-25 08:58:06 +00:00			`continue`
Dedent this code by reversing condition and continue. 2020-01-25 08:46:52 +00:00
Dedent even more. Thanks continue! 2020-01-25 08:58:06 +00:00			`filepath = pathclass.Path(search_object)`
			`if not filepath.is_file:`
			`continue`

Take advantage of pathclass.Extension comparison functionality. 2020-01-30 00:39:08 +00:00			`if filepath.extension == 'lnk' and winshell:`
Dedent even more. Thanks continue! 2020-01-25 08:58:06 +00:00			`yield from search_contents_windows_lnk(filepath, content_args)`
			`else:`
			`yield from search_contents_generic(filepath, content_args)`
Initial commit. 2019-06-12 05:41:31 +00:00
			`def argparse_to_dict(args):`
			`text = args.text`
			`if text is not None:`
			`text = clipext.resolve(text)`
			`elif STDIN_MODE == 'pipe':`
			`text = clipext.resolve('!i')`

			`if hasattr(args, 'content_args') and args.content_args is not None:`
			`content_args = argparse_to_dict(args.content_args)`
			`else:`
			`content_args = None`

			`return {`
			`'yes_all': args.yes_all,`
			`'yes_any': args.yes_any,`
			`'not_all': args.not_all,`
			`'not_any': args.not_any,`
			`'case_sensitive': args.case_sensitive,`
			`'content_args': content_args,`
			`'do_expression': args.do_expression,`
			`'do_glob': args.do_glob,`
			`'do_regex': args.do_regex,`
			`'local_only': args.local_only,`
			`'line_numbers': args.line_numbers,`
			`'text': text,`
			`}`

			`def search_argparse(args):`
			`generator = search(**argparse_to_dict(args))`
			`result_count = 0`
			`for result in generator:`
			`safeprint.safeprint(result)`
			`result_count += 1`
			`if args.show_count:`
			`print('%d items.' % result_count)`

			`def main(argv):`
			`parser = argparse.ArgumentParser()`

			`# The padding is inserted to guarantee that --content is not the first`
			`# argument. Because if it were, we wouldn't know if we have`
			`# [pre, '--content'] or ['--content', post], etc. and I don't want to`
			`# actually check the values.`
			`argv.insert(0, 'padding')`
			`grouper = itertools.groupby(argv, lambda x: x == '--content')`
			`halves = [list(group) for (key, group) in grouper]`
			`# halves looks like [pre, '--content', post]`
			`name_args = halves[0]`
			`# Pop the padding`
			`name_args.pop(0)`
			`content_args = [item for chunk in halves[2:] for item in chunk]`

			`parser.add_argument('yes_all', nargs='*', default=None)`
			`parser.add_argument('--all', dest='yes_all', nargs='+')`
			`parser.add_argument('--any', dest='yes_any', nargs='+')`
			`parser.add_argument('--not_all', dest='not_all', nargs='+')`
			`parser.add_argument('--not_any', dest='not_any', nargs='+')`

			`parser.add_argument('--case', dest='case_sensitive', action='store_true')`
			`parser.add_argument('--content', dest='do_content', action='store_true')`
			`parser.add_argument('--count', dest='show_count', action='store_true')`
			`parser.add_argument('--expression', dest='do_expression', action='store_true')`
			`parser.add_argument('--glob', dest='do_glob', action='store_true')`
			`parser.add_argument('--line_numbers', dest='line_numbers', action='store_true')`
			`parser.add_argument('--local', dest='local_only', action='store_true')`
			`parser.add_argument('--regex', dest='do_regex', action='store_true')`
			`parser.add_argument('--text', dest='text', default=None)`
			`parser.set_defaults(func=search_argparse)`

			`args = parser.parse_args(name_args)`
			`if content_args:`
			`args.content_args = parser.parse_args(content_args)`
			`else:`
			`args.content_args = None`
			`args.func(args)`

			`if __name__ == '__main__':`
			`raise SystemExit(main(sys.argv[1:]))`