Make better use of generators when searching file text.

This commit is contained in:
voussoir 2022-02-12 12:03:07 -08:00
parent cb226265a3
commit 389f22faff
No known key found for this signature in database
GPG key ID: 5F7554F8C26DACCB

View file

@ -50,22 +50,36 @@ def all_terms_match(search_text, terms, match_function):
) )
return matches return matches
def search_contents_generic(filepath, content_args): def is_iterable(something):
try: try:
text = filepath.read('r') iter(something)
return True
except TypeError:
return False
def search_contents_generic(filepath, content_args):
# We first test 1 MB of the file to see if it is text rather than binary.
try:
handle = filepath.open('r')
handle.read(2 ** 20)
except UnicodeDecodeError: except UnicodeDecodeError:
try: try:
text = filepath.read('r', encoding='utf-8') handle.close()
handle = filepath.open('r', encoding='utf-8')
handle.read(2 ** 20)
except UnicodeDecodeError: except UnicodeDecodeError:
#safeprint.safeprint(filepath.absolute_path) log.debug('%s could not be read with encoding=utf-8.', filepath)
#traceback.print_exc()
return return
except Exception: except Exception:
safeprint.safeprint(filepath.absolute_path) safeprint.safeprint(filepath.absolute_path)
traceback.print_exc() traceback.print_exc()
return return
content_args['text'] = text # We keep the lines as a generator instead of using readlines,
# which makes a list.
handle.seek(0)
lines = (line.rstrip('\r\n') for line in handle)
content_args['text'] = lines
content_args['line_numbers'] = True content_args['line_numbers'] = True
results = search(**content_args) results = search(**content_args)
@ -171,10 +185,12 @@ def search(
recurse=not local_only, recurse=not local_only,
yield_directories=True, yield_directories=True,
) )
elif isinstance(text, (list, tuple)) or inspect.isgenerator(text): elif isinstance(text, str):
search_objects = text.splitlines()
elif is_iterable(text):
search_objects = text search_objects = text
else: else:
search_objects = text.splitlines() raise TypeError(f'Don\'t know how to search text={text}')
for (index, search_object) in enumerate(search_objects): for (index, search_object) in enumerate(search_objects):
# if index % 10 == 0: # if index % 10 == 0: