From 3ff49e1a446c84ee575521ef65c714a302970b7f Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Fri, 31 Jan 2020 20:25:47 -0800 Subject: [PATCH] Improve spinal performance with os.walk, sacrificing breadth first. Previously, os.path.isdir was one of the biggest time sinks in spinal. Switching from my custom code to os.walk, thanks to its use of scandir, saves a lot of time. For searching my 2TB drive with a hot cache, time shrank from 70s to 35s. However, os.walk doesn't support breadth first search, so that's gone unless I reimplement os.walk myself to support it. --- voussoirkit/spinal.py | 136 ++++++++++++++++++++++-------------------- 1 file changed, 71 insertions(+), 65 deletions(-) diff --git a/voussoirkit/spinal.py b/voussoirkit/spinal.py index 99f165f..f055c50 100644 --- a/voussoirkit/spinal.py +++ b/voussoirkit/spinal.py @@ -564,7 +564,6 @@ def walk_generator( *, callback_exclusion=None, callback_permission_denied=None, - depth_first=True, exclude_directories=None, exclude_filenames=None, recurse=True, @@ -612,10 +611,10 @@ def walk_generator( except I use Path objects with absolute paths for everything. ''' if not yield_directories and not yield_files: - raise ValueError('yield_directories and yield_files cannot both be False') + raise ValueError('yield_directories and yield_files cannot both be False.') if yield_style not in ['flat', 'nested']: - raise ValueError('Invalid yield_style %s. Either "flat" or "nested".' % repr(yield_style)) + raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.') if exclude_directories is None: exclude_directories = set() @@ -625,6 +624,9 @@ def walk_generator( callback_exclusion = callback_exclusion or do_nothing callback_permission_denied = callback_permission_denied or do_nothing + _callback_permission_denied = callback_permission_denied + def callback_permission_denied(error): + return _callback_permission_denied(error.filename, error) exclude_filenames = {normalize(f) for f in exclude_filenames} exclude_directories = {normalize(f) for f in exclude_directories} @@ -632,76 +634,80 @@ def walk_generator( path = pathclass.Path(path) path.correct_case() - # Considering full paths - if normalize(path.absolute_path) in exclude_directories: - callback_exclusion(path.absolute_path, 'directory') + exclude = ( + path.basename in exclude_directories or + path.absolute_path in exclude_directories + ) + if exclude: + callback_exclusion(path, 'directory') return - # Considering folder names - if normalize(path.basename) in exclude_directories: - callback_exclusion(path.absolute_path, 'directory') - return + def handle_exclusion(blacklist, basename, abspath, kind): + exclude = ( + os.path.normcase(basename) in blacklist or + os.path.normcase(abspath) in blacklist + ) + if exclude: + callback_exclusion(abspath, kind) + return 1 - directory_queue = collections.deque() - directory_queue.append(path) - - # This is a recursion-free workplace. - # Thank you for your cooperation. - while len(directory_queue) > 0: - current_location = directory_queue.popleft() - log.debug('listdir: %s', current_location.absolute_path) - try: - contents = os.listdir(current_location.absolute_path) - except PermissionError as exception: - callback_permission_denied(current_location, exception) - continue - log.debug('received %d items', len(contents)) - - if yield_style == 'flat' and yield_directories: - yield current_location + # In the following loops, I found joining the os.sep with fstrings to be + # 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7. + # Because we trust the values of current_location and the child names, + # we don't run the risk of producing bad values this way. + def walkstep_nested(current_location, child_dirs, child_files): directories = [] - files = [] - for base_name in contents: - absolute_name = os.path.join(current_location.absolute_path, base_name) - - if os.path.isdir(absolute_name): - exclude = ( - normalize(absolute_name) in exclude_directories or - normalize(base_name) in exclude_directories - ) - if exclude: - callback_exclusion(absolute_name, 'directory') - continue - - directory = pathclass.Path(absolute_name) - directories.append(directory) - - elif yield_style == 'flat' and not yield_files: + new_child_dirs = [] + for child_dir in child_dirs: + child_dir_abspath = f'{current_location}{os.sep}{child_dir}' + if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'): continue - else: - exclude = normalize(absolute_name) in exclude_filenames - exclude |= normalize(base_name) in exclude_filenames - if exclude: - callback_exclusion(absolute_name, 'file') + new_child_dirs.append(child_dir) + directories.append(pathclass.Path(child_dir_abspath)) + + # This will actually affect the results of the os.walk going forward! + child_dirs[:] = new_child_dirs + + files = [] + for child_file in child_files: + child_file_abspath = f'{current_location}{os.sep}{child_file}' + if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'): + continue + + files.append(pathclass.Path(child_file_abspath)) + + current_location = pathclass.Path(current_location) + yield (current_location, directories, files) + + def walkstep_flat(current_location, child_dirs, child_files): + if yield_directories: + yield pathclass.Path(current_location) + + for child_dir in child_dirs: + child_dir_abspath = f'{current_location}{os.sep}{child_dir}' + if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'): + continue + yield pathclass.Path(child_dir_abspath) + + if yield_files: + for child_file in child_files: + child_file_abspath = f'{current_location}{os.sep}{child_file}' + if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'): continue - fp = pathclass.Path(absolute_name) - if yield_style == 'flat': - yield fp - else: - files.append(fp) + yield pathclass.Path(child_file_abspath) - if yield_style == 'nested': - yield (current_location, directories, files) + walker = os.walk(path.absolute_path, onerror=callback_permission_denied) + if yield_style == 'flat': + for step in walker: + yield from walkstep_flat(*step) + if not recurse: + break - if not recurse: - break - - if depth_first: - # Extendleft causes them to get reversed, so flip it first. - directories.reverse() - directory_queue.extendleft(directories) - else: - directory_queue.extend(directories) + if yield_style == 'nested': + for step in walker: + yield from walkstep_nested(*step) + if not recurse: + break