Improve spinal performance with os.walk, sacrificing breadth-first search.

Previously, os.path.isdir was one of the biggest time sinks in spinal.
Switching from my custom code to os.walk saves a lot of time, thanks to
os.walk's use of scandir. For searching my 2TB drive with a hot cache, the
time shrank from 70s to 35s.
However, os.walk doesn't support breadth-first search, so that option (the
depth_first parameter) is gone unless I reimplement os.walk myself to support it.
master
Ethan Dalool 2020-01-31 20:25:47 -08:00
parent 752c46512b
commit 3ff49e1a44
1 changed file with 71 additions and 65 deletions

View File

@ -564,7 +564,6 @@ def walk_generator(
*, *,
callback_exclusion=None, callback_exclusion=None,
callback_permission_denied=None, callback_permission_denied=None,
depth_first=True,
exclude_directories=None, exclude_directories=None,
exclude_filenames=None, exclude_filenames=None,
recurse=True, recurse=True,
@ -612,10 +611,10 @@ def walk_generator(
except I use Path objects with absolute paths for everything. except I use Path objects with absolute paths for everything.
''' '''
if not yield_directories and not yield_files: if not yield_directories and not yield_files:
raise ValueError('yield_directories and yield_files cannot both be False') raise ValueError('yield_directories and yield_files cannot both be False.')
if yield_style not in ['flat', 'nested']: if yield_style not in ['flat', 'nested']:
raise ValueError('Invalid yield_style %s. Either "flat" or "nested".' % repr(yield_style)) raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.')
if exclude_directories is None: if exclude_directories is None:
exclude_directories = set() exclude_directories = set()
@ -625,6 +624,9 @@ def walk_generator(
callback_exclusion = callback_exclusion or do_nothing callback_exclusion = callback_exclusion or do_nothing
callback_permission_denied = callback_permission_denied or do_nothing callback_permission_denied = callback_permission_denied or do_nothing
_callback_permission_denied = callback_permission_denied
def callback_permission_denied(error):
return _callback_permission_denied(error.filename, error)
exclude_filenames = {normalize(f) for f in exclude_filenames} exclude_filenames = {normalize(f) for f in exclude_filenames}
exclude_directories = {normalize(f) for f in exclude_directories} exclude_directories = {normalize(f) for f in exclude_directories}
@ -632,76 +634,80 @@ def walk_generator(
path = pathclass.Path(path) path = pathclass.Path(path)
path.correct_case() path.correct_case()
# Considering full paths exclude = (
if normalize(path.absolute_path) in exclude_directories: path.basename in exclude_directories or
callback_exclusion(path.absolute_path, 'directory') path.absolute_path in exclude_directories
)
if exclude:
callback_exclusion(path, 'directory')
return return
# Considering folder names def handle_exclusion(blacklist, basename, abspath, kind):
if normalize(path.basename) in exclude_directories: exclude = (
callback_exclusion(path.absolute_path, 'directory') os.path.normcase(basename) in blacklist or
return os.path.normcase(abspath) in blacklist
)
if exclude:
callback_exclusion(abspath, kind)
return 1
directory_queue = collections.deque() # In the following loops, I found joining the os.sep with fstrings to be
directory_queue.append(path) # 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7.
# Because we trust the values of current_location and the child names,
# This is a recursion-free workplace. # we don't run the risk of producing bad values this way.
# Thank you for your cooperation.
while len(directory_queue) > 0:
current_location = directory_queue.popleft()
log.debug('listdir: %s', current_location.absolute_path)
try:
contents = os.listdir(current_location.absolute_path)
except PermissionError as exception:
callback_permission_denied(current_location, exception)
continue
log.debug('received %d items', len(contents))
if yield_style == 'flat' and yield_directories:
yield current_location
def walkstep_nested(current_location, child_dirs, child_files):
directories = [] directories = []
files = [] new_child_dirs = []
for base_name in contents: for child_dir in child_dirs:
absolute_name = os.path.join(current_location.absolute_path, base_name) child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
if os.path.isdir(absolute_name):
exclude = (
normalize(absolute_name) in exclude_directories or
normalize(base_name) in exclude_directories
)
if exclude:
callback_exclusion(absolute_name, 'directory')
continue
directory = pathclass.Path(absolute_name)
directories.append(directory)
elif yield_style == 'flat' and not yield_files:
continue continue
else: new_child_dirs.append(child_dir)
exclude = normalize(absolute_name) in exclude_filenames directories.append(pathclass.Path(child_dir_abspath))
exclude |= normalize(base_name) in exclude_filenames
if exclude: # This will actually affect the results of the os.walk going forward!
callback_exclusion(absolute_name, 'file') child_dirs[:] = new_child_dirs
files = []
for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
continue
files.append(pathclass.Path(child_file_abspath))
current_location = pathclass.Path(current_location)
yield (current_location, directories, files)
def walkstep_flat(current_location, child_dirs, child_files):
if yield_directories:
yield pathclass.Path(current_location)
for child_dir in child_dirs:
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
continue
yield pathclass.Path(child_dir_abspath)
if yield_files:
for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
continue continue
fp = pathclass.Path(absolute_name) yield pathclass.Path(child_file_abspath)
if yield_style == 'flat':
yield fp
else:
files.append(fp)
if yield_style == 'nested': walker = os.walk(path.absolute_path, onerror=callback_permission_denied)
yield (current_location, directories, files) if yield_style == 'flat':
for step in walker:
yield from walkstep_flat(*step)
if not recurse:
break
if not recurse: if yield_style == 'nested':
break for step in walker:
yield from walkstep_nested(*step)
if depth_first: if not recurse:
# Extendleft causes them to get reversed, so flip it first. break
directories.reverse()
directory_queue.extendleft(directories)
else:
directory_queue.extend(directories)