Improve spinal performance with os.walk, sacrificing breadth first.

Previously, os.path.isdir was one of the biggest time sinks in spinal.
Switching from my custom code to os.walk, thanks to its use of scandir,
saves a lot of time. For searching my 2TB drive with a hot cache, time
shrank from 70s to 35s.
However, os.walk doesn't support breadth first search, so that's gone
unless I reimplement os.walk myself to support it.
This commit is contained in:
Ethan Dalool 2020-01-31 20:25:47 -08:00
parent 752c46512b
commit 3ff49e1a44

View file

@ -564,7 +564,6 @@ def walk_generator(
*,
callback_exclusion=None,
callback_permission_denied=None,
depth_first=True,
exclude_directories=None,
exclude_filenames=None,
recurse=True,
@ -612,10 +611,10 @@ def walk_generator(
except I use Path objects with absolute paths for everything.
'''
if not yield_directories and not yield_files:
raise ValueError('yield_directories and yield_files cannot both be False')
raise ValueError('yield_directories and yield_files cannot both be False.')
if yield_style not in ['flat', 'nested']:
raise ValueError('Invalid yield_style %s. Either "flat" or "nested".' % repr(yield_style))
raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.')
if exclude_directories is None:
exclude_directories = set()
@ -625,6 +624,9 @@ def walk_generator(
callback_exclusion = callback_exclusion or do_nothing
callback_permission_denied = callback_permission_denied or do_nothing
_callback_permission_denied = callback_permission_denied
def callback_permission_denied(error):
return _callback_permission_denied(error.filename, error)
exclude_filenames = {normalize(f) for f in exclude_filenames}
exclude_directories = {normalize(f) for f in exclude_directories}
@ -632,76 +634,80 @@ def walk_generator(
path = pathclass.Path(path)
path.correct_case()
# Considering full paths
if normalize(path.absolute_path) in exclude_directories:
callback_exclusion(path.absolute_path, 'directory')
exclude = (
path.basename in exclude_directories or
path.absolute_path in exclude_directories
)
if exclude:
callback_exclusion(path, 'directory')
return
# Considering folder names
if normalize(path.basename) in exclude_directories:
callback_exclusion(path.absolute_path, 'directory')
return
def handle_exclusion(blacklist, basename, abspath, kind):
exclude = (
os.path.normcase(basename) in blacklist or
os.path.normcase(abspath) in blacklist
)
if exclude:
callback_exclusion(abspath, kind)
return 1
directory_queue = collections.deque()
directory_queue.append(path)
# This is a recursion-free workplace.
# Thank you for your cooperation.
while len(directory_queue) > 0:
current_location = directory_queue.popleft()
log.debug('listdir: %s', current_location.absolute_path)
try:
contents = os.listdir(current_location.absolute_path)
except PermissionError as exception:
callback_permission_denied(current_location, exception)
continue
log.debug('received %d items', len(contents))
if yield_style == 'flat' and yield_directories:
yield current_location
# In the following loops, I found joining the os.sep with fstrings to be
# 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7.
# Because we trust the values of current_location and the child names,
# we don't run the risk of producing bad values this way.
def walkstep_nested(current_location, child_dirs, child_files):
directories = []
files = []
for base_name in contents:
absolute_name = os.path.join(current_location.absolute_path, base_name)
if os.path.isdir(absolute_name):
exclude = (
normalize(absolute_name) in exclude_directories or
normalize(base_name) in exclude_directories
)
if exclude:
callback_exclusion(absolute_name, 'directory')
continue
directory = pathclass.Path(absolute_name)
directories.append(directory)
elif yield_style == 'flat' and not yield_files:
new_child_dirs = []
for child_dir in child_dirs:
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
continue
else:
exclude = normalize(absolute_name) in exclude_filenames
exclude |= normalize(base_name) in exclude_filenames
if exclude:
callback_exclusion(absolute_name, 'file')
new_child_dirs.append(child_dir)
directories.append(pathclass.Path(child_dir_abspath))
# This will actually affect the results of the os.walk going forward!
child_dirs[:] = new_child_dirs
files = []
for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
continue
files.append(pathclass.Path(child_file_abspath))
current_location = pathclass.Path(current_location)
yield (current_location, directories, files)
def walkstep_flat(current_location, child_dirs, child_files):
if yield_directories:
yield pathclass.Path(current_location)
for child_dir in child_dirs:
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
continue
yield pathclass.Path(child_dir_abspath)
if yield_files:
for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
continue
fp = pathclass.Path(absolute_name)
if yield_style == 'flat':
yield fp
else:
files.append(fp)
yield pathclass.Path(child_file_abspath)
if yield_style == 'nested':
yield (current_location, directories, files)
walker = os.walk(path.absolute_path, onerror=callback_permission_denied)
if yield_style == 'flat':
for step in walker:
yield from walkstep_flat(*step)
if not recurse:
break
if not recurse:
break
if depth_first:
# Extendleft causes them to get reversed, so flip it first.
directories.reverse()
directory_queue.extendleft(directories)
else:
directory_queue.extend(directories)
if yield_style == 'nested':
for step in walker:
yield from walkstep_nested(*step)
if not recurse:
break