Improve spinal.walk with glob_ arguments.

This saves the caller from having to filter the results, and saves
us from instantiating Path objects that we don't want. This improves
spinal's position as the one-stop-shop for file tree iteration.
This commit is contained in:
voussoir 2021-01-25 14:50:21 -08:00
parent c893a21e14
commit 1d90086d96
No known key found for this signature in database
GPG key ID: 5F7554F8C26DACCB

View file

@ -9,6 +9,7 @@ from voussoirkit import dotdict
from voussoirkit import pathclass from voussoirkit import pathclass
from voussoirkit import ratelimiter from voussoirkit import ratelimiter
from voussoirkit import sentinel from voussoirkit import sentinel
from voussoirkit import winglob
from voussoirkit import vlogging from voussoirkit import vlogging
log = vlogging.getLogger(__name__) log = vlogging.getLogger(__name__)
@ -153,10 +154,10 @@ def copy_dir(
Do everything except the actual file copying. Do everything except the actual file copying.
exclude_filenames: exclude_filenames:
Passed directly into `walk_generator`. Passed directly into `walk`.
exclude_directories: exclude_directories:
Passed directly into `walk_generator`. Passed directly into `walk`.
files_per_second: files_per_second:
Maximum number of files to be processed per second. Helps to keep CPU Maximum number of files to be processed per second. Helps to keep CPU
@ -223,7 +224,7 @@ def copy_dir(
files_per_second = limiter_or_none(files_per_second) files_per_second = limiter_or_none(files_per_second)
# Copy # Copy
walker = walk_generator( walker = walk(
source, source,
exclude_directories=exclude_directories, exclude_directories=exclude_directories,
exclude_filenames=exclude_filenames, exclude_filenames=exclude_filenames,
@ -536,7 +537,7 @@ def get_dir_size(path):
raise SourceNotDirectory(path) raise SourceNotDirectory(path)
total_bytes = 0 total_bytes = 0
for filepath in walk_generator(path): for filepath in walk(path):
total_bytes += filepath.size total_bytes += filepath.size
return total_bytes return total_bytes
@ -641,49 +642,60 @@ def verify_hash(
log.debug('Hash validation passed.') log.debug('Hash validation passed.')
def walk_generator( def walk(
path='.', path='.',
*, *,
callback_permission_denied=None, callback_permission_denied=None,
exclude_directories=None, exclude_directories=None,
exclude_filenames=None, exclude_filenames=None,
glob_directories=None,
glob_filenames=None,
recurse=True, recurse=True,
yield_directories=False, yield_directories=False,
yield_files=True, yield_files=True,
yield_style='flat', yield_style='flat',
): ):
''' '''
Yield Path objects for files in the file tree, similar to os.walk. Yield pathclass.Path objects for files in the tree, similar to os.walk.
callback_permission_denied: callback_permission_denied:
Passed directly into os.walk as onerror. If OSErrors (Permission Denied) Passed directly into os.walk as onerror. If OSErrors (Permission Denied)
occur when trying to list a directory, your function will be called with occur when trying to list a directory, your function will be called with
the exception object as the only argument. the exception object as the only argument.
exclude_filenames:
A set of filenames that will not be copied. Entries can be absolute
paths to exclude that particular file, or plain names to exclude
all matches. For example:
{'C:\\folder\\file.txt', 'desktop.ini'}
exclude_directories: exclude_directories:
A set of directories that will not be copied. Entries can be A set of directories that will not be yielded. Members can be absolute
absolute paths to exclude that particular directory, or plain names paths, glob patterns, or just plain names.
to exclude all matches. For example: For example: {'C:\\folder', '*_small', 'thumbnails'}
{'C:\\folder', 'thumbnails'}
exclude_filenames:
A set of filenames that will not be yielded. Members can be absolute
paths, glob patterns, or just plain names.
For example: {'C:\\folder\\file.txt', '*.temp', 'desktop.ini'}
glob_directories:
A set of glob patterns. Directories will only be yielded if they match
at least one of these patterns.
glob_filenames:
A set of glob patterns. Filenames will only be yielded if they match
at least one of these patterns.
recurse: recurse:
Yield from subdirectories. If False, only immediate files are returned. Yield from subdirectories. If False, only immediate files are returned.
yield_directories: yield_directories:
Should the generator produce directories? Has no effect in nested yield style. Should the generator produce directories? True or False.
Has no effect in nested yield style.
yield_files: yield_files:
Should the generator produce files? Has no effect in nested yield style. Should the generator produce files? True or False.
Has no effect in nested yield style.
yield_style: yield_style:
If 'flat', yield individual files one by one in a constant stream. If 'flat', yield individual files and directories one by one.
If 'nested', yield tuple(root, directories, files) like os.walk does, If 'nested', yield tuple(root, directories, files) like os.walk does,
except I use Path objects with absolute paths for everything. except using pathclass.Path objects for everything.
''' '''
if not yield_directories and not yield_files: if not yield_directories and not yield_files:
raise ValueError('yield_directories and yield_files cannot both be False.') raise ValueError('yield_directories and yield_files cannot both be False.')
@ -691,36 +703,47 @@ def walk_generator(
if yield_style not in ['flat', 'nested']: if yield_style not in ['flat', 'nested']:
raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.') raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.')
if exclude_directories is None:
exclude_directories = set()
if exclude_filenames is None:
exclude_filenames = set()
callback_permission_denied = callback_permission_denied or do_nothing callback_permission_denied = callback_permission_denied or do_nothing
exclude_filenames = {normalize(f) for f in exclude_filenames} if exclude_filenames is not None:
exclude_directories = {normalize(f) for f in exclude_directories} exclude_filenames = {normalize(f) for f in exclude_filenames}
if exclude_directories is not None:
exclude_directories = {normalize(f) for f in exclude_directories}
if glob_filenames is not None:
glob_filenames = set(glob_filenames)
if glob_directories is not None:
glob_directories = set(glob_directories)
path = pathclass.Path(path) path = pathclass.Path(path)
path.correct_case() path.correct_case()
exclude = ( def handle_exclusion(whitelist, blacklist, basename, abspath):
normalize(path.basename) in exclude_directories or exclude = False
normalize(path.absolute_path) in exclude_directories
)
if exclude: if whitelist is not None and not exclude:
exclude = not any(winglob.fnmatch(basename, whitelisted) for whitelisted in whitelist)
if blacklist is not None and not exclude:
n_basename = normalize(basename)
n_abspath = normalize(abspath)
exclude = any(
n_basename == blacklisted or
n_abspath == blacklisted or
winglob.fnmatch(n_basename, blacklisted)
for blacklisted in blacklist
)
return exclude
# If the given starting directory is itself rejected by the glob or
# exclude parameters, there is nothing to walk, so yield nothing.
if handle_exclusion(glob_directories, exclude_directories, path.basename, path.absolute_path):
return return
def handle_exclusion(blacklist, basename, abspath, kind):
exclude = (
normalize(basename) in blacklist or
normalize(abspath) in blacklist
)
if exclude:
return 1
# In the following loops, I found joining the os.sep with fstrings to be # In the following loops, I found joining the os.sep with fstrings to be
# 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7. # 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7.
# Because we trust the values of current_location and the child names, # Because we trust the values of current_location and the child names,
@ -731,7 +754,7 @@ def walk_generator(
new_child_dirs = [] new_child_dirs = []
for child_dir in child_dirs: for child_dir in child_dirs:
child_dir_abspath = f'{current_location}{os.sep}{child_dir}' child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'): if handle_exclusion(glob_directories, exclude_directories, child_dir, child_dir_abspath):
continue continue
new_child_dirs.append(child_dir) new_child_dirs.append(child_dir)
@ -743,7 +766,7 @@ def walk_generator(
files = [] files = []
for child_file in child_files: for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}' child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'): if handle_exclusion(glob_filenames, exclude_filenames, child_file, child_file_abspath):
continue continue
files.append(pathclass.Path(child_file_abspath)) files.append(pathclass.Path(child_file_abspath))
@ -755,7 +778,7 @@ def walk_generator(
new_child_dirs = [] new_child_dirs = []
for child_dir in child_dirs: for child_dir in child_dirs:
child_dir_abspath = f'{current_location}{os.sep}{child_dir}' child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'): if handle_exclusion(glob_directories, exclude_directories, child_dir, child_dir_abspath):
continue continue
new_child_dirs.append(child_dir) new_child_dirs.append(child_dir)
@ -768,20 +791,21 @@ def walk_generator(
if yield_files: if yield_files:
for child_file in child_files: for child_file in child_files:
child_file_abspath = f'{current_location}{os.sep}{child_file}' child_file_abspath = f'{current_location}{os.sep}{child_file}'
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'): if handle_exclusion(glob_filenames, exclude_filenames, child_file, child_file_abspath):
continue continue
yield pathclass.Path(child_file_abspath) yield pathclass.Path(child_file_abspath)
walker = os.walk(path.absolute_path, onerror=callback_permission_denied, followlinks=True) walker = os.walk(path.absolute_path, onerror=callback_permission_denied, followlinks=True)
if yield_style == 'flat': if yield_style == 'flat':
for step in walker: my_stepper = walkstep_flat
yield from walkstep_flat(*step)
if not recurse:
break
if yield_style == 'nested': if yield_style == 'nested':
for step in walker: my_stepper = walkstep_nested
yield from walkstep_nested(*step)
if not recurse: for step in walker:
break yield from my_stepper(*step)
if not recurse:
break
# Backwards compatibility
walk_generator = walk