Improve spinal.walk with glob_directories and glob_filenames arguments.
This saves the caller from having to filter the results, and saves us from instantiating pathclass.Path objects for entries that would only be discarded. This improves spinal's position as the one-stop shop for file tree iteration.
This commit is contained in:
parent
c893a21e14
commit
1d90086d96
1 changed file with 77 additions and 53 deletions
|
@ -9,6 +9,7 @@ from voussoirkit import dotdict
|
||||||
from voussoirkit import pathclass
|
from voussoirkit import pathclass
|
||||||
from voussoirkit import ratelimiter
|
from voussoirkit import ratelimiter
|
||||||
from voussoirkit import sentinel
|
from voussoirkit import sentinel
|
||||||
|
from voussoirkit import winglob
|
||||||
from voussoirkit import vlogging
|
from voussoirkit import vlogging
|
||||||
|
|
||||||
log = vlogging.getLogger(__name__)
|
log = vlogging.getLogger(__name__)
|
||||||
|
@ -153,10 +154,10 @@ def copy_dir(
|
||||||
Do everything except the actual file copying.
|
Do everything except the actual file copying.
|
||||||
|
|
||||||
exclude_filenames:
|
exclude_filenames:
|
||||||
Passed directly into `walk_generator`.
|
Passed directly into `walk`.
|
||||||
|
|
||||||
exclude_directories:
|
exclude_directories:
|
||||||
Passed directly into `walk_generator`.
|
Passed directly into `walk`.
|
||||||
|
|
||||||
files_per_second:
|
files_per_second:
|
||||||
Maximum number of files to be processed per second. Helps to keep CPU
|
Maximum number of files to be processed per second. Helps to keep CPU
|
||||||
|
@ -223,7 +224,7 @@ def copy_dir(
|
||||||
files_per_second = limiter_or_none(files_per_second)
|
files_per_second = limiter_or_none(files_per_second)
|
||||||
|
|
||||||
# Copy
|
# Copy
|
||||||
walker = walk_generator(
|
walker = walk(
|
||||||
source,
|
source,
|
||||||
exclude_directories=exclude_directories,
|
exclude_directories=exclude_directories,
|
||||||
exclude_filenames=exclude_filenames,
|
exclude_filenames=exclude_filenames,
|
||||||
|
@ -536,7 +537,7 @@ def get_dir_size(path):
|
||||||
raise SourceNotDirectory(path)
|
raise SourceNotDirectory(path)
|
||||||
|
|
||||||
total_bytes = 0
|
total_bytes = 0
|
||||||
for filepath in walk_generator(path):
|
for filepath in walk(path):
|
||||||
total_bytes += filepath.size
|
total_bytes += filepath.size
|
||||||
|
|
||||||
return total_bytes
|
return total_bytes
|
||||||
|
@ -641,49 +642,60 @@ def verify_hash(
|
||||||
|
|
||||||
log.debug('Hash validation passed.')
|
log.debug('Hash validation passed.')
|
||||||
|
|
||||||
def walk_generator(
|
def walk(
|
||||||
path='.',
|
path='.',
|
||||||
*,
|
*,
|
||||||
callback_permission_denied=None,
|
callback_permission_denied=None,
|
||||||
exclude_directories=None,
|
exclude_directories=None,
|
||||||
exclude_filenames=None,
|
exclude_filenames=None,
|
||||||
|
glob_directories=None,
|
||||||
|
glob_filenames=None,
|
||||||
recurse=True,
|
recurse=True,
|
||||||
yield_directories=False,
|
yield_directories=False,
|
||||||
yield_files=True,
|
yield_files=True,
|
||||||
yield_style='flat',
|
yield_style='flat',
|
||||||
):
|
):
|
||||||
'''
|
'''
|
||||||
Yield Path objects for files in the file tree, similar to os.walk.
|
Yield pathclass.Path objects for files in the tree, similar to os.walk.
|
||||||
|
|
||||||
callback_permission_denied:
|
callback_permission_denied:
|
||||||
Passed directly into os.walk as onerror. If OSErrors (Permission Denied)
|
Passed directly into os.walk as onerror. If OSErrors (Permission Denied)
|
||||||
occur when trying to list a directory, your function will be called with
|
occur when trying to list a directory, your function will be called with
|
||||||
the exception object as the only argument.
|
the exception object as the only argument.
|
||||||
exclude_filenames:
|
|
||||||
A set of filenames that will not be copied. Entries can be absolute
|
|
||||||
paths to exclude that particular file, or plain names to exclude
|
|
||||||
all matches. For example:
|
|
||||||
{'C:\\folder\\file.txt', 'desktop.ini'}
|
|
||||||
|
|
||||||
exclude_directories:
|
exclude_directories:
|
||||||
A set of directories that will not be copied. Entries can be
|
A set of directories that will not be yielded. Members can be absolute
|
||||||
absolute paths to exclude that particular directory, or plain names
|
paths, glob patterns, or just plain names.
|
||||||
to exclude all matches. For example:
|
For example: {'C:\\folder', '*_small', 'thumbnails'}
|
||||||
{'C:\\folder', 'thumbnails'}
|
|
||||||
|
exclude_filenames:
|
||||||
|
A set of filenames that will not be yielded. Members can be absolute
|
||||||
|
paths, glob patterns, or just plain names.
|
||||||
|
For example: {'C:\\folder\\file.txt', '*.temp', 'desktop.ini'}
|
||||||
|
|
||||||
|
glob_directories:
|
||||||
|
A set of glob patterns. Directories will only be yielded if they match
|
||||||
|
at least one of these patterns.
|
||||||
|
|
||||||
|
glob_filenames:
|
||||||
|
A set of glob patterns. Filenames will only be yielded if they match
|
||||||
|
at least one of these patterns.
|
||||||
|
|
||||||
recurse:
|
recurse:
|
||||||
Yield from subdirectories. If False, only immediate files are returned.
|
Yield from subdirectories. If False, only immediate files are returned.
|
||||||
|
|
||||||
yield_directories:
|
yield_directories:
|
||||||
Should the generator produce directories? Has no effect in nested yield style.
|
Should the generator produce directories? True or False.
|
||||||
|
Has no effect in nested yield style.
|
||||||
|
|
||||||
yield_files:
|
yield_files:
|
||||||
Should the generator produce files? Has no effect in nested yield style.
|
Should the generator produce files? True or False.
|
||||||
|
Has no effect in nested yield style.
|
||||||
|
|
||||||
yield_style:
|
yield_style:
|
||||||
If 'flat', yield individual files one by one in a constant stream.
|
If 'flat', yield individual files and directories one by one.
|
||||||
If 'nested', yield tuple(root, directories, files) like os.walk does,
|
If 'nested', yield tuple(root, directories, files) like os.walk does,
|
||||||
except I use Path objects with absolute paths for everything.
|
except using pathclass.Path objects for everything.
|
||||||
'''
|
'''
|
||||||
if not yield_directories and not yield_files:
|
if not yield_directories and not yield_files:
|
||||||
raise ValueError('yield_directories and yield_files cannot both be False.')
|
raise ValueError('yield_directories and yield_files cannot both be False.')
|
||||||
|
@ -691,36 +703,47 @@ def walk_generator(
|
||||||
if yield_style not in ['flat', 'nested']:
|
if yield_style not in ['flat', 'nested']:
|
||||||
raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.')
|
raise ValueError(f'yield_style should be "flat" or "nested", not {yield_style}.')
|
||||||
|
|
||||||
if exclude_directories is None:
|
|
||||||
exclude_directories = set()
|
|
||||||
|
|
||||||
if exclude_filenames is None:
|
|
||||||
exclude_filenames = set()
|
|
||||||
|
|
||||||
callback_permission_denied = callback_permission_denied or do_nothing
|
callback_permission_denied = callback_permission_denied or do_nothing
|
||||||
|
|
||||||
exclude_filenames = {normalize(f) for f in exclude_filenames}
|
if exclude_filenames is not None:
|
||||||
exclude_directories = {normalize(f) for f in exclude_directories}
|
exclude_filenames = {normalize(f) for f in exclude_filenames}
|
||||||
|
|
||||||
|
if exclude_directories is not None:
|
||||||
|
exclude_directories = {normalize(f) for f in exclude_directories}
|
||||||
|
|
||||||
|
if glob_filenames is not None:
|
||||||
|
glob_filenames = set(glob_filenames)
|
||||||
|
|
||||||
|
if glob_directories is not None:
|
||||||
|
glob_directories = set(glob_directories)
|
||||||
|
|
||||||
path = pathclass.Path(path)
|
path = pathclass.Path(path)
|
||||||
path.correct_case()
|
path.correct_case()
|
||||||
|
|
||||||
exclude = (
|
def handle_exclusion(whitelist, blacklist, basename, abspath):
|
||||||
normalize(path.basename) in exclude_directories or
|
exclude = False
|
||||||
normalize(path.absolute_path) in exclude_directories
|
|
||||||
)
|
|
||||||
|
|
||||||
if exclude:
|
if whitelist is not None and not exclude:
|
||||||
|
exclude = not any(winglob.fnmatch(basename, whitelisted) for whitelisted in whitelist)
|
||||||
|
|
||||||
|
if blacklist is not None and not exclude:
|
||||||
|
n_basename = normalize(basename)
|
||||||
|
n_abspath = normalize(abspath)
|
||||||
|
|
||||||
|
exclude = any(
|
||||||
|
n_basename == blacklisted or
|
||||||
|
n_abspath == blacklisted or
|
||||||
|
winglob.fnmatch(n_basename, blacklisted)
|
||||||
|
for blacklisted in blacklist
|
||||||
|
)
|
||||||
|
|
||||||
|
return exclude
|
||||||
|
|
||||||
|
# If for some reason the given starting directory is excluded by the
|
||||||
|
# exclude parameters.
|
||||||
|
if handle_exclusion(glob_directories, exclude_directories, path.basename, path.absolute_path):
|
||||||
return
|
return
|
||||||
|
|
||||||
def handle_exclusion(blacklist, basename, abspath, kind):
|
|
||||||
exclude = (
|
|
||||||
normalize(basename) in blacklist or
|
|
||||||
normalize(abspath) in blacklist
|
|
||||||
)
|
|
||||||
if exclude:
|
|
||||||
return 1
|
|
||||||
|
|
||||||
# In the following loops, I found joining the os.sep with fstrings to be
|
# In the following loops, I found joining the os.sep with fstrings to be
|
||||||
# 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7.
|
# 10x faster than `os.path.join`, reducing a 6.75 second walk to 5.7.
|
||||||
# Because we trust the values of current_location and the child names,
|
# Because we trust the values of current_location and the child names,
|
||||||
|
@ -731,7 +754,7 @@ def walk_generator(
|
||||||
new_child_dirs = []
|
new_child_dirs = []
|
||||||
for child_dir in child_dirs:
|
for child_dir in child_dirs:
|
||||||
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
|
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
|
||||||
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
|
if handle_exclusion(glob_directories, exclude_directories, child_dir, child_dir_abspath):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_child_dirs.append(child_dir)
|
new_child_dirs.append(child_dir)
|
||||||
|
@ -743,7 +766,7 @@ def walk_generator(
|
||||||
files = []
|
files = []
|
||||||
for child_file in child_files:
|
for child_file in child_files:
|
||||||
child_file_abspath = f'{current_location}{os.sep}{child_file}'
|
child_file_abspath = f'{current_location}{os.sep}{child_file}'
|
||||||
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
|
if handle_exclusion(glob_filenames, exclude_filenames, child_file, child_file_abspath):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
files.append(pathclass.Path(child_file_abspath))
|
files.append(pathclass.Path(child_file_abspath))
|
||||||
|
@ -755,7 +778,7 @@ def walk_generator(
|
||||||
new_child_dirs = []
|
new_child_dirs = []
|
||||||
for child_dir in child_dirs:
|
for child_dir in child_dirs:
|
||||||
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
|
child_dir_abspath = f'{current_location}{os.sep}{child_dir}'
|
||||||
if handle_exclusion(exclude_directories, child_dir, child_dir_abspath, 'directory'):
|
if handle_exclusion(glob_directories, exclude_directories, child_dir, child_dir_abspath):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
new_child_dirs.append(child_dir)
|
new_child_dirs.append(child_dir)
|
||||||
|
@ -768,20 +791,21 @@ def walk_generator(
|
||||||
if yield_files:
|
if yield_files:
|
||||||
for child_file in child_files:
|
for child_file in child_files:
|
||||||
child_file_abspath = f'{current_location}{os.sep}{child_file}'
|
child_file_abspath = f'{current_location}{os.sep}{child_file}'
|
||||||
if handle_exclusion(exclude_filenames, child_file, child_file_abspath, 'file'):
|
if handle_exclusion(glob_filenames, exclude_filenames, child_file, child_file_abspath):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
yield pathclass.Path(child_file_abspath)
|
yield pathclass.Path(child_file_abspath)
|
||||||
|
|
||||||
walker = os.walk(path.absolute_path, onerror=callback_permission_denied, followlinks=True)
|
walker = os.walk(path.absolute_path, onerror=callback_permission_denied, followlinks=True)
|
||||||
if yield_style == 'flat':
|
if yield_style == 'flat':
|
||||||
for step in walker:
|
my_stepper = walkstep_flat
|
||||||
yield from walkstep_flat(*step)
|
|
||||||
if not recurse:
|
|
||||||
break
|
|
||||||
|
|
||||||
if yield_style == 'nested':
|
if yield_style == 'nested':
|
||||||
for step in walker:
|
my_stepper = walkstep_nested
|
||||||
yield from walkstep_nested(*step)
|
|
||||||
if not recurse:
|
for step in walker:
|
||||||
break
|
yield from my_stepper(*step)
|
||||||
|
if not recurse:
|
||||||
|
break
|
||||||
|
|
||||||
|
# Backwards compatibility
|
||||||
|
walk_generator = walk
|
||||||
|
|
Loading…
Reference in a new issue