From 375b00bfd5c71b710c799c0549ab84d56f2daaf6 Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Tue, 18 May 2021 17:48:35 -0700 Subject: [PATCH] Add dynamic chunk sizing to keep consistent progress bar pace. --- voussoirkit/spinal.py | 68 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/voussoirkit/spinal.py b/voussoirkit/spinal.py index 0e5ef5b..d64dcd5 100644 --- a/voussoirkit/spinal.py +++ b/voussoirkit/spinal.py @@ -1,8 +1,13 @@ +''' +This module provides functions related to walking the filesystem and +copying files and folders. +''' import hashlib import logging import os import shutil import sys +import time from voussoirkit import bytestring from voussoirkit import dotdict @@ -20,6 +25,10 @@ BAIL = sentinel.Sentinel('BAIL') # Number of bytes to read and write at a time CHUNK_SIZE = 2 * bytestring.MIBIBYTE +# When using dynamic chunk sizing, this is the ideal time to process a +# single chunk, in seconds. +IDEAL_CHUNK_TIME = 0.2 + HASH_CLASS = hashlib.md5 class SpinalException(Exception): @@ -91,7 +100,7 @@ def copy_dir( callback_pre_directory=None, callback_pre_file=None, callback_post_file=None, - chunk_size=CHUNK_SIZE, + chunk_size='dynamic', destination_new_root=None, dry_run=False, exclude_directories=None, @@ -150,6 +159,9 @@ def copy_dir( If you think copy_dir should be rewritten as a generator instead, I agree! + chunk_size: + Passed into each `copy_file` as `chunk_size`. + destination_new_root: Determine the destination path by calling `new_root(source, destination_new_root)`. @@ -334,7 +346,7 @@ def copy_file( callback_permission_denied=None, callback_pre_copy=None, callback_validate_hash=None, - chunk_size=CHUNK_SIZE, + chunk_size='dynamic', dry_run=False, hash_class=None, overwrite_old=True, @@ -383,6 +395,11 @@ def copy_file( callback_hash_progress: Passed into `hash_file` as callback_progress when validating the hash. + chunk_size: + An integer number of bytes to read and write at a time. + Or, the string 'dynamic' to enable dynamic chunk sizing that aims to + keep a consistent pace of progress bar updates. + dry_run: Do everything except the actual file copying. @@ -492,7 +509,13 @@ def copy_file( hash_class = HASH_CLASS results.hash = HASH_CLASS() + dynamic_chunk_size = chunk_size == 'dynamic' + if dynamic_chunk_size: + chunk_size = bytestring.MIBIBYTE + while True: + chunk_start = time.perf_counter() + try: data_chunk = source_handle.read(chunk_size) except PermissionError as exception: @@ -517,6 +540,10 @@ def copy_file( if bytes_per_second is not None: bytes_per_second.limit(data_bytes) + if dynamic_chunk_size: + chunk_time = time.perf_counter() - chunk_start + chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME) + if results.written_bytes == 0: # For zero-length files, we want to get at least one call in there. callback_progress(destination, results.written_bytes, source_bytes) @@ -547,6 +574,25 @@ def do_nothing(*args, **kwargs): ''' return +def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time): + ''' + Calculates a new chunk size based on the time it took to do the previous + chunk versus the ideal chunk time. + ''' + # If chunk_time = scale * ideal_chunk_time, + # Then ideal_chunk_size = chunk_size / scale + scale = chunk_time / ideal_chunk_time + scale = min(scale, 2) + scale = max(scale, 0.5) + suggestion = chunk_size / scale + # Give the current size double weight so small fluctuations don't send + # the needle bouncing all over. + new_size = int((chunk_size + chunk_size + suggestion) / 3) + # I doubt any real-world scenario will dynamically suggest a chunk_size of + # zero, but let's enforce a one-byte minimum anyway. + new_size = max(new_size, 1) + return new_size + def get_dir_size(path): ''' Calculate the total number of bytes across all files in this directory @@ -569,7 +615,7 @@ def hash_file( *, bytes_per_second=None, callback_progress=None, - chunk_size=CHUNK_SIZE, + chunk_size='dynamic', ): ''' hash_class: @@ -578,6 +624,11 @@ def hash_file( callback_progress: A function that takes three parameters: path object, bytes ingested so far, bytes total + + chunk_size: + An integer number of bytes to read at a time. + Or, the string 'dynamic' to enable dynamic chunk sizing that aims to + keep a consistent pace of progress bar updates. ''' path = pathclass.Path(path) path.assert_is_file() @@ -590,8 +641,15 @@ def hash_file( file_size = path.size handle = path.open('rb') + + dynamic_chunk_size = chunk_size == 'dynamic' + if dynamic_chunk_size: + chunk_size = bytestring.MIBIBYTE + with handle: while True: + chunk_start = time.perf_counter() + chunk = handle.read(chunk_size) if not chunk: break @@ -605,6 +663,10 @@ def hash_file( if bytes_per_second is not None: bytes_per_second.limit(this_size) + if dynamic_chunk_size: + chunk_time = time.perf_counter() - chunk_start + chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME) + return hasher def is_xor(*args):