Use threadpool in threaded_dl with separate ui thread.

The downside is that the result_generator is difficult to ctrlc on windows due to signals not occuring during the Queue.get. Pressing ctrlc will kill the program after one file finishes downloading, otherwise you have to press ctrlbreak. The end result is that the program actually feels a little worse to use than before, but it's more, like, pure. Don't know. Might revert or change again.
2021-12-07 12:49:12 -08:00 · 2021-12-07 12:49:12 -08:00 · a35200d594
commit a35200d594
parent 9cec1d8e2f
1 changed files with 132 additions and 37 deletions
--- a/threaded_dl.py
+++ b/threaded_dl.py
@ -1,18 +1,64 @@
 '''
 threaded_dl
 ===========
 > threaded_dl links thread_count filename_format <flags>
 links:
    The name of a file containing links to download, one per line.
    Uses pipeable to support !c clipboard, !i stdin lines of urls.
 thread_count:
    Integer number of threads to use for downloading.
 filename_format:
    A string that controls the names of the downloaded files. Uses Python's
    brace-style formatting. Available formatters are:
    - {basename}: The name of the file as indicated by the URL.
      E.g. example.com/image.jpg -> image.jpg
    - {extension}: The extension of the file as indicated by the URL, including
      the dot. E.g. example.com/image.jpg -> .jpg
    - {index}: The index of this URL within the sequence of all downloaded URLs.
      Starts from 0.
    - {now}: The unix timestamp at which this download job was started. It might
      be ugly but at least it's unambiguous when doing multiple download batches
      with similar filenames.
 flags:
 --bytespersecond X:
    Limit the overall download speed to X bytes per second. Uses
    bytestring.parsebytes to support strings like "1m", "500k", "2 mb", etc.
 --headers X:
    ;
 --timeout X:
    Integer number of seconds to use as HTTP request timeout for each download.
 '''
 import argparse
 import ast
 import os
 import queue
 import shutil
 import sys
 import threading
 import time
 from voussoirkit import betterhelp
 from voussoirkit import bytestring
 from voussoirkit import downloady
 from voussoirkit import pipeable
 from voussoirkit import ratelimiter
 from voussoirkit import ratemeter
 from voussoirkit import sentinel
 from voussoirkit import threadpool
 from voussoirkit import vlogging
 log = vlogging.getLogger(__name__, 'threaded_dl')
 downloady.log.setLevel(vlogging.WARNING)
 THREAD_FINISHED = sentinel.Sentinel('thread finished')
 def clean_url_list(urls):
    for url in urls:
        if isinstance(url, (tuple, list)):
@ -32,36 +78,30 @@ def clean_url_list(urls):
        else:
            yield url
-def download_thread(url, filename, *, bytespersecond=None, headers=None, timeout=None):
+def download_job(
        url,
        filename,
        *,
        bytespersecond=None,
        headers=None,
        meter=None,
        timeout=None,
    ):
    log.info(f'Starting "{filename}"')
    downloady.download_file(
        url,
        filename,
        bytespersecond=bytespersecond,
        headers=headers,
        ratemeter=meter,
        timeout=timeout,
    )
    log.info(f'Finished "{filename}"')
-def remove_finished(threads):
+def prepare_urls_filenames(urls, filename_format):
    return [t for t in threads if t.is_alive()]
 def threaded_dl(
        urls,
        thread_count,
        filename_format,
        bytespersecond=None,
        headers=None,
        timeout=None,
    ):
    now = int(time.time())
    threads = []
-    bytespersecond_thread = bytespersecond
+    if os.path.normcase(filename_format) != os.devnull:
    if bytespersecond_thread is not None:
        bytespersecond_thread = int(bytespersecond_thread / thread_count)
    if filename_format != os.devnull:
        index_digits = len(str(len(urls)))
        filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
@ -71,12 +111,9 @@ def threaded_dl(
        if '{extension}' not in filename_format and '{basename}' not in filename_format:
            filename_format += '{extension}'
    urls_filenames = []
    for (index, url) in enumerate(clean_url_list(urls)):
        while len(threads) >= thread_count:
            threads = remove_finished(threads)
            time.sleep(0.1)
        if isinstance(url, (tuple, list)):
            (url, filename) = url
        else:
@ -92,23 +129,82 @@ def threaded_dl(
        if os.path.exists(filename):
            log.info(f'Skipping existing file "{filename}"')
            continue
-        else:
+        urls_filenames.append((url, filename))
-            kwargs = {
+
-                'url': url,
+    return urls_filenames
-                'bytespersecond': bytespersecond_thread,
+
 def threaded_dl(
        urls,
        thread_count,
        filename_format,
        bytespersecond=None,
        headers=None,
        timeout=None,
    ):
    urls_filenames = prepare_urls_filenames(urls, filename_format)
    if not urls_filenames:
        return
    if bytespersecond is not None:
        # It is important that we convert this to a Ratelimter now instead of
        # passing the user's integer to downloady, because we want all threads
        # to share a single limiter instance instead of each creating their
        # own by the integer.
        bytespersecond = ratelimiter.Ratelimiter(bytespersecond)
    meter = ratemeter.RateMeter(span=5)
    pool = threadpool.ThreadPool(thread_count, paused=True)
    ui_stop_event = threading.Event()
    ui_kwargs = {
        'meter': meter,
        'stop_event': ui_stop_event,
        'pool': pool,
    }
    ui_thread = threading.Thread(target=ui_thread_func, kwargs=ui_kwargs, daemon=True)
    ui_thread.start()
    kwargss = []
    for (url, filename) in urls_filenames:
        kwargs = {
            'function': download_job,
            'kwargs': {
                'bytespersecond': bytespersecond,
                'filename': filename,
                'timeout': timeout,
                'headers': headers,
                'meter': meter,
                'timeout': timeout,
                'url': url,
            }
-            t = threading.Thread(target=download_thread, kwargs=kwargs, daemon=True)
+        }
-            threads.append(t)
+        kwargss.append(kwargs)
-            t.start()
+    pool.add_many(kwargss)
-    while len(threads) > 0:
+    for job in pool.result_generator():
-        threads = remove_finished(threads)
+        if job.exception:
-        pipeable.stderr(f'{len(threads)} threads remaining\r', end='')
+            ui_stop_event.set()
-        time.sleep(0.1)
+            ui_thread.join()
            raise job.exception
    ui_stop_event.set()
    ui_thread.join()
 def ui_thread_func(meter, pool, stop_event):
    if pipeable.OUT_PIPE:
        return
    while not stop_event.is_set():
        width = shutil.get_terminal_size().columns
        speed = meter.report()[2]
        message = f'{bytestring.bytestring(speed)}/s | {pool.running_count} threads'
        spaces = ' ' * (width - len(message) - 1)
        pipeable.stderr(message + spaces, end='\r')
        stop_event.wait(timeout=0.5)
 def threaded_dl_argparse(args):
    urls = pipeable.input(args.url_file, read_files=True, skip_blank=True, strip=True)
@ -150,8 +246,7 @@ def main(argv):
    parser.add_argument('--headers', nargs='+', default=None)
    parser.set_defaults(func=threaded_dl_argparse)
-    args = parser.parse_args(argv)
+    return betterhelp.single_main(argv, parser, __doc__)
    return args.func(args)
 if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))