threaded_dl improvements.

- use real argparse
- add timeout argument to command line
- apply default values for timeout and filename at argparse level
- check for existing files before creating their thread
This commit is contained in:
Ethan Dalool 2019-05-27 15:59:23 -07:00
parent f23f8ebc7d
commit 70dd4ef77a

View file

@ -1,47 +1,54 @@
import argparse
import os
import sys
import threading
import time
# pip install voussoirkit
from voussoirkit import clipext
from voussoirkit import downloady
def clean_url_list(urls):
    '''
    Yield the usable entries of `urls` with surrounding whitespace removed.

    An entry is skipped when it is empty after stripping, or when its
    stripped text starts with '#'.
    '''
    for line in urls:
        candidate = line.strip()
        if candidate and not candidate.startswith('#'):
            yield candidate
def download_thread(url, filename, timeout=None):
    '''
    Download `url` to `filename`, printing one line before and one after.

    `timeout` is forwarded to downloady.download_file unchanged.
    '''
    start_message = f' Starting "{filename}"'
    print(start_message)
    downloady.download_file(url, filename, timeout=timeout)
    finish_message = f'+Finished "{filename}"'
    print(finish_message)
def remove_finished(threads):
    '''
    Return a new list holding only the threads that are still alive.

    threads: an iterable of objects with an is_alive() method, such as
        threading.Thread instances. The input itself is not modified.
    '''
    return [t for t in threads if t.is_alive()]
def download_thread(url, filename, timeout=15):
    '''
    Download `url` to `filename` unless there is nothing to do.

    Returns without downloading when `url` is blank after stripping, or when
    `filename` already exists on disk.

    url: the address to fetch; surrounding whitespace is ignored.
    filename: destination path handed to downloady.download_file.
    timeout: forwarded to downloady.download_file. Defaults to 15, the value
        that was previously hard-coded in this function.
    '''
    url = url.strip()
    if not url:
        return
    if os.path.exists(filename):
        print(f'Skipping existing file "{filename}"')
        return
    print(f' Starting "{filename}"')
    downloady.download_file(url, filename, timeout=timeout)
    print(f'+Finished "{filename}"')
def listget(li, index, fallback):
    '''
    Return li[index], or `fallback` when that lookup raises IndexError.
    '''
    try:
        value = li[index]
    except IndexError:
        value = fallback
    return value
# threaded_dl: start one download_thread per URL, keeping at most
# `thread_count` threads running at a time, then wait for all of them.
# NOTE(review): this span appears to hold the pre-change and post-change
# lines of the function interleaved (two signatures, two `for` headers, two
# thread-creation variants), and the statements that build `filename` fall
# between the two diff hunks and are not visible here.
def threaded_dl(urls, thread_count, filename_format=None):
def threaded_dl(
urls,
thread_count,
filename_format,
timeout=None,
):
# `now` is captured once and passed to every filename_format.format call below.
now = int(time.time())
threads = []
# Number of decimal digits in the URL count, used as the zero-pad width.
index_digits = len(str(len(urls)))
# Fall back to the default naming pattern when no format was given.
if filename_format is None:
filename_format = '{now}_{index}_{basename}'
# Rewrites '{index}' into a zero-padded integer field, e.g. '{index:03d}'
# when index_digits is 3.
filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
# The format adjustments are conditional on the target not being os.devnull.
# NOTE(review): with indentation lost, the exact extent of this branch
# cannot be read from here - confirm against the real file.
if filename_format != os.devnull:
index_digits = len(str(len(urls)))
filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
# A format with no placeholder at all gets '_{index}' appended when there is
# more than one URL (presumably so the names do not collide).
if '{' not in filename_format and len(urls) > 1:
filename_format += '_{index}'
# Append '{extension}' when the format names neither {extension} nor {basename}.
if '{extension}' not in filename_format and '{basename}' not in filename_format:
filename_format += '{extension}'
now = int(time.time())
for (index, url) in enumerate(urls):
# This variant iterates the output of clean_url_list(urls) instead of
# the raw list.
for (index, url) in enumerate(clean_url_list(urls)):
# Block while the number of tracked threads equals thread_count, dropping
# finished threads and re-checking every 0.1 seconds.
while len(threads) == thread_count:
threads = remove_finished(threads)
time.sleep(0.1)
# Diff hunk boundary: the lines between the loop above and the keyword
# arguments below are elided in this view.
@ -55,28 +62,47 @@ def threaded_dl(urls, thread_count, filename_format=None):
index=index,
now=now,
)
# Variant that always starts a daemon thread with two arguments.
t = threading.Thread(target=download_thread, args=[url, filename])
t.daemon = True
threads.append(t)
t.start()
# Variant that skips targets that already exist and otherwise starts a
# daemon thread, passing `timeout` as a third argument to download_thread.
if os.path.exists(filename):
print(f'Skipping existing file "{filename}"')
else:
t = threading.Thread(target=download_thread, args=[url, filename, timeout], daemon=True)
threads.append(t)
t.start()
# Wait for the remaining threads, polling every 0.1 seconds. The trailing
# '\r' with end='' returns the cursor to the start of the line after each print.
while len(threads) > 0:
threads = remove_finished(threads)
print('%d threads remaining\r' % len(threads), end='', flush=True)
time.sleep(0.1)
def threaded_dl_argparse(args):
    '''
    Run threaded_dl using the values of a parsed argparse namespace.

    args.url_file: when this names an existing file, its full text is read
        and used as the URL list. Otherwise the value is handed to
        clipext.resolve and the text it returns is used instead.
    args.thread_count, args.filename_format, args.timeout: forwarded to
        threaded_dl unchanged.
    '''
    if os.path.isfile(args.url_file):
        with open(args.url_file, 'r') as handle:
            text = handle.read()
    else:
        text = clipext.resolve(args.url_file)

    # Drop carriage returns first so CRLF input splits into the same lines
    # as LF input.
    urls = text.replace('\r', '').split('\n')

    threaded_dl(
        urls,
        thread_count=args.thread_count,
        filename_format=args.filename_format,
        timeout=args.timeout,
    )
def main(argv):
    '''
    Parse the command-line arguments in `argv` and run the download.

    argv: the argument list without the program name, e.g. sys.argv[1:].

    Positional arguments are url_file, then the optional thread_count and
    filename_format. --timeout is an optional flag.
    '''
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('url_file')
    # Command-line values arrive as strings. thread_count is compared with
    # len(threads) to cap concurrency, so it must be an int: a string or
    # None never compares equal and the cap would never apply.
    parser.add_argument('thread_count', nargs='?', type=int, default=4)
    parser.add_argument('filename_format', nargs='?', default='{now}_{index}_{basename}')
    # Convert here as well so a user-supplied timeout has the same type as
    # the integer default.
    parser.add_argument('--timeout', dest='timeout', type=int, default=15)
    parser.set_defaults(func=threaded_dl_argparse)

    args = parser.parse_args(argv)
    args.func(args)
if __name__ == '__main__':
    # Call main exactly once and use its return value as the exit status.
    raise SystemExit(main(sys.argv[1:]))