cmd/threaded_dl.py

151 lines
4.3 KiB
Python
Raw Normal View History

2020-09-29 21:27:48 +00:00
import argparse
2020-09-30 20:20:07 +00:00
import ast
2020-09-29 21:27:48 +00:00
import os
import sys
import threading
import time
from voussoirkit import bytestring
2020-09-29 21:27:48 +00:00
from voussoirkit import downloady
from voussoirkit import pipeable
2020-09-29 21:27:48 +00:00
def clean_url_list(urls):
    '''
    Yield the usable entries of `urls`, dropping blanks and comments.

    Each entry is either a url string or a (url, filename) tuple / list.
    The url is stripped of surrounding whitespace, and the entry is skipped
    if the stripped url is empty or starts with '#'.

    Entries that carry a truthy filename are yielded as (url, filename)
    tuples. All other entries are yielded as the bare, stripped url string.
    '''
    for entry in urls:
        is_pair = isinstance(entry, (tuple, list))
        (address, filename) = entry if is_pair else (entry, None)
        address = address.strip()

        # Blank lines and '#' comment lines carry nothing to download.
        if not address or address.startswith('#'):
            continue

        yield ((address, filename) if filename else address)
2020-09-29 21:27:48 +00:00
def download_thread(url, filename, *, bytespersecond=None, headers=None, timeout=None):
    '''
    Download `url` to `filename`, printing a line when the download starts
    and another when it finishes.

    bytespersecond, headers and timeout are forwarded unchanged to
    downloady.download_file.
    '''
    options = {
        'bytespersecond': bytespersecond,
        'headers': headers,
        'timeout': timeout,
    }
    print(f' Starting "{filename}"')
    downloady.download_file(url, filename, **options)
    print(f'+Finished "{filename}"')
def remove_finished(threads):
    '''
    Return a new list containing only the threads that are still alive.
    The input list is not modified.
    '''
    still_running = []
    for thread in threads:
        if thread.is_alive():
            still_running.append(thread)
    return still_running
def threaded_dl(
        urls,
        thread_count,
        filename_format,
        bytespersecond=None,
        headers=None,
        timeout=None,
    ):
    '''
    Download many urls concurrently, running at most `thread_count`
    download threads at any moment.

    urls:
        An iterable whose entries are either url strings or (url, filename)
        pairs. Entries are filtered through clean_url_list, so blank urls
        and urls starting with '#' are ignored.

    thread_count:
        The maximum number of simultaneous download threads. Must be at
        least 1.

    filename_format:
        A str.format template used to name the file for every entry that
        did not come with its own filename. The available fields are
        {basename}, {ext}, {extension}, {index}, {now}. {index} is
        zero-padded to the width of len(urls), and {now} is the integer unix
        timestamp taken when this function starts.
        If the template has no fields at all and there is more than one
        url, '_{index}' is appended so the names do not collide. If the
        template contains neither {extension} nor {basename}, '{extension}'
        is appended.
        If this is os.devnull, the template is left untouched and the
        existing-file check is skipped.

    bytespersecond:
        If not None, this number is divided evenly among `thread_count` and
        the result is passed to every download.

    headers, timeout:
        Passed through to every download unchanged.

    Files that already exist on disk are skipped with a message.

    Raises ValueError if thread_count is less than 1.
    '''
    # A limit below 1 would make the slot-waiting loop below spin forever,
    # and a limit of 0 would also divide by zero.
    if thread_count < 1:
        raise ValueError(f'thread_count must be at least 1, not {thread_count}.')

    # Materialize the input so that len() works even for generators.
    urls = list(urls)
    now = int(time.time())
    threads = []

    bytespersecond_thread = bytespersecond
    if bytespersecond_thread is not None:
        bytespersecond_thread = int(bytespersecond_thread / thread_count)

    if filename_format != os.devnull:
        # Pad the index so that the files sort in download order.
        index_digits = len(str(len(urls)))
        filename_format = filename_format.replace('{index}', f'{{index:0{index_digits}d}}')

        if '{' not in filename_format and len(urls) > 1:
            filename_format += '_{index}'

        if '{extension}' not in filename_format and '{basename}' not in filename_format:
            filename_format += '{extension}'

    for (index, url) in enumerate(clean_url_list(urls)):
        # Wait for a free slot. Checking before sleeping means a slot that
        # is already free gets used immediately.
        threads = remove_finished(threads)
        while len(threads) >= thread_count:
            time.sleep(0.1)
            threads = remove_finished(threads)

        if isinstance(url, (tuple, list)):
            (url, filename) = url
        else:
            basename = downloady.basename_from_url(url)
            extension = os.path.splitext(basename)[1]
            filename = filename_format.format(
                basename=basename,
                ext=extension,
                extension=extension,
                index=index,
                now=now,
            )

        # The null device always "exists" on POSIX, so it must be exempt from
        # the existing-file check or every download would be skipped.
        if filename != os.devnull and os.path.exists(filename):
            print(f'Skipping existing file "{filename}"')
            continue

        kwargs = {
            'url': url,
            'bytespersecond': bytespersecond_thread,
            'filename': filename,
            'timeout': timeout,
            'headers': headers,
        }
        t = threading.Thread(target=download_thread, kwargs=kwargs, daemon=True)
        threads.append(t)
        t.start()

    # The workers are daemon threads, so wait here until all have finished.
    while threads:
        threads = remove_finished(threads)
        print(f'{len(threads)} threads remaining\r', end='', flush=True)
        time.sleep(0.1)
def threaded_dl_argparse(args):
    '''
    Convert the parsed command line arguments into a threaded_dl call.

    args.url_file is read through pipeable.input. A line that contains a
    space is split at the first space into a [url, filename] pair; any other
    line is used as a bare url.

    args.headers, if given, is either a single argument that starts with '{'
    and is parsed as a Python dict literal, or a flat sequence of
    alternating keys and values.

    args.bytespersecond, if given, is converted with bytestring.parsebytes.

    Raises ValueError if the header literal is not a dict, or if the flat
    header sequence has a key without a value.
    '''
    urls = pipeable.input(args.url_file, read_files=True, skip_blank=True, strip=True)
    urls = [u.split(' ', 1) if ' ' in u else u for u in urls]

    headers = args.headers
    if headers is not None:
        if len(headers) == 1 and headers[0].startswith('{'):
            # literal_eval only accepts literals, but '{1, 2}' is a valid set
            # literal, so the result type has to be checked.
            headers = ast.literal_eval(headers[0])
            if not isinstance(headers, dict):
                raise ValueError('--headers literal must be a dict.')
        else:
            # zip would silently drop a trailing key that has no value.
            if len(headers) % 2 != 0:
                raise ValueError('--headers requires an even number of arguments: key value key value...')
            headers = dict(zip(headers[::2], headers[1::2]))

    bytespersecond = args.bytespersecond
    if bytespersecond is not None:
        bytespersecond = bytestring.parsebytes(bytespersecond)

    threaded_dl(
        urls,
        bytespersecond=bytespersecond,
        filename_format=args.filename_format,
        headers=headers,
        thread_count=args.thread_count,
        timeout=args.timeout,
    )
def main(argv):
    '''
    Parse the command line arguments in `argv` (without the program name)
    and run the download.

    Returns the return value of the selected handler function, which the
    caller can use as the process exit status.
    '''
    parser = argparse.ArgumentParser(description=__doc__)

    parser.add_argument('url_file')
    parser.add_argument('thread_count', type=int)
    parser.add_argument('filename_format', nargs='?', default='{now}_{index}_{basename}')
    parser.add_argument('--bytespersecond', default=None)
    # Without an explicit type, a user-supplied timeout would arrive as a
    # str while the default is a number.
    parser.add_argument('--timeout', type=float, default=15)
    parser.add_argument('--headers', nargs='+', default=None)
    parser.set_defaults(func=threaded_dl_argparse)

    args = parser.parse_args(argv)
    return args.func(args)
if __name__ == '__main__':
    # Run the command line interface; main's return value becomes the
    # process exit status.
    sys.exit(main(sys.argv[1:]))