2020-09-29 21:27:48 +00:00
|
|
|
import argparse
|
2020-09-30 20:20:07 +00:00
|
|
|
import ast
|
2020-09-29 21:27:48 +00:00
|
|
|
import os
|
2022-11-03 01:42:14 +00:00
|
|
|
import random
|
2021-12-07 20:49:12 +00:00
|
|
|
import shutil
|
2020-09-29 21:27:48 +00:00
|
|
|
import sys
|
|
|
|
import threading
|
|
|
|
import time
|
2022-08-14 17:43:13 +00:00
|
|
|
import traceback
|
2020-09-29 21:27:48 +00:00
|
|
|
|
2021-12-07 20:49:12 +00:00
|
|
|
from voussoirkit import betterhelp
|
2020-10-03 02:54:20 +00:00
|
|
|
from voussoirkit import bytestring
|
2020-09-29 21:27:48 +00:00
|
|
|
from voussoirkit import downloady
|
2022-08-14 17:42:34 +00:00
|
|
|
from voussoirkit import pathclass
|
2021-08-17 21:06:07 +00:00
|
|
|
from voussoirkit import pipeable
|
2021-12-07 20:49:12 +00:00
|
|
|
from voussoirkit import ratelimiter
|
|
|
|
from voussoirkit import ratemeter
|
|
|
|
from voussoirkit import threadpool
|
2021-09-24 06:42:34 +00:00
|
|
|
from voussoirkit import vlogging
|
|
|
|
|
|
|
|
log = vlogging.getLogger(__name__, 'threaded_dl')
|
|
|
|
downloady.log.setLevel(vlogging.WARNING)
|
2020-09-29 21:27:48 +00:00
|
|
|
|
|
|
|
def clean_url_list(urls):
    '''
    Yield the usable entries from `urls`, skipping blanks and comments.

    Each entry may be a plain url string or a (url, filename) pair.
    Whitespace is stripped from the url; empty urls and urls beginning
    with '#' (comment lines) are dropped. Pairs are yielded back as
    (url, filename) tuples, bare urls as strings.
    '''
    for entry in urls:
        filename = None
        if isinstance(entry, (tuple, list)):
            (entry, filename) = entry

        entry = entry.strip()

        # Skip blank lines and comment lines.
        if not entry or entry.startswith('#'):
            continue

        yield (entry, filename) if filename else entry
|
2020-09-29 21:27:48 +00:00
|
|
|
|
2021-12-07 20:49:12 +00:00
|
|
|
def download_job(
        url,
        filename,
        *,
        bytespersecond=None,
        headers=None,
        meter=None,
        timeout=None,
    ):
    '''
    Download a single url to filename. Runs inside a worker thread.

    bytespersecond: a shared Ratelimiter instance (or None) forwarded to
        downloady so all threads obey one combined limit.
    headers: dict of HTTP headers for the request.
    meter: a shared RateMeter so the UI thread can display overall speed.
    timeout: HTTP request timeout in seconds.
    '''
    # Bug fix: these f-strings previously contained no placeholders, so every
    # log line was identical. Include the actual url / filename.
    log.info(f'Starting "{url}"')
    downloady.download_file(
        url,
        filename,
        bytespersecond=bytespersecond,
        headers=headers,
        ratemeter=meter,
        timeout=timeout,
    )
    log.info(f'Finished "{filename}"')
|
2020-09-29 21:27:48 +00:00
|
|
|
|
2022-07-16 05:39:45 +00:00
|
|
|
def normalize_headers(headers):
    '''
    Coerce the many accepted header formats into a single dict.

    Accepts:
    - None or any empty value -> {}
    - a dict -> returned unchanged
    - a flat list/tuple of alternating keys and values
    - a string containing a dict literal (starts with '{')
    - a string of "Key: Value" lines ('#'-prefixed lines are ignored)
    - a filename whose contents are one of the string forms above
    '''
    if headers is None or not headers:
        return {}

    if isinstance(headers, dict):
        return headers

    # A single-element list collapses to its sole element, which is then
    # interpreted as one of the string forms below.
    if isinstance(headers, list) and len(headers) == 1:
        headers = headers[0]

    if isinstance(headers, (list, tuple)):
        # Alternating key, value, key, value...
        return dict(zip(headers[::2], headers[1::2]))

    if isinstance(headers, str) and os.path.isfile(headers):
        headers = pathclass.Path(headers).read('r', encoding='utf-8')

    if isinstance(headers, str):
        if headers.startswith('{'):
            return ast.literal_eval(headers)
        else:
            nonblank = [line for line in headers.splitlines() if line.strip()]
            nonblank = [line for line in nonblank if not line.startswith('#')]
            split_pairs = [line.strip().split(':', 1) for line in nonblank]
            return {key.strip(): value.strip() for (key, value) in split_pairs}

    return headers
|
|
|
|
|
2021-12-07 20:49:12 +00:00
|
|
|
def prepare_urls_filenames(urls, filename_format):
    '''
    Pair each url with the local filename it will be downloaded to.

    Returns a list of (url, filename) tuples. Urls whose target file
    already exists are skipped. filename_format is a brace-style format
    string supporting {basename}, {extension} / {ext}, {index} (0-based),
    {index1} (1-based), {now}, and {random}.
    '''
    # One timestamp shared by the whole batch so {now} is consistent
    # across all files.
    now = int(time.time())

    # When writing to the null device there is no point decorating the name.
    if os.path.normcase(filename_format) != os.devnull:
        # Zero-pad {index} to the width of the largest possible index so
        # the filenames sort correctly.
        index_digits = len(str(len(urls)))
        filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)

        # If the user gave a fixed name but multiple urls, append an index
        # so the downloads don't overwrite each other.
        if '{' not in filename_format and len(urls) > 1:
            filename_format += '_{index}'

        if '{extension}' not in filename_format and '{basename}' not in filename_format:
            filename_format += '{extension}'

    urls_filenames = []

    for (index, url) in enumerate(clean_url_list(urls)):
        if isinstance(url, (tuple, list)):
            # The input already carried an explicit filename for this url.
            (url, filename) = url
        else:
            index1 = index + 1
            basename = downloady.basename_from_url(url)
            extension = os.path.splitext(basename)[1]
            filename = filename_format.format(
                basename=basename,
                ext=extension,
                extension=extension,
                index=index,
                index1=index1,
                now=now,
                random=random.getrandbits(32),
            )

        if os.path.exists(filename):
            # Bug fix: this f-string previously had no placeholder, so the
            # log never said which file was being skipped.
            log.info(f'Skipping existing file "{filename}"')
            continue

        urls_filenames.append((url, filename))

    return urls_filenames
|
|
|
|
|
|
|
|
def threaded_dl(
    urls,
    thread_count,
    filename_format,
    bytespersecond=None,
    headers=None,
    timeout=None,
):
    '''
    Download every url in `urls` using `thread_count` worker threads.

    Returns 1 if any download job raised an exception, 0 if all succeeded.
    NOTE(review): returns None (not 0) when there is nothing to download —
    callers appear to treat falsy as success; confirm before relying on it.
    '''
    urls_filenames = prepare_urls_filenames(urls, filename_format)

    if not urls_filenames:
        return

    if bytespersecond is not None:
        # It is important that we convert this to a Ratelimter now instead of
        # passing the user's integer to downloady, because we want all threads
        # to share a single limiter instance instead of each creating their
        # own by the integer.
        bytespersecond = ratelimiter.Ratelimiter(bytespersecond)

    # Shared meter so the UI thread can report aggregate download speed.
    meter = ratemeter.RateMeter(span=5)

    # paused=True: jobs must not start until all of them have been added.
    pool = threadpool.ThreadPool(thread_count, paused=True)

    # Background status-line thread; daemon so it can't block interpreter exit.
    ui_stop_event = threading.Event()
    ui_kwargs = {
        'meter': meter,
        'stop_event': ui_stop_event,
        'pool': pool,
    }
    ui_thread = threading.Thread(target=ui_thread_func, kwargs=ui_kwargs, daemon=True)
    ui_thread.start()

    # One download_job per (url, filename) pair; all jobs share the same
    # limiter and meter instances.
    kwargss = []
    for (url, filename) in urls_filenames:
        kwargs = {
            'function': download_job,
            'kwargs': {
                'bytespersecond': bytespersecond,
                'filename': filename,
                'headers': headers,
                'meter': meter,
                'timeout': timeout,
                'url': url,
            }
        }
        kwargss.append(kwargs)
    pool.add_many(kwargss)

    # Drain the pool; any job exception is logged with its full traceback
    # and turns the final status into a failure, without stopping the rest.
    status = 0
    for job in pool.result_generator():
        if job.exception:
            log.error(''.join(traceback.format_exception(None, job.exception, job.exception.__traceback__)))
            status = 1

    # Tell the UI thread to stop and wait for it before returning.
    ui_stop_event.set()
    ui_thread.join()
    return status
|
2021-12-07 20:49:12 +00:00
|
|
|
|
|
|
|
def ui_thread_func(meter, pool, stop_event):
    '''
    Periodically print download speed and running thread count to stderr
    until stop_event is set.

    Does nothing at all when stdout is piped, so the status line cannot
    interfere with downstream consumers of the program's output.
    '''
    if pipeable.stdout_pipe():
        return

    while not stop_event.is_set():
        columns = shutil.get_terminal_size().columns
        speed = meter.report()[2]
        status_line = f'{bytestring.bytestring(speed)}/s | {pool.running_count} threads'
        # Pad to the terminal width so the previous status is fully erased,
        # then '\r' returns the cursor for the next refresh.
        padding = ' ' * (columns - len(status_line) - 1)
        pipeable.stderr(status_line + padding, end='\r')

        # Refresh twice a second, but wake immediately when stopped.
        stop_event.wait(timeout=0.5)
|
2020-09-29 21:27:48 +00:00
|
|
|
|
|
|
|
def threaded_dl_argparse(args):
    '''
    Entry point after argparse: gather urls, normalize options, and run
    threaded_dl. Returns threaded_dl's exit status.
    '''
    urls = pipeable.input(args.url_file, read_files=True, skip_blank=True, strip=True)
    # An input line may be "url filename" to give that url an explicit filename.
    urls = [u.split(' ', 1) if ' ' in u else u for u in urls]

    headers = normalize_headers(args.headers)
    # Bug fix: this was a bare print() left over from debugging, which wrote
    # to stdout — and this program specifically detects piped stdout.
    # Log at debug level instead.
    log.debug('Using headers: %s', headers)

    bytespersecond = args.bytespersecond
    if bytespersecond is not None:
        # Accept human strings like "1m", "500k" via bytestring.
        bytespersecond = bytestring.parsebytes(bytespersecond)

    return threaded_dl(
        urls,
        bytespersecond=bytespersecond,
        filename_format=args.filename_format,
        headers=headers,
        thread_count=args.thread_count,
        timeout=args.timeout,
    )
|
|
|
|
|
2021-09-24 06:42:34 +00:00
|
|
|
@vlogging.main_decorator
def main(argv):
    '''
    Build the argument parser and dispatch to threaded_dl_argparse.
    Returns the process exit status.
    '''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'url_file',
        metavar='links',
        help='''
        The name of a file containing links to download, one per line.
        Uses pipeable to support !c clipboard, !i stdin lines of urls.
        ''',
    )
    parser.add_argument(
        'thread_count',
        type=int,
        help='''
        Integer number of threads to use for downloading.
        ''',
    )
    parser.add_argument(
        'filename_format',
        nargs='?',
        type=str,
        default='{now}_{index}_{basename}',
        help='''
        A string that controls the names of the downloaded files. Uses Python's
        brace-style formatting. Available formatters are:
        - {basename}: The name of the file as indicated by the URL.
          E.g. example.com/image.jpg -> image.jpg
        - {extension}: The extension of the file as indicated by the URL, including
          the dot. E.g. example.com/image.jpg -> .jpg
        - {index}: The index of this URL within the sequence of all downloaded URLs.
          Starts from 0.
        - {now}: The unix timestamp at which this download job was started. It might
          be ugly but at least it's unambiguous when doing multiple download batches
          with similar filenames.
        ''',
    )
    parser.add_argument(
        '--bytespersecond',
        default=None,
        help='''
        Limit the overall download speed to X bytes per second. Uses
        bytestring.parsebytes to support strings like "1m", "500k", "2 mb", etc.
        ''',
    )
    parser.add_argument(
        '--timeout',
        # Bug fix: without type=int, a timeout given on the command line
        # arrived as a str (the help text promises an integer, and
        # thread_count already uses type=int).
        type=int,
        default=15,
        help='''
        Integer number of seconds to use as HTTP request timeout for each download.
        ''',
    )
    parser.add_argument(
        '--headers', nargs='+', default=None,
        help='''
        HTTP headers to add to your request. There are many ways to specify headers:

        You can provide multiple command line arguments where the first is a key,
        the second is its value, the third is another key, the fourth is its value...

        You can provide a single command line argument which is a JSON string containing
        key:value pairs.

        You can provide a single command line argument which is a filename.
        The file can be a JSON file, or alternatively the file should have each
        key:value on a separate line and a colon should separate each key from its value.
        ''',
    )
    parser.set_defaults(func=threaded_dl_argparse)

    return betterhelp.go(parser, argv)
|
2020-09-29 21:27:48 +00:00
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Exit the process with the status code returned by main().
    raise SystemExit(main(sys.argv[1:]))
|