Ethan Dalool
18de52ecb7
This worked fine before and then I messed it up but I'm not in the mood to go back and look at the old code.
516 lines
16 KiB
Python
516 lines
16 KiB
Python
import argparse
|
|
import io
|
|
import os
|
|
import requests
|
|
import sys
|
|
import time
|
|
import urllib
|
|
|
|
from voussoirkit import bytestring
|
|
from voussoirkit import dotdict
|
|
from voussoirkit import httperrors
|
|
from voussoirkit import pathclass
|
|
from voussoirkit import pipeable
|
|
from voussoirkit import progressbars
|
|
from voussoirkit import ratelimiter
|
|
from voussoirkit import sentinel
|
|
from voussoirkit import vlogging
|
|
|
|
# Module-level logger used by the functions in this file. The second argument
# is presumably the name vlogging uses when this file runs as __main__
# (TODO confirm against vlogging.getLogger).
log = vlogging.getLogger(__name__, 'downloady')
|
|
|
|
# The User-Agent string sent with requests. It is written on two lines for
# readability; the newline is replaced with a single space. The text inside
# the triple quotes must not contain quote characters of its own, otherwise
# they become part of the header value.
USERAGENT = '''
Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/42.0.2311.152 Safari/537.36
'''.strip().replace('\n', ' ')

# Default headers. request() adds each of these to the outgoing headers unless
# the caller already supplied a value for the same key.
HEADERS = {
    'User-Agent': USERAGENT,
}
|
|
|
|
# Characters which sanitize_filename removes from filenames.
FILENAME_BADCHARS = '*?"<>|\r\n'

# When using dynamic chunk sizing, this is the ideal time to process a
# single chunk, in seconds.
IDEAL_CHUNK_TIME = 0.2

# Default timeout for HTTP requests made by this module.
TIMEOUT = 60

# Extension added to the destination filename while the download is still in
# progress. download_plan renames the file to its real name when finished.
TEMP_EXTENSION = '.downloadytemp'
|
|
|
|
# Filenames that get SpecialPath treatment instead of being handled as regular
# download destinations: the reserved names from pathclass on Windows, and
# os.devnull everywhere else. They are normcased here so that is_special_file
# can compare against normcased input.
SPECIAL_FILENAMES = [
    os.path.normcase(name)
    for name in (pathclass.WINDOWS_RESERVED_NAMES if os.name == 'nt' else [os.devnull])
]
|
|
|
|
# Returned by prepare_plan (and passed through by download_file) when the
# destination file already exists, overwrite is off, and no range was
# requested. Constructed with truthyness=False, presumably so that it
# evaluates as falsy (TODO confirm against sentinel.Sentinel).
FILE_EXISTS = sentinel.Sentinel('file exists no overwrite', truthyness=False)
|
|
|
|
class DownloadyException(Exception):
    '''
    Base class for the exceptions defined in this module.
    '''
|
|
|
|
class NotEnoughBytes(DownloadyException):
    '''
    Raised by download_plan when the downloaded file is smaller than the
    size reported by the server and raise_for_undersized is on.
    '''
|
|
|
|
class ServerNoRange(DownloadyException):
    '''
    Raised by prepare_plan when the caller supplied a range header but the
    server did not honor range requests.
    '''
|
|
|
|
class SpecialPath:
    '''
    Wrapper for special destinations such as /dev/null and Windows's nul.

    Regular download paths are first written under a temporary file extension
    and have their parent directory created. Special paths skip all of that:
    they are never renamed, and their containing directory is neither checked
    for existence nor mkdir'ed.
    '''
    def __init__(self, path):
        # Kept exactly as given; no normalization is applied.
        self.absolute_path = path

    def open(self, *args, **kwargs):
        '''
        Open the special path with the builtin open, forwarding all
        positional and keyword arguments, and return the file object.
        '''
        handle = open(self.absolute_path, *args, **kwargs)
        return handle
|
|
|
|
def download_file(
        url,
        localname=None,
        auth=None,
        bytespersecond=None,
        progressbar=None,
        do_head=True,
        headers=None,
        overwrite=False,
        raise_for_undersized=True,
        ratemeter=None,
        timeout=None,
        verbose=False,
        verify_ssl=True,
    ):
    '''
    Plan the download of `url` with prepare_plan, then carry the plan out
    with download_plan.

    Every argument except `verbose` is forwarded to prepare_plan unchanged;
    see that function for their meaning. `verbose` is accepted but is not
    used by this function.

    Note that `timeout` defaults to None here and that None is forwarded
    as-is, so prepare_plan's own default does not apply.

    Returns:
        The sentinel from prepare_plan if planning produced a
        sentinel.Sentinel (for example FILE_EXISTS), otherwise the return
        value of download_plan.
    '''
    plan_options = {
        'auth': auth,
        'bytespersecond': bytespersecond,
        'progressbar': progressbar,
        'do_head': do_head,
        'headers': headers,
        'overwrite': overwrite,
        'raise_for_undersized': raise_for_undersized,
        'ratemeter': ratemeter,
        'timeout': timeout,
        'verify_ssl': verify_ssl,
    }
    plan = prepare_plan(url, localname, **plan_options)

    # A real plan gets executed; a sentinel means there is nothing to do.
    if not isinstance(plan, sentinel.Sentinel):
        return download_plan(plan)

    return plan
|
|
|
|
def download_plan(plan):
    '''
    Carry out a download plan created by prepare_plan.

    The bytes are streamed from plan.url into plan.download_into, which may
    be a pathclass.Path, a SpecialPath, or a file object supplied by the
    user. When plan.download_into is a path that differs from
    plan.real_localname, it is renamed to plan.real_localname at the end.

    Along the way, plan.progressbar is stepped with the running byte total,
    plan.limiter is told about each chunk so it can throttle, and
    plan.ratemeter is fed each chunk's size.

    Returns:
        plan.real_localname.

    Raises:
        DownloadyException: if plan.plan_type is not one of 'resume',
            'partial', 'fulldownload'.
        NotEnoughBytes: if the plan is not 'partial', the remote size is
            known, the file on disk is smaller than it, and
            plan.raise_for_undersized is on.
    '''
    if isinstance(plan.download_into, pathclass.Path):
        plan.download_into.parent.makedirs(exist_ok=True)
        plan.download_into.touch()

    # A file object given by the user must never be closed by us.
    user_owns_handle = isinstance(plan.download_into, io.IOBase)

    if plan.plan_type in ['resume', 'partial']:
        if user_owns_handle:
            file_handle = plan.download_into
            if plan.seek_to > 0:
                file_handle.seek(plan.seek_to)
        else:
            # r+b so that the bytes already on disk are kept.
            file_handle = plan.download_into.open('r+b')
            file_handle.seek(plan.seek_to)
        bytes_downloaded = plan.seek_to

    elif plan.plan_type == 'fulldownload':
        if user_owns_handle:
            file_handle = plan.download_into
        else:
            file_handle = plan.download_into.open('wb')
        bytes_downloaded = 0

    else:
        message = 'Unrecognized plan type {type!r}.'.format(type=plan.plan_type)
        raise DownloadyException(message)

    download_stream = None
    try:
        # Work on a copy. plan.headers can be the very dict that the caller
        # passed in, and writing our range into it would make the caller's
        # next attempt look like a user-requested partial download.
        headers = {} if plan.headers is None else plan.headers.copy()
        if plan.header_range_min is not None:
            headers['range'] = 'bytes={min}-{max}'.format(
                min=plan.header_range_min,
                max=plan.header_range_max,
            )

        log.info('Downloading %s into "%s"', plan.url, plan.real_localname)

        download_stream = request(
            'get',
            plan.url,
            stream=True,
            auth=plan.auth,
            headers=headers,
            timeout=plan.timeout,
            verify_ssl=plan.verify_ssl,
        )

        if plan.remote_total_bytes is None:
            # Since we didn't do a head, let's fill this in now.
            content_length = download_stream.headers.get('Content-Length', None)
            plan.remote_total_bytes = None if content_length is None else int(content_length)

        progressbar = progressbars.normalize_progressbar(
            plan.progressbar,
            total=plan.remote_total_bytes,
        )

        if plan.limiter is not None:
            # Never start below one byte: reading zero bytes returns an empty
            # chunk, which the loop below would mistake for end of stream.
            chunk_size = max(1, int(plan.limiter.allowance * IDEAL_CHUNK_TIME))
        else:
            chunk_size = 128 * bytestring.KIBIBYTE

        while True:
            chunk_start = time.perf_counter()
            chunk = download_stream.raw.read(chunk_size)
            chunk_bytes = len(chunk)
            if chunk_bytes == 0:
                break

            file_handle.write(chunk)
            bytes_downloaded += chunk_bytes

            if progressbar is not None:
                progressbar.step(bytes_downloaded)

            if plan.limiter is not None:
                plan.limiter.limit(chunk_bytes)

            if plan.ratemeter is not None:
                plan.ratemeter.digest(chunk_bytes)

            # The measured time includes any sleeping done by the limiter, so
            # the chunk size adapts to the throttled rate as well.
            chunk_time = time.perf_counter() - chunk_start
            chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME)

        if progressbar is not None:
            progressbar.done()
    finally:
        # Release the connection and our own file handle whether the download
        # finished or failed, so that a retry starts from a clean state.
        if download_stream is not None:
            download_stream.close()
        if not user_owns_handle:
            file_handle.close()

    # The user's file handle is returned to them still open.
    if isinstance(plan.real_localname, io.IOBase):
        return plan.real_localname

    # Don't try to measure or rename /dev/null or other special names
    if isinstance(plan.real_localname, SpecialPath):
        return plan.real_localname

    temp_localsize = plan.download_into.size
    undersized = (
        plan.plan_type != 'partial' and
        plan.remote_total_bytes is not None and
        temp_localsize < plan.remote_total_bytes
    )
    if undersized and plan.raise_for_undersized:
        message = 'File does not contain expected number of bytes. Received {size} / {total}'
        message = message.format(size=temp_localsize, total=plan.remote_total_bytes)
        raise NotEnoughBytes(message)

    if plan.download_into != plan.real_localname:
        os.rename(plan.download_into, plan.real_localname)

    return plan.real_localname
|
|
|
|
def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time):
    '''
    Suggest the next chunk size, given how long the previous chunk of
    `chunk_size` bytes took compared to how long we wish it had taken.

    The result is an int of at least 1.
    '''
    # A chunk that took `ratio` times the ideal duration should have been
    # `ratio` times smaller. The ratio is clamped to [0.5, 2] so a single
    # outlier can at most halve or double the suggestion.
    ratio = chunk_time / ideal_chunk_time
    ratio = max(min(ratio, 2), 0.5)
    ideal_size = chunk_size / ratio

    # Weighted average with the current size counted twice, so that small
    # fluctuations don't send the needle bouncing all over.
    blended = int((2 * chunk_size + ideal_size) / 3)

    # A suggestion of zero is unlikely in practice, but enforce a one-byte
    # minimum anyway.
    if blended < 1:
        return 1
    return blended
|
|
|
|
def prepare_plan(
        url,
        localname,
        auth=None,
        bytespersecond=None,
        progressbar=None,
        do_head=True,
        headers=None,
        overwrite=False,
        raise_for_undersized=True,
        ratemeter=None,
        timeout=TIMEOUT,
        verify_ssl=True,
    ):
    '''
    Inspect the local destination and the remote server, and decide how the
    download should proceed.

    url:
        The URL to download. It is passed through sanitize_url.
    localname:
        Where to save the download. None or '' means to derive the name from
        the URL. A file object is written to directly. A special filename
        (see is_special_file) is wrapped in SpecialPath. Anything else is
        treated as a path: if it is a directory, the basename from the URL is
        added to it, and the result is cleaned with sanitize_filename. Regular
        paths are downloaded under TEMP_EXTENSION first.
    auth:
        Passed along to the HTTP requests.
    bytespersecond:
        None for no limit, an existing ratelimiter.Ratelimiter, or a value
        used as the allowance of a new Ratelimiter.
    progressbar, ratemeter, raise_for_undersized:
        Stored on the plan for download_plan to use.
    do_head:
        If True, a probe request is made to learn the remote size and whether
        the server honors range requests. Required if a range is supplied.
    headers:
        Optional dict of request headers. A lowercase 'range' key of the form
        'bytes=min-max' marks the download as a user-requested partial
        download. The dict is copied, not kept.
    overwrite:
        If True, an existing destination file is removed and downloaded
        again, and an existing temporary file is started over.
    timeout, verify_ssl:
        Used for the probe request and stored on the plan for the download.

    Returns:
        FILE_EXISTS if the destination exists, overwrite is off, and no range
        was supplied. Otherwise a DotDict plan whose plan_type is one of
        'fulldownload', 'resume', 'partial'.

    Raises:
        DownloadyException: if a range is supplied together with
            do_head=False.
        ServerNoRange: if a range is supplied but the server did not honor
            the probe's range request.
    '''
    # Chapter 1: File existence
    # Copy so that the plan never holds the caller's own dict.
    headers = headers.copy() if headers else {}
    user_provided_range = 'range' in headers

    url = sanitize_url(url)
    if localname in [None, '']:
        localname = basename_from_url(url)

    if isinstance(localname, io.IOBase):
        real_localname = localname
        temp_localname = localname
        real_exists = False
        temp_exists = False
    elif is_special_file(localname):
        real_localname = SpecialPath(localname)
        temp_localname = SpecialPath(localname)
        real_exists = False
        temp_exists = False
    else:
        localname = pathclass.Path(localname)
        if localname.is_dir:
            localname = localname.with_child(basename_from_url(url))
        localname = sanitize_filename(localname.absolute_path)
        real_localname = pathclass.Path(localname)
        temp_localname = real_localname.add_extension(TEMP_EXTENSION)
        real_exists = real_localname.exists
        temp_exists = temp_localname.exists

    if real_exists and overwrite is False and not user_provided_range:
        log.debug('File exists and overwrite is off. Nothing to do.')
        return FILE_EXISTS

    if isinstance(real_localname, SpecialPath):
        temp_localsize = 0
    elif isinstance(real_localname, io.IOBase):
        try:
            temp_localsize = real_localname.tell()
        except io.UnsupportedOperation:
            temp_localsize = 0
    else:
        temp_localsize = int(temp_exists and temp_localname.size)

    # Chapter 2: Ratelimiting
    if bytespersecond is None:
        limiter = None
    elif isinstance(bytespersecond, ratelimiter.Ratelimiter):
        limiter = bytespersecond
    else:
        limiter = ratelimiter.Ratelimiter(allowance=bytespersecond)

    # Chapter 3: Extracting range
    if user_provided_range:
        user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
        user_range_max = headers['range'].split('-')[1]
        # An open-ended range like 'bytes=100-' leaves the max as ''.
        if user_range_max != '':
            user_range_max = int(user_range_max)
    else:
        user_range_min = None
        user_range_max = None

    # Chapter 4: Server range support
    # Always include a range on the first request to figure out whether the
    # server supports it. Use 0- to get correct remote_total_bytes
    if user_provided_range and not do_head:
        raise DownloadyException('Cannot determine range support without the head request')

    temp_headers = headers.copy()
    temp_headers.update({'range': 'bytes=0-'})

    if do_head:
        # I'm using a GET instead of an actual HEAD here because some servers respond
        # differently, even though they're not supposed to.
        # The probe must obey the same SSL and timeout settings as the real
        # download. A timeout of None keeps the module default for the probe
        # so that this small request cannot hang forever.
        probe_timeout = TIMEOUT if timeout is None else timeout
        head = request(
            'get',
            url,
            stream=True,
            headers=temp_headers,
            auth=auth,
            timeout=probe_timeout,
            verify_ssl=verify_ssl,
        )
        remote_total_bytes = head.headers.get('content-length', None)
        remote_total_bytes = None if remote_total_bytes is None else int(remote_total_bytes)
        server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
        head.connection.close()
    else:
        remote_total_bytes = None
        server_respects_range = False

    if user_provided_range and not server_respects_range:
        raise ServerNoRange('Server did not respect your range header')

    # Chapter 5: Plan definitions
    plan_base = {
        'url': url,
        'auth': auth,
        'progressbar': progressbar,
        'limiter': limiter,
        'headers': headers,
        'real_localname': real_localname,
        'raise_for_undersized': raise_for_undersized,
        'ratemeter': ratemeter,
        'remote_total_bytes': remote_total_bytes,
        'timeout': timeout,
        'verify_ssl': verify_ssl,
    }
    plan_fulldownload = dotdict.DotDict(
        plan_base,
        download_into=temp_localname,
        header_range_min=None,
        header_range_max=None,
        plan_type='fulldownload',
        seek_to=0,
    )
    plan_resume = dotdict.DotDict(
        plan_base,
        download_into=temp_localname,
        header_range_min=temp_localsize,
        header_range_max='',
        plan_type='resume',
        seek_to=temp_localsize,
    )
    plan_partial = dotdict.DotDict(
        plan_base,
        download_into=real_localname,
        header_range_min=user_range_min,
        header_range_max=user_range_max,
        plan_type='partial',
        seek_to=user_range_min,
    )

    # Chapter 6: Redeem your meal vouchers here
    if real_exists:
        if overwrite:
            os.remove(real_localname)

        if user_provided_range:
            return plan_partial

        return plan_fulldownload

    if temp_exists and temp_localsize > 0:
        if overwrite:
            return plan_fulldownload

        if user_provided_range:
            return plan_partial

        if server_respects_range:
            log.info('Resume from byte %d', plan_resume.seek_to)
            return plan_resume

        # There is a partial file but no way to continue it, either because
        # the server ignores ranges or because the probe was skipped. Start
        # over rather than giving up; the full download truncates the
        # temporary file.
        log.info('Cannot resume the partial download. Starting over.')
        return plan_fulldownload

    if user_provided_range:
        return plan_partial

    return plan_fulldownload
|
|
|
|
def basename_from_url(url):
    '''
    Determine the local filename appropriate for a URL.

    The fragment and query string are discarded, percent-escapes are decoded,
    trailing slashes are ignored, and the final path component is returned.
    For a URL with no path at all, this is the host, e.g. 'example.com'.
    '''
    # The fragment and query are cut off while the URL is still encoded, so
    # that an escaped '?' or '#' belonging to the filename is not mistaken
    # for a delimiter.
    localname = url.split('#', 1)[0]
    localname = localname.split('?', 1)[0]
    # Decoding happens before the path is split, which means an escaped
    # slash acts as a separator and cannot smuggle a path into the name.
    localname = urllib.parse.unquote(localname)
    # Trailing slashes are removed only after the query is gone, so that
    # 'http://host/dir/?page=2' yields 'dir' instead of an empty name.
    localname = localname.rstrip('/')
    localname = localname.rsplit('/', 1)[-1]
    return localname
|
|
|
|
def is_special_file(file):
    '''
    Return True if the final component of the given path string is one of
    SPECIAL_FILENAMES.

    pathclass.Path objects are never considered special.
    '''
    if isinstance(file, pathclass.Path):
        return False

    # Only the last component matters, so that something like
    # folder/<special name> is recognized too.
    final_component = os.path.normpath(file).rpartition(os.sep)[2]
    return os.path.normcase(final_component) in SPECIAL_FILENAMES
|
|
|
|
def request(method, url, headers=None, timeout=TIMEOUT, verify_ssl=True, **kwargs):
    '''
    Make an HTTP request with a fresh requests Session and return the
    response.

    method:
        One of 'get', 'head', 'post'. Any other value raises KeyError.
    headers:
        Optional dict of headers. It is copied, and the entries of HEADERS
        are added for any keys that it does not already contain.
    timeout:
        Passed to requests as timeout.
    verify_ssl:
        Passed to requests as verify.
    kwargs:
        Passed to the session's get / head / post method.

    The response is checked with httperrors.raise_for_status before being
    returned.
    '''
    headers = {} if headers is None else headers.copy()
    for (name, default) in HEADERS.items():
        headers.setdefault(name, default)

    session = requests.Session()
    # Each scheme gets its own adapter with the same retry allowance.
    for prefix in ('http://', 'https://'):
        session.mount(prefix, requests.adapters.HTTPAdapter(max_retries=30))
    session.max_redirects = 40

    senders = {
        'get': session.get,
        'head': session.head,
        'post': session.post,
    }
    send = senders[method]

    response = send(url, headers=headers, timeout=timeout, verify=verify_ssl, **kwargs)
    httperrors.raise_for_status(response)
    return response
|
|
|
|
def sanitize_filename(text, exclusions=''):
    '''
    Return `text` with the characters of FILENAME_BADCHARS removed, and with
    colons removed from everything except the drive portion of the path.

    exclusions:
        Characters from FILENAME_BADCHARS which should be left alone.
    '''
    unwanted = FILENAME_BADCHARS
    for keep in exclusions:
        unwanted = unwanted.replace(keep, '')

    text = ''.join(character for character in text if character not in unwanted)

    # A colon is only legitimate as part of the drive, e.g. C:, so the drive
    # is set aside while the colons are taken out of the rest.
    (drive, remainder) = os.path.splitdrive(text)
    return drive + remainder.replace(':', '')
|
|
|
|
def sanitize_url(url):
    '''
    Repair URLs in which the colon of the scheme separator has been
    percent-encoded, e.g. 'http%3A//example.com' -> 'http://example.com'.

    The hex digits of a percent-escape are case-insensitive, so both %3A and
    %3a are handled.
    '''
    for encoded_separator in ('%3A//', '%3a//'):
        url = url.replace(encoded_separator, '://')
    return url
|
|
|
|
def download_argparse(args):
    '''
    Command line entry point: translate the parsed arguments into a
    download_file call, retrying on failure as requested.

    Reads args.url, args.localname, args.progressbar, args.bytespersecond,
    args.range, args.retry, args.overwrite, args.timeout, args.no_head, and
    args.no_ssl.

    Returns 0 when the download call completes. If the final permitted
    attempt fails, its exception is re-raised.

    Raises:
        ValueError: if args.progressbar is not one of bar, none, off.
    '''
    url = pipeable.input(args.url, split_lines=False)

    progressbar_choice = args.progressbar.lower()
    if progressbar_choice in {'none', 'off'}:
        progressbar = None
    elif progressbar_choice == 'bar':
        progressbar = progressbars.bar1_bytestring
    else:
        # Fail with a clear message instead of leaving progressbar unbound.
        message = 'Unrecognized progressbar "{choice}". Use bar, none, or off.'
        raise ValueError(message.format(choice=args.progressbar))

    bytespersecond = args.bytespersecond
    if bytespersecond is not None:
        bytespersecond = bytestring.parsebytes(bytespersecond)

    headers = {}
    if args.range is not None:
        headers['range'] = 'bytes=%s' % args.range

    # A missing or zero retry count means a single attempt.
    retry = args.retry or 1

    retryable = (
        NotEnoughBytes,
        requests.exceptions.ConnectionError,
        requests.exceptions.ChunkedEncodingError,
    )

    while retry != 0:
        # Negative numbers permit infinite retries.
        try:
            download_file(
                url=url,
                localname=args.localname,
                bytespersecond=bytespersecond,
                progressbar=progressbar,
                do_head=args.no_head is False,
                # Each attempt gets its own copy, so that nothing written
                # into the dict during one attempt carries over to the next.
                headers=headers.copy(),
                overwrite=args.overwrite,
                timeout=args.timeout,
                verbose=True,
                verify_ssl=args.no_ssl is False,
            )
        except retryable:
            retry -= 1
            if retry == 0:
                raise
        else:
            break

    return 0
|
|
|
|
@vlogging.main_decorator
def main(argv):
    '''
    Parse the command line arguments in `argv` and run download_argparse
    with them, returning its result.
    '''
    argparser = argparse.ArgumentParser(description=__doc__)

    # Positional arguments.
    argparser.add_argument('url')
    argparser.add_argument('localname', nargs='?', default=None)

    # Options that take a value.
    argparser.add_argument('--progressbar', default='bar')
    argparser.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
    argparser.add_argument('--range', default=None)
    argparser.add_argument('--timeout', type=int, default=TIMEOUT)
    # nargs='?' lets --retry be given without a number, in which case the
    # value is None.
    argparser.add_argument('--retry', nargs='?', type=int, default=1)

    # Flags.
    argparser.add_argument('--overwrite', action='store_true')
    argparser.add_argument('--no_head', '--no-head', dest='no_head', action='store_true')
    argparser.add_argument('--no_ssl', '--no-ssl', dest='no_ssl', action='store_true')

    argparser.set_defaults(func=download_argparse)

    parsed = argparser.parse_args(argv)
    return parsed.func(parsed)

if __name__ == '__main__':
    raise SystemExit(main(sys.argv[1:]))
|