else/Downloady/downloady.py

437 lines
14 KiB
Python
Raw Normal View History

2016-07-28 03:41:13 +00:00
import argparse
import os
2016-12-25 03:18:23 +00:00
import pyperclip
2016-07-28 03:41:13 +00:00
import requests
2016-08-18 01:24:38 +00:00
import sys
2016-07-28 03:41:13 +00:00
import time
import urllib
import warnings
2016-08-18 01:24:38 +00:00
2016-12-25 03:18:23 +00:00
# pip install voussoirkit
2016-12-02 06:37:07 +00:00
from voussoirkit import bytestring
from voussoirkit import ratelimiter
from voussoirkit import clipext
2016-08-18 01:24:38 +00:00
2016-07-28 03:41:13 +00:00
# Suppress every warning category for the rest of the process.
warnings.simplefilter('ignore')

# Default headers; request() adds these for any key the caller did not supply.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'
    ),
}

# Characters that sanitize_filename removes from local filenames.
FILENAME_BADCHARS = '*?"<>|\r'

last_request = 0

# Size of each chunk pulled from the response stream by download_plan.
CHUNKSIZE = 4 * bytestring.KIBIBYTE

# Default `timeout` value for request() and prepare_plan().
TIMEOUT = 60

# Suffix of the in-progress file; download_plan renames it to the real name
# once the download has finished.
TEMP_EXTENSION = '.downloadytemp'

PRINT_LIMITER = ratelimiter.Ratelimiter(allowance=5, mode='reject')
2016-07-28 03:41:13 +00:00
2016-07-29 20:39:04 +00:00
def download_file(
    url,
    localname=None,
    auth=None,
    bytespersecond=None,
    callback_progress=None,
    do_head=True,
    headers=None,
    overwrite=False,
    raise_for_undersized=True,
    timeout=None,
    verbose=False,
):
    '''
    Download `url` to the local disk.

    url:
        The address to fetch. Passed through sanitize_url first.
    localname:
        Destination path. When None or '', the name is derived from the URL.
        When it names an existing directory, the URL-derived name is placed
        inside that directory.
    auth, headers, timeout:
        Forwarded to the HTTP requests via the plan.
    bytespersecond:
        None, a Ratelimiter, or a value used as a Ratelimiter allowance.
    callback_progress:
        Called once with the total byte count; the object it returns
        receives .step(bytes_downloaded) after every chunk.
    do_head:
        Whether prepare_plan should probe the server before downloading.
    overwrite:
        Whether an existing destination file may be replaced.
    raise_for_undersized:
        Whether a short download raises an exception.
    verbose:
        Print the resolved URL and destination before starting.

    Returns whatever download_plan returns, or None when prepare_plan
    decides there is nothing to do.
    '''
    if not headers:
        headers = {}
    url = sanitize_url(url)

    if localname is None or localname == '':
        localname = basename_from_url(url)
    if os.path.isdir(localname):
        localname = os.path.join(localname, basename_from_url(url))

    localname = sanitize_filename(localname)
    # The null device is kept verbatim rather than being made absolute.
    if localname != os.devnull:
        localname = os.path.abspath(localname)

    if verbose:
        safeprint(' URL:', url)
        safeprint('File:', localname)

    plan_options = {
        'auth': auth,
        'bytespersecond': bytespersecond,
        'callback_progress': callback_progress,
        'do_head': do_head,
        'headers': headers,
        'overwrite': overwrite,
        'raise_for_undersized': raise_for_undersized,
        'timeout': timeout,
    }
    plan = prepare_plan(url, localname, **plan_options)

    if plan is None:
        return None
    return download_plan(plan)
def download_plan(plan):
    '''
    Carry out a plan produced by prepare_plan.

    The plan is a dict; this function reads the keys 'download_into',
    'real_localname', 'seek_to', 'header_range_min', 'header_range_max',
    'plan_type', 'url', 'auth', 'headers', 'timeout', 'remote_total_bytes',
    'callback_progress', 'limiter' and 'raise_for_undersized'.

    The response body is written into plan['download_into'] starting at
    plan['seek_to']. Afterwards the file is renamed to
    plan['real_localname'] if the two differ.

    Returns plan['real_localname'].

    Raises Exception when the finished file is smaller than the expected
    total, the plan is not 'partial', and plan['raise_for_undersized'] is
    truthy.
    '''
    localname = plan['download_into']
    directory = os.path.split(localname)[0]
    if directory != '':
        os.makedirs(directory, exist_ok=True)
    # Make sure the file exists so it can be opened in 'r+b' below.
    touch(localname)

    if plan['header_range_min'] is not None:
        plan['headers']['range'] = 'bytes={min}-{max}'.format(
            min=plan['header_range_min'],
            max=plan['header_range_max'],
        )

    # Resumed and partial downloads start counting from their offset.
    if plan['plan_type'] in ('resume', 'partial'):
        bytes_downloaded = plan['seek_to']
    else:
        bytes_downloaded = 0

    download_stream = request(
        'get',
        plan['url'],
        stream=True,
        auth=plan['auth'],
        headers=plan['headers'],
        timeout=plan['timeout'],
    )
    try:
        if plan['remote_total_bytes'] is None:
            # Since we didn't do a head, let's fill this in now.
            plan['remote_total_bytes'] = int(download_stream.headers.get('Content-Length', 0))

        callback_progress = plan['callback_progress']
        if callback_progress is not None:
            callback_progress = callback_progress(plan['remote_total_bytes'])

        # The context manager closes the handle even if a chunk fails.
        with open(localname, 'r+b') as file_handle:
            file_handle.seek(plan['seek_to'])
            # Drop stale bytes past the write position so a leftover, larger
            # temp file cannot leave garbage after the new data. Partial
            # plans write into the middle of the real file and must keep it.
            # The null device is left alone.
            if plan['plan_type'] != 'partial' and plan['real_localname'] != os.devnull:
                file_handle.truncate()

            for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
                bytes_downloaded += len(chunk)
                file_handle.write(chunk)
                if callback_progress is not None:
                    callback_progress.step(bytes_downloaded)

                # No point in sleeping after the final chunk.
                if plan['limiter'] is not None and bytes_downloaded < plan['remote_total_bytes']:
                    plan['limiter'].limit(len(chunk))
    finally:
        download_stream.close()

    # Don't try to rename /dev/null
    if os.devnull not in [localname, plan['real_localname']]:
        localsize = os.path.getsize(localname)
        undersized = plan['plan_type'] != 'partial' and localsize < plan['remote_total_bytes']
        if plan['raise_for_undersized'] and undersized:
            message = 'File does not contain expected number of bytes. Received {size} / {total}'
            message = message.format(size=localsize, total=plan['remote_total_bytes'])
            raise Exception(message)

        if localname != plan['real_localname']:
            os.rename(localname, plan['real_localname'])

    return plan['real_localname']
def prepare_plan(
    url,
    localname,
    auth=None,
    bytespersecond=None,
    callback_progress=None,
    do_head=True,
    headers=None,
    overwrite=False,
    raise_for_undersized=True,
    timeout=TIMEOUT,
):
    '''
    Look at the local files and (optionally) the server, and decide how the
    download should proceed.

    Returns None when the destination exists, overwrite is False and no
    range header was supplied. Otherwise returns a plan dict for
    download_plan whose 'plan_type' is one of:

    'fulldownload': write the whole body into the temp file from byte 0.
    'resume': continue an existing, non-empty temp file from its current size.
    'partial': write the user-requested byte range straight into the real file.

    When a non-empty temp file exists but resuming is not possible (the
    server did not honor the range probe, or do_head is False), the plan
    falls back to 'fulldownload'.

    If overwrite is truthy and the destination exists, the destination is
    removed here, before the plan is returned.

    Raises Exception when a range header was supplied but range support
    could not be confirmed.
    '''
    # Chapter 1: File existence
    # Work on a private copy so neither the range probe below nor the range
    # bookkeeping in download_plan modifies the caller's dict.
    headers = dict(headers or {})
    user_provided_range = 'range' in headers
    real_localname = localname
    temp_localname = localname + TEMP_EXTENSION
    real_exists = os.path.exists(real_localname)
    if real_exists and overwrite is False and not user_provided_range:
        print('File exists and overwrite is off. Nothing to do.')
        return None
    temp_exists = os.path.exists(temp_localname)
    temp_localsize = os.path.getsize(temp_localname) if temp_exists else 0

    # Chapter 2: Ratelimiting
    if bytespersecond is None:
        limiter = None
    elif isinstance(bytespersecond, ratelimiter.Ratelimiter):
        limiter = bytespersecond
    else:
        limiter = ratelimiter.Ratelimiter(allowance=bytespersecond)

    # Chapter 3: Extracting range
    if user_provided_range:
        user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
        user_range_max = headers['range'].split('-')[1]
        # An open-ended range ("bytes=100-") keeps '' as its maximum.
        if user_range_max != '':
            user_range_max = int(user_range_max)
    else:
        user_range_min = None
        user_range_max = None

    # Chapter 4: Server range support
    if do_head:
        # Always include a range on the first request to figure out whether the
        # server supports it. Use 0- to get correct remote_total_bytes.
        # The probe gets its own header dict so the user's range survives.
        probe_headers = dict(headers)
        probe_headers['range'] = 'bytes=0-'
        # I'm using a GET instead of an actual HEAD here because some servers respond
        # differently, even though they're not supposed to.
        head = request(
            'get',
            url,
            stream=True,
            headers=probe_headers,
            auth=auth,
            timeout=timeout,
        )
        remote_total_bytes = int(head.headers.get('content-length', 0))
        server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
        head.connection.close()
    else:
        remote_total_bytes = None
        server_respects_range = False

    if user_provided_range and not server_respects_range:
        if not do_head:
            raise Exception('Cannot determine range support without the head request')
        else:
            raise Exception('Server did not respect your range header')

    # Chapter 5: Plan definitions
    plan_base = {
        'url': url,
        'auth': auth,
        'callback_progress': callback_progress,
        'limiter': limiter,
        'headers': headers,
        'real_localname': real_localname,
        'raise_for_undersized': raise_for_undersized,
        'remote_total_bytes': remote_total_bytes,
        'timeout': timeout,
    }
    plan_fulldownload = dict(
        plan_base,
        download_into=temp_localname,
        header_range_min=None,
        header_range_max=None,
        plan_type='fulldownload',
        seek_to=0,
    )
    plan_resume = dict(
        plan_base,
        download_into=temp_localname,
        header_range_min=temp_localsize,
        header_range_max='',
        plan_type='resume',
        seek_to=temp_localsize,
    )
    plan_partial = dict(
        plan_base,
        download_into=real_localname,
        header_range_min=user_range_min,
        header_range_max=user_range_max,
        plan_type='partial',
        seek_to=user_range_min,
    )

    # Chapter 6: Redeem your meal vouchers here
    if real_exists:
        if overwrite:
            os.remove(real_localname)

        if user_provided_range:
            return plan_partial

        return plan_fulldownload

    elif temp_exists and temp_localsize > 0:
        if overwrite:
            return plan_fulldownload

        if user_provided_range:
            return plan_partial

        if server_respects_range:
            print('Resume from byte %d' % plan_resume['seek_to'])
            return plan_resume

        # The leftover temp file cannot be resumed, so start over.
        return plan_fulldownload

    else:
        if user_provided_range:
            return plan_partial

        return plan_fulldownload
2016-09-05 23:37:07 +00:00
class Progress1:
    '''
    Progress callback that draws a 50-cell status bar on a single line:

        <downloaded> <bar> <total>

    Instantiate with the expected total size, then call step() with the
    running byte count.
    '''
    def __init__(self, total_bytes):
        self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
        self.limiter.balance = 1
        # Clamp to 1 so step() never divides by zero.
        self.total_bytes = max(1, total_bytes)
        self.divisor = bytestring.get_appropriate_divisor(total_bytes)
        self.total_format = bytestring.bytestring(total_bytes, force_unit=self.divisor)
        # Right-align the running count to the width of the total so the bar
        # does not shift as the number grows.
        self.downloaded_format = '{:>%d}' % len(self.total_format)
        self.blank_char = ' '
        # U+2588 FULL BLOCK. An empty string here would leave the bar
        # permanently blank, since filled cells are built by repeating it.
        self.solid_char = '█'

    def step(self, bytes_downloaded):
        '''
        Redraw the bar for `bytes_downloaded`. Redraws that the limiter
        rejects are skipped, except for the final one at 100%.
        '''
        percent = bytes_downloaded / self.total_bytes
        percent = min(1, percent)
        # The limiter is consulted on every call; the finished state is
        # always printed regardless of its answer.
        if self.limiter.limit(1) is False and percent < 1:
            return

        downloaded_string = bytestring.bytestring(bytes_downloaded, force_unit=self.divisor)
        downloaded_string = self.downloaded_format.format(downloaded_string)

        block_count = 50
        solid_blocks = self.solid_char * int(block_count * percent)
        statusbar = solid_blocks.ljust(block_count, self.blank_char)
        # Frame the bar with one solid cell on each side.
        statusbar = self.solid_char + statusbar + self.solid_char

        # Stay on the same line until the download is complete.
        end = '\n' if percent == 1 else ''
        message = '\r{bytes_downloaded} {statusbar} {total_bytes}'
        message = message.format(
            bytes_downloaded=downloaded_string,
            total_bytes=self.total_format,
            statusbar=statusbar,
        )
        print(message, end=end, flush=True)
class Progress2:
    '''
    Progress callback that prints a numeric status on a single line:

        <downloaded> / <total> / <percent>%

    Instantiate with the expected total size, then call step() with the
    running byte count.
    '''
    def __init__(self, total_bytes):
        # Clamp to 1 so step() never divides by zero.
        self.total_bytes = max(1, total_bytes)
        self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
        self.limiter.balance = 1
        self.total_bytes_string = '{:,}'.format(self.total_bytes)
        # Pad the running count to the width of the total.
        self.bytes_downloaded_string = '{:%d,}' % len(self.total_bytes_string)

    def step(self, bytes_downloaded):
        '''
        Reprint the status for `bytes_downloaded`. Reprints that the limiter
        rejects are skipped, except for the final one at 100%.
        '''
        percent = min(100, (bytes_downloaded * 100) / self.total_bytes)

        # The limiter is consulted on every call, finished or not.
        rejected = self.limiter.limit(1) is False
        if rejected and percent < 100:
            return

        if percent == 100:
            line_end = '\n'
        else:
            line_end = ''

        template = '\r{bytes_downloaded} / {total_bytes} / {percent}%'
        status = template.format(
            bytes_downloaded=self.bytes_downloaded_string.format(bytes_downloaded),
            total_bytes=self.total_bytes_string,
            percent='%08.4f' % percent,
        )
        print(status, end=line_end, flush=True)
def basename_from_url(url):
    '''
    Determine the local filename appropriate for a URL.

    The query string is cut off first, then the remainder is
    percent-decoded, and finally everything up to the last '/' is dropped.
    Cutting before decoding keeps an encoded '?' (%3F) inside the filename
    from being mistaken for the start of the query string.
    '''
    # Imported here because a bare `import urllib` does not guarantee that
    # the `urllib.parse` submodule has been loaded.
    from urllib.parse import unquote

    without_query = url.split('?')[0]
    localname = unquote(without_query)
    localname = localname.split('/')[-1]
    return localname
2016-08-18 01:24:38 +00:00
2016-07-28 03:41:13 +00:00
def get_permission(prompt='y/n\n>', affirmative=('y', 'yes')):
    '''
    Ask the user a question on stdin and report whether they agreed.

    prompt:
        Text shown by input().
    affirmative:
        Container of lowercase answers that count as agreement.

    Returns True when the answer, lowercased and with surrounding
    whitespace removed, is in `affirmative`; otherwise False.
    '''
    permission = input(prompt)
    return permission.strip().lower() in affirmative
def request(method, url, stream=False, headers=None, timeout=TIMEOUT, **kwargs):
    '''
    Perform an HTTP request through a fresh requests.Session and return the
    response.

    method:
        'get', 'head' or 'post'. Any other value raises KeyError.
    headers:
        Optional mapping of request headers. Entries from HEADERS are added
        for keys the caller did not supply. The caller's mapping itself is
        not modified.
    stream, timeout, **kwargs:
        Forwarded to the session method.

    raise_for_status() is called on the response before it is returned.
    '''
    # Start from the defaults and let the caller's values win, without
    # writing into the caller's dict.
    merged_headers = dict(HEADERS)
    if headers is not None:
        merged_headers.update(headers)

    session = requests.Session()
    a = requests.adapters.HTTPAdapter(max_retries=30)
    b = requests.adapters.HTTPAdapter(max_retries=30)
    session.mount('http://', a)
    session.mount('https://', b)
    session.max_redirects = 40

    method = {
        'get': session.get,
        'head': session.head,
        'post': session.post,
    }[method]

    req = method(url, stream=stream, headers=merged_headers, timeout=timeout, **kwargs)
    req.raise_for_status()
    return req
2016-10-22 03:47:08 +00:00
def safeprint(*texts, **kwargs):
    '''
    Print like print(), but with every argument converted to str and every
    non-ASCII character replaced by '?'. Keyword arguments are passed to
    print() unchanged.
    '''
    ascii_texts = []
    for text in texts:
        encoded = str(text).encode('ascii', 'replace')
        ascii_texts.append(encoded.decode())
    print(*ascii_texts, **kwargs)
2016-08-18 01:24:38 +00:00
def sanitize_filename(text, exclusions=''):
    '''
    Remove every FILENAME_BADCHARS character from `text`.

    exclusions:
        String of characters that should be left in place even though they
        appear in FILENAME_BADCHARS. Each character is considered on its
        own, so they need not be adjacent or in any particular order.

    Returns the cleaned string.
    '''
    removable = ''.join(char for char in FILENAME_BADCHARS if char not in exclusions)
    # One pass over the text instead of one replace() per character.
    return text.translate(str.maketrans('', '', removable))
def sanitize_url(url):
    '''
    Undo a percent-encoded colon in the scheme separator, turning every
    '%3A//' in `url` back into '://'.
    '''
    return url.replace('%3A//', '://')
2016-07-28 03:41:13 +00:00
def touch(filename):
    '''
    Make sure `filename` exists by opening it in binary append mode and
    closing it again. Existing content is left as it is.
    '''
    with open(filename, 'ab'):
        pass
def download_argparse(args):
    '''
    Translate parsed command-line arguments into a download_file call.

    Reads args.url, args.localname, args.callback, args.bytespersecond,
    args.range, args.overwrite, args.timeout and args.no_head.
    '''
    url = clipext.resolve(args.url)

    # '1' and '2' (and None) select a built-in progress class; any other
    # value is passed along untouched.
    progress_choices = {
        None: Progress1,
        '1': Progress1,
        '2': Progress2,
    }
    callback = progress_choices.get(args.callback, args.callback)

    if args.bytespersecond is None:
        bytespersecond = None
    else:
        bytespersecond = bytestring.parsebytes(args.bytespersecond)

    if args.range is None:
        headers = {}
    else:
        headers = {'range': 'bytes=%s' % args.range}

    download_file(
        url=url,
        localname=args.localname,
        bytespersecond=bytespersecond,
        callback_progress=callback,
        do_head=args.no_head is False,
        headers=headers,
        overwrite=args.overwrite,
        timeout=int(args.timeout),
        verbose=True,
    )
2016-08-18 01:24:38 +00:00
2016-07-28 03:41:13 +00:00
if __name__ == '__main__':
    # Command-line entry point: url [localname] plus optional flags.
    # Each entry is (positional names or flags, add_argument keyword options).
    argument_specs = [
        (('url',), {}),
        (('localname',), {'nargs': '?', 'default': None}),
        (('-c', '--callback'), {'dest': 'callback', 'default': Progress1}),
        (('-bps', '--bytespersecond'), {'dest': 'bytespersecond', 'default': None}),
        (('-ow', '--overwrite'), {'dest': 'overwrite', 'action': 'store_true'}),
        (('-r', '--range'), {'dest': 'range', 'default': None}),
        (('--timeout',), {'dest': 'timeout', 'default': TIMEOUT}),
        (('--no-head',), {'dest': 'no_head', 'action': 'store_true'}),
    ]

    parser = argparse.ArgumentParser()
    for (flags, options) in argument_specs:
        parser.add_argument(*flags, **options)
    parser.set_defaults(func=download_argparse)

    args = parser.parse_args()
    args.func(args)