Improve handling of compressed streams.
parent
3203850c93
commit
8584dc5e7f
|
@ -5,6 +5,7 @@ import requests
|
||||||
import sys
|
import sys
|
||||||
import time
|
import time
|
||||||
import urllib
|
import urllib
|
||||||
|
import zlib
|
||||||
|
|
||||||
from voussoirkit import bytestring
|
from voussoirkit import bytestring
|
||||||
from voussoirkit import dotdict
|
from voussoirkit import dotdict
|
||||||
|
@ -18,10 +19,7 @@ from voussoirkit import vlogging
|
||||||
|
|
||||||
log = vlogging.getLogger(__name__, 'downloady')
|
log = vlogging.getLogger(__name__, 'downloady')
|
||||||
|
|
||||||
USERAGENT = '''
|
USERAGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
|
||||||
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
|
|
||||||
Chrome/42.0.2311.152 Safari/537.36'
|
|
||||||
'''.strip().replace('\n', ' ')
|
|
||||||
|
|
||||||
HEADERS = {
|
HEADERS = {
|
||||||
'User-Agent': USERAGENT,
|
'User-Agent': USERAGENT,
|
||||||
|
@ -71,6 +69,7 @@ def download_file(
|
||||||
localname=None,
|
localname=None,
|
||||||
auth=None,
|
auth=None,
|
||||||
bytespersecond=None,
|
bytespersecond=None,
|
||||||
|
custom_validator=None,
|
||||||
progressbar=None,
|
progressbar=None,
|
||||||
do_head=True,
|
do_head=True,
|
||||||
headers=None,
|
headers=None,
|
||||||
|
@ -86,6 +85,7 @@ def download_file(
|
||||||
localname,
|
localname,
|
||||||
auth=auth,
|
auth=auth,
|
||||||
bytespersecond=bytespersecond,
|
bytespersecond=bytespersecond,
|
||||||
|
custom_validator=custom_validator,
|
||||||
progressbar=progressbar,
|
progressbar=progressbar,
|
||||||
do_head=do_head,
|
do_head=do_head,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
|
@ -153,28 +153,25 @@ def download_plan(plan):
|
||||||
else:
|
else:
|
||||||
chunk_size = 128 * bytestring.KIBIBYTE
|
chunk_size = 128 * bytestring.KIBIBYTE
|
||||||
|
|
||||||
while True:
|
chunks = stream_to_chunks(download_stream, chunk_size, limiter=plan.limiter)
|
||||||
chunk_start = time.perf_counter()
|
if download_stream.headers.get('Content-Encoding') == 'gzip':
|
||||||
chunk = download_stream.raw.read(chunk_size)
|
log.debug('Content is gzipped. Decompressor active.')
|
||||||
chunk_bytes = len(chunk)
|
chunks = decompress_gzip(chunks)
|
||||||
if chunk_bytes == 0:
|
|
||||||
break
|
|
||||||
|
|
||||||
|
for chunk in chunks:
|
||||||
|
httperrors.raise_for_status(download_stream)
|
||||||
|
chunk_bytes = len(chunk)
|
||||||
|
|
||||||
|
# log.loud(f'Writing {chunk_bytes} to disk.')
|
||||||
file_handle.write(chunk)
|
file_handle.write(chunk)
|
||||||
bytes_downloaded += chunk_bytes
|
bytes_downloaded += chunk_bytes
|
||||||
|
|
||||||
if progressbar is not None:
|
if progressbar is not None:
|
||||||
progressbar.step(bytes_downloaded)
|
progressbar.step(bytes_downloaded)
|
||||||
|
|
||||||
if plan.limiter is not None:
|
|
||||||
plan.limiter.limit(chunk_bytes)
|
|
||||||
|
|
||||||
if plan.ratemeter is not None:
|
if plan.ratemeter is not None:
|
||||||
plan.ratemeter.digest(chunk_bytes)
|
plan.ratemeter.digest(chunk_bytes)
|
||||||
|
|
||||||
chunk_time = time.perf_counter() - chunk_start
|
|
||||||
chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME)
|
|
||||||
|
|
||||||
if progressbar is not None:
|
if progressbar is not None:
|
||||||
progressbar.done()
|
progressbar.done()
|
||||||
|
|
||||||
|
@ -195,15 +192,26 @@ def download_plan(plan):
|
||||||
temp_localsize < plan.remote_total_bytes
|
temp_localsize < plan.remote_total_bytes
|
||||||
)
|
)
|
||||||
if undersized and plan.raise_for_undersized:
|
if undersized and plan.raise_for_undersized:
|
||||||
message = 'File does not contain expected number of bytes. Received {size} / {total}'
|
message = 'File does not contain expected number of bytes. Received {size} / {total}.'
|
||||||
message = message.format(size=temp_localsize, total=plan.remote_total_bytes)
|
message = message.format(size=temp_localsize, total=plan.remote_total_bytes)
|
||||||
raise NotEnoughBytes(message)
|
raise NotEnoughBytes(message)
|
||||||
|
|
||||||
|
if plan.custom_validator:
|
||||||
|
plan.custom_validator(plan.download_into)
|
||||||
|
|
||||||
if plan.download_into != plan.real_localname:
|
if plan.download_into != plan.real_localname:
|
||||||
os.rename(plan.download_into, plan.real_localname)
|
os.rename(plan.download_into, plan.real_localname)
|
||||||
|
|
||||||
return plan.real_localname
|
return plan.real_localname
|
||||||
|
|
||||||
|
def decompress_gzip(stream):
|
||||||
|
decompressor = zlib.decompressobj(16 | zlib.MAX_WBITS)
|
||||||
|
for chunk in stream:
|
||||||
|
chunk = decompressor.decompress(chunk)
|
||||||
|
yield chunk
|
||||||
|
|
||||||
|
yield decompressor.flush()
|
||||||
|
|
||||||
def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time):
|
def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time):
|
||||||
'''
|
'''
|
||||||
Calculates a new chunk size based on the time it took to do the previous
|
Calculates a new chunk size based on the time it took to do the previous
|
||||||
|
@ -228,6 +236,7 @@ def prepare_plan(
|
||||||
localname,
|
localname,
|
||||||
auth=None,
|
auth=None,
|
||||||
bytespersecond=None,
|
bytespersecond=None,
|
||||||
|
custom_validator=None,
|
||||||
progressbar=None,
|
progressbar=None,
|
||||||
do_head=True,
|
do_head=True,
|
||||||
headers=None,
|
headers=None,
|
||||||
|
@ -313,7 +322,9 @@ def prepare_plan(
|
||||||
remote_total_bytes = head.headers.get('content-length', None)
|
remote_total_bytes = head.headers.get('content-length', None)
|
||||||
remote_total_bytes = None if remote_total_bytes is None else int(remote_total_bytes)
|
remote_total_bytes = None if remote_total_bytes is None else int(remote_total_bytes)
|
||||||
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
|
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
|
||||||
|
log.debug(f'Server respects range: {server_respects_range}')
|
||||||
head.connection.close()
|
head.connection.close()
|
||||||
|
del head
|
||||||
else:
|
else:
|
||||||
remote_total_bytes = None
|
remote_total_bytes = None
|
||||||
server_respects_range = False
|
server_respects_range = False
|
||||||
|
@ -325,6 +336,7 @@ def prepare_plan(
|
||||||
plan_base = {
|
plan_base = {
|
||||||
'url': url,
|
'url': url,
|
||||||
'auth': auth,
|
'auth': auth,
|
||||||
|
'custom_validator': custom_validator,
|
||||||
'progressbar': progressbar,
|
'progressbar': progressbar,
|
||||||
'limiter': limiter,
|
'limiter': limiter,
|
||||||
'headers': headers,
|
'headers': headers,
|
||||||
|
@ -381,6 +393,8 @@ def prepare_plan(
|
||||||
log.info('Resume from byte %d' % plan_resume.seek_to)
|
log.info('Resume from byte %d' % plan_resume.seek_to)
|
||||||
return plan_resume
|
return plan_resume
|
||||||
|
|
||||||
|
return plan_fulldownload
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if user_provided_range:
|
if user_provided_range:
|
||||||
return plan_partial
|
return plan_partial
|
||||||
|
@ -429,7 +443,9 @@ def request(method, url, headers=None, timeout=TIMEOUT, verify_ssl=True, **kwarg
|
||||||
'post': session.post,
|
'post': session.post,
|
||||||
}[method]
|
}[method]
|
||||||
|
|
||||||
|
log.debug(f'Request headers: {headers}')
|
||||||
response = method(url, headers=headers, timeout=timeout, verify=verify_ssl, **kwargs)
|
response = method(url, headers=headers, timeout=timeout, verify=verify_ssl, **kwargs)
|
||||||
|
log.debug(f'Response headers: {response.status_code} {response.headers}')
|
||||||
httperrors.raise_for_status(response)
|
httperrors.raise_for_status(response)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
@ -451,6 +467,24 @@ def sanitize_url(url):
|
||||||
url = url.replace('%3A//', '://')
|
url = url.replace('%3A//', '://')
|
||||||
return url
|
return url
|
||||||
|
|
||||||
|
def stream_to_chunks(download_stream, chunk_size, limiter):
|
||||||
|
while True:
|
||||||
|
chunk_start = time.perf_counter()
|
||||||
|
# log.loud(f'Reading {chunk_size} from stream.')
|
||||||
|
chunk = download_stream.raw.read(chunk_size)
|
||||||
|
chunk_bytes = len(chunk)
|
||||||
|
# log.loud(f'Got {chunk_bytes} from stream.')
|
||||||
|
if chunk_bytes == 0:
|
||||||
|
break
|
||||||
|
|
||||||
|
if limiter is not None:
|
||||||
|
limiter.limit(chunk_bytes)
|
||||||
|
|
||||||
|
chunk_time = time.perf_counter() - chunk_start
|
||||||
|
chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME)
|
||||||
|
|
||||||
|
yield chunk
|
||||||
|
|
||||||
def download_argparse(args):
|
def download_argparse(args):
|
||||||
url = pipeable.input(args.url, split_lines=False)
|
url = pipeable.input(args.url, split_lines=False)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue