Improve handling of compressed streams.

master
voussoir 2024-03-31 19:03:37 -07:00
parent 3203850c93
commit 8584dc5e7f
1 changed file with 51 additions and 17 deletions

View File

@ -5,6 +5,7 @@ import requests
import sys import sys
import time import time
import urllib import urllib
import zlib
from voussoirkit import bytestring from voussoirkit import bytestring
from voussoirkit import dotdict from voussoirkit import dotdict
@ -18,10 +19,7 @@ from voussoirkit import vlogging
log = vlogging.getLogger(__name__, 'downloady') log = vlogging.getLogger(__name__, 'downloady')
USERAGENT = ''' USERAGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/42.0.2311.152 Safari/537.36'
'''.strip().replace('\n', ' ')
HEADERS = { HEADERS = {
'User-Agent': USERAGENT, 'User-Agent': USERAGENT,
@ -71,6 +69,7 @@ def download_file(
localname=None, localname=None,
auth=None, auth=None,
bytespersecond=None, bytespersecond=None,
custom_validator=None,
progressbar=None, progressbar=None,
do_head=True, do_head=True,
headers=None, headers=None,
@ -86,6 +85,7 @@ def download_file(
localname, localname,
auth=auth, auth=auth,
bytespersecond=bytespersecond, bytespersecond=bytespersecond,
custom_validator=custom_validator,
progressbar=progressbar, progressbar=progressbar,
do_head=do_head, do_head=do_head,
headers=headers, headers=headers,
@ -153,28 +153,25 @@ def download_plan(plan):
else: else:
chunk_size = 128 * bytestring.KIBIBYTE chunk_size = 128 * bytestring.KIBIBYTE
while True: chunks = stream_to_chunks(download_stream, chunk_size, limiter=plan.limiter)
chunk_start = time.perf_counter() if download_stream.headers.get('Content-Encoding') == 'gzip':
chunk = download_stream.raw.read(chunk_size) log.debug('Content is gzipped. Decompressor active.')
chunk_bytes = len(chunk) chunks = decompress_gzip(chunks)
if chunk_bytes == 0:
break
for chunk in chunks:
httperrors.raise_for_status(download_stream)
chunk_bytes = len(chunk)
# log.loud(f'Writing {chunk_bytes} to disk.')
file_handle.write(chunk) file_handle.write(chunk)
bytes_downloaded += chunk_bytes bytes_downloaded += chunk_bytes
if progressbar is not None: if progressbar is not None:
progressbar.step(bytes_downloaded) progressbar.step(bytes_downloaded)
if plan.limiter is not None:
plan.limiter.limit(chunk_bytes)
if plan.ratemeter is not None: if plan.ratemeter is not None:
plan.ratemeter.digest(chunk_bytes) plan.ratemeter.digest(chunk_bytes)
chunk_time = time.perf_counter() - chunk_start
chunk_size = dynamic_chunk_sizer(chunk_size, chunk_time, IDEAL_CHUNK_TIME)
if progressbar is not None: if progressbar is not None:
progressbar.done() progressbar.done()
@ -195,15 +192,26 @@ def download_plan(plan):
temp_localsize < plan.remote_total_bytes temp_localsize < plan.remote_total_bytes
) )
if undersized and plan.raise_for_undersized: if undersized and plan.raise_for_undersized:
message = 'File does not contain expected number of bytes. Received {size} / {total}' message = 'File does not contain expected number of bytes. Received {size} / {total}.'
message = message.format(size=temp_localsize, total=plan.remote_total_bytes) message = message.format(size=temp_localsize, total=plan.remote_total_bytes)
raise NotEnoughBytes(message) raise NotEnoughBytes(message)
if plan.custom_validator:
plan.custom_validator(plan.download_into)
if plan.download_into != plan.real_localname: if plan.download_into != plan.real_localname:
os.rename(plan.download_into, plan.real_localname) os.rename(plan.download_into, plan.real_localname)
return plan.real_localname return plan.real_localname
def decompress_gzip(stream):
    '''
    Decompress a gzip-encoded iterable of byte chunks.

    stream is any iterable yielding bytes. Yields decompressed bytes
    chunks, finishing with whatever remains in the decompressor's
    internal buffer.
    '''
    # wbits = 16 + MAX_WBITS tells zlib to expect gzip headers/trailers
    # instead of a bare zlib stream.
    gunzip = zlib.decompressobj(16 | zlib.MAX_WBITS)
    yield from (gunzip.decompress(piece) for piece in stream)
    yield gunzip.flush()
def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time): def dynamic_chunk_sizer(chunk_size, chunk_time, ideal_chunk_time):
''' '''
Calculates a new chunk size based on the time it took to do the previous Calculates a new chunk size based on the time it took to do the previous
@ -228,6 +236,7 @@ def prepare_plan(
localname, localname,
auth=None, auth=None,
bytespersecond=None, bytespersecond=None,
custom_validator=None,
progressbar=None, progressbar=None,
do_head=True, do_head=True,
headers=None, headers=None,
@ -313,7 +322,9 @@ def prepare_plan(
remote_total_bytes = head.headers.get('content-length', None) remote_total_bytes = head.headers.get('content-length', None)
remote_total_bytes = None if remote_total_bytes is None else int(remote_total_bytes) remote_total_bytes = None if remote_total_bytes is None else int(remote_total_bytes)
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers) server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
log.debug(f'Server respects range: {server_respects_range}')
head.connection.close() head.connection.close()
del head
else: else:
remote_total_bytes = None remote_total_bytes = None
server_respects_range = False server_respects_range = False
@ -325,6 +336,7 @@ def prepare_plan(
plan_base = { plan_base = {
'url': url, 'url': url,
'auth': auth, 'auth': auth,
'custom_validator': custom_validator,
'progressbar': progressbar, 'progressbar': progressbar,
'limiter': limiter, 'limiter': limiter,
'headers': headers, 'headers': headers,
@ -381,6 +393,8 @@ def prepare_plan(
log.info('Resume from byte %d' % plan_resume.seek_to) log.info('Resume from byte %d' % plan_resume.seek_to)
return plan_resume return plan_resume
return plan_fulldownload
else: else:
if user_provided_range: if user_provided_range:
return plan_partial return plan_partial
@ -429,7 +443,9 @@ def request(method, url, headers=None, timeout=TIMEOUT, verify_ssl=True, **kwarg
'post': session.post, 'post': session.post,
}[method] }[method]
log.debug(f'Request headers: {headers}')
response = method(url, headers=headers, timeout=timeout, verify=verify_ssl, **kwargs) response = method(url, headers=headers, timeout=timeout, verify=verify_ssl, **kwargs)
log.debug(f'Response headers: {response.status_code} {response.headers}')
httperrors.raise_for_status(response) httperrors.raise_for_status(response)
return response return response
@ -451,6 +467,24 @@ def sanitize_url(url):
url = url.replace('%3A//', '://') url = url.replace('%3A//', '://')
return url return url
def stream_to_chunks(download_stream, chunk_size, limiter):
    '''
    Generator that reads the response's raw stream and yields bytes
    chunks until the stream is exhausted.

    After every read, the chunk size is re-tuned via dynamic_chunk_sizer
    so that each read takes roughly IDEAL_CHUNK_TIME. If limiter is not
    None, limiter.limit(bytes) is called after each chunk (presumably a
    rate limiter defined elsewhere; its delay counts toward chunk time,
    matching the pre-refactor behavior).
    '''
    raw = download_stream.raw
    while True:
        started = time.perf_counter()
        # log.loud(f'Reading {chunk_size} from stream.')
        piece = raw.read(chunk_size)
        size = len(piece)
        # log.loud(f'Got {size} from stream.')
        if size == 0:
            return
        if limiter is not None:
            limiter.limit(size)
        elapsed = time.perf_counter() - started
        chunk_size = dynamic_chunk_sizer(chunk_size, elapsed, IDEAL_CHUNK_TIME)
        yield piece
def download_argparse(args): def download_argparse(args):
url = pipeable.input(args.url, split_lines=False) url = pipeable.input(args.url, split_lines=False)