Ethan Dalool 2016-10-03 19:20:58 -07:00
parent aa836ce5c3
commit fcdaa58bd4
4 changed files with 148 additions and 150 deletions

View file

@ -229,7 +229,7 @@ def prepare_plan(
 class Progress1:
 def __init__(self, total_bytes):
-self.limiter = ratelimiter.Ratelimiter(allowance=5, mode='reject')
+self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
 self.limiter.balance = 1
 self.total_bytes = max(1, total_bytes)
 self.divisor = bytestring.get_appropriate_divisor(total_bytes)
@ -265,7 +265,7 @@ class Progress1:
 class Progress2:
 def __init__(self, total_bytes):
 self.total_bytes = max(1, total_bytes)
-self.limiter = ratelimiter.Ratelimiter(allowance=5, mode='reject')
+self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
 self.limiter.balance = 1
 self.total_bytes_string = '{:,}'.format(self.total_bytes)
 self.bytes_downloaded_string = '{:%d,}' % len(self.total_bytes_string)
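The only change in this file is the progress limiter's allowance, raised from 5 to 8, which lets the progress callbacks refresh up to roughly eight times per second instead of five. The Ratelimiter API is not shown in this diff beyond the constructor and `balance`, so the sketch below is a hypothetical reject-mode limiter that only illustrates what `allowance` controls; it is not the real `ratelimiter` module.

```python
# Toy reject-mode rate limiter, assuming token-bucket behaviour; names here are
# hypothetical and only illustrate what `allowance` means in the real module.
import time

class ToyRejectLimiter:
    def __init__(self, allowance):
        self.allowance = allowance      # permitted events per second
        self.balance = 1                # one event available immediately, as in the diff
        self.last = time.monotonic()

    def try_consume(self):
        now = time.monotonic()
        self.balance = min(self.allowance, self.balance + (now - self.last) * self.allowance)
        self.last = now
        if self.balance >= 1:
            self.balance -= 1
            return True                 # caller may print a progress line
        return False                    # reject: skip this refresh

limiter = ToyRejectLimiter(allowance=8) # 8 -> at most ~8 progress refreshes per second
```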

View file

@ -7,6 +7,12 @@ Requires `pip install beautifulsoup4`.
 See inside opendirdl.py for usage instructions.
+- 2016 10 01
+- **[bugfix]** Fixed the download function so it actually passes `headers` into downloady.
+- **[change]** `url_split` key 'root' has been renamed to 'domain'.
+- **[cleanup]** Removed import for Ratelimiter since downloady handles all of that now.
+- **[cleanup]** Improved some variable names, including `walkurl -> root_url`.
 - 2016 08 16
 - **[cleanup]** Now that Downloady uses temp files for incomplete downloads, that logic can be removed from opendirdl.
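For readers updating scripts against the renamed key: the sketch below is a minimal stand-in for `url_split`, assuming only the keys visible in this diff ('scheme', 'domain', 'folder', 'filename'). The real function in opendirdl.py also validates the URL and raises ValueError, so treat this as an illustration of the rename rather than the actual implementation.

```python
# Hypothetical stand-in showing the renamed 'domain' key (formerly 'root').
import urllib.parse

def url_split_sketch(url):
    parts = urllib.parse.urlsplit(urllib.parse.unquote(url))
    folder, _, filename = parts.path.rpartition('/')
    return {
        'scheme': parts.scheme,
        'domain': parts.netloc,          # was returned under the key 'root'
        'folder': folder.lstrip('/'),
        'filename': filename,
    }

print(url_split_sketch('http://example.com/books/index.html')['domain'])  # example.com
```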

View file

@ -1,6 +1,5 @@
 # voussoir
-DOCSTRING='''
+'''
 OpenDirDL
 downloads open directories
@ -108,36 +107,31 @@ tree:
 '''
-# Module names preceeded by `## ~` indicate modules that are imported during
+# Module names preceeded by `## ` indicate modules that are imported during
 # a function, because they are not used anywhere else and we don't need to waste
-# time importing them usually.
+# time importing them usually, but I still want them listed here for clarity.
+import argparse
+## import bs4
+import collections
+## import hashlib
+import os
+## import re
+import requests
+import shutil
+import sqlite3
 import sys
+## import tkinter
+import urllib.parse
 # Please consult my github repo for these files
 # https://github.com/voussoir/else
 sys.path.append('C:\\git\\else\\Downloady'); import downloady
 sys.path.append('C:\\git\\else\\Bytestring'); import bytestring
-sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
-import argparse
-## ~import bs4
-import collections
-## ~import hashlib
-import os
-## ~import re
-import requests
-import shutil
-import sqlite3
-## ~tkinter
-import traceback
-import urllib.parse
-FILENAME_BADCHARS = '/\\:*?"<>|'
-TERMINAL_WIDTH = shutil.get_terminal_size().columns
 DOWNLOAD_CHUNK = 16 * bytestring.KIBIBYTE
+FILENAME_BADCHARS = '/\\:*?"<>|'
+TERMINAL_WIDTH = shutil.get_terminal_size().columns
 UNKNOWN_SIZE_STRING = '???'
 # When doing a basic scan, we will not send HEAD requests to URLs that end in
@ -145,50 +139,50 @@ UNKNOWN_SIZE_STRING = '???'
 # This isn't meant to be a comprehensive filetype library, but it covers
 # enough of the typical opendir to speed things up.
 SKIPPABLE_FILETYPES = [
 '.aac',
 '.avi',
 '.bin',
 '.bmp',
 '.bz2',
 '.epub',
 '.exe',
 '.db',
 '.flac',
 '.gif',
 '.gz',
 '.ico',
 '.iso',
 '.jpeg',
 '.jpg',
 '.m3u',
 '.m4a',
 '.m4v',
 '.mka',
 '.mkv',
 '.mov',
 '.mp3',
 '.mp4',
 '.nfo',
 '.ogg',
 '.ott',
 '.pdf',
 '.png',
 '.rar',
 '.srt',
 '.tar',
 '.ttf',
 '.txt',
 '.wav',
 '.webm',
 '.wma',
 '.zip',
 ]
 SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
 # Will be ignored completely. Are case-sensitive
 BLACKLISTED_FILENAMES = [
 'desktop.ini',
 'thumbs.db',
 ]
 # oh shit
@ -275,15 +269,18 @@ those files.
 ## WALKER ##########################################################################################
 ## ##
 class Walker:
-def __init__(self, walkurl, databasename=None, fullscan=False):
-if not walkurl.endswith('/'):
-walkurl += '/'
-if '://' not in walkurl.split('.')[0]:
-walkurl = 'http://' + walkurl
-self.walkurl = walkurl
+'''
+This class manages the extraction and saving of URLs, given a starting root url.
+'''
+def __init__(self, root_url, databasename=None, fullscan=False):
+if not root_url.endswith('/'):
+root_url += '/'
+if '://' not in root_url.split('.')[0]:
+root_url = 'http://' + root_url
+self.root_url = root_url
 if databasename in (None, ''):
-domain = url_split(self.walkurl)['root']
+domain = url_split(self.root_url)['domain']
 databasename = domain + '.db'
 databasename = databasename.replace(':', '#')
 self.databasename = databasename
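The normalization that `__init__` applies to `root_url` is easy to miss inside the rename noise; here it is copied out as a standalone helper (the helper's name is mine, the logic is taken verbatim from the new `__init__` above).

```python
# Standalone copy of the root_url normalization from Walker.__init__ (helper name is mine).
def normalize_root_url(root_url):
    if not root_url.endswith('/'):
        root_url += '/'                      # directory URLs always end with a slash
    if '://' not in root_url.split('.')[0]:
        root_url = 'http://' + root_url      # assume http when no scheme is given
    return root_url

print(normalize_root_url('example.com/files'))  # http://example.com/files/
```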
@ -318,7 +315,7 @@ class Walker:
 continue
 href = urllib.parse.urljoin(response.url, href)
-if not href.startswith(self.walkurl):
+if not href.startswith(self.root_url):
 # Don't go to other sites or parent directories.
 continue
@ -337,7 +334,7 @@ class Walker:
 If it is an index page, its links are extracted and queued.
 If it is a file, its information is saved to the database.
 We perform a
 HEAD:
 when `self.fullscan` is True.
 when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE.
@ -346,15 +343,15 @@ class Walker:
 when the url is an index page.
 '''
 if url is None:
-url = self.walkurl
+url = self.root_url
 else:
-url = urllib.parse.urljoin(self.walkurl, url)
+url = urllib.parse.urljoin(self.root_url, url)
 if url in self.seen_directories:
 # We already picked this up at some point
 return
-if not url.startswith(self.walkurl):
+if not url.startswith(self.root_url):
 # Don't follow external links or parent directory.
 write('Skipping "%s" due to external url.' % url)
 return
@ -374,11 +371,11 @@ class Walker:
 try:
 head = do_head(url)
-except requests.exceptions.HTTPError as e:
-if e.response.status_code == 403:
+except requests.exceptions.HTTPError as exception:
+if exception.response.status_code == 403:
 write('403 FORBIDDEN!')
 return
-if e.response.status_code == 404:
+if exception.response.status_code == 404:
 write('404 NOT FOUND!')
 return
 raise
@ -405,6 +402,10 @@ class Walker:
 self.smart_insert(head=head, commit=False)
 def walk(self, url=None):
+'''
+Given a starting URL (defaults to self.root_url), continually extract
+links from the page and repeat.
+'''
 self.queue.appendleft(url)
 try:
 while len(self.queue) > 0:
@ -422,12 +423,6 @@ class Walker:
 ## OTHER CLASSES ###################################################################################
 ## ##
-class Generic:
-def __init__(self, **kwargs):
-for (key, value) in kwargs.items():
-setattr(self, key, value)
 class TreeExistingChild(Exception):
 pass
@ -540,10 +535,10 @@ def build_file_tree(databasename):
 'name': databasename,
 }
 scheme = url_split(all_items[0]['url'])['scheme']
-tree = TreeNode(databasename, data=root_data)
-tree.unsorted_children = all_items
+tree_root = TreeNode(databasename, data=root_data)
+tree_root.unsorted_children = all_items
 node_queue = set()
-node_queue.add(tree)
+node_queue.add(tree_root)
 # In this process, URLs are divided up into their nodes one directory layer at a time.
 # The root receives all URLs, and creates nodes for each of the top-level
@ -574,14 +569,14 @@ def build_file_tree(databasename):
 child.data['url'] = new_child_data['url']
 if node.parent is None:
 continue
-elif node.parent == tree:
+elif node.parent == tree_root:
 node.data['url'] = scheme + '://' + node.identifier
 else:
 node.data['url'] = node.parent.data['url'] + '/' + node.identifier
 del node.unsorted_children
-return tree
+return tree_root
 def db_init(sql, cur):
 lines = DB_INIT.split(';')
@ -604,7 +599,7 @@ def do_request(message, method, url, raise_for_status=True):
 if raise_for_status:
 response.raise_for_status()
 return response
 def fetch_generator(cur):
 while True:
 fetch = cur.fetchone()
@ -613,6 +608,9 @@ def fetch_generator(cur):
 yield fetch
 def filepath_sanitize(text, allowed=''):
+'''
+Remove forbidden characters from the text, unless specifically sanctioned.
+'''
-badchars = FILENAME_BADCHARS
 badchars = set(char for char in FILENAME_BADCHARS if char not in allowed)
 text = ''.join(char for char in text if char not in badchars)
+return text
@ -627,22 +625,10 @@ def get_clipboard():
 def hashit(text, length=None):
 import hashlib
-h = hashlib.sha512(text.encode('utf-8')).hexdigest()
+sha = hashlib.sha512(text.encode('utf-8')).hexdigest()
 if length is not None:
-h = h[:length]
-return h
+sha = sha[:length]
+return sha
-def listget(l, index, default=None):
-try:
-return l[index]
-except IndexError:
-return default
-def longest_length(li):
-longest = 0
-for item in li:
-longest = max(longest, len(item))
-return longest
 def recursive_get_size(node):
 '''
@ -722,10 +708,15 @@ def recursive_print_node(node, depth=0, use_html=False, output_file=None):
 # This helps put some space between sibling directories
 write('| ' * (depth), output_file)
+def safeindex(sequence, index, fallback=None):
+try:
+return sequence[index]
+except IndexError:
+return fallback
 def safeprint(text, **kwargs):
 text = str(text)
 text = text.encode('ascii', 'replace').decode()
-#text = text.replace('?', '_')
 print(text, **kwargs)
 def smart_insert(sql, cur, url=None, head=None, commit=True):
@ -780,9 +771,12 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
 sql.commit()
 return data
-def url_split(text):
-text = urllib.parse.unquote(text)
-parts = urllib.parse.urlsplit(text)
+def url_split(url):
+'''
+Given a url, return a dictionary of its components.
+'''
+url = urllib.parse.unquote(url)
+parts = urllib.parse.urlsplit(url)
 if any(part == '' for part in [parts.scheme, parts.netloc]):
 raise ValueError('Not a valid URL')
 scheme = parts.scheme
@ -800,7 +794,7 @@ def url_split(text):
 result = {
 'scheme': scheme,
-'root': root,
+'domain': root,
 'folder': folder,
 'filename': filename,
 }
@ -817,14 +811,14 @@ def write(line, file_handle=None, **kwargs):
 ## COMMANDLINE FUNCTIONS ###########################################################################
 ## ##
-def digest(walkurl, databasename=None, fullscan=False):
-if walkurl in ('!clipboard', '!c'):
-walkurl = get_clipboard()
-write('From clipboard: %s' % walkurl)
+def digest(root_url, databasename=None, fullscan=False):
+if root_url in ('!clipboard', '!c'):
+root_url = get_clipboard()
+write('From clipboard: %s' % root_url)
 walker = Walker(
 databasename=databasename,
 fullscan=fullscan,
-walkurl=walkurl,
+root_url=root_url,
 )
 walker.walk()
@ -832,7 +826,7 @@ def digest_argparse(args):
 return digest(
 databasename=args.databasename,
 fullscan=args.fullscan,
-walkurl=args.walkurl,
+root_url=args.root_url,
 )
 def download(
@ -873,7 +867,7 @@ def download(
 # on their own.
 cur.execute('SELECT url FROM urls LIMIT 1')
 url = cur.fetchone()[0]
-outputdir = url_split(url)['root']
+outputdir = url_split(url)['domain']
 if isinstance(bytespersecond, str):
 bytespersecond = bytestring.parsebytes(bytespersecond)
@ -894,7 +888,8 @@ def download(
 localname=fullname,
 bytespersecond=bytespersecond,
 callback_progress=downloady.progress2,
-overwrite=overwrite
+headers=headers,
+overwrite=overwrite,
 )
 def download_argparse(args):
@ -905,7 +900,7 @@ def download_argparse(args):
 bytespersecond=args.bytespersecond,
 )
-def filter_pattern(databasename, regex, action='keep', *trash):
+def filter_pattern(databasename, regex, action='keep'):
 '''
 When `action` is 'keep', then any URLs matching the regex will have their
 `do_download` flag set to True.
@ -930,15 +925,13 @@ def filter_pattern(databasename, regex, action='keep', *trash):
 items = cur.fetchall()
 for item in items:
 url = item[SQL_URL]
-current_do_dl = item[SQL_DO_DOWNLOAD]
 for pattern in regex:
 contains = re.search(pattern, url) is not None
-should_keep = (keep and contains)
-if keep and contains and not current_do_dl:
+if keep and contains and not item[SQL_DO_DOWNLOAD]:
 write('Enabling "%s"' % url)
 cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
-if remove and contains and current_do_dl:
+if remove and contains and item[SQL_DO_DOWNLOAD]:
 write('Disabling "%s"' % url)
 cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
 sql.commit()
@ -1079,7 +1072,7 @@ def tree(databasename, output_filename=None):
 collapsible boxes and clickable filenames. Otherwise the file will just
 be a plain text drawing.
 '''
-tree = build_file_tree(databasename)
+tree_root = build_file_tree(databasename)
 if output_filename is not None:
 output_file = open(output_filename, 'w', encoding='utf-8')
@ -1093,8 +1086,8 @@ def tree(databasename, output_filename=None):
 write(HTML_TREE_HEAD, output_file)
 write('<body>', output_file)
-size_details = recursive_get_size(tree)
-recursive_print_node(tree, use_html=use_html, output_file=output_file)
+size_details = recursive_get_size(tree_root)
+recursive_print_node(tree_root, use_html=use_html, output_file=output_file)
 if size_details['unmeasured'] > 0:
 write(UNMEASURED_WARNING % size_details['unmeasured'], output_file)
@ -1102,7 +1095,7 @@ def tree(databasename, output_filename=None):
 if use_html:
 write('</body>\n</html>', output_file)
 output_file.close()
-return tree
+return tree_root
 def tree_argparse(args):
 return tree(
@ -1113,14 +1106,14 @@ def tree_argparse(args):
 ## COMMANDLINE FUNCTIONS ###########################################################################
 def main(argv):
-if listget(argv, 1, '').lower() in ('help', '-h', '--help', ''):
-write(DOCSTRING)
+if safeindex(argv, 1, '').lower() in ('help', '-h', '--help', ''):
+write(__doc__)
 return
 parser = argparse.ArgumentParser()
 subparsers = parser.add_subparsers()
 p_digest = subparsers.add_parser('digest')
-p_digest.add_argument('walkurl')
+p_digest.add_argument('root_url')
 p_digest.add_argument('-db', '--database', dest='databasename', default=None)
 p_digest.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
 p_digest.set_defaults(func=digest_argparse)

View file

@ -10,19 +10,17 @@ def remove_finished(threads):
 threads = [t for t in threads if t.is_alive()]
 return threads
-def download_thread(url, filename_prefix=''):
+def download_thread(url, filename):
 url = url.strip()
 if url == '':
 return
-basename = downloady.basename_from_url(url)
-basename = filename_prefix + basename
-if os.path.exists(basename):
-print('Skipping existing file "%s"' % basename)
+if os.path.exists(filename):
+print('Skipping existing file "%s"' % filename)
 return
-print('Starting "%s"' % basename)
-downloady.download_file(url, basename)
-print('Finished "%s"' % basename)
+print(' Starting "%s"' % filename)
+downloady.download_file(url, filename)
+print('+Finished "%s"' % filename)
 def listget(li, index, fallback):
 try:
@ -30,19 +28,21 @@ def listget(li, index, fallback):
 except IndexError:
 return fallback
-def threaded_dl(urls, thread_count, prefix=None):
+def threaded_dl(urls, thread_count, filename_format=None):
 threads = []
-prefix_digits = len(str(len(urls)))
-if prefix is None:
-prefix = now = int(time.time())
-prefix_text = '{prefix}_{{index:0{digits}d}}_'.format(prefix=prefix, digits=prefix_digits)
+index_digits = len(str(len(urls)))
+if filename_format is None:
+filename_format = '{now}_{index}_{basename}'
+filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
+now = int(time.time())
 for (index, url) in enumerate(urls):
 while len(threads) == thread_count:
 threads = remove_finished(threads)
 time.sleep(0.1)
-prefix = prefix_text.format(index=index)
-t = threading.Thread(target=download_thread, args=[url, prefix])
+basename = downloady.basename_from_url(url)
+filename = filename_format.format(now=now, index=index, basename=basename)
+t = threading.Thread(target=download_thread, args=[url, filename])
 t.daemon = True
 threads.append(t)
 t.start()
@ -58,13 +58,12 @@ def main():
 f = open(filename, 'r')
 with f:
 urls = f.read()
-urls = urls.split('\n')
 else:
 urls = clipext.resolve(filename)
 urls = urls.split('\n')
 thread_count = int(listget(sys.argv, 2, 4))
-prefix = listget(sys.argv, 3, None)
-threaded_dl(urls, thread_count=thread_count, prefix=prefix)
+filename_format = listget(sys.argv, 3, None)
+threaded_dl(urls, thread_count=thread_count, filename_format=filename_format)
 if __name__ == '__main__':
 main()
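Since threaded_dl's third argument changed meaning from a filename prefix to a whole filename format, here is a short walk-through of how the default '{now}_{index}_{basename}' template expands; the URL and timestamp values below are invented, but the padding trick mirrors the `replace('{index}', ...)` line in the diff.

```python
# Walk-through of the new filename_format expansion; the values below are invented.
urls = ['http://example.com/pic.jpg'] * 150            # 150 urls -> indexes padded to 3 digits
index_digits = len(str(len(urls)))                      # 3
filename_format = '{now}_{index}_{basename}'
filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
print(filename_format)                                  # {now}_{index:03d}_{basename}
print(filename_format.format(now=1475547658, index=7, basename='pic.jpg'))
# 1475547658_007_pic.jpg
```

A custom template passed as the third command-line argument is used as-is (only `{index}` gets the zero-padding treatment), so prefix-style names such as '{now}_{basename}' remain possible.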