else: commit fcdaa58bd4 (parent aa836ce5c3)
4 changed files with 148 additions and 150 deletions
@@ -229,7 +229,7 @@ def prepare_plan(
 
 class Progress1:
     def __init__(self, total_bytes):
-        self.limiter = ratelimiter.Ratelimiter(allowance=5, mode='reject')
+        self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
         self.limiter.balance = 1
         self.total_bytes = max(1, total_bytes)
         self.divisor = bytestring.get_appropriate_divisor(total_bytes)
@@ -265,7 +265,7 @@ class Progress1:
 class Progress2:
     def __init__(self, total_bytes):
         self.total_bytes = max(1, total_bytes)
-        self.limiter = ratelimiter.Ratelimiter(allowance=5, mode='reject')
+        self.limiter = ratelimiter.Ratelimiter(allowance=8, mode='reject')
         self.limiter.balance = 1
         self.total_bytes_string = '{:,}'.format(self.total_bytes)
         self.bytes_downloaded_string = '{:%d,}' % len(self.total_bytes_string)
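Both progress callbacks throttle how often they repaint the status line, and this commit raises that ceiling from 5 to 8 updates per second. The Ratelimiter used here is voussoir's own module and its API is not part of this diff, so the following is only a minimal stand-in sketch of the 'reject' idea: every update asks for one token, and updates that arrive faster than the allowance are dropped rather than delayed. The class name and method below are hypothetical.

    import time

    class RejectLimiter:
        # Hypothetical stand-in for ratelimiter.Ratelimiter(allowance=8, mode='reject').
        # Tokens refill at `allowance` per second; a caller that cannot afford a token
        # is refused immediately instead of being made to sleep.
        def __init__(self, allowance):
            self.allowance = allowance
            self.balance = allowance
            self.last_refill = time.time()

        def permit(self):
            now = time.time()
            self.balance = min(self.allowance, self.balance + (now - self.last_refill) * self.allowance)
            self.last_refill = now
            if self.balance >= 1:
                self.balance -= 1
                return True
            return False

    limiter = RejectLimiter(allowance=8)
    limiter.balance = 1  # as in Progress1/Progress2: exactly one update is available right away
    for chunk_number in range(1000000):
        if limiter.permit():
            print('chunk %d' % chunk_number, end='\r')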
@@ -7,6 +7,12 @@ Requires `pip install beautifulsoup4`.
 
 See inside opendirdl.py for usage instructions.
 
+- 2016 10 01
+    - **[bugfix]** Fixed the download function so it actually passes `headers` into downloady.
+    - **[change]** `url_split` key 'root' has been renamed to 'domain'.
+    - **[cleanup]** Removed import for Ratelimiter since downloady handles all of that now.
+    - **[cleanup]** Improved some variable names, including `walkurl -> root_url`.
+
 - 2016 08 16
     - **[cleanup]** Now that Downloady uses temp files for incomplete downloads, that logic can be removed from opendirdl.
@@ -1,6 +1,5 @@
 # voussoir
-
-DOCSTRING='''
+'''
 OpenDirDL
 downloads open directories
 
@@ -108,36 +107,31 @@ tree:
 '''
 
 
-# Module names preceeded by `## ~` indicate modules that are imported during
+# Module names preceeded by `## ` indicate modules that are imported during
 # a function, because they are not used anywhere else and we don't need to waste
-# time importing them usually.
+# time importing them usually, but I still want them listed here for clarity.
+import argparse
+## import bs4
+import collections
+## import hashlib
+import os
+## import re
+import requests
+import shutil
+import sqlite3
 import sys
+## import tkinter
+import urllib.parse
 
 # Please consult my github repo for these files
 # https://github.com/voussoir/else
 sys.path.append('C:\\git\\else\\Downloady'); import downloady
 sys.path.append('C:\\git\\else\\Bytestring'); import bytestring
-sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
-
-import argparse
-## ~import bs4
-import collections
-## ~import hashlib
-import os
-## ~import re
-import requests
-import shutil
-import sqlite3
-## ~tkinter
-import traceback
-import urllib.parse
-
-FILENAME_BADCHARS = '/\\:*?"<>|'
-
-TERMINAL_WIDTH = shutil.get_terminal_size().columns
 
 DOWNLOAD_CHUNK = 16 * bytestring.KIBIBYTE
+FILENAME_BADCHARS = '/\\:*?"<>|'
+TERMINAL_WIDTH = shutil.get_terminal_size().columns
 UNKNOWN_SIZE_STRING = '???'
 
 # When doing a basic scan, we will not send HEAD requests to URLs that end in
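The comment above describes the module's deferred-import convention: names prefixed with `## ` are only imported inside the one function that needs them. The `hashit` helper later in this diff does exactly that with `hashlib`; a minimal, standard-library-only illustration of the same pattern (function name here is made up) looks like this.

    def hash_text(text, length=None):
        # Deferred import: hashlib is only loaded the first time this function runs,
        # so runs that never hash anything never pay the import cost.
        import hashlib
        digest = hashlib.sha512(text.encode('utf-8')).hexdigest()
        if length is not None:
            digest = digest[:length]
        return digest

    print(hash_text('opendirdl', length=16))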
@@ -145,50 +139,50 @@ UNKNOWN_SIZE_STRING = '???'
 # This isn't meant to be a comprehensive filetype library, but it covers
 # enough of the typical opendir to speed things up.
 SKIPPABLE_FILETYPES = [
     '.aac',
     '.avi',
     '.bin',
     '.bmp',
     '.bz2',
     '.epub',
     '.exe',
     '.db',
     '.flac',
     '.gif',
     '.gz',
     '.ico',
     '.iso',
     '.jpeg',
     '.jpg',
     '.m3u',
     '.m4a',
     '.m4v',
     '.mka',
     '.mkv',
     '.mov',
     '.mp3',
     '.mp4',
     '.nfo',
     '.ogg',
     '.ott',
     '.pdf',
     '.png',
     '.rar',
     '.srt',
     '.tar',
     '.ttf',
     '.txt',
     '.wav',
     '.webm',
     '.wma',
     '.zip',
 ]
 SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
 
 # Will be ignored completely. Are case-sensitive
 BLACKLISTED_FILENAMES = [
     'desktop.ini',
     'thumbs.db',
 ]
 
 # oh shit
@@ -275,15 +269,18 @@ those files.
 ## WALKER ##########################################################################################
 ## ##
 class Walker:
-    def __init__(self, walkurl, databasename=None, fullscan=False):
-        if not walkurl.endswith('/'):
-            walkurl += '/'
-        if '://' not in walkurl.split('.')[0]:
-            walkurl = 'http://' + walkurl
-        self.walkurl = walkurl
+    '''
+    This class manages the extraction and saving of URLs, given a starting root url.
+    '''
+    def __init__(self, root_url, databasename=None, fullscan=False):
+        if not root_url.endswith('/'):
+            root_url += '/'
+        if '://' not in root_url.split('.')[0]:
+            root_url = 'http://' + root_url
+        self.root_url = root_url
 
         if databasename in (None, ''):
-            domain = url_split(self.walkurl)['root']
+            domain = url_split(self.root_url)['domain']
             databasename = domain + '.db'
             databasename = databasename.replace(':', '#')
         self.databasename = databasename
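Going by the new docstring and the `digest` function later in this diff, a Walker is built from a root URL plus an optional database name and fullscan flag, and `walk()` drives the crawl. A minimal usage sketch, with a made-up URL and assuming opendirdl.py and its sibling modules are importable:

    import opendirdl  # assumption: opendirdl.py and its dependencies are on the path

    walker = opendirdl.Walker(
        root_url='http://example.com/files/',  # hypothetical open directory
        databasename=None,                     # None: derive '<domain>.db' from the URL
        fullscan=False,                        # True sends HEAD requests even for the skippable filetypes above
    )
    walker.walk()  # crawls the directory and records each discovered URL in the database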
@@ -318,7 +315,7 @@ class Walker:
                 continue
             href = urllib.parse.urljoin(response.url, href)
 
-            if not href.startswith(self.walkurl):
+            if not href.startswith(self.root_url):
                 # Don't go to other sites or parent directories.
                 continue
 
@@ -346,15 +343,15 @@ class Walker:
         when the url is an index page.
         '''
         if url is None:
-            url = self.walkurl
+            url = self.root_url
         else:
-            url = urllib.parse.urljoin(self.walkurl, url)
+            url = urllib.parse.urljoin(self.root_url, url)
 
         if url in self.seen_directories:
             # We already picked this up at some point
             return
 
-        if not url.startswith(self.walkurl):
+        if not url.startswith(self.root_url):
             # Don't follow external links or parent directory.
             write('Skipping "%s" due to external url.' % url)
             return
@@ -374,11 +371,11 @@ class Walker:
 
         try:
             head = do_head(url)
-        except requests.exceptions.HTTPError as e:
-            if e.response.status_code == 403:
+        except requests.exceptions.HTTPError as exception:
+            if exception.response.status_code == 403:
                 write('403 FORBIDDEN!')
                 return
-            if e.response.status_code == 404:
+            if exception.response.status_code == 404:
                 write('404 NOT FOUND!')
                 return
             raise
@@ -405,6 +402,10 @@ class Walker:
             self.smart_insert(head=head, commit=False)
 
     def walk(self, url=None):
+        '''
+        Given a starting URL (defaults to self.root_url), continually extract
+        links from the page and repeat.
+        '''
         self.queue.appendleft(url)
         try:
             while len(self.queue) > 0:
@@ -422,12 +423,6 @@ class Walker:
 
 ## OTHER CLASSES ###################################################################################
 ## ##
-class Generic:
-    def __init__(self, **kwargs):
-        for (key, value) in kwargs.items():
-            setattr(self, key, value)
-
-
 class TreeExistingChild(Exception):
     pass
 
@@ -540,10 +535,10 @@ def build_file_tree(databasename):
         'name': databasename,
     }
     scheme = url_split(all_items[0]['url'])['scheme']
-    tree = TreeNode(databasename, data=root_data)
-    tree.unsorted_children = all_items
+    tree_root = TreeNode(databasename, data=root_data)
+    tree_root.unsorted_children = all_items
     node_queue = set()
-    node_queue.add(tree)
+    node_queue.add(tree_root)
 
     # In this process, URLs are divided up into their nodes one directory layer at a time.
     # The root receives all URLs, and creates nodes for each of the top-level
@@ -574,14 +569,14 @@ def build_file_tree(databasename):
                 child.data['url'] = new_child_data['url']
         if node.parent is None:
             continue
-        elif node.parent == tree:
+        elif node.parent == tree_root:
             node.data['url'] = scheme + '://' + node.identifier
         else:
             node.data['url'] = node.parent.data['url'] + '/' + node.identifier
 
         del node.unsorted_children
 
-    return tree
+    return tree_root
 
 def db_init(sql, cur):
     lines = DB_INIT.split(';')
@@ -613,6 +608,9 @@ def fetch_generator(cur):
         yield fetch
 
 def filepath_sanitize(text, allowed=''):
+    '''
+    Remove forbidden characters from the text, unless specifically sanctioned.
+    '''
     badchars = FILENAME_BADCHARS
     badchars = set(char for char in FILENAME_BADCHARS if char not in allowed)
     text = ''.join(char for char in text if char not in badchars)
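Since the body of `filepath_sanitize` is fully visible here, a quick worked example of the new docstring's claim, condensed into a standalone function with made-up filenames:

    FILENAME_BADCHARS = '/\\:*?"<>|'

    def filepath_sanitize(text, allowed=''):
        # Remove forbidden characters from the text, unless specifically sanctioned.
        badchars = set(char for char in FILENAME_BADCHARS if char not in allowed)
        return ''.join(char for char in text if char not in badchars)

    print(filepath_sanitize('mix: disc 1/2?.mp3'))               # 'mix disc 12.mp3'
    print(filepath_sanitize('mix: disc 1/2?.mp3', allowed=':'))  # 'mix: disc 12.mp3'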
@@ -627,22 +625,10 @@ def get_clipboard():
 
 def hashit(text, length=None):
     import hashlib
-    h = hashlib.sha512(text.encode('utf-8')).hexdigest()
+    sha = hashlib.sha512(text.encode('utf-8')).hexdigest()
     if length is not None:
-        h = h[:length]
-    return h
-
-def listget(l, index, default=None):
-    try:
-        return l[index]
-    except IndexError:
-        return default
-
-def longest_length(li):
-    longest = 0
-    for item in li:
-        longest = max(longest, len(item))
-    return longest
+        sha = sha[:length]
+    return sha
 
 def recursive_get_size(node):
     '''
@@ -722,10 +708,15 @@ def recursive_print_node(node, depth=0, use_html=False, output_file=None):
         # This helps put some space between sibling directories
         write('| ' * (depth), output_file)
 
+def safeindex(sequence, index, fallback=None):
+    try:
+        return sequence[index]
+    except IndexError:
+        return fallback
+
 def safeprint(text, **kwargs):
     text = str(text)
     text = text.encode('ascii', 'replace').decode()
-    #text = text.replace('?', '_')
     print(text, **kwargs)
 
 def smart_insert(sql, cur, url=None, head=None, commit=True):
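`safeindex` replaces the module-level `listget` helper that this commit deletes, and `main()` later in the diff uses it to read `argv[1]` without risking an IndexError. A small illustration with a made-up argv:

    def safeindex(sequence, index, fallback=None):
        try:
            return sequence[index]
        except IndexError:
            return fallback

    argv = ['opendirdl.py']                  # hypothetical sys.argv with no subcommand
    print(safeindex(argv, 1, ''))            # '' : per the check in main(), the docstring gets printed
    print(safeindex(['digest', 'url'], 1))   # 'url'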
@@ -780,9 +771,12 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
         sql.commit()
     return data
 
-def url_split(text):
-    text = urllib.parse.unquote(text)
-    parts = urllib.parse.urlsplit(text)
+def url_split(url):
+    '''
+    Given a url, return a dictionary of its components.
+    '''
+    url = urllib.parse.unquote(url)
+    parts = urllib.parse.urlsplit(url)
     if any(part == '' for part in [parts.scheme, parts.netloc]):
         raise ValueError('Not a valid URL')
     scheme = parts.scheme
@@ -800,7 +794,7 @@ def url_split(text):
 
     result = {
         'scheme': scheme,
-        'root': root,
+        'domain': root,
         'folder': folder,
         'filename': filename,
     }
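Callers now index the result by 'domain' instead of 'root', which is the rename noted in the changelog hunk above and the reason the Walker and download hunks switch to `url_split(...)['domain']`. A brief usage sketch, assuming opendirdl.py is importable and using a made-up URL:

    import opendirdl  # assumption: opendirdl.py and its dependencies are importable

    parts = opendirdl.url_split('http://example.com/music/track.mp3')
    # Keys per the result dict above: 'scheme', 'domain' (formerly 'root'),
    # 'folder', and 'filename'.
    print(parts['scheme'])  # 'http'
    print(parts['domain'])  # the host portion, e.g. 'example.com'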
@@ -817,14 +811,14 @@ def write(line, file_handle=None, **kwargs):
 
 ## COMMANDLINE FUNCTIONS ###########################################################################
 ## ##
-def digest(walkurl, databasename=None, fullscan=False):
-    if walkurl in ('!clipboard', '!c'):
-        walkurl = get_clipboard()
-        write('From clipboard: %s' % walkurl)
+def digest(root_url, databasename=None, fullscan=False):
+    if root_url in ('!clipboard', '!c'):
+        root_url = get_clipboard()
+        write('From clipboard: %s' % root_url)
     walker = Walker(
         databasename=databasename,
         fullscan=fullscan,
-        walkurl=walkurl,
+        root_url=root_url,
     )
     walker.walk()
 
@@ -832,7 +826,7 @@ def digest_argparse(args):
     return digest(
         databasename=args.databasename,
         fullscan=args.fullscan,
-        walkurl=args.walkurl,
+        root_url=args.root_url,
    )
 
 def download(
@@ -873,7 +867,7 @@ def download(
     # on their own.
     cur.execute('SELECT url FROM urls LIMIT 1')
     url = cur.fetchone()[0]
-    outputdir = url_split(url)['root']
+    outputdir = url_split(url)['domain']
 
     if isinstance(bytespersecond, str):
         bytespersecond = bytestring.parsebytes(bytespersecond)
@@ -894,7 +888,8 @@ def download(
             localname=fullname,
             bytespersecond=bytespersecond,
             callback_progress=downloady.progress2,
-            overwrite=overwrite
+            headers=headers,
+            overwrite=overwrite,
         )
 
 def download_argparse(args):
@@ -905,7 +900,7 @@ def download_argparse(args):
         bytespersecond=args.bytespersecond,
     )
 
-def filter_pattern(databasename, regex, action='keep', *trash):
+def filter_pattern(databasename, regex, action='keep'):
     '''
     When `action` is 'keep', then any URLs matching the regex will have their
     `do_download` flag set to True.
@@ -930,15 +925,13 @@ def filter_pattern(databasename, regex, action='keep', *trash):
     items = cur.fetchall()
     for item in items:
         url = item[SQL_URL]
-        current_do_dl = item[SQL_DO_DOWNLOAD]
         for pattern in regex:
             contains = re.search(pattern, url) is not None
 
-            should_keep = (keep and contains)
-            if keep and contains and not current_do_dl:
+            if keep and contains and not item[SQL_DO_DOWNLOAD]:
                 write('Enabling "%s"' % url)
                 cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
-            if remove and contains and current_do_dl:
+            if remove and contains and item[SQL_DO_DOWNLOAD]:
                 write('Disabling "%s"' % url)
                 cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
     sql.commit()
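Based on the docstring and the loop above, `regex` is an iterable of patterns and `action` toggles the `do_download` flag on matching rows. A hedged usage sketch (database name and pattern are made up, and only the documented 'keep' action is shown; the disabling branch seen above uses an action name not visible in this hunk):

    import opendirdl  # assumption: opendirdl.py and its dependencies are importable

    # Mark every .mp3 and .flac URL in the database for download.
    opendirdl.filter_pattern('example.com.db', [r'\.mp3$', r'\.flac$'], action='keep')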
@@ -1079,7 +1072,7 @@ def tree(databasename, output_filename=None):
     collapsible boxes and clickable filenames. Otherwise the file will just
     be a plain text drawing.
     '''
-    tree = build_file_tree(databasename)
+    tree_root = build_file_tree(databasename)
 
     if output_filename is not None:
         output_file = open(output_filename, 'w', encoding='utf-8')
@@ -1093,8 +1086,8 @@ def tree(databasename, output_filename=None):
         write(HTML_TREE_HEAD, output_file)
         write('<body>', output_file)
 
-    size_details = recursive_get_size(tree)
-    recursive_print_node(tree, use_html=use_html, output_file=output_file)
+    size_details = recursive_get_size(tree_root)
+    recursive_print_node(tree_root, use_html=use_html, output_file=output_file)
     if size_details['unmeasured'] > 0:
         write(UNMEASURED_WARNING % size_details['unmeasured'], output_file)
 
@@ -1102,7 +1095,7 @@ def tree(databasename, output_filename=None):
     if use_html:
         write('</body>\n</html>', output_file)
         output_file.close()
-    return tree
+    return tree_root
 
 def tree_argparse(args):
     return tree(
@@ -1113,14 +1106,14 @@ def tree_argparse(args):
 ## COMMANDLINE FUNCTIONS ###########################################################################
 
 def main(argv):
-    if listget(argv, 1, '').lower() in ('help', '-h', '--help', ''):
-        write(DOCSTRING)
+    if safeindex(argv, 1, '').lower() in ('help', '-h', '--help', ''):
+        write(__doc__)
         return
     parser = argparse.ArgumentParser()
     subparsers = parser.add_subparsers()
 
     p_digest = subparsers.add_parser('digest')
-    p_digest.add_argument('walkurl')
+    p_digest.add_argument('root_url')
     p_digest.add_argument('-db', '--database', dest='databasename', default=None)
     p_digest.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
     p_digest.set_defaults(func=digest_argparse)
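With the parser above, the digest subcommand now takes `root_url` as its positional argument. The module docstring carries the authoritative usage text; a typical invocation inferred from this parser definition, with a made-up URL, would look like:

    python opendirdl.py digest http://example.com/files/ --database example.com.db --fullscan
    python opendirdl.py digest !c    # pull the root URL from the clipboard ('!clipboard' also works)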
@@ -10,19 +10,17 @@ def remove_finished(threads):
     threads = [t for t in threads if t.is_alive()]
     return threads
 
-def download_thread(url, filename_prefix=''):
+def download_thread(url, filename):
     url = url.strip()
     if url == '':
         return
 
-    basename = downloady.basename_from_url(url)
-    basename = filename_prefix + basename
-    if os.path.exists(basename):
-        print('Skipping existing file "%s"' % basename)
+    if os.path.exists(filename):
+        print('Skipping existing file "%s"' % filename)
         return
-    print('Starting "%s"' % basename)
-    downloady.download_file(url, basename)
-    print('Finished "%s"' % basename)
+    print(' Starting "%s"' % filename)
+    downloady.download_file(url, filename)
+    print('+Finished "%s"' % filename)
 
 def listget(li, index, fallback):
     try:
@@ -30,19 +28,21 @@ def listget(li, index, fallback):
     except IndexError:
         return fallback
 
-def threaded_dl(urls, thread_count, prefix=None):
+def threaded_dl(urls, thread_count, filename_format=None):
     threads = []
-    prefix_digits = len(str(len(urls)))
-    if prefix is None:
-        prefix = now = int(time.time())
-    prefix_text = '{prefix}_{{index:0{digits}d}}_'.format(prefix=prefix, digits=prefix_digits)
+    index_digits = len(str(len(urls)))
+    if filename_format is None:
+        filename_format = '{now}_{index}_{basename}'
+    filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)
+    now = int(time.time())
     for (index, url) in enumerate(urls):
         while len(threads) == thread_count:
             threads = remove_finished(threads)
             time.sleep(0.1)
 
-        prefix = prefix_text.format(index=index)
-        t = threading.Thread(target=download_thread, args=[url, prefix])
+        basename = downloady.basename_from_url(url)
+        filename = filename_format.format(now=now, index=index, basename=basename)
+        t = threading.Thread(target=download_thread, args=[url, filename])
         t.daemon = True
         threads.append(t)
         t.start()
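To make the new naming scheme concrete: with the default format, each thread's target filename is built from the epoch timestamp, the zero-padded position in the URL list, and the URL's basename. A standalone sketch of that expansion (all values below are made up):

    urls = ['http://example.com/music/%02d.mp3' % n for n in range(12)]

    index_digits = len(str(len(urls)))            # 2, so indexes are padded to two digits
    filename_format = '{now}_{index}_{basename}'  # the default used by threaded_dl
    filename_format = filename_format.replace('{index}', '{index:0%0dd}' % index_digits)

    now = 1475280000                              # int(time.time()) at run time
    print(filename_format.format(now=now, index=7, basename='07.mp3'))
    # 1475280000_07_07.mp3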
@@ -58,13 +58,12 @@ def main():
         f = open(filename, 'r')
         with f:
             urls = f.read()
-            urls = urls.split('\n')
     else:
         urls = clipext.resolve(filename)
-        urls = urls.split('\n')
+    urls = urls.split('\n')
     thread_count = int(listget(sys.argv, 2, 4))
-    prefix = listget(sys.argv, 3, None)
-    threaded_dl(urls, thread_count=thread_count, prefix=prefix)
+    filename_format = listget(sys.argv, 3, None)
+    threaded_dl(urls, thread_count=thread_count, filename_format=filename_format)
 
 if __name__ == '__main__':
     main()