This commit is contained in:
unknown 2016-07-29 13:39:04 -07:00
parent 25fd827eb1
commit 32221ce607
4 changed files with 316 additions and 307 deletions

View file

@ -23,67 +23,30 @@ CHUNKSIZE = 16 * bytestring.KIBIBYTE
STOP = False STOP = False
TIMEOUT = 600 TIMEOUT = 600
def download_file( def basename_from_url(url):
url, '''
localname=None, Determine the local filename appropriate for a URL.
auth=None, '''
bytespersecond=None, localname = urllib.parse.unquote(url)
callback_progress=None, localname = localname.split('?')[0]
headers=None, localname = localname.split('/')[-1]
overwrite=None return localname
def determine_seek_and_range(
file_handle,
localname,
local_exists,
overwrite,
remote_total_bytes,
server_respects_range,
user_provided_range,
user_range_min,
user_range_max,
): ):
if headers is None: ''' THINGS THAT CAN HAPPEN '''
headers = {}
''' Determine local filename '''
url = url.replace('%3A//', '://')
if localname in [None, '']:
localname = localize(url)
localname = filepath_sanitize(localname)
directory = os.path.split(localname)[0]
if directory != '':
os.makedirs(directory, exist_ok=True)
if bytespersecond is None:
limiter = None
else:
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
''' Prepare condition variables '''
local_exists = os.path.exists(localname)
if local_exists and overwrite is False:
print('Overwrite off. Nothing to do.')
return
user_provided_range = 'range' in headers
if user_provided_range:
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
user_range_max = headers['range'].split('-')[1]
if user_range_max != '':
user_range_max = int(user_range_max)
else:
# Included to determine whether the server supports this
headers['range'] = 'bytes=0-'
# I'm using a GET instead of an actual HEAD here because some servers respond
# differently, even though they're not supposed to.
head = request('get', url, stream=True, headers=headers, auth=auth)
remote_total_bytes = int(head.headers.get('content-length', 1))
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
seek_to = 0 seek_to = 0
header_range_min = None header_range_min = None
header_range_max = None header_range_max = None
head.connection.close()
if not user_provided_range:
del headers['range']
touch(localname)
file_handle = open(localname, 'r+b')
file_handle.seek(0)
''' THINGS THAT CAN HAPPEN '''
if local_exists: if local_exists:
local_existing_bytes = os.path.getsize(localname) local_existing_bytes = os.path.getsize(localname)
if overwrite is True: if overwrite is True:
@ -110,7 +73,7 @@ def download_file(
elif not user_provided_range: elif not user_provided_range:
if server_respects_range: if server_respects_range:
print('Resuming from %d' % local_existing_bytes) print('Resuming from byte %d' % local_existing_bytes)
header_range_min = local_existing_bytes header_range_min = local_existing_bytes
header_range_max = '' header_range_max = ''
seek_to = local_existing_bytes seek_to = local_existing_bytes
@ -142,7 +105,82 @@ def download_file(
elif not user_provided_range: elif not user_provided_range:
pass pass
return (seek_to, header_range_min, header_range_max)
def download_file(
url,
localname=None,
auth=None,
bytespersecond=None,
callback_progress=None,
headers=None,
overwrite=None
):
if headers is None:
headers = {}
''' Determine local filename '''
url = url.replace('%3A//', '://')
if localname in [None, '']:
localname = basename_from_url(url)
localname = filepath_sanitize(localname)
directory = os.path.split(localname)[0]
if directory != '':
os.makedirs(directory, exist_ok=True)
if bytespersecond is None:
limiter = None
else:
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
''' Prepare plan variables '''
local_exists = os.path.exists(localname)
if local_exists and overwrite is False:
print('Overwrite off. Nothing to do.')
return
user_provided_range = 'range' in headers
if user_provided_range:
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
user_range_max = headers['range'].split('-')[1]
if user_range_max != '':
user_range_max = int(user_range_max)
else:
# Included to determine whether the server supports this
headers['range'] = 'bytes=0-'
user_range_min = None
user_range_max = None
# I'm using a GET instead of an actual HEAD here because some servers respond
# differently, even though they're not supposed to.
head = request('get', url, stream=True, headers=headers, auth=auth)
remote_total_bytes = int(head.headers.get('content-length', 1))
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
head.connection.close()
if not user_provided_range:
del headers['range']
touch(localname)
file_handle = open(localname, 'r+b')
file_handle.seek(0)
plan = determine_seek_and_range(
file_handle=file_handle,
localname=localname,
local_exists=local_exists,
overwrite=overwrite,
remote_total_bytes=remote_total_bytes,
server_respects_range=server_respects_range,
user_provided_range=user_provided_range,
user_range_min=user_range_min,
user_range_max=user_range_max,
)
if plan is None:
return
(seek_to, header_range_min, header_range_max) = plan
if header_range_min is not None: if header_range_min is not None:
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max) headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max)
@ -176,15 +214,6 @@ def get_permission(prompt='y/n\n>', affirmative=['y', 'yes']):
def is_clipboard(s):
    '''
    Return True if `s`, compared case-insensitively, is one of the clipboard
    shorthand tokens.
    '''
    clipboard_tokens = ['!c', '!clip', '!clipboard']
    token = s.lower()
    return token in clipboard_tokens
def localize(url):
    '''
    Determine the local filename appropriate for a URL.

    The query string is discarded, the remainder is percent-decoded, and the
    text after the final slash is returned. A URL ending in a slash yields an
    empty string.
    '''
    # Cut the query string off while the URL is still encoded. Decoding first
    # would turn an escaped %3F inside the filename into a literal '?' and the
    # name would be truncated at that point.
    localname = url.split('?')[0]
    localname = urllib.parse.unquote(localname)
    # Take the basename after decoding so an escaped %2F can never leave a
    # path separator in the result.
    localname = localname.split('/')[-1]
    return localname
def progress(bytes_downloaded, bytes_total, prefix=''): def progress(bytes_downloaded, bytes_total, prefix=''):
divisor = bytestring.get_appropriate_divisor(bytes_total) divisor = bytestring.get_appropriate_divisor(bytes_total)
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor) bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
@ -216,7 +245,7 @@ def progress(bytes_downloaded, bytes_total, prefix=''):
def progress2(bytes_downloaded, bytes_total, prefix=''): def progress2(bytes_downloaded, bytes_total, prefix=''):
percent = (bytes_downloaded*100)/bytes_total percent = (bytes_downloaded*100)/bytes_total
percent = min(100, percent) percent = min(100, percent)
percent = '%08.4f' % percent percent_string = '%08.4f' % percent
bytes_downloaded_string = '{0:,}'.format(bytes_downloaded) bytes_downloaded_string = '{0:,}'.format(bytes_downloaded)
bytes_total_string = '{0:,}'.format(bytes_total) bytes_total_string = '{0:,}'.format(bytes_total)
bytes_downloaded_string = bytes_downloaded_string.rjust(len(bytes_total_string), ' ') bytes_downloaded_string = bytes_downloaded_string.rjust(len(bytes_total_string), ' ')
@ -227,7 +256,7 @@ def progress2(bytes_downloaded, bytes_total, prefix=''):
prefix=prefix, prefix=prefix,
bytes_downloaded=bytes_downloaded_string, bytes_downloaded=bytes_downloaded_string,
bytes_total=bytes_total_string, bytes_total=bytes_total_string,
percent=percent, percent=percent_string,
) )
print(message, end=end, flush=True) print(message, end=end, flush=True)

View file

@ -0,0 +1,7 @@
javascript:
function rename()
{
document.title = prompt("New page title:");
}
rename();

View file

@ -1,6 +1,19 @@
Open Dir DL Open Dir DL
=========== ===========
The open directory downloader
Requires `pip install beautifulsoup4`
See inside opendirdl.py for usage instructions.
- 2016 07 29
- Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
- Renamed `Tree.listnodes` to `Tree.list_children` and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
- Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
- Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
- Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
- 2016 07 25 - 2016 07 25
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes. - Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc. - Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
@ -19,9 +32,5 @@ Open Dir DL
- Added new argparse command "tree" - Added new argparse command "tree"
- 2016 02 08 - 2016 02 08
- Fixed bug where server:port urls did not create db files. - Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
- Moved db commits to only happen at the end of a digest. - Moved db commits to only happen at the end of a digest.
Requires `pip install beautifulsoup4`
See inside opendirdl.py for usage instructions.

View file

@ -1,3 +1,5 @@
# voussoir
DOCSTRING=''' DOCSTRING='''
OpenDirDL OpenDirDL
downloads open directories downloads open directories
@ -45,8 +47,9 @@ download:
in the output directory. in the output directory.
-bps 100 | --bytespersecond 100: -bps 100 | --bytespersecond 100:
Ratelimit yourself to downloading at 100 BYTES per second. -bps 100k | -bps "100 kb" | -bps 100kib | -bps 1.2m
The webmaster will appreciate this. Ratelimit your download speed. Supports units like "k", "m" according
to `bytestring.parsebytes`.
keep_pattern: keep_pattern:
Enable URLs which match a regex pattern. Matches are based on the percent- Enable URLs which match a regex pattern. Matches are based on the percent-
@ -61,8 +64,8 @@ remove_pattern:
> opendirdl remove_pattern website.com.db ".*" > opendirdl remove_pattern website.com.db ".*"
list_basenames: list_basenames:
List Enabled URLs in order of their base filename. This makes it easier to List Enabled URLs alphabetized by their base filename. This makes it easier
find titles of interest in a directory that is very scattered or poorly to find titles of interest in a directory that is very scattered or poorly
organized. organized.
> opendirdl list_basenames website.com.db <flags> > opendirdl list_basenames website.com.db <flags>
@ -112,8 +115,9 @@ import sys
# Please consult my github repo for these files # Please consult my github repo for these files
# https://github.com/voussoir/else # https://github.com/voussoir/else
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter sys.path.append('C:\\git\\else\\Downloady'); import downloady
sys.path.append('C:\\git\\else\\bytestring'); import bytestring sys.path.append('C:\\git\\else\\Bytestring'); import bytestring
sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
import argparse import argparse
## ~import bs4 ## ~import bs4
@ -278,7 +282,7 @@ class Walker:
databasename = databasename.replace(':', '#') databasename = databasename.replace(':', '#')
self.databasename = databasename self.databasename = databasename
safeprint('Opening %s' % self.databasename) write('Opening %s' % self.databasename)
self.sql = sqlite3.connect(self.databasename) self.sql = sqlite3.connect(self.databasename)
self.cur = self.sql.cursor() self.cur = self.sql.cursor()
db_init(self.sql, self.cur) db_init(self.sql, self.cur)
@ -346,20 +350,20 @@ class Walker:
if not url.startswith(self.walkurl): if not url.startswith(self.walkurl):
# Don't follow external links or parent directory. # Don't follow external links or parent directory.
safeprint('Skipping "%s" due to external url.' % url) write('Skipping "%s" due to external url.' % url)
return return
urll = url.lower() urll = url.lower()
if self.fullscan is False: if self.fullscan is False:
skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES) skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
if skippable: if skippable:
safeprint('Skipping "%s" due to extension.' % url) write('Skipping "%s" due to extension.' % url)
self.smart_insert(url=url, commit=False) self.smart_insert(url=url, commit=False)
return return
self.cur.execute('SELECT * FROM urls WHERE url == ?', [url]) self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
skippable = self.cur.fetchone() is not None skippable = self.cur.fetchone() is not None
if skippable: if skippable:
safeprint('Skipping "%s" since we already have it.' % url) write('Skipping "%s" since we already have it.' % url)
return return
try: try:
@ -468,13 +472,13 @@ class TreeNode:
del self.parent.children[self.identifier] del self.parent.children[self.identifier]
self.parent = None self.parent = None
def list_children(self, customsort=None):
    '''
    Return the child nodes of this node as a sorted list.

    customsort:
        Optional key function that receives a child node. When omitted,
        children are ordered by their lowercased identifier.
    '''
    if customsort is None:
        customsort = lambda node: node.identifier.lower()
    return sorted(self.children.values(), key=customsort)
def merge_other(self, othertree, otherroot=None): def merge_other(self, othertree, otherroot=None):
newroot = None newroot = None
@ -490,16 +494,9 @@ class TreeNode:
self.check_child_availability(newroot) self.check_child_availability(newroot)
self.children[newroot] = othertree self.children[newroot] = othertree
def sorted_children(self, customsort=None):
    '''
    Yield `(key, child)` pairs from `self.children`, ordered by key.

    customsort:
        Optional key function applied to the child keys when sorting.
    '''
    for child_key in sorted(self.children, key=customsort):
        yield (child_key, self.children[child_key])
def walk(self, customsort=None):
    '''
    Yield this node, then every descendant, depth-first.

    customsort:
        Optional key function forwarded to `list_children` at every level to
        control the order in which siblings are visited.
    '''
    yield self
    # `listnodes` was renamed to `list_children`; calling the old name here
    # would raise AttributeError as soon as the generator is advanced past
    # the first node.
    for child in self.list_children(customsort=customsort):
        yield from child.walk(customsort=customsort)
## ## ## ##
## OTHER CLASSES ################################################################################### ## OTHER CLASSES ###################################################################################
@ -507,6 +504,88 @@ class TreeNode:
## GENERAL FUNCTIONS ############################################################################### ## GENERAL FUNCTIONS ###############################################################################
## ## ## ##
def build_file_tree(databasename):
    '''
    Build a tree of TreeNodes from the enabled URLs of a database.

    databasename:
        Path of the sqlite database to read. Only rows of `urls` with
        `do_download == 1` are used.

    Returns the root TreeNode, or None if there are no such rows.
    '''
    sql = sqlite3.connect(databasename)
    try:
        cur = sql.cursor()
        cur.execute('SELECT * FROM urls WHERE do_download == 1')
        items = cur.fetchall()
    finally:
        # Close the connection even when the query raises, so a failed call
        # does not leave the database handle open.
        sql.close()
    if not items:
        return None

    items.sort(key=lambda x: x[SQL_URL])

    # The root of the tree is taken from the first URL in sorted order.
    path_parts = url_to_filepath(items[0][SQL_URL])
    root_identifier = path_parts['root']
    root_data = {'name': root_identifier, 'item_type': 'directory'}
    # Identifiers have their colons stripped; the display name keeps them.
    root_identifier = root_identifier.replace(':', '')
    tree = TreeNode(
        identifier=root_identifier,
        data=root_data
    )
    # Maps identifier -> TreeNode for every node created so far.
    node_map = {}

    for item in items:
        path = url_to_filepath(item[SQL_URL])
        scheme = path['scheme']

        # I join and re-split because 'folder' may contain slashes of its own
        # and I want to break all the pieces
        path = '\\'.join([path['root'], path['folder'], path['filename']])
        parts = path.split('\\')
        for (index, part) in enumerate(parts):
            this_path = '/'.join(parts[:index + 1])
            parent_path = '/'.join(parts[:index])
            data = {
                'name': part,
                'url': scheme + '://' + this_path,
            }
            this_identifier = this_path.replace(':', '')
            parent_identifier = parent_path.replace(':', '')

            if (index + 1) == len(parts):
                # The last part of the path is the file itself.
                data['item_type'] = 'file'
                if item[SQL_CONTENT_LENGTH]:
                    data['size'] = item[SQL_CONTENT_LENGTH]
                else:
                    data['size'] = None
            else:
                data['item_type'] = 'directory'

            this_node = node_map.get(this_identifier, None)
            if this_node:
                # This ID was detected as a parent of a previous iteration
                # Now we're actually filling it in.
                this_node.data = data
            else:
                this_node = TreeNode(this_identifier, data)
                node_map[this_identifier] = this_node

            # Attach this node to the parent.
            if parent_identifier == root_identifier:
                try:
                    tree.add_child(this_node)
                except TreeExistingChild:
                    # Already attached by an earlier URL sharing this path.
                    pass
            else:
                parent_node = node_map.get(parent_identifier, None)
                if not parent_node:
                    # Placeholder whose data is filled in if the parent is
                    # later encountered as a path part of its own.
                    parent_node = TreeNode(parent_identifier, data=None)
                    node_map[parent_identifier] = parent_node
                try:
                    parent_node.add_child(this_node)
                except TreeExistingChild:
                    pass
                this_node.parent = parent_node
    return tree
def db_init(sql, cur): def db_init(sql, cur):
lines = DB_INIT.split(';') lines = DB_INIT.split(';')
for line in lines: for line in lines:
@ -522,38 +601,13 @@ def do_head(url, raise_for_status=True):
def do_request(message, method, url, raise_for_status=True):
    '''
    Call `method(url)`, reporting the request and its status code via `write`.

    message:
        Short label written before the URL, right-aligned to four characters.
    method:
        Callable that takes the URL and returns a response object.
    raise_for_status:
        When true, call `response.raise_for_status()` before returning.

    Returns the response object.
    '''
    write('{message:>4s}: {url} : '.format(message=message, url=url), end='', flush=True)
    response = method(url)
    write(response.status_code)
    if not raise_for_status:
        return response
    response.raise_for_status()
    return response
def download_file(url, filehandle, hookfunction=None, headers=None, bytespersecond=None):
    '''
    Stream `url` into `filehandle` in chunks of DOWNLOAD_CHUNK bytes.

    filehandle:
        Writable binary file object. It is closed by this function before the
        final size check.
    hookfunction:
        Optional callable invoked after each chunk as
        `hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)`.
    headers:
        Optional dict of request headers. Defaults to an empty dict.
    bytespersecond:
        Optional rate limit passed to `ratelimiter.Ratelimiter`.

    Returns True on success.
    Raises Exception if the file on disk is smaller than the reported
    Content-Length.
    '''
    # A mutable `{}` default would be shared between every call.
    if headers is None:
        headers = {}

    if bytespersecond is not None:
        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)
    else:
        limiter = None

    currentblock = 0
    downloading = requests.get(url, stream=True, headers=headers)
    # Falls back to 1 when the server does not report a Content-Length.
    totalsize = int(downloading.headers.get('content-length', 1))
    for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
        if not chunk:
            break
        currentblock += 1
        filehandle.write(chunk)
        if limiter is not None:
            limiter.limit(len(chunk))
        if hookfunction is not None:
            hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)

    # Close first so the size on disk reflects everything that was written.
    filehandle.close()
    size = os.path.getsize(filehandle.name)
    if size < totalsize:
        raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
    return True
def fetch_generator(cur): def fetch_generator(cur):
while True: while True:
fetch = cur.fetchone() fetch = cur.fetchone()
@ -581,17 +635,6 @@ def hashit(text, length=None):
h = h[:length] h = h[:length]
return h return h
def hook1(currentblock, chunksize, totalsize):
    '''
    Print a `downloaded / total bytes` progress line that overwrites itself,
    followed by a newline once the downloaded amount reaches the total.
    '''
    # The block count can overshoot on the final partial chunk, so clamp it.
    received = min(currentblock * chunksize, totalsize)
    total_string = '{:,}'.format(totalsize)
    received_string = '{:,}'.format(received).rjust(len(total_string), ' ')
    print('%s / %s bytes' % (received_string, total_string), end='\r')
    if received_string == total_string:
        print()
def listget(l, index, default=None): def listget(l, index, default=None):
try: try:
return l[index] return l[index]
@ -604,10 +647,79 @@ def longest_length(li):
longest = max(longest, len(item)) longest = max(longest, len(item))
return longest return longest
def recursive_get_size(node):
    '''
    Calculate the size of the Directory nodes by summing the sizes of all children.
    Modifies the nodes in-place.

    Returns a dict with two keys:
    'size': for a file, its own `size` (which may be None); for a directory,
        the sum of its children's sizes with None counted as 0.
    'unmeasured': the number of files at or below this node whose size is None.
    '''
    if node.data['item_type'] == 'file':
        file_size = node.data['size']
        return {
            'size': file_size,
            'unmeasured': 1 if file_size is None else 0,
        }

    total_size = 0
    total_unmeasured = 0
    for child in node.list_children():
        details = recursive_get_size(child)
        total_size += details['size'] or 0
        total_unmeasured += details['unmeasured']
    # Only directories get their size written back.
    node.data['size'] = total_size
    return {'size': total_size, 'unmeasured': total_unmeasured}
def recursive_print_node(node, depth=0, use_html=False, output_file=None):
    '''
    Given a tree node (presumably the root), print it and all of its children.

    depth:
        Nesting level of `node`; controls text indentation and the alternating
        CSS class in HTML mode.
    use_html:
        When true, emit collapsible `<button>`/`<div>` markup for directories
        and `<a>` links for files. Otherwise emit a plain text drawing.
    output_file:
        Passed to `write` as the destination for every line.
    '''
    import html

    size = node.data['size']
    if size is None:
        size = UNKNOWN_SIZE_STRING
    else:
        size = bytestring.bytestring(size)

    if use_html:
        # Names and URLs originate from the crawled server, so escape them
        # before placing them into markup or a quoted attribute.
        name = html.escape(node.data['name'])
        css_class = 'directory_even' if depth % 2 == 0 else 'directory_odd'
        if node.data['item_type'] == 'directory':
            div_id = hashit(node.identifier, 16)
            line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
            line += '<div class="{css}" id="{div_id}" style="display:none">'
            line = line.format(div_id=div_id, name=name, size=size, css=css_class)
        else:
            line = '<a href="{url}">{name} ({size})</a><br>'
            line = line.format(url=html.escape(node.data['url']), name=name, size=size)
    else:
        line = '{space}{bar}{name} : ({size})'
        line = line.format(
            space='|   ' * (depth-1),
            bar='|---' if depth > 0 else '',
            name=node.data['name'],
            size=size
        )
    write(line, output_file)

    # Sort by type (directories first) then subsort by lowercase path
    def customsort(child):
        return (
            child.data['item_type'] == 'file',
            child.data['url'].lower(),
        )

    for child in node.list_children(customsort=customsort):
        recursive_print_node(child, depth=depth+1, use_html=use_html, output_file=output_file)

    if node.data['item_type'] == 'directory':
        if use_html:
            # Close the <div> opened for this directory above.
            write('</div>', output_file)
        else:
            # This helps put some space between sibling directories
            write('|   ' * (depth), output_file)
def safeprint(text, **kwargs):
    '''
    Print `text` with every non-ASCII character replaced by '?'.

    Extra keyword arguments are forwarded to `print`.
    '''
    ascii_text = str(text).encode('ascii', 'replace').decode()
    print(ascii_text, **kwargs)
def smart_insert(sql, cur, url=None, head=None, commit=True): def smart_insert(sql, cur, url=None, head=None, commit=True):
@ -686,11 +798,11 @@ def url_to_filepath(text):
} }
return result return result
def write(line, file_handle=None, **kwargs):
    '''
    Write one line of output to the console or to an open text file.

    file_handle:
        When None, the line goes through `safeprint`. Otherwise it is written
        to this file object.
    kwargs:
        `print`-style options such as `end` and `flush`, honoured for both
        destinations.
    '''
    if file_handle is None:
        safeprint(line, **kwargs)
    else:
        # file.write() accepts no keyword arguments and requires a str, so go
        # through print(): it appends the newline by default, respects
        # end/flush, and stringifies non-string lines such as status codes.
        print(line, file=file_handle, **kwargs)
## ## ## ##
## GENERAL FUNCTIONS ############################################################################### ## GENERAL FUNCTIONS ###############################################################################
@ -700,7 +812,7 @@ def write(line, file_handle=None):
def digest(databasename, walkurl, fullscan=False): def digest(databasename, walkurl, fullscan=False):
if walkurl in ('!clipboard', '!c'): if walkurl in ('!clipboard', '!c'):
walkurl = get_clipboard() walkurl = get_clipboard()
safeprint('From clipboard: %s' % walkurl) write('From clipboard: %s' % walkurl)
walker = Walker( walker = Walker(
databasename=databasename, databasename=databasename,
fullscan=fullscan, fullscan=fullscan,
@ -773,14 +885,19 @@ def download(
if overwrite: if overwrite:
os.remove(fullname) os.remove(fullname)
else: else:
safeprint('Skipping "%s". Use `--overwrite`' % fullname) write('Skipping "%s". Use `--overwrite`' % fullname)
continue continue
safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename)) overwrite = overwrite or None
filehandle = open(temporary_fullname, 'wb') write('Downloading "%s" as "%s"' % (fullname, temporary_basename))
with filehandle: downloady.download_file(
download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond) url,
os.rename(temporary_fullname, fullname) localname=temporary_fullname,
bytespersecond=bytespersecond,
callback_progress=downloady.progress2,
overwrite=overwrite
)
os.rename(temporary_fullname, fullname)
def download_argparse(args): def download_argparse(args):
return download( return download(
@ -821,10 +938,10 @@ def filter_pattern(databasename, regex, action='keep', *trash):
should_keep = (keep and contains) should_keep = (keep and contains)
if keep and contains and not current_do_dl: if keep and contains and not current_do_dl:
safeprint('Enabling "%s"' % url) write('Enabling "%s"' % url)
cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url]) cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
if remove and contains and current_do_dl: if remove and contains and current_do_dl:
safeprint('Disabling "%s"' % url) write('Disabling "%s"' % url)
cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url]) cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
sql.commit() sql.commit()
@ -914,7 +1031,7 @@ def measure(databasename, fullscan=False, new_only=False):
fetch = smart_insert(sql, cur, head=head, commit=True) fetch = smart_insert(sql, cur, head=head, commit=True)
size = fetch[SQL_CONTENT_LENGTH] size = fetch[SQL_CONTENT_LENGTH]
if size is None: if size is None:
safeprint('"%s" is not revealing Content-Length' % url) write('"%s" is not revealing Content-Length' % url)
size = 0 size = 0
@ -961,159 +1078,7 @@ def tree(databasename, output_filename=None):
collapsible boxes and clickable filenames. Otherwise the file will just collapsible boxes and clickable filenames. Otherwise the file will just
be a plain text drawing. be a plain text drawing.
''' '''
tree = build_file_tree(databasename)
sql = sqlite3.connect(databasename)
cur = sql.cursor()
cur.execute('SELECT * FROM urls WHERE do_download == 1')
items = cur.fetchall()
if len(items) == 0:
return
items.sort(key=lambda x: x[SQL_URL])
path_parts = url_to_filepath(items[0][SQL_URL])
root_identifier = path_parts['root']
#print('Root', root_identifier)
root_data = {'name': root_identifier, 'item_type': 'directory'}
root_identifier = root_identifier.replace(':', '')
tree = TreeNode(
identifier=root_identifier,
data=root_data
)
node_map = {}
unmeasured_file_count = 0
for item in items:
path = url_to_filepath(item[SQL_URL])
scheme = path['scheme']
# I join and re-split because 'folder' may contain slashes of its own
# and I want to break all the pieces
path = '\\'.join([path['root'], path['folder'], path['filename']])
parts = path.split('\\')
#print(path)
for (index, part) in enumerate(parts):
this_path = '/'.join(parts[:index + 1])
parent_path = '/'.join(parts[:index])
#safeprint('this:' + this_path)
#safeprint('parent:' + parent_path)
#input()
data = {
'name': part,
'url': scheme + '://' + this_path,
}
this_identifier = this_path.replace(':', '')
parent_identifier = parent_path.replace(':', '')
if (index + 1) == len(parts):
data['item_type'] = 'file'
if item[SQL_CONTENT_LENGTH]:
data['size'] = item[SQL_CONTENT_LENGTH]
else:
unmeasured_file_count += 1
data['size'] = None
else:
data['item_type'] = 'directory'
# Ensure this comment is in a node of its own
this_node = node_map.get(this_identifier, None)
if this_node:
# This ID was detected as a parent of a previous iteration
# Now we're actually filling it in.
this_node.data = data
else:
this_node = TreeNode(this_identifier, data)
node_map[this_identifier] = this_node
# Attach this node to the parent.
if parent_identifier == root_identifier:
try:
tree.add_child(this_node)
except TreeExistingChild:
pass
else:
parent_node = node_map.get(parent_identifier, None)
if not parent_node:
parent_node = TreeNode(parent_identifier, data=None)
node_map[parent_identifier] = parent_node
try:
parent_node.add_child(this_node)
except TreeExistingChild:
pass
this_node.parent = parent_node
#print(this_node.data)
def recursive_get_size(node):
size = node.data.get('size', 0)
if size:
# Files have this attribute, dirs don't
return size
for child in node.children.values():
child_size = recursive_get_size(child)
child_size = child_size or 0
size += child_size
node.data['size'] = size
return size
def recursive_print_node(node, depth=0, output_file=None):
size = node.data['size']
if size is None:
size = UNKNOWN_SIZE_STRING
else:
size = bytestring.bytestring(size)
if use_html:
if depth % 2 == 0:
css_class = 'directory_even'
else:
css_class = 'directory_odd'
if node.data['item_type'] == 'directory':
div_id = hashit(node.identifier, 16)
line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
line += '<div class="%s" id="{div_id}" style="display:none">' % css_class
line = line.format(
div_id=div_id,
name=node.data['name'],
size=size,
)
else:
line = '<a href="{url}">{name} ({size})</a><br>'
line = line.format(
url=node.data['url'],
name=node.data['name'],
size=size,
)
else:
line = '{space}{bar}{name} : ({size})'
line = line.format(
space='| '*(depth-1),
bar='|---' if depth > 0 else '',
name=node.data['name'],
size=size
)
write(line, output_file)
# Sort by type (directories first) then subsort by lowercase path
customsort = lambda x: (
node.children[x].data['item_type'] == 'file',
node.children[x].data['url'].lower(),
)
for (key, child) in node.sorted_children(customsort=customsort):
recursive_print_node(child, depth=depth+1, output_file=output_file)
if node.data['item_type'] == 'directory':
if use_html:
write('</div>', output_file)
else:
# This helps put some space between sibling directories
write('| ' * (depth), output_file)
if output_filename is not None: if output_filename is not None:
output_file = open(output_filename, 'w', encoding='utf-8') output_file = open(output_filename, 'w', encoding='utf-8')
@ -1122,14 +1087,13 @@ def tree(databasename, output_filename=None):
output_file = None output_file = None
use_html = False use_html = False
if use_html: if use_html:
write(HTML_TREE_HEADER, file_handle=output_file) write(HTML_TREE_HEADER, output_file)
recursive_get_size(tree) size_details = recursive_get_size(tree)
recursive_print_node(tree, output_file=output_file) recursive_print_node(tree, use_html=use_html, output_file=output_file)
if unmeasured_file_count > 0: if size_details['unmeasured'] > 0:
write(UNMEASURED_WARNING % unmeasured_file_count, file_handle=output_file) write(UNMEASURED_WARNING % size_details['unmeasured'], output_file)
if output_file is not None: if output_file is not None:
output_file.close() output_file.close()