else
This commit is contained in:
parent
25fd827eb1
commit
32221ce607
4 changed files with 316 additions and 307 deletions
|
@ -23,67 +23,30 @@ CHUNKSIZE = 16 * bytestring.KIBIBYTE
|
||||||
STOP = False
|
STOP = False
|
||||||
TIMEOUT = 600
|
TIMEOUT = 600
|
||||||
|
|
||||||
def download_file(
|
def basename_from_url(url):
|
||||||
url,
|
'''
|
||||||
localname=None,
|
Determine the local filename appropriate for a URL.
|
||||||
auth=None,
|
'''
|
||||||
bytespersecond=None,
|
localname = urllib.parse.unquote(url)
|
||||||
callback_progress=None,
|
localname = localname.split('?')[0]
|
||||||
headers=None,
|
localname = localname.split('/')[-1]
|
||||||
overwrite=None
|
return localname
|
||||||
|
|
||||||
|
def determine_seek_and_range(
|
||||||
|
file_handle,
|
||||||
|
localname,
|
||||||
|
local_exists,
|
||||||
|
overwrite,
|
||||||
|
remote_total_bytes,
|
||||||
|
server_respects_range,
|
||||||
|
user_provided_range,
|
||||||
|
user_range_min,
|
||||||
|
user_range_max,
|
||||||
):
|
):
|
||||||
if headers is None:
|
''' THINGS THAT CAN HAPPEN '''
|
||||||
headers = {}
|
|
||||||
''' Determine local filename '''
|
|
||||||
url = url.replace('%3A//', '://')
|
|
||||||
if localname in [None, '']:
|
|
||||||
localname = localize(url)
|
|
||||||
|
|
||||||
localname = filepath_sanitize(localname)
|
|
||||||
|
|
||||||
directory = os.path.split(localname)[0]
|
|
||||||
if directory != '':
|
|
||||||
os.makedirs(directory, exist_ok=True)
|
|
||||||
|
|
||||||
if bytespersecond is None:
|
|
||||||
limiter = None
|
|
||||||
else:
|
|
||||||
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
|
|
||||||
|
|
||||||
''' Prepare condition variables '''
|
|
||||||
local_exists = os.path.exists(localname)
|
|
||||||
if local_exists and overwrite is False:
|
|
||||||
print('Overwrite off. Nothing to do.')
|
|
||||||
return
|
|
||||||
|
|
||||||
user_provided_range = 'range' in headers
|
|
||||||
if user_provided_range:
|
|
||||||
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
|
|
||||||
user_range_max = headers['range'].split('-')[1]
|
|
||||||
if user_range_max != '':
|
|
||||||
user_range_max = int(user_range_max)
|
|
||||||
else:
|
|
||||||
# Included to determine whether the server supports this
|
|
||||||
headers['range'] = 'bytes=0-'
|
|
||||||
|
|
||||||
# I'm using a GET instead of an actual HEAD here because some servers respond
|
|
||||||
# differently, even though they're not supposed to.
|
|
||||||
head = request('get', url, stream=True, headers=headers, auth=auth)
|
|
||||||
remote_total_bytes = int(head.headers.get('content-length', 1))
|
|
||||||
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
|
|
||||||
seek_to = 0
|
seek_to = 0
|
||||||
header_range_min = None
|
header_range_min = None
|
||||||
header_range_max = None
|
header_range_max = None
|
||||||
head.connection.close()
|
|
||||||
|
|
||||||
if not user_provided_range:
|
|
||||||
del headers['range']
|
|
||||||
|
|
||||||
touch(localname)
|
|
||||||
file_handle = open(localname, 'r+b')
|
|
||||||
file_handle.seek(0)
|
|
||||||
|
|
||||||
''' THINGS THAT CAN HAPPEN '''
|
|
||||||
if local_exists:
|
if local_exists:
|
||||||
local_existing_bytes = os.path.getsize(localname)
|
local_existing_bytes = os.path.getsize(localname)
|
||||||
if overwrite is True:
|
if overwrite is True:
|
||||||
|
@ -110,7 +73,7 @@ def download_file(
|
||||||
|
|
||||||
elif not user_provided_range:
|
elif not user_provided_range:
|
||||||
if server_respects_range:
|
if server_respects_range:
|
||||||
print('Resuming from %d' % local_existing_bytes)
|
print('Resuming from byte %d' % local_existing_bytes)
|
||||||
header_range_min = local_existing_bytes
|
header_range_min = local_existing_bytes
|
||||||
header_range_max = ''
|
header_range_max = ''
|
||||||
seek_to = local_existing_bytes
|
seek_to = local_existing_bytes
|
||||||
|
@ -142,7 +105,82 @@ def download_file(
|
||||||
|
|
||||||
elif not user_provided_range:
|
elif not user_provided_range:
|
||||||
pass
|
pass
|
||||||
|
return (seek_to, header_range_min, header_range_max)
|
||||||
|
|
||||||
|
def download_file(
|
||||||
|
url,
|
||||||
|
localname=None,
|
||||||
|
auth=None,
|
||||||
|
bytespersecond=None,
|
||||||
|
callback_progress=None,
|
||||||
|
headers=None,
|
||||||
|
overwrite=None
|
||||||
|
):
|
||||||
|
if headers is None:
|
||||||
|
headers = {}
|
||||||
|
''' Determine local filename '''
|
||||||
|
url = url.replace('%3A//', '://')
|
||||||
|
if localname in [None, '']:
|
||||||
|
localname = basename_from_url(url)
|
||||||
|
|
||||||
|
localname = filepath_sanitize(localname)
|
||||||
|
|
||||||
|
directory = os.path.split(localname)[0]
|
||||||
|
if directory != '':
|
||||||
|
os.makedirs(directory, exist_ok=True)
|
||||||
|
|
||||||
|
if bytespersecond is None:
|
||||||
|
limiter = None
|
||||||
|
else:
|
||||||
|
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
|
||||||
|
|
||||||
|
''' Prepare plan variables '''
|
||||||
|
local_exists = os.path.exists(localname)
|
||||||
|
if local_exists and overwrite is False:
|
||||||
|
print('Overwrite off. Nothing to do.')
|
||||||
|
return
|
||||||
|
|
||||||
|
user_provided_range = 'range' in headers
|
||||||
|
if user_provided_range:
|
||||||
|
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
|
||||||
|
user_range_max = headers['range'].split('-')[1]
|
||||||
|
if user_range_max != '':
|
||||||
|
user_range_max = int(user_range_max)
|
||||||
|
else:
|
||||||
|
# Included to determine whether the server supports this
|
||||||
|
headers['range'] = 'bytes=0-'
|
||||||
|
user_range_min = None
|
||||||
|
user_range_max = None
|
||||||
|
|
||||||
|
# I'm using a GET instead of an actual HEAD here because some servers respond
|
||||||
|
# differently, even though they're not supposed to.
|
||||||
|
head = request('get', url, stream=True, headers=headers, auth=auth)
|
||||||
|
remote_total_bytes = int(head.headers.get('content-length', 1))
|
||||||
|
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
|
||||||
|
head.connection.close()
|
||||||
|
|
||||||
|
if not user_provided_range:
|
||||||
|
del headers['range']
|
||||||
|
|
||||||
|
touch(localname)
|
||||||
|
file_handle = open(localname, 'r+b')
|
||||||
|
file_handle.seek(0)
|
||||||
|
|
||||||
|
plan = determine_seek_and_range(
|
||||||
|
file_handle=file_handle,
|
||||||
|
localname=localname,
|
||||||
|
local_exists=local_exists,
|
||||||
|
overwrite=overwrite,
|
||||||
|
remote_total_bytes=remote_total_bytes,
|
||||||
|
server_respects_range=server_respects_range,
|
||||||
|
user_provided_range=user_provided_range,
|
||||||
|
user_range_min=user_range_min,
|
||||||
|
user_range_max=user_range_max,
|
||||||
|
)
|
||||||
|
if plan is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
(seek_to, header_range_min, header_range_max) = plan
|
||||||
if header_range_min is not None:
|
if header_range_min is not None:
|
||||||
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max)
|
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max)
|
||||||
|
|
||||||
|
@ -176,15 +214,6 @@ def get_permission(prompt='y/n\n>', affirmative=['y', 'yes']):
|
||||||
def is_clipboard(s):
|
def is_clipboard(s):
|
||||||
return s.lower() in ['!c', '!clip', '!clipboard']
|
return s.lower() in ['!c', '!clip', '!clipboard']
|
||||||
|
|
||||||
def localize(url):
|
|
||||||
'''
|
|
||||||
Determine the local filename appropriate for a URL.
|
|
||||||
'''
|
|
||||||
localname = urllib.parse.unquote(url)
|
|
||||||
localname = localname.split('?')[0]
|
|
||||||
localname = localname.split('/')[-1]
|
|
||||||
return localname
|
|
||||||
|
|
||||||
def progress(bytes_downloaded, bytes_total, prefix=''):
|
def progress(bytes_downloaded, bytes_total, prefix=''):
|
||||||
divisor = bytestring.get_appropriate_divisor(bytes_total)
|
divisor = bytestring.get_appropriate_divisor(bytes_total)
|
||||||
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
|
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
|
||||||
|
@ -216,7 +245,7 @@ def progress(bytes_downloaded, bytes_total, prefix=''):
|
||||||
def progress2(bytes_downloaded, bytes_total, prefix=''):
|
def progress2(bytes_downloaded, bytes_total, prefix=''):
|
||||||
percent = (bytes_downloaded*100)/bytes_total
|
percent = (bytes_downloaded*100)/bytes_total
|
||||||
percent = min(100, percent)
|
percent = min(100, percent)
|
||||||
percent = '%08.4f' % percent
|
percent_string = '%08.4f' % percent
|
||||||
bytes_downloaded_string = '{0:,}'.format(bytes_downloaded)
|
bytes_downloaded_string = '{0:,}'.format(bytes_downloaded)
|
||||||
bytes_total_string = '{0:,}'.format(bytes_total)
|
bytes_total_string = '{0:,}'.format(bytes_total)
|
||||||
bytes_downloaded_string = bytes_downloaded_string.rjust(len(bytes_total_string), ' ')
|
bytes_downloaded_string = bytes_downloaded_string.rjust(len(bytes_total_string), ' ')
|
||||||
|
@ -227,7 +256,7 @@ def progress2(bytes_downloaded, bytes_total, prefix=''):
|
||||||
prefix=prefix,
|
prefix=prefix,
|
||||||
bytes_downloaded=bytes_downloaded_string,
|
bytes_downloaded=bytes_downloaded_string,
|
||||||
bytes_total=bytes_total_string,
|
bytes_total=bytes_total_string,
|
||||||
percent=percent,
|
percent=percent_string,
|
||||||
)
|
)
|
||||||
print(message, end=end, flush=True)
|
print(message, end=end, flush=True)
|
||||||
|
|
||||||
|
|
7
Javascript/tab_renamer.js
Normal file
7
Javascript/tab_renamer.js
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
javascript:
|
||||||
|
function rename()
|
||||||
|
{
|
||||||
|
document.title = prompt("New page title:");
|
||||||
|
}
|
||||||
|
|
||||||
|
rename();
|
|
@ -1,6 +1,19 @@
|
||||||
Open Dir DL
|
Open Dir DL
|
||||||
===========
|
===========
|
||||||
|
|
||||||
|
The open directory downloader
|
||||||
|
|
||||||
|
Requires `pip install beautifulsoup4`
|
||||||
|
|
||||||
|
See inside opendirdl.py for usage instructions.
|
||||||
|
|
||||||
|
- 2016 07 29
|
||||||
|
- Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
|
||||||
|
- Renamed `Tree.listnodes` to `Tree.list_children` and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
|
||||||
|
- Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
|
||||||
|
- Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
|
||||||
|
- Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
|
||||||
|
|
||||||
- 2016 07 25
|
- 2016 07 25
|
||||||
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
|
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
|
||||||
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
|
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
|
||||||
|
@ -19,9 +32,5 @@ Open Dir DL
|
||||||
- Added new argparse command "tree"
|
- Added new argparse command "tree"
|
||||||
|
|
||||||
- 2016 02 08
|
- 2016 02 08
|
||||||
- Fixed bug where server:port urls did not create db files.
|
- Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
|
||||||
- Moved db commits to only happen at the end of a digest.
|
- Moved db commits to only happen at the end of a digest.
|
||||||
|
|
||||||
Requires `pip install beautifulsoup4`
|
|
||||||
|
|
||||||
See inside opendirdl.py for usage instructions.
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
# voussoir
|
||||||
|
|
||||||
DOCSTRING='''
|
DOCSTRING='''
|
||||||
OpenDirDL
|
OpenDirDL
|
||||||
downloads open directories
|
downloads open directories
|
||||||
|
@ -45,8 +47,9 @@ download:
|
||||||
in the output directory.
|
in the output directory.
|
||||||
|
|
||||||
-bps 100 | --bytespersecond 100:
|
-bps 100 | --bytespersecond 100:
|
||||||
Ratelimit yourself to downloading at 100 BYTES per second.
|
-bps 100k | -bps "100 kb" | -bps 100kib | -bps 1.2m
|
||||||
The webmaster will appreciate this.
|
Ratelimit your download speed. Supports units like "k", "m" according
|
||||||
|
to `bytestring.parsebytes`.
|
||||||
|
|
||||||
keep_pattern:
|
keep_pattern:
|
||||||
Enable URLs which match a regex pattern. Matches are based on the percent-
|
Enable URLs which match a regex pattern. Matches are based on the percent-
|
||||||
|
@ -61,8 +64,8 @@ remove_pattern:
|
||||||
> opendirdl remove_pattern website.com.db ".*"
|
> opendirdl remove_pattern website.com.db ".*"
|
||||||
|
|
||||||
list_basenames:
|
list_basenames:
|
||||||
List Enabled URLs in order of their base filename. This makes it easier to
|
List Enabled URLs alphabetized by their base filename. This makes it easier
|
||||||
find titles of interest in a directory that is very scattered or poorly
|
to find titles of interest in a directory that is very scattered or poorly
|
||||||
organized.
|
organized.
|
||||||
|
|
||||||
> opendirdl list_basenames website.com.db <flags>
|
> opendirdl list_basenames website.com.db <flags>
|
||||||
|
@ -112,8 +115,9 @@ import sys
|
||||||
|
|
||||||
# Please consult my github repo for these files
|
# Please consult my github repo for these files
|
||||||
# https://github.com/voussoir/else
|
# https://github.com/voussoir/else
|
||||||
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
|
sys.path.append('C:\\git\\else\\Downloady'); import downloady
|
||||||
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
|
sys.path.append('C:\\git\\else\\Bytestring'); import bytestring
|
||||||
|
sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
## ~import bs4
|
## ~import bs4
|
||||||
|
@ -278,7 +282,7 @@ class Walker:
|
||||||
databasename = databasename.replace(':', '#')
|
databasename = databasename.replace(':', '#')
|
||||||
self.databasename = databasename
|
self.databasename = databasename
|
||||||
|
|
||||||
safeprint('Opening %s' % self.databasename)
|
write('Opening %s' % self.databasename)
|
||||||
self.sql = sqlite3.connect(self.databasename)
|
self.sql = sqlite3.connect(self.databasename)
|
||||||
self.cur = self.sql.cursor()
|
self.cur = self.sql.cursor()
|
||||||
db_init(self.sql, self.cur)
|
db_init(self.sql, self.cur)
|
||||||
|
@ -346,20 +350,20 @@ class Walker:
|
||||||
|
|
||||||
if not url.startswith(self.walkurl):
|
if not url.startswith(self.walkurl):
|
||||||
# Don't follow external links or parent directory.
|
# Don't follow external links or parent directory.
|
||||||
safeprint('Skipping "%s" due to external url.' % url)
|
write('Skipping "%s" due to external url.' % url)
|
||||||
return
|
return
|
||||||
|
|
||||||
urll = url.lower()
|
urll = url.lower()
|
||||||
if self.fullscan is False:
|
if self.fullscan is False:
|
||||||
skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
|
skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
|
||||||
if skippable:
|
if skippable:
|
||||||
safeprint('Skipping "%s" due to extension.' % url)
|
write('Skipping "%s" due to extension.' % url)
|
||||||
self.smart_insert(url=url, commit=False)
|
self.smart_insert(url=url, commit=False)
|
||||||
return
|
return
|
||||||
self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
|
self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
|
||||||
skippable = self.cur.fetchone() is not None
|
skippable = self.cur.fetchone() is not None
|
||||||
if skippable:
|
if skippable:
|
||||||
safeprint('Skipping "%s" since we already have it.' % url)
|
write('Skipping "%s" since we already have it.' % url)
|
||||||
return
|
return
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
@ -468,13 +472,13 @@ class TreeNode:
|
||||||
del self.parent.children[self.identifier]
|
del self.parent.children[self.identifier]
|
||||||
self.parent = None
|
self.parent = None
|
||||||
|
|
||||||
def listnodes(self, customsort=None):
|
def list_children(self, customsort=None):
|
||||||
items = list(self.children.items())
|
children = list(self.children.values())
|
||||||
if customsort is None:
|
if customsort is None:
|
||||||
items.sort(key=lambda x: x[0].lower())
|
children.sort(key=lambda node: node.identifier.lower())
|
||||||
else:
|
else:
|
||||||
items.sort(key=customsort)
|
children.sort(key=customsort)
|
||||||
return [item[1] for item in items]
|
return children
|
||||||
|
|
||||||
def merge_other(self, othertree, otherroot=None):
|
def merge_other(self, othertree, otherroot=None):
|
||||||
newroot = None
|
newroot = None
|
||||||
|
@ -490,16 +494,9 @@ class TreeNode:
|
||||||
self.check_child_availability(newroot)
|
self.check_child_availability(newroot)
|
||||||
self.children[newroot] = othertree
|
self.children[newroot] = othertree
|
||||||
|
|
||||||
def sorted_children(self, customsort=None):
|
|
||||||
keys = sorted(self.children.keys(), key=customsort)
|
|
||||||
for key in keys:
|
|
||||||
yield (key, self.children[key])
|
|
||||||
|
|
||||||
def walk(self, customsort=None):
|
def walk(self, customsort=None):
|
||||||
yield self
|
yield self
|
||||||
for child in self.listnodes(customsort=customsort):
|
for child in self.listnodes(customsort=customsort):
|
||||||
#print(child)
|
|
||||||
#print(child.listnodes())
|
|
||||||
yield from child.walk(customsort=customsort)
|
yield from child.walk(customsort=customsort)
|
||||||
## ##
|
## ##
|
||||||
## OTHER CLASSES ###################################################################################
|
## OTHER CLASSES ###################################################################################
|
||||||
|
@ -507,6 +504,88 @@ class TreeNode:
|
||||||
|
|
||||||
## GENERAL FUNCTIONS ###############################################################################
|
## GENERAL FUNCTIONS ###############################################################################
|
||||||
## ##
|
## ##
|
||||||
|
def build_file_tree(databasename):
|
||||||
|
sql = sqlite3.connect(databasename)
|
||||||
|
cur = sql.cursor()
|
||||||
|
cur.execute('SELECT * FROM urls WHERE do_download == 1')
|
||||||
|
items = cur.fetchall()
|
||||||
|
sql.close()
|
||||||
|
if len(items) == 0:
|
||||||
|
return
|
||||||
|
|
||||||
|
items.sort(key=lambda x: x[SQL_URL])
|
||||||
|
|
||||||
|
path_parts = url_to_filepath(items[0][SQL_URL])
|
||||||
|
root_identifier = path_parts['root']
|
||||||
|
#print('Root', root_identifier)
|
||||||
|
root_data = {'name': root_identifier, 'item_type': 'directory'}
|
||||||
|
root_identifier = root_identifier.replace(':', '')
|
||||||
|
tree = TreeNode(
|
||||||
|
identifier=root_identifier,
|
||||||
|
data=root_data
|
||||||
|
)
|
||||||
|
node_map = {}
|
||||||
|
|
||||||
|
for item in items:
|
||||||
|
path = url_to_filepath(item[SQL_URL])
|
||||||
|
scheme = path['scheme']
|
||||||
|
|
||||||
|
# I join and re-split because 'folder' may contain slashes of its own
|
||||||
|
# and I want to break all the pieces
|
||||||
|
path = '\\'.join([path['root'], path['folder'], path['filename']])
|
||||||
|
parts = path.split('\\')
|
||||||
|
#print(path)
|
||||||
|
for (index, part) in enumerate(parts):
|
||||||
|
this_path = '/'.join(parts[:index + 1])
|
||||||
|
parent_path = '/'.join(parts[:index])
|
||||||
|
|
||||||
|
#input()
|
||||||
|
data = {
|
||||||
|
'name': part,
|
||||||
|
'url': scheme + '://' + this_path,
|
||||||
|
}
|
||||||
|
this_identifier = this_path.replace(':', '')
|
||||||
|
parent_identifier = parent_path.replace(':', '')
|
||||||
|
|
||||||
|
if (index + 1) == len(parts):
|
||||||
|
data['item_type'] = 'file'
|
||||||
|
if item[SQL_CONTENT_LENGTH]:
|
||||||
|
data['size'] = item[SQL_CONTENT_LENGTH]
|
||||||
|
else:
|
||||||
|
data['size'] = None
|
||||||
|
else:
|
||||||
|
data['item_type'] = 'directory'
|
||||||
|
|
||||||
|
|
||||||
|
# Ensure this comment is in a node of its own
|
||||||
|
this_node = node_map.get(this_identifier, None)
|
||||||
|
if this_node:
|
||||||
|
# This ID was detected as a parent of a previous iteration
|
||||||
|
# Now we're actually filling it in.
|
||||||
|
this_node.data = data
|
||||||
|
else:
|
||||||
|
this_node = TreeNode(this_identifier, data)
|
||||||
|
node_map[this_identifier] = this_node
|
||||||
|
|
||||||
|
# Attach this node to the parent.
|
||||||
|
if parent_identifier == root_identifier:
|
||||||
|
try:
|
||||||
|
tree.add_child(this_node)
|
||||||
|
except TreeExistingChild:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
parent_node = node_map.get(parent_identifier, None)
|
||||||
|
if not parent_node:
|
||||||
|
parent_node = TreeNode(parent_identifier, data=None)
|
||||||
|
node_map[parent_identifier] = parent_node
|
||||||
|
try:
|
||||||
|
parent_node.add_child(this_node)
|
||||||
|
except TreeExistingChild:
|
||||||
|
pass
|
||||||
|
this_node.parent = parent_node
|
||||||
|
#print(this_node.data)
|
||||||
|
return tree
|
||||||
|
|
||||||
def db_init(sql, cur):
|
def db_init(sql, cur):
|
||||||
lines = DB_INIT.split(';')
|
lines = DB_INIT.split(';')
|
||||||
for line in lines:
|
for line in lines:
|
||||||
|
@ -522,38 +601,13 @@ def do_head(url, raise_for_status=True):
|
||||||
|
|
||||||
def do_request(message, method, url, raise_for_status=True):
|
def do_request(message, method, url, raise_for_status=True):
|
||||||
message = '{message:>4s}: {url} : '.format(message=message, url=url)
|
message = '{message:>4s}: {url} : '.format(message=message, url=url)
|
||||||
safeprint(message, end='', flush=True)
|
write(message, end='', flush=True)
|
||||||
response = method(url)
|
response = method(url)
|
||||||
safeprint(response.status_code)
|
write(response.status_code)
|
||||||
if raise_for_status:
|
if raise_for_status:
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
return response
|
return response
|
||||||
|
|
||||||
def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None):
|
|
||||||
if bytespersecond is not None:
|
|
||||||
limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)
|
|
||||||
else:
|
|
||||||
limiter = None
|
|
||||||
|
|
||||||
currentblock = 0
|
|
||||||
downloading = requests.get(url, stream=True, headers=headers)
|
|
||||||
totalsize = int(downloading.headers.get('content-length', 1))
|
|
||||||
for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
|
|
||||||
if not chunk:
|
|
||||||
break
|
|
||||||
currentblock += 1
|
|
||||||
filehandle.write(chunk)
|
|
||||||
if limiter is not None:
|
|
||||||
limiter.limit(len(chunk))
|
|
||||||
if hookfunction is not None:
|
|
||||||
hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
|
|
||||||
|
|
||||||
filehandle.close()
|
|
||||||
size = os.path.getsize(filehandle.name)
|
|
||||||
if size < totalsize:
|
|
||||||
raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
|
|
||||||
return True
|
|
||||||
|
|
||||||
def fetch_generator(cur):
|
def fetch_generator(cur):
|
||||||
while True:
|
while True:
|
||||||
fetch = cur.fetchone()
|
fetch = cur.fetchone()
|
||||||
|
@ -581,17 +635,6 @@ def hashit(text, length=None):
|
||||||
h = h[:length]
|
h = h[:length]
|
||||||
return h
|
return h
|
||||||
|
|
||||||
def hook1(currentblock, chunksize, totalsize):
|
|
||||||
currentbytes = currentblock * chunksize
|
|
||||||
if currentbytes > totalsize:
|
|
||||||
currentbytes = totalsize
|
|
||||||
currentbytes = '{:,}'.format(currentbytes)
|
|
||||||
totalsize = '{:,}'.format(totalsize)
|
|
||||||
currentbytes = currentbytes.rjust(len(totalsize), ' ')
|
|
||||||
print('%s / %s bytes' % (currentbytes, totalsize), end='\r')
|
|
||||||
if currentbytes == totalsize:
|
|
||||||
print()
|
|
||||||
|
|
||||||
def listget(l, index, default=None):
|
def listget(l, index, default=None):
|
||||||
try:
|
try:
|
||||||
return l[index]
|
return l[index]
|
||||||
|
@ -604,10 +647,79 @@ def longest_length(li):
|
||||||
longest = max(longest, len(item))
|
longest = max(longest, len(item))
|
||||||
return longest
|
return longest
|
||||||
|
|
||||||
|
def recursive_get_size(node):
|
||||||
|
'''
|
||||||
|
Calculate the size of the Directory nodes by summing the sizes of all children.
|
||||||
|
Modifies the nodes in-place.
|
||||||
|
'''
|
||||||
|
return_value = {
|
||||||
|
'size': 0,
|
||||||
|
'unmeasured': 0,
|
||||||
|
}
|
||||||
|
if node.data['item_type'] == 'file':
|
||||||
|
if node.data['size'] is None:
|
||||||
|
return_value['unmeasured'] = 1
|
||||||
|
return_value['size'] = node.data['size']
|
||||||
|
|
||||||
|
else:
|
||||||
|
for child in node.list_children():
|
||||||
|
child_details = recursive_get_size(child)
|
||||||
|
return_value['size'] += child_details['size'] or 0
|
||||||
|
return_value['unmeasured'] += child_details['unmeasured']
|
||||||
|
node.data['size'] = return_value['size']
|
||||||
|
|
||||||
|
return return_value
|
||||||
|
|
||||||
|
def recursive_print_node(node, depth=0, use_html=False, output_file=None):
|
||||||
|
'''
|
||||||
|
Given a tree node (presumably the root), print it and all of its children.
|
||||||
|
'''
|
||||||
|
size = node.data['size']
|
||||||
|
if size is None:
|
||||||
|
size = UNKNOWN_SIZE_STRING
|
||||||
|
else:
|
||||||
|
size = bytestring.bytestring(size)
|
||||||
|
|
||||||
|
if use_html:
|
||||||
|
css_class = 'directory_even' if depth % 2 == 0 else 'directory_odd'
|
||||||
|
if node.data['item_type'] == 'directory':
|
||||||
|
div_id = hashit(node.identifier, 16)
|
||||||
|
line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
|
||||||
|
line += '<div class="{css}" id="{div_id}" style="display:none">'
|
||||||
|
line = line.format(div_id=div_id, name=node.data['name'], size=size, css=css_class)
|
||||||
|
else:
|
||||||
|
line = '<a href="{url}">{name} ({size})</a><br>'
|
||||||
|
line = line.format(url=node.data['url'], name=node.data['name'], size=size)
|
||||||
|
else:
|
||||||
|
line = '{space}{bar}{name} : ({size})'
|
||||||
|
line = line.format(
|
||||||
|
space='| ' * (depth-1),
|
||||||
|
bar='|---' if depth > 0 else '',
|
||||||
|
name=node.data['name'],
|
||||||
|
size=size
|
||||||
|
)
|
||||||
|
write(line, output_file)
|
||||||
|
|
||||||
|
# Sort by type (directories first) then subsort by lowercase path
|
||||||
|
customsort = lambda node: (
|
||||||
|
node.data['item_type'] == 'file',
|
||||||
|
node.data['url'].lower(),
|
||||||
|
)
|
||||||
|
|
||||||
|
for child in node.list_children(customsort=customsort):
|
||||||
|
recursive_print_node(child, depth=depth+1, use_html=use_html, output_file=output_file)
|
||||||
|
|
||||||
|
if node.data['item_type'] == 'directory':
|
||||||
|
if use_html:
|
||||||
|
write('</div>', output_file)
|
||||||
|
else:
|
||||||
|
# This helps put some space between sibling directories
|
||||||
|
write('| ' * (depth), output_file)
|
||||||
|
|
||||||
def safeprint(text, **kwargs):
|
def safeprint(text, **kwargs):
|
||||||
text = str(text)
|
text = str(text)
|
||||||
text = text.encode('ascii', 'replace').decode()
|
text = text.encode('ascii', 'replace').decode()
|
||||||
text = text.replace('?', '_')
|
#text = text.replace('?', '_')
|
||||||
print(text, **kwargs)
|
print(text, **kwargs)
|
||||||
|
|
||||||
def smart_insert(sql, cur, url=None, head=None, commit=True):
|
def smart_insert(sql, cur, url=None, head=None, commit=True):
|
||||||
|
@ -686,11 +798,11 @@ def url_to_filepath(text):
|
||||||
}
|
}
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def write(line, file_handle=None):
|
def write(line, file_handle=None, **kwargs):
|
||||||
if file_handle is None:
|
if file_handle is None:
|
||||||
safeprint(line)
|
safeprint(line, **kwargs)
|
||||||
else:
|
else:
|
||||||
file_handle.write(line + '\n')
|
file_handle.write(line + '\n', **kwargs)
|
||||||
## ##
|
## ##
|
||||||
## GENERAL FUNCTIONS ###############################################################################
|
## GENERAL FUNCTIONS ###############################################################################
|
||||||
|
|
||||||
|
@ -700,7 +812,7 @@ def write(line, file_handle=None):
|
||||||
def digest(databasename, walkurl, fullscan=False):
|
def digest(databasename, walkurl, fullscan=False):
|
||||||
if walkurl in ('!clipboard', '!c'):
|
if walkurl in ('!clipboard', '!c'):
|
||||||
walkurl = get_clipboard()
|
walkurl = get_clipboard()
|
||||||
safeprint('From clipboard: %s' % walkurl)
|
write('From clipboard: %s' % walkurl)
|
||||||
walker = Walker(
|
walker = Walker(
|
||||||
databasename=databasename,
|
databasename=databasename,
|
||||||
fullscan=fullscan,
|
fullscan=fullscan,
|
||||||
|
@ -773,13 +885,18 @@ def download(
|
||||||
if overwrite:
|
if overwrite:
|
||||||
os.remove(fullname)
|
os.remove(fullname)
|
||||||
else:
|
else:
|
||||||
safeprint('Skipping "%s". Use `--overwrite`' % fullname)
|
write('Skipping "%s". Use `--overwrite`' % fullname)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename))
|
overwrite = overwrite or None
|
||||||
filehandle = open(temporary_fullname, 'wb')
|
write('Downloading "%s" as "%s"' % (fullname, temporary_basename))
|
||||||
with filehandle:
|
downloady.download_file(
|
||||||
download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond)
|
url,
|
||||||
|
localname=temporary_fullname,
|
||||||
|
bytespersecond=bytespersecond,
|
||||||
|
callback_progress=downloady.progress2,
|
||||||
|
overwrite=overwrite
|
||||||
|
)
|
||||||
os.rename(temporary_fullname, fullname)
|
os.rename(temporary_fullname, fullname)
|
||||||
|
|
||||||
def download_argparse(args):
|
def download_argparse(args):
|
||||||
|
@ -821,10 +938,10 @@ def filter_pattern(databasename, regex, action='keep', *trash):
|
||||||
|
|
||||||
should_keep = (keep and contains)
|
should_keep = (keep and contains)
|
||||||
if keep and contains and not current_do_dl:
|
if keep and contains and not current_do_dl:
|
||||||
safeprint('Enabling "%s"' % url)
|
write('Enabling "%s"' % url)
|
||||||
cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
|
cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
|
||||||
if remove and contains and current_do_dl:
|
if remove and contains and current_do_dl:
|
||||||
safeprint('Disabling "%s"' % url)
|
write('Disabling "%s"' % url)
|
||||||
cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
|
cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
|
||||||
sql.commit()
|
sql.commit()
|
||||||
|
|
||||||
|
@ -914,7 +1031,7 @@ def measure(databasename, fullscan=False, new_only=False):
|
||||||
fetch = smart_insert(sql, cur, head=head, commit=True)
|
fetch = smart_insert(sql, cur, head=head, commit=True)
|
||||||
size = fetch[SQL_CONTENT_LENGTH]
|
size = fetch[SQL_CONTENT_LENGTH]
|
||||||
if size is None:
|
if size is None:
|
||||||
safeprint('"%s" is not revealing Content-Length' % url)
|
write('"%s" is not revealing Content-Length' % url)
|
||||||
size = 0
|
size = 0
|
||||||
|
|
||||||
|
|
||||||
|
@ -961,159 +1078,7 @@ def tree(databasename, output_filename=None):
|
||||||
collapsible boxes and clickable filenames. Otherwise the file will just
|
collapsible boxes and clickable filenames. Otherwise the file will just
|
||||||
be a plain text drawing.
|
be a plain text drawing.
|
||||||
'''
|
'''
|
||||||
|
tree = build_file_tree(databasename)
|
||||||
sql = sqlite3.connect(databasename)
|
|
||||||
cur = sql.cursor()
|
|
||||||
cur.execute('SELECT * FROM urls WHERE do_download == 1')
|
|
||||||
items = cur.fetchall()
|
|
||||||
if len(items) == 0:
|
|
||||||
return
|
|
||||||
|
|
||||||
items.sort(key=lambda x: x[SQL_URL])
|
|
||||||
|
|
||||||
path_parts = url_to_filepath(items[0][SQL_URL])
|
|
||||||
root_identifier = path_parts['root']
|
|
||||||
#print('Root', root_identifier)
|
|
||||||
root_data = {'name': root_identifier, 'item_type': 'directory'}
|
|
||||||
root_identifier = root_identifier.replace(':', '')
|
|
||||||
tree = TreeNode(
|
|
||||||
identifier=root_identifier,
|
|
||||||
data=root_data
|
|
||||||
)
|
|
||||||
node_map = {}
|
|
||||||
|
|
||||||
unmeasured_file_count = 0
|
|
||||||
|
|
||||||
for item in items:
|
|
||||||
path = url_to_filepath(item[SQL_URL])
|
|
||||||
scheme = path['scheme']
|
|
||||||
|
|
||||||
# I join and re-split because 'folder' may contain slashes of its own
|
|
||||||
# and I want to break all the pieces
|
|
||||||
path = '\\'.join([path['root'], path['folder'], path['filename']])
|
|
||||||
parts = path.split('\\')
|
|
||||||
#print(path)
|
|
||||||
for (index, part) in enumerate(parts):
|
|
||||||
this_path = '/'.join(parts[:index + 1])
|
|
||||||
parent_path = '/'.join(parts[:index])
|
|
||||||
#safeprint('this:' + this_path)
|
|
||||||
#safeprint('parent:' + parent_path)
|
|
||||||
|
|
||||||
#input()
|
|
||||||
data = {
|
|
||||||
'name': part,
|
|
||||||
'url': scheme + '://' + this_path,
|
|
||||||
}
|
|
||||||
this_identifier = this_path.replace(':', '')
|
|
||||||
parent_identifier = parent_path.replace(':', '')
|
|
||||||
|
|
||||||
if (index + 1) == len(parts):
|
|
||||||
data['item_type'] = 'file'
|
|
||||||
if item[SQL_CONTENT_LENGTH]:
|
|
||||||
data['size'] = item[SQL_CONTENT_LENGTH]
|
|
||||||
else:
|
|
||||||
unmeasured_file_count += 1
|
|
||||||
data['size'] = None
|
|
||||||
else:
|
|
||||||
data['item_type'] = 'directory'
|
|
||||||
|
|
||||||
|
|
||||||
# Ensure this comment is in a node of its own
|
|
||||||
this_node = node_map.get(this_identifier, None)
|
|
||||||
if this_node:
|
|
||||||
# This ID was detected as a parent of a previous iteration
|
|
||||||
# Now we're actually filling it in.
|
|
||||||
this_node.data = data
|
|
||||||
else:
|
|
||||||
this_node = TreeNode(this_identifier, data)
|
|
||||||
node_map[this_identifier] = this_node
|
|
||||||
|
|
||||||
# Attach this node to the parent.
|
|
||||||
if parent_identifier == root_identifier:
|
|
||||||
try:
|
|
||||||
tree.add_child(this_node)
|
|
||||||
except TreeExistingChild:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
parent_node = node_map.get(parent_identifier, None)
|
|
||||||
if not parent_node:
|
|
||||||
parent_node = TreeNode(parent_identifier, data=None)
|
|
||||||
node_map[parent_identifier] = parent_node
|
|
||||||
try:
|
|
||||||
parent_node.add_child(this_node)
|
|
||||||
except TreeExistingChild:
|
|
||||||
pass
|
|
||||||
this_node.parent = parent_node
|
|
||||||
#print(this_node.data)
|
|
||||||
|
|
||||||
def recursive_get_size(node):
|
|
||||||
size = node.data.get('size', 0)
|
|
||||||
if size:
|
|
||||||
# Files have this attribute, dirs don't
|
|
||||||
return size
|
|
||||||
|
|
||||||
for child in node.children.values():
|
|
||||||
child_size = recursive_get_size(child)
|
|
||||||
child_size = child_size or 0
|
|
||||||
size += child_size
|
|
||||||
node.data['size'] = size
|
|
||||||
return size
|
|
||||||
|
|
||||||
def recursive_print_node(node, depth=0, output_file=None):
|
|
||||||
size = node.data['size']
|
|
||||||
if size is None:
|
|
||||||
size = UNKNOWN_SIZE_STRING
|
|
||||||
else:
|
|
||||||
size = bytestring.bytestring(size)
|
|
||||||
|
|
||||||
if use_html:
|
|
||||||
if depth % 2 == 0:
|
|
||||||
css_class = 'directory_even'
|
|
||||||
else:
|
|
||||||
css_class = 'directory_odd'
|
|
||||||
|
|
||||||
if node.data['item_type'] == 'directory':
|
|
||||||
div_id = hashit(node.identifier, 16)
|
|
||||||
line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
|
|
||||||
line += '<div class="%s" id="{div_id}" style="display:none">' % css_class
|
|
||||||
line = line.format(
|
|
||||||
div_id=div_id,
|
|
||||||
name=node.data['name'],
|
|
||||||
size=size,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
line = '<a href="{url}">{name} ({size})</a><br>'
|
|
||||||
line = line.format(
|
|
||||||
url=node.data['url'],
|
|
||||||
name=node.data['name'],
|
|
||||||
size=size,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
line = '{space}{bar}{name} : ({size})'
|
|
||||||
line = line.format(
|
|
||||||
space='| '*(depth-1),
|
|
||||||
bar='|---' if depth > 0 else '',
|
|
||||||
name=node.data['name'],
|
|
||||||
size=size
|
|
||||||
)
|
|
||||||
write(line, output_file)
|
|
||||||
|
|
||||||
# Sort by type (directories first) then subsort by lowercase path
|
|
||||||
customsort = lambda x: (
|
|
||||||
node.children[x].data['item_type'] == 'file',
|
|
||||||
node.children[x].data['url'].lower(),
|
|
||||||
)
|
|
||||||
|
|
||||||
for (key, child) in node.sorted_children(customsort=customsort):
|
|
||||||
recursive_print_node(child, depth=depth+1, output_file=output_file)
|
|
||||||
|
|
||||||
if node.data['item_type'] == 'directory':
|
|
||||||
if use_html:
|
|
||||||
write('</div>', output_file)
|
|
||||||
else:
|
|
||||||
# This helps put some space between sibling directories
|
|
||||||
write('| ' * (depth), output_file)
|
|
||||||
|
|
||||||
|
|
||||||
if output_filename is not None:
|
if output_filename is not None:
|
||||||
output_file = open(output_filename, 'w', encoding='utf-8')
|
output_file = open(output_filename, 'w', encoding='utf-8')
|
||||||
|
@ -1122,14 +1087,13 @@ def tree(databasename, output_filename=None):
|
||||||
output_file = None
|
output_file = None
|
||||||
use_html = False
|
use_html = False
|
||||||
|
|
||||||
|
|
||||||
if use_html:
|
if use_html:
|
||||||
write(HTML_TREE_HEADER, file_handle=output_file)
|
write(HTML_TREE_HEADER, output_file)
|
||||||
|
|
||||||
recursive_get_size(tree)
|
size_details = recursive_get_size(tree)
|
||||||
recursive_print_node(tree, output_file=output_file)
|
recursive_print_node(tree, use_html=use_html, output_file=output_file)
|
||||||
if unmeasured_file_count > 0:
|
if size_details['unmeasured'] > 0:
|
||||||
write(UNMEASURED_WARNING % unmeasured_file_count, file_handle=output_file)
|
write(UNMEASURED_WARNING % size_details['unmeasured'], output_file)
|
||||||
|
|
||||||
if output_file is not None:
|
if output_file is not None:
|
||||||
output_file.close()
|
output_file.close()
|
||||||
|
|
Loading…
Reference in a new issue