commit 5e88950156 (parent 3d0a3dc746)
Author: unknown
Date: 2016-07-05 00:24:08 -07:00

19 changed files with 1355 additions and 278 deletions

(new binary image, 98 KiB; preview not shown)

BaseNumber/basenumber.py (new file, 82 lines)

@ -0,0 +1,82 @@
import string

ALPHABET = '0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

def from_base(number, base, alphabet=None):
    if not isinstance(base, int):
        raise TypeError('base must be an int.')
    if base < 2:
        raise ValueError('base must be >= 2.')
    if base == 10:
        return number
    if alphabet is None:
        alphabet = ALPHABET
    if base > len(alphabet):
        raise ValueError('Not enough symbols in alphabet for base %d' % base)
    number = str(number)
    alphabet = alphabet[:base]
    if number.count('.') > 1:
        raise ValueError('Too many decimal points')

    mixed_case = any(c in string.ascii_uppercase for c in alphabet) and \
                 any(c in string.ascii_lowercase for c in alphabet)
    if not mixed_case:
        alphabet = alphabet.upper()
        number = number.upper()

    char_set = set(number.replace('.', '', 1))
    alpha_set = set(alphabet)
    differences = char_set.difference(alpha_set)
    if len(differences) > 0:
        raise ValueError('Unknown characters for base %d: %s' % (base, differences))
    alpha_dict = {character: index for (index, character) in enumerate(alphabet)}

    try:
        decimal_pos = number.index('.')
    except ValueError:
        decimal_pos = len(number)

    result = 0
    for (index, character) in enumerate(number):
        if index == decimal_pos:
            continue
        power = (decimal_pos - index)
        if index < decimal_pos:
            # Digits left of the point get non-negative powers of the base.
            power -= 1
        value = alpha_dict[character] * (base ** power)
        result += value
    return result

def to_base(number, base, decimal_places=10, alphabet=None):
    if not isinstance(base, int):
        raise TypeError('base must be an int.')
    if base < 2:
        raise ValueError('base must be >= 2.')
    if base == 10:
        return str(number)
    if alphabet is None:
        alphabet = ALPHABET
    if base > len(alphabet):
        raise ValueError('Not enough symbols in alphabet for base %d' % base)

    result = ''
    whole_portion = int(number)
    float_portion = number - whole_portion
    while whole_portion > 0:
        (whole_portion, remainder) = divmod(whole_portion, base)
        result = alphabet[remainder] + result
    if result == '':
        # int(number) was 0, so the loop never ran; emit the zero digit.
        result = alphabet[0]
    if float_portion != 0:
        result += '.'
        for x in range(decimal_places):
            float_portion *= base
            whole = int(float_portion)
            float_portion -= whole
            result += alphabet[whole]
    return result
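
A quick round-trip check of the two functions, assuming the file is importable as `basenumber` (hypothetical usage): 'FF' in base 16 is 15*16 + 15 = 255, and digits right of the point carry negative powers of the base.

import basenumber  # hypothetical module name for the file above

assert basenumber.from_base('FF', 16) == 255      # 15*16 + 15
assert basenumber.to_base(255, 16) == 'FF'
assert basenumber.from_base('101.1', 2) == 5.5    # 4 + 1 + 1/2
assert basenumber.to_base(0, 16) == '0'           # exercises the zero-digit branch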

(modified file)

@ -10,6 +10,7 @@ except:
pass
def close_enough(a, b):
#print(a, b)
for (a_channel, b_channel) in zip(a, b):
if abs(a_channel - b_channel) > close_enough_threshold:
return False
@ -17,15 +18,26 @@ def close_enough(a, b):
def deletterbox(filename):
image = Image.open(filename)
(base, ext) = os.path.splitext(filename)
for x in range(4):
image = trim_top(image)
image = image.rotate(90, expand=True)
(base, ext) = os.path.splitext(filename)
filename = base + 'X' + ext
print('size', image.size)
#image.save('%s_%d%s' % (base, x, ext))
rotated = image.rotate(90, expand=True)
# There is currently a bug in PIL which causes rotated images
# to have a 1 px black border on the top and left.
expected_size = (image.size[1], image.size[0])
if rotated.size != expected_size:
    rotated = rotated.crop([1, 1, rotated.size[0], rotated.size[1]])
image = rotated
print()
filename = base + '_crop' + ext
image.save(filename, quality=100)
def trim_top(image):
letterbox_color = image.getpixel((0, 0))
print('letterbox color', letterbox_color)
for y in range(image.size[1]):
solid = True
for x in range(image.size[0]):
@ -33,12 +45,12 @@ def trim_top(image):
#print(pixel)
if not close_enough(letterbox_color, pixel):
solid = False
#print(y,pixel)
print('broke at', y,pixel)
break
if not solid:
break
bounds = (0, y, image.size[0], image.size[1])
print(bounds)
print('bounds', bounds)
image = image.crop(bounds)
return image
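
A hypothetical invocation of the functions above (filename invented):

deletterbox('movie_frame.jpg')  # trims all four sides, writes movie_frame_crop.jpg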

(binary image changed: 343 KiB before, 429 KiB after; preview not shown)

(modified file)

@ -10,6 +10,11 @@ KERNEL_EDGE_DETECTION_H = [
[-2, 0, 2],
[-2, 0, 2],
]
KERNEL_EDGE_DETECTION_V = [
    [-2, -2, -2],
    [0, 0, 0],
    [2, 2, 2],
]
def index_to_xy(index, width):
(y, x) = divmod(index, width)
return (x, y)
@ -17,6 +22,15 @@ def index_to_xy(index, width):
def xy_to_index(x, y, width):
return (y * width) + x
def add(image_a, image_b):
    pixels_a = image_a.getdata()
    pixels_b = image_b.getdata()
    assert len(pixels_a) == len(pixels_b)
    # Clamp to 255 so strong edges don't overflow 8-bit mode 'L'.
    pixels_c = [min(a + b, 255) for (a, b) in zip(pixels_a, pixels_b)]
    new_image = PIL.Image.new('L', image_a.size)
    new_image.putdata(pixels_c)
    return new_image
def apply_filter(old_image, kernel):
kernel_height = len(kernel)
kernel_width = len(kernel[0])
@ -49,6 +63,8 @@ def apply_filter(old_image, kernel):
if subject_y < 0 or subject_y >= image_height:
continue
for (kernel_x, kernel_entry) in enumerate(kernel_row):
if kernel_entry == 0:
continue
subject_x = x - (kernel_center[0] - kernel_x)
if subject_x < 0 or subject_x >= image_width:
continue
@ -61,8 +77,8 @@ def apply_filter(old_image, kernel):
operation_avg = abs(operation_sum / operation_denominator)
#n_operation_avg = int(map_range(operation_avg, lower, upper, 0, 255))
if index % 4096 == 0:
print(x, y, operation_sum, operation_denominator, operation_avg)
#print(y, '/', image_height)
#print(x, y, operation_sum, operation_denominator, operation_avg)
print(y, '/', image_height)
new_pixels[index] = operation_avg
#print(new_pixels)
@ -91,7 +107,10 @@ def map_range(x, old_low, old_high, new_low, new_high):
return y
if __name__ == '__main__':
i = PIL.Image.open('ear.jpg')
i = PIL.Image.open('icon.jpg')
i = i.convert('L')
i = apply_filter(apply_filter(i, KERNEL_GAUSSIAN_BLUR), KERNEL_EDGE_DETECTION_H)
i.save('ear.png')
i = apply_filter(i, KERNEL_GAUSSIAN_BLUR)
a = apply_filter(i, KERNEL_EDGE_DETECTION_H)
b = apply_filter(i, KERNEL_EDGE_DETECTION_V)
i = add(a, b)
i.save('icon.png')
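
add() sums the two directional responses, which is a cheap approximation; the textbook combination is the gradient magnitude sqrt(h^2 + v^2). A sketch under the same PIL setup (the function name is mine, not part of the commit):

import math
import PIL.Image

def gradient_magnitude(image_h, image_v):
    # Per-pixel sqrt(h^2 + v^2), clamped to the 8-bit range of mode 'L'.
    pixels = [
        min(int(math.hypot(h, v)), 255)
        for (h, v) in zip(image_h.getdata(), image_v.getdata())
    ]
    result = PIL.Image.new('L', image_h.size)
    result.putdata(pixels)
    return result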

(new file)

@ -0,0 +1,263 @@
<!DOCTYPE html>
<html>
<head>
<title>/new</title>
</head>
<body>
<div id="control_panel">
<input type="text" id="subreddit_field" placeholder="learnpython">
<button id="start_button" onclick="start()">Start</button>
<a id="browser_link"></a>
<button id="clear_button" onclick="clear_workspace()">Clear workspace</button>
</div>
<div id="workspace">
</div>
</body>
</html>
<style>
html
{
background-color: #1b1c18;
}
body
{
background-color: #272822;
margin-left: 10%;
margin-right: 10%;
padding: 5px;
}
#control_panel
{
background-color: #284142;
padding: 5px;
}
#workspace
{
}
a
{
color: #ddd;
text-decoration: none;
font-family: sans-serif;
}
a:hover
{
text-decoration: underline;
}
.submission
{
padding: 10px;
padding-top: 20px;
padding-bottom: 20px;
margin: 10px;
margin-top: 20px;
margin-bottom: 20px;
box-shadow: 5px 5px 10px 0px rgba(0,0,0,0.5);
background-color: #284142;
}
</style>
<script type="text/javascript">
/*
Thanks Joe Marini for the tab focus code
http://www.html5rocks.com/en/tutorials/pagevisibility/intro/
*/
var CHECK_DELAY = 30 * 1000;
var WORKSPACE = document.getElementById("workspace");
var HTTPClient = function()
{
/* Thanks ttgagne http://stackoverflow.com/a/22076667 */
var asynchronous = true;
this.get = function(url, callback)
{
var request = new XMLHttpRequest();
request.onreadystatechange = function()
{
if (request.readyState == 4 && request.status == 200)
{
callback(request.responseText);
}
}
request.open("GET", url, asynchronous);
//request.withCredentials = true;
request.send(null);
}
}
function apply_to_page(text)
{
var j = JSON.parse(text);
var submissions = j["data"]["children"];
submissions.reverse(); // newest last
var new_items = 0;
for (var index = 0; index < submissions.length; index += 1)
{
var submission = submissions[index]["data"];
if (done_ids.has(submission["id"]))
{
continue;
}
done_ids.add(submission["id"]);
if (first_loop)
{
continue;
}
new_items += 1;
var div = document.createElement("div");
div.className = "submission";
var anchor = document.createElement("a");
anchor.innerHTML = "/r/" + submission["subreddit"] + " - " + submission["title"];
anchor.href = "https://reddit.com/r/" + submission["subreddit"] + "/comments/" + submission["id"];
anchor.target = "_blank";
var timestamp = document.createElement("span");
// created_utc is in seconds; JavaScript Dates expect milliseconds.
var submission_time = new Date(submission["created_utc"] * 1000);
var minutes = ("0" + submission_time.getMinutes()).slice(-2);
timestamp.innerHTML = "" + submission_time.getHours() + ":" + minutes;
div.appendChild(anchor);
div.appendChild(timestamp);
//WORKSPACE.insertBefore(div, WORKSPACE.firstChild);
WORKSPACE.appendChild(div);
}
console.log("+" + new_items);
if (new_items > 0 && !page_focused_cached)
{
unread_notification_count += new_items;
update_title();
}
first_loop = false;
}
function check_forever()
{
clearTimeout(check_timer);
check_once();
check_timer = setTimeout(check_forever, CHECK_DELAY);
}
function check_once()
{
console.log("checking");
if (subreddit == "")
{
console.log("no subreddit");
return;
}
var url = "https://api.reddit.com/r/" + subreddit + "/new.json";
session.get(url, apply_to_page);
}
function clear_workspace()
{
while (WORKSPACE.children.length > 0)
{
WORKSPACE.removeChild(WORKSPACE.firstChild);
}
}
function on_focus_change()
{
if (page_focused_fresh())
{
unread_notification_count = 0;
update_title();
}
}
function page_focused_fresh()
{
var property = visibility_property();
if (!property)
{
page_focused_cached = true;
return true;
}
else
{
page_focused_cached = !document[property];
}
return page_focused_cached;
}
function start()
{
console.log("start");
first_loop = true;
clear_workspace();
var field = document.getElementById("subreddit_field");
var text = field.value;
text = text.replace("/r/", "").replace("r/", "");
subreddit = text;
var link = document.getElementById("browser_link");
var url = "https://reddit.com/r/" + subreddit + "/new";
link.href = url;
link.innerHTML = url;
update_title();
check_forever();
}
function update_title()
{
var title = subreddit + "/new";
if (unread_notification_count > 0)
{
title = "(" + unread_notification_count + ") " + title;
}
document.title = title;
}
function visibility_property()
{
var prefixes = ["webkit","moz","ms","o"];
if ("hidden" in document)
{
return "hidden";
}
for (var i = 0; i < prefixes.length; i++)
{
var hidden_attribute = prefixes[i] + "Hidden";
if ((hidden_attribute) in document)
return hidden_attribute;
}
return null;
}
var done_ids = new Set();
var first_loop = true;
var unread_notification_count = 0;
var subreddit = "";
var check_timer = null;
var page_focused_cached;
page_focused_fresh();
var my_visibility_property = visibility_property();
if (my_visibility_property)
{
var my_event_name = my_visibility_property.replace(/[Hh]idden/, '') + 'visibilitychange';
document.addEventListener(my_event_name, on_focus_change);
}
var session = new HTTPClient();
</script>
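
For reference, the listing shape that apply_to_page consumes, sketched as a Python literal (field values invented; only the fields the script reads are shown). Note that created_utc arrives in seconds, which is why the timestamp code multiplies by 1000.

listing = {
    "data": {
        "children": [
            {
                "data": {
                    "id": "abc123",
                    "subreddit": "learnpython",
                    "title": "How do I ...",
                    "created_utc": 1467705600,
                }
            },
        ]
    }
}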

(modified file)

@ -1,10 +1,13 @@
Open Dir DL
===========
- 2016 07 04
    - Added new argparse command "tree"

- 2016 02 08
    - Fixed bug where server:port urls did not create db files.
    - Moved db commits to only happen at the end of a digest.

Requires `pip install beautifulsoup4`

See inside opendirdl.py for usage instructions.

(modified file)

@ -4,14 +4,14 @@ downloads open directories
The basics:
1. Create a database of the directory's files with
    > opendirdl digest http://website.com/directory/
2. Enable and disable the files you are interested in with
    > opendirdl remove_pattern ".*"
    > opendirdl keep_pattern "Daft%20Punk"
    > opendirdl remove_pattern "folder\.jpg"
   Note the percent-encoded string.
3. Download the enabled files with
    > opendirdl download database.db
Specifics:
@ -52,13 +52,13 @@ keep_pattern:
Enable URLs which match a regex pattern. Matches are based on the percent-
encoded strings!
> opendirdl keep_pattern database.db ".*"
> opendirdl keep_pattern website.com.db ".*"
remove_pattern:
Disable URLs which match a regex pattern. Matches are based on the percent-
encoded strings!
> opendirdl remove_pattern database.db ".*"
> opendirdl remove_pattern website.com.db ".*"
list_basenames:
List enabled URLs in order of their base filename. This makes it easier to
@ -76,13 +76,27 @@ list_basenames:
measure:
Sum up the filesizes of all Enabled URLs.
> opendirdl measure database.db <flags>
> opendirdl measure website.com.db <flags>
flags:
-f | --fullscan:
When included, perform HEAD requests when a file's size is not known.
If this flag is not included, and some file's size is unknown, you will
receive a printed note.
tree:
Print the file / folder tree.
> opendirdl tree website.com.db <flags>
flags:
-o "x.txt" | --outputfile "x.txt":
Output the results to a file instead of stdout. This is useful if the
filenames contain special characters that crash Python, or are so long
that the console becomes unreadable.
If the filename ends with ".html", the webpage will use collapsible
boxes rather than plain text.
'''
@ -91,10 +105,14 @@ measure:
# time importing them usually.
import sys
# Please consult my github repo for these files
# https://github.com/voussoir/else
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
import argparse
## ~import bs4
import collections
## ~import hashlib
import os
## ~import re
@ -108,6 +126,8 @@ FILENAME_BADCHARS = '/\\:*?"<>|'
TERMINAL_WIDTH = shutil.get_terminal_size().columns
DOWNLOAD_CHUNK = 16 * bytestring.KIBIBYTE
# When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
# because they're probably files.
# This isn't meant to be a comprehensive filetype library, but it covers enough of the
@ -152,23 +172,43 @@ SKIPPABLE_FILETYPES = [
]
SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
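
In other words, the basic scan assumes a URL ending in one of these extensions is a file and skips the HEAD request. A sketch of that check (the helper name is mine, assuming SKIPPABLE_FILETYPES as defined above):

def probably_file(url):
    # True when the URL ends with a known file extension, in which case
    # the basic scan records it without sending a HEAD request.
    return any(url.lower().endswith(ext) for ext in SKIPPABLE_FILETYPES)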
BYTE = 1
KIBIBYTE = 1024 * BYTE
MIBIBYTE = 1024 * KIBIBYTE
GIBIBYTE = 1024 * MIBIBYTE
TEBIBYTE = 1024 * GIBIBYTE
SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)
UNIT_STRINGS = {
    BYTE: 'b',
    KIBIBYTE: 'KiB',
    MIBIBYTE: 'MiB',
    GIBIBYTE: 'GiB',
    TEBIBYTE: 'TiB',
}
DOWNLOAD_CHUNK = 2 * KIBIBYTE

# oh shit
HTML_TREE_HEADER = '''
<meta charset="UTF-8">
<script type="text/javascript">
function collapse(id)
{
    var div = document.getElementById(id);
    if (div.style.display != "none")
    {
        div.style.display = "none";
    }
    else
    {
        div.style.display = "block";
    }
}
</script>

<style>
*
{
    font-family: Consolas;
}
button
{
    display: block;
}
div
{
    padding: 10px;
    padding-left: 15px;
    margin-bottom: 10px;
    border: 1px solid #000;
}
</style>
'''
DB_INIT = '''
CREATE TABLE IF NOT EXISTS urls(
@ -202,8 +242,7 @@ class Downloader:
# If they aren't, it's the user's fault.
self.cur.execute('SELECT url FROM urls LIMIT 1')
url = self.cur.fetchone()[0]
# returns (root, path, filename). Keep root.
outputdir = url_to_filepath(url)[0]
outputdir = url_to_filepath(url)['root']
self.outputdir = outputdir
def download(self, overwrite=False, bytespersecond=None):
@ -216,13 +255,13 @@ class Downloader:
break
url = fetch[SQL_URL]
''' Creating the Path '''
(root, folder, basename) = url_to_filepath(url)
''' Creating the permanent and temporary filenames '''
url_filepath = url_to_filepath(url)
# Ignore this value of `root`, because we might have a custom outputdir.
root = self.outputdir
folder = os.path.join(root, folder)
root = url_filepath['root']
folder = os.path.join(root, url_filepath['folder'])
os.makedirs(folder, exist_ok=True)
fullname = os.path.join(folder, basename)
fullname = os.path.join(folder, url_filepath['filename'])
temporary_basename = hashit(url, 16) + '.oddltemporary'
temporary_fullname = os.path.join(folder, temporary_basename)
@ -252,6 +291,89 @@ class Generic:
def __init__(self, **kwargs):
for kwarg in kwargs:
setattr(self, kwarg, kwargs[kwarg])
class TreeNode:
def __init__(self, identifier, data, parent=None):
assert isinstance(identifier, str)
assert '\\' not in identifier
self.identifier = identifier
self.data = data
self.parent = parent
self.children = {}
def __getitem__(self, key):
return self.children[key]
def __repr__(self):
return 'TreeNode %s' % self.abspath()
def abspath(self):
node = self
nodes = [node]
while node.parent is not None:
node = node.parent
nodes.append(node)
nodes.reverse()
nodes = [node.identifier for node in nodes]
return '\\'.join(nodes)
def add_child(self, other_node, overwrite_parent=False):
self.check_child_availability(other_node.identifier)
if other_node.parent is not None and not overwrite_parent:
raise ValueError('That node already has a parent. Try `overwrite_parent=True`')
other_node.parent = self
self.children[other_node.identifier] = other_node
return other_node
def check_child_availability(self, identifier):
if ':' in identifier:
raise Exception('Only roots may have a colon')
if identifier in self.children:
raise Exception('Node %s already has child %s' % (self.identifier, identifier))
def detach(self):
del self.parent.children[self.identifier]
self.parent = None
def listnodes(self, customsort=None):
items = list(self.children.items())
if customsort is None:
items.sort(key=lambda x: x[0].lower())
else:
items.sort(key=customsort)
return [item[1] for item in items]
def merge_other(self, othertree, otherroot=None):
newroot = None
if ':' in othertree.identifier:
if otherroot is None:
raise Exception('Must specify a new name for the other tree\'s root')
else:
newroot = otherroot
else:
newroot = othertree.identifier
othertree.identifier = newroot
othertree.parent = self
self.check_child_availability(newroot)
self.children[newroot] = othertree
def printtree(self, customsort=None):
for node in self.walk(customsort):
print(node.abspath())
def sorted_children(self):
keys = sorted(self.children.keys())
for key in keys:
yield (key, self.children[key])
def walk(self, customsort=None):
yield self
for child in self.listnodes(customsort=customsort):
#print(child)
#print(child.listnodes())
yield from child.walk(customsort=customsort)
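
A small demonstration of the class above (identifiers and data invented):

root = TreeNode('website.com', data={'name': 'website.com'})
music = root.add_child(TreeNode('music', data={'name': 'music'}))
music.add_child(TreeNode('song.mp3', data={'name': 'song.mp3'}))
root.printtree()
# website.com
# website.com\music
# website.com\music\song.mp3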
## ##
## GENERIC #########################################################################################
@ -264,7 +386,7 @@ class Walker:
walkurl += '/'
self.walkurl = walkurl
if databasename is None or databasename == "":
self.domain = url_to_filepath(walkurl)[0]
self.domain = url_to_filepath(walkurl)['root']
databasename = self.domain + '.db'
databasename = databasename.replace(':', '')
self.databasename = databasename
@ -275,7 +397,7 @@ class Walker:
db_init(self.sql, self.cur)
self.fullscan = bool(fullscan)
self.queue = []
self.queue = collections.deque()
self.seen_directories = set()
def smart_insert(self, url=None, head=None, commit=True):
@ -301,7 +423,8 @@ class Walker:
if not href.startswith(self.walkurl):
# Don't go to other sites or parent directories.
continue
if 'C=' in href and 'O=' in href:
#if 'C=' in href and 'O=' in href:
if any(sorter in href for sorter in ('?C=', '?O=', '?M=', '?D=', '?N=', '?S=')):
# Alternative sort modes for index pages.
continue
if href.endswith('desktop.ini'):
@ -376,12 +499,12 @@ class Walker:
self.smart_insert(head=head, commit=False)
def walk(self, url=None):
self.queue.append(url)
self.queue.appendleft(url)
try:
while len(self.queue) > 0:
# Popping from the left helps keep the queue short because it handles
# the files early.
url = self.queue.pop(-1)
url = self.queue.popleft()
self.process_url(url)
line = '{:,} Remaining'.format(len(self.queue))
print(line)
@ -395,16 +518,6 @@ class Walker:
## GENERAL FUNCTIONS ###############################################################################
## ##
def bytes_to_unit_string(bytes):
size_unit = 1
for unit in SIZE_UNITS:
if bytes >= unit:
size_unit = unit
break
size_unit_string = UNIT_STRINGS[size_unit]
size_string = '%.3f %s' % ((bytes / size_unit), size_unit_string)
return size_string
def db_init(sql, cur):
lines = DB_INIT.split(';')
for line in lines:
@ -419,20 +532,19 @@ def dict_to_file(jdict, filename):
filehandle.write(text)
filehandle.close()
def do_get(url):
def do_get(url, raise_for_status=True):
return do_request('GET', requests.get, url, raise_for_status=raise_for_status)
def do_head(url):
def do_head(url, raise_for_status=True):
return do_request('HEAD', requests.head, url, raise_for_status=raise_for_status)
def do_request(message, method, url):
import sys
def do_request(message, method, url, raise_for_status=True):
message = '{message:>4s}: {url} : '.format(message=message, url=url)
safeprint(message, end='')
sys.stdout.flush()
safeprint(message, end='', flush=True)
response = method(url)
safeprint(response.status_code)
response.raise_for_status()
if raise_for_status:
response.raise_for_status()
return response
def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None):
@ -511,7 +623,8 @@ def safeprint(text, **kwargs):
def smart_insert(sql, cur, url=None, head=None, commit=True):
'''
INSERT or UPDATE the appropriate entry.
INSERT or UPDATE the appropriate entry, or DELETE if the head
shows a 403 / 404.
'''
if bool(url) is bool(head):
raise ValueError('One and only one of `url` or `head` is necessary.')
@ -523,21 +636,28 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
elif head is not None:
# When doing a full scan, we get a Response object.
url = head.url
content_length = head.headers.get('Content-Length', None)
if content_length is not None:
content_length = int(content_length)
content_type = head.headers.get('Content-Type', None)
if head.status_code in [403, 404]:
cur.execute('DELETE FROM urls WHERE url == ?', [url])
if commit:
sql.commit()
return (url, None, 0, None, 0)
else:
url = head.url
content_length = head.headers.get('Content-Length', None)
if content_length is not None:
content_length = int(content_length)
content_type = head.headers.get('Content-Type', None)
basename = url_to_filepath(url)[2]
basename = url_to_filepath(url)['filename']
basename = urllib.parse.unquote(basename)
do_download = True
cur.execute('SELECT * FROM urls WHERE url == ?', [url])
existing_entry = cur.fetchone()
is_new = existing_entry is None
data = (url, basename, content_length, content_type, do_download)
if is_new:
cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
else:
command = '''
@ -547,6 +667,7 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
WHERE url == ?
'''
cur.execute(command, [content_length, content_type, url])
if commit:
sql.commit()
return data
@ -554,6 +675,7 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
def url_to_filepath(text):
text = urllib.parse.unquote(text)
parts = urllib.parse.urlsplit(text)
scheme = parts.scheme
root = parts.netloc
(folder, filename) = os.path.split(parts.path)
while folder.startswith('/'):
@ -566,42 +688,58 @@ def url_to_filepath(text):
# ...but Files are not.
filename = filepath_sanitize(filename)
return (root, folder, filename)
result = {
'scheme': scheme,
'root': root,
'folder': folder,
'filename': filename,
}
return result
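
For illustration, the new dict-shaped return value on a hypothetical URL:

url_to_filepath('http://website.com/music/song%20title.mp3')
# -> {'scheme': 'http',
#     'root': 'website.com',
#     'folder': 'music',
#     'filename': 'song title.mp3'}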
## ##
## GENERAL FUNCTIONS ###############################################################################
## COMMANDLINE FUNCTIONS ###########################################################################
## ##
def digest(args):
fullscan = args.fullscan
if isinstance(fullscan, str):
fullscan = bool(eval(fullscan))
walkurl = args.walkurl
if walkurl == '!clipboard':
def digest(databasename, walkurl, fullscan=False):
if walkurl in ('!clipboard', '!c'):
walkurl = get_clipboard()
safeprint('From clipboard: %s' % walkurl)
walker = Walker(
databasename=args.databasename,
databasename=databasename,
fullscan=fullscan,
walkurl=walkurl,
)
walker.walk()
def download(args):
bytespersecond = args.bytespersecond
def digest_argparse(args):
return digest(
databasename=args.databasename,
walkurl=args.walkurl,
fullscan=args.fullscan,
)
def download(databasename, outputdir=None, overwrite=False, bytespersecond=None):
if isinstance(bytespersecond, str):
    # int() rather than eval(); the CLI passes a plain number.
    bytespersecond = int(bytespersecond)
downloader = Downloader(
databasename=args.databasename,
outputdir=args.outputdir,
databasename=databasename,
outputdir=outputdir,
)
downloader.download(
bytespersecond=bytespersecond,
overwrite=args.overwrite,
overwrite=overwrite,
)
def download_argparse(args):
return download(
databasename=args.databasename,
outputdir=args.outputdir,
overwrite=args.overwrite,
bytespersecond=args.bytespersecond,
)
def filter_pattern(databasename, regex, action='keep', *trash):
'''
When `action` is 'keep', then any URLs matching the regex will have their
@ -653,15 +791,12 @@ def keep_pattern(args):
regex=args.regex,
)
def list_basenames(args):
def list_basenames(databasename, outputfile=None):
'''
Given a database, print the entries in order of the file basenames.
This makes it easier to find interesting titles without worrying about
what directory they're in.
'''
databasename = args.databasename
outputfile = args.outputfile
sql = sqlite3.connect(databasename)
cur = sql.cursor()
cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1')
@ -691,14 +826,18 @@ def list_basenames(args):
if outputfile:
outputfile.close()
def measure(args):
def list_basenames_argparse(args):
return list_basenames(
databasename=args.databasename,
outputfile=args.outputfile,
)
def measure(databasename, fullscan=False):
'''
Given a database, print the sum of all Content-Lengths.
If `fullscan`, then URLs with no Content-Length will be
HEAD requested, and the result will be saved back into the file.
'''
databasename = args.databasename
fullscan = args.fullscan
if isinstance(fullscan, str):
    # bool('False') would be True, so parse the string properly.
    fullscan = fullscan.lower() in ('1', 'true', 'yes')
@ -708,25 +847,29 @@ def measure(args):
cur2 = sql.cursor()
cur2.execute('SELECT * FROM urls WHERE do_download == 1')
filecount = 0
files_without_size = 0
unmeasured_file_count = 0
try:
while True:
fetch = cur2.fetchone()
if fetch is None:
break
size = fetch[SQL_CONTENT_LENGTH]
if size is None:
if fullscan:
url = fetch[SQL_URL]
head = do_head(url)
fetch = smart_insert(sql, cur1, head=head, commit=False)
size = fetch[SQL_CONTENT_LENGTH]
if size is None:
safeprint('"%s" is not revealing Content-Length' % url)
size = 0
else:
files_without_size += 1
if fullscan:
url = fetch[SQL_URL]
head = do_head(url, raise_for_status=False)
fetch = smart_insert(sql, cur1, head=head, commit=False)
size = fetch[SQL_CONTENT_LENGTH]
if size is None:
safeprint('"%s" is not revealing Content-Length' % url)
size = 0
elif fetch[SQL_CONTENT_LENGTH] is None:
unmeasured_file_count += 1
size = 0
totalsize += size
filecount += 1
except:
@ -734,14 +877,20 @@ def measure(args):
raise
sql.commit()
short_string = bytes_to_unit_string(totalsize)
short_string = bytestring.bytestring(totalsize)
totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
print(totalsize_string)
if files_without_size > 0:
print('Note: %d files do not have a stored Content-Length.' % files_without_size)
if unmeasured_file_count > 0:
print('Note: %d files do not have a stored Content-Length.' % unmeasured_file_count)
print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
return totalsize
def measure_argparse(args):
return measure(
databasename=args.databasename,
fullscan=args.fullscan,
)
def remove_pattern(args):
'''
See `filter_pattern`.
@ -751,6 +900,160 @@ def remove_pattern(args):
databasename=args.databasename,
regex=args.regex,
)
def tree(databasename, output_filename=None):
sql = sqlite3.connect(databasename)
cur = sql.cursor()
cur.execute('SELECT * FROM urls WHERE do_download == 1')
items = cur.fetchall()
if len(items) == 0:
return
items.sort(key=lambda x: x[SQL_URL])
path_parts = url_to_filepath(items[0][SQL_URL])
root_identifier = path_parts['root']
#print('Root', root_identifier)
root_data = {'name': root_identifier, 'item_type': 'directory'}
tree = TreeNode(identifier=root_identifier, data=root_data)
node_map = {}
unmeasured_file_count = 0
for item in items:
path = url_to_filepath(item[SQL_URL])
scheme = path['scheme']
path = '\\'.join([path['root'], path['folder'], path['filename']])
parts = path.split('\\')
for (index, part) in enumerate(parts):
index += 1
this_path = '/'.join(parts[:index])
parent_path = '/'.join(parts[:index-1])
#safeprint('this:' + this_path)
#safeprint('parent:' + parent_path)
#input()
data = {
'name': part,
'url': scheme + '://' + this_path,
}
if index == len(parts):
data['item_type'] = 'file'
if item[SQL_CONTENT_LENGTH]:
data['size'] = item[SQL_CONTENT_LENGTH]
else:
unmeasured_file_count += 1
data['size'] = 0
else:
data['item_type'] = 'directory'
# Ensure this item is in a node of its own
this_node = node_map.get(this_path, None)
if this_node:
# This ID was detected as a parent of a previous iteration
# Now we're actually filling it in.
this_node.data = data
else:
this_node = TreeNode(this_path, data)
node_map[this_path] = this_node
# Attach this node to the parent.
if parent_path == root_identifier:
try:
tree.add_child(this_node)
except:
pass
else:
parent_node = node_map.get(parent_path, None)
if not parent_node:
parent_node = TreeNode(parent_path, data=None)
node_map[parent_path] = parent_node
try:
parent_node.add_child(this_node)
except:
pass
this_node.parent = parent_node
#print(this_node.data)
def write(line, outfile=None):
if outfile is None:
safeprint(line)
else:
outfile.write(line + '\n')
def recursive_get_size(node):
size = node.data.get('size', 0)
if size:
# Files have this attribute, dirs don't
return size
for child in node.children.values():
size += recursive_get_size(child)
node.data['size'] = size
return size
def recursive_print_node(node, depth=0, outfile=None):
if use_html:
if node.data['item_type'] == 'directory':
div_id = hashit(node.identifier, 16)
line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
line += '<div id="{div_id}">'
line = line.format(
div_id=div_id,
name=node.data['name'],
size=bytestring.bytestring(node.data['size']),
)
else:
line = '<a href="{url}">{name} ({size})</a><br>'
line = line.format(
url=node.data['url'],
name=node.data['name'],
size=bytestring.bytestring(node.data['size']),
)
else:
line = '{space}{bar}{name} : ({size})'
line = line.format(
space='| '*(depth-1),
bar='|---' if depth > 0 else '',
name=node.data['name'],
size=bytestring.bytestring(node.data['size'])
)
write(line, outfile)
for (key, child) in node.sorted_children():
recursive_print_node(child, depth+1, outfile=outfile)
if node.data['item_type'] == 'directory':
if use_html:
write('</div>', outfile)
else:
# This helps put some space between sibling directories
write('| ' * (depth), outfile)
recursive_get_size(tree)
use_html = output_filename is not None and output_filename.lower().endswith('.html')
output_file = None
if output_filename is not None:
    output_file = open(output_filename, 'w', encoding='utf-8')
if use_html:
    write(HTML_TREE_HEADER, outfile=output_file)
recursive_print_node(tree, outfile=output_file)
if unmeasured_file_count > 0:
    write('Note: %d files do not have a stored Content-Length.' % unmeasured_file_count, outfile=output_file)
    write('Run `measure` with `-f` or `--fullscan` to HEAD request those files.', outfile=output_file)
if output_file is not None:
    output_file.close()
return tree
def tree_argparse(args):
return tree(
databasename=args.databasename,
output_filename=args.outputfile,
)
## ##
## COMMANDLINE FUNCTIONS ###########################################################################
@ -765,15 +1068,15 @@ if __name__ == '__main__':
p_digest = subparsers.add_parser('digest')
p_digest.add_argument('walkurl')
p_digest.add_argument('-db', '--database', dest='databasename', default=None)
p_digest.add_argument('-f', '--fullscan', action='store_true')
p_digest.set_defaults(func=digest)
p_digest.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
p_digest.set_defaults(func=digest_argparse)
p_download = subparsers.add_parser('download')
p_download.add_argument('databasename')
p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
p_download.add_argument('-ow', '--overwrite', dest='overwrite', default=False)
p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
p_download.set_defaults(func=download)
p_download.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
p_download.set_defaults(func=download_argparse)
p_keep_pattern = subparsers.add_parser('keep_pattern')
p_keep_pattern.add_argument('databasename')
@ -782,18 +1085,23 @@ if __name__ == '__main__':
p_list_basenames = subparsers.add_parser('list_basenames')
p_list_basenames.add_argument('databasename')
p_list_basenames.add_argument('outputfile', nargs='?', default=None)
p_list_basenames.set_defaults(func=list_basenames)
p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
p_list_basenames.set_defaults(func=list_basenames_argparse)
p_measure = subparsers.add_parser('measure')
p_measure.add_argument('databasename')
p_measure.add_argument('-f', '--fullscan', action='store_true')
p_measure.set_defaults(func=measure)
p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
p_measure.set_defaults(func=measure_argparse)
p_remove_pattern = subparsers.add_parser('remove_pattern')
p_remove_pattern.add_argument('databasename')
p_remove_pattern.add_argument('regex')
p_remove_pattern.set_defaults(func=remove_pattern)
p_tree = subparsers.add_parser('tree')
p_tree.add_argument('databasename')
p_tree.add_argument('-o', '--outputfile', dest='outputfile', default=None)
p_tree.set_defaults(func=tree_argparse)
args = parser.parse_args()
args.func(args)

Binary file not shown.

(modified file)

@ -1,3 +1,8 @@
import datetime
import os
import PIL.Image
@ -10,31 +15,44 @@ ID_LENGTH = 22
VALID_TAG_CHARS = string.ascii_lowercase + string.digits + '_-'
MAX_TAG_NAME_LENGTH = 32
SQL_LASTID_COLUMNCOUNT = 2
SQL_LASTID_TAB = 0
SQL_LASTID_ID = 1
SQL_LASTID_COLUMNS = [
'table',
'last_id',
]
SQL_PHOTO_COLUMNCOUNT = 8
SQL_PHOTO_ID = 0
SQL_PHOTO_FILEPATH = 1
SQL_PHOTO_EXTENSION = 2
SQL_PHOTO_WIDTH = 3
SQL_PHOTO_HEIGHT = 4
SQL_PHOTO_AREA = 5
SQL_PHOTO_BYTES = 6
SQL_PHOTO_CREATED = 7
SQL_PHOTO_COLUMNS = [
'id',
'filepath',
'extension',
'width',
'height',
'ratio',
'area',
'bytes',
'created',
]
SQL_PHOTOTAG_COLUMNCOUNT = 2
SQL_PHOTOTAG_PHOTOID = 0
SQL_PHOTOTAG_TAGID = 1
SQL_PHOTOTAG_COLUMNS = [
'photoid',
'tagid',
]
SQL_SYN_COLUMNCOUNT = 2
SQL_SYN_NAME = 0
SQL_SYN_MASTER = 1
SQL_SYN_COLUMNS = [
'name',
'master',
]
SQL_TAG_COLUMNS = [
'id',
'name',
]
SQL_LASTID = {key:index for (index, key) in enumerate(SQL_LASTID_COLUMNS)}
SQL_PHOTO = {key:index for (index, key) in enumerate(SQL_PHOTO_COLUMNS)}
SQL_PHOTOTAG = {key:index for (index, key) in enumerate(SQL_PHOTOTAG_COLUMNS)}
SQL_SYN = {key:index for (index, key) in enumerate(SQL_SYN_COLUMNS)}
SQL_TAG = {key:index for (index, key) in enumerate(SQL_TAG_COLUMNS)}
SQL_TAG_COLUMNCOUNT = 2
SQL_TAG_ID = 0
SQL_TAG_NAME = 1
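
The comprehensions above turn each column list into a name-to-index lookup, so row access can use names instead of bare integers. A hypothetical example (row values invented):

row = ('p1', 'C:\\photos\\cat.jpg', 'jpg', 800, 600, 1.333, 480000, 102400, 1467705600)
filepath = row[SQL_PHOTO['filepath']]   # 'C:\\photos\\cat.jpg'
width = row[SQL_PHOTO['width']]         # 800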
DB_INIT = '''
CREATE TABLE IF NOT EXISTS photos(
@ -43,6 +61,7 @@ CREATE TABLE IF NOT EXISTS photos(
extension TEXT,
width INT,
height INT,
ratio REAL,
area INT,
bytes INT,
created INT
@ -51,6 +70,10 @@ CREATE TABLE IF NOT EXISTS tags(
id TEXT,
name TEXT
);
CREATE TABLE IF NOT EXISTS albums(
albumid TEXT,
photoid TEXT
);
CREATE TABLE IF NOT EXISTS photo_tag_rel(
photoid TEXT,
tagid TEXT
@ -76,15 +99,6 @@ CREATE INDEX IF NOT EXISTS index_tagrel_tagid on photo_tag_rel(tagid);
CREATE INDEX IF NOT EXISTS index_tagsyn_name on tag_synonyms(name);
'''
def assert_lower(*args):
previous = args[0]
for element in args[1:]:
if element is None:
continue
if element < previous:
raise ValueError('Min and Max out of order')
previous = element
def basex(number, base, alphabet='0123456789abcdefghijklmnopqrstuvwxyz'):
'''
Converts an integer to a different base string.
@ -131,12 +145,8 @@ def is_xor(*args):
'''
return [bool(a) for a in args].count(True) == 1
def min_max_query_builder(name, sign, value):