This commit is contained in:
unknown 2016-01-24 12:48:39 -08:00
parent 4b7cfea08d
commit 001a8d970f
10 changed files with 716 additions and 199 deletions

4
DeLetterbox/README.md Normal file
View file

@ -0,0 +1,4 @@
DeLetterbox
===========
I didn't test this very much; I just needed something quick.

View file

@ -0,0 +1,42 @@
from PIL import Image
import os
import sys
# Largest per-channel difference at which two pixels still count as the same color.
CLOSE_ENOUGH_THRESHOLD = 10


def close_enough(a, b):
    '''
    Return True when pixels `a` and `b` differ by no more than
    CLOSE_ENOUGH_THRESHOLD in every channel.

    A pixel may be a sequence of channel values or a bare number, which is
    what `getpixel` yields for single-band images.
    '''
    # Wrap bare numbers so that both kinds of pixel can be zipped.
    if not isinstance(a, (tuple, list)):
        a = (a,)
    if not isinstance(b, (tuple, list)):
        b = (b,)
    return all(
        abs(a_channel - b_channel) <= CLOSE_ENOUGH_THRESHOLD
        for (a_channel, b_channel) in zip(a, b)
    )
def deletterbox(filename):
    '''
    Trim the solid-colored letterbox bars from all four sides of the image
    at `filename`, then save the result over the original file.
    '''
    image = Image.open(filename)
    # Trim the top edge, then give the image a quarter turn so that the next
    # edge becomes the top. Four passes cover every side and bring the image
    # back to its starting orientation.
    for side in range(4):
        image = trim_top(image)
        # expand=True lets the canvas swap width and height with the turn.
        # Without it the output keeps the input's size, which clips
        # non-square images.
        image = image.rotate(90, expand=True)
    image.save(filename)
def trim_top(image):
    '''
    Return `image` with the solid-colored rows at its top cropped away.

    The top-left pixel defines the letterbox color. Rows are scanned from the
    top until one contains a pixel that is not `close_enough` to that color.
    If every row is solid there is nothing to distinguish a bar from the
    picture, so the image is returned untouched.
    '''
    letterbox_color = image.getpixel((0, 0))
    (width, height) = image.size
    for y in range(height):
        row_is_solid = all(
            close_enough(letterbox_color, image.getpixel((x, y)))
            for x in range(width)
        )
        if not row_is_solid:
            break
    else:
        # The loop never broke: the whole image is one flat color.
        return image

    if y == 0:
        # The very first row already has content, so there is no bar to cut.
        return image
    return image.crop((0, y, width, height))
# Script driver: every command-line argument is an image path, and each one
# is de-letterboxed in place.
filenames = sys.argv[1:]
for filename in filenames:
    deletterbox(filename)

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 73 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 24 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

6
OpenDirDL/README.md Normal file
View file

@ -0,0 +1,6 @@
Open Dir DL
===========
Requires `pip install beautifulsoup4`
See inside opendirdl.py for usage instructions.

View file

@ -1,17 +1,85 @@
import bs4 '''
import hashlib OpenDirDL
import json downloads open directories
Usage:
DIGEST:
Recursively fetch directories and build a database of file URLs.
> opendirdl digest !clipboard <flags>
> opendirdl digest http://website.com/directory/ <flags>
flags:
-f | --fullscan : When included, perform HEAD requests on all files, to
know the size of the entire directory.
-db "x.db" | --database "x.db" : Use a custom database filename. By default, databases
are named after the web domain.
DOWNLOAD:
Download the files whose URLs are enabled in the database.
> opendirdl download website.com.db <flags>
flags:
-o "x" | --outputdir "x" : Save the files to a custom directory, "x". By default,
files are saved to a folder named after the web domain.
-ow | --overwrite : When included, download and overwrite files even if they
already exist in the output directory.
-bps 100 | --bytespersecond 100 : Ratelimit yourself to downloading at 100 BYTES per second.
The webmaster will appreciate this.
KEEP_PATTERN:
Enable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
> opendirdl keep_pattern website.com.db ".*"
REMOVE_PATTERN:
Disable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
> opendirdl remove_pattern website.com.db ".*"
LIST_BASENAMES:
List enabled URLs in order of their base filename. This makes it easier to find titles of
interest in a directory that is very scattered or poorly organized.
> opendirdl list_basenames website.com.db <flags>
flags:
-o "x.txt" | --outputfile "x.txt" : Output the results to a file instead of stdout. This is
useful if the filenames contain special characters that
crash Python, or are so long that the console becomes
unreadable.
MEASURE:
Sum up the filesizes of all enabled URLs.
> opendirdl measure website.com.db <flags>
flags:
-f | --fullscan : When included, perform HEAD requests on any URL whose size is not known.
If this flag is not included, and some file's size is unknown, you will
receive a note.
'''
# Module names preceded by two hashes indicate modules that are imported during
# a function, because they are not used anywhere else and we don't need to waste
# time importing them usually.
import argparse
## import bs4
## import hashlib
import os import os
import re import ratelimiter
## import re
import requests import requests
import string import sqlite3
import sys ## import sys
import time ## tkinter
import traceback
import urllib.parse import urllib.parse
FILENAME_BADCHARS = '/\\:*?"<>|' FILENAME_BADCHARS = '/\\:*?"<>|'
DOWNLOAD_CHUNK = 2048
# When doing a basic scan, we will not send HEAD requests to URLs that end in these strings, # When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
# because they're probably files. # because they're probably files.
@ -23,197 +91,343 @@ SKIPPABLE_FILETYPES = [
'.epub', '.epub',
'.db', '.db',
'.flac', '.flac',
'.gif',
# The trailing comma matters: without it Python joins this literal with the
# next one into the single entry '.gz.ico'.
'.gz',
'.ico', '.ico',
'.iso', '.iso',
'.jpeg',
'.jpg', '.jpg',
'.m3u',
'.m4a', '.m4a',
'.mkv', '.mkv',
'.mov', '.mov',
'.mp3', '.mp3',
'.mp4', '.mp4',
'.nfo',
'.ogg',
'.pdf', '.pdf',
'.png', '.png',
'.srt', '.srt',
'.tar',
'.txt', '.txt',
'.webm', '.webm',
'.zip', '.zip',
] ]
SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
SKIPPABLE_FILETYPES = [x.lower() for x in SKIPPABLE_FILETYPES] BYTE = 1
KIBIBYTE = 1024 * BYTE
MIBIBYTE = 1024 * KIBIBYTE
GIBIBYTE = 1024 * MIBIBYTE
TEBIBYTE = 1024 * GIBIBYTE
SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)
UNIT_STRINGS = {
BYTE: 'b',
KIBIBYTE: 'KiB',
MIBIBYTE: 'MiB',
GIBIBYTE: 'GiB',
TEBIBYTE: 'TiB',
}
DOWNLOAD_CHUNK = 2 * KIBIBYTE
DB_INIT = '''
CREATE TABLE IF NOT EXISTS urls(
url TEXT,
basename TEXT,
content_length INT,
content_type TEXT,
do_download INT
);
CREATE INDEX IF NOT EXISTS urlindex on urls(url);
CREATE INDEX IF NOT EXISTS baseindex on urls(basename);
CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length);
'''.strip()
SQL_URL = 0
SQL_BASENAME = 1
SQL_CONTENT_LENGTH = 2
SQL_CONTENT_TYPE = 3
SQL_DO_DOWNLOAD = 4
## DOWNLOADER ######################################################################################
## ##
class Downloader:
    '''
    Downloads every enabled URL of an opendirdl database into a local
    directory tree.
    '''
    def __init__(self, databasename, outputdir=None, headers=None):
        '''
        databasename:
            Filename of the sqlite database to read URLs from.
        outputdir:
            Directory to save files under. When None or empty, the root that
            `url_to_filepath` derives from the first URL in the database is
            used instead.
        headers:
            Accepted but not used by this class.

        Raises ValueError when no outputdir is given and the database has no
        URLs to derive one from.
        '''
        self.databasename = databasename
        self.sql = sqlite3.connect(databasename)
        self.cur = self.sql.cursor()

        if outputdir is None or outputdir == "":
            # This assumes that all URLs in the database are from the same domain.
            # If they aren't, it's the user's fault.
            self.cur.execute('SELECT url FROM urls LIMIT 1')
            fetch = self.cur.fetchone()
            if fetch is None:
                raise ValueError('Database "%s" contains no urls.' % databasename)
            # returns (root, path, filename). Keep root.
            outputdir = url_to_filepath(fetch[0])[0]
        self.outputdir = outputdir

    def download(self, overwrite=False, bytespersecond=None):
        '''
        Download each URL whose `do_download` flag is set, in URL order.

        overwrite:
            When false, files that already exist locally are skipped.
        bytespersecond:
            Passed through to `download_file` as its ratelimit.
        '''
        overwrite = bool(overwrite)

        self.cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url')
        while True:
            fetch = self.cur.fetchone()
            if fetch is None:
                break
            url = fetch[SQL_URL]

            # Creating the path.
            (root, folder, basename) = url_to_filepath(url)
            # Ignore this value of `root`, because we might have a custom outputdir.
            root = self.outputdir
            folder = os.path.join(root, folder)
            os.makedirs(folder, exist_ok=True)
            fullname = os.path.join(folder, basename)
            temporary_basename = hashit(url, 16) + '.oddltemporary'
            temporary_fullname = os.path.join(folder, temporary_basename)

            # Managing overwrite.
            if os.path.isfile(fullname) and not overwrite:
                safeprint('Skipping "%s". Use `--overwrite`' % fullname)
                continue

            safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename))
            filehandle = open(temporary_fullname, 'wb')
            try:
                download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond)
            finally:
                # download_file closes the handle on success; closing a
                # closed file is harmless, and this covers the failure path.
                filehandle.close()
            # Only now is any existing file replaced, so a failed download
            # cannot destroy the previous copy.
            os.replace(temporary_fullname, fullname)
## ##
## DOWNLOADER ######################################################################################
## GENERIC #########################################################################################
## ##
class Generic:
    '''
    Bare attribute container: each keyword argument becomes an attribute of
    the instance.
    '''
    def __init__(self, **kwargs):
        for (name, value) in kwargs.items():
            setattr(self, name, value)
## ##
## GENERIC #########################################################################################
## WALKER ##########################################################################################
## ##
class Walker:
    '''
    Crawls an open directory starting at `walkurl` and records the URLs it
    finds in a sqlite database.
    '''
    def __init__(self, walkurl, databasename=None, fullscan=False):
        '''
        walkurl:
            The directory URL to start from. A trailing slash is appended
            when it is missing.
        databasename:
            Filename of the sqlite database. When None, it is the root that
            `url_to_filepath` derives from `walkurl`, plus '.db'.
        fullscan:
            When true, no URL is skipped on account of its file extension.
        '''
        import collections
        if not walkurl.endswith('/'):
            walkurl += '/'
        self.walkurl = walkurl
        if databasename is None:
            self.domain = url_to_filepath(walkurl)[0]
            databasename = self.domain + '.db'
        self.databasename = databasename

        self.sql = sqlite3.connect(self.databasename)
        self.cur = self.sql.cursor()
        db_init(self.sql, self.cur)

        self.fullscan = bool(fullscan)
        # URLs are taken from the left and added on the right, so a deque
        # avoids shifting the whole list on every pop.
        self.queue = collections.deque()
        self.seen_directories = set()

    def smart_insert(self, url=None, head=None, commit=True):
        '''
        See `smart_insert`.
        '''
        smart_insert(self.sql, self.cur, url=url, head=head, commit=commit)

    def extract_hrefs(self, response, tag='a', attribute='href'):
        '''
        Given a Response object, yield the urls found in `attribute` of every
        `tag` element.
        External links, index sort links, and desktop.ini are discarded.
        '''
        import bs4
        # Name the parser explicitly so the result does not depend on which
        # optional parsers happen to be installed.
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        elements = soup.findAll(tag)
        for element in elements:
            try:
                href = element[attribute]
            except KeyError:
                continue
            href = urllib.parse.urljoin(response.url, href)
            if not href.startswith(self.walkurl):
                # Don't go to other sites or parent directories.
                continue
            if 'C=' in href and 'O=' in href:
                # Alternative sort modes for index pages.
                continue
            if href.endswith('desktop.ini'):
                # I hate these things.
                continue
            yield href

    def process_url(self, url=None):
        '''
        Given a URL, check whether it is an index page or an actual file.
        If it is an index page, its links are extracted and queued.
        If it is a file, its information is saved to the database.

        We perform a
        HEAD:
            when `self.fullscan` is True.
            when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE.
            when the url is an index page.
        GET:
            when the url is an index page.
        '''
        if url is None:
            url = self.walkurl
        else:
            url = urllib.parse.urljoin(self.walkurl, url)

        if not url.startswith(self.walkurl):
            # Don't follow external links or parent directory.
            print('Skipping "%s" due to external url.' % url)
            return

        urll = url.lower()
        # NOTE(review): the nesting of the "already have it" check under the
        # basic-scan branch is reconstructed from a flattened diff -- confirm.
        if self.fullscan is False:
            skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
            if skippable:
                safeprint('Skipping "%s" due to extension.' % url)
                self.smart_insert(url=url)
                return
            self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
            skippable = self.cur.fetchone() is not None
            if skippable:
                safeprint('Skipping "%s" since we already have it.' % url)
                return

        try:
            head = do_head(url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 403:
                print('403 FORBIDDEN!')
                return
            if e.response.status_code == 404:
                print('404 NOT FOUND!')
                return
            raise
        content_type = head.headers.get('Content-Type', '?')

        if content_type.startswith('text/html') and head.url.endswith('/'):
            # This is an index page, so extract links and queue them.
            response = do_get(url)
            hrefs = self.extract_hrefs(response)
            self.seen_directories.add(head.url)
            added = 0
            for href in hrefs:
                if href in self.seen_directories:
                    continue
                self.queue.append(href)
                added += 1
            print('Queued %d urls' % added)
        else:
            # This is not an index page, so save it.
            self.smart_insert(head=head)

    def walk(self, url=None):
        '''
        Process `url` (the walkurl when None) and then everything that gets
        queued along the way, until the queue is empty.
        '''
        self.queue.append(url)
        while len(self.queue) > 0:
            url = self.queue.popleft()
            self.process_url(url)
## ##
## WALKER ##########################################################################################
## GENERAL FUNCTIONS ###############################################################################
## ##
def bytes_to_unit_string(bytes):
    '''
    Format a byte count with three decimals, using the first unit of
    SIZE_UNITS that the count reaches. Counts that reach no unit are
    expressed in single bytes.
    '''
    size_unit = next((unit for unit in SIZE_UNITS if bytes >= unit), 1)
    return '%.3f %s' % ((bytes / size_unit), UNIT_STRINGS[size_unit])
def db_init(sql, cur):
    '''
    Run each semicolon-separated statement of DB_INIT on `cur`, then commit
    on `sql`. Always returns True.
    '''
    for statement in DB_INIT.split(';'):
        cur.execute(statement)
    sql.commit()
    return True
def dict_to_file(jdict, filename):
    '''
    Convert `jdict` to text with `dict_to_string` and write it to `filename`
    encoded as UTF-8.
    '''
    # NOTE(review): `dict_to_string` is not defined in the part of the file
    # visible here -- confirm it exists before relying on this function.
    text = dict_to_string(jdict).encode('utf-8')
    # The with-block closes the file even if the write fails.
    with open(filename, 'wb') as filehandle:
        filehandle.write(text)
def do_get(url):
    '''
    Perform a logged GET request. See `do_request`.
    '''
    return do_request('GET', requests.get, url)

def do_head(url):
    '''
    Perform a logged HEAD request. See `do_request`.
    '''
    return do_request('HEAD', requests.head, url)

def do_request(message, method, url):
    '''
    Print "<message>: <url> : ", call `method(url)`, print the response on
    the same line, and return it. Raises for HTTP error statuses.
    '''
    import sys
    prefix = '{message:>4s}: {url} : '.format(message=message, url=url)
    safeprint(prefix, end='')
    # Flush so the prefix is visible while the request is still in flight.
    sys.stdout.flush()
    response = method(url)
    safeprint(response)
    response.raise_for_status()
    return response
def download_file(url, filehandle, hookfunction=None, headers=None, bytespersecond=None):
    '''
    Stream `url` into `filehandle` in DOWNLOAD_CHUNK sized pieces, then close
    the handle and verify the size on disk.

    hookfunction:
        Called after every chunk as
        hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize).
    headers:
        Extra request headers. Defaults to none.
    bytespersecond:
        When given, a Ratelimiter charges each chunk's length against this
        allowance per second.

    Returns True. Raises for HTTP error statuses, and raises Exception when
    fewer bytes were written than the Content-Length header announced.
    '''
    if headers is None:
        # A fresh dict per call; a `{}` default would be shared by all calls.
        headers = {}

    limiter = None
    if bytespersecond is not None:
        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)

    currentblock = 0
    downloading = requests.get(url, stream=True, headers=headers)
    try:
        # Without this, the body of an error page would be saved as the file.
        downloading.raise_for_status()
        totalsize = int(downloading.headers.get('content-length', 1))
        for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
            if not chunk:
                break
            currentblock += 1
            filehandle.write(chunk)
            if limiter is not None:
                limiter.limit(len(chunk))
            if hookfunction is not None:
                hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
    finally:
        # Release the streamed connection whether or not the loop finished.
        downloading.close()

    filehandle.close()
    size = os.path.getsize(filehandle.name)
    if size < totalsize:
        raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
    return True
def filepath_sanitize(text, allowed=''):
    '''
    Return `text` with every character of FILENAME_BADCHARS removed, except
    for the characters listed in `allowed`.
    '''
    # Judge each bad character on its own, so `allowed` works in any order
    # and need not be a contiguous run of FILENAME_BADCHARS.
    badchars = [char for char in FILENAME_BADCHARS if char not in allowed]
    for char in badchars:
        text = text.replace(char, '')
    return text
def get_clipboard():
    '''
    Return the text currently on the clipboard, read through a temporary
    tkinter root.
    '''
    import tkinter
    t = tkinter.Tk()
    try:
        # Hide the empty root window instead of letting it flash on screen.
        t.withdraw()
        return t.clipboard_get()
    finally:
        # Tear the root down even if reading the clipboard raised.
        t.destroy()
def hashit(text, length=None): def hashit(text, length=None):
import hashlib
h = hashlib.sha512(text.encode('utf-8')).hexdigest() h = hashlib.sha512(text.encode('utf-8')).hexdigest()
if length is not None: if length is not None:
h = h[:length] h = h[:length]
@ -230,12 +444,66 @@ def hook1(currentblock, chunksize, totalsize):
if currentbytes == totalsize: if currentbytes == totalsize:
print() print()
def listget(l, index, default=None):
    '''
    Return `l[index]`, or `default` when that index is out of range.
    '''
    try:
        item = l[index]
    except IndexError:
        item = default
    return item
def longest_length(li):
    '''
    Return the length of the longest item in `li`, or 0 when it is empty.
    '''
    return max((len(item) for item in li), default=0)
def safeprint(text, **kwargs):
    '''
    Print `text` with every non-ASCII character, and every literal question
    mark, shown as an underscore. Keyword arguments go straight to `print`.
    '''
    ascii_text = str(text).encode('ascii', 'replace').decode()
    print(ascii_text.replace('?', '_'), **kwargs)
def smart_insert(sql, cur, url=None, head=None, commit=True):
    '''
    INSERT or UPDATE the appropriate entry.

    Exactly one of `url` and `head` must be given:
    url:
        All that a basic scan knows. The size and type are stored as NULL.
    head:
        A Response object from a full scan. Its url, Content-Length and
        Content-Type are stored.

    An existing row only has its size and type refreshed, and only with
    values that are not None. Commits on `sql` when `commit` is true.

    Returns the tuple (url, basename, content_length, content_type,
    do_download) that was prepared for the row.
    Raises ValueError when both or neither of `url` and `head` are given.
    '''
    has_url = bool(url)
    # Test identity rather than truthiness: the truth value of a Response
    # reflects its status code, not whether one was passed.
    has_head = head is not None
    if has_url == has_head:
        raise ValueError('One and only one of `url` or `head` is necessary.')

    if has_head:
        # When doing a full scan, we get a Response object.
        url = head.url
        content_length = head.headers.get('Content-Length', None)
        if content_length is not None:
            content_length = int(content_length)
        content_type = head.headers.get('Content-Type', None)
    else:
        # When doing a basic scan, all we get is the URL.
        content_length = None
        content_type = None

    basename = url_to_filepath(url)[2]
    basename = urllib.parse.unquote(basename)
    do_download = True
    cur.execute('SELECT * FROM urls WHERE url == ?', [url])
    existing_entry = cur.fetchone()
    is_new = existing_entry is None
    data = (url, basename, content_length, content_type, do_download)
    if is_new:
        cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
    else:
        # coalesce keeps the stored value wherever the new one is NULL.
        command = '''
            UPDATE urls SET
            content_length = coalesce(?, content_length),
            content_type = coalesce(?, content_type)
            WHERE url == ?
        '''
        cur.execute(command, [content_length, content_type, url])
    if commit:
        sql.commit()
    return data
def url_to_filepath(text): def url_to_filepath(text):
text = urllib.parse.unquote(text) text = urllib.parse.unquote(text)
parts = urllib.parse.urlsplit(text) parts = urllib.parse.urlsplit(text)
@ -244,89 +512,230 @@ def url_to_filepath(text):
while folder.startswith('/'): while folder.startswith('/'):
folder = folder[1:] folder = folder[1:]
# Folders are allowed to have slashes # Folders are allowed to have slashes...
folder = filepath_sanitize(folder, exclusions='/\\') folder = filepath_sanitize(folder, allowed='/\\')
folder = folder.replace('\\', os.path.sep) folder = folder.replace('\\', os.path.sep)
folder = folder.replace('/', os.path.sep) folder = folder.replace('/', os.path.sep)
# But Files are not. # ...but Files are not.
filename = filepath_sanitize(filename) filename = filepath_sanitize(filename)
return (root, folder, filename) return (root, folder, filename)
## ##
## GENERAL FUNCTIONS ###############################################################################
## Commandline functions ####################################################\\
def digest(website, outputfile, fullscan, *trash):
fullscan = bool(fullscan)
if website[-1] != '/':
website += '/'
walker = Walker(website, outputfile, fullscan=fullscan)
try:
walker.walk()
dict_to_file(walker.results, outputfile)
except:
dict_to_file(walker.results, outputfile)
traceback.print_exc()
print('SAVED PROGRESS SO FAR')
def download(urlfile, outputdir, overwrite, *trash): ## COMMANDLINE FUNCTIONS ###########################################################################
downloader = Downloader(urlfile, outputdir) ## ##
def digest(args):
    '''
    Commandline handler for `digest`: walk `args.walkurl`, or the clipboard
    contents when it is '!clipboard', into the database.
    '''
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        # NOTE(review): eval() runs on a string taken from the command line.
        fullscan = bool(eval(fullscan))

    if args.walkurl == '!clipboard':
        walkurl = get_clipboard()
        safeprint('From clipboard: %s' % walkurl)
    else:
        walkurl = args.walkurl

    walker = Walker(walkurl=walkurl, databasename=args.databasename, fullscan=fullscan)
    walker.walk()
def download(args):
    '''
    Commandline handler for `download`: fetch every enabled URL of
    `args.databasename` through a Downloader.
    '''
    bytespersecond = args.bytespersecond
    if isinstance(bytespersecond, str):
        # NOTE(review): eval() runs on a string taken from the command line.
        bytespersecond = eval(bytespersecond)

    downloader = Downloader(databasename=args.databasename, outputdir=args.outputdir)
    downloader.download(overwrite=args.overwrite, bytespersecond=bytespersecond)
def filter_pattern(databasename, regex, action='keep', *trash):
    '''
    When `action` is 'keep', then any URLs matching the regex will have their
    `do_download` flag set to True.

    When `action` is 'remove', then any URLs matching the regex will have their
    `do_download` flag set to False.

    Actions will not act on each other's behalf. A 'keep' will NEVER disable a url,
    and 'remove' will NEVER enable one.

    `regex` may be a single pattern or a list of patterns; a URL matches when
    any of them is found in it.
    '''
    import re
    if isinstance(regex, str):
        regex = [regex]

    keep = action == 'keep'
    remove = action == 'remove'

    sql = sqlite3.connect(databasename)
    cur = sql.cursor()
    cur2 = sql.cursor()

    # cur2 walks the table while cur performs the updates.
    cur2.execute('SELECT * FROM urls')
    while True:
        fetch = cur2.fetchone()
        if fetch is None:
            break
        url = fetch[SQL_URL]
        current_do_dl = fetch[SQL_DO_DOWNLOAD]
        # Decide once per URL, so a URL that matches several patterns is
        # reported and updated a single time.
        contains = any(re.search(pattern, url) is not None for pattern in regex)
        if not contains:
            continue

        if keep and not current_do_dl:
            safeprint('Keeping "%s"' % url)
            cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
        if remove and current_do_dl:
            safeprint('Removing "%s"' % url)
            cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
    sql.commit()
def keep_pattern(args):
    '''
    Commandline handler for `keep_pattern`: enable the URLs of
    `args.databasename` that match `args.regex`. See `filter_pattern`.
    '''
    filter_pattern(databasename=args.databasename, regex=args.regex, action='keep')
def list_basenames(args):
    '''
    Given a database, print the enabled entries in order of their file
    basenames, one "basename : url" line each, with the basenames padded to
    a common width.
    This makes it easier to find interesting titles without worrying about
    what directory they're in.

    When `args.outputfile` is set, the lines are written to that file as
    UTF-8 instead of being printed.
    '''
    databasename = args.databasename
    outputfile = args.outputfile

    sql = sqlite3.connect(databasename)
    handle = None
    try:
        cur = sql.cursor()
        cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1')
        fetch = cur.fetchone()
        # With nothing enabled there is no longest basename, and no lines
        # will be produced either.
        longest = len(fetch[0]) if fetch is not None else 0
        form = '{bn:<%ds} : {url}' % longest

        cur.execute('SELECT basename, url FROM urls WHERE do_download == 1 ORDER BY basename')
        if outputfile:
            handle = open(outputfile, 'w', encoding='utf-8')
        for (basename, url) in cur:
            line = form.format(bn=basename, url=url)
            if handle is not None:
                handle.write(line + '\n')
            else:
                print(line)
    finally:
        # Release the file and the connection even if something failed.
        if handle is not None:
            handle.close()
        sql.close()
def measure(args):
    '''
    Given a database, print and return the sum of the Content-Lengths of all
    enabled URLs.
    If `fullscan`, then URLs with no stored Content-Length will be
    HEAD requested, and the result will be saved back into the database.
    '''
    databasename = args.databasename
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        fullscan = bool(fullscan)

    sql = sqlite3.connect(databasename)
    # `reader` walks the table while `writer` stores freshly fetched sizes.
    writer = sql.cursor()
    reader = sql.cursor()
    reader.execute('SELECT * FROM urls WHERE do_download == 1')

    totalsize = 0
    filecount = 0
    files_without_size = 0
    try:
        for fetch in iter(reader.fetchone, None):
            size = fetch[SQL_CONTENT_LENGTH]
            if size is None and fullscan:
                url = fetch[SQL_URL]
                head = do_head(url)
                fetch = smart_insert(sql, writer, head=head, commit=False)
                size = fetch[SQL_CONTENT_LENGTH]
                if size is None:
                    safeprint('"%s" is not revealing Content-Length' % url)
                    size = 0
            elif size is None:
                files_without_size += 1
                size = 0
            totalsize += size
            filecount += 1
    finally:
        # Keep whatever sizes were fetched, whether the loop finished or not.
        sql.commit()

    short_string = bytes_to_unit_string(totalsize)
    totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
    print(totalsize_string)
    if files_without_size > 0:
        print('Note: %d files do not have a stored Content-Length.' % files_without_size)
        print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
    return totalsize
def remove_pattern(args):
    '''
    Commandline handler for `remove_pattern`: disable the URLs of
    `args.databasename` that match `args.regex`. See `filter_pattern`.
    '''
    filter_pattern(databasename=args.databasename, regex=args.regex, action='remove')
## ##
## COMMANDLINE FUNCTIONS ###########################################################################
if __name__ == '__main__':
    # One subparser per command; each stores its handler under `func`.
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    p_digest = subparsers.add_parser('digest')
    p_digest.add_argument('walkurl')
    # `--databasename` is accepted as well, since the usage text has named it.
    p_digest.add_argument('-db', '--database', '--databasename', dest='databasename', default=None)
    p_digest.add_argument('-f', '--fullscan', action='store_true')
    p_digest.set_defaults(func=digest)

    p_download = subparsers.add_parser('download')
    p_download.add_argument('databasename')
    p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
    # A plain flag, as the usage text describes ("When included, ...").
    p_download.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
    p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
    p_download.set_defaults(func=download)

    p_keep_pattern = subparsers.add_parser('keep_pattern')
    p_keep_pattern.add_argument('databasename')
    p_keep_pattern.add_argument('regex')
    p_keep_pattern.set_defaults(func=keep_pattern)

    p_list_basenames = subparsers.add_parser('list_basenames')
    p_list_basenames.add_argument('databasename')
    p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
    p_list_basenames.set_defaults(func=list_basenames)

    p_measure = subparsers.add_parser('measure')
    p_measure.add_argument('databasename')
    p_measure.add_argument('-f', '--fullscan', action='store_true')
    p_measure.set_defaults(func=measure)

    p_remove_pattern = subparsers.add_parser('remove_pattern')
    p_remove_pattern.add_argument('databasename')
    p_remove_pattern.add_argument('regex')
    p_remove_pattern.set_defaults(func=remove_pattern)

    args = parser.parse_args()
    if not hasattr(args, 'func'):
        # No subcommand was given, so no handler was stored.
        parser.print_help()
        raise SystemExit(1)
    args.func(args)

56
OpenDirDL/ratelimiter.py Normal file
View file

@ -0,0 +1,56 @@
import time
class Ratelimiter:
    '''
    Token-balance rate limiter: balance accrues with elapsed time and every
    operation spends some of it.
    '''
    def __init__(self, allowance_per_period, period, operation_cost=1, mode='sleep'):
        '''
        allowance_per_period:
            The number of operations we can perform per `period` seconds.
            Also the ceiling that the accrued balance is clamped to.

        period:
            The number of seconds over which we can perform `allowance_per_period` operations.
            Must be greater than zero.

        operation_cost:
            The default amount to remove from our balance after each operation.
            Pass a `cost` parameter to `self.limit` to use a nondefault value.

        mode:
            'sleep': If we do not have the balance for an operation, sleep until we do.
            Return True every time.
            'reject': If we do not have the balance for an operation, return False.

        Raises ValueError for an unknown mode or a non-positive period.
        '''
        if mode not in ('sleep', 'reject'):
            raise ValueError('Invalid mode %s' % repr(mode))
        if period <= 0:
            # Say so here instead of failing on the division below.
            raise ValueError('Invalid period %s' % repr(period))

        self.allowance_per_period = allowance_per_period
        self.period = period
        self.operation_cost = operation_cost
        self.mode = mode

        # The monotonic clock cannot jump backwards when the system time is
        # adjusted, so elapsed time is never negative.
        self.last_operation = time.monotonic()
        self.balance = 0
        self.gain_rate = allowance_per_period / period

    def limit(self, cost=None):
        '''
        Account for one operation costing `cost` (default `operation_cost`).

        Returns True when the operation may proceed; in 'sleep' mode this
        blocks until that is the case. Returns False only in 'reject' mode,
        when the balance does not cover the cost.
        '''
        if cost is None:
            cost = self.operation_cost

        # Credit the time that passed since the previous call, up to the cap.
        timediff = time.monotonic() - self.last_operation
        self.balance += timediff * self.gain_rate
        self.balance = min(self.balance, self.allowance_per_period)

        deficit = cost - self.balance
        if deficit > 0 and self.mode == 'sleep':
            # Wait exactly as long as it takes to earn the missing balance.
            time.sleep(deficit / self.gain_rate)
            self.balance = cost

        successful = self.balance >= cost
        if successful:
            self.balance -= cost

        self.last_operation = time.monotonic()
        return successful

View file

@ -99,15 +99,24 @@ class BitsToImage:
class ImageToBits: class ImageToBits:
def __init__(self, image, bitness): def __init__(self, image, bitness):
self.image = image self.image = image
self.bitness = bitness
self.width = image.size[0] self.width = image.size[0]
self.height = image.size[1]
self.pixel_index = -1 self.pixel_index = -1
self.bit_index = bitness
self.active_byte = [] self.active_byte = []
self.pixels = self.image.getdata()
#self.bits = ''
#for pixel in self.pixels:
# for channel in pixel:
# self.bits += binary(channel)[-bitness:]
#print(len(self.bits))
def _read(self): def _read(self):
if len(self.active_byte) == 0: if len(self.active_byte) == 0:
self.pixel_index += 1 self.pixel_index += 1
(x, y) = index_to_xy(self.pixel_index, self.width) self.active_byte = self.pixels[self.pixel_index]
self.active_byte = list(self.image.getpixel((x, y)))
self.active_byte = self.active_byte[:3] self.active_byte = self.active_byte[:3]
self.active_byte = [binary(channel) for channel in self.active_byte] self.active_byte = [binary(channel) for channel in self.active_byte]
self.active_byte = [channel[-bitness:] for channel in self.active_byte] self.active_byte = [channel[-bitness:] for channel in self.active_byte]
@ -115,6 +124,7 @@ class ImageToBits:
self.active_byte = list(self.active_byte) self.active_byte = list(self.active_byte)
ret = self.active_byte.pop(0) ret = self.active_byte.pop(0)
self.bit_index += 1
return ret return ret
def read(self, bits=1): def read(self, bits=1):
@ -196,24 +206,14 @@ def encode(imagefilename, secretfilename, bitness=1):
secret_content_length = (secret_size) + (len(secret_extension)) + 1 secret_content_length = (secret_size) + (len(secret_extension)) + 1
requiredpixels = math.ceil(((secret_content_length * 8) + 32) / (3 * bitness)) requiredpixels = math.ceil(((secret_content_length * 8) + 32) / (3 * bitness))
if totalpixels < requiredpixels: if totalpixels < requiredpixels:
raise StegError('Image does not have enough pixels to store the Secret' raise StegError('Image does not have enough pixels to store the Secret. '
'Must have at least %d pixels' % requiredpixels) 'Must have at least %d pixels' % requiredpixels)
print('%d pixels available, %d required' % (totalpixels, requiredpixels)) print('%d pixels available, %d required' % (totalpixels, requiredpixels))
# --> YOU ARE HERE <-- # --> YOU ARE HERE <--
# Because bitness may be between 1 and 8, we need to create a writing buffer
# called `binary_write_buffer`, so that we're always writing the same amount
# of data per color channel.
# If we were to write the secret length / extension on the fly, we might end
# up using the wrong number of bits for the final channel of some pixel.
# Example: 10010101 broken into groups of 3 is [100, 101, 01]
# Note that the last group is not the same size as the desired bitness, and
# will cause decode errors.
pixel = list(image.getpixel((0, 0))) pixel = list(image.getpixel((0, 0)))
binary_write_buffer = ''
# Write secret length # Write secret length
secret_content_length_b = binary(secret_content_length).rjust(32, '0') secret_content_length_b = binary(secret_content_length).rjust(32, '0')