diff --git a/DeLetterbox/README.md b/DeLetterbox/README.md
new file mode 100644
index 0000000..ffa7ff9
--- /dev/null
+++ b/DeLetterbox/README.md
@@ -0,0 +1,4 @@
+DeLetterbox
+===========
+
+I didn't test this very much, just needed something quick.
\ No newline at end of file
diff --git a/DeLetterbox/deletterbox.py b/DeLetterbox/deletterbox.py
new file mode 100644
index 0000000..64a79b2
--- /dev/null
+++ b/DeLetterbox/deletterbox.py
@@ -0,0 +1,42 @@
+from PIL import Image
+import os
+import sys
+
+CLOSE_ENOUGH_THRESHOLD = 10
+
+def close_enough(a, b):
+    for (a_channel, b_channel) in zip(a, b):
+        if abs(a_channel - b_channel) > CLOSE_ENOUGH_THRESHOLD:
+            return False
+    return True
+
+def deletterbox(filename):
+    image = Image.open(filename)
+    trim_top(image)
+    for x in range(4):
+        image = trim_top(image)
+        image = image.rotate(90, expand=True)  # expand=True so non-square images are not cropped by the rotation
+    (base, ext) = os.path.splitext(filename)
+    #filename = base + 'X' + ext
+    image.save(filename)
+
+def trim_top(image):
+    letterbox_color = image.getpixel((0, 0))
+    for y in range(image.size[1]):
+        solid = True
+        for x in range(image.size[0]):
+            pixel = image.getpixel((x, y))
+            if not close_enough(letterbox_color, pixel):
+                solid = False
+                break
+        if not solid:
+            break
+    bounds = (0, y, image.size[0], image.size[1])
+    #print(bounds)
+    image = image.crop(bounds)
+    return image
+
+filenames = sys.argv[1:]
+for filename in filenames:
+    deletterbox(filename)
+
diff --git a/DeLetterbox/example1_after.jpg b/DeLetterbox/example1_after.jpg
new file mode 100644
index 0000000..77f00f9
Binary files /dev/null and b/DeLetterbox/example1_after.jpg differ
diff --git a/DeLetterbox/example1_before.jpg b/DeLetterbox/example1_before.jpg
new file mode 100644
index 0000000..58819e8
Binary files /dev/null and b/DeLetterbox/example1_before.jpg differ
diff --git a/DeLetterbox/example2_after.jpg b/DeLetterbox/example2_after.jpg
new file mode 100644
index 0000000..9cf76c6
Binary files /dev/null and b/DeLetterbox/example2_after.jpg differ
diff --git a/DeLetterbox/example2_before.jpg b/DeLetterbox/example2_before.jpg
new file mode 100644
index 0000000..99aa165
Binary files /dev/null and b/DeLetterbox/example2_before.jpg differ
diff --git a/OpenDirDL/README.md b/OpenDirDL/README.md
new file mode 100644
index 0000000..2cfffd0
--- /dev/null
+++ b/OpenDirDL/README.md
@@ -0,0 +1,6 @@
+Open Dir DL
+===========
+
+Requires `pip install beautifulsoup4`
+
+See inside opendirdl.py for usage instructions.
\ No newline at end of file
diff --git a/OpenDirDL/opendirdl.py b/OpenDirDL/opendirdl.py
index 9bff4be..8512076 100644
--- a/OpenDirDL/opendirdl.py
+++ b/OpenDirDL/opendirdl.py
@@ -1,17 +1,85 @@
-import bs4
-import hashlib
-import json
+'''
+OpenDirDL
+downloads open directories
+
+Usage:
+
+DIGEST:
+    Recursively fetch directories and build a database of file URLs.
+
+    > opendirdl digest !clipboard
+    > opendirdl digest http://website.com/directory/
+
+    flags:
+    -f | --fullscan : When included, perform HEAD requests on all files, to
+                      know the size of the entire directory.
+    -db "x.db" | --database "x.db" : Use a custom database filename. By default,
+                                     databases are named after the web domain.
+
+
+DOWNLOAD:
+    Download the files whose URLs are enabled in the database.
+
+    > opendirdl download website.com.db
+
+    flags:
+    -o "x" | --outputdir "x" : Save the files to a custom directory, "x". By default,
+                               files are saved to a folder named after the web domain.
+    -ow | --overwrite : When included, download and overwrite files even if they
+                        already exist in the output directory.
+    -bps 100 | --bytespersecond 100 : Ratelimit yourself to downloading at 100 BYTES per second.
+                                      The webmaster will appreciate this.
+
+KEEP_PATTERN:
+    Enable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
+
+    > opendirdl keep_pattern website.com.db ".*"
+
+REMOVE_PATTERN:
+    Disable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
+
+    > opendirdl remove_pattern website.com.db ".*"
+
+LIST_BASENAMES:
+    List enabled URLs in order of their base filename. This makes it easier to find titles of
+    interest in a directory that is very scattered or poorly organized.
+
+    > opendirdl list_basenames website.com.db
+
+    flags:
+    -o "x.txt" | --outputfile "x.txt" : Output the results to a file instead of stdout. This is
+                                        useful if the filenames contain special characters that
+                                        crash Python, or are so long that the console becomes
+                                        unreadable.
+
+MEASURE:
+    Sum up the filesizes of all enabled URLs.
+
+    > opendirdl measure website.com.db
+
+    flags:
+    -f | --fullscan : When included, perform HEAD requests on any URL whose size is not known.
+                      If this flag is not included, and some file's size is unknown, you will
+                      receive a note.
+'''
+
+# Module names preceded by two hashes are imported inside the function that uses
+# them, because they are not needed anywhere else and importing them up front
+# would only slow down startup.
+
+import argparse
+## import bs4
+## import hashlib
 import os
-import re
+import ratelimiter
+## import re
 import requests
-import string
-import sys
-import time
-import traceback
+import sqlite3
+## import sys
+## import tkinter
 import urllib.parse
 
 FILENAME_BADCHARS = '/\\:*?"<>|'
 
-DOWNLOAD_CHUNK = 2048
 
 # When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
 # because they're probably files.
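For reference, this is how the walker applies that skip-list during a basic scan. A standalone sketch, not code from the commit: the stand-in set and the example URL are illustrative, and the real SKIPPABLE_FILETYPES list follows below.

SKIPPABLE_FILETYPES = {'.jpg', '.mkv', '.mp3'}   # small stand-in for the real list below
url = 'http://website.com/directory/movie.mkv'
skippable = any(url.lower().endswith(ext) for ext in SKIPPABLE_FILETYPES)
print(skippable)   # True -- a basic scan records this URL without sending a HEAD request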
@@ -23,197 +91,343 @@ SKIPPABLE_FILETYPES = [ '.epub', '.db', '.flac', +'.gif', +'.gz' '.ico', '.iso', +'.jpeg', '.jpg', +'.m3u', '.m4a', '.mkv', '.mov', '.mp3', '.mp4', +'.nfo', +'.ogg', '.pdf', '.png', '.srt', +'.tar', '.txt', '.webm', '.zip', ] +SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES) -SKIPPABLE_FILETYPES = [x.lower() for x in SKIPPABLE_FILETYPES] +BYTE = 1 +KIBIBYTE = 1024 * BYTE +MIBIBYTE = 1024 * KIBIBYTE +GIBIBYTE = 1024 * MIBIBYTE +TEBIBYTE = 1024 * GIBIBYTE +SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE) +UNIT_STRINGS = { + BYTE: 'b', + KIBIBYTE: 'KiB', + MIBIBYTE: 'MiB', + GIBIBYTE: 'GiB', + TEBIBYTE: 'TiB', +} + +DOWNLOAD_CHUNK = 2 * KIBIBYTE + + +DB_INIT = ''' +CREATE TABLE IF NOT EXISTS urls( + url TEXT, + basename TEXT, + content_length INT, + content_type TEXT, + do_download INT + ); +CREATE INDEX IF NOT EXISTS urlindex on urls(url); +CREATE INDEX IF NOT EXISTS baseindex on urls(basename); +CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length); +'''.strip() +SQL_URL = 0 +SQL_BASENAME = 1 +SQL_CONTENT_LENGTH = 2 +SQL_CONTENT_TYPE = 3 +SQL_DO_DOWNLOAD = 4 + + +## DOWNLOADER ###################################################################################### +## ## class Downloader: - def __init__(self, urlfile, outputdir=None, headers=None): - jdict = file_to_dict(urlfile) - self.urls = [item[0] for item in jdict.items()] - self.urls.sort(key=str.lower) + def __init__(self, databasename, outputdir=None, headers=None): + self.databasename = databasename + self.sql = sqlite3.connect(databasename) + self.cur = self.sql.cursor() + + if outputdir is None or outputdir == "": + # This assumes that all URLs in the database are from the same domain. + # If they aren't, it's the user's fault. + self.cur.execute('SELECT url FROM urls LIMIT 1') + url = self.cur.fetchone()[0] + # returns (root, path, filename). Keep root. + outputdir = url_to_filepath(url)[0] self.outputdir = outputdir - if self.outputdir is None or self.outputdir == "": - # returns (root, path, filename). Keep root. - self.outputdir = url_to_filepath(self.urls[0])[0] - - def download(self, overwrite=False): + def download(self, overwrite=False, bytespersecond=None): overwrite = bool(overwrite) - for url in self.urls: + + self.cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url') + while True: + fetch = self.cur.fetchone() + if fetch is None: + break + url = fetch[SQL_URL] + ''' Creating the Path ''' - (root, folder, filename) = url_to_filepath(url) - # In case the user has set a custom download directory, - # ignore the above value of `root`. + (root, folder, basename) = url_to_filepath(url) + # Ignore this value of `root`, because we might have a custom outputdir. root = self.outputdir folder = os.path.join(root, folder) if not os.path.exists(folder): os.makedirs(folder) - localname = os.path.join(folder, filename) + fullname = os.path.join(folder, basename) temporary_basename = hashit(url, 16) + '.oddltemporary' - temporary_localname = os.path.join(folder, temporary_basename) + temporary_fullname = os.path.join(folder, temporary_basename) ''' Managing overwrite ''' - if os.path.isfile(localname): + if os.path.isfile(fullname): if overwrite is True: - os.remove(localname) + os.remove(fullname) else: - safeprint('Skipping "%s". Use `overwrite=True`' % localname) + safeprint('Skipping "%s". 
Use `--overwrite`' % fullname) continue - safeprint('Downloading "%s" as "%s"' % (localname, temporary_basename)) - filehandle = open(temporary_localname, 'wb') + safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename)) + filehandle = open(temporary_fullname, 'wb') try: - download_file(url, filehandle, hookfunction=hook1) - os.rename(temporary_localname, localname) + download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond) + os.rename(temporary_fullname, fullname) except: filehandle.close() raise +## ## +## DOWNLOADER ###################################################################################### + +## GENERIC ######################################################################################### +## ## +class Generic: + def __init__(self, **kwargs): + for kwarg in kwargs: + setattr(self, kwarg, kwargs[kwarg]) +## ## +## GENERIC ######################################################################################### + + +## WALKER ########################################################################################## +## ## class Walker: - def __init__(self, website, outputfile, fullscan=False): - self.website = website + def __init__(self, walkurl, databasename=None, fullscan=False): + if walkurl[-1] != '/': + walkurl += '/' + self.walkurl = walkurl + if databasename is None: + self.domain = url_to_filepath(walkurl)[0] + databasename = self.domain + '.db' + self.databasename = databasename + + self.sql = sqlite3.connect(self.databasename) + self.cur = self.sql.cursor() + db_init(self.sql, self.cur) + self.fullscan = bool(fullscan) - if os.path.exists(outputfile): - self.results = file_to_dict(outputfile) - else: - self.results = {} - self.already_seen = set() + self.queue = [] + self.seen_directories = set() - def add_head_to_results(self, head): - if isinstance(head, str): - # For when we're doing a basic scan, which skips urls that - # look like a file. - self.results[head] = { - 'Content-Length': -1, - 'Content-Type': '?', - } - self.already_seen.add(head) - else: - # For when we're doing a full scan, which does a HEAD request - # for all urls. - self.results[head.url] = { - 'Content-Length': int(head.headers.get('Content-Length', -1)), - 'Content-Type': head.headers.get('Content-Type', '?'), - } - self.already_seen.add(head.url) + def smart_insert(self, url=None, head=None, commit=True): + ''' + See `smart_insert`. + ''' + smart_insert(self.sql, self.cur, url=url, head=head, commit=commit) - def extract_hrefs(self, response): + def extract_hrefs(self, response, tag='a', attribute='href'): + ''' + Given a Response object, extract href urls. + External links, index sort links, and desktop.ini are discarded. + ''' + import bs4 soup = bs4.BeautifulSoup(response.text) - elements = soup.findAll('a') - hrefs = [] + elements = soup.findAll(tag) for element in elements: try: - href = element['href'] + href = element[attribute] except KeyError: continue href = urllib.parse.urljoin(response.url, href) - if not href.startswith(self.website): - # Don't go to other sites or parent directories + if not href.startswith(self.walkurl): + # Don't go to other sites or parent directories. continue if 'C=' in href and 'O=' in href: - # Alternative sort modes for index pages + # Alternative sort modes for index pages. continue if href.endswith('desktop.ini'): - # I hate these things + # I hate these things. 
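+                # (desktop.ini is a hidden Windows folder-settings file; it shows up in many listings but is never a real download.)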
                 continue
-            hrefs.append(href)
-        return hrefs
+            yield href
 
-    def walk(self, url=None):
+    def process_url(self, url=None):
+        '''
+        Given a URL, check whether it is an index page or an actual file.
+        If it is an index page, its links are extracted and queued.
+        If it is a file, its information is saved to the database.
+
+        We perform a
+        HEAD:
+            when `self.fullscan` is True.
+            when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE.
+            when the url is an index page.
+        GET:
+            when the url is an index page.
+        '''
         if url is None:
-            url = self.website
+            url = self.walkurl
         else:
-            url = urllib.parse.urljoin(self.website, url)
+            url = urllib.parse.urljoin(self.walkurl, url)
 
-        results = []
+        if not url.startswith(self.walkurl):
+            # Don't follow external links or parent directory.
+            print('Skipping "%s" due to external url.' % url)
+            return
 
         urll = url.lower()
-        if self.fullscan is False and any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES):
-            print('Skipping "%s" due to extension' % url)
-            self.add_head_to_results(url)
-            return results
+        if self.fullscan is False:
+            skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
+            if skippable:
+                safeprint('Skipping "%s" due to extension.' % url)
+                self.smart_insert(url=url)
+                return
+            self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
+            skippable = self.cur.fetchone() is not None
+            if skippable:
+                safeprint('Skipping "%s" since we already have it.' % url)
+                return
 
-        if not url.startswith(self.website):
-            # Don't follow external links or parent directory.
-            return results
-
-        head = requests.head(url)
-        head.raise_for_status()
-
-        safeprint('HEAD: %s : %s' % (url, head))
+        try:
+            head = do_head(url)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 403:
+                print('403 FORBIDDEN!')
+                return
+            if e.response.status_code == 404:
+                print('404 NOT FOUND!')
+                return
+            raise
 
         content_type = head.headers.get('Content-Type', '?')
-        self.already_seen.add(head.url)
         if content_type.startswith('text/html') and head.url.endswith('/'):
-            # This is an index page, let's get recursive.
-            page = requests.get(url)
-            safeprint(' GET: %s : %s' % (url, page))
-            hrefs = self.extract_hrefs(page)
-            for url in hrefs:
-                if url not in self.results and url not in self.already_seen:
-                    results += self.walk(url)
+            # This is an index page, so extract links and queue them.
+            response = do_get(url)
+            hrefs = self.extract_hrefs(response)
+            self.seen_directories.add(head.url)
+            added = 0
+            for href in hrefs:
+                if href in self.seen_directories:
+                    continue
+                else:
+                    self.queue.append(href)
+                    added += 1
+            print('Queued %d urls' % added)
         else:
-            # Don't add index pages to the results.
-            self.add_head_to_results(head)
+            # This is not an index page, so save it.
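+            # smart_insert records the url, its basename, and the Content-Length / Content-Type from the HEAD response.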
+ self.smart_insert(head=head) - return results + def walk(self, url=None): + self.queue.append(url) + while len(self.queue) > 0: + url = self.queue.pop(0) + self.process_url(url) +## ## +## WALKER ########################################################################################## +## GENERAL FUNCTIONS ############################################################################### +## ## +def bytes_to_unit_string(bytes): + size_unit = 1 + for unit in SIZE_UNITS: + if bytes >= unit: + size_unit = unit + break + size_unit_string = UNIT_STRINGS[size_unit] + size_string = '%.3f %s' % ((bytes / size_unit), size_unit_string) + return size_string + +def db_init(sql, cur): + lines = DB_INIT.split(';') + for line in lines: + cur.execute(line) + sql.commit() + return True + def dict_to_file(jdict, filename): - filehandle = open(filename, 'wb') - text = json.dumps(jdict, indent=4, sort_keys=True) + text = dict_to_string(jdict) text = text.encode('utf-8') + filehandle = open(filename, 'wb') filehandle.write(text) filehandle.close() -def download_file(url, filehandle, getsizeheaders=True, hookfunction=None, headers={}, auth=None): - if getsizeheaders: - totalsize = requests.head(url, headers=headers, auth=auth) - totalsize = int(totalsize.headers['content-length']) +def do_get(url): + return do_request('GET', requests.get, url) + +def do_head(url): + return do_request('HEAD', requests.head, url) + +def do_request(message, method, url): + import sys + message = '{message:>4s}: {url} : '.format(message=message, url=url) + safeprint(message, end='') + sys.stdout.flush() + response = method(url) + safeprint(response) + response.raise_for_status() + return response + +def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None): + if bytespersecond is not None: + limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1) else: - totalsize = 1 + limiter = None + currentblock = 0 - downloading = requests.get(url, stream=True, headers=headers, auth=auth) + downloading = requests.get(url, stream=True, headers=headers) + totalsize = int(downloading.headers.get('content-length', 1)) for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK): - if chunk: - currentblock += 1 - filehandle.write(chunk) - if hookfunction is not None: - hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize) + if not chunk: + break + currentblock += 1 + filehandle.write(chunk) + if limiter is not None: + limiter.limit(len(chunk)) + if hookfunction is not None: + hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize) + filehandle.close() size = os.path.getsize(filehandle.name) if size < totalsize: raise Exception('Did not receive expected total size. 
%d / %d' % (size, totalsize)) return True -def file_to_dict(filename): - filehandle = open(filename, 'rb') - jdict = json.loads(filehandle.read().decode('utf-8')) - filehandle.close() - return jdict - -def filepath_sanitize(text, exclusions=''): - bet = FILENAME_BADCHARS.replace(exclusions, '') +def filepath_sanitize(text, allowed=''): + bet = FILENAME_BADCHARS.replace(allowed, '') for char in bet: text = text.replace(char, '') return text +def get_clipboard(): + import tkinter + t = tkinter.Tk() + clip = t.clipboard_get() + t.destroy() + return clip + def hashit(text, length=None): + import hashlib h = hashlib.sha512(text.encode('utf-8')).hexdigest() if length is not None: h = h[:length] @@ -230,12 +444,66 @@ def hook1(currentblock, chunksize, totalsize): if currentbytes == totalsize: print() +def listget(l, index, default=None): + try: + return l[index] + except IndexError: + return default + +def longest_length(li): + longest = 0 + for item in li: + longest = max(longest, len(item)) + return longest + def safeprint(text, **kwargs): text = str(text) text = text.encode('ascii', 'replace').decode() text = text.replace('?', '_') print(text, **kwargs) +def smart_insert(sql, cur, url=None, head=None, commit=True): + ''' + INSERT or UPDATE the appropriate entry. + ''' + if bool(url) is bool(head): + raise ValueError('One and only one of `url` or `head` is necessary.') + + if url is not None: + # When doing a basic scan, all we get is the URL. + content_length = None + content_type = None + + elif head is not None: + # When doing a full scan, we get a Response object. + url = head.url + content_length = head.headers.get('Content-Length', None) + if content_length is not None: + content_length = int(content_length) + content_type = head.headers.get('Content-Type', None) + + basename = url_to_filepath(url)[2] + basename = urllib.parse.unquote(basename) + do_download = True + cur.execute('SELECT * FROM urls WHERE url == ?', [url]) + existing_entry = cur.fetchone() + is_new = existing_entry is None + data = (url, basename, content_length, content_type, do_download) + if is_new: + + cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data) + else: + command = ''' + UPDATE urls SET + content_length = coalesce(?, content_length), + content_type = coalesce(?, content_type) + WHERE url == ? + ''' + cur.execute(command, [content_length, content_type, url]) + if commit: + sql.commit() + return data + def url_to_filepath(text): text = urllib.parse.unquote(text) parts = urllib.parse.urlsplit(text) @@ -244,89 +512,230 @@ def url_to_filepath(text): while folder.startswith('/'): folder = folder[1:] - # Folders are allowed to have slashes - folder = filepath_sanitize(folder, exclusions='/\\') + # Folders are allowed to have slashes... + folder = filepath_sanitize(folder, allowed='/\\') folder = folder.replace('\\', os.path.sep) folder = folder.replace('/', os.path.sep) - # But Files are not. + # ...but Files are not. 
filename = filepath_sanitize(filename) + return (root, folder, filename) +## ## +## GENERAL FUNCTIONS ############################################################################### -## Commandline functions ####################################################\\ -def digest(website, outputfile, fullscan, *trash): - fullscan = bool(fullscan) - if website[-1] != '/': - website += '/' - walker = Walker(website, outputfile, fullscan=fullscan) - try: - walker.walk() - dict_to_file(walker.results, outputfile) - except: - dict_to_file(walker.results, outputfile) - traceback.print_exc() - print('SAVED PROGRESS SO FAR') -def download(urlfile, outputdir, overwrite, *trash): - downloader = Downloader(urlfile, outputdir) - downloader.download(overwrite) +## COMMANDLINE FUNCTIONS ########################################################################### +## ## +def digest(args): + fullscan = args.fullscan + if isinstance(fullscan, str): + fullscan = bool(eval(fullscan)) + walkurl = args.walkurl + if walkurl == '!clipboard': + walkurl = get_clipboard() + safeprint('From clipboard: %s' % walkurl) + walker = Walker( + databasename=args.databasename, + fullscan=fullscan, + walkurl=walkurl, + ) + walker.walk() -def filter_pattern(urlfile, patterns, negative=False, *trash): +def download(args): + bytespersecond = args.bytespersecond + if isinstance(bytespersecond, str): + bytespersecond = eval(bytespersecond) + + downloader = Downloader( + databasename=args.databasename, + outputdir=args.outputdir, + ) + downloader.download( + bytespersecond=bytespersecond, + overwrite=args.overwrite, + ) + +def filter_pattern(databasename, regex, action='keep', *trash): ''' - When `negative` is True, items are kept when they do NOT match the pattern, - allowing you to delete trash files. + When `action` is 'keep', then any URLs matching the regex will have their + `do_download` flag set to True. - When `negative` is False, items are keep when they DO match the pattern, - allowing you to keep items of interest. + When `action` is 'remove', then any URLs matching the regex will have their + `do_download` flag set to False. + + Actions will not act on each other's behalf. A 'keep' will NEVER disable a url, + and 'remove' will NEVER enable one. 
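+
+    For example:
+        > opendirdl keep_pattern website.com.db "\.flac$"
+        > opendirdl remove_pattern website.com.db "sample"
+    The first command enables every URL ending in .flac; the second then disables any
+    URL containing "sample". URLs matching neither regex keep their current state.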
''' - if isinstance(patterns, str): - patterns = [patterns] - jdict = file_to_dict(urlfile) - keys = list(jdict.keys()) - for key in keys: - for pattern in patterns: - contains = re.search(pattern, key) is not None - if contains ^ negative: - safeprint('Removing "%s"' % key) - del jdict[key] - dict_to_file(jdict, urlfile) + import re + if isinstance(regex, str): + regex = [regex] -def keep_pattern(urlfile, patterns, *trash): - filter_pattern(urlfile=urlfile, patterns=patterns, negative=True) + keep = action == 'keep' + remove = action == 'remove' -def measure(urlfile, *trash): - jdict = file_to_dict(urlfile) - totalbytes = 0 - for (url, info) in jdict.items(): - bytes = info['Content-Length'] - if bytes > 0: - totalbytes += bytes - bytestring = '{:,}'.format(totalbytes) - print(bytestring) - return totalbytes + sql = sqlite3.connect(databasename) + cur = sql.cursor() + cur2 = sql.cursor() -def remove_pattern(urlfile, patterns, *trash): - filter_pattern(urlfile=urlfile, patterns=patterns, negative=False) - -def listget(l, index, default=None): - try: - return l[index] - except IndexError: - return default -cmdfunctions = [digest, download, keep_pattern, measure, remove_pattern] -## End of commandline functions #############################################// - -if __name__ == '__main__': - command = listget(sys.argv, 1, None) - arg1 = listget(sys.argv, 2, None) - arg2 = listget(sys.argv, 3, None) - arg3 = listget(sys.argv, 4, None) - if command is None: - quit() - did_something = False - for function in cmdfunctions: - if command == function.__name__: - function(arg1, arg2, arg3) - did_something = True + cur2.execute('SELECT * FROM urls') + while True: + fetch = cur2.fetchone() + if fetch is None: break - if not did_something: - print('No matching function') \ No newline at end of file + url = fetch[SQL_URL] + current_do_dl = fetch[SQL_DO_DOWNLOAD] + for pattern in regex: + contains = re.search(pattern, url) is not None + + should_keep = (keep and contains) + if keep and contains and not current_do_dl: + safeprint('Keeping "%s"' % url) + cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url]) + if remove and contains and current_do_dl: + safeprint('Removing "%s"' % url) + cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url]) + sql.commit() + +def keep_pattern(args): + ''' + See `filter_pattern`. + ''' + filter_pattern( + action='keep', + databasename=args.databasename, + regex=args.regex, + ) + +def list_basenames(args): + ''' + Given a database, print the entries in order of the file basenames. + This makes it easier to find interesting titles without worrying about + what directory they're in. + ''' + databasename = args.databasename + outputfile = args.outputfile + + sql = sqlite3.connect(databasename) + cur = sql.cursor() + cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1') + + longest = len(cur.fetchone()[0]) + cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY basename') + form = '{bn:<%ds} : {url}' % longest + if outputfile: + outputfile = open(outputfile, 'w', encoding='utf-8') + while True: + fetch = cur.fetchone() + if fetch is None: + break + line = form.format(bn=fetch[SQL_BASENAME], url=fetch[SQL_URL]) + if outputfile: + outputfile.write(line + '\n') + else: + print(line) + if outputfile: + outputfile.close() + +def measure(args): + ''' + Given a database, print the sum of all Content-Lengths. 
+ If `fullscan`, then URLs with no Content-Length will be + HEAD requested, and the result will be saved back into the file. + ''' + databasename = args.databasename + fullscan = args.fullscan + if isinstance(fullscan, str): + fullscan = bool(fullscan) + + totalsize = 0 + sql = sqlite3.connect(databasename) + cur1 = sql.cursor() + cur2 = sql.cursor() + cur2.execute('SELECT * FROM urls WHERE do_download == 1') + filecount = 0 + files_without_size = 0 + try: + while True: + fetch = cur2.fetchone() + if fetch is None: + break + size = fetch[SQL_CONTENT_LENGTH] + if size is None: + if fullscan: + url = fetch[SQL_URL] + head = do_head(url) + fetch = smart_insert(sql, cur1, head=head, commit=False) + size = fetch[SQL_CONTENT_LENGTH] + if size is None: + safeprint('"%s" is not revealing Content-Length' % url) + size = 0 + else: + files_without_size += 1 + size = 0 + totalsize += size + filecount += 1 + except: + sql.commit() + raise + + sql.commit() + short_string = bytes_to_unit_string(totalsize) + totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount) + print(totalsize_string) + if files_without_size > 0: + print('Note: %d files do not have a stored Content-Length.' % files_without_size) + print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.') + return totalsize + +def remove_pattern(args): + ''' + See `filter_pattern`. + ''' + filter_pattern( + action='remove', + databasename=args.databasename, + regex=args.regex, + ) +## ## +## COMMANDLINE FUNCTIONS ########################################################################### + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + subparsers = parser.add_subparsers() + + p_digest = subparsers.add_parser('digest') + p_digest.add_argument('walkurl') + p_digest.add_argument('-db', '--database', dest='databasename', default=None) + p_digest.add_argument('-f', '--fullscan', action='store_true') + p_digest.set_defaults(func=digest) + + p_download = subparsers.add_parser('download') + p_download.add_argument('databasename') + p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None) + p_download.add_argument('-ow', '--overwrite', dest='overwrite', default=False) + p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None) + p_download.set_defaults(func=download) + + p_keep_pattern = subparsers.add_parser('keep_pattern') + p_keep_pattern.add_argument('databasename') + p_keep_pattern.add_argument('regex') + p_keep_pattern.set_defaults(func=keep_pattern) + + p_list_basenames = subparsers.add_parser('list_basenames') + p_list_basenames.add_argument('databasename') + p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None) + p_list_basenames.set_defaults(func=list_basenames) + + p_measure = subparsers.add_parser('measure') + p_measure.add_argument('databasename') + p_measure.add_argument('-f', '--fullscan', action='store_true') + p_measure.set_defaults(func=measure) + + p_remove_pattern = subparsers.add_parser('remove_pattern') + p_remove_pattern.add_argument('databasename') + p_remove_pattern.add_argument('regex') + p_remove_pattern.set_defaults(func=remove_pattern) + + args = parser.parse_args() + args.func(args) diff --git a/OpenDirDL/ratelimiter.py b/OpenDirDL/ratelimiter.py new file mode 100644 index 0000000..55f856e --- /dev/null +++ b/OpenDirDL/ratelimiter.py @@ -0,0 +1,56 @@ +import time + + +class Ratelimiter: + def __init__(self, allowance_per_period, period, operation_cost=1, mode='sleep'): + 
''' + allowance_per_period: + The number of operations we can perform per `period` seconds. + + period: + The number of seconds over which we can perform `allowance_per_period` operations. + + operation_cost: + The default amount to remove from our balance after each operation. + Pass a `cost` parameter to `self.limit` to use a nondefault value. + + mode: + 'sleep': If we do not have the balance for an operation, sleep until we do. + Return True every time. + 'reject': If we do not have the balance for an operation, return False. + ''' + if mode not in ('sleep', 'reject'): + raise ValueError('Invalid mode %s' % repr(mode)) + self.allowance_per_period = allowance_per_period + self.period = period + self.operation_cost = operation_cost + self.mode = mode + + self.last_operation = time.time() + self.balance = 0 + self.gain_rate = allowance_per_period / period + + def limit(self, cost=None): + if cost is None: + cost = self.operation_cost + timediff = time.time() - self.last_operation + self.balance += timediff * self.gain_rate + self.balance = min(self.balance, self.allowance_per_period) + successful = False + + deficit = cost - self.balance + if deficit > 0 and self.mode == 'sleep': + time_needed = (deficit / self.gain_rate) + #print(self.balance, deficit, 'Need to sleep %f' % time_needed) + time.sleep(time_needed) + self.balance = cost + + #print(self.balance) + if self.balance >= cost: + #print('pass') + self.balance -= cost + successful = True + + self.last_operation = time.time() + + return successful diff --git a/Steganographic/steganographic.py b/Steganographic/steganographic.py index 5e5f06e..f5da08c 100644 --- a/Steganographic/steganographic.py +++ b/Steganographic/steganographic.py @@ -99,15 +99,24 @@ class BitsToImage: class ImageToBits: def __init__(self, image, bitness): self.image = image + self.bitness = bitness self.width = image.size[0] + self.height = image.size[1] self.pixel_index = -1 + self.bit_index = bitness self.active_byte = [] + self.pixels = self.image.getdata() + #self.bits = '' + #for pixel in self.pixels: + # for channel in pixel: + # self.bits += binary(channel)[-bitness:] + #print(len(self.bits)) + def _read(self): if len(self.active_byte) == 0: self.pixel_index += 1 - (x, y) = index_to_xy(self.pixel_index, self.width) - self.active_byte = list(self.image.getpixel((x, y))) + self.active_byte = self.pixels[self.pixel_index] self.active_byte = self.active_byte[:3] self.active_byte = [binary(channel) for channel in self.active_byte] self.active_byte = [channel[-bitness:] for channel in self.active_byte] @@ -115,6 +124,7 @@ class ImageToBits: self.active_byte = list(self.active_byte) ret = self.active_byte.pop(0) + self.bit_index += 1 return ret def read(self, bits=1): @@ -196,24 +206,14 @@ def encode(imagefilename, secretfilename, bitness=1): secret_content_length = (secret_size) + (len(secret_extension)) + 1 requiredpixels = math.ceil(((secret_content_length * 8) + 32) / (3 * bitness)) if totalpixels < requiredpixels: - raise StegError('Image does not have enough pixels to store the Secret' + raise StegError('Image does not have enough pixels to store the Secret. ' 'Must have at least %d pixels' % requiredpixels) print('%d pixels available, %d required' % (totalpixels, requiredpixels)) # --> YOU ARE HERE <-- - # Because bitness may be between 1 and 8, we need to create a writing buffer - # called `binary_write_buffer`, so that we're always writing the same amount - # of data per color channel. 
-    # If we were to write the secret length / extension on the fly, we might end
-    # up using the wrong number of bits for the final channel of some pixel.
-    # Example: 10010101 broken into groups of 3 is [100, 101, 01]
-    # Note that the last group is not the same size as the desired bitness, and
-    # will cause decode errors.
-    pixel = list(image.getpixel((0, 0)))
-    binary_write_buffer = ''
 
     # Write secret length
     secret_content_length_b = binary(secret_content_length).rjust(32, '0')
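The comment block removed above is the clearest statement of the problem this part of encode() deals with: a secret byte is 8 bits, but each color channel only carries `bitness` bits, so writing byte-by-byte can leave a ragged final group (the 10010101 -> [100, 101, 01] example). A small sketch of the buffering idea, for illustration only; `emit_full_groups` is a hypothetical helper, not code from this repository.

def emit_full_groups(buffer, bitness):
    # Emit only complete `bitness`-sized groups; keep the remainder buffered.
    cut = len(buffer) - (len(buffer) % bitness)
    groups = [buffer[i:i + bitness] for i in range(0, cut, bitness)]
    return groups, buffer[cut:]

buffer = '10010101'                      # one secret byte, as a bit string
groups, buffer = emit_full_groups(buffer, 3)
print(groups, buffer)                    # ['100', '101'] '01'  -- '01' waits for more bits
buffer += '11110000'                     # the next secret byte arrives
groups, buffer = emit_full_groups(buffer, 3)
print(groups, buffer)                    # ['011', '111', '000'] '0'

Only complete groups are ever written into pixel channels; any leftover bits stay in the buffer until the next byte tops it up.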