else

2016-01-24 12:48:39 -08:00 · 2016-01-24 12:48:39 -08:00 · 001a8d970f
commit 001a8d970f
parent 4b7cfea08d
10 changed files with 716 additions and 199 deletions
--- a/DeLetterbox/README.md
+++ b/DeLetterbox/README.md
@ -0,0 +1,4 @@
+DeLetterbox
+===========
+
+I didn't test this very much, just needed something quick.
--- a/DeLetterbox/deletterbox.py
+++ b/DeLetterbox/deletterbox.py
@ -0,0 +1,42 @@
+from PIL import Image
+import os
+import sys
+
+CLOSE_ENOUGH_THRESHOLD = 10
+
+def close_enough(a, b):
+    for (a_channel, b_channel) in zip(a, b):
+        if abs(a_channel - b_channel) > CLOSE_ENOUGH_THRESHOLD:
+            return False
+    return True
+
+def deletterbox(filename):
+    image = Image.open(filename)
+    trim_top(image)
+    for x in range(4):
+        image = trim_top(image)
+        image = image.rotate(90)
+    (base, ext) = os.path.splitext(filename)
+    #filename = base + 'X' + ext
+    image.save(filename)
+
+def trim_top(image):
+    letterbox_color = image.getpixel((0, 0))
+    for y in range(image.size[1]):
+        solid = True
+        for x in range(image.size[0]):
+            pixel = image.getpixel((x, y))
+            if not close_enough(letterbox_color, pixel):
+                solid = False
+                break
+        if not solid:
+            break
+    bounds = (0, y, image.size[0], image.size[1])
+    #print(bounds)
+    image = image.crop(bounds)
+    return image
+
+# Every command-line argument is treated as an image path. Each file is
+# deletterboxed and saved back over itself.
+filenames = sys.argv[1:]
+for filename in filenames:
+    deletterbox(filename)
+
--- a/DeLetterbox/example1_after.jpg
+++ b/DeLetterbox/example1_after.jpg
--- a/DeLetterbox/example1_before.jpg
+++ b/DeLetterbox/example1_before.jpg
--- a/DeLetterbox/example2_after.jpg
+++ b/DeLetterbox/example2_after.jpg
--- a/DeLetterbox/example2_before.jpg
+++ b/DeLetterbox/example2_before.jpg
--- a/OpenDirDL/README.md
+++ b/OpenDirDL/README.md
@ -0,0 +1,6 @@
+Open Dir DL
+===========
+
+Requires `pip install beautifulsoup4`
+
+See inside opendirdl.py for usage instructions.
--- a/OpenDirDL/opendirdl.py
+++ b/OpenDirDL/opendirdl.py
@ -1,17 +1,85 @@
-import bs4
-import hashlib
-import json
+'''
+OpenDirDL
+downloads open directories
+
+Usage:
+
+DIGEST:
+    Recursively fetch directories and build a database of file URLs.
+
+    > opendirdl digest !clipboard <flags>
+    > opendirdl digest http://website.com/directory/ <flags>
+
+    flags:
+                           -f | --fullscan : When included, perform HEAD requests on all files, to
+                                             know the size of the entire directory.
+            -db "x.db" | --database "x.db" : Use a custom database filename. By default, databases
+                                             are named after the web domain.
+
+
+DOWNLOAD:
+    Download the files whose URLs are enabled in the database.
+
+    > opendirdl download website.com.db <flags>
+
+    flags:
+               -o "x" | --outputdir "x" : Save the files to a custom directory, "x". By default,
+                                          files are saved to a folder named after the web domain.
+                      -ow | --overwrite : When included, download and overwrite files even if they
+                                          already exist in the output directory.
+        -bps 100 | --bytespersecond 100 : Ratelimit yourself to downloading at 100 BYTES per second.
+                                          The webmaster will appreciate this.
+
+KEEP_PATTERN:
+    Enable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
+
+    > opendirdl keep_pattern website.com.db ".*"
+
+REMOVE_PATTERN:
+    Disable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
+
+    > opendirdl remove_pattern website.com.db ".*"
+
+LIST_BASENAMES:
+    List enabled URLs in order of their base filename. This makes it easier to find titles of
+    interest in a directory that is very scattered or poorly organized.
+
+    > opendirdl list_basenames website.com.db <flags>
+
+    flags:
+        -o "x.txt" | --outputfile "x.txt" : Output the results to a file instead of stdout. This is
+                                            useful if the filenames contain special characters that
+                                            crash Python, or are so long that the console becomes
+                                            unreadable.
+
+MEASURE:
+    Sum up the filesizes of all enabled URLs.
+
+    > opendirdl measure website.com.db <flags>
+
+    flags:
+        -f | --fullscan : When included, perform HEAD requests on any URL whose size is not known.
+                          If this flag is not included, and some file's size is unknown, you will
+                          receive a note.
+'''
+
+# Module names preceded by two hashes indicate modules that are imported during
+# a function, because they are not used anywhere else and we don't need to waste
+# time importing them usually.
+
+import argparse
+## import bs4
+## import hashlib
 import os
-import re
+import ratelimiter
+## import re
 import requests
-import string
-import sys
-import time
-import traceback
+import sqlite3
+## import sys
+## tkinter
 import urllib.parse

 FILENAME_BADCHARS = '/\\:*?"<>|'
-DOWNLOAD_CHUNK = 2048

 # When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
 # because they're probably files.
@ -23,197 +91,343 @@ SKIPPABLE_FILETYPES = [
 '.epub',
 '.db',
 '.flac',
+'.gif',
+'.gz',
 '.ico',
 '.iso',
+'.jpeg',
 '.jpg',
+'.m3u',
 '.m4a',
 '.mkv',
 '.mov',
 '.mp3',
 '.mp4',
+'.nfo',
+'.ogg',
 '.pdf',
 '.png',
 '.srt',
+'.tar',
 '.txt',
 '.webm',
 '.zip',
 ]
+SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)

-SKIPPABLE_FILETYPES = [x.lower() for x in SKIPPABLE_FILETYPES]
+BYTE = 1
+KIBIBYTE = 1024 * BYTE
+MIBIBYTE = 1024 * KIBIBYTE
+GIBIBYTE = 1024 * MIBIBYTE
+TEBIBYTE = 1024 * GIBIBYTE
+SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)

+UNIT_STRINGS = {
+    BYTE: 'b',
+    KIBIBYTE: 'KiB',
+    MIBIBYTE: 'MiB',
+    GIBIBYTE: 'GiB',
+    TEBIBYTE: 'TiB',
+}
+
+DOWNLOAD_CHUNK = 2 * KIBIBYTE
+
+
+DB_INIT = '''
+CREATE TABLE IF NOT EXISTS urls(
+    url TEXT,
+    basename TEXT,
+    content_length INT,
+    content_type TEXT,
+    do_download INT
+    );
+CREATE INDEX IF NOT EXISTS urlindex on urls(url);
+CREATE INDEX IF NOT EXISTS baseindex on urls(basename);
+CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length);
+'''.strip()
+SQL_URL = 0
+SQL_BASENAME = 1
+SQL_CONTENT_LENGTH = 2
+SQL_CONTENT_TYPE = 3
+SQL_DO_DOWNLOAD = 4
+
+
+## DOWNLOADER ######################################################################################
+##                                                                                                ##
 class Downloader:
-    def __init__(self, urlfile, outputdir=None, headers=None):
-        jdict = file_to_dict(urlfile)
-        self.urls = [item[0] for item in jdict.items()]
-        self.urls.sort(key=str.lower)
+    def __init__(self, databasename, outputdir=None, headers=None):
+        self.databasename = databasename
+        self.sql = sqlite3.connect(databasename)
+        self.cur = self.sql.cursor()
+
+        if outputdir is None or outputdir == "":
+            # This assumes that all URLs in the database are from the same domain.
+            # If they aren't, it's the user's fault.
+            self.cur.execute('SELECT url FROM urls LIMIT 1')
+            url = self.cur.fetchone()[0]
+            # returns (root, path, filename). Keep root.
+            outputdir = url_to_filepath(url)[0]
        self.outputdir = outputdir

-        if self.outputdir is None or self.outputdir == "":
-            # returns (root, path, filename). Keep root.
-            self.outputdir = url_to_filepath(self.urls[0])[0]
-
-    def download(self, overwrite=False):
+    def download(self, overwrite=False, bytespersecond=None):
        overwrite = bool(overwrite)
-        for url in self.urls:
+
+        self.cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url')
+        while True:
+            fetch = self.cur.fetchone()
+            if fetch is None:
+                break
+            url = fetch[SQL_URL]
+
            ''' Creating the Path '''
-            (root, folder, filename) = url_to_filepath(url)
-            # In case the user has set a custom download directory,
-            # ignore the above value of `root`.
+            (root, folder, basename) = url_to_filepath(url)
+            # Ignore this value of `root`, because we might have a custom outputdir.
            root = self.outputdir
            folder = os.path.join(root, folder)
            if not os.path.exists(folder):
                os.makedirs(folder)
-            localname = os.path.join(folder, filename)
+            fullname = os.path.join(folder, basename)
            temporary_basename = hashit(url, 16) + '.oddltemporary'
-            temporary_localname = os.path.join(folder, temporary_basename)
+            temporary_fullname = os.path.join(folder, temporary_basename)

            ''' Managing overwrite '''
-            if os.path.isfile(localname):
+            if os.path.isfile(fullname):
                if overwrite is True:
-                    os.remove(localname)
+                    os.remove(fullname)
                else:
-                    safeprint('Skipping "%s". Use `overwrite=True`' % localname)
+                    safeprint('Skipping "%s". Use `--overwrite`' % fullname)
                    continue

-            safeprint('Downloading "%s" as "%s"' % (localname, temporary_basename))
-            filehandle = open(temporary_localname, 'wb')
+            safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename))
+            filehandle = open(temporary_fullname, 'wb')
            try:
-                download_file(url, filehandle, hookfunction=hook1)
-                os.rename(temporary_localname, localname)
+                download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond)
+                os.rename(temporary_fullname, fullname)
            except:
                filehandle.close()
                raise
+##                                                                                                ##
+## DOWNLOADER ######################################################################################

+
+## GENERIC #########################################################################################
+##                                                                                                ##
+class Generic:
+    def __init__(self, **kwargs):
+        for kwarg in kwargs:
+            setattr(self, kwarg, kwargs[kwarg])
+##                                                                                                ##
+## GENERIC #########################################################################################
+
+
+## WALKER ##########################################################################################
+##                                                                                                ##
 class Walker:
-    def __init__(self, website, outputfile, fullscan=False):
-        self.website = website
+    def __init__(self, walkurl, databasename=None, fullscan=False):
+        if walkurl[-1] != '/':
+            walkurl += '/'
+        self.walkurl = walkurl
+        if databasename is None:
+            self.domain = url_to_filepath(walkurl)[0]
+            databasename = self.domain + '.db'
+        self.databasename = databasename
+
+        self.sql = sqlite3.connect(self.databasename)
+        self.cur = self.sql.cursor()
+        db_init(self.sql, self.cur)
+
        self.fullscan = bool(fullscan)
-        if os.path.exists(outputfile):
-            self.results = file_to_dict(outputfile)
-        else:
-            self.results = {}
-        self.already_seen = set()
+        self.queue = []
+        self.seen_directories = set()

-    def add_head_to_results(self, head):
-        if isinstance(head, str):
-            # For when we're doing a basic scan, which skips urls that
-            # look like a file.
-            self.results[head] = {
-                'Content-Length': -1,
-                'Content-Type': '?',
-            }
-            self.already_seen.add(head)
-        else:
-            # For when we're doing a full scan, which does a HEAD request
-            # for all urls.
-            self.results[head.url] = {
-                'Content-Length': int(head.headers.get('Content-Length', -1)),
-                'Content-Type': head.headers.get('Content-Type', '?'),
-            }
-            self.already_seen.add(head.url)
+    def smart_insert(self, url=None, head=None, commit=True):
+        '''
+        See `smart_insert`.
+        '''
+        smart_insert(self.sql, self.cur, url=url, head=head, commit=commit)

-    def extract_hrefs(self, response):
+    def extract_hrefs(self, response, tag='a', attribute='href'):
+        '''
+        Given a Response object, extract href urls.
+        External links, index sort links, and desktop.ini are discarded.
+        '''
+        import bs4
        soup = bs4.BeautifulSoup(response.text)
-        elements = soup.findAll('a')
-        hrefs = []
+        elements = soup.findAll(tag)
        for element in elements:
            try:
-                href = element['href']
+                href = element[attribute]
            except KeyError:
                continue
            href = urllib.parse.urljoin(response.url, href)
-            if not href.startswith(self.website):
-                # Don't go to other sites or parent directories
+            if not href.startswith(self.walkurl):
+                # Don't go to other sites or parent directories.
                continue
            if 'C=' in href and 'O=' in href:
-                # Alternative sort modes for index pages
+                # Alternative sort modes for index pages.
                continue
            if href.endswith('desktop.ini'):
-                # I hate these things
+                # I hate these things.
                continue
-            hrefs.append(href)
-        return hrefs
+            yield href

-    def walk(self, url=None):
+    def process_url(self, url=None):
+        '''
+        Given a URL, check whether it is an index page or an actual file.
+        If it is an index page, its links are extracted and queued.
+        If it is a file, its information is saved to the database.
+
+        We perform a 
+        HEAD:
+            when `self.fullscan` is True.
+            when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE.
+            when the url is an index page.
+        GET:
+            when the url is an index page.
+        '''
        if url is None:
-            url = self.website
+            url = self.walkurl
        else:
-            url = urllib.parse.urljoin(self.website, url)
+            url = urllib.parse.urljoin(self.walkurl, url)

-        results = []
+        if not url.startswith(self.walkurl):
+            # Don't follow external links or parent directory.
+            print('Skipping "%s" due to external url.' % url)
+            return

        urll = url.lower()
-        if self.fullscan is False and any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES):
-            print('Skipping "%s" due to extension' % url)
-            self.add_head_to_results(url)
-            return results
+        if self.fullscan is False:
+            skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
+            if skippable:
+                safeprint('Skipping "%s" due to extension.' % url)
+                self.smart_insert(url=url)
+                return
+            self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
+            skippable = self.cur.fetchone() is not None
+            if skippable:
+                safeprint('Skipping "%s" since we already have it.' % url)
+                return

-        if not url.startswith(self.website):
-            # Don't follow external links or parent directory.
-            return results
-
-        head = requests.head(url)
-        head.raise_for_status()
-
-        safeprint('HEAD: %s : %s' % (url, head))
+        try:
+            head = do_head(url)
+        except requests.exceptions.HTTPError as e:
+            if e.response.status_code == 403:
+                print('403 FORBIDDEN!')
+                return
+            if e.response.status_code == 404:
+                print('404 NOT FOUND!')
+                return
+            raise
        content_type = head.headers.get('Content-Type', '?')
-        self.already_seen.add(head.url)

        if content_type.startswith('text/html') and head.url.endswith('/'):
-            # This is an index page, let's get recursive.
-            page = requests.get(url)
-            safeprint(' GET: %s : %s' % (url, page))
-            hrefs = self.extract_hrefs(page)
-            for url in hrefs:
-                if url not in self.results and url not in self.already_seen:
-                    results += self.walk(url)
+            # This is an index page, so extract links and queue them.
+            response = do_get(url)
+            hrefs = self.extract_hrefs(response)
+            self.seen_directories.add(head.url)
+            added = 0
+            for href in hrefs:
+                if href in self.seen_directories:
+                    continue
+                else:
+                    self.queue.append(href)
+                    added += 1
+            print('Queued %d urls' % added)
        else:
-            # Don't add index pages to the results.
-            self.add_head_to_results(head)
+            # This is not an index page, so save it.
+            self.smart_insert(head=head)

-        return results
+    def walk(self, url=None):
+        self.queue.append(url)
+        while len(self.queue) > 0:
+            url = self.queue.pop(0)
+            self.process_url(url)
+##                                                                                                ##
+## WALKER ##########################################################################################


+## GENERAL FUNCTIONS ###############################################################################
+##                                                                                                ##
+def bytes_to_unit_string(bytes):
+    size_unit = 1
+    for unit in SIZE_UNITS:
+        if bytes >= unit:
+            size_unit = unit
+            break
+    size_unit_string = UNIT_STRINGS[size_unit]
+    size_string = '%.3f %s' % ((bytes / size_unit), size_unit_string)
+    return size_string
+
+def db_init(sql, cur):
+    lines = DB_INIT.split(';')
+    for line in lines:
+        cur.execute(line)
+    sql.commit()
+    return True
+
 def dict_to_file(jdict, filename):
-    filehandle = open(filename, 'wb')
-    text = json.dumps(jdict, indent=4, sort_keys=True)
+    text = dict_to_string(jdict)
    text = text.encode('utf-8')
+    filehandle = open(filename, 'wb')
    filehandle.write(text)
    filehandle.close()

-def download_file(url, filehandle, getsizeheaders=True, hookfunction=None, headers={}, auth=None):
-    if getsizeheaders:
-        totalsize = requests.head(url, headers=headers, auth=auth)
-        totalsize = int(totalsize.headers['content-length'])
+def do_get(url):
+    return do_request('GET', requests.get, url)
+
+def do_head(url):
+    return do_request('HEAD', requests.head, url)
+
+def do_request(message, method, url):
+    import sys
+    message = '{message:>4s}: {url} : '.format(message=message, url=url)
+    safeprint(message, end='')
+    sys.stdout.flush()
+    response = method(url)
+    safeprint(response)
+    response.raise_for_status()
+    return response
+    
+def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None):
+    if bytespersecond is not None:
+        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)
    else:
-        totalsize = 1
+        limiter = None
+
    currentblock = 0
-    downloading = requests.get(url, stream=True, headers=headers, auth=auth)
+    downloading = requests.get(url, stream=True, headers=headers)
+    totalsize = int(downloading.headers.get('content-length', 1))
    for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
-        if chunk:
-            currentblock += 1
-            filehandle.write(chunk)
-            if hookfunction is not None:
-                hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
+        if not chunk:
+            break
+        currentblock += 1
+        filehandle.write(chunk)
+        if limiter is not None:
+            limiter.limit(len(chunk))
+        if hookfunction is not None:
+            hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
+
    filehandle.close()
    size = os.path.getsize(filehandle.name)
    if size < totalsize:
        raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
    return True

-def file_to_dict(filename):
-    filehandle = open(filename, 'rb')
-    jdict = json.loads(filehandle.read().decode('utf-8'))
-    filehandle.close()
-    return jdict
-
-def filepath_sanitize(text, exclusions=''):
-    bet = FILENAME_BADCHARS.replace(exclusions, '')
+def filepath_sanitize(text, allowed=''):
+    bet = FILENAME_BADCHARS.replace(allowed, '')
    for char in bet:
        text = text.replace(char, '')
    return text

+def get_clipboard():
+    import tkinter
+    t = tkinter.Tk()
+    clip = t.clipboard_get()
+    t.destroy()
+    return clip
+
 def hashit(text, length=None):
+    import hashlib
    h = hashlib.sha512(text.encode('utf-8')).hexdigest()
    if length is not None:
        h = h[:length]
@ -230,12 +444,66 @@ def hook1(currentblock, chunksize, totalsize):
    if currentbytes == totalsize:
        print()

+def listget(l, index, default=None):
+    try:
+        return l[index]
+    except IndexError:
+        return default
+
+def longest_length(li):
+    longest = 0
+    for item in li:
+        longest = max(longest, len(item))
+    return longest
+
 def safeprint(text, **kwargs):
    text = str(text)
    text = text.encode('ascii', 'replace').decode()
    text = text.replace('?', '_')
    print(text, **kwargs)

+def smart_insert(sql, cur, url=None, head=None, commit=True):
+    '''
+    INSERT or UPDATE the appropriate entry.
+    '''
+    if bool(url) is bool(head):
+        raise ValueError('One and only one of `url` or `head` is necessary.')
+
+    if url is not None:
+        # When doing a basic scan, all we get is the URL.
+        content_length = None
+        content_type = None
+
+    elif head is not None:
+        # When doing a full scan, we get a Response object.
+        url = head.url
+        content_length = head.headers.get('Content-Length', None)
+        if content_length is not None:
+            content_length = int(content_length)
+        content_type = head.headers.get('Content-Type', None)
+
+    basename = url_to_filepath(url)[2]
+    basename = urllib.parse.unquote(basename)
+    do_download = True
+    cur.execute('SELECT * FROM urls WHERE url == ?', [url])
+    existing_entry = cur.fetchone()
+    is_new = existing_entry is None
+    data = (url, basename, content_length, content_type, do_download)
+    if is_new:
+        
+        cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
+    else:
+        command = '''
+            UPDATE urls SET
+            content_length = coalesce(?, content_length),
+            content_type = coalesce(?, content_type)
+            WHERE url == ?
+        '''
+        cur.execute(command, [content_length, content_type, url])
+    if commit:
+        sql.commit()
+    return data
+
 def url_to_filepath(text):
    text = urllib.parse.unquote(text)
    parts = urllib.parse.urlsplit(text)
@ -244,89 +512,230 @@ def url_to_filepath(text):
    while folder.startswith('/'):
        folder = folder[1:]

-    # Folders are allowed to have slashes
-    folder = filepath_sanitize(folder, exclusions='/\\')
+    # Folders are allowed to have slashes...
+    folder = filepath_sanitize(folder, allowed='/\\')
    folder = folder.replace('\\', os.path.sep)
    folder = folder.replace('/', os.path.sep)
-    # But Files are not.
+    # ...but Files are not.
    filename = filepath_sanitize(filename)
+
    return (root, folder, filename)
+##                                                                                                ##
+## GENERAL FUNCTIONS ###############################################################################

-## Commandline functions ####################################################\\
-def digest(website, outputfile, fullscan, *trash):
-    fullscan = bool(fullscan)
-    if website[-1] != '/':
-        website += '/'
-    walker = Walker(website, outputfile, fullscan=fullscan)
-    try:
-        walker.walk()
-        dict_to_file(walker.results, outputfile)
-    except:
-        dict_to_file(walker.results, outputfile)
-        traceback.print_exc()
-        print('SAVED PROGRESS SO FAR')

-def download(urlfile, outputdir, overwrite, *trash):
-    downloader = Downloader(urlfile, outputdir)
-    downloader.download(overwrite)
+## COMMANDLINE FUNCTIONS ###########################################################################
+##                                                                                                ##
+def digest(args):
+    fullscan = args.fullscan
+    if isinstance(fullscan, str):
+        fullscan = bool(eval(fullscan))
+    walkurl = args.walkurl
+    if walkurl == '!clipboard':
+        walkurl = get_clipboard()
+        safeprint('From clipboard: %s' % walkurl)
+    walker = Walker(
+        databasename=args.databasename,
+        fullscan=fullscan,
+        walkurl=walkurl,
+        )
+    walker.walk()

-def filter_pattern(urlfile, patterns, negative=False, *trash):
+def download(args):
+    bytespersecond = args.bytespersecond
+    if isinstance(bytespersecond, str):
+        bytespersecond = eval(bytespersecond)
+
+    downloader = Downloader(
+        databasename=args.databasename,
+        outputdir=args.outputdir,
+        )
+    downloader.download(
+        bytespersecond=bytespersecond,
+        overwrite=args.overwrite,
+        )
+
+def filter_pattern(databasename, regex, action='keep', *trash):
    '''
-    When `negative` is True, items are kept when they do NOT match the pattern,
-    allowing you to delete trash files.
+    When `action` is 'keep', then any URLs matching the regex will have their
+    `do_download` flag set to True.

-    When `negative` is False, items are keep when they DO match the pattern,
-    allowing you to keep items of interest.
+    When `action` is 'remove', then any URLs matching the regex will have their
+    `do_download` flag set to False.
+
+    Actions will not act on each other's behalf. A 'keep' will NEVER disable a url,
+    and 'remove' will NEVER enable one.
    '''
-    if isinstance(patterns, str):
-        patterns = [patterns]
-    jdict = file_to_dict(urlfile)
-    keys = list(jdict.keys())
-    for key in keys:
-        for pattern in patterns:
-            contains = re.search(pattern, key) is not None
-            if contains ^ negative:
-                safeprint('Removing "%s"' % key)
-                del jdict[key]
-    dict_to_file(jdict, urlfile)
+    import re
+    if isinstance(regex, str):
+        regex = [regex]

-def keep_pattern(urlfile, patterns, *trash):
-    filter_pattern(urlfile=urlfile, patterns=patterns, negative=True)
+    keep = action == 'keep'
+    remove = action == 'remove'

-def measure(urlfile, *trash):
-    jdict = file_to_dict(urlfile)
-    totalbytes = 0
-    for (url, info) in jdict.items():
-        bytes = info['Content-Length']
-        if bytes > 0:
-            totalbytes += bytes
-    bytestring = '{:,}'.format(totalbytes)
-    print(bytestring)
-    return totalbytes
+    sql = sqlite3.connect(databasename)
+    cur = sql.cursor()
+    cur2 = sql.cursor()

-def remove_pattern(urlfile, patterns, *trash):
-    filter_pattern(urlfile=urlfile, patterns=patterns, negative=False)
-
-def listget(l, index, default=None):
-    try:
-        return l[index]
-    except IndexError:
-        return default
-cmdfunctions = [digest, download, keep_pattern, measure, remove_pattern]
-## End of commandline functions #############################################//
-    
-if __name__ == '__main__':
-    command = listget(sys.argv, 1, None)
-    arg1 = listget(sys.argv, 2, None)
-    arg2 = listget(sys.argv, 3, None)
-    arg3 = listget(sys.argv, 4, None)
-    if command is None:
-        quit()
-    did_something = False
-    for function in cmdfunctions:
-        if command == function.__name__:
-            function(arg1, arg2, arg3)
-            did_something = True
+    cur2.execute('SELECT * FROM urls')
+    while True:
+        fetch = cur2.fetchone()
+        if fetch is None:
            break
-    if not did_something:
-        print('No matching function')
+        url = fetch[SQL_URL]
+        current_do_dl = fetch[SQL_DO_DOWNLOAD]
+        for pattern in regex:
+            contains = re.search(pattern, url) is not None
+
+            should_keep = (keep and contains)
+            if keep and contains and not current_do_dl:
+                safeprint('Keeping "%s"' % url)
+                cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
+            if remove and contains and current_do_dl:
+                safeprint('Removing "%s"' % url)
+                cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
+    sql.commit()
+
+def keep_pattern(args):
+    '''
+    See `filter_pattern`.
+    '''
+    filter_pattern(
+        action='keep',
+        databasename=args.databasename,
+        regex=args.regex,
+        )
+
+def list_basenames(args):
+    '''
+    Given a database, print the entries in order of the file basenames.
+    This makes it easier to find interesting titles without worrying about
+    what directory they're in.
+    '''
+    databasename = args.databasename
+    outputfile = args.outputfile
+
+    sql = sqlite3.connect(databasename)
+    cur = sql.cursor()
+    cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1')
+
+    longest = len(cur.fetchone()[0])
+    cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY basename')
+    form = '{bn:<%ds}  :  {url}' % longest
+    if outputfile:
+        outputfile = open(outputfile, 'w', encoding='utf-8')
+    while True:
+        fetch = cur.fetchone()
+        if fetch is None:
+            break
+        line = form.format(bn=fetch[SQL_BASENAME], url=fetch[SQL_URL])
+        if outputfile:
+            outputfile.write(line + '\n')
+        else:
+            print(line)
+    if outputfile:
+        outputfile.close()
+
+def measure(args):
+    '''
+    Given a database, print the sum of all Content-Lengths.
+    If `fullscan`, then URLs with no Content-Length will be
+    HEAD requested, and the result will be saved back into the file.
+    '''
+    databasename = args.databasename
+    fullscan = args.fullscan
+    if isinstance(fullscan, str):
+        fullscan = bool(fullscan)
+
+    totalsize = 0
+    sql = sqlite3.connect(databasename)
+    cur1 = sql.cursor()
+    cur2 = sql.cursor()
+    cur2.execute('SELECT * FROM urls WHERE do_download == 1')
+    filecount = 0
+    files_without_size = 0
+    try:
+        while True:
+            fetch = cur2.fetchone()
+            if fetch is None:
+                break
+            size = fetch[SQL_CONTENT_LENGTH]
+            if size is None:
+                if fullscan:
+                    url = fetch[SQL_URL]
+                    head = do_head(url)
+                    fetch = smart_insert(sql, cur1, head=head, commit=False)
+                    size = fetch[SQL_CONTENT_LENGTH]
+                    if size is None:
+                        safeprint('"%s" is not revealing Content-Length' % url)
+                        size = 0
+                else:
+                    files_without_size += 1
+                    size = 0
+            totalsize += size
+            filecount += 1
+    except:
+        sql.commit()
+        raise
+
+    sql.commit()
+    short_string = bytes_to_unit_string(totalsize)
+    totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
+    print(totalsize_string)
+    if files_without_size > 0:
+        print('Note: %d files do not have a stored Content-Length.' % files_without_size)
+        print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
+    return totalsize
+
+def remove_pattern(args):
+    '''
+    See `filter_pattern`.
+    '''
+    filter_pattern(
+        action='remove',
+        databasename=args.databasename,
+        regex=args.regex,
+        )
+##                                                                                                ##
+## COMMANDLINE FUNCTIONS ###########################################################################
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    subparsers = parser.add_subparsers()
+
+    p_digest = subparsers.add_parser('digest')
+    p_digest.add_argument('walkurl')
+    p_digest.add_argument('-db', '--database', dest='databasename', default=None)
+    p_digest.add_argument('-f', '--fullscan', action='store_true')
+    p_digest.set_defaults(func=digest)
+
+    p_download = subparsers.add_parser('download')
+    p_download.add_argument('databasename')
+    p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
+    p_download.add_argument('-ow', '--overwrite', dest='overwrite', default=False)
+    p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
+    p_download.set_defaults(func=download)
+
+    p_keep_pattern = subparsers.add_parser('keep_pattern')
+    p_keep_pattern.add_argument('databasename')
+    p_keep_pattern.add_argument('regex')
+    p_keep_pattern.set_defaults(func=keep_pattern)
+
+    p_list_basenames = subparsers.add_parser('list_basenames')
+    p_list_basenames.add_argument('databasename')
+    p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
+    p_list_basenames.set_defaults(func=list_basenames)
+
+    p_measure = subparsers.add_parser('measure')
+    p_measure.add_argument('databasename')
+    p_measure.add_argument('-f', '--fullscan', action='store_true')
+    p_measure.set_defaults(func=measure)
+
+    p_remove_pattern = subparsers.add_parser('remove_pattern')
+    p_remove_pattern.add_argument('databasename')
+    p_remove_pattern.add_argument('regex')
+    p_remove_pattern.set_defaults(func=remove_pattern)
+
+    args = parser.parse_args()
+    args.func(args)
--- a/OpenDirDL/ratelimiter.py
+++ b/OpenDirDL/ratelimiter.py
@ -0,0 +1,56 @@
+import time
+
+
+class Ratelimiter:
+    '''
+    Token-bucket style rate limiter.
+
+    A balance accrues at `allowance_per_period / period` units per second, is
+    capped at `allowance_per_period`, and starts at 0. Each call to `limit`
+    spends `cost` units from it, either sleeping or refusing when the balance
+    is too low, depending on `mode`.
+    '''
+    def __init__(self, allowance_per_period, period, operation_cost=1, mode='sleep'):
+        '''
+        allowance_per_period:
+            The number of operations we can perform per `period` seconds.
+            Must be greater than 0.
+
+        period:
+            The number of seconds over which we can perform `allowance_per_period` operations.
+            Must be greater than 0.
+
+        operation_cost:
+            The default amount to remove from our balance after each operation.
+            Pass a `cost` parameter to `self.limit` to use a nondefault value.
+
+        mode:
+            'sleep':  If we do not have the balance for an operation, sleep until we do.
+                      Return True every time.
+            'reject': If we do not have the balance for an operation, return False.
+
+        Raises ValueError if `mode` is not recognized, or if `period` or
+        `allowance_per_period` is not positive.
+        '''
+        if mode not in ('sleep', 'reject'):
+            raise ValueError('Invalid mode %s' % repr(mode))
+        # A non-positive period or allowance gives a zero or negative gain
+        # rate, which would otherwise surface later as a ZeroDivisionError or
+        # a negative sleep duration inside `limit`.
+        if period <= 0:
+            raise ValueError('period must be greater than 0, not %s' % repr(period))
+        if allowance_per_period <= 0:
+            raise ValueError(
+                'allowance_per_period must be greater than 0, not %s' % repr(allowance_per_period)
+            )
+        self.allowance_per_period = allowance_per_period
+        self.period = period
+        self.operation_cost = operation_cost
+        self.mode = mode
+
+        self.last_operation = time.time()
+        self.balance = 0
+        # Balance units earned per second.
+        self.gain_rate = allowance_per_period / period
+
+    def limit(self, cost=None):
+        '''
+        Account for one operation costing `cost` units (default
+        `self.operation_cost`).
+
+        Returns True if the cost was deducted from the balance and the
+        operation may proceed. In 'sleep' mode this blocks until the balance
+        suffices and so always returns True; in 'reject' mode it returns False
+        without deducting anything when the balance is insufficient.
+        '''
+        if cost is None:
+            cost = self.operation_cost
+
+        # time.time() follows the wall clock, which can be set backwards.
+        # Clamp the elapsed time at zero so such a jump cannot drain the
+        # balance.
+        timediff = max(0, time.time() - self.last_operation)
+        self.balance = min(
+            self.balance + (timediff * self.gain_rate),
+            self.allowance_per_period,
+        )
+
+        deficit = cost - self.balance
+        if deficit > 0 and self.mode == 'sleep':
+            # Wait exactly long enough to earn the missing balance.
+            time.sleep(deficit / self.gain_rate)
+            self.balance = cost
+
+        successful = self.balance >= cost
+        if successful:
+            self.balance -= cost
+
+        self.last_operation = time.time()
+
+        return successful
--- a/Steganographic/steganographic.py
+++ b/Steganographic/steganographic.py
@ -99,15 +99,24 @@ class BitsToImage:
 class ImageToBits:
    def __init__(self, image, bitness):
+        # Set up sequential reading of `image`'s pixel data.
+        # image:   object exposing `.size` and `.getdata()`
+        #          (presumably a PIL Image -- TODO confirm).
+        # bitness: stored on the instance; `_read` slices each channel's
+        #          binary string with `[-bitness:]`, i.e. the trailing
+        #          `bitness` characters.
        self.image = image
+        self.bitness = bitness
        self.width = image.size[0]
+        self.height = image.size[1]
+        # Starts at -1 because `_read` increments it before fetching a pixel,
+        # so the first pixel fetched is index 0.
        self.pixel_index = -1
+        # Incremented once per `_read` call.
+        # NOTE(review): starts at `bitness` rather than 0; the reason is not
+        # visible in this hunk -- confirm against the callers.
+        self.bit_index = bitness
+        # Pending values for `_read` to pop; refilled from the next pixel
+        # whenever it runs empty.
        self.active_byte = []
+        # Pixel data fetched once up front; `_read` indexes it with
+        # `pixel_index`.
+        self.pixels = self.image.getdata()
+        #self.bits = ''
+        #for pixel in self.pixels:
+        #    for channel in pixel:
+        #        self.bits += binary(channel)[-bitness:]
+        #print(len(self.bits))
+

    def _read(self):
        if len(self.active_byte) == 0:
            self.pixel_index += 1
-            (x, y) = index_to_xy(self.pixel_index, self.width)
-            self.active_byte = list(self.image.getpixel((x, y)))
+            self.active_byte = self.pixels[self.pixel_index]
            self.active_byte = self.active_byte[:3]
            self.active_byte = [binary(channel) for channel in self.active_byte]
            self.active_byte = [channel[-bitness:] for channel in self.active_byte]
@ -115,6 +124,7 @@ class ImageToBits:
            self.active_byte = list(self.active_byte)

        ret = self.active_byte.pop(0)
+        self.bit_index += 1
        return ret

    def read(self, bits=1):
@ -196,24 +206,14 @@ def encode(imagefilename, secretfilename, bitness=1):
    secret_content_length = (secret_size) + (len(secret_extension)) + 1
    requiredpixels = math.ceil(((secret_content_length * 8) + 32) / (3 * bitness))
    if totalpixels < requiredpixels:
-        raise StegError('Image does not have enough pixels to store the Secret'
+        raise StegError('Image does not have enough pixels to store the Secret. '
                        'Must have at least %d pixels' % requiredpixels)

    print('%d pixels available, %d required' % (totalpixels, requiredpixels))

    # --> YOU ARE HERE <--

-    # Because bitness may be between 1 and 8, we need to create a writing buffer
-    # called `binary_write_buffer`, so that we're always writing the same amount
-    # of data per color channel.
-    # If we were to write the secret length / extension on the fly, we might end
-    # up using the wrong number of bits for the final channel of some pixel.
-    # Example: 10010101 broken into groups of 3 is [100, 101, 01]
-    # Note that the last group is not the same size as the desired bitness, and
-    # will cause decode errors.
-
    pixel = list(image.getpixel((0, 0)))
-    binary_write_buffer = ''

    # Write secret length
    secret_content_length_b = binary(secret_content_length).rjust(32, '0')