else

2016-01-24 12:48:39 -08:00 · 2016-01-24 12:48:39 -08:00 · 001a8d970f
commit 001a8d970f
parent 4b7cfea08d
10 changed files with 716 additions and 199 deletions
--- a/DeLetterbox/README.md
+++ b/DeLetterbox/README.md
@ -0,0 +1,4 @@
 DeLetterbox
 ===========
 I didn't test this very much, just needed something quick.
--- a/DeLetterbox/deletterbox.py
+++ b/DeLetterbox/deletterbox.py
@ -0,0 +1,42 @@
 from PIL import Image
 import os
 import sys
 CLOSE_ENOUGH_THRESHOLD = 10
 def close_enough(a, b):
    for (a_channel, b_channel) in zip(a, b):
        if abs(a_channel - b_channel) > CLOSE_ENOUGH_THRESHOLD:
            return False
    return True
 def deletterbox(filename):
    image = Image.open(filename)
    trim_top(image)
    for x in range(4):
        image = trim_top(image)
        image = image.rotate(90)
    (base, ext) = os.path.splitext(filename)
    #filename = base + 'X' + ext
    image.save(filename)
 def trim_top(image):
    letterbox_color = image.getpixel((0, 0))
    for y in range(image.size[1]):
        solid = True
        for x in range(image.size[0]):
            pixel = image.getpixel((x, y))
            if not close_enough(letterbox_color, pixel):
                solid = False
                break
        if not solid:
            break
    bounds = (0, y, image.size[0], image.size[1])
    #print(bounds)
    image = image.crop(bounds)
    return image
 filenames = sys.argv[1:]
 for filename in filenames:
    deletterbox(filename)
--- a/DeLetterbox/example1_after.jpg
+++ b/DeLetterbox/example1_after.jpg
--- a/DeLetterbox/example1_before.jpg
+++ b/DeLetterbox/example1_before.jpg
--- a/DeLetterbox/example2_after.jpg
+++ b/DeLetterbox/example2_after.jpg
--- a/DeLetterbox/example2_before.jpg
+++ b/DeLetterbox/example2_before.jpg
--- a/OpenDirDL/README.md
+++ b/OpenDirDL/README.md
@ -0,0 +1,6 @@
 Open Dir DL
 ===========
 Requires `pip install beautifulsoup4`
 See inside opendirdl.py for usage instructions.
--- a/OpenDirDL/opendirdl.py
+++ b/OpenDirDL/opendirdl.py
@ -1,17 +1,85 @@
-import bs4
+'''
-import hashlib
+OpenDirDL
-import json
+downloads open directories
 Usage:
 DIGEST:
    Recursively fetch directories and build a database of file URLs.
    > opendirdl digest !clipboard <flags>
    > opendirdl digest http://website.com/directory/ <flags>
    flags:
                           -f | --fullscan : When included, perform HEAD requests on all files, to
                                             know the size of the entire directory.
        -dv "x.db" | --databasename "x.db" : Use a custom database filename. By default, databases
                                             are named after the web domain.
 DOWNLOAD:
    Download the files whose URLs are enabled in the database.
    > opendirdl download website.com.db <flags>
    flags:
               -o "x" | --outputdir "x" : Save the files to a custom directory, "x". By default,
                                          files are saved to a folder named after the web domain.
                      -ow | --overwrite : When included, download and overwrite files even if they
                                          already exist in the output directory.
        -bps 100 | --bytespersecond 100 : Ratelimit yourself to downloading at 100 BYTES per second.
                                          The webmaster will appreciate this.
 KEEP_PATTERN:
    Enable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
    > opendirdl keep_pattern website.com.db ".*"
 REMOVE_PATTERN:
    Disable URLs which match a regex pattern. Matches are based on the percent-encoded strings!
    > opendirdl remove_pattern website.com.db ".*"
 LIST_BASENAMES:
    List enabled URLs in order of their base filename. This makes it easier to find titles of
    interest in a directory that is very scattered or poorly organized.
    > opendirdl list_basenames website.com.db <flags>
    flags:
        -o "x.txt" | --outputfile "x.txt" : Output the results to a file instead of stdout. This is
                                            useful if the filenames contain special characters that
                                            crash Python, or are so long that the console becomes
                                            unreadable.
 MEASURE:
    Sum up the filesizes of all enabled URLs.
    > opendirdl measure website.com.db <flags>
    flags:
        -f | --fullscan : When included, perform HEAD requests on any URL whose size is not known.
                          If this flag is not included, and some file's size is unkown, you will
                          receive a note.
 '''
 # Module names preceeded by two hashes indicate modules that are imported during
 # a function, because they are not used anywhere else and we don't need to waste
 # time importing them usually.
 import argparse
 ## import bs4
 ## import hashlib
 import os
-import re
+import ratelimiter
 ## import re
 import requests
-import string
+import sqlite3
-import sys
+## import sys
-import time
+## tkinter
 import traceback
 import urllib.parse
 FILENAME_BADCHARS = '/\\:*?"<>|'
 DOWNLOAD_CHUNK = 2048
 # When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
 # because they're probably files.
@ -23,197 +91,343 @@ SKIPPABLE_FILETYPES = [
 '.epub',
 '.db',
 '.flac',
 '.gif',
 '.gz'
 '.ico',
 '.iso',
 '.jpeg',
 '.jpg',
 '.m3u',
 '.m4a',
 '.mkv',
 '.mov',
 '.mp3',
 '.mp4',
 '.nfo',
 '.ogg',
 '.pdf',
 '.png',
 '.srt',
 '.tar',
 '.txt',
 '.webm',
 '.zip',
 ]
 SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
-SKIPPABLE_FILETYPES = [x.lower() for x in SKIPPABLE_FILETYPES]
+BYTE = 1
 KIBIBYTE = 1024 * BYTE
 MIBIBYTE = 1024 * KIBIBYTE
 GIBIBYTE = 1024 * MIBIBYTE
 TEBIBYTE = 1024 * GIBIBYTE
 SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)
 UNIT_STRINGS = {
    BYTE: 'b',
    KIBIBYTE: 'KiB',
    MIBIBYTE: 'MiB',
    GIBIBYTE: 'GiB',
    TEBIBYTE: 'TiB',
 }
 DOWNLOAD_CHUNK = 2 * KIBIBYTE
 DB_INIT = '''
 CREATE TABLE IF NOT EXISTS urls(
    url TEXT,
    basename TEXT,
    content_length INT,
    content_type TEXT,
    do_download INT
    );
 CREATE INDEX IF NOT EXISTS urlindex on urls(url);
 CREATE INDEX IF NOT EXISTS baseindex on urls(basename);
 CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length);
 '''.strip()
 SQL_URL = 0
 SQL_BASENAME = 1
 SQL_CONTENT_LENGTH = 2
 SQL_CONTENT_TYPE = 3
 SQL_DO_DOWNLOAD = 4
 ## DOWNLOADER ######################################################################################
 ##                                                                                                ##
 class Downloader:
-    def __init__(self, urlfile, outputdir=None, headers=None):
+    def __init__(self, databasename, outputdir=None, headers=None):
-        jdict = file_to_dict(urlfile)
+        self.databasename = databasename
-        self.urls = [item[0] for item in jdict.items()]
+        self.sql = sqlite3.connect(databasename)
-        self.urls.sort(key=str.lower)
+        self.cur = self.sql.cursor()
        if outputdir is None or outputdir == "":
            # This assumes that all URLs in the database are from the same domain.
            # If they aren't, it's the user's fault.
            self.cur.execute('SELECT url FROM urls LIMIT 1')
            url = self.cur.fetchone()[0]
            # returns (root, path, filename). Keep root.
            outputdir = url_to_filepath(url)[0]
        self.outputdir = outputdir
-        if self.outputdir is None or self.outputdir == "":
+    def download(self, overwrite=False, bytespersecond=None):
            # returns (root, path, filename). Keep root.
            self.outputdir = url_to_filepath(self.urls[0])[0]
    def download(self, overwrite=False):
        overwrite = bool(overwrite)
-        for url in self.urls:
+
        self.cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url')
        while True:
            fetch = self.cur.fetchone()
            if fetch is None:
                break
            url = fetch[SQL_URL]
            ''' Creating the Path '''
-            (root, folder, filename) = url_to_filepath(url)
+            (root, folder, basename) = url_to_filepath(url)
-            # In case the user has set a custom download directory,
+            # Ignore this value of `root`, because we might have a custom outputdir.
            # ignore the above value of `root`.
            root = self.outputdir
            folder = os.path.join(root, folder)
            if not os.path.exists(folder):
                os.makedirs(folder)
-            localname = os.path.join(folder, filename)
+            fullname = os.path.join(folder, basename)
            temporary_basename = hashit(url, 16) + '.oddltemporary'
-            temporary_localname = os.path.join(folder, temporary_basename)
+            temporary_fullname = os.path.join(folder, temporary_basename)
            ''' Managing overwrite '''
-            if os.path.isfile(localname):
+            if os.path.isfile(fullname):
                if overwrite is True:
-                    os.remove(localname)
+                    os.remove(fullname)
                else:
-                    safeprint('Skipping "%s". Use `overwrite=True`' % localname)
+                    safeprint('Skipping "%s". Use `--overwrite`' % fullname)
                    continue
-            safeprint('Downloading "%s" as "%s"' % (localname, temporary_basename))
+            safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename))
-            filehandle = open(temporary_localname, 'wb')
+            filehandle = open(temporary_fullname, 'wb')
            try:
-                download_file(url, filehandle, hookfunction=hook1)
+                download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond)
-                os.rename(temporary_localname, localname)
+                os.rename(temporary_fullname, fullname)
            except:
                filehandle.close()
                raise
 ##                                                                                                ##
 ## DOWNLOADER ######################################################################################
 ## GENERIC #########################################################################################
 ##                                                                                                ##
 class Generic:
    def __init__(self, **kwargs):
        for kwarg in kwargs:
            setattr(self, kwarg, kwargs[kwarg])
 ##                                                                                                ##
 ## GENERIC #########################################################################################
 ## WALKER ##########################################################################################
 ##                                                                                                ##
 class Walker:
-    def __init__(self, website, outputfile, fullscan=False):
+    def __init__(self, walkurl, databasename=None, fullscan=False):
-        self.website = website
+        if walkurl[-1] != '/':
            walkurl += '/'
        self.walkurl = walkurl
        if databasename is None:
            self.domain = url_to_filepath(walkurl)[0]
            databasename = self.domain + '.db'
        self.databasename = databasename
        self.sql = sqlite3.connect(self.databasename)
        self.cur = self.sql.cursor()
        db_init(self.sql, self.cur)
        self.fullscan = bool(fullscan)
-        if os.path.exists(outputfile):
+        self.queue = []
-            self.results = file_to_dict(outputfile)
+        self.seen_directories = set()
        else:
            self.results = {}
        self.already_seen = set()
-    def add_head_to_results(self, head):
+    def smart_insert(self, url=None, head=None, commit=True):
-        if isinstance(head, str):
+        '''
-            # For when we're doing a basic scan, which skips urls that
+        See `smart_insert`.
-            # look like a file.
+        '''
-            self.results[head] = {
+        smart_insert(self.sql, self.cur, url=url, head=head, commit=commit)
                'Content-Length': -1,
                'Content-Type': '?',
            }
            self.already_seen.add(head)
        else:
            # For when we're doing a full scan, which does a HEAD request
            # for all urls.
            self.results[head.url] = {
                'Content-Length': int(head.headers.get('Content-Length', -1)),
                'Content-Type': head.headers.get('Content-Type', '?'),
            }
            self.already_seen.add(head.url)
-    def extract_hrefs(self, response):
+    def extract_hrefs(self, response, tag='a', attribute='href'):
        '''
        Given a Response object, extract href urls.
        External links, index sort links, and desktop.ini are discarded.
        '''
        import bs4
        soup = bs4.BeautifulSoup(response.text)
-        elements = soup.findAll('a')
+        elements = soup.findAll(tag)
        hrefs = []
        for element in elements:
            try:
-                href = element['href']
+                href = element[attribute]
            except KeyError:
                continue
            href = urllib.parse.urljoin(response.url, href)
-            if not href.startswith(self.website):
+            if not href.startswith(self.walkurl):
-                # Don't go to other sites or parent directories
+                # Don't go to other sites or parent directories.
                continue
            if 'C=' in href and 'O=' in href:
-                # Alternative sort modes for index pages
+                # Alternative sort modes for index pages.
                continue
            if href.endswith('desktop.ini'):
-                # I hate these things
+                # I hate these things.
                continue
-            hrefs.append(href)
+            yield href
        return hrefs
-    def walk(self, url=None):
+    def process_url(self, url=None):
        '''
        Given a URL, check whether it is an index page or an actual file.
        If it is an index page, it's links are extracted and queued.
        If it is a file, its information is saved to the database.
        We perform a 
        HEAD:
            when `self.fullscan` is True.
            when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE.
            when the url is an index page.
        GET:
            when the url is a index page.
        '''
        if url is None:
-            url = self.website
+            url = self.walkurl
        else:
-            url = urllib.parse.urljoin(self.website, url)
+            url = urllib.parse.urljoin(self.walkurl, url)
-        results = []
+        if not url.startswith(self.walkurl):
            # Don't follow external links or parent directory.
            print('Skipping "%s" due to external url.' % url)
            return
        urll = url.lower()
-        if self.fullscan is False and any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES):
+        if self.fullscan is False:
-            print('Skipping "%s" due to extension' % url)
+            skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
-            self.add_head_to_results(url)
+            if skippable:
-            return results
+                safeprint('Skipping "%s" due to extension.' % url)
                self.smart_insert(url=url)
                return
            self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
            skippable = self.cur.fetchone() is not None
            if skippable:
                safeprint('Skipping "%s" since we already have it.' % url)
                return
-        if not url.startswith(self.website):
+        try:
-            # Don't follow external links or parent directory.
+            head = do_head(url)
-            return results
+        except requests.exceptions.HTTPError as e:
-
+            if e.response.status_code == 403:
-        head = requests.head(url)
+                print('403 FORBIDDEN!')
-        head.raise_for_status()
+                return
-
+            if e.response.status_code == 404:
-        safeprint('HEAD: %s : %s' % (url, head))
+                print('404 NOT FOUND!')
                return
            raise
        content_type = head.headers.get('Content-Type', '?')
        self.already_seen.add(head.url)
        if content_type.startswith('text/html') and head.url.endswith('/'):
-            # This is an index page, let's get recursive.
+            # This is an index page, so extract links and queue them.
-            page = requests.get(url)
+            response = do_get(url)
-            safeprint(' GET: %s : %s' % (url, page))
+            hrefs = self.extract_hrefs(response)
-            hrefs = self.extract_hrefs(page)
+            self.seen_directories.add(head.url)
-            for url in hrefs:
+            added = 0
-                if url not in self.results and url not in self.already_seen:
+            for href in hrefs:
-                    results += self.walk(url)
+                if href in self.seen_directories:
                    continue
                else:
                    self.queue.append(href)
                    added += 1
            print('Queued %d urls' % added)
        else:
-            # Don't add index pages to the results.
+            # This is not an index page, so save it.
-            self.add_head_to_results(head)
+            self.smart_insert(head=head)
-        return results
+    def walk(self, url=None):
        self.queue.append(url)
        while len(self.queue) > 0:
            url = self.queue.pop(0)
            self.process_url(url)
 ##                                                                                                ##
 ## WALKER ##########################################################################################
 ## GENERAL FUNCTIONS ###############################################################################
 ##                                                                                                ##
 def bytes_to_unit_string(bytes):
    size_unit = 1
    for unit in SIZE_UNITS:
        if bytes >= unit:
            size_unit = unit
            break
    size_unit_string = UNIT_STRINGS[size_unit]
    size_string = '%.3f %s' % ((bytes / size_unit), size_unit_string)
    return size_string
 def db_init(sql, cur):
    lines = DB_INIT.split(';')
    for line in lines:
        cur.execute(line)
    sql.commit()
    return True
 def dict_to_file(jdict, filename):
-    filehandle = open(filename, 'wb')
+    text = dict_to_string(jdict)
    text = json.dumps(jdict, indent=4, sort_keys=True)
    text = text.encode('utf-8')
    filehandle = open(filename, 'wb')
    filehandle.write(text)
    filehandle.close()
-def download_file(url, filehandle, getsizeheaders=True, hookfunction=None, headers={}, auth=None):
+def do_get(url):
-    if getsizeheaders:
+    return do_request('GET', requests.get, url)
-        totalsize = requests.head(url, headers=headers, auth=auth)
+
-        totalsize = int(totalsize.headers['content-length'])
+def do_head(url):
    return do_request('HEAD', requests.head, url)
 def do_request(message, method, url):
    import sys
    message = '{message:>4s}: {url} : '.format(message=message, url=url)
    safeprint(message, end='')
    sys.stdout.flush()
    response = method(url)
    safeprint(response)
    response.raise_for_status()
    return response
 def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None):
    if bytespersecond is not None:
        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)
    else:
-        totalsize = 1
+        limiter = None
    currentblock = 0
-    downloading = requests.get(url, stream=True, headers=headers, auth=auth)
+    downloading = requests.get(url, stream=True, headers=headers)
    totalsize = int(downloading.headers.get('content-length', 1))
    for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
-        if chunk:
+        if not chunk:
-            currentblock += 1
+            break
-            filehandle.write(chunk)
+        currentblock += 1
-            if hookfunction is not None:
+        filehandle.write(chunk)
-                hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
+        if limiter is not None:
            limiter.limit(len(chunk))
        if hookfunction is not None:
            hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)
    filehandle.close()
    size = os.path.getsize(filehandle.name)
    if size < totalsize:
        raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
    return True
-def file_to_dict(filename):
+def filepath_sanitize(text, allowed=''):
-    filehandle = open(filename, 'rb')
+    bet = FILENAME_BADCHARS.replace(allowed, '')
    jdict = json.loads(filehandle.read().decode('utf-8'))
    filehandle.close()
    return jdict
 def filepath_sanitize(text, exclusions=''):
    bet = FILENAME_BADCHARS.replace(exclusions, '')
    for char in bet:
        text = text.replace(char, '')
    return text
 def get_clipboard():
    import tkinter
    t = tkinter.Tk()
    clip = t.clipboard_get()
    t.destroy()
    return clip
 def hashit(text, length=None):
    import hashlib
    h = hashlib.sha512(text.encode('utf-8')).hexdigest()
    if length is not None:
        h = h[:length]
@ -230,12 +444,66 @@ def hook1(currentblock, chunksize, totalsize):
    if currentbytes == totalsize:
        print()
 def listget(l, index, default=None):
    try:
        return l[index]
    except IndexError:
        return default
 def longest_length(li):
    longest = 0
    for item in li:
        longest = max(longest, len(item))
    return longest
 def safeprint(text, **kwargs):
    text = str(text)
    text = text.encode('ascii', 'replace').decode()
    text = text.replace('?', '_')
    print(text, **kwargs)
 def smart_insert(sql, cur, url=None, head=None, commit=True):
    '''
    INSERT or UPDATE the appropriate entry.
    '''
    if bool(url) is bool(head):
        raise ValueError('One and only one of `url` or `head` is necessary.')
    if url is not None:
        # When doing a basic scan, all we get is the URL.
        content_length = None
        content_type = None
    elif head is not None:
        # When doing a full scan, we get a Response object.
        url = head.url
        content_length = head.headers.get('Content-Length', None)
        if content_length is not None:
            content_length = int(content_length)
        content_type = head.headers.get('Content-Type', None)
    basename = url_to_filepath(url)[2]
    basename = urllib.parse.unquote(basename)
    do_download = True
    cur.execute('SELECT * FROM urls WHERE url == ?', [url])
    existing_entry = cur.fetchone()
    is_new = existing_entry is None
    data = (url, basename, content_length, content_type, do_download)
    if is_new:
        cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
    else:
        command = '''
            UPDATE urls SET
            content_length = coalesce(?, content_length),
            content_type = coalesce(?, content_type)
            WHERE url == ?
        '''
        cur.execute(command, [content_length, content_type, url])
    if commit:
        sql.commit()
    return data
 def url_to_filepath(text):
    text = urllib.parse.unquote(text)
    parts = urllib.parse.urlsplit(text)
@ -244,89 +512,230 @@ def url_to_filepath(text):
    while folder.startswith('/'):
        folder = folder[1:]
-    # Folders are allowed to have slashes
+    # Folders are allowed to have slashes...
-    folder = filepath_sanitize(folder, exclusions='/\\')
+    folder = filepath_sanitize(folder, allowed='/\\')
    folder = folder.replace('\\', os.path.sep)
    folder = folder.replace('/', os.path.sep)
-    # But Files are not.
+    # ...but Files are not.
    filename = filepath_sanitize(filename)
    return (root, folder, filename)
 ##                                                                                                ##
 ## GENERAL FUNCTIONS ###############################################################################
 ## Commandline functions ####################################################\\
 def digest(website, outputfile, fullscan, *trash):
    fullscan = bool(fullscan)
    if website[-1] != '/':
        website += '/'
    walker = Walker(website, outputfile, fullscan=fullscan)
    try:
        walker.walk()
        dict_to_file(walker.results, outputfile)
    except:
        dict_to_file(walker.results, outputfile)
        traceback.print_exc()
        print('SAVED PROGRESS SO FAR')
-def download(urlfile, outputdir, overwrite, *trash):
+## COMMANDLINE FUNCTIONS ###########################################################################
-    downloader = Downloader(urlfile, outputdir)
+##                                                                                                ##
-    downloader.download(overwrite)
+def digest(args):
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        fullscan = bool(eval(fullscan))
    walkurl = args.walkurl
    if walkurl == '!clipboard':
        walkurl = get_clipboard()
        safeprint('From clipboard: %s' % walkurl)
    walker = Walker(
        databasename=args.databasename,
        fullscan=fullscan,
        walkurl=walkurl,
        )
    walker.walk()
-def filter_pattern(urlfile, patterns, negative=False, *trash):
+def download(args):
    bytespersecond = args.bytespersecond
    if isinstance(bytespersecond, str):
        bytespersecond = eval(bytespersecond)
    downloader = Downloader(
        databasename=args.databasename,
        outputdir=args.outputdir,
        )
    downloader.download(
        bytespersecond=bytespersecond,
        overwrite=args.overwrite,
        )
 def filter_pattern(databasename, regex, action='keep', *trash):
    '''
-    When `negative` is True, items are kept when they do NOT match the pattern,
+    When `action` is 'keep', then any URLs matching the regex will have their
-    allowing you to delete trash files.
+    `do_download` flag set to True.
-    When `negative` is False, items are keep when they DO match the pattern,
+    When `action` is 'remove', then any URLs matching the regex will have their
-    allowing you to keep items of interest.
+    `do_download` flag set to False.
    Actions will not act on each other's behalf. A 'keep' will NEVER disable a url,
    and 'remove' will NEVER enable one.
    '''
-    if isinstance(patterns, str):
+    import re
-        patterns = [patterns]
+    if isinstance(regex, str):
-    jdict = file_to_dict(urlfile)
+        regex = [regex]
    keys = list(jdict.keys())
    for key in keys:
        for pattern in patterns:
            contains = re.search(pattern, key) is not None
            if contains ^ negative:
                safeprint('Removing "%s"' % key)
                del jdict[key]
    dict_to_file(jdict, urlfile)
-def keep_pattern(urlfile, patterns, *trash):
+    keep = action == 'keep'
-    filter_pattern(urlfile=urlfile, patterns=patterns, negative=True)
+    remove = action == 'remove'
-def measure(urlfile, *trash):
+    sql = sqlite3.connect(databasename)
-    jdict = file_to_dict(urlfile)
+    cur = sql.cursor()
-    totalbytes = 0
+    cur2 = sql.cursor()
    for (url, info) in jdict.items():
        bytes = info['Content-Length']
        if bytes > 0:
            totalbytes += bytes
    bytestring = '{:,}'.format(totalbytes)
    print(bytestring)
    return totalbytes
-def remove_pattern(urlfile, patterns, *trash):
+    cur2.execute('SELECT * FROM urls')
-    filter_pattern(urlfile=urlfile, patterns=patterns, negative=False)
+    while True:
-
+        fetch = cur2.fetchone()
-def listget(l, index, default=None):
+        if fetch is None:
    try:
        return l[index]
    except IndexError:
        return default
 cmdfunctions = [digest, download, keep_pattern, measure, remove_pattern]
 ## End of commandline functions #############################################//
 if __name__ == '__main__':
    command = listget(sys.argv, 1, None)
    arg1 = listget(sys.argv, 2, None)
    arg2 = listget(sys.argv, 3, None)
    arg3 = listget(sys.argv, 4, None)
    if command is None:
        quit()
    did_something = False
    for function in cmdfunctions:
        if command == function.__name__:
            function(arg1, arg2, arg3)
            did_something = True
            break
-    if not did_something:
+        url = fetch[SQL_URL]
-        print('No matching function')
+        current_do_dl = fetch[SQL_DO_DOWNLOAD]
        for pattern in regex:
            contains = re.search(pattern, url) is not None
            should_keep = (keep and contains)
            if keep and contains and not current_do_dl:
                safeprint('Keeping "%s"' % url)
                cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
            if remove and contains and current_do_dl:
                safeprint('Removing "%s"' % url)
                cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
    sql.commit()
 def keep_pattern(args):
    '''
    See `filter_pattern`.
    '''
    filter_pattern(
        action='keep',
        databasename=args.databasename,
        regex=args.regex,
        )
 def list_basenames(args):
    '''
    Given a database, print the entries in order of the file basenames.
    This makes it easier to find interesting titles without worrying about
    what directory they're in.
    '''
    databasename = args.databasename
    outputfile = args.outputfile
    sql = sqlite3.connect(databasename)
    cur = sql.cursor()
    cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1')
    longest = len(cur.fetchone()[0])
    cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY basename')
    form = '{bn:<%ds}  :  {url}' % longest
    if outputfile:
        outputfile = open(outputfile, 'w', encoding='utf-8')
    while True:
        fetch = cur.fetchone()
        if fetch is None:
            break
        line = form.format(bn=fetch[SQL_BASENAME], url=fetch[SQL_URL])
        if outputfile:
            outputfile.write(line + '\n')
        else:
            print(line)
    if outputfile:
        outputfile.close()
 def measure(args):
    '''
    Given a database, print the sum of all Content-Lengths.
    If `fullscan`, then URLs with no Content-Length will be
    HEAD requested, and the result will be saved back into the file.
    '''
    databasename = args.databasename
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        fullscan = bool(fullscan)
    totalsize = 0
    sql = sqlite3.connect(databasename)
    cur1 = sql.cursor()
    cur2 = sql.cursor()
    cur2.execute('SELECT * FROM urls WHERE do_download == 1')
    filecount = 0
    files_without_size = 0
    try:
        while True:
            fetch = cur2.fetchone()
            if fetch is None:
                break
            size = fetch[SQL_CONTENT_LENGTH]
            if size is None:
                if fullscan:
                    url = fetch[SQL_URL]
                    head = do_head(url)
                    fetch = smart_insert(sql, cur1, head=head, commit=False)
                    size = fetch[SQL_CONTENT_LENGTH]
                    if size is None:
                        safeprint('"%s" is not revealing Content-Length' % url)
                        size = 0
                else:
                    files_without_size += 1
                    size = 0
            totalsize += size
            filecount += 1
    except:
        sql.commit()
        raise
    sql.commit()
    short_string = bytes_to_unit_string(totalsize)
    totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
    print(totalsize_string)
    if files_without_size > 0:
        print('Note: %d files do not have a stored Content-Length.' % files_without_size)
        print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
    return totalsize
 def remove_pattern(args):
    '''
    See `filter_pattern`.
    '''
    filter_pattern(
        action='remove',
        databasename=args.databasename,
        regex=args.regex,
        )
 ##                                                                                                ##
 ## COMMANDLINE FUNCTIONS ###########################################################################
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()
    p_digest = subparsers.add_parser('digest')
    p_digest.add_argument('walkurl')
    p_digest.add_argument('-db', '--database', dest='databasename', default=None)
    p_digest.add_argument('-f', '--fullscan', action='store_true')
    p_digest.set_defaults(func=digest)
    p_download = subparsers.add_parser('download')
    p_download.add_argument('databasename')
    p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
    p_download.add_argument('-ow', '--overwrite', dest='overwrite', default=False)
    p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
    p_download.set_defaults(func=download)
    p_keep_pattern = subparsers.add_parser('keep_pattern')
    p_keep_pattern.add_argument('databasename')
    p_keep_pattern.add_argument('regex')
    p_keep_pattern.set_defaults(func=keep_pattern)
    p_list_basenames = subparsers.add_parser('list_basenames')
    p_list_basenames.add_argument('databasename')
    p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
    p_list_basenames.set_defaults(func=list_basenames)
    p_measure = subparsers.add_parser('measure')
    p_measure.add_argument('databasename')
    p_measure.add_argument('-f', '--fullscan', action='store_true')
    p_measure.set_defaults(func=measure)
    p_remove_pattern = subparsers.add_parser('remove_pattern')
    p_remove_pattern.add_argument('databasename')
    p_remove_pattern.add_argument('regex')
    p_remove_pattern.set_defaults(func=remove_pattern)
    args = parser.parse_args()
    args.func(args)
--- a/OpenDirDL/ratelimiter.py
+++ b/OpenDirDL/ratelimiter.py
@ -0,0 +1,56 @@
 import time
 class Ratelimiter:
    def __init__(self, allowance_per_period, period, operation_cost=1, mode='sleep'):
        '''
        allowance_per_period:
            The number of operations we can perform per `period` seconds.
        period:
            The number of seconds over which we can perform `allowance_per_period` operations.
        operation_cost:
            The default amount to remove from our balance after each operation.
            Pass a `cost` parameter to `self.limit` to use a nondefault value.
        mode:
            'sleep':  If we do not have the balance for an operation, sleep until we do.
                      Return True every time.
            'reject': If we do not have the balance for an operation, return False.
        '''
        if mode not in ('sleep', 'reject'):
            raise ValueError('Invalid mode %s' % repr(mode))
        self.allowance_per_period = allowance_per_period
        self.period = period
        self.operation_cost = operation_cost
        self.mode = mode
        self.last_operation = time.time()
        self.balance = 0
        self.gain_rate = allowance_per_period / period
    def limit(self, cost=None):
        if cost is None:
            cost = self.operation_cost
        timediff = time.time() - self.last_operation
        self.balance += timediff * self.gain_rate
        self.balance = min(self.balance, self.allowance_per_period)
        successful = False
        deficit = cost - self.balance
        if deficit > 0 and self.mode == 'sleep':
            time_needed = (deficit / self.gain_rate)
            #print(self.balance, deficit, 'Need to sleep %f' % time_needed)
            time.sleep(time_needed)
            self.balance = cost
        #print(self.balance)
        if self.balance >= cost:
            #print('pass')
            self.balance -= cost
            successful = True
        self.last_operation = time.time()
        return successful
--- a/Steganographic/steganographic.py
+++ b/Steganographic/steganographic.py
@ -99,15 +99,24 @@ class BitsToImage:
 class ImageToBits:
    def __init__(self, image, bitness):
        self.image = image
        self.bitness = bitness
        self.width = image.size[0]
        self.height = image.size[1]
        self.pixel_index = -1
        self.bit_index = bitness
        self.active_byte = []
        self.pixels = self.image.getdata()
        #self.bits = ''
        #for pixel in self.pixels:
        #    for channel in pixel:
        #        self.bits += binary(channel)[-bitness:]
        #print(len(self.bits))
    def _read(self):
        if len(self.active_byte) == 0:
            self.pixel_index += 1
-            (x, y) = index_to_xy(self.pixel_index, self.width)
+            self.active_byte = self.pixels[self.pixel_index]
            self.active_byte = list(self.image.getpixel((x, y)))
            self.active_byte = self.active_byte[:3]
            self.active_byte = [binary(channel) for channel in self.active_byte]
            self.active_byte = [channel[-bitness:] for channel in self.active_byte]
@ -115,6 +124,7 @@ class ImageToBits:
            self.active_byte = list(self.active_byte)
        ret = self.active_byte.pop(0)
        self.bit_index += 1
        return ret
    def read(self, bits=1):
@ -196,24 +206,14 @@ def encode(imagefilename, secretfilename, bitness=1):
    secret_content_length = (secret_size) + (len(secret_extension)) + 1
    requiredpixels = math.ceil(((secret_content_length * 8) + 32) / (3 * bitness))
    if totalpixels < requiredpixels:
-        raise StegError('Image does not have enough pixels to store the Secret'
+        raise StegError('Image does not have enough pixels to store the Secret. '
                        'Must have at least %d pixels' % requiredpixels)
    print('%d pixels available, %d required' % (totalpixels, requiredpixels))
    # --> YOU ARE HERE <--
    # Because bitness may be between 1 and 8, we need to create a writing buffer
    # called `binary_write_buffer`, so that we're always writing the same amount
    # of data per color channel.
    # If we were to write the secret length / extension on the fly, we might end
    # up using the wrong number of bits for the final channel of some pixel.
    # Example: 10010101 broken into groups of 3 is [100, 101, 01]
    # Note that the last group is not the same size as the desired bitness, and
    # will cause decode errors.
    pixel = list(image.getpixel((0, 0)))
    binary_write_buffer = ''
    # Write secret length
    secret_content_length_b = binary(secret_content_length).rjust(32, '0')