# voussoir ''' OpenDirDL downloads open directories The basics: 1. Create a database of the directory's files with > opendirdl digest http://website.com/directory/ 2. Enable and disable the files you are interested in with > opendirdl remove_pattern ".*" > opendirdl keep_pattern "Daft%20Punk" > opendirdl remove_pattern "folder\.jpg" Note the percent-encoded string. 3. Download the enabled files with > opendirdl download website.com.db The specifics: digest: Recursively fetch directories and build a database of file URLs. > opendirdl digest http://website.com/directory/ > opendirdl digest !clipboard flags: -f | --fullscan: When included, perform HEAD requests on all files, to know the size of the entire directory. -db "x.db" | --databasename "x.db": Use a custom database filename. By default, databases are named after the web domain. download: Download the files whose URLs are Enabled in the database. > opendirdl download website.com.db flags: -o "x" | --outputdir "x": Save the files to a custom directory, "x". By default, files are saved to a folder named after the web domain. -ow | --overwrite: When included, download and overwrite files even if they already exist in the output directory. -bps 100 | --bytespersecond 100: -bps 100k | -bps "100 kb" | -bps 100kib | -bps 1.2m Ratelimit your download speed. Supports units like "k", "m" according to `bytestring.parsebytes`. keep_pattern: Enable URLs which match a regex pattern. Matches are based on the percent- encoded strings! > opendirdl keep_pattern website.com.db ".*" remove_pattern: Disable URLs which match a regex pattern. Matches are based on the percent- encoded strings! > opendirdl remove_pattern website.com.db ".*" list_basenames: List Enabled URLs alphabetized by their base filename. This makes it easier to find titles of interest in a directory that is very scattered or poorly organized. 
> opendirdl list_basenames website.com.db flags: -o "x.txt" | --outputfile "x.txt": Output the results to a file instead of stdout. This is useful if the filenames contain special characters that crash Python, or are so long that the console becomes unreadable. measure: Sum up the filesizes of all Enabled URLs. > opendirdl measure website.com.db flags: -f | --fullscan: When included, perform HEAD requests on all files to update their size. -n | --new_only: When included, perform HEAD requests only on files that haven't gotten one yet. If a file's size is not known by the time this operation completes, you will receive a printed note. tree: Print the file / folder tree. > opendirdl tree website.com.db flags: -o "x.txt" | --outputfile "x.txt": Output the results to a file instead of stdout. This is useful if the filenames contain special characters that crash Python, or are so long that the console becomes unreadable. If the filename ends with ".html", the created page will have collapsible boxes rather than a plaintext diagram. ''' # Module names preceded by `## ` indicate modules that are imported during # a function, because they are not used anywhere else and we don't need to waste # time importing them usually, but I still want them listed here for clarity. import argparse ## import bs4 import collections ## import hashlib import os ## import re import requests import shutil import sqlite3 import sys ## import tkinter import urllib.parse # Please consult my github repo for these files # https://github.com/voussoir/else sys.path.append('C:\\git\\else\\Downloady'); import downloady sys.path.append('C:\\git\\else\\Bytestring'); import bytestring DOWNLOAD_CHUNK = 16 * bytestring.KIBIBYTE FILENAME_BADCHARS = '/\\:*?"<>|' TERMINAL_WIDTH = shutil.get_terminal_size().columns UNKNOWN_SIZE_STRING = '???' # When doing a basic scan, we will not send HEAD requests to URLs that end in # these strings, because they're probably files. 
# This isn't meant to be a comprehensive filetype library, but it covers # enough of the typical opendir to speed things up. SKIPPABLE_FILETYPES = [ '.aac', '.avi', '.bin', '.bmp', '.bz2', '.epub', '.exe', '.db', '.flac', '.gif', '.gz', '.ico', '.iso', '.jpeg', '.jpg', '.m3u', '.m4a', '.m4v', '.mka', '.mkv', '.mov', '.mp3', '.mp4', '.nfo', '.ogg', '.ott', '.pdf', '.png', '.rar', '.srt', '.tar', '.ttf', '.txt', '.wav', '.webm', '.wma', '.zip', ] SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES) # Will be ignored completely. Are case-sensitive BLACKLISTED_FILENAMES = [ 'desktop.ini', 'thumbs.db', ] # oh shit HTML_TREE_HEAD = ''' ''' HTML_FORMAT_DIRECTORY = '''
{directory_anchor}
# NOTE(review): this span originally began with the tail of a definition whose
# start is not visible here. That fragment could not be reconstructed; it is
# preserved verbatim below so whoever restores the missing text can re-attach it.
#     ', output_file) else: # This helps put some space between sibling
#     directories write('| ' * (depth), output_file)


def safeindex(sequence, index, fallback=None):
    '''
    Return `sequence[index]`, or `fallback` if that index is out of range.
    '''
    try:
        return sequence[index]
    except IndexError:
        return fallback


def safeprint(text, **kwargs):
    '''
    Print `text` after replacing every non-ASCII character with "?", so that
    consoles which cannot encode the character do not raise.

    kwargs are passed through to `print`.
    '''
    text = str(text)
    text = text.encode('ascii', 'replace').decode()
    print(text, **kwargs)


def smart_insert(sql, cur, url=None, head=None, commit=True):
    '''
    INSERT or UPDATE the appropriate entry, or DELETE if the head
    shows a 403 / 404.

    Parameters:
        sql: The sqlite3 connection, used only to commit.
        cur: A cursor on that connection.
        url: The URL string, when all we know is the URL (basic scan).
        head: The Response from a HEAD request (full scan). Its url,
            status code, Content-Length and Content-Type are used.
        commit: If True, commit before returning.

    Returns:
        The tuple (url, basename, content_length, content_type, do_download).
        For a deleted entry this is (url, None, 0, None, 0).

    Raises:
        ValueError if neither `url` nor `head` is usable.
    '''
    # A Response for an error status is falsy, so truthiness alone cannot tell
    # "no head given" apart from "head given but it is a 404". The isinstance
    # check lets such a Response through.
    if bool(url) is bool(head) and not isinstance(head, requests.Response):
        raise ValueError('One and only one of `url` or `head` is necessary.')

    if url is not None:
        # When doing a basic scan, all we get is the URL.
        content_length = None
        content_type = None

    elif head is not None:
        # When doing a full scan, we get a Response object.
        url = head.url
        if head.status_code in [403, 404]:
            cur.execute('DELETE FROM urls WHERE url == ?', [url])
            if commit:
                sql.commit()
            return (url, None, 0, None, 0)
        content_length = head.headers.get('Content-Length', None)
        if content_length is not None:
            content_length = int(content_length)
        content_type = head.headers.get('Content-Type', None)

    # url_split already percent-decodes the filename. Decoding it a second time
    # would corrupt names that contain a literal "%xx" sequence.
    basename = url_split(url)['filename']
    do_download = True

    cur.execute('SELECT * FROM urls WHERE url == ?', [url])
    is_new = cur.fetchone() is None

    data = (url, basename, content_length, content_type, do_download)
    if is_new:
        cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
    else:
        # coalesce keeps the stored value whenever the new one is NULL, so a
        # basic scan never erases what a previous full scan learned.
        command = '''
            UPDATE urls SET
            content_length = coalesce(?, content_length),
            content_type = coalesce(?, content_type)
            WHERE url == ?
        '''
        cur.execute(command, [content_length, content_type, url])

    if commit:
        sql.commit()
    return data


def url_split(url):
    '''
    Given a url, return a dictionary of its components.

    Returns:
        A dict with the keys 'scheme', 'domain', 'folder' and 'filename'.
        The folder has no leading slash and uses the local path separator.
        Folder and filename are percent-decoded and sanitized.

    Raises:
        ValueError if the url has no scheme or no network location.
    '''
    # Split BEFORE decoding. Decoding first would turn an encoded "%3F" or
    # "%23" inside a filename into a real "?" or "#", which urlsplit would then
    # treat as the start of the query / fragment and cut the path short.
    parts = urllib.parse.urlsplit(url)
    if any(part == '' for part in [parts.scheme, parts.netloc]):
        raise ValueError('Not a valid URL')
    scheme = parts.scheme
    root = urllib.parse.unquote(parts.netloc)
    path = urllib.parse.unquote(parts.path)

    (folder, filename) = os.path.split(path)
    folder = folder.lstrip('/')

    # Folders are allowed to have slashes...
    folder = filepath_sanitize(folder, allowed='/\\')
    folder = folder.replace('\\', os.path.sep)
    folder = folder.replace('/', os.path.sep)
    # ...but Files are not.
    filename = filepath_sanitize(filename)

    result = {
        'scheme': scheme,
        'domain': root,
        'folder': folder,
        'filename': filename,
    }
    return result


def write(line, file_handle=None, **kwargs):
    '''
    Write `line` followed by a newline to `file_handle`, or print it with
    `safeprint` when no handle is given.

    kwargs are passed through to whichever of the two is used.
    '''
    if file_handle is None:
        safeprint(line, **kwargs)
    else:
        file_handle.write(line + '\n', **kwargs)

##                                                                                                ##
## GENERAL FUNCTIONS ###############################################################################


## COMMANDLINE FUNCTIONS ###########################################################################
##                                                                                                ##
def digest(root_url, databasename=None, fullscan=False):
    '''
    Walk `root_url` with a Walker, which records what it finds in the database.

    If `root_url` is "!clipboard" or "!c", the URL is taken from the clipboard.
    '''
    if root_url in ('!clipboard', '!c'):
        root_url = get_clipboard()
        write('From clipboard: %s' % root_url)
    walker = Walker(
        databasename=databasename,
        fullscan=fullscan,
        root_url=root_url,
    )
    walker.walk()


def digest_argparse(args):
    return digest(
        databasename=args.databasename,
        fullscan=args.fullscan,
        root_url=args.root_url,
    )


def download(
        databasename,
        outputdir=None,
        bytespersecond=None,
        headers=None,
        overwrite=False,
    ):
    '''
    Download all of the Enabled files. The filepaths will match that of the
    website, using `outputdir` as the root directory.

    Parameters:
        outputdir:
            The directory to mirror the files into. If not provided, the domain
            name is used.

        bytespersecond:
            The speed to ratelimit the downloads. Can be an integer, or a string like
            '500k', according to the capabilities of `bytestring.parsebytes`

            Note that this is bytes, not bits.

        headers:
            Additional headers to pass to each `download_file` call.

        overwrite:
            If True, delete local copies of existing files and rewrite them.
            Otherwise, completed files are skipped.
    '''
    sql = sqlite3.connect(databasename)
    try:
        cur = sql.cursor()

        if outputdir in (None, ''):
            # This assumes that all URLs in the database are from the same domain.
            # If they aren't, it's the user's fault because Walkers don't leave the given site
            # on their own.
            cur.execute('SELECT url FROM urls LIMIT 1')
            first_row = cur.fetchone()
            if first_row is None:
                # An empty database has no domain to name the folder after, and
                # nothing to download either.
                write('The database contains no URLs; nothing to download.')
                return
            outputdir = url_split(first_row[0])['domain']

        if isinstance(bytespersecond, str):
            bytespersecond = bytestring.parsebytes(bytespersecond)

        cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url')
        for fetch in fetch_generator(cur):
            url = fetch[SQL_URL]

            url_filepath = url_split(url)
            folder = os.path.join(outputdir, url_filepath['folder'])
            os.makedirs(folder, exist_ok=True)

            fullname = os.path.join(folder, url_filepath['filename'])

            write('Downloading "%s"' % fullname)
            downloady.download_file(
                url,
                localname=fullname,
                bytespersecond=bytespersecond,
                callback_progress=downloady.progress2,
                headers=headers,
                overwrite=overwrite,
            )
    finally:
        sql.close()


def download_argparse(args):
    return download(
        databasename=args.databasename,
        outputdir=args.outputdir,
        overwrite=args.overwrite,
        bytespersecond=args.bytespersecond,
    )


def filter_pattern(databasename, regex, action='keep'):
    '''
    When `action` is 'keep', then any URLs matching the regex will have their
    `do_download` flag set to True.

    When `action` is 'remove', then any URLs matching the regex will have their
    `do_download` flag set to False.

    Actions will not act on each other's behalf. Keep will NEVER disable a url,
    and remove will NEVER enable one.

    Parameters:
        regex: A single pattern, or an iterable of patterns. A URL is affected
            if any one of them is found in it.

    Raises:
        ValueError if `action` is neither 'keep' nor 'remove'.
    '''
    import re
    if action not in ('keep', 'remove'):
        # Any other value used to fall through and silently change nothing.
        raise ValueError('`action` must be "keep" or "remove", not %r' % (action,))
    if isinstance(regex, str):
        regex = [regex]
    # Compile once up front instead of once per URL.
    patterns = [re.compile(pattern) for pattern in regex]

    keep = action == 'keep'

    sql = sqlite3.connect(databasename)
    cur = sql.cursor()

    cur.execute('SELECT * FROM urls')
    items = cur.fetchall()
    for item in items:
        url = item[SQL_URL]
        # One match is enough; this also stops a URL that matches several
        # patterns from being updated and reported several times.
        if not any(pattern.search(url) for pattern in patterns):
            continue

        enabled = item[SQL_DO_DOWNLOAD]
        if keep and not enabled:
            write('Enabling "%s"' % url)
            cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
        elif not keep and enabled:
            write('Disabling "%s"' % url)
            cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
    sql.commit()


def keep_pattern_argparse(args):
    '''
    See `filter_pattern`.
    '''
    return filter_pattern(
        action='keep',
        databasename=args.databasename,
        regex=args.regex,
    )


def list_basenames(databasename, output_filename=None):
    '''
    Print the Enabled entries in order of the file basenames.
    This makes it easier to find interesting titles without worrying about
    what directory they're in.

    If `output_filename` is given, the lines are written to that file (utf-8)
    instead of being printed. With no Enabled entries, nothing is listed.
    '''
    sql = sqlite3.connect(databasename)
    try:
        cur = sql.cursor()
        cur.execute('SELECT * FROM urls WHERE do_download == 1')
        items = cur.fetchall()
    finally:
        sql.close()

    # default=0 keeps an empty listing from raising inside max().
    longest = max((len(item[SQL_BASENAME]) for item in items), default=0)
    items.sort(key=lambda x: x[SQL_BASENAME].lower())

    # Pad every basename to the longest one so the columns line up.
    form = '{basename:<%ds} : {url} : {size}' % longest

    if output_filename is not None:
        output_file = open(output_filename, 'w', encoding='utf-8')
    else:
        output_file = None

    try:
        for item in items:
            size = item[SQL_CONTENT_LENGTH]
            if size is None:
                size = ''
            else:
                size = bytestring.bytestring(size)
            line = form.format(
                basename=item[SQL_BASENAME],
                url=item[SQL_URL],
                size=size,
            )
            write(line, output_file)
    finally:
        if output_file is not None:
            output_file.close()


def list_basenames_argparse(args):
    return list_basenames(
        databasename=args.databasename,
        output_filename=args.outputfile,
    )


def measure(databasename, fullscan=False, new_only=False):
    '''
    Given a database, print the sum of all Content-Lengths.
    URLs will be HEAD requested if:
        `new_only` is True and the file has no stored content length, or
        `fullscan` is True and `new_only` is False

    Note that with `new_only`, only the Enabled rows that have no stored
    content length are selected, so the printed total and file count cover
    those rows only.

    Returns:
        The total size in bytes, as an integer.
    '''
    if isinstance(fullscan, str):
        fullscan = bool(fullscan)

    totalsize = 0
    sql = sqlite3.connect(databasename)
    cur = sql.cursor()

    if new_only:
        cur.execute('SELECT * FROM urls WHERE do_download == 1 AND content_length IS NULL')
    else:
        cur.execute('SELECT * FROM urls WHERE do_download == 1')

    items = cur.fetchall()
    filecount = len(items)
    unmeasured_file_count = 0

    for fetch in items:
        size = fetch[SQL_CONTENT_LENGTH]

        if fullscan or new_only:
            url = fetch[SQL_URL]
            head = do_head(url, raise_for_status=False)
            fetch = smart_insert(sql, cur, head=head, commit=True)
            size = fetch[SQL_CONTENT_LENGTH]
            if size is None:
                # Unmeasured even though we tried the head request.
                write('"%s" is not revealing Content-Length' % url)
                size = 0

        elif size is None:
            # Unmeasured and no intention to measure.
            unmeasured_file_count += 1
            size = 0

        totalsize += size

    sql.commit()
    size_string = bytestring.bytestring(totalsize)
    totalsize_string = '{size_short} ({size_exact:,} bytes) in {filecount:,} files'
    totalsize_string = totalsize_string.format(
        size_short=size_string,
        size_exact=totalsize,
        filecount=filecount,
    )
    write(totalsize_string)
    if unmeasured_file_count > 0:
        write(UNMEASURED_WARNING % unmeasured_file_count)
    return totalsize


def measure_argparse(args):
    return measure(
        databasename=args.databasename,
        fullscan=args.fullscan,
        new_only=args.new_only,
    )


def remove_pattern_argparse(args):
    '''
    See `filter_pattern`.
    '''
    return filter_pattern(
        action='remove',
        databasename=args.databasename,
        regex=args.regex,
    )


def tree(databasename, output_filename=None):
    '''
    Print a tree diagram of the directory-file structure.

    If an .html file is given for `output_filename`, the page will have
    collapsible boxes and clickable filenames. Otherwise the file will just
    be a plain text drawing.

    Returns:
        The root node produced by `build_file_tree`.
    '''
    tree_root = build_file_tree(databasename)

    if output_filename is not None:
        output_file = open(output_filename, 'w', encoding='utf-8')
        use_html = output_filename.lower().endswith('.html')
    else:
        output_file = None
        use_html = False

    try:
        # NOTE(review): the '\n' and '' literals written around the tree in
        # html mode look like they may once have held wrapper markup. They are
        # reproduced exactly as found; confirm against the upstream file.
        if use_html:
            write('\n', output_file)
            write(HTML_TREE_HEAD, output_file)
            write('', output_file)

        size_details = recursive_get_size(tree_root)
        recursive_print_node(tree_root, use_html=use_html, output_file=output_file)
        if size_details['unmeasured'] > 0:
            write(UNMEASURED_WARNING % size_details['unmeasured'], output_file)

        if use_html:
            write('\n', output_file)
    finally:
        # Close the file even when building or printing the tree fails.
        if output_file is not None:
            output_file.close()
    return tree_root


def tree_argparse(args):
    return tree(
        databasename=args.databasename,
        output_filename=args.outputfile,
    )

##                                                                                                ##
## COMMANDLINE FUNCTIONS ###########################################################################


def main(argv):
    '''
    Run the command line interface.

    `argv` is the argument list without the program name. The module
    docstring is printed as help when fewer than two arguments are given, or
    when either of the first two arguments is "help", "-h" or "--help".
    Otherwise the arguments are parsed and the chosen subcommand is run.
    '''
    help_words = ('help', '-h', '--help')
    first = safeindex(argv, 0, '').lower()
    second = safeindex(argv, 1, '').lower()
    # `second == ''` means fewer than two arguments were given. No subcommand
    # can run without at least one operand, so show the help instead.
    if first in help_words or second in help_words or second == '':
        write(__doc__)
        return

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    p_digest = subparsers.add_parser('digest')
    p_digest.add_argument('root_url')
    # The usage text documents this flag as --databasename, so both spellings
    # are accepted.
    p_digest.add_argument(
        '-db', '--database', '--databasename',
        dest='databasename',
        default=None,
    )
    p_digest.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
    p_digest.set_defaults(func=digest_argparse)

    p_download = subparsers.add_parser('download')
    p_download.add_argument('databasename')
    p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
    p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
    p_download.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
    p_download.set_defaults(func=download_argparse)

    p_keep_pattern = subparsers.add_parser('keep_pattern')
    p_keep_pattern.add_argument('databasename')
    p_keep_pattern.add_argument('regex')
    p_keep_pattern.set_defaults(func=keep_pattern_argparse)

    p_list_basenames = subparsers.add_parser('list_basenames')
    p_list_basenames.add_argument('databasename')
    p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
    p_list_basenames.set_defaults(func=list_basenames_argparse)

    p_measure = subparsers.add_parser('measure')
    p_measure.add_argument('databasename')
    p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
    p_measure.add_argument('-n', '--new_only', dest='new_only', action='store_true')
    p_measure.set_defaults(func=measure_argparse)

    p_remove_pattern = subparsers.add_parser('remove_pattern')
    p_remove_pattern.add_argument('databasename')
    p_remove_pattern.add_argument('regex')
    p_remove_pattern.set_defaults(func=remove_pattern_argparse)

    p_tree = subparsers.add_parser('tree')
    p_tree.add_argument('databasename')
    p_tree.add_argument('-o', '--outputfile', dest='outputfile', default=None)
    p_tree.set_defaults(func=tree_argparse)

    args = parser.parse_args(argv)
    args.func(args)


if __name__ == '__main__':
    main(sys.argv[1:])