'''
OpenDirDL
downloads open directories

Usage:

DIGEST:
    Recursively fetch directories and build a database of file URLs.

    > opendirdl digest !clipboard
    > opendirdl digest http://website.com/directory/

    flags:
    -f | --fullscan:
        When included, perform HEAD requests on all files, to know the size of
        the entire directory.

    -db "x.db" | --database "x.db":
        Use a custom database filename. By default, databases are named after
        the web domain.

DOWNLOAD:
    Download the files whose URLs are enabled in the database.

    > opendirdl download website.com.db

    flags:
    -o "x" | --outputdir "x":
        Save the files to a custom directory, "x". By default, files are saved
        to a folder named after the web domain.

    -ow | --overwrite:
        When included, download and overwrite files even if they already exist
        in the output directory.

    -bps 100 | --bytespersecond 100:
        Ratelimit yourself to downloading at 100 BYTES per second. The webmaster
        will appreciate this.

KEEP_PATTERN:
    Enable URLs which match a regex pattern. Matches are based on the
    percent-encoded strings!

    > opendirdl keep_pattern website.com.db ".*"

REMOVE_PATTERN:
    Disable URLs which match a regex pattern. Matches are based on the
    percent-encoded strings!

    > opendirdl remove_pattern website.com.db ".*"

LIST_BASENAMES:
    List enabled URLs in order of their base filename. This makes it easier to
    find titles of interest in a directory that is very scattered or poorly
    organized.

    > opendirdl list_basenames website.com.db

    flags:
    -o "x.txt" | --outputfile "x.txt":
        Output the results to a file instead of stdout. This is useful if the
        filenames contain special characters that crash Python, or are so long
        that the console becomes unreadable.

MEASURE:
    Sum up the filesizes of all enabled URLs.

    > opendirdl measure website.com.db

    flags:
    -f | --fullscan:
        When included, perform HEAD requests on any URL whose size is not known.
        If this flag is not included, and some file's size is unknown, you will
        receive a note.
'''

# Module names preceded by two hashes indicate modules that are imported inside
# a function, because they are not used anywhere else and we don't need to waste
# time importing them otherwise.
import argparse
## import bs4
## import hashlib
import os
import ratelimiter
## import re
import requests
import sqlite3
## import sys
## import tkinter
import urllib.parse

FILENAME_BADCHARS = '/\\:*?"<>|'

# When doing a basic scan, we will not send HEAD requests to URLs that end in these strings,
# because they're probably files.
# This isn't meant to be a comprehensive filetype library, but it covers enough of the
# typical opendir to speed things up.
SKIPPABLE_FILETYPES = [
    '.avi',
    '.bmp',
    '.epub',
    '.db',
    '.flac',
    '.gif',
    '.gz',
    '.ico',
    '.iso',
    '.jpeg',
    '.jpg',
    '.m3u',
    '.m4a',
    '.mkv',
    '.mov',
    '.mp3',
    '.mp4',
    '.nfo',
    '.ogg',
    '.pdf',
    '.png',
    '.srt',
    '.tar',
    '.txt',
    '.webm',
    '.zip',
]
SKIPPABLE_FILETYPES = set(x.lower() for x in SKIPPABLE_FILETYPES)
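
# A quick illustration of how this set is used (see Walker.process_url below); the example
# URLs are placeholders. The check is a case-insensitive endswith() on the URL, so
# "http://website.com/dir/Example.MKV" would be recorded without a HEAD request, while
# "http://website.com/dir/" would still be fetched and parsed for links.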

BYTE = 1
KIBIBYTE = 1024 * BYTE
MIBIBYTE = 1024 * KIBIBYTE
GIBIBYTE = 1024 * MIBIBYTE
TEBIBYTE = 1024 * GIBIBYTE
SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)

UNIT_STRINGS = {
    BYTE: 'b',
    KIBIBYTE: 'KiB',
    MIBIBYTE: 'MiB',
    GIBIBYTE: 'GiB',
    TEBIBYTE: 'TiB',
}

DOWNLOAD_CHUNK = 2 * KIBIBYTE

DB_INIT = '''
CREATE TABLE IF NOT EXISTS urls(
    url TEXT,
    basename TEXT,
    content_length INT,
    content_type TEXT,
    do_download INT
);
CREATE INDEX IF NOT EXISTS urlindex on urls(url);
CREATE INDEX IF NOT EXISTS baseindex on urls(basename);
CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length);
'''.strip()

SQL_URL = 0
SQL_BASENAME = 1
SQL_CONTENT_LENGTH = 2
SQL_CONTENT_TYPE = 3
SQL_DO_DOWNLOAD = 4


## DOWNLOADER ######################################################################################
##
##
class Downloader:
    def __init__(self, databasename, outputdir=None, headers=None):
        self.databasename = databasename
        self.sql = sqlite3.connect(databasename)
        self.cur = self.sql.cursor()

        if outputdir is None or outputdir == "":
            # This assumes that all URLs in the database are from the same domain.
            # If they aren't, it's the user's fault.
            self.cur.execute('SELECT url FROM urls LIMIT 1')
            url = self.cur.fetchone()[0]
            # url_to_filepath returns (root, folder, filename). Keep the root.
            outputdir = url_to_filepath(url)[0]
        self.outputdir = outputdir

    def download(self, overwrite=False, bytespersecond=None):
        overwrite = bool(overwrite)
        self.cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY url')
        while True:
            fetch = self.cur.fetchone()
            if fetch is None:
                break
            url = fetch[SQL_URL]

            ''' Creating the Path '''
            (root, folder, basename) = url_to_filepath(url)
            # Ignore this value of `root`, because we might have a custom outputdir.
            root = self.outputdir
            folder = os.path.join(root, folder)
            if not os.path.exists(folder):
                os.makedirs(folder)
            fullname = os.path.join(folder, basename)
            temporary_basename = hashit(url, 16) + '.oddltemporary'
            temporary_fullname = os.path.join(folder, temporary_basename)

            ''' Managing overwrite '''
            if os.path.isfile(fullname):
                if overwrite is True:
                    os.remove(fullname)
                else:
                    safeprint('Skipping "%s". Use `--overwrite`' % fullname)
                    continue

            safeprint('Downloading "%s" as "%s"' % (fullname, temporary_basename))
            filehandle = open(temporary_fullname, 'wb')
            try:
                download_file(url, filehandle, hookfunction=hook1, bytespersecond=bytespersecond)
                os.rename(temporary_fullname, fullname)
            except:
                filehandle.close()
                raise
##
##
## DOWNLOADER ######################################################################################
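
# A minimal usage sketch for the Downloader class (not part of the original file);
# 'website.com.db' is a placeholder for a database created by a previous `digest` run:
#
#   Downloader('website.com.db').download(overwrite=False, bytespersecond=100 * KIBIBYTE)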

## GENERIC #########################################################################################
##
##
class Generic:
    def __init__(self, **kwargs):
        for kwarg in kwargs:
            setattr(self, kwarg, kwargs[kwarg])
##
##
## GENERIC #########################################################################################


## WALKER ##########################################################################################
##
##
class Walker:
    def __init__(self, walkurl, databasename=None, fullscan=False):
        if walkurl[-1] != '/':
            walkurl += '/'
        self.walkurl = walkurl
        if databasename is None:
            self.domain = url_to_filepath(walkurl)[0]
            databasename = self.domain + '.db'
        self.databasename = databasename
        self.sql = sqlite3.connect(self.databasename)
        self.cur = self.sql.cursor()
        db_init(self.sql, self.cur)

        self.fullscan = bool(fullscan)
        self.queue = []
        self.seen_directories = set()

    def smart_insert(self, url=None, head=None, commit=True):
        '''
        See the module-level `smart_insert`.
        '''
        smart_insert(self.sql, self.cur, url=url, head=head, commit=commit)

    def extract_hrefs(self, response, tag='a', attribute='href'):
        '''
        Given a Response object, extract href urls.
        External links, index sort links, and desktop.ini are discarded.
        '''
        import bs4
        soup = bs4.BeautifulSoup(response.text, 'html.parser')
        elements = soup.findAll(tag)
        for element in elements:
            try:
                href = element[attribute]
            except KeyError:
                continue
            href = urllib.parse.urljoin(response.url, href)
            if not href.startswith(self.walkurl):
                # Don't go to other sites or parent directories.
                continue
            if 'C=' in href and 'O=' in href:
                # Alternative sort modes for index pages.
                continue
            if href.endswith('desktop.ini'):
                # I hate these things.
                continue
            yield href

    def process_url(self, url=None):
        '''
        Given a URL, check whether it is an index page or an actual file.
        If it is an index page, its links are extracted and queued.
        If it is a file, its information is saved to the database.

        We perform a
        HEAD:
            when `self.fullscan` is True.
            when `self.fullscan` is False, but the url is not a SKIPPABLE_FILETYPE.
            when the url is an index page.
        GET:
            when the url is an index page.
        '''
        if url is None:
            url = self.walkurl
        else:
            url = urllib.parse.urljoin(self.walkurl, url)

        if not url.startswith(self.walkurl):
            # Don't follow external links or parent directory.
            print('Skipping "%s" due to external url.' % url)
            return

        urll = url.lower()
        if self.fullscan is False:
            skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
            if skippable:
                safeprint('Skipping "%s" due to extension.' % url)
                self.smart_insert(url=url)
                return
            self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
            skippable = self.cur.fetchone() is not None
            if skippable:
                safeprint('Skipping "%s" since we already have it.' % url)
                return

        try:
            head = do_head(url)
        except requests.exceptions.HTTPError as e:
            if e.response.status_code == 403:
                print('403 FORBIDDEN!')
                return
            if e.response.status_code == 404:
                print('404 NOT FOUND!')
                return
            raise
        content_type = head.headers.get('Content-Type', '?')

        if content_type.startswith('text/html') and head.url.endswith('/'):
            # This is an index page, so extract links and queue them.
            response = do_get(url)
            hrefs = self.extract_hrefs(response)
            self.seen_directories.add(head.url)
            added = 0
            for href in hrefs:
                if href in self.seen_directories:
                    continue
                else:
                    self.queue.append(href)
                    added += 1
            print('Queued %d urls' % added)
        else:
            # This is not an index page, so save it.
            self.smart_insert(head=head)

    def walk(self, url=None):
        self.queue.append(url)
        while len(self.queue) > 0:
            url = self.queue.pop(0)
            self.process_url(url)
##
##
## WALKER ##########################################################################################
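
# A minimal usage sketch for the Walker class (not part of the original file); the URL is a
# placeholder. This crawls the directory and writes its findings to "website.com.db":
#
#   Walker('http://website.com/directory/', fullscan=True).walk()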

## GENERAL FUNCTIONS ###############################################################################
##
##
def bytes_to_unit_string(bytes):
    size_unit = 1
    for unit in SIZE_UNITS:
        if bytes >= unit:
            size_unit = unit
            break
    size_unit_string = UNIT_STRINGS[size_unit]
    size_string = '%.3f %s' % ((bytes / size_unit), size_unit_string)
    return size_string

def db_init(sql, cur):
    lines = DB_INIT.split(';')
    for line in lines:
        cur.execute(line)
    sql.commit()
    return True

def dict_to_file(jdict, filename):
    text = dict_to_string(jdict)
    text = text.encode('utf-8')
    filehandle = open(filename, 'wb')
    filehandle.write(text)
    filehandle.close()

def do_get(url):
    return do_request('GET', requests.get, url)

def do_head(url):
    return do_request('HEAD', requests.head, url)

def do_request(message, method, url):
    import sys
    message = '{message:>4s}: {url} : '.format(message=message, url=url)
    safeprint(message, end='')
    sys.stdout.flush()
    response = method(url)
    safeprint(response)
    response.raise_for_status()
    return response

def download_file(url, filehandle, hookfunction=None, headers={}, bytespersecond=None):
    if bytespersecond is not None:
        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytespersecond, period=1)
    else:
        limiter = None

    currentblock = 0
    downloading = requests.get(url, stream=True, headers=headers)
    totalsize = int(downloading.headers.get('content-length', 1))
    for chunk in downloading.iter_content(chunk_size=DOWNLOAD_CHUNK):
        if not chunk:
            break
        currentblock += 1
        filehandle.write(chunk)
        if limiter is not None:
            limiter.limit(len(chunk))
        if hookfunction is not None:
            hookfunction(currentblock, DOWNLOAD_CHUNK, totalsize)

    filehandle.close()
    size = os.path.getsize(filehandle.name)
    if size < totalsize:
        raise Exception('Did not receive expected total size. %d / %d' % (size, totalsize))
    return True
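
# A rough sketch of calling download_file directly (not part of the original file); the URL and
# filename are placeholders. Note that download_file closes the handle itself:
#
#   handle = open('example.bin', 'wb')
#   download_file('http://website.com/example.bin', handle, hookfunction=hook1,
#                 bytespersecond=100 * KIBIBYTE)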

def filepath_sanitize(text, allowed=''):
    bet = FILENAME_BADCHARS.replace(allowed, '')
    for char in bet:
        text = text.replace(char, '')
    return text

def get_clipboard():
    import tkinter
    t = tkinter.Tk()
    clip = t.clipboard_get()
    t.destroy()
    return clip

def hashit(text, length=None):
    import hashlib
    h = hashlib.sha512(text.encode('utf-8')).hexdigest()
    if length is not None:
        h = h[:length]
    return h

def hook1(currentblock, chunksize, totalsize):
    currentbytes = currentblock * chunksize
    if currentbytes > totalsize:
        currentbytes = totalsize
    currentbytes = '{:,}'.format(currentbytes)
    totalsize = '{:,}'.format(totalsize)
    currentbytes = currentbytes.rjust(len(totalsize), ' ')
    print('%s / %s bytes' % (currentbytes, totalsize), end='\r')
    if currentbytes == totalsize:
        print()

def listget(l, index, default=None):
    try:
        return l[index]
    except IndexError:
        return default

def longest_length(li):
    longest = 0
    for item in li:
        longest = max(longest, len(item))
    return longest

def safeprint(text, **kwargs):
    text = str(text)
    text = text.encode('ascii', 'replace').decode()
    text = text.replace('?', '_')
    print(text, **kwargs)

def smart_insert(sql, cur, url=None, head=None, commit=True):
    '''
    INSERT or UPDATE the appropriate entry.
    '''
    if bool(url) is bool(head):
        raise ValueError('One and only one of `url` or `head` is necessary.')

    if url is not None:
        # When doing a basic scan, all we get is the URL.
        content_length = None
        content_type = None

    elif head is not None:
        # When doing a full scan, we get a Response object.
        url = head.url
        content_length = head.headers.get('Content-Length', None)
        if content_length is not None:
            content_length = int(content_length)
        content_type = head.headers.get('Content-Type', None)

    basename = url_to_filepath(url)[2]
    basename = urllib.parse.unquote(basename)
    do_download = True

    cur.execute('SELECT * FROM urls WHERE url == ?', [url])
    existing_entry = cur.fetchone()
    is_new = existing_entry is None

    data = (url, basename, content_length, content_type, do_download)
    if is_new:
        cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
    else:
        command = '''
            UPDATE urls SET
            content_length = coalesce(?, content_length),
            content_type = coalesce(?, content_type)
            WHERE url == ?
        '''
        cur.execute(command, [content_length, content_type, url])

    if commit:
        sql.commit()
    return data

def url_to_filepath(text):
    text = urllib.parse.unquote(text)
    parts = urllib.parse.urlsplit(text)
    root = parts.netloc
    (folder, filename) = os.path.split(parts.path)
    while folder.startswith('/'):
        folder = folder[1:]

    # Folders are allowed to have slashes...
    folder = filepath_sanitize(folder, allowed='/\\')
    folder = folder.replace('\\', os.path.sep)
    folder = folder.replace('/', os.path.sep)
    # ...but Files are not.
    filename = filepath_sanitize(filename)

    return (root, folder, filename)
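
# An illustrative example (not part of the original file) of the transformation above;
# the URL is a placeholder:
#
#   url_to_filepath('http://website.com/some%20folder/file%3F.txt')
#   -> ('website.com', 'some folder', 'file.txt')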

##
##
## GENERAL FUNCTIONS ###############################################################################


## COMMANDLINE FUNCTIONS ###########################################################################
##
##
def digest(args):
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        fullscan = bool(eval(fullscan))
    walkurl = args.walkurl
    if walkurl == '!clipboard':
        walkurl = get_clipboard()
        safeprint('From clipboard: %s' % walkurl)
    walker = Walker(
        databasename=args.databasename,
        fullscan=fullscan,
        walkurl=walkurl,
    )
    walker.walk()

def download(args):
    bytespersecond = args.bytespersecond
    if isinstance(bytespersecond, str):
        bytespersecond = eval(bytespersecond)

    downloader = Downloader(
        databasename=args.databasename,
        outputdir=args.outputdir,
    )
    downloader.download(
        bytespersecond=bytespersecond,
        overwrite=args.overwrite,
    )

def filter_pattern(databasename, regex, action='keep', *trash):
    '''
    When `action` is 'keep', then any URLs matching the regex will have their
    `do_download` flag set to True.

    When `action` is 'remove', then any URLs matching the regex will have their
    `do_download` flag set to False.

    Actions will not act on each other's behalf. A 'keep' will NEVER disable a url,
    and 'remove' will NEVER enable one.
    '''
    import re
    if isinstance(regex, str):
        regex = [regex]

    keep = action == 'keep'
    remove = action == 'remove'

    sql = sqlite3.connect(databasename)
    cur = sql.cursor()
    cur2 = sql.cursor()

    cur2.execute('SELECT * FROM urls')
    while True:
        fetch = cur2.fetchone()
        if fetch is None:
            break
        url = fetch[SQL_URL]
        current_do_dl = fetch[SQL_DO_DOWNLOAD]
        for pattern in regex:
            contains = re.search(pattern, url) is not None

            should_keep = (keep and contains)
            if keep and contains and not current_do_dl:
                safeprint('Keeping "%s"' % url)
                cur.execute('UPDATE urls SET do_download = 1 WHERE url == ?', [url])
            if remove and contains and current_do_dl:
                safeprint('Removing "%s"' % url)
                cur.execute('UPDATE urls SET do_download = 0 WHERE url == ?', [url])
    sql.commit()
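
# An illustrative call (not part of the original file); the database name and pattern are
# placeholders. This disables every URL whose percent-encoded form ends in ".jpg":
#
#   filter_pattern('website.com.db', r'\.jpg$', action='remove')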

def keep_pattern(args):
    '''
    See `filter_pattern`.
    '''
    filter_pattern(
        action='keep',
        databasename=args.databasename,
        regex=args.regex,
    )

def list_basenames(args):
    '''
    Given a database, print the entries in order of the file basenames.
    This makes it easier to find interesting titles without worrying about
    what directory they're in.
    '''
    databasename = args.databasename
    outputfile = args.outputfile

    sql = sqlite3.connect(databasename)
    cur = sql.cursor()
    cur.execute('SELECT basename FROM urls WHERE do_download == 1 ORDER BY LENGTH(basename) DESC LIMIT 1')
    longest = len(cur.fetchone()[0])
    cur.execute('SELECT * FROM urls WHERE do_download == 1 ORDER BY basename')
    form = '{bn:<%ds} : {url}' % longest
    if outputfile:
        outputfile = open(outputfile, 'w', encoding='utf-8')
    while True:
        fetch = cur.fetchone()
        if fetch is None:
            break
        line = form.format(bn=fetch[SQL_BASENAME], url=fetch[SQL_URL])
        if outputfile:
            outputfile.write(line + '\n')
        else:
            print(line)
    if outputfile:
        outputfile.close()

def measure(args):
    '''
    Given a database, print the sum of all Content-Lengths.
    If `fullscan`, then URLs with no Content-Length will be HEAD requested,
    and the result will be saved back into the database.
    '''
    databasename = args.databasename
    fullscan = args.fullscan
    if isinstance(fullscan, str):
        fullscan = bool(fullscan)

    totalsize = 0
    sql = sqlite3.connect(databasename)
    cur1 = sql.cursor()
    cur2 = sql.cursor()
    cur2.execute('SELECT * FROM urls WHERE do_download == 1')
    filecount = 0
    files_without_size = 0
    try:
        while True:
            fetch = cur2.fetchone()
            if fetch is None:
                break
            size = fetch[SQL_CONTENT_LENGTH]
            if size is None:
                if fullscan:
                    url = fetch[SQL_URL]
                    head = do_head(url)
                    fetch = smart_insert(sql, cur1, head=head, commit=False)
                    size = fetch[SQL_CONTENT_LENGTH]
                    if size is None:
                        safeprint('"%s" is not revealing Content-Length' % url)
                        size = 0
                else:
                    files_without_size += 1
                    size = 0
            totalsize += size
            filecount += 1
    except:
        sql.commit()
        raise

    sql.commit()
    short_string = bytes_to_unit_string(totalsize)
    totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
    print(totalsize_string)
    if files_without_size > 0:
        print('Note: %d files do not have a stored Content-Length.' % files_without_size)
        print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
    return totalsize

def remove_pattern(args):
    '''
    See `filter_pattern`.
    '''
    filter_pattern(
        action='remove',
        databasename=args.databasename,
        regex=args.regex,
    )
##
##
## COMMANDLINE FUNCTIONS ###########################################################################


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    p_digest = subparsers.add_parser('digest')
    p_digest.add_argument('walkurl')
    p_digest.add_argument('-db', '--database', dest='databasename', default=None)
    p_digest.add_argument('-f', '--fullscan', action='store_true')
    p_digest.set_defaults(func=digest)

    p_download = subparsers.add_parser('download')
    p_download.add_argument('databasename')
    p_download.add_argument('-o', '--outputdir', dest='outputdir', default=None)
    p_download.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
    p_download.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
    p_download.set_defaults(func=download)

    p_keep_pattern = subparsers.add_parser('keep_pattern')
    p_keep_pattern.add_argument('databasename')
    p_keep_pattern.add_argument('regex')
    p_keep_pattern.set_defaults(func=keep_pattern)

    p_list_basenames = subparsers.add_parser('list_basenames')
    p_list_basenames.add_argument('databasename')
    p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
    p_list_basenames.set_defaults(func=list_basenames)

    p_measure = subparsers.add_parser('measure')
    p_measure.add_argument('databasename')
    p_measure.add_argument('-f', '--fullscan', action='store_true')
    p_measure.set_defaults(func=measure)

    p_remove_pattern = subparsers.add_parser('remove_pattern')
    p_remove_pattern.add_argument('databasename')
    p_remove_pattern.add_argument('regex')
    p_remove_pattern.set_defaults(func=remove_pattern)

    args = parser.parse_args()
    args.func(args)
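
# Example session (a sketch, not from the original file; "website.com" is a placeholder domain):
#
#   > opendirdl digest http://website.com/directory/ --fullscan
#   > opendirdl remove_pattern website.com.db "\.jpg$"
#   > opendirdl list_basenames website.com.db -o basenames.txt
#   > opendirdl download website.com.db -bps 524288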