'''.replace('\n', '') HTML_FORMAT_FILE = '{name} ({size})
' DB_INIT = ''' CREATE TABLE IF NOT EXISTS urls( url TEXT, basename TEXT, content_length INT, content_type TEXT, do_download INT ); CREATE INDEX IF NOT EXISTS urlindex on urls(url); CREATE INDEX IF NOT EXISTS baseindex on urls(basename); CREATE INDEX IF NOT EXISTS sizeindex on urls(content_length); '''.strip() SQL_URL = 0 SQL_BASENAME = 1 SQL_CONTENT_LENGTH = 2 SQL_CONTENT_TYPE = 3 SQL_DO_DOWNLOAD = 4 UNMEASURED_WARNING = ''' Note: %d files do not have a stored Content-Length. Run `measure` with `-f`|`--fullscan` or `-n`|`--new_only` to HEAD request those files. '''.strip() ## WALKER ########################################################################################## ## ## class Walker: ''' This class manages the extraction and saving of URLs, given a starting root url. ''' def __init__(self, root_url, databasename=None, fullscan=False): if not root_url.endswith('/'): root_url += '/' if '://' not in root_url.split('.')[0]: root_url = 'http://' + root_url self.root_url = root_url if databasename in (None, ''): domain = url_split(self.root_url)['domain'] databasename = domain + '.db' databasename = databasename.replace(':', '#') self.databasename = databasename write('Opening %s' % self.databasename) self.sql = sqlite3.connect(self.databasename) self.cur = self.sql.cursor() db_init(self.sql, self.cur) self.fullscan = bool(fullscan) self.queue = collections.deque() self.seen_directories = set() def smart_insert(self, url=None, head=None, commit=True): ''' See `smart_insert`. ''' smart_insert(self.sql, self.cur, url=url, head=head, commit=commit) def extract_hrefs(self, response, tag='a', attribute='href'): ''' Given a Response object, extract href urls. External links, index sort links, and blacklisted files are discarded. ''' import bs4 soup = bs4.BeautifulSoup(response.text, 'html.parser') elements = soup.find_all(tag) for element in elements: try: href = element[attribute] except KeyError: continue href = urllib.parse.urljoin(response.url, href) if not href.startswith(self.root_url): # Don't go to other sites or parent directories. continue if any(sorter in href for sorter in ('?C=', '?O=', '?M=', '?D=', '?N=', '?S=')): # Alternative sort modes for index pages. continue if any(href.endswith(blacklisted) for blacklisted in BLACKLISTED_FILENAMES): continue yield href def process_url(self, url=None): ''' Given a URL, check whether it is an index page or an actual file. If it is an index page, its links are extracted and queued. If it is a file, its information is saved to the database. We perform a HEAD: when `self.fullscan` is True. when `self.fullscan` is False but the url is not a SKIPPABLE_FILETYPE. when the url is an index page. GET: when the url is an index page. ''' if url is None: url = self.root_url else: url = urllib.parse.urljoin(self.root_url, url) if url in self.seen_directories: # We already picked this up at some point return if not url.startswith(self.root_url): # Don't follow external links or parent directory. write('Skipping "%s" due to external url.' % url) return urll = url.lower() if self.fullscan is False: skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES) if skippable: write('Skipping "%s" due to extension.' % url) self.smart_insert(url=url, commit=False) return self.cur.execute('SELECT * FROM urls WHERE url == ?', [url]) skippable = self.cur.fetchone() is not None if skippable: write('Skipping "%s" since we already have it.' % url) return try: head = do_head(url) except requests.exceptions.HTTPError as exception: if exception.response.status_code == 403: write('403 FORBIDDEN!') return if exception.response.status_code == 404: write('404 NOT FOUND!') return raise content_type = head.headers.get('Content-Type', '?') #print(content_type) if content_type.startswith('text/html'):# and head.url.endswith('/'): # This is an index page, so extract links and queue them. response = do_get(url) hrefs = self.extract_hrefs(response) # Just in case the URL we used is different than the real one, # such as missing a trailing slash, add both. self.seen_directories.add(url) self.seen_directories.add(head.url) added = 0 for href in hrefs: if href in self.seen_directories: continue else: self.queue.append(href) added += 1 write('Queued %d urls' % added) else: # This is not an index page, so save it. self.smart_insert(head=head, commit=False) def walk(self, url=None): ''' Given a starting URL (defaults to self.root_url), continually extract links from the page and repeat. ''' self.queue.appendleft(url) try: while len(self.queue) > 0: url = self.queue.popleft() self.process_url(url) line = '{:,} Remaining'.format(len(self.queue)) write(line) except: self.sql.commit() raise self.sql.commit() ## ## ## WALKER ########################################################################################## ## OTHER CLASSES ################################################################################### ## ## class TreeExistingChild(Exception): pass class TreeInvalidIdentifier(Exception): pass class TreeNode: def __init__(self, identifier, data=None): assert isinstance(identifier, str) assert '\\' not in identifier self.identifier = identifier self.data = data self.parent = None self.children = {} def __eq__(self, other): return isinstance(other, TreeNode) and self.abspath() == other.abspath() def __getitem__(self, key): return self.children[key] def __hash__(self): return hash(self.abspath()) def __repr__(self): return 'TreeNode %s' % self.abspath() def abspath(self): node = self nodes = [node] while node.parent is not None: node = node.parent nodes.append(node) nodes.reverse() nodes = [node.identifier for node in nodes] return '\\'.join(nodes) def add_child(self, other_node, overwrite_parent=False): self.check_child_availability(other_node.identifier) if other_node.parent is not None and not overwrite_parent: raise ValueError('That node already has a parent. Try `overwrite_parent=True`') other_node.parent = self self.children[other_node.identifier] = other_node return other_node def check_child_availability(self, identifier): if identifier in self.children: raise TreeExistingChild('Node %s already has child %s' % (self.identifier, identifier)) def detach(self): del self.parent.children[self.identifier] self.parent = None def list_children(self, customsort=None): children = list(self.children.values()) if customsort is None: children.sort(key=lambda node: node.identifier.lower()) else: children.sort(key=customsort) return children def merge_other(self, othertree, otherroot=None): newroot = None if ':' in othertree.identifier: if otherroot is None: raise Exception('Must specify a new name for the other tree\'s root') else: newroot = otherroot else: newroot = othertree.identifier othertree.identifier = newroot othertree.parent = self self.check_child_availability(newroot) self.children[newroot] = othertree def walk(self, customsort=None): yield self for child in self.list_children(customsort=customsort): yield from child.walk(customsort=customsort) ## ## ## OTHER CLASSES ################################################################################### ## GENERAL FUNCTIONS ############################################################################### ## ## def build_file_tree(databasename): sql = sqlite3.connect(databasename) cur = sql.cursor() cur.execute('SELECT * FROM urls WHERE do_download == 1') all_items = cur.fetchall() sql.close() if len(all_items) == 0: return path_form = '{domain}\\{folder}\\{filename}' all_items = [ { 'url': item[SQL_URL], 'size': item[SQL_CONTENT_LENGTH], 'path_parts': path_form.format(**url_split(item[SQL_URL])).split('\\'), } for item in all_items ] all_items.sort(key=lambda x: x['url']) root_data = { 'item_type': 'directory', 'name': databasename, } scheme = url_split(all_items[0]['url'])['scheme'] tree_root = TreeNode(databasename, data=root_data) tree_root.unsorted_children = all_items node_queue = set() node_queue.add(tree_root) # In this process, URLs are divided up into their nodes one directory layer at a time. # The root receives all URLs, and creates nodes for each of the top-level # directories. Those nodes receive all subdirectories, and repeat. while len(node_queue) > 0: node = node_queue.pop() for new_child_data in node.unsorted_children: path_parts = new_child_data['path_parts'] # Create a new node for the directory, path_parts[0] # path_parts[1:] is assigned to that node to be divided next. child_identifier = path_parts.pop(0) #child_identifier = child_identifier.replace(':', '#') child = node.children.get(child_identifier, None) if not child: child = TreeNode(child_identifier, data={}) child.unsorted_children = [] node.add_child(child) child.data['name'] = child_identifier if len(path_parts) > 0: child.data['item_type'] = 'directory' child.unsorted_children.append(new_child_data) node_queue.add(child) else: child.data['item_type'] = 'file' child.data['size'] = new_child_data['size'] child.data['url'] = new_child_data['url'] if node.parent is None: continue elif node.parent == tree_root: node.data['url'] = scheme + '://' + node.identifier else: node.data['url'] = node.parent.data['url'] + '/' + node.identifier del node.unsorted_children return tree_root def db_init(sql, cur): lines = DB_INIT.split(';') for line in lines: cur.execute(line) sql.commit() return True def do_get(url, raise_for_status=True): return do_request('GET', requests.get, url, raise_for_status=raise_for_status) def do_head(url, raise_for_status=True): return do_request('HEAD', requests.head, url, raise_for_status=raise_for_status) def do_request(message, method, url, raise_for_status=True): message = '{message:>4s}: {url} : '.format(message=message, url=url) write(message, end='', flush=True) response = method(url) write(response.status_code) if raise_for_status: response.raise_for_status() return response def fetch_generator(cur): while True: fetch = cur.fetchone() if fetch is None: break yield fetch def filepath_sanitize(text, allowed=''): ''' Remove forbidden characters from the text, unless specifically sanctioned. ''' badchars = FILENAME_BADCHARS badchars = set(char for char in FILENAME_BADCHARS if char not in allowed) text = ''.join(char for char in text if char not in badchars) return text def get_clipboard(): import tkinter t = tkinter.Tk() clip = t.clipboard_get() t.destroy() return clip def hashit(text, length=None): import hashlib sha = hashlib.sha512(text.encode('utf-8')).hexdigest() if length is not None: sha = sha[:length] return sha def recursive_get_size(node): ''' Calculate the size of the Directory nodes by summing the sizes of all children. Modifies the nodes in-place. ''' return_value = { 'size': 0, 'unmeasured': 0, } if node.data['item_type'] == 'file': if node.data['size'] is None: return_value['unmeasured'] = 1 return_value['size'] = node.data['size'] else: for child in node.list_children(): child_details = recursive_get_size(child) return_value['size'] += child_details['size'] or 0 return_value['unmeasured'] += child_details['unmeasured'] node.data['size'] = return_value['size'] return return_value def recursive_print_node(node, depth=0, use_html=False, output_file=None): ''' Given a tree node (presumably the root), print it and all of its children. ''' size = node.data['size'] if size is None: size = UNKNOWN_SIZE_STRING else: size = bytestring.bytestring(size) if use_html: css_class = 'directory_even' if depth % 2 == 0 else 'directory_odd' if node.data['item_type'] == 'directory': directory_url = node.data.get('url') directory_anchor = '►' if directory_url else '' directory_anchor = directory_anchor.format(url=directory_url) line = HTML_FORMAT_DIRECTORY.format( css=css_class, directory_anchor=directory_anchor, name=node.data['name'], size=size, ) else: line = HTML_FORMAT_FILE.format( name=node.data['name'], size=size, url=node.data['url'], ) else: line = '{space}{bar}{name} : ({size})' line = line.format( space='| ' * (depth-1), bar='|---' if depth > 0 else '', name=node.data['name'], size=size ) write(line, output_file) # Sort by type (directories first) then subsort by lowercase path customsort = lambda node: ( node.data['item_type'] == 'file', node.data['url'].lower(), ) for child in node.list_children(customsort=customsort): recursive_print_node(child, depth=depth+1, use_html=use_html, output_file=output_file) if node.data['item_type'] == 'directory': if use_html: # Close the directory div write('