diff --git a/Downloady/downloady.py b/Downloady/downloady.py index 5b729b3..16b77f6 100644 --- a/Downloady/downloady.py +++ b/Downloady/downloady.py @@ -17,9 +17,9 @@ try: except ImportError: # pip install # https://raw.githubusercontent.com/voussoir/else/master/_voussoirkit/voussoirkit.zip - from vousoirkit import bytestring - from vousoirkit import ratelimiter - from vousoirkit import clipext + from voussoirkit import bytestring + from voussoirkit import ratelimiter + from voussoirkit import clipext warnings.simplefilter('ignore') diff --git a/OpenDirDL/README.md b/OpenDirDL/README.md index d1345af..cf97412 100644 --- a/OpenDirDL/README.md +++ b/OpenDirDL/README.md @@ -13,6 +13,11 @@ See inside opendirdl.py for usage instructions.   +-2016 11 11 + - **[addition]** You can now call opendirdl using the database filename as the first argument, and the subcommand as the second. Previously, the subcommand always had to come first, but now they are interchangeable when the system detects that argv[0] is a file. This makes it much easier to do multiple operations on a single database because you can just backspace the previous command rather than having to hop over the database name to get to it. + - **[addition]** `measure` now takes an argument `--threads x` to use `x` threads during the head requests. + - **[addition]** New subcommand `list_urls` to just dump the urls. + - 2016 10 03 - **[bugfix]** Fix KeyError caused by the 'root' -> 'domain' rename. diff --git a/OpenDirDL/opendirdl.py b/OpenDirDL/opendirdl.py index a5e974e..1521986 100644 --- a/OpenDirDL/opendirdl.py +++ b/OpenDirDL/opendirdl.py @@ -88,6 +88,9 @@ measure: When included, perform HEAD requests only on files that haven't gotten one yet. + -t 4 | --threads 4: + The number of threads to use for performing requests. + If a file's size is not known by the time this operation completes, you will receive a printed note. 
@@ -300,7 +303,6 @@ class Walker: self.fullscan = bool(fullscan) self.queue = collections.deque() self.seen_directories = set() - self.threadpool = concurrent.futures.ThreadPoolExecutor(4) def smart_insert(self, url=None, head=None, commit=True): ''' @@ -600,10 +602,10 @@ def do_head(url, raise_for_status=True): return do_request('HEAD', requests.head, url, raise_for_status=raise_for_status) def do_request(message, method, url, raise_for_status=True): - message = '{message:>4s}: {url} : '.format(message=message, url=url) - write(message, end='', flush=True) + form = '{message:>4s}: {url} : {status}' + write(form.format(message=message, url=url, status='')) response = method(url) - write(response.status_code) + write(form.format(message=message, url=url, status=response.status_code)) if raise_for_status: response.raise_for_status() return response @@ -638,6 +640,11 @@ def hashit(text, length=None): sha = sha[:length] return sha +def int_none(x): + if x is None: + return x + return int(x) + def recursive_get_size(node): ''' Calculate the size of the Directory nodes by summing the sizes of all children. @@ -996,7 +1003,37 @@ def list_basenames_argparse(args): output_filename=args.outputfile, ) -def measure(databasename, fullscan=False, new_only=False): +def list_urls(databasename, output_filename=None): + ''' + Print the urls of the Enabled entries, sorted case-insensitively. + If output_filename is given, that file is opened for writing (utf-8) + and handed to `write` along with each url. 
+ ''' + sql = sqlite3.connect(databasename) + cur = sql.cursor() + + cur.execute('SELECT * FROM urls WHERE do_download == 1') + items = cur.fetchall() + items.sort(key=lambda x: x[SQL_URL].lower()) + + if output_filename is not None: + output_file = open(output_filename, 'w', encoding='utf-8') + else: + output_file = None + + for item in items: + write(item[SQL_URL], output_file) + + if output_file: + output_file.close() + +def list_urls_argparse(args): + return list_urls( + databasename=args.databasename, + output_filename=args.outputfile, + ) + +def measure(databasename, fullscan=False, new_only=False, threads=4): ''' Given a database, print the sum of all Content-Lengths. URLs will be HEAD requested if: @@ -1006,7 +1043,6 @@ def measure(databasename, fullscan=False, new_only=False): if isinstance(fullscan, str): fullscan = bool(fullscan) - totalsize = 0 sql = sqlite3.connect(databasename) cur = sql.cursor() @@ -1016,30 +1052,47 @@ def measure(databasename, fullscan=False, new_only=False): cur.execute('SELECT * FROM urls WHERE do_download == 1') items = cur.fetchall() - filecount = len(items) + totalsize = 0 unmeasured_file_count = 0 - for fetch in items: - size = fetch[SQL_CONTENT_LENGTH] + if threads is None: + threads = 1 - if fullscan or new_only: - url = fetch[SQL_URL] - head = do_head(url, raise_for_status=False) - fetch = smart_insert(sql, cur, head=head, commit=True) + threadpool = concurrent.futures.ThreadPoolExecutor(threads) + thread_promises = [] + + try: + for fetch in items: size = fetch[SQL_CONTENT_LENGTH] - elif size is None: - # Unmeasured and no intention to measure. - unmeasured_file_count += 1 - size = 0 + if fullscan or new_only: + url = fetch[SQL_URL] + promise = threadpool.submit(do_head, url, raise_for_status=False) + thread_promises.append(promise) - if size is None: - # Unmeasured even though we tried the head request. 
- write('"%s" is not revealing Content-Length' % url) - size = 0 + elif size is None: + # Unmeasured and no intention to measure. + unmeasured_file_count += 1 - totalsize += size + else: + totalsize += size + + while len(thread_promises) > 0: + # If that thread is done, `result()` will return immediately + # Otherwise, it will wait, which is okay because the threads themselves + # are not blocked. + head = thread_promises.pop(0).result() + fetch = smart_insert(sql, cur, head=head, commit=True) + size = fetch[SQL_CONTENT_LENGTH] + if size is None: + write('"%s" is not revealing Content-Length' % url) + size = 0 + totalsize += size + except KeyboardInterrupt: + for promise in thread_promises: + promise.cancel() + raise sql.commit() size_string = bytestring.bytestring(totalsize) @@ -1059,6 +1112,7 @@ def measure_argparse(args): databasename=args.databasename, fullscan=args.fullscan, new_only=args.new_only, + threads=int_none(args.threads), ) def remove_pattern_argparse(args): @@ -1142,10 +1196,16 @@ def main(argv): p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None) p_list_basenames.set_defaults(func=list_basenames_argparse) + p_list_urls = subparsers.add_parser('list_urls') + p_list_urls.add_argument('databasename') + p_list_urls.add_argument('-o', '--outputfile', dest='outputfile', default=None) + p_list_urls.set_defaults(func=list_urls_argparse) + p_measure = subparsers.add_parser('measure') p_measure.add_argument('databasename') p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true') p_measure.add_argument('-n', '--new_only', dest='new_only', action='store_true') + p_measure.add_argument('-t', '--threads', dest='threads', default=1) p_measure.set_defaults(func=measure_argparse) p_remove_pattern = subparsers.add_parser('remove_pattern') @@ -1158,6 +1218,12 @@ def main(argv): p_tree.add_argument('-o', '--outputfile', dest='outputfile', default=None) p_tree.set_defaults(func=tree_argparse) + # Allow 
interchangeability of the command and database name + # opendirdl measure test.db -n = opendirdl test.db measure -n + if os.path.isfile(argv[0]): + (argv[0], argv[1]) = (argv[1], argv[0]) + #print(argv) + args = parser.parse_args(argv) args.func(args) diff --git a/Pathclass/pathclass.py b/Pathclass/pathclass.py index d2da2b0..f4600b6 100644 --- a/Pathclass/pathclass.py +++ b/Pathclass/pathclass.py @@ -75,6 +75,11 @@ class Path: def stat(self): return os.stat(self.absolute_path) + def with_child(self, basename): + basename = os.path.basename(basename) + return Path(os.path.join(self.absolute_path, basename)) + + def get_path_casing(path): ''' Take what is perhaps incorrectly cased input and get the path's actual diff --git a/ThreadedDL/threaded_dl.py b/ThreadedDL/threaded_dl.py index 496b0be..5781c89 100644 --- a/ThreadedDL/threaded_dl.py +++ b/ThreadedDL/threaded_dl.py @@ -52,6 +52,7 @@ def threaded_dl(urls, thread_count, filename_format=None): extension = os.path.splitext(basename)[1] filename = filename_format.format( basename=basename, + ext=extension, extension=extension, index=index, now=now,