This commit is contained in:
Ethan Dalool 2016-11-11 15:52:25 -08:00
parent fac1491322
commit cc9123dbc2
5 changed files with 102 additions and 25 deletions

View file

@ -17,9 +17,9 @@ try:
except ImportError: except ImportError:
# pip install # pip install
# https://raw.githubusercontent.com/voussoir/else/master/_voussoirkit/voussoirkit.zip # https://raw.githubusercontent.com/voussoir/else/master/_voussoirkit/voussoirkit.zip
from vousoirkit import bytestring from voussoirkit import bytestring
from vousoirkit import ratelimiter from voussoirkit import ratelimiter
from vousoirkit import clipext from voussoirkit import clipext
warnings.simplefilter('ignore') warnings.simplefilter('ignore')

View file

@ -13,6 +13,11 @@ See inside opendirdl.py for usage instructions.
   
- 2016 11 11
- **[addition]** You can now call opendirdl using the database filename as the first argument, and the subcommand as the second. Previously, the subcommand always had to come first, but now they are interchangeable when the system detects that argv[0] is a file. This makes it much easier to do multiple operations on a single database because you can just backspace the previous command rather than having to hop over the database name to get to it.
- **[addition]** `measure` now takes an argument `--threads x` to use `x` threads during the head requests.
- **[addition]** New subcommand `list_urls` to just dump the urls.
- 2016 10 03 - 2016 10 03
- **[bugfix]** Fix KeyError caused by the 'root' -> 'domain' rename. - **[bugfix]** Fix KeyError caused by the 'root' -> 'domain' rename.

View file

@ -88,6 +88,9 @@ measure:
When included, perform HEAD requests only on files that haven't gotten When included, perform HEAD requests only on files that haven't gotten
one yet. one yet.
-t 4 | --threads 4:
The number of threads to use for performing requests.
If a file's size is not known by the time this operation completes, you If a file's size is not known by the time this operation completes, you
will receive a printed note. will receive a printed note.
@ -300,7 +303,6 @@ class Walker:
self.fullscan = bool(fullscan) self.fullscan = bool(fullscan)
self.queue = collections.deque() self.queue = collections.deque()
self.seen_directories = set() self.seen_directories = set()
self.threadpool = concurrent.futures.ThreadPoolExecutor(4)
def smart_insert(self, url=None, head=None, commit=True): def smart_insert(self, url=None, head=None, commit=True):
''' '''
@ -600,10 +602,10 @@ def do_head(url, raise_for_status=True):
return do_request('HEAD', requests.head, url, raise_for_status=raise_for_status) return do_request('HEAD', requests.head, url, raise_for_status=raise_for_status)
def do_request(message, method, url, raise_for_status=True): def do_request(message, method, url, raise_for_status=True):
message = '{message:>4s}: {url} : '.format(message=message, url=url) form = '{message:>4s}: {url} : {status}'
write(message, end='', flush=True) write(form.format(message=message, url=url, status=''))
response = method(url) response = method(url)
write(response.status_code) write(form.format(message=message, url=url, status=response.status_code))
if raise_for_status: if raise_for_status:
response.raise_for_status() response.raise_for_status()
return response return response
@ -638,6 +640,11 @@ def hashit(text, length=None):
sha = sha[:length] sha = sha[:length]
return sha return sha
def int_none(x):
if x is None:
return x
return int(x)
def recursive_get_size(node): def recursive_get_size(node):
''' '''
Calculate the size of the Directory nodes by summing the sizes of all children. Calculate the size of the Directory nodes by summing the sizes of all children.
@ -996,7 +1003,37 @@ def list_basenames_argparse(args):
output_filename=args.outputfile, output_filename=args.outputfile,
) )
def measure(databasename, fullscan=False, new_only=False): def list_urls(databasename, output_filename=None):
'''
Print the urls of the Enabled entries (do_download == 1), sorted
case-insensitively by url.
If output_filename is given, that file is opened as UTF-8 and passed to
`write` along with each url.
'''
sql = sqlite3.connect(databasename)
cur = sql.cursor()
cur.execute('SELECT * FROM urls WHERE do_download == 1')
items = cur.fetchall()
items.sort(key=lambda x: x[SQL_URL].lower())
if output_filename is not None:
output_file = open(output_filename, 'w', encoding='utf-8')
else:
output_file = None
for item in items:
write(item[SQL_URL], output_file)
if output_file:
output_file.close()
def list_urls_argparse(args):
return list_urls(
databasename=args.databasename,
output_filename=args.outputfile,
)
def measure(databasename, fullscan=False, new_only=False, threads=4):
''' '''
Given a database, print the sum of all Content-Lengths. Given a database, print the sum of all Content-Lengths.
URLs will be HEAD requested if: URLs will be HEAD requested if:
@ -1006,7 +1043,6 @@ def measure(databasename, fullscan=False, new_only=False):
if isinstance(fullscan, str): if isinstance(fullscan, str):
fullscan = bool(fullscan) fullscan = bool(fullscan)
totalsize = 0
sql = sqlite3.connect(databasename) sql = sqlite3.connect(databasename)
cur = sql.cursor() cur = sql.cursor()
@ -1016,30 +1052,47 @@ def measure(databasename, fullscan=False, new_only=False):
cur.execute('SELECT * FROM urls WHERE do_download == 1') cur.execute('SELECT * FROM urls WHERE do_download == 1')
items = cur.fetchall() items = cur.fetchall()
filecount = len(items) filecount = len(items)
totalsize = 0
unmeasured_file_count = 0 unmeasured_file_count = 0
if threads is None:
threads = 1
threadpool = concurrent.futures.ThreadPoolExecutor(threads)
thread_promises = []
try:
for fetch in items: for fetch in items:
size = fetch[SQL_CONTENT_LENGTH] size = fetch[SQL_CONTENT_LENGTH]
if fullscan or new_only: if fullscan or new_only:
url = fetch[SQL_URL] url = fetch[SQL_URL]
head = do_head(url, raise_for_status=False) promise = threadpool.submit(do_head, url, raise_for_status=False)
fetch = smart_insert(sql, cur, head=head, commit=True) thread_promises.append(promise)
size = fetch[SQL_CONTENT_LENGTH]
elif size is None: elif size is None:
# Unmeasured and no intention to measure. # Unmeasured and no intention to measure.
unmeasured_file_count += 1 unmeasured_file_count += 1
size = 0
else:
totalsize += size
while len(thread_promises) > 0:
# If that thread is done, `result()` will return immediately
# Otherwise, it will wait, which is okay because the threads themselves
# are not blocked.
head = thread_promises.pop(0).result()
fetch = smart_insert(sql, cur, head=head, commit=True)
size = fetch[SQL_CONTENT_LENGTH]
if size is None: if size is None:
# Unmeasured even though we tried the head request.
write('"%s" is not revealing Content-Length' % url) write('"%s" is not revealing Content-Length' % url)
size = 0 size = 0
totalsize += size totalsize += size
except KeyboardInterrupt:
for promise in thread_promises:
promise.cancel()
raise
sql.commit() sql.commit()
size_string = bytestring.bytestring(totalsize) size_string = bytestring.bytestring(totalsize)
@ -1059,6 +1112,7 @@ def measure_argparse(args):
databasename=args.databasename, databasename=args.databasename,
fullscan=args.fullscan, fullscan=args.fullscan,
new_only=args.new_only, new_only=args.new_only,
threads=int_none(args.threads),
) )
def remove_pattern_argparse(args): def remove_pattern_argparse(args):
@ -1142,10 +1196,16 @@ def main(argv):
p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None) p_list_basenames.add_argument('-o', '--outputfile', dest='outputfile', default=None)
p_list_basenames.set_defaults(func=list_basenames_argparse) p_list_basenames.set_defaults(func=list_basenames_argparse)
p_list_urls = subparsers.add_parser('list_urls')
p_list_urls.add_argument('databasename')
p_list_urls.add_argument('-o', '--outputfile', dest='outputfile', default=None)
p_list_urls.set_defaults(func=list_urls_argparse)
p_measure = subparsers.add_parser('measure') p_measure = subparsers.add_parser('measure')
p_measure.add_argument('databasename') p_measure.add_argument('databasename')
p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true') p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
p_measure.add_argument('-n', '--new_only', dest='new_only', action='store_true') p_measure.add_argument('-n', '--new_only', dest='new_only', action='store_true')
p_measure.add_argument('-t', '--threads', dest='threads', default=1)
p_measure.set_defaults(func=measure_argparse) p_measure.set_defaults(func=measure_argparse)
p_remove_pattern = subparsers.add_parser('remove_pattern') p_remove_pattern = subparsers.add_parser('remove_pattern')
@ -1158,6 +1218,12 @@ def main(argv):
p_tree.add_argument('-o', '--outputfile', dest='outputfile', default=None) p_tree.add_argument('-o', '--outputfile', dest='outputfile', default=None)
p_tree.set_defaults(func=tree_argparse) p_tree.set_defaults(func=tree_argparse)
# Allow interchangeability of the command and database name, e.g.
# `opendirdl measure test.db -n` is equivalent to `opendirdl test.db measure -n`
if os.path.isfile(argv[0]):
(argv[0], argv[1]) = (argv[1], argv[0])
#print(argv)
args = parser.parse_args(argv) args = parser.parse_args(argv)
args.func(args) args.func(args)

View file

@ -75,6 +75,11 @@ class Path:
def stat(self): def stat(self):
return os.stat(self.absolute_path) return os.stat(self.absolute_path)
def with_child(self, basename):
basename = os.path.basename(basename)
return Path(os.path.join(self.absolute_path, basename))
def get_path_casing(path): def get_path_casing(path):
''' '''
Take what is perhaps incorrectly cased input and get the path's actual Take what is perhaps incorrectly cased input and get the path's actual

View file

@ -52,6 +52,7 @@ def threaded_dl(urls, thread_count, filename_format=None):
extension = os.path.splitext(basename)[1] extension = os.path.splitext(basename)[1]
filename = filename_format.format( filename = filename_format.format(
basename=basename, basename=basename,
ext=extension,
extension=extension, extension=extension,
index=index, index=index,
now=now, now=now,