master
Ethan Dalool 2017-02-18 17:06:55 -08:00
parent fa2c2bda76
commit 64de5c94bb
14 changed files with 351 additions and 149 deletions

View File

@ -133,6 +133,7 @@ import requests
import shutil
import sqlite3
import sys
import threading
import time
## import tkinter
import urllib.parse
@ -143,6 +144,7 @@ from voussoirkit import downloady
from voussoirkit import fusker
from voussoirkit import treeclass
from voussoirkit import pathtree
sys.path.append('C:\\git\\else\\threadqueue'); import threadqueue
DOWNLOAD_CHUNK = 16 * bytestring.KIBIBYTE
FILENAME_BADCHARS = '/\\:*?"<>|'
@ -184,6 +186,7 @@ SKIPPABLE_FILETYPES = [
'.pdf',
'.png',
'.rar',
'.sfv',
'.srt',
'.tar',
'.ttf',
@ -237,7 +240,7 @@ class Walker:
'''
This class manages the extraction and saving of URLs, given a starting root url.
'''
def __init__(self, root_url, databasename=None, fullscan=False):
def __init__(self, root_url, databasename=None, fullscan=False, threads=1):
if not root_url.endswith('/'):
root_url += '/'
if '://' not in root_url.split('.')[0]:
@ -255,6 +258,8 @@ class Walker:
self.cur = self.sql.cursor()
db_init(self.sql, self.cur)
self.thread_queue = threadqueue.ThreadQueue(threads)
self._main_thread = threading.current_thread().ident
self.fullscan = bool(fullscan)
self.queue = collections.deque()
self.seen_directories = set()
@ -326,10 +331,15 @@ class Walker:
skippable = any(urll.endswith(ext) for ext in SKIPPABLE_FILETYPES)
if skippable:
write('Skipping "%s" due to extension.' % url)
self.smart_insert(url=url, commit=False)
#self.smart_insert(url=url, commit=False)
#return {'url': url, 'commit': False}
self.thread_queue.behalf(self._main_thread, self.smart_insert, url=url, commit=False)
return
self.cur.execute('SELECT * FROM urls WHERE url == ?', [url])
skippable = self.cur.fetchone() is not None
skippable = lambda: self.cur.execute('SELECT * FROM urls WHERE url == ?', [url]).fetchone()
skippable = self.thread_queue.behalf(self._main_thread, skippable)
#print(skippable)
skippable = skippable is not None
#skippable = self.cur.fetchone() is not None
if skippable:
write('Skipping "%s" since we already have it.' % url)
return
@ -359,28 +369,34 @@ class Walker:
if href in self.seen_directories:
continue
else:
self.queue.append(href)
#self.queue.append(href)
self.thread_queue.add(self.process_url, href)
added += 1
write('Queued %d urls' % added)
else:
# This is not an index page, so save it.
self.smart_insert(head=head, commit=False)
#self.smart_insert(head=head, commit=False)
self.thread_queue.behalf(self._main_thread, self.smart_insert, head=head, commit=False)
#return {'head': head, 'commit': False}
def walk(self, url=None):
'''
Given a starting URL (defaults to self.root_url), continually extract
links from the page and repeat.
'''
self.queue.appendleft(url)
try:
while len(self.queue) > 0:
url = self.queue.popleft()
self.process_url(url)
line = '{:,} Remaining'.format(len(self.queue))
write(line)
except:
self.sql.commit()
raise
#self.queue.appendleft(url)
self.thread_queue.add(self.process_url, url)
for return_value in self.thread_queue.run(hold_open=False):
pass
#try:
# while len(self.queue) > 0:
# url = self.queue.popleft()
# self.process_url(url)
# line = '{:,} Remaining'.format(len(self.queue))
# write(line)
#except:
# self.sql.commit()
# raise
self.sql.commit()
## ##
## WALKER ##########################################################################################
@ -584,7 +600,7 @@ def write(line, file_handle=None, **kwargs):
## COMMANDLINE FUNCTIONS ###########################################################################
## ##
def digest(root_url, databasename=None, fullscan=False):
def digest(root_url, databasename=None, fullscan=False, threads=1):
if root_url in ('!clipboard', '!c'):
root_url = get_clipboard()
write('From clipboard: %s' % root_url)
@ -592,6 +608,7 @@ def digest(root_url, databasename=None, fullscan=False):
databasename=databasename,
fullscan=fullscan,
root_url=root_url,
threads=threads,
)
walker.walk()
@ -600,6 +617,7 @@ def digest_argparse(args):
databasename=args.databasename,
fullscan=args.fullscan,
root_url=args.root_url,
threads=int(args.threads),
)
def download(
@ -818,8 +836,7 @@ def measure(databasename, fullscan=False, new_only=False, threads=4):
if threads is None:
threads = 1
threadpool = concurrent.futures.ThreadPoolExecutor(threads)
thread_promises = []
thread_queue = threadqueue.ThreadQueue(threads)
try:
for fetch in items:
@ -827,8 +844,7 @@ def measure(databasename, fullscan=False, new_only=False, threads=4):
if fullscan or new_only:
url = fetch[SQL_URL]
promise = threadpool.submit(do_head, url, raise_for_status=False)
thread_promises.append(promise)
thread_queue.add(do_head, url, raise_for_status=False)
elif size is None:
# Unmeasured and no intention to measure.
@ -837,16 +853,15 @@ def measure(databasename, fullscan=False, new_only=False, threads=4):
else:
totalsize += size
for head in promise_results(thread_promises):
fetch = smart_insert(sql, cur, head=head, commit=True)
for head in thread_queue.run():
fetch = smart_insert(sql, cur, head=head, commit=False)
size = fetch[SQL_CONTENT_LENGTH]
if size is None:
write('"%s" is not revealing Content-Length' % url)
size = 0
totalsize += size
except (Exception, KeyboardInterrupt):
for promise in thread_promises:
promise.cancel()
sql.commit()
raise
sql.commit()
@ -938,6 +953,7 @@ def main(argv):
p_digest.add_argument('root_url')
p_digest.add_argument('-db', '--database', dest='databasename', default=None)
p_digest.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
p_digest.add_argument('-t', '--threads', dest='threads', default=1)
p_digest.set_defaults(func=digest_argparse)
p_download = subparsers.add_parser('download')

View File

@ -45,6 +45,10 @@ class Path:
def exists(self):
return os.path.exists(self.absolute_path)
@property
def extension(self):
    '''
    The extension of this path with any leading dot removed, or an empty
    string when the path has no extension.
    '''
    (_, dotted_extension) = os.path.splitext(self.absolute_path)
    return dotted_extension.lstrip('.')
@property
def is_dir(self):
return os.path.isdir(self.absolute_path)
@ -62,6 +66,11 @@ class Path:
raise TypeError('subpath must be a string')
return Path(os.path.join(self.absolute_path, subpath))
def listdir(self):
    '''
    Return a list containing one child object per entry of this directory,
    each produced by passing the entry's name to `with_child`.
    '''
    return [self.with_child(name) for name in os.listdir(self.absolute_path)]
@property
def normcase(self):
return os.path.normcase(self.absolute_path)
@ -90,6 +99,15 @@ class Path:
backsteps = os.sep.join('..' for x in range(backsteps))
return self.absolute_path.replace(common.absolute_path, backsteps)
def replace_extension(self, extension):
    '''
    Return a new Path whose extension has been swapped for `extension`.

    Only the text after the final dot of `extension` is used, so 'txt',
    '.txt' and 'a.txt' are all equivalent. If that text is empty, the
    returned Path has no extension at all.
    '''
    new_suffix = extension.rsplit('.', 1)[-1]
    (stem, _) = os.path.splitext(self.absolute_path)
    if new_suffix != '':
        stem = stem + '.' + new_suffix
    return Path(stem)
@property
def size(self):
if self.is_file:
@ -105,6 +123,7 @@ class Path:
return self.join(os.path.basename(basename))
def common_path(paths, fallback):
'''
Given a list of file paths, determine the deepest path which all
@ -171,9 +190,10 @@ def get_path_casing(path):
except IndexError:
return input_path.absolute_path
imaginary_portion = input_path.normcase
real_portion = os.path.normcase(cased)
imaginary_portion = imaginary_portion.replace(real_portion, '')
imaginary_portion = input_path.absolute_path
imaginary_portion = imaginary_portion[len(cased):]
#real_portion = os.path.normcase(cased)
#imaginary_portion = imaginary_portion.replace(real_portion, '')
imaginary_portion = imaginary_portion.lstrip(os.sep)
cased = os.path.join(cased, imaginary_portion)
cased = cased.rstrip(os.sep)

View File

@ -0,0 +1,10 @@
import pathclass
import unittest
class Tests(unittest.TestCase):
    def test_something(self):
        # Each pair is (expected result, argument given to get_path_casing).
        cases = [
            ('C:\\Users', 'C:\\users'),
            ('C:\\Users\\Nonexist', 'C:\\users\\Nonexist'),
        ]
        for (expected, given) in cases:
            self.assertEqual(expected, pathclass.get_path_casing(given))


if __name__ == '__main__':
    unittest.main()

View File

@ -1,9 +1,10 @@
Continue
========
Discards the current iteration, and restarts the loop using the next item.
Skips the rest of the current iteration, and starts the next one.
```Python
>>> for x in range(6):
... if x == 3:
... continue
@ -14,38 +15,63 @@ Discards the current iteration, and restarts the loop using the next item.
2
4
5
```
```Python
while len(directory_queue) > 0:
directory = directory_queue.popleft()
try:
filenames = os.listdir(directory)
except PermissionError:
continue
for filename in filenames:
...
```
#### Continue is great for cleaning code with lots of conditions:
##### Without continue:
Nested:
```Python
for submission in submissions:
if submission.author is not None:
if submission.over_18 is False:
if not submission.over_18:
if 'suggestion' in submission.title.lower():
print('Found:', submission.id)
```
&nbsp;
or all grouped up:
```Python
for submission in submissions:
if submission.author is not None and submission.over_18 is False and 'suggestion' in submission.title.lower():
if (
submission.author is not None
and not submission.over_18
and 'suggestion' in submission.title.lower()
):
print('Found:', submission.id)
```
##### With continue:
```Python
for submission in submissions:
if submission.author is None:
continue
if submission.over_18:
continue
if 'suggestion' not in submission.title.lower():
continue
print('Found:', submission.id)
```
The mentality changes from "keep only the items with the right properties" to "discard the items with the wrong properties".
Notice that all of the checks are the opposite of the originals. The mentality changes from "keep only the items with the right properties" to "discard the items with the wrong properties", and the result is the same.

View File

@ -12,22 +12,27 @@ Generators are a type of iterable that create their contents on-the-fly. Unlike
Writing a generator looks like writing a function, but instead of `return`, you use `yield`. The object which is yielded is what you'll get when you do a loop over the generator. This one lets you count to a billion:
```Python
def billion():
x = 0
while x < 1000000000:
yield x
x += 1
```
I purposely used a `while` loop instead of `for x in range()` to show the extra work.
Note that, unlike a `return` statement, you can include more code after a `yield` statement. Also notice that generators keep track of their internal state -- the `billion` generator has an `x` that it increments every time you loop over it. You can imagine the code pausing after the `yield` line, and resuming when you come back for the next cycle. Try this with some extra print statements to help visualize.
Generators can also take arguments. Here's a generator that counts to a custom amount:
```Python
def count_to(y):
x = 0
while x < y:
yield x
x += 1
```
&nbsp;
@ -35,23 +40,31 @@ Generators can also take arguments. Here's a generator that counts to a custom a
Although generators look like functions when you're writing them, they feel more like objects when using them. Remember that generators don't calculate their contents until they are actually used in a loop, so simply doing:
```Python
numbers = count_to(100)
```
does **not** create a list of 100 numbers. It creates a new instance of the generator that is ready to be iterated over, like this:
```Python
numbers = count_to(100)
for number in numbers:
print(number)
```
or this:
```Python
for number in count_to(100):
print(number)
```
This should remind you of:
```Python
for number in range(100):
print(number)
```
because the `range` class behaves a lot like a generator ([but not exactly](http://stackoverflow.com/a/13092317)).
@ -66,10 +79,11 @@ To get a single item from a generator without looping, use `next(generator)`.
# StopIteration
Generators pause and resume a lot, but they still flow like normal functions. As long as there is no endless `while` loop inside, they'll come to an end at some point. When a generator is all finished, it will raise a `StopIteration` exception every time you try to do `next()`. Luckily, `for` loops will detect this automatically and stop themselves.
Generators pause and resume a lot, but they still flow like normal functions. As long as there is no endless `while` loop inside, they'll come to an end at some point. When a generator is all finished, it will raise a `StopIteration` exception every time you try to do `next()` on it. Luckily, `for` loops will detect this automatically and stop themselves.
Earlier, I said that generators use `yield` instead of `return`, but in fact you can include a return statement. If it is encountered, it will raise a `StopIteration`, and the generator will not resume even if there is more code.
```Python
>>> def generator():
... yield 1
... return 2
@ -88,6 +102,7 @@ Earlier, I said that generators use `yield` instead of `return`, but in fact you
File "<stdin>", line 1, in <module>
StopIteration
>>>
```
In general, I don't like to use `return` in generators. I prefer to `break` from their internal loops and conclude naturally.
@ -106,6 +121,7 @@ In general, I don't like to use `return` in generators. I prefer to `break` from
Suppose you're getting data from an imaginary website which sends you items in groups of 100. You want to let the user loop over every item without having to worry about the groups themselves.
```Python
def item_generator(url):
page = 0
while True:
@ -130,13 +146,15 @@ Suppose you're getting data from an imaginary website which sends you items in g
comments = item_generator('http://website.com/user/voussoir/comments')
for comment in comments:
print(comment.body)
```
&nbsp;
#### Sqlite3 fetch generator
This is one that I almost always include in my program when I'm doing lots of sqlite work. Sqlite cursors don't allow you to simply do a for-loop over the results of a SELECT, so this generator is very handy:
This is one that I almost always include when I'm doing lots of sqlite work. Sqlite cursors don't allow you to simply do a for-loop over the results of a SELECT, and doing `fetchall` on a large query can be very memory-heavy, so this generator is very handy:
```Python
def fetch_generator(cur):
while True:
item = cur.fetchone()
@ -147,11 +165,12 @@ This is one that I almost always include in my program when I'm doing lots of sq
cur.execute('SELECT * FROM table')
for item in fetch_generator(cur):
print(item)
```
&nbsp;
# Further reading
[Stack Overflow - What are the main uses for `yield from`?](http://stackoverflow.com/questions/9708902/in-practice-what-are-the-main-uses-for-the-new-yield-from-syntax-in-python-3) -- If you like recursive functions, how about recursive generators? The only time I've ever used this is to [iterate over a tree's nodes](https://github.com/voussoir/reddit/blob/2069c3bd731cc8f90401ee49a9fc4d0dbf436cfc/Prawtimestamps/timesearch.py#L756-L761).
[Stack Overflow - What are the main uses for `yield from`?](http://stackoverflow.com/questions/9708902/in-practice-what-are-the-main-uses-for-the-new-yield-from-syntax-in-python-3) &mdash; If you like recursive functions, how about recursive generators?
[Stack Overflow - Python generator `send` function purpose?](http://stackoverflow.com/questions/19302530/python-generator-send-function-purpose) -- This quickly dives out of "quick tips" territory.
[Stack Overflow - Python generator `send` function purpose?](http://stackoverflow.com/questions/19302530/python-generator-send-function-purpose) &mdash; This quickly dives out of "quick tips" territory.

View File

@ -5,6 +5,7 @@ When using Tkinter alone, you can only embed .gif images in your interface. PIL
Requires `pip install pillow`
```Python
import PIL.Image
import PIL.ImageTk
import tkinter
@ -15,6 +16,7 @@ Requires `pip install pillow`
label = tkinter.Label(t, image=image_tk)
label.image_reference = image_tk
label.pack()
```
You must store the `image_tk` somewhere, such as an attribute of the label it belongs to. Otherwise, it gets [prematurely garbage-collected](http://effbot.org/pyfaq/why-do-my-tkinter-images-not-appear.htm).

View File

@ -1,5 +1,6 @@
import math
import math
import random
import shutil
import string
import threading
import time
@ -12,7 +13,7 @@ import tkinter
# 0, 90, 180, 270
# ░▒▓
SCREEN_WIDTH = 114
# get_terminal_size() returns (columns, lines); the width is the column count,
# so read `.columns` rather than index 1 (which is the height in lines).
SCREEN_WIDTH = shutil.get_terminal_size().columns - 6
DEFAULT_LINE = {
'character': '#',
@ -27,7 +28,7 @@ DEFAULT_LINE = {
variables = {
'clock': 0,
'frames':[],
'delay': 0.01,
'delay': 0.02,
'lines':[
]
}

8
Templates/unittester.py Normal file
View File

@ -0,0 +1,8 @@
import unittest
# Skeleton test case for this template file: replace test_something with
# real test methods.
class Tests(unittest.TestCase):
    def test_something(self):
        # Placeholder body; does nothing.
        pass
if __name__ == '__main__':
unittest.main()

15
ThreadQueue/test.py Normal file
View File

@ -0,0 +1,15 @@
import time
import threadqueue
import random
import threading
# Manual smoke test: queue 20 jobs on 4 threads. Each job sleeps for a random
# time, asks the main thread to run a callback on its behalf, and then raises.
t = threadqueue.ThreadQueue(4, print)
main_thr = threading.current_thread().ident


def f():
    mysleep = random.randint(1, 10)
    time.sleep(mysleep)

    def report_thread():
        # Prints True when this callback is executing on the main thread.
        print(threading.current_thread().ident==main_thr)

    t.behalf(main_thr, report_thread)
    raise ValueError()
    # Unreachable while the raise above is in place.
    return mysleep


for x in range(20):
    t.add(f)

for returned in t.run():
    pass

View File

@ -0,0 +1,61 @@
import threading
import time
class ThreadQueue:
    '''
    Run queued callables on at most `thread_count` worker threads at a time,
    yielding their return values from `run`.

    Worker threads can use `behalf` to have a function executed by another
    thread (typically the one iterating `run`) and wait for its result.
    '''
    # How long `run` sleeps when a pass of its loop found nothing to do, so
    # that waiting for workers does not spin the CPU.
    IDLE_SLEEP = 0.001

    def __init__(self, thread_count, post_processor=None):
        '''
        thread_count:
            Maximum number of worker threads alive at once.

        post_processor:
            Optional callable invoked, on the worker thread, with each
            job's return value before that value is queued for `run`.
        '''
        self.thread_count = thread_count
        self.post_processor = post_processor
        self._returns = []
        self._threads = []
        self._lambdas = []
        self._behalfs = {}
        self.hold_open = False

    def _post_process(self, returned_value):
        '''
        Pass a finished job's return value to the post_processor, if any,
        then store it to be yielded by `run`.
        '''
        if self.post_processor is not None:
            self.post_processor(returned_value)
        self._returns.append(returned_value)

    def add(self, function, *function_args, **function_kwargs):
        '''
        Queue `function(*function_args, **function_kwargs)` to be executed
        on a worker thread once `run` has a free slot for it.
        '''
        lam = lambda: self._post_process(function(*function_args, **function_kwargs))
        self._lambdas.append(lam)

    def behalf(self, thread_id, f, *args, **kwargs):
        '''
        Have the thread identified by `thread_id` call `f(*args, **kwargs)`
        and return the result, blocking until that thread has serviced the
        request via `run_behalfs`.

        If `f` raises an Exception, it is re-raised here in the requesting
        thread. If the caller already is the target thread, `f` is called
        directly, because waiting on ourselves would never finish.
        '''
        if thread_id == threading.current_thread().ident:
            return f(*args, **kwargs)

        event = threading.Event()
        call = {
            'f': f,
            'args': args,
            'kwargs': kwargs,
            'event': event,
            'return': None,
            'exception': None,
        }
        self._behalfs.setdefault(thread_id, []).append(call)
        event.wait()
        if call['exception'] is not None:
            raise call['exception']
        return call['return']

    def run_behalfs(self):
        '''
        Execute every call that other threads have requested of the current
        thread through `behalf`, and return how many were serviced.

        The requester is always released, even when the call fails, so that
        it cannot be left waiting forever. Exceptions are handed back to the
        requester instead of being raised here.
        '''
        calls = self._behalfs.get(threading.current_thread().ident, [])
        serviced = 0
        while calls:
            call = calls.pop(0)
            try:
                call['return'] = call['f'](*call['args'], **call['kwargs'])
            except Exception as exc:
                call['exception'] = exc
            finally:
                call['event'].set()
            serviced += 1
        return serviced

    def run_queue(self):
        '''
        Forget threads that have finished, then start queued jobs until
        `thread_count` threads are alive or the queue is empty.
        '''
        self._threads = [thread for thread in self._threads if thread.is_alive()]
        threads_needed = self.thread_count - len(self._threads)
        # range() of a non-positive number is empty, so no extra guard is needed.
        for x in range(threads_needed):
            if not self._lambdas:
                break
            lam = self._lambdas.pop(0)
            thread = threading.Thread(target=lam)
            thread.start()
            self._threads.append(thread)

    def run(self, hold_open=False):
        '''
        Generator which starts the queued jobs and yields their return
        values as they become available, servicing `behalf` requests made
        of the calling thread in between.

        hold_open:
            If True, keep looping even when there are no threads or queued
            jobs left, until `self.hold_open` is set to False.
        '''
        self.hold_open = hold_open
        while self.hold_open or self._threads or self._lambdas:
            # Dead threads are pruned before the returns are drained, so a
            # value stored by a thread that just finished is never missed.
            self.run_queue()
            was_busy = bool(self._returns)
            while self._returns:
                yield self._returns.pop(0)
            if self.run_behalfs():
                was_busy = True
            if not was_busy:
                time.sleep(self.IDLE_SLEEP)

View File

@ -38,7 +38,7 @@ def threaded_dl(urls, thread_count, filename_format=None):
if filename_format != os.devnull:
if '{' not in filename_format and len(urls) > 1:
filename_format += '_{index}'
if '{extension}' not in filename_format:
if '{extension}' not in filename_format and '{basename}' not in filename_format:
filename_format += '{extension}'
now = int(time.time())
for (index, url) in enumerate(urls):

View File

@ -15,8 +15,18 @@ from voussoirkit import safeprint
from voussoirkit import spinal
def fileswith(filepattern, terms, do_regex=False, do_glob=False):
search_terms = [term.lower() for term in terms]
def fileswith(
filepattern,
terms,
case_sensitive=False,
do_regex=False,
do_glob=False,
inverse=False,
match_any=False,
):
if not case_sensitive:
terms = [term.lower() for term in terms]
def term_matches(text, term):
return (
@ -25,6 +35,8 @@ def fileswith(filepattern, terms, do_regex=False, do_glob=False):
(do_glob and fnmatch.fnmatch(text, term))
)
anyall = any if match_any else all
generator = spinal.walk_generator(depth_first=False, yield_directories=True)
for filepath in generator:
@ -35,8 +47,12 @@ def fileswith(filepattern, terms, do_regex=False, do_glob=False):
try:
with handle:
for (index, line) in enumerate(handle):
if all(term_matches(line, term) for term in terms):
line = '%d | %s' % (index, line.strip())
if not case_sensitive:
compare_line = line.lower()
else:
compare_line = line
if inverse ^ anyall(term_matches(compare_line, term) for term in terms):
line = '%d | %s' % (index+1, line.strip())
matches.append(line)
except:
pass
@ -50,8 +66,11 @@ def fileswith_argparse(args):
return fileswith(
filepattern=args.filepattern,
terms=args.search_terms,
case_sensitive=args.case_sensitive,
do_glob=args.do_glob,
do_regex=args.do_regex,
inverse=args.inverse,
match_any=args.match_any,
)
def main(argv):
@ -59,8 +78,11 @@ def main(argv):
parser.add_argument('filepattern')
parser.add_argument('search_terms', nargs='+', default=None)
parser.add_argument('--any', dest='match_any', action='store_true')
parser.add_argument('--case', dest='case_sensitive', action='store_true')
parser.add_argument('--regex', dest='do_regex', action='store_true')
parser.add_argument('--glob', dest='do_glob', action='store_true')
parser.add_argument('--inverse', dest='inverse', action='store_true')
parser.set_defaults(func=fileswith_argparse)
args = parser.parse_args(argv)

View File

@ -13,6 +13,7 @@ def search(
case_sensitive=False,
do_regex=False,
do_glob=False,
inverse=False,
local_only=False,
match_any=False,
):
@ -23,10 +24,8 @@ def search(
(do_glob and fnmatch.fnmatch(text, term))
)
if case_sensitive:
search_terms = terms
else:
search_terms = [term.lower() for term in terms]
if not case_sensitive:
terms = [term.lower() for term in terms]
anyall = any if match_any else all
@ -40,7 +39,8 @@ def search(
if not case_sensitive:
basename = basename.lower()
if anyall(term_matches(basename, term) for term in search_terms):
matches = anyall(term_matches(basename, term) for term in terms)
if matches ^ inverse:
safeprint.safeprint(filepath.absolute_path)
@ -50,6 +50,7 @@ def search_argparse(args):
case_sensitive=args.case_sensitive,
do_glob=args.do_glob,
do_regex=args.do_regex,
inverse=args.inverse,
local_only=args.local_only,
match_any=args.match_any,
)
@ -59,10 +60,11 @@ def main(argv):
parser.add_argument('search_terms', nargs='+', default=None)
parser.add_argument('--any', dest='match_any', action='store_true')
parser.add_argument('--case', dest='case_sensitive', action='store_true')
parser.add_argument('--regex', dest='do_regex', action='store_true')
parser.add_argument('--glob', dest='do_glob', action='store_true')
parser.add_argument('--case', dest='case_sensitive', action='store_true')
parser.add_argument('--local', dest='local_only', action='store_true')
parser.add_argument('--inverse', dest='inverse', action='store_true')
parser.set_defaults(func=search_argparse)
args = parser.parse_args(argv)