else
This commit is contained in:
parent
5e88950156
commit
ef15e1a644
6 changed files with 367 additions and 116 deletions
|
@ -7,7 +7,7 @@
|
|||
|
||||
<body>
|
||||
<div id="control_panel">
|
||||
<input type="text" id="subreddit_field" placeholder="learnpython">
|
||||
<input type="text" id="subreddit_field" placeholder="/r/learnpython">
|
||||
<button id="start_button" onclick="start()">Start</button>
|
||||
<a id="browser_link"></a>
|
||||
<button id="clear_button" onclick="clear_workspace()">Clear workspace</button>
|
||||
|
@ -32,14 +32,14 @@ body
|
|||
margin-right: 10%;
|
||||
padding: 5px;
|
||||
}
|
||||
|
||||
|
||||
#control_panel
|
||||
{
|
||||
background-color: #284142;
|
||||
padding: 5px;
|
||||
}
|
||||
#workspace
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
a
|
||||
{
|
||||
|
@ -53,11 +53,26 @@ a:hover
|
|||
text-decoration: underline;
|
||||
}
|
||||
|
||||
.delete_button
|
||||
{
|
||||
position: absolute;
|
||||
top: 0px;
|
||||
right: 0px;
|
||||
height: 100%;
|
||||
width: 5%;
|
||||
|
||||
border: 0px;
|
||||
background-color: #b53030;
|
||||
}
|
||||
|
||||
.submission
|
||||
{
|
||||
position: relative;
|
||||
|
||||
padding: 10px;
|
||||
padding-top: 20px;
|
||||
padding-bottom: 20px;
|
||||
padding-right: 6%; /*for the delete button*/
|
||||
|
||||
margin: 10px;
|
||||
margin-top: 20px;
|
||||
|
@ -76,8 +91,21 @@ a:hover
|
|||
*/
|
||||
|
||||
var CHECK_DELAY = 30 * 1000;
|
||||
var CONTROL_PANEL = document.getElementById("control_panel");
|
||||
CONTROL_PANEL.default_color = CONTROL_PANEL.style.backgroundColor;
|
||||
var WORKSPACE = document.getElementById("workspace");
|
||||
|
||||
var id_cache = [];
|
||||
var id_cache_size = 20;
|
||||
|
||||
var first_loop = true;
|
||||
var unread_notification_count = 0;
|
||||
var subreddit = "";
|
||||
var check_timer = null;
|
||||
|
||||
var page_focused_cached;
|
||||
|
||||
|
||||
var HTTPClient = function()
|
||||
{
|
||||
/* Thanks ttgagne http://stackoverflow.com/a/22076667 */
|
||||
|
@ -87,9 +115,19 @@ var HTTPClient = function()
|
|||
var request = new XMLHttpRequest();
|
||||
request.onreadystatechange = function()
|
||||
{
|
||||
if (request.readyState == 4 && request.status == 200)
|
||||
// console.log(request.readyState);
|
||||
// console.log(request.status);
|
||||
if (request.readyState == 4)
|
||||
{
|
||||
callback(request.responseText);
|
||||
if (request.status == 200)
|
||||
{
|
||||
CONTROL_PANEL.style.backgroundColor = CONTROL_PANEL.default_color;
|
||||
callback(request.responseText);
|
||||
}
|
||||
else
|
||||
{
|
||||
CONTROL_PANEL.style.backgroundColor = "#f00";
|
||||
}
|
||||
}
|
||||
}
|
||||
request.open("GET", url, asynchronous);
|
||||
|
@ -98,21 +136,21 @@ var HTTPClient = function()
|
|||
}
|
||||
}
|
||||
|
||||
function apply_to_page(text)
|
||||
function apply_to_page(response_json)
|
||||
{
|
||||
var j = JSON.parse(text);
|
||||
var j = JSON.parse(response_json);
|
||||
var submissions = j["data"]["children"];
|
||||
submissions.reverse(); // newest last
|
||||
var new_items = 0;
|
||||
for (var index = 0; index < submissions.length; index += 1)
|
||||
{
|
||||
var submission = submissions[index]["data"];
|
||||
if (done_ids.has(submission["id"]))
|
||||
if (id_cache.includes(submission["id"]))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
done_ids.add(submission["id"]);
|
||||
id_cache.push(submission["id"]);
|
||||
|
||||
if (first_loop)
|
||||
{
|
||||
|
@ -128,11 +166,21 @@ function apply_to_page(text)
|
|||
anchor.href = "https://reddit.com/r/" + submission["subreddit"] + "/comments/" + submission["id"];
|
||||
anchor.target = "_blank";
|
||||
|
||||
var delete_button = document.createElement("button")
|
||||
delete_button.className = "delete_button";
|
||||
delete_button.div = div;
|
||||
delete_button.innerHTML = "X";
|
||||
delete_button.onclick = function()
|
||||
{
|
||||
this.div.parentElement.removeChild(this.div);
|
||||
}
|
||||
|
||||
var timestamp = document.createElement("span");
|
||||
var submission_time = new Date(submission["created_utc"])
|
||||
timestamp.innerHTML = "" + submission_time.getHours() + ":" + submission_time.getMinutes();
|
||||
|
||||
div.appendChild(anchor);
|
||||
div.appendChild(delete_button);
|
||||
//WORKSPACE.insertBefore(div, WORKSPACE.firstChild);
|
||||
WORKSPACE.appendChild(div);
|
||||
}
|
||||
|
@ -142,6 +190,12 @@ function apply_to_page(text)
|
|||
unread_notification_count += new_items;
|
||||
update_title();
|
||||
}
|
||||
|
||||
// Evict the oldest ids once the cache has grown past id_cache_size.
// The comparison has to be `>`: with `<`, an under-full cache is shifted
// until it is empty and the loop then never terminates (shift() on an empty
// array leaves length at 0), while an over-full cache is never trimmed.
while (id_cache.length > id_cache_size)
{
    id_cache.shift();
}
|
||||
|
||||
first_loop = false;
|
||||
}
|
||||
|
||||
|
@ -160,7 +214,7 @@ function check_once()
|
|||
console.log("no subreddit");
|
||||
return;
|
||||
}
|
||||
var url = "https://api.reddit.com/r/" + subreddit + "/new.json";
|
||||
var url = "https://api.reddit.com" + subreddit + "/new.json";
|
||||
session.get(url, apply_to_page);
|
||||
}
|
||||
|
||||
|
@ -196,6 +250,18 @@ function page_focused_fresh()
|
|||
return page_focused_cached;
|
||||
}
|
||||
|
||||
/*
Comparator for Array.prototype.sort that orders submission objects by their
"created_utc" value, ascending (oldest first).

Returns -1 when submission_1 is older, 1 when it is newer, and 0 when both
have the same "created_utc".
*/
function sort_submission_comparator(submission_1, submission_2)
{
    // Declared with `var` so they stay local. Assigning to undeclared names
    // leaks globals in sloppy mode and throws in strict / module code.
    var created_1 = submission_1["created_utc"];
    var created_2 = submission_2["created_utc"];

    if (created_1 < created_2)
        {return -1;}
    if (created_1 > created_2)
        {return 1;}
    return 0;
}
|
||||
|
||||
function start()
|
||||
{
|
||||
console.log("start");
|
||||
|
@ -203,10 +269,14 @@ function start()
|
|||
clear_workspace();
|
||||
var field = document.getElementById("subreddit_field");
|
||||
var text = field.value;
|
||||
text = text.replace("/r/", "").replace("r/", "");
|
||||
text = text.replace("/u/", "/user/");
|
||||
if (text.indexOf("/") == -1)
|
||||
{
|
||||
text = "/r/" + text;
|
||||
}
|
||||
subreddit = text;
|
||||
var link = document.getElementById("browser_link");
|
||||
var url = "https://reddit.com/r/" + subreddit + "/new";
|
||||
var url = "https://reddit.com" + subreddit + "/new";
|
||||
link.href = url;
|
||||
link.innerHTML = url;
|
||||
update_title();
|
||||
|
@ -242,14 +312,7 @@ function visibility_property()
|
|||
return null;
|
||||
}
|
||||
|
||||
var done_ids = new Set();
|
||||
var first_loop = true;
|
||||
|
||||
var unread_notification_count = 0;
|
||||
var subreddit = "";
|
||||
var check_timer = null;
|
||||
|
||||
var page_focused_cached;
|
||||
page_focused_fresh();
|
||||
|
||||
var my_visibility_property = visibility_property();
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
Open Dir DL
|
||||
===========
|
||||
|
||||
- 2016 07 08
|
||||
- Fixed bug in which trees wouldn't generate on server:port urls.
|
||||
|
||||
- 2016 07 04
|
||||
- Added new argparse command "tree"
|
||||
|
||||
|
|
|
@ -80,9 +80,14 @@ measure:
|
|||
|
||||
flags:
|
||||
-f | --fullscan:
|
||||
When included, perform HEAD requests when a file's size is not known.
|
||||
If this flag is not included, and some file's size is unkown, you will
|
||||
receive a printed note.
|
||||
When included, perform HEAD requests on all files to update their size.
|
||||
|
||||
-n | --new_only:
|
||||
When included, perform HEAD requests only on files that haven't gotten one
|
||||
yet.
|
||||
|
||||
If a file's size is not known by the time this operation completes, you will
|
||||
receive a printed note.
|
||||
|
||||
tree:
|
||||
Print the file / folder tree.
|
||||
|
@ -120,6 +125,7 @@ import requests
|
|||
import shutil
|
||||
import sqlite3
|
||||
## ~tkinter
|
||||
import traceback
|
||||
import urllib.parse
|
||||
|
||||
FILENAME_BADCHARS = '/\\:*?"<>|'
|
||||
|
@ -166,6 +172,7 @@ SKIPPABLE_FILETYPES = [
|
|||
'.tar',
|
||||
'.ttf',
|
||||
'.txt',
|
||||
'.wav',
|
||||
'.webm',
|
||||
'.wma',
|
||||
'.zip',
|
||||
|
@ -229,6 +236,11 @@ SQL_CONTENT_TYPE = 3
|
|||
SQL_DO_DOWNLOAD = 4
|
||||
|
||||
|
||||
UNMEASURED_WARNING = '''
|
||||
Note: %d files do not have a stored Content-Length.
|
||||
Run `measure` with `-f`|`--fullscan` or `-n`|`--new_only` to HEAD request those files.
|
||||
'''.strip()
|
||||
|
||||
## DOWNLOADER ######################################################################################
|
||||
## ##
|
||||
class Downloader:
|
||||
|
@ -293,6 +305,12 @@ class Generic:
|
|||
setattr(self, kwarg, kwargs[kwarg])
|
||||
|
||||
|
||||
class TreeExistingChild(Exception):
    '''
    Raised when a node already has a child with the requested identifier.
    '''
|
||||
|
||||
class TreeInvalidIdentifier(Exception):
    '''
    Raised when a child identifier is not acceptable, such as a non-root
    identifier containing a colon.
    '''
|
||||
|
||||
class TreeNode:
|
||||
def __init__(self, identifier, data, parent=None):
|
||||
assert isinstance(identifier, str)
|
||||
|
@ -329,9 +347,9 @@ class TreeNode:
|
|||
|
||||
def check_child_availability(self, identifier):
|
||||
if ':' in identifier:
|
||||
raise Exception('Only roots may have a colon')
|
||||
raise TreeInvalidIdentifier('Only roots may have a colon')
|
||||
if identifier in self.children:
|
||||
raise Exception('Node %s already has child %s' % (self.identifier, identifier))
|
||||
raise TreeExistingChild('Node %s already has child %s' % (self.identifier, identifier))
|
||||
|
||||
def detach(self):
|
||||
del self.parent.children[self.identifier]
|
||||
|
@ -363,8 +381,11 @@ class TreeNode:
|
|||
for node in self.walk(customsort):
|
||||
print(node.abspath())
|
||||
|
||||
def sorted_children(self):
|
||||
keys = sorted(self.children.keys())
|
||||
def sorted_children(self, customsort=None):
|
||||
if customsort:
|
||||
keys = sorted(self.children.keys(), key=customsort)
|
||||
else:
|
||||
keys = sorted(self.children.keys())
|
||||
for key in keys:
|
||||
yield (key, self.children[key])
|
||||
|
||||
|
@ -388,7 +409,7 @@ class Walker:
|
|||
if databasename is None or databasename == "":
|
||||
self.domain = url_to_filepath(walkurl)['root']
|
||||
databasename = self.domain + '.db'
|
||||
databasename = databasename.replace(':', '')
|
||||
databasename = databasename.replace(':', '#')
|
||||
self.databasename = databasename
|
||||
|
||||
safeprint('Opening %s' % self.databasename)
|
||||
|
@ -451,6 +472,10 @@ class Walker:
|
|||
else:
|
||||
url = urllib.parse.urljoin(self.walkurl, url)
|
||||
|
||||
if url in self.seen_directories:
|
||||
# We already picked this up at some point
|
||||
return
|
||||
|
||||
if not url.startswith(self.walkurl):
|
||||
# Don't follow external links or parent directory.
|
||||
safeprint('Skipping "%s" due to external url.' % url)
|
||||
|
@ -480,11 +505,14 @@ class Walker:
|
|||
return
|
||||
raise
|
||||
content_type = head.headers.get('Content-Type', '?')
|
||||
|
||||
if content_type.startswith('text/html') and head.url.endswith('/'):
|
||||
#print(content_type)
|
||||
if content_type.startswith('text/html'):# and head.url.endswith('/'):
|
||||
# This is an index page, so extract links and queue them.
|
||||
response = do_get(url)
|
||||
hrefs = self.extract_hrefs(response)
|
||||
# Just in case the URL we used is different than the real one,
|
||||
# such as missing a trailing slash, add both.
|
||||
self.seen_directories.add(url)
|
||||
self.seen_directories.add(head.url)
|
||||
added = 0
|
||||
for href in hrefs:
|
||||
|
@ -660,6 +688,7 @@ def smart_insert(sql, cur, url=None, head=None, commit=True):
|
|||
if is_new:
|
||||
cur.execute('INSERT INTO urls VALUES(?, ?, ?, ?, ?)', data)
|
||||
else:
|
||||
print(url)
|
||||
command = '''
|
||||
UPDATE urls SET
|
||||
content_length = coalesce(?, content_length),
|
||||
|
@ -832,34 +861,37 @@ def list_basenames_argparse(args):
|
|||
outputfile=args.outputfile,
|
||||
)
|
||||
|
||||
def measure(databasename, fullscan=False):
|
||||
def measure(databasename, fullscan=False, new_only=False):
|
||||
'''
|
||||
Given a database, print the sum of all Content-Lengths.
|
||||
If `fullscan`, then URLs with no Content-Length will be
|
||||
HEAD requested, and the result will be saved back into the file.
|
||||
URLs will be HEAD requested if:
|
||||
`new_only` is True and the file has no stored content length, or
|
||||
`fullscan` is True and `new_only` is False
|
||||
'''
|
||||
if isinstance(fullscan, str):
|
||||
fullscan = bool(fullscan)
|
||||
|
||||
totalsize = 0
|
||||
sql = sqlite3.connect(databasename)
|
||||
cur1 = sql.cursor()
|
||||
cur2 = sql.cursor()
|
||||
cur2.execute('SELECT * FROM urls WHERE do_download == 1')
|
||||
cur = sql.cursor()
|
||||
|
||||
if new_only:
|
||||
cur.execute('SELECT * FROM urls WHERE do_download == 1 AND content_length IS NULL')
|
||||
else:
|
||||
cur.execute('SELECT * FROM urls WHERE do_download == 1')
|
||||
|
||||
items = cur.fetchall()
|
||||
|
||||
filecount = 0
|
||||
unmeasured_file_count = 0
|
||||
try:
|
||||
while True:
|
||||
fetch = cur2.fetchone()
|
||||
if fetch is None:
|
||||
break
|
||||
|
||||
for fetch in items:
|
||||
size = fetch[SQL_CONTENT_LENGTH]
|
||||
|
||||
if fullscan:
|
||||
if fullscan or new_only:
|
||||
url = fetch[SQL_URL]
|
||||
head = do_head(url, raise_for_status=False)
|
||||
fetch = smart_insert(sql, cur1, head=head, commit=False)
|
||||
fetch = smart_insert(sql, cur, head=head, commit=True)
|
||||
size = fetch[SQL_CONTENT_LENGTH]
|
||||
if size is None:
|
||||
safeprint('"%s" is not revealing Content-Length' % url)
|
||||
|
@ -881,14 +913,14 @@ def measure(databasename, fullscan=False):
|
|||
totalsize_string = '{} ({:,} bytes) in {:,} files'.format(short_string, totalsize, filecount)
|
||||
print(totalsize_string)
|
||||
if unmeasured_file_count > 0:
|
||||
print('Note: %d files do not have a stored Content-Length.' % unmeasured_file_count)
|
||||
print('Run `measure` with `-f` or `--fullscan` to HEAD request those files.')
|
||||
print(UNMEASURED_WARNING % unmeasured_file_count)
|
||||
return totalsize
|
||||
|
||||
def measure_argparse(args):
    '''
    Argparse adapter for `measure`: unpack the parsed namespace and forward
    the database name plus the `fullscan` and `new_only` flags.
    '''
    return measure(args.databasename, fullscan=args.fullscan, new_only=args.new_only)
|
||||
|
||||
def remove_pattern(args):
|
||||
|
@ -913,9 +945,13 @@ def tree(databasename, output_filename=None):
|
|||
|
||||
path_parts = url_to_filepath(items[0][SQL_URL])
|
||||
root_identifier = path_parts['root']
|
||||
#print('Root', root_identifier)
|
||||
print('Root', root_identifier)
|
||||
root_data = {'name': root_identifier, 'item_type': 'directory'}
|
||||
tree = TreeNode(identifier=root_identifier, data=root_data)
|
||||
root_identifier = root_identifier.replace(':', '')
|
||||
tree = TreeNode(
|
||||
identifier=root_identifier,
|
||||
data=root_data
|
||||
)
|
||||
node_map = {}
|
||||
|
||||
unmeasured_file_count = 0
|
||||
|
@ -923,20 +959,27 @@ def tree(databasename, output_filename=None):
|
|||
for item in items:
|
||||
path = url_to_filepath(item[SQL_URL])
|
||||
scheme = path['scheme']
|
||||
|
||||
# I join and re-split because 'folder' may contain slashes of its own
|
||||
# and I want to break all the pieces
|
||||
path = '\\'.join([path['root'], path['folder'], path['filename']])
|
||||
parts = path.split('\\')
|
||||
#print(path)
|
||||
for (index, part) in enumerate(parts):
|
||||
index += 1
|
||||
this_path = '/'.join(parts[:index])
|
||||
parent_path = '/'.join(parts[:index-1])
|
||||
this_path = '/'.join(parts[:index + 1])
|
||||
parent_path = '/'.join(parts[:index])
|
||||
#safeprint('this:' + this_path)
|
||||
#safeprint('parent:' + parent_path)
|
||||
|
||||
#input()
|
||||
data = {
|
||||
'name': part,
|
||||
'url': scheme + '://' + this_path,
|
||||
}
|
||||
if index == len(parts):
|
||||
this_identifier = this_path.replace(':', '')
|
||||
parent_identifier = parent_path.replace(':', '')
|
||||
|
||||
if (index + 1) == len(parts):
|
||||
data['item_type'] = 'file'
|
||||
if item[SQL_CONTENT_LENGTH]:
|
||||
data['size'] = item[SQL_CONTENT_LENGTH]
|
||||
|
@ -948,29 +991,29 @@ def tree(databasename, output_filename=None):
|
|||
|
||||
|
||||
# Ensure this comment is in a node of its own
|
||||
this_node = node_map.get(this_path, None)
|
||||
this_node = node_map.get(this_identifier, None)
|
||||
if this_node:
|
||||
# This ID was detected as a parent of a previous iteration
|
||||
# Now we're actually filling it in.
|
||||
this_node.data = data
|
||||
else:
|
||||
this_node = TreeNode(this_path, data)
|
||||
node_map[this_path] = this_node
|
||||
this_node = TreeNode(this_identifier, data)
|
||||
node_map[this_identifier] = this_node
|
||||
|
||||
# Attach this node to the parent.
|
||||
if parent_path == root_identifier:
|
||||
if parent_identifier == root_identifier:
|
||||
try:
|
||||
tree.add_child(this_node)
|
||||
except:
|
||||
except TreeExistingChild:
|
||||
pass
|
||||
else:
|
||||
parent_node = node_map.get(parent_path, None)
|
||||
parent_node = node_map.get(parent_identifier, None)
|
||||
if not parent_node:
|
||||
parent_node = TreeNode(parent_path, data=None)
|
||||
node_map[parent_path] = parent_node
|
||||
parent_node = TreeNode(parent_identifier, data=None)
|
||||
node_map[parent_identifier] = parent_node
|
||||
try:
|
||||
parent_node.add_child(this_node)
|
||||
except:
|
||||
except TreeExistingChild:
|
||||
pass
|
||||
this_node.parent = parent_node
|
||||
#print(this_node.data)
|
||||
|
@ -997,7 +1040,7 @@ def tree(databasename, output_filename=None):
|
|||
if node.data['item_type'] == 'directory':
|
||||
div_id = hashit(node.identifier, 16)
|
||||
line = '<button onclick="collapse(\'{div_id}\')">{name} ({size})</button>'
|
||||
line += '<div id="{div_id}">'
|
||||
line += '<div id="{div_id}" style="display:none">'
|
||||
line = line.format(
|
||||
div_id=div_id,
|
||||
name=node.data['name'],
|
||||
|
@ -1020,7 +1063,8 @@ def tree(databasename, output_filename=None):
|
|||
)
|
||||
write(line, outfile)
|
||||
|
||||
for (key, child) in node.sorted_children():
|
||||
customsort = lambda x: (node.children[x].data['item_type'] == 'file', node.children[x].data['url'].lower())
|
||||
for (key, child) in node.sorted_children(customsort=customsort):
|
||||
recursive_print_node(child, depth+1, outfile=outfile)
|
||||
|
||||
if node.data['item_type'] == 'directory':
|
||||
|
@ -1030,19 +1074,22 @@ def tree(databasename, output_filename=None):
|
|||
# This helps put some space between sibling directories
|
||||
write('| ' * (depth), outfile)
|
||||
|
||||
recursive_get_size(tree)
|
||||
use_html = output_filename.lower().endswith('.html')
|
||||
|
||||
if output_filename is not None:
|
||||
output_file = open(output_filename, 'w', encoding='utf-8')
|
||||
use_html = output_filename.lower().endswith('.html')
|
||||
else:
|
||||
output_file = None
|
||||
use_html = False
|
||||
|
||||
|
||||
if use_html:
|
||||
write(HTML_TREE_HEADER, outfile=output_file)
|
||||
|
||||
recursive_get_size(tree)
|
||||
recursive_print_node(tree, outfile=output_file)
|
||||
if unmeasured_file_count > 0:
|
||||
write('Note: %d files do not have a stored Content-Length.' % unmeasured_file_count, outfile=output_file)
|
||||
write('Run `measure` with `-f` or `--fullscan` to HEAD request those files.', outfile=output_file)
|
||||
write(UNMEASURED_WARNING % unmeasured_file_count, outfile=output_file)
|
||||
|
||||
if output_file is not None:
|
||||
output_file.close()
|
||||
|
@ -1091,6 +1138,7 @@ if __name__ == '__main__':
|
|||
p_measure = subparsers.add_parser('measure')
|
||||
p_measure.add_argument('databasename')
|
||||
p_measure.add_argument('-f', '--fullscan', dest='fullscan', action='store_true')
|
||||
p_measure.add_argument('-n', '--new_only', dest='new_only', action='store_true')
|
||||
p_measure.set_defaults(func=measure_argparse)
|
||||
|
||||
p_remove_pattern = subparsers.add_parser('remove_pattern')
|
||||
|
|
|
@ -4,12 +4,13 @@ Generators
|
|||
|
||||
# What are they
|
||||
|
||||
Generators are a type of iterable that create their contents on-the-fly. Unlike a list, whose entire contents are available before beginning any loops or manipulations, generators don't know how many items they will produce or when they will stop.
|
||||
Generators are a type of iterable that create their contents on-the-fly. Unlike a list, whose entire contents are available before beginning any loops or manipulations, generators don't know how many items they will produce or when they will stop. They can even go on forever.
|
||||
|
||||
|
||||
|
||||
# Writing one
|
||||
|
||||
Writing a generator looks like writing a function, but instead of `return`, you use `yield`. The object which is yielded is what you'll get when you do a loop over the generator. This generator lets you count to a billion:
|
||||
Writing a generator looks like writing a function, but instead of `return`, you use `yield`. The object which is yielded is what you'll get when you do a loop over the generator. This one lets you count to a billion:
|
||||
|
||||
def billion():
|
||||
x = 0
|
||||
|
@ -17,41 +18,92 @@ Writing a generator looks like writing a function, but instead of `return`, you
|
|||
yield x
|
||||
x += 1
|
||||
|
||||
Note that, unlike a `return` statement, you can include more code after a `yield` statement. Also notice that generators keep track of their internal state. The `billion` generator has an `x` that it increments every time you loop over it.
|
||||
Note that, unlike a `return` statement, you can include more code after a `yield` statement. Also notice that generators keep track of their internal state -- the `billion` generator has an `x` that it increments every time you loop over it. You can imagine the code pausing after the `yield` line, and resuming when you come back for the next cycle. Try this with some extra print statements to help visualize.
|
||||
|
||||
Generators can also take arguments. Here's a generator that counts to a custom amount:
|
||||
|
||||
def count_to(y):
|
||||
x = 0
|
||||
while x < y:
|
||||
yield x
|
||||
x += 1
|
||||
|
||||
|
||||
|
||||
|
||||
# Using one
|
||||
|
||||
Although generators look like a function when you're writing them, they feel more like objects when using them. Remember that generators don't calculate their contents until they are actually used in a loop, so simply doing:
|
||||
Although generators look like functions when you're writing them, they feel more like objects when using them. Remember that generators don't calculate their contents until they are actually used in a loop, so simply doing:
|
||||
|
||||
numbers = billion()
|
||||
numbers = count_to(100)
|
||||
|
||||
does **not** create a list of a billion numbers. It creates a new instance of the generator that is ready to be iterated over, like this:
|
||||
does **not** create a list of 100 numbers. It creates a new instance of the generator that is ready to be iterated over, like this:
|
||||
|
||||
numbers = billion()
|
||||
numbers = count_to(100)
|
||||
for number in numbers:
|
||||
print(number)
|
||||
|
||||
This might remind you of:
|
||||
or this:
|
||||
|
||||
for number in range(1000000000):
|
||||
for number in count_to(100):
|
||||
print(number)
|
||||
|
||||
because `range` is simply a generator.
|
||||
This should remind you of:
|
||||
|
||||
for number in range(100):
|
||||
print(number)
|
||||
|
||||
because the `range` class behaves a lot like a generator ([but not exactly](http://stackoverflow.com/a/13092317)).
|
||||
|
||||
|
||||
Generators are excellent for cases where using a list is infeasible or unnecessary. If you wanted to count to a billion using a list, you would first have to create a list of every number, which is a huge waste of time and memory. With a generator, the item is created, used, and trashed.
|
||||
Generators are excellent for cases where using a list is infeasible or unnecessary. In order to loop over a list, you have to generate the entire thing first. If you're only going to use each item once, storing the entire list can be a big memory waste when a generator could take its place. With a generator, the items are created, used, and trashed, so memory can be recycled.
|
||||
|
||||
Since generators can go on forever, they're great for abstracting out ugly `while` loops, so you can get down to business faster.
|
||||
|
||||
To get a single item from a generator without looping, use `next(generator)`.
|
||||
|
||||
|
||||
|
||||
# StopIteration
|
||||
|
||||
When a generator is all finished, it will raise a `StopIteration` exception every time you try to do `next()`. `for` loops will detect this automatically and stop themselves.
|
||||
Generators pause and resume a lot, but they still flow like normal functions. As long as there is no endless `while` loop inside, they'll come to an end at some point. When a generator is all finished, it will raise a `StopIteration` exception every time you try to do `next()`. Luckily, `for` loops will detect this automatically and stop themselves.
|
||||
|
||||
Earlier, I said that generators use `yield` instead of `return`, but in fact you can include a return statement. If it is encountered, it will raise a `StopIteration`, and the generator will not resume even if there is more code.
|
||||
|
||||
>>> def generator():
|
||||
... yield 1
|
||||
... return 2
|
||||
... yield 3
|
||||
...
|
||||
>>>
|
||||
>>> g = generator()
|
||||
>>> next(g)
|
||||
1
|
||||
>>> next(g)
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
StopIteration: 2
|
||||
>>> next(g)
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
StopIteration
|
||||
>>>
|
||||
|
||||
In general, I don't like to use `return` in generators. I prefer to `break` from their internal loops and conclude naturally.
|
||||
|
||||
|
||||
|
||||
# Minor notes
|
||||
|
||||
- You cannot access the items of a generator by index, because only one item exists at a time. Once you do a loop over a generator, those items are gone forever unless you kept them somewhere else.
|
||||
|
||||
|
||||
|
||||
|
||||
# More examples
|
||||
|
||||
#### Yielding individual items from batches
|
||||
|
||||
Suppose you're getting data from an imaginary website which sends you items in groups of 100. You want to let the user loop over every item without having to worry about the groups themselves.
|
||||
|
||||
def item_generator(url):
|
||||
|
@ -60,7 +112,7 @@ Suppose you're getting data from an imaginary website which sends you items in g
|
|||
# get_items is a pretend method that collects the 100 items from that page
|
||||
batch = get_items(url, page=page)
|
||||
|
||||
if not batch:
|
||||
if len(batch) == 0:
|
||||
# for this imaginary website, the batch will be empty when that page
|
||||
# doesn't have any items on it.
|
||||
break
|
||||
|
@ -73,9 +125,18 @@ Suppose you're getting data from an imaginary website which sends you items in g
|
|||
page += 1
|
||||
|
||||
# When the while loop breaks, we reach the end of the function body,
|
||||
# and a StopIteration will be raised and handled automatically,
|
||||
# ending the for-loop.
|
||||
# concluding the generator.
|
||||
|
||||
comments = item_generator('http://website.com/user/voussoir/comments')
|
||||
for comment in comments:
|
||||
print(comment.body)
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
# Further reading
|
||||
|
||||
[Stack Overflow - What are the main uses for `yield from`?](http://stackoverflow.com/questions/9708902/in-practice-what-are-the-main-uses-for-the-new-yield-from-syntax-in-python-3) -- If you like recursive functions, how about recursive generators? The only time I've ever used this is to [iterate over a tree's nodes](https://github.com/voussoir/reddit/blob/2069c3bd731cc8f90401ee49a9fc4d0dbf436cfc/Prawtimestamps/timesearch.py#L756-L761).
|
||||
|
||||
[Stack Overflow - Python generator `send` function purpose?](http://stackoverflow.com/questions/19302530/python-generator-send-function-purpose) -- This quickly dives out of "quick tips" territory.
|
|
@ -5,12 +5,8 @@ import urllib.parse
|
|||
import random
|
||||
import sys
|
||||
|
||||
sys.path.append('C:\\git\\else\\Bytestring')
|
||||
import bytestring
|
||||
|
||||
sys.path.append('C:\\git\\else\\Ratelimiter')
|
||||
import ratelimiter
|
||||
|
||||
sys.path.append('C:\\git\\else\\Bytestring'); import bytestring
|
||||
sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
|
||||
|
||||
f = open('favicon.png', 'rb')
|
||||
FAVI = f.read()
|
||||
|
@ -60,7 +56,14 @@ class Path:
|
|||
else:
|
||||
# Diamond emoji, because there's not one for files.
|
||||
icon = '\U0001F48E'
|
||||
return '<a href="{full}">{icon} {display}</a>'.format(full=self.path, icon=icon, display=display_name)
|
||||
|
||||
quoted_path = urllib.parse.quote(self.path)
|
||||
a = '<a href="{full}">{icon} {display}</a>'.format(
|
||||
full=quoted_path,
|
||||
icon=icon,
|
||||
display=display_name,
|
||||
)
|
||||
return a
|
||||
|
||||
@property
|
||||
def basename(self):
|
||||
|
@ -70,6 +73,10 @@ class Path:
|
|||
def is_dir(self):
|
||||
return os.path.isdir(self.os_path)
|
||||
|
||||
@property
|
||||
def is_file(self):
|
||||
return os.path.isfile(self.os_path)
|
||||
|
||||
@property
|
||||
def os_path(self):
|
||||
abspath = os.path.join(CWD, self.relative_path)
|
||||
|
@ -96,7 +103,11 @@ class Path:
|
|||
form = '<tr style="background-color:#{bg}"><td>{anchor}</td><td>{size}</td></tr>'
|
||||
bg = 'ddd' if shaded else 'fff';
|
||||
size = bytestring.bytestring(self.size) if self.size != -1 else ''
|
||||
row = form.format(bg=bg, anchor=self.anchor(display_name=display_name), size=size)
|
||||
row = form.format(
|
||||
bg=bg,
|
||||
anchor=self.anchor(display_name=display_name),
|
||||
size=size,
|
||||
)
|
||||
return row
|
||||
|
||||
|
||||
|
@ -124,12 +135,8 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
|
|||
|
||||
def do_GET(self):
|
||||
#print(dir(self))
|
||||
path = self.path.lower()
|
||||
path = urllib.parse.unquote(path).rstrip('/')
|
||||
|
||||
error = path_validation(path)
|
||||
if error:
|
||||
self.send_error(*error)
|
||||
path = normalize_path(self.path)
|
||||
if self.send_path_validation_error(path):
|
||||
return
|
||||
|
||||
path = Path(path)
|
||||
|
@ -139,11 +146,58 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
|
|||
if mime is not None:
|
||||
#print(mime)
|
||||
self.send_header('Content-type', mime)
|
||||
self.end_headers()
|
||||
|
||||
if path.is_file:
|
||||
self.send_header('Content-length', path.size)
|
||||
|
||||
d = self.read_filebytes(path)
|
||||
#print('write')
|
||||
self.end_headers()
|
||||
self.write(d)
|
||||
|
||||
def do_HEAD(self):
    '''
    Answer a HEAD request with the same status and headers used for the
    resource, but no body.

    Directories are reported as text/html; for anything else the mime type
    is guessed from the path. Content-length is only sent for files.
    '''
    requested = normalize_path(self.path)
    if self.send_path_validation_error(requested):
        # The helper returns True once it has sent an error response.
        return

    target = Path(requested)
    self.send_response(200)

    mime = 'text/html' if target.is_dir else mimetypes.guess_type(target.path)[0]
    if mime is not None:
        self.send_header('Content-type', mime)

    if target.is_file:
        self.send_header('Content-length', target.size)

    self.end_headers()
|
||||
|
||||
def path_validation(self, path):
    '''
    Check that the requested path resolves to somewhere under one of the
    OKAY_PATHS.

    The path is joined onto CWD and made absolute, so any ".." segments are
    collapsed before the check. The CWD prefix is then removed again and
    backslashes are converted to forward slashes.

    Returns:
        None when the path is allowed, otherwise a (status, message) tuple
        suitable for `self.send_error(*error)`.

    This method must not send the error itself. The caller,
    send_path_validation_error, decides whether to abort the request based
    on the return value. Sending here and returning None would emit a 403
    and then let the handler carry on serving the forbidden path.
    '''
    path = path.lstrip('/')
    absolute_path = os.path.join(CWD, path)
    absolute_path = os.path.abspath(absolute_path)
    path = absolute_path.replace(CWD, '')
    path = path.lstrip('/')
    path = path.replace('\\', '/')
    print(path)
    if not any(path.startswith(okay) for okay in OKAY_PATHS):
        return (403, 'Stop that!')
    return None
|
||||
|
||||
def send_path_validation_error(self, path):
    '''
    Validate `path` and, if validation yields an error, send it to the client.

    Returns True when an error response was sent (the caller should stop
    handling the request), False otherwise.
    '''
    problem = self.path_validation(path)
    if not problem:
        return False
    # `problem` holds the positional arguments for send_error.
    self.send_error(*problem)
    return True
|
||||
|
||||
# def do_POST(self):
|
||||
# path = self.path.lower()
|
||||
# path = urllib.parse.unquote(path).rstrip('/')
|
||||
|
@ -181,32 +235,34 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
|
|||
# self.write('Thanks')
|
||||
|
||||
def generate_opendir(path):
|
||||
print('Listdir:', path)
|
||||
#print('Listdir:', path)
|
||||
items = os.listdir(path.relative_path)
|
||||
items = [os.path.join(path.relative_path, f) for f in items]
|
||||
|
||||
# This places directories above files, each ordered alphabetically
|
||||
items.sort(key=str.lower)
|
||||
directories = []
|
||||
files = []
|
||||
for item in items:
|
||||
#item = item.lstrip('/')
|
||||
if os.path.isdir(item):
|
||||
directories.append(item)
|
||||
else:
|
||||
files.append(item)
|
||||
directories.sort(key=str.lower)
|
||||
files.sort(key=str.lower)
|
||||
files = directories + files
|
||||
#print(files)
|
||||
files = [Path(f) for f in files]
|
||||
|
||||
items = directories + files
|
||||
items = [Path(f) for f in items]
|
||||
entries = []
|
||||
if not any(okay == path.path for okay in OKAY_PATHS):
|
||||
# If the path actually equals one of the OKAY_PATHS, then we shouldn't
|
||||
# let them step up because that would be outside the okay area.
|
||||
# If the user is on one of the OKAY_PATHS, then he can't step up
|
||||
# because that would be outside the OKAY area.
|
||||
entries.append(path.parent.table_row(display_name='up'))
|
||||
|
||||
shaded = True
|
||||
for f in files:
|
||||
entry = f.table_row(shaded=shaded)
|
||||
for item in items:
|
||||
entry = item.table_row(shaded=shaded)
|
||||
entries.append(entry)
|
||||
shaded = not shaded
|
||||
|
||||
entries = '\n'.join(entries)
|
||||
text = OPENDIR_TEMPLATE.format(entries=entries)
|
||||
return text
|
||||
|
@ -218,12 +274,11 @@ def generate_random_filename(original_filename='', length=8):
|
|||
identifier = '{:x}'.format(bits).rjust(length, '0')
|
||||
return identifier
|
||||
|
||||
def path_validation(path):
|
||||
if '..' in path:
|
||||
return (403, 'I\'m not going to play games with you.')
|
||||
if not any(path.startswith(okay) for okay in OKAY_PATHS):
|
||||
self.send_error(403, 'Stop that!')
|
||||
return
|
||||
def normalize_path(path):
    '''
    Percent-decode a request path and strip any trailing slashes.
    The casing of the path is left as-is.
    '''
    unquoted = urllib.parse.unquote(path)
    return unquoted.rstrip('/')
|
||||
|
||||
|
||||
server = http.server.HTTPServer(('', 32768), RequestHandler)
|
||||
print('server starting')
|
||||
|
|
|
@ -526,11 +526,32 @@ def get_path_casing(path):
|
|||
'''
|
||||
p = str_to_fp(path)
|
||||
path = p.path
|
||||
path = glob.escape(path)
|
||||
(drive, subpath) = os.path.splitdrive(path)
|
||||
pattern = ["%s[%s]" % (piece[:-1], piece[-1]) for piece in subpath.split(os.sep)[1:]]
|
||||
subpath = subpath.lstrip(os.sep)
|
||||
|
||||
def patternize(piece):
    '''
    Create a pattern like "[u]ser" from "user", forcing glob to look up the
    correct path name, and guaranteeing that the only result will be the
    correct path.

    The first character of the raw piece that is not "!" is wrapped in
    brackets, and the remainder is passed through glob.escape so that it
    only matches literally.

    Special cases are:
    !, because in glob syntax, [!x] tells glob to look for paths that don't
        contain "x". [!] is invalid syntax, so we pick the first non-!
        character to put in the brackets.
    *, ? and [, because glob.escape already rewrites them as [*], [?] and
        [[]. The character to wrap is therefore chosen from the raw piece
        instead of the escaped one; wrapping the "[" that glob.escape
        inserted would corrupt the pattern.

    A piece that is empty or made only of "!" is returned unchanged.
    '''
    for (index, character) in enumerate(piece):
        if character == '!':
            continue
        # Everything before `index` is "!", which is only special directly
        # after an opening bracket, so it needs no escaping here.
        leading = piece[:index]
        trailing = glob.escape(piece[index + 1:])
        return '%s[%s]%s' % (leading, character, trailing)
    return piece
|
||||
|
||||
pattern = [patternize(piece) for piece in subpath.split(os.sep)]
|
||||
pattern = os.sep.join(pattern)
|
||||
pattern = drive.upper() + os.sep + pattern
|
||||
print(pattern)
|
||||
try:
|
||||
return str_to_fp(glob.glob(pattern)[0])
|
||||
except IndexError:
|
||||
|
|
Loading…
Reference in a new issue