master
Ethan Dalool 2016-08-17 18:24:38 -07:00
parent c491e417f5
commit 82f63a75ab
14 changed files with 501 additions and 264 deletions

8
Clipext/README.md Normal file
View File

@ -0,0 +1,8 @@
Clip Extension
==============
This module works with pyperclip to provide some handy features for commandline utilities.
Instead of having the user paste text into the commandline to run your script, just let them enter `script.py !c` and resolve it automatically. Pasting into the cmd on Windows is annoying and requires a mouse-click so this can be very convenient.
Since "!i" resolves to user input, your script can accept piping with `ls | script.py !i`.

31
Clipext/clipext.py Normal file
View File

@ -0,0 +1,31 @@
import pyperclip
# Arguments (compared case-insensitively by resolve) that are replaced with
# the current clipboard contents.
CLIPBOARD_STRINGS = ['!c', '!clip', '!clipboard']
# Arguments (compared case-insensitively by resolve) that are replaced with
# text read from standard input.
INPUT_STRINGS = ['!i', '!in', '!input', '!stdin']
# The ctrl-z character. multi_line_input stops reading as soon as a line
# contains it, and discards it along with everything after it.
EOF = '\x1a'
def multi_line_input():
    '''
    Read lines from standard input until one of them contains the EOF
    character, or until input() raises EOFError.

    Returns the collected lines joined by newlines, cut off at the first EOF
    character and stripped of surrounding whitespace.
    '''
    lines = []
    finished = False
    while not finished:
        try:
            line = input()
        except EOFError:
            # The stream ended without an explicit marker (for example the
            # user entered nothing but ctrl-z), so supply the marker ourselves.
            line = EOF
        lines.append(line)
        finished = EOF in line

    # Keep only what came before the first EOF character.
    (text, _, _) = '\n'.join(lines).partition(EOF)
    return text.strip()
def resolve(arg):
    '''
    Translate a commandline argument into the text it stands for.

    If `arg` (case-insensitively) is one of CLIPBOARD_STRINGS, return the
    clipboard contents. If it is one of INPUT_STRINGS, return the text read by
    multi_line_input. Otherwise return `arg` unchanged.
    '''
    keyword = arg.lower()
    if keyword in CLIPBOARD_STRINGS:
        resolved = pyperclip.paste()
    elif keyword in INPUT_STRINGS:
        resolved = multi_line_input()
    else:
        resolved = arg
    return resolved

6
Downloady/README.md Normal file
View File

@ -0,0 +1,6 @@
Downloady
=========
- 2016 08 16
- Downloady now uses temporary files for incomplete downloads, and renames them when finished. This helps distinguish downloads that were interrupted and should be resumed from files that just happen to have the same name, which previously would have been interpreted as a resume. This improves overall ease-of-use, simplifies the behavior of the `overwrite` parameter, and will remove duplicate work from other programs.
- Rewrote the plan creator and download function to do a better job of separating concerns and simplify the plan selector.

View File

@ -1,27 +1,28 @@
import sys
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
import argparse
import os
import pyperclip # pip install pyperclip
import requests
import sys
import time
import urllib
import warnings
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
warnings.simplefilter('ignore')
HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'
}
SLEEPINESS = 3
FILENAME_BADCHARS = '*?"<>|'
last_request = 0
CHUNKSIZE = 16 * bytestring.KIBIBYTE
STOP = False
TIMEOUT = 600
TEMP_EXTENSION = '.downloadytemp'
def basename_from_url(url):
'''
@ -32,81 +33,6 @@ def basename_from_url(url):
localname = localname.split('/')[-1]
return localname
def determine_seek_and_range(
file_handle,
localname,
local_exists,
overwrite,
remote_total_bytes,
server_respects_range,
user_provided_range,
user_range_min,
user_range_max,
):
''' THINGS THAT CAN HAPPEN '''
seek_to = 0
header_range_min = None
header_range_max = None
if local_exists:
local_existing_bytes = os.path.getsize(localname)
if overwrite is True:
file_handle.truncate()
if user_provided_range:
header_range_min = user_range_min
header_range_max = user_range_max
seek_to = user_range_min
elif not user_provided_range:
pass
elif overwrite is None:
if local_existing_bytes == remote_total_bytes:
print('File is 100%. Nothing to do.')
return
if user_provided_range:
if server_respects_range:
seek_to = user_range_min
else:
raise Exception('The server did not respect your range header')
elif not user_provided_range:
if server_respects_range:
print('Resuming from byte %d' % local_existing_bytes)
header_range_min = local_existing_bytes
header_range_max = ''
seek_to = local_existing_bytes
else:
print('File exists, but server doesn\'t allow resumes. Restart from 0?')
permission = get_permission()
if permission:
file_handle.truncate()
else:
raise Exception('Couldn\'t resume')
else:
raise TypeError('Invalid value for `overwrite`. Must be True, False, or None')
elif not local_exists:
if user_provided_range:
if server_respects_range:
file_handle.seek(user_range_min)
file_handle.write(b'\0')
header_range_min = user_range_min
header_range_max = user_range_max
seek_to = user_range_min
else:
raise Exception('The server did not respect your range header')
elif not user_provided_range:
pass
return (seek_to, header_range_min, header_range_max)
def download_file(
url,
localname=None,
@ -114,33 +40,103 @@ def download_file(
bytespersecond=None,
callback_progress=None,
headers=None,
overwrite=None
overwrite=False,
verbose=False,
):
if headers is None:
headers = {}
''' Determine local filename '''
url = url.replace('%3A//', '://')
headers = headers or {}
url = sanitize_url(url)
if localname in [None, '']:
localname = basename_from_url(url)
localname = sanitize_filename(localname)
localname = filepath_sanitize(localname)
if verbose:
print(url)
plan = prepare_plan(
url,
localname,
auth=auth,
bytespersecond=bytespersecond,
headers=headers,
overwrite=overwrite,
)
#print(plan)
if plan is None:
return
localname = plan['download_into']
directory = os.path.split(localname)[0]
if directory != '':
os.makedirs(directory, exist_ok=True)
touch(localname)
file_handle = open(localname, 'r+b')
file_handle.seek(plan['seek_to'])
if plan['header_range_min'] is not None:
headers['range'] = 'bytes={min}-{max}'.format(
min=plan['header_range_min'],
max=plan['header_range_max'],
)
if plan['plan_type'] == 'resume':
bytes_downloaded = plan['seek_to']
else:
bytes_downloaded = 0
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
bytes_downloaded += len(chunk)
file_handle.write(chunk)
if callback_progress is not None:
callback_progress(bytes_downloaded, plan['remote_total_bytes'])
if plan['limiter'] is not None and bytes_downloaded < plan['remote_total_bytes']:
plan['limiter'].limit(len(chunk))
file_handle.close()
if localname != plan['real_localname']:
os.rename(localname, plan['real_localname'])
localsize = os.path.getsize(plan['real_localname'])
if plan['plan_type'] != 'partial' and localsize < plan['remote_total_bytes']:
message = 'File does not contain expected number of bytes. Received {size} / {total}'
message = message.format(size=os.path.getsize(localname), total=plan['remote_total_bytes'])
raise Exception(message)
return plan['real_localname']
def prepare_plan(
url,
localname,
auth,
bytespersecond,
headers,
overwrite,
):
# Chapter 1: File existence
user_provided_range = 'range' in headers
real_localname = localname
temp_localname = localname + TEMP_EXTENSION
real_exists = os.path.exists(real_localname)
if real_exists and overwrite is False and not user_provided_range:
print('File exists and overwrite is off. Nothing to do.')
return None
temp_exists = os.path.exists(temp_localname)
real_localsize = int(real_exists and os.path.getsize(real_localname))
temp_localsize = int(temp_exists and os.path.getsize(temp_localname))
# Chapter 2: Ratelimiting
if bytespersecond is None:
limiter = None
elif isinstance(bytespersecond, ratelimiter.Ratelimiter):
limiter = bytespersecond
else:
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
limiter = ratelimiter.Ratelimiter(bytespersecond)
''' Prepare plan variables '''
local_exists = os.path.exists(localname)
if local_exists and overwrite is False:
print('Overwrite off. Nothing to do.')
return
user_provided_range = 'range' in headers
# Chapter 3: Extracting range
if user_provided_range:
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
user_range_max = headers['range'].split('-')[1]
@ -150,71 +146,88 @@ def download_file(
user_range_min = None
user_range_max = None
# Chapter 4: Server range support
# Always include a range on the first request to figure out whether the
# server supports it. Use 0- so we get the right `remote_total_bytes`.
# server supports it. Use 0- to get correct remote_total_bytes
temp_headers = headers
temp_headers.update({'range': 'bytes=0-'})
# I'm using a GET instead of an actual HEAD here because some servers respond
# differently, even though they're not supposed to.
head = request('get', url, stream=True, headers=temp_headers, auth=auth)
remote_total_bytes = int(head.headers.get('content-length', 1))
remote_total_bytes = int(head.headers.get('content-length', 0))
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
head.connection.close()
touch(localname)
file_handle = open(localname, 'r+b')
file_handle.seek(0)
if user_provided_range and not server_respects_range:
raise Exception('Server did not respect your range header')
plan = determine_seek_and_range(
file_handle=file_handle,
localname=localname,
local_exists=local_exists,
overwrite=overwrite,
remote_total_bytes=remote_total_bytes,
server_respects_range=server_respects_range,
user_provided_range=user_provided_range,
user_range_min=user_range_min,
user_range_max=user_range_max,
# Chapter 5: Plan definitions
plan_base = {
'limiter': limiter,
'real_localname': real_localname,
'remote_total_bytes': remote_total_bytes,
}
plan_fulldownload = dict(
plan_base,
download_into=temp_localname,
header_range_min=None,
header_range_max=None,
plan_type='fulldownload',
seek_to=0,
)
plan_resume = dict(
plan_base,
download_into=temp_localname,
header_range_min=temp_localsize,
header_range_max='',
plan_type='resume',
seek_to=temp_localsize,
)
plan_partial = dict(
plan_base,
download_into=real_localname,
header_range_min=user_range_min,
header_range_max=user_range_max,
plan_type='partial',
seek_to=user_range_min,
)
if plan is None:
return
(seek_to, header_range_min, header_range_max) = plan
if header_range_min is not None:
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max)
# Chapter 6: Redeem your meal vouchers here
if real_exists:
if overwrite:
os.remove(real_localname)
bytes_downloaded = seek_to
file_handle.seek(seek_to)
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
if user_provided_range:
return plan_partial
''' Begin download '''
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
bytes_downloaded += len(chunk)
file_handle.write(chunk)
if callback_progress is not None:
callback_progress(bytes_downloaded, remote_total_bytes)
return plan_fulldownload
if limiter is not None and bytes_downloaded < remote_total_bytes:
limiter.limit(len(chunk))
elif temp_exists and temp_localsize > 0:
if overwrite:
return plan_fulldownload
file_handle.close()
return localname
if user_provided_range:
return plan_partial
def filepath_sanitize(text, exclusions=''):
bet = FILENAME_BADCHARS.replace(exclusions, '')
for char in bet:
text = text.replace(char, '')
return text
if server_respects_range:
print('Resume from byte %d' % plan_resume['seek_to'])
return plan_resume
else:
if user_provided_range:
return plan_partial
return plan_fulldownload
print('No plan was chosen?')
return None
def get_permission(prompt='y/n\n>', affirmative=('y', 'yes')):
    '''
    Show `prompt`, read one line from the user, and report whether they agreed.

    prompt:
        The text passed to input().
    affirmative:
        Lowercase answers which count as a yes. Any container supporting `in`
        works. The default is a tuple so that it cannot be mutated between
        calls.

    Returns True if the answer, ignoring case and surrounding whitespace, is
    one of `affirmative`, otherwise False.
    '''
    permission = input(prompt)
    # Strip so that a stray space or carriage return doesn't turn "yes" into a no.
    return permission.strip().lower() in affirmative
def is_clipboard(s):
return s.lower() in ['!c', '!clip', '!clipboard']
def progress(bytes_downloaded, bytes_total, prefix=''):
def progress1(bytes_downloaded, bytes_total, prefix=''):
divisor = bytestring.get_appropriate_divisor(bytes_total)
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
bytes_downloaded_string = bytestring.bytestring(bytes_downloaded, force_unit=divisor)
@ -278,6 +291,16 @@ def request(method, url, stream=False, headers=None, timeout=TIMEOUT, **kwargs):
req.raise_for_status()
return req
def sanitize_filename(text, exclusions=''):
    '''
    Return `text` with every character of FILENAME_BADCHARS removed.

    exclusions:
        Characters which should be kept even though they appear in
        FILENAME_BADCHARS. Each character is considered individually, so the
        order in which they are given does not matter.
    '''
    # Using str.replace(exclusions, '') here would only honor `exclusions` when
    # it happened to be a contiguous substring of FILENAME_BADCHARS.
    badchars = set(FILENAME_BADCHARS).difference(exclusions)
    return ''.join(char for char in text if char not in badchars)
def sanitize_url(url):
    '''
    Return `url` with any percent-encoded scheme separator "%3A//" replaced
    by the plain "://".
    '''
    return url.replace('%3A//', '://')
def touch(filename):
    '''
    Make sure `filename` exists by opening it in binary append mode and
    closing it again. Existing contents are left untouched.
    '''
    with open(filename, 'ab'):
        pass
@ -286,26 +309,14 @@ def touch(filename):
def download_argparse(args):
url = args.url
if is_clipboard(url):
url = pyperclip.paste()
print(url)
overwrite = {
'y':True, 't':True,
'n':False, 'f':False,
}.get(args.overwrite.lower(), None)
url = clipext.resolve(url)
callback = {
None: progress,
'1': progress,
None: progress1,
'1': progress1,
'2': progress2,
}.get(args.callback, None)
callback = args.callback
if callback == '1':
callback = progress
if callback == '2':
callback = progress2
}.get(args.callback, args.callback)
bytespersecond = args.bytespersecond
if bytespersecond is not None:
@ -321,18 +332,19 @@ def download_argparse(args):
bytespersecond=bytespersecond,
callback_progress=callback,
headers=headers,
overwrite=overwrite,
overwrite=args.overwrite,
verbose=True,
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
#p_download_file = subparsers.add_parser('download_file')
parser.add_argument('url')
parser.add_argument('localname', nargs='?', default=None)
parser.add_argument('-c', '--callback', dest='callback', default=progress)
parser.add_argument('-c', '--callback', dest='callback', default=progress1)
parser.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
parser.add_argument('-ow', '--overwrite', dest='overwrite', default='')
parser.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
parser.add_argument('-r', '--range', dest='range', default=None)
parser.set_defaults(func=download_argparse)

View File

@ -1,45 +0,0 @@
THINGS THAT CAN HAPPEN
├───File exists
│ ├───User disables overwrite
│ │ └───Return because there's nothing to do
│ │
│ ├───User enables overwrite
│ │ ├───User requests range
│ │ │ └───Raise exception because requesting a range and forcing overwrite are mutually exclusive
│ │ │
│ │ └───User does not request range
│ │ └───File opened, truncated, full download
│ │
│ └───User does not specify overwrite
│ ├───File is same size as content-length
│ │ └───Return because there's nothing to do.
│ │
│ ├───User requests range
│ │ ├───Server respects range
│ │ │ └───File opened, seeked to request, bytes filled in
│ │ │
│ │ └───Server does not respect range
│ │ └───Raise exception because user's request can't be fulfilled
│ │
│ └───User does not request range
│ ├───Server respects range
│ │ └───File is opened, seeked to end, download resumes
│ │
│ └───Server does not respect range
│ └───Ask for permission to overwrite from beginning
└───File does not exist
├───User requests range
│ ├───Server respects range
│ │ └───File created, seeked to request, bytes filled in. everything else left 0
│ └───Server does not respect range
│ └───Raise exception because user's request can't be fulfilled
└───User does not request range
└───File created, full download
Possible ambiguity: If the user requests a range, and the file does not exist, does he want:
1. to fill the file with zeroes, and patch the requested bytes into their correct spot; or
2. to create the file empty, and only write the requested bytes?
I will assume #1 because that plays nicely with other Things That Can Happen, such as letting the user patch the other bytes in later.

157
Instathief/instathief.py Normal file
View File

@ -0,0 +1,157 @@
import argparse
import bs4
import datetime
import json
import os
import requests
import sys
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\downloady'); import downloady
''' '''
STRFTIME = '%Y%m%d-%H%M%S'
# strftime used for filenames when downloading
# Profile page address; get_user_info fills in {username}.
URL_PROFILE = 'https://www.instagram.com/{username}'
# Address which get_page POSTs its query to.
URL_QUERY = 'https://www.instagram.com/query/'
# Body of the `q` form field sent by get_page, which fills in {user_id},
# {end_cur} and {count} with str.format. The doubled braces come out as
# literal single braces after formatting. It is written across several lines
# for readability; the replace calls at the end remove every newline and space
# before the template is used.
PAGE_QUERY_TEMPLATE = '''
ig_user({user_id})
{{
media.after({end_cur}, {count})
{{
count,
nodes
{{
code,
date,
display_src,
id,
video_url
}},
page_info
}}
}}
'''.replace('\n', '').replace(' ', '')
# Sent as the user-agent header by get_page.
USERAGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
''' '''
# Cookies from the most recent profile page response; assigned by get_user_info.
last_cookie = None
def download_media(media_list):
    '''
    Download every item of `media_list` with downloady.

    Each item is a dict with a 'created' unix timestamp and a 'url'. The file
    is named after the UTC creation time formatted with STRFTIME, and keeps
    the extension found in the url's basename. Existing files are not
    overwritten.
    '''
    for media in media_list:
        created = datetime.datetime.utcfromtimestamp(media['created'])
        stem = created.strftime(STRFTIME)
        url = media['url']
        (_, extension) = os.path.splitext(downloady.basename_from_url(url))
        downloady.download_file(
            url=url,
            localname=stem + extension,
            callback_progress=downloady.progress2,
            overwrite=False,
        )
def get_page(user_id, end_cur, count, cookies):
    '''
    POST a media query for `user_id` to URL_QUERY and return the decoded JSON
    response.

    end_cur, count:
        Inserted into the `media.after(...)` part of PAGE_QUERY_TEMPLATE.
    cookies:
        Sent with the request. Must contain 'csrftoken', which is also sent
        as the x-csrftoken header.

    Raises whatever response.raise_for_status raises on an error status.
    '''
    payload = {
        'q': PAGE_QUERY_TEMPLATE.format(
            count=count,
            end_cur=end_cur,
            user_id=user_id,
        ),
        'ref': 'users::show',
    }
    request_headers = {
        'referer': 'https://www.instagram.com/',
        'user-agent': USERAGENT,
        'x-csrftoken': cookies['csrftoken'],
    }
    response = requests.post(
        url=URL_QUERY,
        cookies=cookies,
        data=payload,
        headers=request_headers,
    )
    response.raise_for_status()
    return response.json()
def get_user_info(username):
    '''
    Fetch the profile page of `username` and extract what get_page needs.

    Returns a dict with the keys 'user_id', 'end_cur' and 'cookies'. The
    response cookies are also stored in the global `last_cookie`.

    Raises Exception if no script tag on the page contains
    'window._sharedData'.
    '''
    global last_cookie

    response = requests.get(URL_PROFILE.format(username=username))
    response.raise_for_status()
    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    shared_data_script = None
    for candidate in soup.find_all('script'):
        if 'window._sharedData' in candidate.text:
            shared_data_script = candidate
            break
    if shared_data_script is None:
        raise Exception('Did not find expected javascript')

    # The script body is a javascript assignment; everything after the equals
    # sign, minus the trailing semicolon, is parsed as JSON.
    payload = shared_data_script.text.split('window._sharedData = ')[1]
    payload = payload.rstrip(';').strip()
    profile = json.loads(payload)['entry_data']['ProfilePage'][0]['user']

    user_id = profile['id']
    page_info = profile['media']['page_info']
    if page_info['has_next_page']:
        # Minus 1 because the queries use "after" parameters for pagination,
        # and if we just take this cursor then we will only get items after it.
        end_cur = int(page_info['start_cursor']) - 1
    else:
        end_cur = None

    last_cookie = response.cookies
    return {
        'user_id': user_id,
        'end_cur': end_cur,
        'cookies': response.cookies,
    }
def get_user_media(username):
    '''
    Generate a dict for every post of `username`, requesting pages of 50 posts
    from get_page until a page reports that there is no next page.

    Each yielded dict has 'created' (the post's 'date') and 'url' (the post's
    'video_url' if it has one, otherwise its 'display_src').
    '''
    user_info = get_user_info(username)
    cursor = user_info.pop('end_cur')

    has_more = True
    while has_more:
        media = get_page(count=50, end_cur=cursor, **user_info)['media']
        for post in media['nodes']:
            yield {
                'created': post['date'],
                'url': post.get('video_url') or post.get('display_src'),
            }

        page_info = media['page_info']
        has_more = page_info['has_next_page']
        if has_more:
            cursor = page_info['end_cursor']
def main():
    '''
    Print the media url of every post made by the user named in sys.argv[1].
    '''
    username = sys.argv[1]
    # get_user_media yields dicts with 'created' and 'url' keys. Unpacking one
    # of them as a tuple would bind the key names, not the values, so the url
    # has to be looked up by key.
    for media in get_user_media(username):
        print(media['url'])
if __name__ == '__main__':
main()

View File

@ -138,6 +138,7 @@ function create_odi_div(url)
if (paramless_url.match(IMAGE_TYPES))
{
console.log("Creating image div for " + paramless_url);
var div = document.createElement("div");
div.id = generate_id(32);
div.className = "odi_image_div";
@ -194,6 +195,7 @@ function create_odi_div(url)
{
return null;
}
console.log("Creating " + mediatype + " div for " + paramless_url);
var div = document.createElement("div");
div.id = generate_id(32);
@ -253,6 +255,7 @@ function create_odi_div(url)
}
function create_odi_divs(urls)
{
console.log("Creating odi divs");
image_divs = [];
media_divs = [];
odi_divs = [];
@ -332,6 +335,7 @@ function create_workspace()
control_panel.appendChild(ingest_div);
control_panel.appendChild(start_button);
document.body.appendChild(workspace);
console.log("finished workspace");
}
function delete_odi_div(element)
@ -430,6 +434,7 @@ function filter_re(pattern, do_delete)
function format_css()
{
console.log("Formatting CSS variables");
var css = CSS;
while (true)
{
@ -438,22 +443,24 @@ function format_css()
{
break;
}
matches = Array.from(new Set(matches));
for (var index = 0; index < matches.length; index += 1)
console.log(matches);
matches = new Set(matches);
/* Originally used Array.from(set) and did regular iteration, but I found
that sites can override and break that conversion. */
matches.forEach(
function(injector)
{
var injector = matches[index];
var injected = injector.replace(new RegExp("\\$", 'g'), "");
/*console.log(injector);*/
/*console.log(injected);*/
css = css.replace(injector, this[injected]);
}
);
}
return css;
}
function get_all_urls()
{
console.log("Collecting urls");
var urls = [];
function include(source, attr)
{
@ -529,6 +536,7 @@ function get_basename(url)
function get_gfycat_video(id)
{
console.log("Resolving gfycat " + id);
var url = "https://gfycat.com/cajax/get/" + id;
var request = new XMLHttpRequest();
request.answer = null;
@ -595,6 +603,7 @@ function generate_id(length)
function ingest()
{
/* Take the text from the INGEST box, and make odi divs from it */
console.log("Ingesting");
var odi_divs = get_odi_divs();
var ingestbox = document.getElementById("ingestbox");
var text = ingestbox.value;
@ -622,6 +631,7 @@ function ingest()
function lazy_load_all()
{
console.log("Starting lazyload");
lazies = get_lazy_divs();
lazies.reverse();
lazy_buttons = document.getElementsByClassName("load_button");

View File

@ -91,8 +91,8 @@ function swap_source(player, source_list)
function main()
{
var WIDTH = 3;
var HEIGHT = 3;
var WIDTH = 2;
var HEIGHT = 2;
var MEDIAS = get_media_links();
clear_page();

View File

@ -1,49 +1,52 @@
Open Dir DL
===========
The open directory downloader
The open directory downloader.
Requires `pip install beautifulsoup4`
Requires `pip install beautifulsoup4`.
See inside opendirdl.py for usage instructions.
- 2016 08 16
- **[cleanup]** Now that Downloady uses temp files for incomplete downloads, that logic can be removed from opendirdl.
- 2016 08 10
- Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception.
- Fixed bug in smart_insert where 404'd URLs were not being deleted from the database.
- Added clickable links to each directory on HTML tree pages.
- **[addition]** Added clickable links to each directory on HTML tree pages.
- **[bugfix]** Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception.
- **[bugfix]** Fixed bug in smart_insert where 404'd URLs were not being deleted from the database.
- 2016 08 02
- Removed the usage of div IDs on the Tree pages by making the collapse button use `this.nextSibling`.
- Rewrote `build_file_tree` with a way simpler algorithm.
- Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway.
- **[cleanup]** Removed the need for div IDs on the Tree pages by making the collapse button use `this.nextSibling`.
- **[cleanup]** Rewrote `build_file_tree` with a way simpler algorithm.
- **[removal]** Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway.
- 2016 08 01
- Made the digest work even if you forget the http://
- **[addition]** Made the digest work even if you forget the http://
- 2016 07 29
- Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
- Renamed `Tree.listnodes` to `Tree.list_children` and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unecessary.
- Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
- Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
- Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
- **[change]** Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
- **[change]** Renamed `Tree.listnodes` to `Tree.list_children`, and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
- **[change]** Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
- **[cleanup]** Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
- **[removal]** Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
- 2016 07 25
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unecessary classes.
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
- **[change]** Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
- **[removal]** Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
- 2016 07 19
- Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes
- Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!*
- Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception.
- Fixed some mismatched code vs comments
- Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck.
- **[addition]** Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception.
- **[bugfix]** Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck.
- **[cleanup]** Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes
- **[cleanup]** Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!*
- **[cleanup]** Fixed some mismatched code vs comments
- 2016 07 08
- Fixed bug in which trees wouldn't generate on server:port urls.
- **[bugfix]** Fixed bug in which trees wouldn't generate on server:port urls.
- 2016 07 04
- Added new argparse command "tree"
- **[addition]** Added new argparse command "tree"
- 2016 02 08
- Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
- Moved db commits to only happen at the end of a digest.
- **[bugfix]** Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
- **[change]** Moved db commits to only happen at the end of a digest.

View File

@ -614,7 +614,7 @@ def fetch_generator(cur):
def filepath_sanitize(text, allowed=''):
badchars = FILENAME_BADCHARS
badchars = ''.join(char for char in FILENAME_BADCHARS if char not in allowed)
badchars = set(char for char in FILENAME_BADCHARS if char not in allowed)
text = ''.join(char for char in text if char not in badchars)
return text
@ -886,32 +886,16 @@ def download(
folder = os.path.join(outputdir, url_filepath['folder'])
os.makedirs(folder, exist_ok=True)
final_fullname = os.path.join(folder, url_filepath['filename'])
temporary_basename = hashit(url, 16) + '.oddltemporary'
temporary_fullname = os.path.join(folder, temporary_basename)
fullname = os.path.join(folder, url_filepath['filename'])
# Because we use .oddltemporary files, the behavior of `overwrite` here
# is different than the behavior of `overwrite` in downloady.
# The overwrite used in the following block refers to the finalized file.
# The overwrite passed to downloady refers to the oddltemporary which
# may be resumed.
if os.path.isfile(final_fullname):
if overwrite:
os.remove(final_fullname)
else:
write('Skipping "%s". Use `--overwrite`' % final_fullname)
continue
overwrite = overwrite or None
write('Downloading "%s" as "%s"' % (final_fullname, temporary_basename))
write('Downloading "%s"' % fullname)
downloady.download_file(
url,
localname=temporary_fullname,
localname=fullname,
bytespersecond=bytespersecond,
callback_progress=downloady.progress2,
overwrite=overwrite
)
os.rename(temporary_fullname, final_fullname)
def download_argparse(args):
return download(

View File

@ -7,7 +7,6 @@ class Path:
def __init__(self, path):
path = os.path.normpath(path)
path = os.path.abspath(path)
path = get_path_casing(path)
self.absolute_path = path
def __contains__(self, other):
@ -23,6 +22,10 @@ class Path:
def basename(self):
return os.path.basename(self.absolute_path)
def correct_case(self):
self.absolute_path = get_path_casing(self.absolute_path)
return self.absolute_path
@property
def exists(self):
return os.path.exists(self.absolute_path)

View File

@ -14,6 +14,7 @@ sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\SpinalTap'); import spinal
FILE_READ_CHUNK = bytestring.MIBIBYTE
RATELIMITER = ratelimiter.Ratelimiter(16 * bytestring.MIBIBYTE)
# The paths which the user may access.
# Attempting to access anything outside will 403.
@ -98,6 +99,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
if isinstance(data, types.GeneratorType):
for chunk in data:
self.wfile.write(chunk)
RATELIMITER.limit(len(chunk))
else:
self.wfile.write(data)

View File

@ -350,16 +350,17 @@ def copy_file(
source = str_to_fp(source)
if not source.is_file:
raise SourceNotFile(source)
if destination_new_root is not None:
source.correct_case()
destination = new_root(source, destination_new_root)
destination = str_to_fp(destination)
callback = callback or do_nothing
callback_verbose = callback_verbose or do_nothing
if not source.is_file:
raise SourceNotFile(source)
if destination.is_dir:
raise DestinationIsDirectory(destination)

65
ThreadedDL/threaded_dl.py Normal file
View File

@ -0,0 +1,65 @@
import os
import sys
import threading
import time
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\downloady'); import downloady
def remove_finished(threads):
    '''
    Return a new list holding only those of `threads` for which is_alive()
    is true, in their original order.
    '''
    still_running = []
    for thread in threads:
        if thread.is_alive():
            still_running.append(thread)
    return still_running
def download_thread(url, filename_prefix=''):
    '''
    Download `url` into the current directory as `filename_prefix` followed
    by the url's basename.

    Does nothing if the url is blank after stripping whitespace, and skips
    the download with a message if the target file already exists.
    '''
    url = url.strip()
    if not url:
        return

    basename = filename_prefix + downloady.basename_from_url(url)
    if os.path.exists(basename):
        print('Skipping existing file "%s"' % basename)
        return

    print('Starting "%s"' % basename)
    downloady.download_file(url, basename)
    print('Finished "%s"' % basename)
def listget(li, index, fallback):
    '''
    Return li[index], or `fallback` if that index raises IndexError.
    '''
    try:
        value = li[index]
    except IndexError:
        value = fallback
    return value
def threaded_dl(urls, thread_count=4):
    '''
    Download every url in `urls`, running at most `thread_count` downloads at
    the same time, and return once all of them have finished.

    urls:
        A sized collection of url strings. Each download's filename is
        prefixed with the url's zero-padded position in this collection so
        that the files sort in the same order as the urls.
    thread_count:
        The maximum number of simultaneous download threads. Must be at
        least 1.

    Raises ValueError if `thread_count` is less than 1.
    '''
    if thread_count < 1:
        # With zero the throttle loop below could never free a slot and would
        # spin forever; a negative number would disable throttling entirely.
        raise ValueError('thread_count must be at least 1, got %r' % thread_count)

    threads = []
    prefix_digits = len(str(len(urls)))
    prefix_text = '%0{digits}d_'.format(digits=prefix_digits)
    for (index, url) in enumerate(urls):
        # Wait for a free slot before starting the next download.
        while len(threads) >= thread_count:
            threads = remove_finished(threads)
            time.sleep(0.1)

        prefix = prefix_text % index
        t = threading.Thread(target=download_thread, args=[url, prefix])
        # Daemon threads do not keep the interpreter alive if the main thread
        # exits early.
        t.daemon = True
        threads.append(t)
        t.start()

    # Poll until the remaining downloads have finished.
    while threads:
        threads = remove_finished(threads)
        time.sleep(0.1)
def main():
    '''
    Download every url named by sys.argv[1].

    If the argument is the path of an existing file, the urls are read from
    that file. Otherwise the argument is passed through clipext.resolve, so it
    may be a clipboard or stdin keyword, or the url text itself. In both cases
    the text is split on whitespace to get the individual urls.
    '''
    source = sys.argv[1]
    if os.path.isfile(source):
        with open(source, 'r') as handle:
            text = handle.read()
    else:
        text = clipext.resolve(source)
    threaded_dl(text.split())
if __name__ == '__main__':
main()