master
Ethan Dalool 2016-08-17 18:24:38 -07:00
parent c491e417f5
commit 82f63a75ab
14 changed files with 501 additions and 264 deletions

8
Clipext/README.md Normal file
View File

@ -0,0 +1,8 @@
Clip Extension
==============
This module works with pyperclip to provide some handy features for commandline utilities.
Instead of having the user paste text into the commandline to run your script, just let them enter `script.py !c` and resolve it automatically. Pasting into the cmd on Windows is annoying and requires a mouse-click so this can be very convenient.
Since "!i" resolves to user input, your script can accept piping with `ls | script.py !i`.

31
Clipext/clipext.py Normal file
View File

@ -0,0 +1,31 @@
import pyperclip
# Argument strings that resolve() replaces with the clipboard contents.
CLIPBOARD_STRINGS = ['!c', '!clip', '!clipboard']
# Argument strings that resolve() replaces with interactive stdin input.
INPUT_STRINGS = ['!i', '!in', '!input', '!stdin']
# Substitute character (ctrl-z) used as the end-of-input marker in
# multi_line_input(). NOTE(review): this is the Windows console EOF key.
EOF = '\x1a'
def multi_line_input():
    '''
    Read lines from stdin until the EOF character (ctrl-z) appears or
    input() raises EOFError.

    Returns the collected text up to (but not including) the EOF marker,
    stripped of surrounding whitespace.
    '''
    collected = []
    while True:
        try:
            line = input()
        except EOFError:
            # Pressing ctrl-z on an otherwise empty line raises EOFError
            # instead of yielding the character, so substitute it here.
            line = EOF
        collected.append(line)
        if EOF in line:
            break
    text = '\n'.join(collected)
    # Drop the EOF marker and anything typed after it on the same line.
    text = text.split(EOF)[0]
    return text.strip()
def resolve(arg):
    '''
    Expand a shortcut argument: clipboard strings become the clipboard
    contents, input strings become multi-line stdin input, and anything
    else is returned unchanged.
    '''
    key = arg.lower()
    shortcuts = (
        (CLIPBOARD_STRINGS, pyperclip.paste),
        (INPUT_STRINGS, multi_line_input),
    )
    for (triggers, getter) in shortcuts:
        if key in triggers:
            return getter()
    return arg

6
Downloady/README.md Normal file
View File

@ -0,0 +1,6 @@
Downloady
=========
- 2016 08 16
- Downloady now uses temporary files for incomplete downloads, and renames them when finished. This helps distinguish downloads that were interrupted and should be resumed from files that just happen to have the same name, which previously would have been interpreted as a resume. This improves overall ease-of-use, simplifies the behavior of the `overwrite` parameter, and will remove duplicate work from other programs.
- Rewrote the plan creator and download function to do a better job of separating concerns and simplify the plan selector.

View File

@ -1,27 +1,28 @@
import sys
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
import argparse import argparse
import os import os
import pyperclip # pip install pyperclip import pyperclip # pip install pyperclip
import requests import requests
import sys
import time import time
import urllib import urllib
import warnings import warnings
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
warnings.simplefilter('ignore') warnings.simplefilter('ignore')
HEADERS = { HEADERS = {
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36' 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'
} }
SLEEPINESS = 3
FILENAME_BADCHARS = '*?"<>|' FILENAME_BADCHARS = '*?"<>|'
last_request = 0 last_request = 0
CHUNKSIZE = 16 * bytestring.KIBIBYTE CHUNKSIZE = 16 * bytestring.KIBIBYTE
STOP = False
TIMEOUT = 600 TIMEOUT = 600
TEMP_EXTENSION = '.downloadytemp'
def basename_from_url(url): def basename_from_url(url):
''' '''
@ -32,81 +33,6 @@ def basename_from_url(url):
localname = localname.split('/')[-1] localname = localname.split('/')[-1]
return localname return localname
def determine_seek_and_range(
file_handle,
localname,
local_exists,
overwrite,
remote_total_bytes,
server_respects_range,
user_provided_range,
user_range_min,
user_range_max,
):
''' THINGS THAT CAN HAPPEN '''
seek_to = 0
header_range_min = None
header_range_max = None
if local_exists:
local_existing_bytes = os.path.getsize(localname)
if overwrite is True:
file_handle.truncate()
if user_provided_range:
header_range_min = user_range_min
header_range_max = user_range_max
seek_to = user_range_min
elif not user_provided_range:
pass
elif overwrite is None:
if local_existing_bytes == remote_total_bytes:
print('File is 100%. Nothing to do.')
return
if user_provided_range:
if server_respects_range:
seek_to = user_range_min
else:
raise Exception('The server did not respect your range header')
elif not user_provided_range:
if server_respects_range:
print('Resuming from byte %d' % local_existing_bytes)
header_range_min = local_existing_bytes
header_range_max = ''
seek_to = local_existing_bytes
else:
print('File exists, but server doesn\'t allow resumes. Restart from 0?')
permission = get_permission()
if permission:
file_handle.truncate()
else:
raise Exception('Couldn\'t resume')
else:
raise TypeError('Invalid value for `overwrite`. Must be True, False, or None')
elif not local_exists:
if user_provided_range:
if server_respects_range:
file_handle.seek(user_range_min)
file_handle.write(b'\0')
header_range_min = user_range_min
header_range_max = user_range_max
seek_to = user_range_min
else:
raise Exception('The server did not respect your range header')
elif not user_provided_range:
pass
return (seek_to, header_range_min, header_range_max)
def download_file( def download_file(
url, url,
localname=None, localname=None,
@ -114,33 +40,103 @@ def download_file(
bytespersecond=None, bytespersecond=None,
callback_progress=None, callback_progress=None,
headers=None, headers=None,
overwrite=None overwrite=False,
verbose=False,
): ):
if headers is None: headers = headers or {}
headers = {}
''' Determine local filename ''' url = sanitize_url(url)
url = url.replace('%3A//', '://')
if localname in [None, '']: if localname in [None, '']:
localname = basename_from_url(url) localname = basename_from_url(url)
localname = sanitize_filename(localname)
localname = filepath_sanitize(localname) if verbose:
print(url)
plan = prepare_plan(
url,
localname,
auth=auth,
bytespersecond=bytespersecond,
headers=headers,
overwrite=overwrite,
)
#print(plan)
if plan is None:
return
localname = plan['download_into']
directory = os.path.split(localname)[0] directory = os.path.split(localname)[0]
if directory != '': if directory != '':
os.makedirs(directory, exist_ok=True) os.makedirs(directory, exist_ok=True)
touch(localname)
file_handle = open(localname, 'r+b')
file_handle.seek(plan['seek_to'])
if plan['header_range_min'] is not None:
headers['range'] = 'bytes={min}-{max}'.format(
min=plan['header_range_min'],
max=plan['header_range_max'],
)
if plan['plan_type'] == 'resume':
bytes_downloaded = plan['seek_to']
else:
bytes_downloaded = 0
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
bytes_downloaded += len(chunk)
file_handle.write(chunk)
if callback_progress is not None:
callback_progress(bytes_downloaded, plan['remote_total_bytes'])
if plan['limiter'] is not None and bytes_downloaded < plan['remote_total_bytes']:
plan['limiter'].limit(len(chunk))
file_handle.close()
if localname != plan['real_localname']:
os.rename(localname, plan['real_localname'])
localsize = os.path.getsize(plan['real_localname'])
if plan['plan_type'] != 'partial' and localsize < plan['remote_total_bytes']:
message = 'File does not contain expected number of bytes. Received {size} / {total}'
message = message.format(size=os.path.getsize(localname), total=plan['remote_total_bytes'])
raise Exception(message)
return plan['real_localname']
def prepare_plan(
url,
localname,
auth,
bytespersecond,
headers,
overwrite,
):
# Chapter 1: File existence
user_provided_range = 'range' in headers
real_localname = localname
temp_localname = localname + TEMP_EXTENSION
real_exists = os.path.exists(real_localname)
if real_exists and overwrite is False and not user_provided_range:
print('File exists and overwrite is off. Nothing to do.')
return None
temp_exists = os.path.exists(temp_localname)
real_localsize = int(real_exists and os.path.getsize(real_localname))
temp_localsize = int(temp_exists and os.path.getsize(temp_localname))
# Chapter 2: Ratelimiting
if bytespersecond is None: if bytespersecond is None:
limiter = None limiter = None
elif isinstance(bytespersecond, ratelimiter.Ratelimiter):
limiter = bytespersecond
else: else:
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1) limiter = ratelimiter.Ratelimiter(bytespersecond)
''' Prepare plan variables ''' # Chapter 3: Extracting range
local_exists = os.path.exists(localname)
if local_exists and overwrite is False:
print('Overwrite off. Nothing to do.')
return
user_provided_range = 'range' in headers
if user_provided_range: if user_provided_range:
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0]) user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
user_range_max = headers['range'].split('-')[1] user_range_max = headers['range'].split('-')[1]
@ -150,71 +146,88 @@ def download_file(
user_range_min = None user_range_min = None
user_range_max = None user_range_max = None
# Chapter 4: Server range support
# Always include a range on the first request to figure out whether the # Always include a range on the first request to figure out whether the
# server supports it. Use 0- so we get the right `remote_total_bytes`. # server supports it. Use 0- to get correct remote_total_bytes
temp_headers = headers temp_headers = headers
temp_headers.update({'range': 'bytes=0-'}) temp_headers.update({'range': 'bytes=0-'})
# I'm using a GET instead of an actual HEAD here because some servers respond # I'm using a GET instead of an actual HEAD here because some servers respond
# differently, even though they're not supposed to. # differently, even though they're not supposed to.
head = request('get', url, stream=True, headers=temp_headers, auth=auth) head = request('get', url, stream=True, headers=temp_headers, auth=auth)
remote_total_bytes = int(head.headers.get('content-length', 1)) remote_total_bytes = int(head.headers.get('content-length', 0))
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers) server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
head.connection.close() head.connection.close()
touch(localname) if user_provided_range and not server_respects_range:
file_handle = open(localname, 'r+b') raise Exception('Server did not respect your range header')
file_handle.seek(0)
plan = determine_seek_and_range( # Chapter 5: Plan definitions
file_handle=file_handle, plan_base = {
localname=localname, 'limiter': limiter,
local_exists=local_exists, 'real_localname': real_localname,
overwrite=overwrite, 'remote_total_bytes': remote_total_bytes,
remote_total_bytes=remote_total_bytes, }
server_respects_range=server_respects_range, plan_fulldownload = dict(
user_provided_range=user_provided_range, plan_base,
user_range_min=user_range_min, download_into=temp_localname,
user_range_max=user_range_max, header_range_min=None,
header_range_max=None,
plan_type='fulldownload',
seek_to=0,
)
plan_resume = dict(
plan_base,
download_into=temp_localname,
header_range_min=temp_localsize,
header_range_max='',
plan_type='resume',
seek_to=temp_localsize,
)
plan_partial = dict(
plan_base,
download_into=real_localname,
header_range_min=user_range_min,
header_range_max=user_range_max,
plan_type='partial',
seek_to=user_range_min,
) )
if plan is None:
return
(seek_to, header_range_min, header_range_max) = plan # Chapter 6: Redeem your meal vouchers here
if header_range_min is not None: if real_exists:
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max) if overwrite:
os.remove(real_localname)
bytes_downloaded = seek_to if user_provided_range:
file_handle.seek(seek_to) return plan_partial
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
''' Begin download ''' return plan_fulldownload
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
bytes_downloaded += len(chunk)
file_handle.write(chunk)
if callback_progress is not None:
callback_progress(bytes_downloaded, remote_total_bytes)
if limiter is not None and bytes_downloaded < remote_total_bytes: elif temp_exists and temp_localsize > 0:
limiter.limit(len(chunk)) if overwrite:
return plan_fulldownload
file_handle.close() if user_provided_range:
return localname return plan_partial
def filepath_sanitize(text, exclusions=''): if server_respects_range:
bet = FILENAME_BADCHARS.replace(exclusions, '') print('Resume from byte %d' % plan_resume['seek_to'])
for char in bet: return plan_resume
text = text.replace(char, '')
return text
else:
if user_provided_range:
return plan_partial
return plan_fulldownload
print('No plan was chosen?')
return None
def get_permission(prompt='y/n\n>', affirmative=['y', 'yes']): def get_permission(prompt='y/n\n>', affirmative=['y', 'yes']):
permission = input(prompt) permission = input(prompt)
return permission.lower() in affirmative return permission.lower() in affirmative
def is_clipboard(s): def progress1(bytes_downloaded, bytes_total, prefix=''):
return s.lower() in ['!c', '!clip', '!clipboard']
def progress(bytes_downloaded, bytes_total, prefix=''):
divisor = bytestring.get_appropriate_divisor(bytes_total) divisor = bytestring.get_appropriate_divisor(bytes_total)
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor) bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
bytes_downloaded_string = bytestring.bytestring(bytes_downloaded, force_unit=divisor) bytes_downloaded_string = bytestring.bytestring(bytes_downloaded, force_unit=divisor)
@ -278,6 +291,16 @@ def request(method, url, stream=False, headers=None, timeout=TIMEOUT, **kwargs):
req.raise_for_status() req.raise_for_status()
return req return req
def sanitize_filename(text, exclusions=''):
bet = FILENAME_BADCHARS.replace(exclusions, '')
for char in bet:
text = text.replace(char, '')
return text
def sanitize_url(url):
url = url.replace('%3A//', '://')
return url
def touch(filename): def touch(filename):
f = open(filename, 'ab') f = open(filename, 'ab')
f.close() f.close()
@ -286,26 +309,14 @@ def touch(filename):
def download_argparse(args): def download_argparse(args):
url = args.url url = args.url
if is_clipboard(url):
url = pyperclip.paste()
print(url)
overwrite = { url = clipext.resolve(url)
'y':True, 't':True,
'n':False, 'f':False,
}.get(args.overwrite.lower(), None)
callback = { callback = {
None: progress, None: progress1,
'1': progress, '1': progress1,
'2': progress2, '2': progress2,
}.get(args.callback, None) }.get(args.callback, args.callback)
callback = args.callback
if callback == '1':
callback = progress
if callback == '2':
callback = progress2
bytespersecond = args.bytespersecond bytespersecond = args.bytespersecond
if bytespersecond is not None: if bytespersecond is not None:
@ -321,20 +332,21 @@ def download_argparse(args):
bytespersecond=bytespersecond, bytespersecond=bytespersecond,
callback_progress=callback, callback_progress=callback,
headers=headers, headers=headers,
overwrite=overwrite, overwrite=args.overwrite,
verbose=True,
) )
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
#p_download_file = subparsers.add_parser('download_file')
parser.add_argument('url') parser.add_argument('url')
parser.add_argument('localname', nargs='?', default=None) parser.add_argument('localname', nargs='?', default=None)
parser.add_argument('-c', '--callback', dest='callback', default=progress) parser.add_argument('-c', '--callback', dest='callback', default=progress1)
parser.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None) parser.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
parser.add_argument('-ow', '--overwrite', dest='overwrite', default='') parser.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
parser.add_argument('-r', '--range', dest='range', default=None) parser.add_argument('-r', '--range', dest='range', default=None)
parser.set_defaults(func=download_argparse) parser.set_defaults(func=download_argparse)
args = parser.parse_args() args = parser.parse_args()
args.func(args) args.func(args)

View File

@ -1,45 +0,0 @@
THINGS THAT CAN HAPPEN
├───File exists
│ ├───User disables overwrite
│ │ └───Return because there's nothing to do
│ │
│ ├───User enables overwrite
│ │ ├───User requests range
│ │ │ └───Raise exception because requesting a range and forcing overwrite are mutually exclusive
│ │ │
│ │ └───User does not request range
│ │ └───File opened, truncated, full download
│ │
│ └───User does not specify overwrite
│ ├───File is same size as content-length
│ │ └───Return because there's nothing to do.
│ │
│ ├───User requests range
│ │ ├───Server respects range
│ │ │ └───File opened, seeked to request, bytes filled in
│ │ │
│ │ └───Server does not respect range
│ │ └───Raise exception because user's request can't be fulfilled
│ │
│ └───User does not request range
│ ├───Server respects range
│ │ └───File is opened, seeked to end, download resumes
│ │
│ └───Server does not respect range
│ └───Ask for permission to overwrite from beginning
└───File does not exist
├───User requests range
│ ├───Server respects range
│ │ └───File created, seeked to request, bytes filled in. everything else left 0
│ └───Server does not respect range
│ └───Raise exception because user's request can't be fulfilled
└───User does not request range
└───File created, full download
Possible ambiguity: If the user requests a range, and the file does not exist, does he want:
1. to fill the file with zeroes, and patch the requested bytes into their correct spot; or
2. to create the file empty, and only write the requested bytes?
I will assume #1 because that plays nicely with other Things That Can Happen, such as letting the user patch the other bytes in later.

157
Instathief/instathief.py Normal file
View File

@ -0,0 +1,157 @@
import argparse
import bs4
import datetime
import json
import os
import requests
import sys
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\downloady'); import downloady
''' '''
# strftime used for filenames when downloading
STRFTIME = '%Y%m%d-%H%M%S'
# Public profile page; scraped by get_user_info for ids / cursors / cookies.
URL_PROFILE = 'https://www.instagram.com/{username}'
# Query endpoint POSTed to by get_page for paginated media listings.
URL_QUERY = 'https://www.instagram.com/query/'
# Template for the media query body. The replace() calls strip the literal's
# newlines and spaces so the request carries a compact one-line query.
PAGE_QUERY_TEMPLATE = '''
ig_user({user_id})
{{
media.after({end_cur}, {count})
{{
count,
nodes
{{
code,
date,
display_src,
id,
video_url
}},
page_info
}}
}}
'''.replace('\n', '').replace(' ', '')
USERAGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
''' '''
# Most recent cookie jar seen by get_user_info; module-level for reuse.
last_cookie = None
def download_media(media_list):
    '''
    Download every item in `media_list` (dicts with 'created' and 'url'
    keys) via downloady, naming each local file after its creation
    timestamp plus the url's original extension.
    '''
    for item in media_list:
        created = datetime.datetime.utcfromtimestamp(item['created'])
        stamp = created.strftime(STRFTIME)
        remote_basename = downloady.basename_from_url(item['url'])
        extension = os.path.splitext(remote_basename)[1]
        downloady.download_file(
            url=item['url'],
            localname=stamp + extension,
            callback_progress=downloady.progress2,
            overwrite=False,
        )
def get_page(user_id, end_cur, count, cookies):
    '''
    POST one paginated media query for `user_id`, asking for `count` items
    after the `end_cur` cursor, and return the decoded JSON response.

    `cookies` must contain a 'csrftoken' entry, which is echoed back in
    the x-csrftoken header.
    '''
    request_headers = {
        'referer': 'https://www.instagram.com/',
        'user-agent': USERAGENT,
        'x-csrftoken': cookies['csrftoken'],
    }
    payload = {
        'q': PAGE_QUERY_TEMPLATE.format(
            count=count,
            end_cur=end_cur,
            user_id=user_id,
        ),
        'ref': 'users::show',
    }
    response = requests.post(
        url=URL_QUERY,
        cookies=cookies,
        data=payload,
        headers=request_headers,
    )
    response.raise_for_status()
    return response.json()
def get_user_info(username):
    '''
    Scrape the user's public profile page for the pieces needed to run
    paginated queries.

    Returns a dict with keys:
        'user_id' - the account's numeric id
        'end_cur' - pagination cursor for get_page, or None if the profile
                    fits on one page
        'cookies' - the response's cookie jar (holds the csrftoken)

    Also stores the cookie jar in the module-level `last_cookie`.
    Raises if the page lacks the expected window._sharedData script.
    '''
    global last_cookie
    url = URL_PROFILE.format(username=username)
    response = requests.get(url)
    response.raise_for_status()
    text = response.text
    soup = bs4.BeautifulSoup(text, 'html.parser')
    scripts = soup.find_all('script')
    # for/else: the else fires only if no script matched.
    for script in scripts:
        if 'window._sharedData' in script.text:
            break
    else:
        raise Exception('Did not find expected javascript')
    # The script body is "window._sharedData = {...};" — strip the
    # assignment and trailing semicolon, leaving bare JSON.
    user_data = script.text
    user_data = user_data.split('window._sharedData = ')[1].rstrip(';').strip()
    user_data = json.loads(user_data)
    user_data = user_data['entry_data']['ProfilePage'][0]['user']
    user_id = user_data['id']
    page_info = user_data['media']['page_info']
    if page_info['has_next_page']:
        end_cur = page_info['start_cursor']
        # Minus 1 because the queries use "after" parameters for pagination, and
        # if we just take this cursor then we will only get items after it.
        end_cur = int(end_cur) - 1
    else:
        end_cur = None
    user_data = {
        'user_id': user_id,
        'end_cur': end_cur,
        'cookies': response.cookies,
    }
    last_cookie = response.cookies
    return user_data
def get_user_media(username):
    '''
    Generator yielding one {'created': timestamp, 'url': media_url} dict
    per post on the user's profile, following pagination until the last
    page is reached.
    '''
    info = get_user_info(username)
    cursor = info.pop('end_cur')
    while True:
        media = get_page(count=50, end_cur=cursor, **info)['media']
        for node in media['nodes']:
            yield {
                # Prefer the video url when present, otherwise the image.
                'created': node['date'],
                'url': node.get('video_url') or node.get('display_src'),
            }
        pagination = media['page_info']
        if not pagination['has_next_page']:
            break
        cursor = pagination['end_cursor']
def main():
    '''
    Commandline entry point: print the media url of every post on the
    profile named by the first argument.
    '''
    username = sys.argv[1]
    for item in get_user_media(username):
        # Bugfix: get_user_media yields dicts, so the old
        # `for (timestamp, url) in media` loop unpacked each dict's KEYS,
        # printing the literal string 'url' for every post. Index the
        # dict instead.
        print(item['url'])

if __name__ == '__main__':
    main()

View File

@ -138,6 +138,7 @@ function create_odi_div(url)
if (paramless_url.match(IMAGE_TYPES)) if (paramless_url.match(IMAGE_TYPES))
{ {
console.log("Creating image div for " + paramless_url);
var div = document.createElement("div"); var div = document.createElement("div");
div.id = generate_id(32); div.id = generate_id(32);
div.className = "odi_image_div"; div.className = "odi_image_div";
@ -194,6 +195,7 @@ function create_odi_div(url)
{ {
return null; return null;
} }
console.log("Creating " + mediatype + " div for " + paramless_url);
var div = document.createElement("div"); var div = document.createElement("div");
div.id = generate_id(32); div.id = generate_id(32);
@ -253,6 +255,7 @@ function create_odi_div(url)
} }
function create_odi_divs(urls) function create_odi_divs(urls)
{ {
console.log("Creating odi divs");
image_divs = []; image_divs = [];
media_divs = []; media_divs = [];
odi_divs = []; odi_divs = [];
@ -332,6 +335,7 @@ function create_workspace()
control_panel.appendChild(ingest_div); control_panel.appendChild(ingest_div);
control_panel.appendChild(start_button); control_panel.appendChild(start_button);
document.body.appendChild(workspace); document.body.appendChild(workspace);
console.log("finished workspace");
} }
function delete_odi_div(element) function delete_odi_div(element)
@ -430,6 +434,7 @@ function filter_re(pattern, do_delete)
function format_css() function format_css()
{ {
console.log("Formatting CSS variables");
var css = CSS; var css = CSS;
while (true) while (true)
{ {
@ -438,22 +443,24 @@ function format_css()
{ {
break; break;
} }
console.log(matches);
matches = Array.from(new Set(matches)); matches = new Set(matches);
for (var index = 0; index < matches.length; index += 1) /* Originally used Array.from(set) and did regular iteration, but I found
{ that sites can override and break that conversion. */
var injector = matches[index]; matches.forEach(
var injected = injector.replace(new RegExp("\\$", 'g'), ""); function(injector)
/*console.log(injector);*/ {
/*console.log(injected);*/ var injected = injector.replace(new RegExp("\\$", 'g'), "");
css = css.replace(injector, this[injected]); css = css.replace(injector, this[injected]);
} }
);
} }
return css; return css;
} }
function get_all_urls() function get_all_urls()
{ {
console.log("Collecting urls");
var urls = []; var urls = [];
function include(source, attr) function include(source, attr)
{ {
@ -529,6 +536,7 @@ function get_basename(url)
function get_gfycat_video(id) function get_gfycat_video(id)
{ {
console.log("Resolving gfycat " + id);
var url = "https://gfycat.com/cajax/get/" + id; var url = "https://gfycat.com/cajax/get/" + id;
var request = new XMLHttpRequest(); var request = new XMLHttpRequest();
request.answer = null; request.answer = null;
@ -595,6 +603,7 @@ function generate_id(length)
function ingest() function ingest()
{ {
/* Take the text from the INGEST box, and make odi divs from it */ /* Take the text from the INGEST box, and make odi divs from it */
console.log("Ingesting");
var odi_divs = get_odi_divs(); var odi_divs = get_odi_divs();
var ingestbox = document.getElementById("ingestbox"); var ingestbox = document.getElementById("ingestbox");
var text = ingestbox.value; var text = ingestbox.value;
@ -622,6 +631,7 @@ function ingest()
function lazy_load_all() function lazy_load_all()
{ {
console.log("Starting lazyload");
lazies = get_lazy_divs(); lazies = get_lazy_divs();
lazies.reverse(); lazies.reverse();
lazy_buttons = document.getElementsByClassName("load_button"); lazy_buttons = document.getElementsByClassName("load_button");

View File

@ -91,8 +91,8 @@ function swap_source(player, source_list)
function main() function main()
{ {
var WIDTH = 3; var WIDTH = 2;
var HEIGHT = 3; var HEIGHT = 2;
var MEDIAS = get_media_links(); var MEDIAS = get_media_links();
clear_page(); clear_page();

View File

@ -1,49 +1,52 @@
Open Dir DL Open Dir DL
=========== ===========
The open directory downloader The open directory downloader.
Requires `pip install beautifulsoup4` Requires `pip install beautifulsoup4`.
See inside opendirdl.py for usage instructions. See inside opendirdl.py for usage instructions.
- 2016 08 16
- **[cleanup]** Now that Downloady uses temp files for incomplete downloads, that logic can be removed from opendirdl.
- 2016 08 10 - 2016 08 10
- Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception. - **[addition]** Added clickable links to each directory on HTML tree pages.
- Fixed bug in smart_insert where 404'd URLs were not being deleted from the database. - **[bugfix]** Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception.
- Added clickable links to each directory on HTML tree pages. - **[bugfix]** Fixed bug in smart_insert where 404'd URLs were not being deleted from the database.
- 2016 08 02 - 2016 08 02
- Removed the usage of div IDs on the Tree pages by making the collapse button use `this.nextSibling`. - **[cleanup]** Removed the need for div IDs on the Tree pages by making the collapse button use `this.nextSibling`.
- Rewrote `build_file_tree` with a way simpler algorithm. - **[cleanup]** Rewrote `build_file_tree` with a way simpler algorithm.
- Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway. - **[removal]** Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway.
- 2016 08 01 - 2016 08 01
- Made the digest work even if you forget the http:// - **[addition]** Made the digest work even if you forget the http://
- 2016 07 29 - 2016 07 29
- Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command. - **[change]** Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
- Renamed `Tree.listnodes` to `Tree.list_children` and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary. - **[change]** Renamed `Tree.listnodes` to `Tree.list_children`, and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
- Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both. - **[change]** Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
- Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed. - **[cleanup]** Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
- Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work. - **[removal]** Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
- 2016 07 25 - 2016 07 25
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes. - **[change]** Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc. - **[removal]** Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
- 2016 07 19 - 2016 07 19
- Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes - **[addition]** Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception.
- Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!* - **[bugfix]** Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck.
- Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception. - **[cleanup]** Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes
- Fixed some mismatched code vs comments - **[cleanup]** Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!*
- Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck. - **[cleanup]** Fixed some mismatched code vs comments
- 2016 07 08 - 2016 07 08
- Fixed bug in which trees wouldn't generate on server:port urls. - **[bugfix]** Fixed bug in which trees wouldn't generate on server:port urls.
- 2016 07 04 - 2016 07 04
- Added new argparse command "tree" - **[addition]** Added new argparse command "tree"
- 2016 02 08 - 2016 02 08
- Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash. - **[bugfix]** Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
- Moved db commits to only happen at the end of a digest. - **[change]** Moved db commits to only happen at the end of a digest.

View File

@ -614,7 +614,7 @@ def fetch_generator(cur):
def filepath_sanitize(text, allowed=''): def filepath_sanitize(text, allowed=''):
badchars = FILENAME_BADCHARS badchars = FILENAME_BADCHARS
badchars = ''.join(char for char in FILENAME_BADCHARS if char not in allowed) badchars = set(char for char in FILENAME_BADCHARS if char not in allowed)
text = ''.join(char for char in text if char not in badchars) text = ''.join(char for char in text if char not in badchars)
return text return text
@ -886,32 +886,16 @@ def download(
folder = os.path.join(outputdir, url_filepath['folder']) folder = os.path.join(outputdir, url_filepath['folder'])
os.makedirs(folder, exist_ok=True) os.makedirs(folder, exist_ok=True)
final_fullname = os.path.join(folder, url_filepath['filename']) fullname = os.path.join(folder, url_filepath['filename'])
temporary_basename = hashit(url, 16) + '.oddltemporary'
temporary_fullname = os.path.join(folder, temporary_basename)
# Because we use .oddltemporary files, the behavior of `overwrite` here write('Downloading "%s"' % fullname)
# is different than the behavior of `overwrite` in downloady.
# The overwrite used in the following block refers to the finalized file.
# The overwrite passed to downloady refers to the oddltemporary which
# may be resumed.
if os.path.isfile(final_fullname):
if overwrite:
os.remove(final_fullname)
else:
write('Skipping "%s". Use `--overwrite`' % final_fullname)
continue
overwrite = overwrite or None
write('Downloading "%s" as "%s"' % (final_fullname, temporary_basename))
downloady.download_file( downloady.download_file(
url, url,
localname=temporary_fullname, localname=fullname,
bytespersecond=bytespersecond, bytespersecond=bytespersecond,
callback_progress=downloady.progress2, callback_progress=downloady.progress2,
overwrite=overwrite overwrite=overwrite
) )
os.rename(temporary_fullname, final_fullname)
def download_argparse(args): def download_argparse(args):
return download( return download(

View File

@ -7,7 +7,6 @@ class Path:
def __init__(self, path): def __init__(self, path):
path = os.path.normpath(path) path = os.path.normpath(path)
path = os.path.abspath(path) path = os.path.abspath(path)
path = get_path_casing(path)
self.absolute_path = path self.absolute_path = path
def __contains__(self, other): def __contains__(self, other):
@ -23,6 +22,10 @@ class Path:
def basename(self): def basename(self):
return os.path.basename(self.absolute_path) return os.path.basename(self.absolute_path)
def correct_case(self):
self.absolute_path = get_path_casing(self.absolute_path)
return self.absolute_path
@property @property
def exists(self): def exists(self):
return os.path.exists(self.absolute_path) return os.path.exists(self.absolute_path)

View File

@ -14,6 +14,7 @@ sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
sys.path.append('C:\\git\\else\\SpinalTap'); import spinal sys.path.append('C:\\git\\else\\SpinalTap'); import spinal
FILE_READ_CHUNK = bytestring.MIBIBYTE FILE_READ_CHUNK = bytestring.MIBIBYTE
RATELIMITER = ratelimiter.Ratelimiter(16 * bytestring.MIBIBYTE)
# The paths which the user may access. # The paths which the user may access.
# Attempting to access anything outside will 403. # Attempting to access anything outside will 403.
@ -98,6 +99,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
if isinstance(data, types.GeneratorType): if isinstance(data, types.GeneratorType):
for chunk in data: for chunk in data:
self.wfile.write(chunk) self.wfile.write(chunk)
RATELIMITER.limit(len(chunk))
else: else:
self.wfile.write(data) self.wfile.write(data)

View File

@ -350,16 +350,17 @@ def copy_file(
source = str_to_fp(source) source = str_to_fp(source)
if not source.is_file:
raise SourceNotFile(source)
if destination_new_root is not None: if destination_new_root is not None:
source.correct_case()
destination = new_root(source, destination_new_root) destination = new_root(source, destination_new_root)
destination = str_to_fp(destination) destination = str_to_fp(destination)
callback = callback or do_nothing callback = callback or do_nothing
callback_verbose = callback_verbose or do_nothing callback_verbose = callback_verbose or do_nothing
if not source.is_file:
raise SourceNotFile(source)
if destination.is_dir: if destination.is_dir:
raise DestinationIsDirectory(destination) raise DestinationIsDirectory(destination)

65
ThreadedDL/threaded_dl.py Normal file
View File

@ -0,0 +1,65 @@
import os
import sys
import threading
import time
sys.path.append('C:\\git\\else\\clipext'); import clipext
sys.path.append('C:\\git\\else\\downloady'); import downloady
def remove_finished(threads):
    '''
    Return only the threads from `threads` that are still running.
    '''
    alive = []
    for thread in threads:
        if thread.is_alive():
            alive.append(thread)
    return alive
def download_thread(url, filename_prefix=''):
    '''
    Download a single url to the current directory, skipping it if a file
    with the target name already exists. Meant to run as the target of a
    threading.Thread.
    '''
    url = url.strip()
    if not url:
        # Blank lines from the url source are silently ignored.
        return

    basename = filename_prefix + downloady.basename_from_url(url)
    if os.path.exists(basename):
        print('Skipping existing file "%s"' % basename)
        return

    print('Starting "%s"' % basename)
    downloady.download_file(url, basename)
    print('Finished "%s"' % basename)
def listget(li, index, fallback):
    '''
    Return li[index], or `fallback` if that index does not exist.
    '''
    try:
        value = li[index]
    except IndexError:
        value = fallback
    return value
def threaded_dl(urls, thread_count=4):
    '''
    Download every url in `urls`, running at most `thread_count`
    downloads simultaneously.

    Each file is prefixed with its zero-padded index in the url list, so
    the files sort on disk in their original order.
    '''
    # Pad the index to the width of the largest one, e.g. '%03d_' when
    # there are a few hundred urls. The %-placeholder is filled per-url.
    digits = len(str(len(urls)))
    prefix_format = '%0{digits}d_'.format(digits=digits)

    pool = []
    for (index, url) in enumerate(urls):
        # Wait for a slot to open up before spawning another worker.
        while len(pool) == thread_count:
            pool = remove_finished(pool)
            time.sleep(0.1)

        worker = threading.Thread(
            target=download_thread,
            args=[url, prefix_format % index],
        )
        worker.daemon = True
        pool.append(worker)
        worker.start()

    # Wait for the stragglers to finish.
    while pool:
        pool = remove_finished(pool)
        time.sleep(0.1)
def main():
    '''
    Entry point: interpret argv[1] as the source of urls, expand it into
    a whitespace-separated list, and download them all.
    '''
    argument = sys.argv[1]
    if os.path.isfile(argument):
        # The argument names a file containing the urls.
        with open(argument, 'r') as handle:
            text = handle.read()
    else:
        # Otherwise let clipext resolve it: !c for clipboard, !i for
        # stdin, or the literal text itself.
        text = clipext.resolve(argument)
    threaded_dl(text.split())
# Run as a script: threaded_dl.py <url file, !c, !i, or literal url text>
if __name__ == '__main__':
    main()