else
This commit is contained in:
parent
c491e417f5
commit
82f63a75ab
14 changed files with 501 additions and 264 deletions
8
Clipext/README.md
Normal file
8
Clipext/README.md
Normal file
|
@ -0,0 +1,8 @@
|
|||
Clip Extension
|
||||
==============
|
||||
|
||||
This module works with pyperclip to provide some handy features for commandline utilities.
|
||||
|
||||
Instead of having the user paste text into the commandline to run your script, just let them enter `script.py !c` and resolve it automatically. Pasting into the cmd on Windows is annoying and requires a mouse-click so this can be very convenient.
|
||||
|
||||
Since "!i" resolves to user input, your script can accept piping with `ls | script.py !i`.
|
31
Clipext/clipext.py
Normal file
31
Clipext/clipext.py
Normal file
|
@ -0,0 +1,31 @@
|
|||
import pyperclip
|
||||
|
||||
CLIPBOARD_STRINGS = ['!c', '!clip', '!clipboard']
|
||||
INPUT_STRINGS = ['!i', '!in', '!input', '!stdin']
|
||||
EOF = '\x1a'
|
||||
|
||||
def multi_line_input():
    '''
    Collect lines from `input()` until one of them contains the EOF marker
    (or `input()` raises EOFError), then return the text entered before the
    marker with leading and trailing whitespace removed.
    '''
    lines = []
    finished = False
    while not finished:
        try:
            line = input()
        except EOFError:
            # Nothing was typed before ctrl-z, so stand in the marker itself.
            line = EOF

        lines.append(line)
        finished = EOF in line

    text = '\n'.join(lines)
    # Only the part before the first EOF marker is kept.
    (before_eof, _, _) = text.partition(EOF)
    return before_eof.strip()
|
||||
|
||||
def resolve(arg):
    '''
    Translate a commandline argument into the text it stands for.

    If `arg` (case-insensitively) is one of CLIPBOARD_STRINGS, return the
    clipboard contents from pyperclip. If it is one of INPUT_STRINGS, return
    the text read by `multi_line_input`. Any other argument is returned
    unchanged.
    '''
    token = arg.lower()
    if token in CLIPBOARD_STRINGS:
        resolved = pyperclip.paste()
    elif token in INPUT_STRINGS:
        resolved = multi_line_input()
    else:
        resolved = arg
    return resolved
|
6
Downloady/README.md
Normal file
6
Downloady/README.md
Normal file
|
@ -0,0 +1,6 @@
|
|||
Downloady
|
||||
=========
|
||||
|
||||
- 2016 08 16
|
||||
- Downloady now uses temporary files for incomplete downloads, and renames them when finished. This helps distinguish downloads that were interrupted and should be resumed from files that just happen to have the same name, which previously would have been interpreted as a resume. This improves overall ease-of-use, simplifies the behavior of the `overwrite` parameter, and will remove duplicate work from other programs.
|
||||
- Rewrote the plan creator and download function to do a better job of separating concerns and simplify the plan selector.
|
|
@ -1,27 +1,28 @@
|
|||
import sys
|
||||
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
|
||||
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import pyperclip # pip install pyperclip
|
||||
import requests
|
||||
import sys
|
||||
import time
|
||||
import urllib
|
||||
import warnings
|
||||
|
||||
sys.path.append('C:\\git\\else\\clipext'); import clipext
|
||||
sys.path.append('C:\\git\\else\\ratelimiter'); import ratelimiter
|
||||
sys.path.append('C:\\git\\else\\bytestring'); import bytestring
|
||||
|
||||
warnings.simplefilter('ignore')
|
||||
|
||||
HEADERS = {
|
||||
'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'
|
||||
}
|
||||
SLEEPINESS = 3
|
||||
|
||||
FILENAME_BADCHARS = '*?"<>|'
|
||||
|
||||
last_request = 0
|
||||
CHUNKSIZE = 16 * bytestring.KIBIBYTE
|
||||
STOP = False
|
||||
TIMEOUT = 600
|
||||
TEMP_EXTENSION = '.downloadytemp'
|
||||
|
||||
def basename_from_url(url):
|
||||
'''
|
||||
|
@ -32,81 +33,6 @@ def basename_from_url(url):
|
|||
localname = localname.split('/')[-1]
|
||||
return localname
|
||||
|
||||
def determine_seek_and_range(
|
||||
file_handle,
|
||||
localname,
|
||||
local_exists,
|
||||
overwrite,
|
||||
remote_total_bytes,
|
||||
server_respects_range,
|
||||
user_provided_range,
|
||||
user_range_min,
|
||||
user_range_max,
|
||||
):
|
||||
''' THINGS THAT CAN HAPPEN '''
|
||||
seek_to = 0
|
||||
header_range_min = None
|
||||
header_range_max = None
|
||||
if local_exists:
|
||||
local_existing_bytes = os.path.getsize(localname)
|
||||
if overwrite is True:
|
||||
file_handle.truncate()
|
||||
if user_provided_range:
|
||||
header_range_min = user_range_min
|
||||
header_range_max = user_range_max
|
||||
seek_to = user_range_min
|
||||
|
||||
elif not user_provided_range:
|
||||
pass
|
||||
|
||||
elif overwrite is None:
|
||||
if local_existing_bytes == remote_total_bytes:
|
||||
print('File is 100%. Nothing to do.')
|
||||
return
|
||||
|
||||
if user_provided_range:
|
||||
if server_respects_range:
|
||||
seek_to = user_range_min
|
||||
|
||||
else:
|
||||
raise Exception('The server did not respect your range header')
|
||||
|
||||
elif not user_provided_range:
|
||||
if server_respects_range:
|
||||
print('Resuming from byte %d' % local_existing_bytes)
|
||||
header_range_min = local_existing_bytes
|
||||
header_range_max = ''
|
||||
seek_to = local_existing_bytes
|
||||
|
||||
else:
|
||||
print('File exists, but server doesn\'t allow resumes. Restart from 0?')
|
||||
permission = get_permission()
|
||||
if permission:
|
||||
file_handle.truncate()
|
||||
|
||||
else:
|
||||
raise Exception('Couldn\'t resume')
|
||||
|
||||
else:
|
||||
raise TypeError('Invalid value for `overwrite`. Must be True, False, or None')
|
||||
|
||||
elif not local_exists:
|
||||
if user_provided_range:
|
||||
if server_respects_range:
|
||||
file_handle.seek(user_range_min)
|
||||
file_handle.write(b'\0')
|
||||
|
||||
header_range_min = user_range_min
|
||||
header_range_max = user_range_max
|
||||
seek_to = user_range_min
|
||||
|
||||
else:
|
||||
raise Exception('The server did not respect your range header')
|
||||
|
||||
elif not user_provided_range:
|
||||
pass
|
||||
return (seek_to, header_range_min, header_range_max)
|
||||
|
||||
def download_file(
|
||||
url,
|
||||
localname=None,
|
||||
|
@ -114,33 +40,103 @@ def download_file(
|
|||
bytespersecond=None,
|
||||
callback_progress=None,
|
||||
headers=None,
|
||||
overwrite=None
|
||||
overwrite=False,
|
||||
verbose=False,
|
||||
):
|
||||
if headers is None:
|
||||
headers = {}
|
||||
''' Determine local filename '''
|
||||
url = url.replace('%3A//', '://')
|
||||
headers = headers or {}
|
||||
|
||||
url = sanitize_url(url)
|
||||
if localname in [None, '']:
|
||||
localname = basename_from_url(url)
|
||||
localname = sanitize_filename(localname)
|
||||
|
||||
localname = filepath_sanitize(localname)
|
||||
if verbose:
|
||||
print(url)
|
||||
|
||||
plan = prepare_plan(
|
||||
url,
|
||||
localname,
|
||||
auth=auth,
|
||||
bytespersecond=bytespersecond,
|
||||
headers=headers,
|
||||
overwrite=overwrite,
|
||||
)
|
||||
#print(plan)
|
||||
if plan is None:
|
||||
return
|
||||
|
||||
localname = plan['download_into']
|
||||
directory = os.path.split(localname)[0]
|
||||
if directory != '':
|
||||
os.makedirs(directory, exist_ok=True)
|
||||
touch(localname)
|
||||
file_handle = open(localname, 'r+b')
|
||||
file_handle.seek(plan['seek_to'])
|
||||
|
||||
if plan['header_range_min'] is not None:
|
||||
headers['range'] = 'bytes={min}-{max}'.format(
|
||||
min=plan['header_range_min'],
|
||||
max=plan['header_range_max'],
|
||||
)
|
||||
|
||||
if plan['plan_type'] == 'resume':
|
||||
bytes_downloaded = plan['seek_to']
|
||||
else:
|
||||
bytes_downloaded = 0
|
||||
|
||||
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
|
||||
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
|
||||
bytes_downloaded += len(chunk)
|
||||
file_handle.write(chunk)
|
||||
if callback_progress is not None:
|
||||
callback_progress(bytes_downloaded, plan['remote_total_bytes'])
|
||||
|
||||
if plan['limiter'] is not None and bytes_downloaded < plan['remote_total_bytes']:
|
||||
plan['limiter'].limit(len(chunk))
|
||||
|
||||
file_handle.close()
|
||||
|
||||
if localname != plan['real_localname']:
|
||||
os.rename(localname, plan['real_localname'])
|
||||
|
||||
localsize = os.path.getsize(plan['real_localname'])
|
||||
if plan['plan_type'] != 'partial' and localsize < plan['remote_total_bytes']:
|
||||
message = 'File does not contain expected number of bytes. Received {size} / {total}'
|
||||
message = message.format(size=os.path.getsize(localname), total=plan['remote_total_bytes'])
|
||||
raise Exception(message)
|
||||
|
||||
return plan['real_localname']
|
||||
|
||||
def prepare_plan(
|
||||
url,
|
||||
localname,
|
||||
auth,
|
||||
bytespersecond,
|
||||
headers,
|
||||
overwrite,
|
||||
):
|
||||
# Chapter 1: File existence
|
||||
user_provided_range = 'range' in headers
|
||||
real_localname = localname
|
||||
temp_localname = localname + TEMP_EXTENSION
|
||||
real_exists = os.path.exists(real_localname)
|
||||
|
||||
if real_exists and overwrite is False and not user_provided_range:
|
||||
print('File exists and overwrite is off. Nothing to do.')
|
||||
return None
|
||||
temp_exists = os.path.exists(temp_localname)
|
||||
real_localsize = int(real_exists and os.path.getsize(real_localname))
|
||||
temp_localsize = int(temp_exists and os.path.getsize(temp_localname))
|
||||
|
||||
# Chapter 2: Ratelimiting
|
||||
if bytespersecond is None:
|
||||
limiter = None
|
||||
elif isinstance(bytespersecond, ratelimiter.Ratelimiter):
|
||||
limiter = bytespersecond
|
||||
else:
|
||||
limiter = ratelimiter.Ratelimiter(bytespersecond, period=1)
|
||||
limiter = ratelimiter.Ratelimiter(bytespersecond)
|
||||
|
||||
''' Prepare plan variables '''
|
||||
local_exists = os.path.exists(localname)
|
||||
if local_exists and overwrite is False:
|
||||
print('Overwrite off. Nothing to do.')
|
||||
return
|
||||
|
||||
user_provided_range = 'range' in headers
|
||||
# Chapter 3: Extracting range
|
||||
if user_provided_range:
|
||||
user_range_min = int(headers['range'].split('bytes=')[1].split('-')[0])
|
||||
user_range_max = headers['range'].split('-')[1]
|
||||
|
@ -150,71 +146,88 @@ def download_file(
|
|||
user_range_min = None
|
||||
user_range_max = None
|
||||
|
||||
# Chapter 4: Server range support
|
||||
# Always include a range on the first request to figure out whether the
|
||||
# server supports it. Use 0- so we get the right `remote_total_bytes`.
|
||||
# server supports it. Use 0- to get correct remote_total_bytes
|
||||
temp_headers = headers
|
||||
temp_headers.update({'range': 'bytes=0-'})
|
||||
|
||||
# I'm using a GET instead of an actual HEAD here because some servers respond
|
||||
# differently, even though they're not supposed to.
|
||||
head = request('get', url, stream=True, headers=temp_headers, auth=auth)
|
||||
remote_total_bytes = int(head.headers.get('content-length', 1))
|
||||
remote_total_bytes = int(head.headers.get('content-length', 0))
|
||||
server_respects_range = (head.status_code == 206 and 'content-range' in head.headers)
|
||||
head.connection.close()
|
||||
|
||||
touch(localname)
|
||||
file_handle = open(localname, 'r+b')
|
||||
file_handle.seek(0)
|
||||
if user_provided_range and not server_respects_range:
|
||||
raise Exception('Server did not respect your range header')
|
||||
|
||||
plan = determine_seek_and_range(
|
||||
file_handle=file_handle,
|
||||
localname=localname,
|
||||
local_exists=local_exists,
|
||||
overwrite=overwrite,
|
||||
remote_total_bytes=remote_total_bytes,
|
||||
server_respects_range=server_respects_range,
|
||||
user_provided_range=user_provided_range,
|
||||
user_range_min=user_range_min,
|
||||
user_range_max=user_range_max,
|
||||
# Chapter 5: Plan definitions
|
||||
plan_base = {
|
||||
'limiter': limiter,
|
||||
'real_localname': real_localname,
|
||||
'remote_total_bytes': remote_total_bytes,
|
||||
}
|
||||
plan_fulldownload = dict(
|
||||
plan_base,
|
||||
download_into=temp_localname,
|
||||
header_range_min=None,
|
||||
header_range_max=None,
|
||||
plan_type='fulldownload',
|
||||
seek_to=0,
|
||||
)
|
||||
plan_resume = dict(
|
||||
plan_base,
|
||||
download_into=temp_localname,
|
||||
header_range_min=temp_localsize,
|
||||
header_range_max='',
|
||||
plan_type='resume',
|
||||
seek_to=temp_localsize,
|
||||
)
|
||||
plan_partial = dict(
|
||||
plan_base,
|
||||
download_into=real_localname,
|
||||
header_range_min=user_range_min,
|
||||
header_range_max=user_range_max,
|
||||
plan_type='partial',
|
||||
seek_to=user_range_min,
|
||||
)
|
||||
if plan is None:
|
||||
return
|
||||
|
||||
(seek_to, header_range_min, header_range_max) = plan
|
||||
if header_range_min is not None:
|
||||
headers['range'] = 'bytes={0}-{1}'.format(header_range_min, header_range_max)
|
||||
# Chapter 6: Redeem your meal vouchers here
|
||||
if real_exists:
|
||||
if overwrite:
|
||||
os.remove(real_localname)
|
||||
|
||||
bytes_downloaded = seek_to
|
||||
file_handle.seek(seek_to)
|
||||
download_stream = request('get', url, stream=True, headers=headers, auth=auth)
|
||||
if user_provided_range:
|
||||
return plan_partial
|
||||
|
||||
''' Begin download '''
|
||||
for chunk in download_stream.iter_content(chunk_size=CHUNKSIZE):
|
||||
bytes_downloaded += len(chunk)
|
||||
file_handle.write(chunk)
|
||||
if callback_progress is not None:
|
||||
callback_progress(bytes_downloaded, remote_total_bytes)
|
||||
return plan_fulldownload
|
||||
|
||||
if limiter is not None and bytes_downloaded < remote_total_bytes:
|
||||
limiter.limit(len(chunk))
|
||||
elif temp_exists and temp_localsize > 0:
|
||||
if overwrite:
|
||||
return plan_fulldownload
|
||||
|
||||
file_handle.close()
|
||||
return localname
|
||||
if user_provided_range:
|
||||
return plan_partial
|
||||
|
||||
def filepath_sanitize(text, exclusions=''):
|
||||
bet = FILENAME_BADCHARS.replace(exclusions, '')
|
||||
for char in bet:
|
||||
text = text.replace(char, '')
|
||||
return text
|
||||
if server_respects_range:
|
||||
print('Resume from byte %d' % plan_resume['seek_to'])
|
||||
return plan_resume
|
||||
|
||||
else:
|
||||
if user_provided_range:
|
||||
return plan_partial
|
||||
|
||||
return plan_fulldownload
|
||||
|
||||
print('No plan was chosen?')
|
||||
return None
|
||||
|
||||
def get_permission(prompt='y/n\n>', affirmative=('y', 'yes')):
    '''
    Ask the user a yes/no question and return True if they agreed.

    prompt:
        Text passed to `input()`.

    affirmative:
        Collection of lowercase answers that count as "yes". The default is
        an immutable tuple so it cannot be altered between calls.

    The answer is compared case-insensitively, and surrounding whitespace is
    ignored so that an accidental trailing space does not read as a refusal.
    '''
    answer = input(prompt)
    return answer.strip().lower() in affirmative
|
||||
|
||||
def is_clipboard(s):
|
||||
return s.lower() in ['!c', '!clip', '!clipboard']
|
||||
|
||||
def progress(bytes_downloaded, bytes_total, prefix=''):
|
||||
def progress1(bytes_downloaded, bytes_total, prefix=''):
|
||||
divisor = bytestring.get_appropriate_divisor(bytes_total)
|
||||
bytes_total_string = bytestring.bytestring(bytes_total, force_unit=divisor)
|
||||
bytes_downloaded_string = bytestring.bytestring(bytes_downloaded, force_unit=divisor)
|
||||
|
@ -278,6 +291,16 @@ def request(method, url, stream=False, headers=None, timeout=TIMEOUT, **kwargs):
|
|||
req.raise_for_status()
|
||||
return req
|
||||
|
||||
def sanitize_filename(text, exclusions=''):
    '''
    Return `text` with every character of FILENAME_BADCHARS removed.

    exclusions:
        Characters that should be allowed through even though they appear in
        FILENAME_BADCHARS. Each character is considered individually, so the
        order and adjacency of the characters does not matter.
    '''
    # The exclusions are a set of characters, not a substring: removing them
    # with str.replace only worked when they happened to be contiguous and in
    # the same order as in FILENAME_BADCHARS.
    badchars = set(FILENAME_BADCHARS).difference(exclusions)
    return ''.join(char for char in text if char not in badchars)
|
||||
|
||||
def sanitize_url(url):
    '''
    Undo percent-encoding of the colon in the scheme separator, so that
    `http%3A//example.com` becomes `http://example.com`.

    Percent-encoded hex digits are case-insensitive, so both `%3A//` and
    `%3a//` are decoded. The rest of the URL is returned untouched.
    '''
    for encoded in ('%3A//', '%3a//'):
        url = url.replace(encoded, '://')
    return url
|
||||
|
||||
def touch(filename):
    '''
    Make sure `filename` exists by opening it in binary append mode and
    closing it again. Existing contents are left as they are.
    '''
    # The with-block guarantees the handle is closed even if something goes
    # wrong between opening and closing.
    with open(filename, 'ab'):
        pass
|
||||
|
@ -286,26 +309,14 @@ def touch(filename):
|
|||
|
||||
def download_argparse(args):
|
||||
url = args.url
|
||||
if is_clipboard(url):
|
||||
url = pyperclip.paste()
|
||||
print(url)
|
||||
|
||||
overwrite = {
|
||||
'y':True, 't':True,
|
||||
'n':False, 'f':False,
|
||||
}.get(args.overwrite.lower(), None)
|
||||
url = clipext.resolve(url)
|
||||
|
||||
callback = {
|
||||
None: progress,
|
||||
'1': progress,
|
||||
None: progress1,
|
||||
'1': progress1,
|
||||
'2': progress2,
|
||||
}.get(args.callback, None)
|
||||
|
||||
callback = args.callback
|
||||
if callback == '1':
|
||||
callback = progress
|
||||
if callback == '2':
|
||||
callback = progress2
|
||||
}.get(args.callback, args.callback)
|
||||
|
||||
bytespersecond = args.bytespersecond
|
||||
if bytespersecond is not None:
|
||||
|
@ -321,18 +332,19 @@ def download_argparse(args):
|
|||
bytespersecond=bytespersecond,
|
||||
callback_progress=callback,
|
||||
headers=headers,
|
||||
overwrite=overwrite,
|
||||
overwrite=args.overwrite,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser()
|
||||
|
||||
#p_download_file = subparsers.add_parser('download_file')
|
||||
parser.add_argument('url')
|
||||
parser.add_argument('localname', nargs='?', default=None)
|
||||
parser.add_argument('-c', '--callback', dest='callback', default=progress)
|
||||
parser.add_argument('-c', '--callback', dest='callback', default=progress1)
|
||||
parser.add_argument('-bps', '--bytespersecond', dest='bytespersecond', default=None)
|
||||
parser.add_argument('-ow', '--overwrite', dest='overwrite', default='')
|
||||
parser.add_argument('-ow', '--overwrite', dest='overwrite', action='store_true')
|
||||
parser.add_argument('-r', '--range', dest='range', default=None)
|
||||
parser.set_defaults(func=download_argparse)
|
||||
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
THINGS THAT CAN HAPPEN
|
||||
├───File exists
|
||||
│ ├───User disables overwrite
|
||||
│ │ └───Return because there's nothing to do
|
||||
│ │
|
||||
│ ├───User enables overwrite
|
||||
│ │ ├───User requests range
|
||||
│ │ │ └───Raise exception because requesting a range and forcing overwrite are mutually exclusive
|
||||
│ │ │
|
||||
│ │ └───User does not request range
|
||||
│ │ └───File opened, truncated, full download
|
||||
│ │
|
||||
│ └───User does not specify overwrite
|
||||
│ ├───File is same size as content-length
|
||||
│ │ └───Return because there's nothing to do.
|
||||
│ │
|
||||
│ ├───User requests range
|
||||
│ │ ├───Server respects range
|
||||
│ │ │ └───File opened, seeked to request, bytes filled in
|
||||
│ │ │
|
||||
│ │ └───Server does not respect range
|
||||
│ │ └───Raise exception because user's request can't be fulfilled
|
||||
│ │
|
||||
│ └───User does not request range
|
||||
│ ├───Server respects range
|
||||
│ │ └───File is opened, seeked to end, download resumes
|
||||
│ │
|
||||
│ └───Server does not respect range
|
||||
│ └───Ask for permission to overwrite from beginning
|
||||
│
|
||||
└───File does not exist
|
||||
├───User requests range
|
||||
│ ├───Server respects range
|
||||
│ │ └───File created, seeked to request, bytes filled in. everything else left 0
|
||||
│ └───Server does not respect range
|
||||
│ └───Raise exception because user's request can't be fulfilled
|
||||
│
|
||||
└───User does not request range
|
||||
└───File created, full download
|
||||
|
||||
Possible ambiguity: If the user requests a range, and the file does not exist, does he want:
|
||||
1. to fill the file with zeroes, and patch the requested bytes into their correct spot; or
|
||||
2. to create the file empty, and only write the requested bytes?
|
||||
|
||||
I will assume #1 because that plays nicely with other Things That Can Happen, such as letting the user patch the other bytes in later.
|
157
Instathief/instathief.py
Normal file
157
Instathief/instathief.py
Normal file
|
@ -0,0 +1,157 @@
|
|||
import argparse
|
||||
import bs4
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
import sys
|
||||
|
||||
sys.path.append('C:\\git\\else\\clipext'); import clipext
|
||||
sys.path.append('C:\\git\\else\\downloady'); import downloady
|
||||
|
||||
|
||||
''' '''
|
||||
STRFTIME = '%Y%m%d-%H%M%S'
|
||||
# strftime used for filenames when downloading
|
||||
|
||||
URL_PROFILE = 'https://www.instagram.com/{username}'
|
||||
URL_QUERY = 'https://www.instagram.com/query/'
|
||||
|
||||
PAGE_QUERY_TEMPLATE = '''
|
||||
ig_user({user_id})
|
||||
{{
|
||||
media.after({end_cur}, {count})
|
||||
{{
|
||||
count,
|
||||
nodes
|
||||
{{
|
||||
code,
|
||||
date,
|
||||
display_src,
|
||||
id,
|
||||
video_url
|
||||
}},
|
||||
page_info
|
||||
}}
|
||||
}}
|
||||
'''.replace('\n', '').replace(' ', '')
|
||||
|
||||
USERAGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
|
||||
''' '''
|
||||
|
||||
last_cookie = None
|
||||
|
||||
def download_media(media_list):
    '''
    Download every item of `media_list` with downloady.

    media_list:
        Iterable of dicts with a 'created' unix timestamp and a 'url', as
        yielded by `get_user_media`.

    Each file is named after its creation time in UTC, formatted with
    STRFTIME, followed by the extension taken from the url's basename.
    Downloads are started with overwrite=False.
    '''
    for media in media_list:
        # Timezone-aware replacement for the deprecated utcfromtimestamp;
        # the strftime output is the same.
        created = datetime.datetime.fromtimestamp(media['created'], datetime.timezone.utc)
        timestamp = created.strftime(STRFTIME)

        basename = downloady.basename_from_url(media['url'])
        extension = os.path.splitext(basename)[1]

        filename = timestamp + extension
        downloady.download_file(
            url=media['url'],
            localname=filename,
            callback_progress=downloady.progress2,
            overwrite=False,
        )
|
||||
|
||||
def get_page(user_id, end_cur, count, cookies):
    '''
    POST a media query for `user_id` to URL_QUERY and return the decoded
    JSON response.

    The query text is PAGE_QUERY_TEMPLATE filled in with `user_id`,
    `end_cur` and `count`. `cookies` is sent with the request, and its
    'csrftoken' entry is also sent as the x-csrftoken header.

    Raises whatever `raise_for_status` raises for an unsuccessful response.
    '''
    query_text = PAGE_QUERY_TEMPLATE.format(
        user_id=user_id,
        end_cur=end_cur,
        count=count,
    )
    request_headers = {
        'referer': 'https://www.instagram.com/',
        'user-agent': USERAGENT,
        'x-csrftoken': cookies['csrftoken'],
    }
    form = {
        'q': query_text,
        'ref': 'users::show',
    }

    response = requests.post(
        url=URL_QUERY,
        cookies=cookies,
        data=form,
        headers=request_headers,
    )
    response.raise_for_status()
    return response.json()
|
||||
|
||||
def get_user_info(username):
    '''
    Fetch the profile page of `username` and pull the paging information out
    of the `window._sharedData` javascript embedded in it.

    Returns a dict with:
        'user_id': the 'id' of the user from the shared data.
        'end_cur': the cursor to start paging from, or None if the profile
            reports no next page.
        'cookies': the cookies of the profile response.

    The response cookies are also stored in the module-level `last_cookie`.

    Raises Exception if no script tag contains `window._sharedData`.
    '''
    global last_cookie
    url = URL_PROFILE.format(username=username)
    response = requests.get(url)
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.text, 'html.parser')

    for script in soup.find_all('script'):
        if 'window._sharedData' in script.text:
            break
    else:
        raise Exception('Did not find expected javascript')

    shared_data = script.text.split('window._sharedData = ')[1]
    # Strip whitespace BEFORE the semicolon. In the other order, any trailing
    # whitespace shields the semicolon and json.loads chokes on it.
    shared_data = shared_data.strip().rstrip(';')
    shared_data = json.loads(shared_data)
    user_data = shared_data['entry_data']['ProfilePage'][0]['user']

    user_id = user_data['id']
    page_info = user_data['media']['page_info']
    if page_info['has_next_page']:
        # Minus 1 because the queries use "after" parameters for pagination,
        # and if we just take this cursor then we will only get items after it.
        end_cur = int(page_info['start_cursor']) - 1
    else:
        end_cur = None

    last_cookie = response.cookies
    return {
        'user_id': user_id,
        'end_cur': end_cur,
        'cookies': response.cookies,
    }
|
||||
|
||||
def get_user_media(username):
    '''
    Generate a dict for every post of `username`, fetching pages of 50 posts
    with `get_page` until a page reports that there is no next page.

    Each yielded dict has:
        'created': the post's 'date' value.
        'url': the post's 'video_url' if it has one, otherwise its
            'display_src'.
    '''
    user_info = get_user_info(username)
    cursor = user_info.pop('end_cur')

    has_more = True
    while has_more:
        media = get_page(count=50, end_cur=cursor, **user_info)['media']

        for post in media['nodes']:
            yield {
                'created': post['date'],
                'url': post.get('video_url') or post.get('display_src'),
            }

        page_info = media['page_info']
        has_more = page_info['has_next_page']
        if has_more:
            cursor = page_info['end_cursor']
|
||||
|
||||
|
||||
def main():
    '''
    Print the url of every media item posted by the user named in the first
    commandline argument.
    '''
    username = sys.argv[1]
    # get_user_media yields dicts. Tuple-unpacking a dict produces its keys,
    # which printed the literal word "url" instead of the address.
    for media in get_user_media(username):
        print(media['url'])


if __name__ == '__main__':
    main()
|
|
@ -138,6 +138,7 @@ function create_odi_div(url)
|
|||
|
||||
if (paramless_url.match(IMAGE_TYPES))
|
||||
{
|
||||
console.log("Creating image div for " + paramless_url);
|
||||
var div = document.createElement("div");
|
||||
div.id = generate_id(32);
|
||||
div.className = "odi_image_div";
|
||||
|
@ -194,6 +195,7 @@ function create_odi_div(url)
|
|||
{
|
||||
return null;
|
||||
}
|
||||
console.log("Creating " + mediatype + " div for " + paramless_url);
|
||||
|
||||
var div = document.createElement("div");
|
||||
div.id = generate_id(32);
|
||||
|
@ -253,6 +255,7 @@ function create_odi_div(url)
|
|||
}
|
||||
function create_odi_divs(urls)
|
||||
{
|
||||
console.log("Creating odi divs");
|
||||
image_divs = [];
|
||||
media_divs = [];
|
||||
odi_divs = [];
|
||||
|
@ -332,6 +335,7 @@ function create_workspace()
|
|||
control_panel.appendChild(ingest_div);
|
||||
control_panel.appendChild(start_button);
|
||||
document.body.appendChild(workspace);
|
||||
console.log("finished workspace");
|
||||
}
|
||||
|
||||
function delete_odi_div(element)
|
||||
|
@ -430,6 +434,7 @@ function filter_re(pattern, do_delete)
|
|||
|
||||
function format_css()
|
||||
{
|
||||
console.log("Formatting CSS variables");
|
||||
var css = CSS;
|
||||
while (true)
|
||||
{
|
||||
|
@ -438,22 +443,24 @@ function format_css()
|
|||
{
|
||||
break;
|
||||
}
|
||||
|
||||
matches = Array.from(new Set(matches));
|
||||
for (var index = 0; index < matches.length; index += 1)
|
||||
console.log(matches);
|
||||
matches = new Set(matches);
|
||||
/* Originally used Array.from(set) and did regular iteration, but I found
|
||||
that sites can override and break that conversion. */
|
||||
matches.forEach(
|
||||
function(injector)
|
||||
{
|
||||
var injector = matches[index];
|
||||
var injected = injector.replace(new RegExp("\\$", 'g'), "");
|
||||
/*console.log(injector);*/
|
||||
/*console.log(injected);*/
|
||||
css = css.replace(injector, this[injected]);
|
||||
}
|
||||
);
|
||||
}
|
||||
return css;
|
||||
}
|
||||
|
||||
function get_all_urls()
|
||||
{
|
||||
console.log("Collecting urls");
|
||||
var urls = [];
|
||||
function include(source, attr)
|
||||
{
|
||||
|
@ -529,6 +536,7 @@ function get_basename(url)
|
|||
|
||||
function get_gfycat_video(id)
|
||||
{
|
||||
console.log("Resolving gfycat " + id);
|
||||
var url = "https://gfycat.com/cajax/get/" + id;
|
||||
var request = new XMLHttpRequest();
|
||||
request.answer = null;
|
||||
|
@ -595,6 +603,7 @@ function generate_id(length)
|
|||
function ingest()
|
||||
{
|
||||
/* Take the text from the INGEST box, and make odi divs from it */
|
||||
console.log("Ingesting");
|
||||
var odi_divs = get_odi_divs();
|
||||
var ingestbox = document.getElementById("ingestbox");
|
||||
var text = ingestbox.value;
|
||||
|
@ -622,6 +631,7 @@ function ingest()
|
|||
|
||||
function lazy_load_all()
|
||||
{
|
||||
console.log("Starting lazyload");
|
||||
lazies = get_lazy_divs();
|
||||
lazies.reverse();
|
||||
lazy_buttons = document.getElementsByClassName("load_button");
|
||||
|
|
|
@ -91,8 +91,8 @@ function swap_source(player, source_list)
|
|||
|
||||
function main()
|
||||
{
|
||||
var WIDTH = 3;
|
||||
var HEIGHT = 3;
|
||||
var WIDTH = 2;
|
||||
var HEIGHT = 2;
|
||||
var MEDIAS = get_media_links();
|
||||
|
||||
clear_page();
|
||||
|
|
|
@ -1,49 +1,52 @@
|
|||
Open Dir DL
|
||||
===========
|
||||
|
||||
The open directory downloader
|
||||
The open directory downloader.
|
||||
|
||||
Requires `pip install beautifulsoup4`
|
||||
Requires `pip install beautifulsoup4`.
|
||||
|
||||
See inside opendirdl.py for usage instructions.
|
||||
|
||||
- 2016 08 16
|
||||
- **[cleanup]** Now that Downloady uses temp files for incomplete downloads, that logic can be removed from opendirdl.
|
||||
|
||||
- 2016 08 10
|
||||
- Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception.
|
||||
- Fixed bug in smart_insert where 404'd URLs were not being deleted from the database.
|
||||
- Added clickable links to each directory on HTML tree pages.
|
||||
- **[addition]** Added clickable links to each directory on HTML tree pages.
|
||||
- **[bugfix]** Fixed bug in smart_insert caused by 404's being considered falsey, triggering the 'one and only one' exception.
|
||||
- **[bugfix]** Fixed bug in smart_insert where 404'd URLs were not being deleted from the database.
|
||||
|
||||
- 2016 08 02
|
||||
- Removed the usage of div IDs on the Tree pages by making the collapse button use `this.nextSibling`.
|
||||
- Rewrote `build_file_tree` with a way simpler algorithm.
|
||||
- Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway.
|
||||
- **[cleanup]** Removed the need for div IDs on the Tree pages by making the collapse button use `this.nextSibling`.
|
||||
- **[cleanup]** Rewrote `build_file_tree` with a way simpler algorithm.
|
||||
- **[removal]** Removed the ability to set a Node's parent during `__init__` because it wasn't fully fleshed out and doesn't need to be used anyway.
|
||||
|
||||
- 2016 08 01
|
||||
- Made the digest work even if you forget the http://
|
||||
- **[addition]** Made the digest work even if you forget the http://
|
||||
|
||||
- 2016 07 29
|
||||
- Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
|
||||
- Renamed `Tree.listnodes` to `Tree.list_children` and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
|
||||
- Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
|
||||
- Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
|
||||
- Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
|
||||
- **[change]** Moved some nested function definitions out to the top level, and made the construction of the file tree its own function. These functions really don't need to be used on their own, but they were cluttering the logic of the `tree` command.
|
||||
- **[change]** Renamed `Tree.listnodes` to `Tree.list_children`, and the `customsort` now expects to operate on Node objects rather than `(identifier, Node)` tuples. Nodes already have their identifier so the tuple was unnecessary.
|
||||
- **[change]** Replaced local `download_file` function with a call to `downloady.download_file`. It supports download continuation and removes duplicate work.
|
||||
- **[cleanup]** Replaced all `safeprint` calls with `write` because it provides access to safeprint as well as file writing if needed.
|
||||
- **[removal]** Removed `Tree.sorted_children` since it was basically a duplicate of `Tree.listnodes` and I don't know why I had both.
|
||||
|
||||
- 2016 07 25
|
||||
- Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
|
||||
- Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
|
||||
- **[change]** Bytespersecond is now parsed by `bytestring.parsebytes` rather than `eval`, so you can write "100k" as opposed to "100 * 1024" etc.
|
||||
- **[removal]** Removed the `Downloader` class after watching [this Jack Diederich talk](https://youtu.be/o9pEzgHorH0) about unnecessary classes.
|
||||
|
||||
- 2016 07 19
|
||||
- Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes
|
||||
- Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!*
|
||||
- Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception.
|
||||
- Fixed some mismatched code vs comments
|
||||
- Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck.
|
||||
- **[addition]** Gave the HTML tree divs a very gentle shadow and alternating colors to help with depth perception.
|
||||
- **[bugfix]** Fixed the allowed characters parameter of `filepath_sanitize`, which was not written correctly but worked out of luck.
|
||||
- **[cleanup]** Rearranged the big blocks to be in a logical order rather than alphabetical order. Walker > Downloader > other classes
|
||||
- **[cleanup]** Renamed the `keep_pattern` and `remove_pattern` functions to `keep_pattern_argparse` etc to be consistent with the other functions used by the argparser. *Does not affect the commandline usage!*
|
||||
- **[cleanup]** Fixed some mismatched code vs comments
|
||||
|
||||
- 2016 07 08
|
||||
- Fixed bug in which trees wouldn't generate on server:port urls.
|
||||
- **[bugfix]** Fixed bug in which trees wouldn't generate on server:port urls.
|
||||
|
||||
- 2016 07 04
|
||||
- Added new argparse command "tree"
|
||||
- **[addition]** Added new argparse command "tree"
|
||||
|
||||
- 2016 02 08
|
||||
- Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
|
||||
- Moved db commits to only happen at the end of a digest.
|
||||
- **[bugfix]** Fixed bug where server:port urls did not create db files because of the colon. It's been replaced by a hash.
|
||||
- **[change]** Moved db commits to only happen at the end of a digest.
|
||||
|
|
|
@ -614,7 +614,7 @@ def fetch_generator(cur):
|
|||
|
||||
def filepath_sanitize(text, allowed=''):
    '''
    Return `text` with every character found in FILENAME_BADCHARS removed.

    text:
        The string to clean.

    allowed:
        Characters that should be kept even though they appear in
        FILENAME_BADCHARS.
    '''
    # Build the set of forbidden characters once, so the membership test
    # below is a constant-time lookup for each character of `text`.
    badchars = {char for char in FILENAME_BADCHARS if char not in allowed}
    return ''.join(char for char in text if char not in badchars)
|
||||
|
||||
|
@ -886,32 +886,16 @@ def download(
|
|||
folder = os.path.join(outputdir, url_filepath['folder'])
|
||||
os.makedirs(folder, exist_ok=True)
|
||||
|
||||
final_fullname = os.path.join(folder, url_filepath['filename'])
|
||||
temporary_basename = hashit(url, 16) + '.oddltemporary'
|
||||
temporary_fullname = os.path.join(folder, temporary_basename)
|
||||
fullname = os.path.join(folder, url_filepath['filename'])
|
||||
|
||||
# Because we use .oddltemporary files, the behavior of `overwrite` here
|
||||
# is different than the behavior of `overwrite` in downloady.
|
||||
# The overwrite used in the following block refers to the finalized file.
|
||||
# The overwrite passed to downloady refers to the oddltemporary which
|
||||
# may be resumed.
|
||||
if os.path.isfile(final_fullname):
|
||||
if overwrite:
|
||||
os.remove(final_fullname)
|
||||
else:
|
||||
write('Skipping "%s". Use `--overwrite`' % final_fullname)
|
||||
continue
|
||||
|
||||
overwrite = overwrite or None
|
||||
write('Downloading "%s" as "%s"' % (final_fullname, temporary_basename))
|
||||
write('Downloading "%s"' % fullname)
|
||||
downloady.download_file(
|
||||
url,
|
||||
localname=temporary_fullname,
|
||||
localname=fullname,
|
||||
bytespersecond=bytespersecond,
|
||||
callback_progress=downloady.progress2,
|
||||
overwrite=overwrite
|
||||
)
|
||||
os.rename(temporary_fullname, final_fullname)
|
||||
|
||||
def download_argparse(args):
|
||||
return download(
|
||||
|
|
|
@ -7,7 +7,6 @@ class Path:
|
|||
def __init__(self, path):
|
||||
path = os.path.normpath(path)
|
||||
path = os.path.abspath(path)
|
||||
path = get_path_casing(path)
|
||||
self.absolute_path = path
|
||||
|
||||
def __contains__(self, other):
|
||||
|
@ -23,6 +22,10 @@ class Path:
|
|||
def basename(self):
|
||||
return os.path.basename(self.absolute_path)
|
||||
|
||||
def correct_case(self):
|
||||
self.absolute_path = get_path_casing(self.absolute_path)
|
||||
return self.absolute_path
|
||||
|
||||
@property
|
||||
def exists(self):
|
||||
return os.path.exists(self.absolute_path)
|
||||
|
|
|
@ -14,6 +14,7 @@ sys.path.append('C:\\git\\else\\Ratelimiter'); import ratelimiter
|
|||
sys.path.append('C:\\git\\else\\SpinalTap'); import spinal
|
||||
|
||||
FILE_READ_CHUNK = bytestring.MIBIBYTE
|
||||
RATELIMITER = ratelimiter.Ratelimiter(16 * bytestring.MIBIBYTE)
|
||||
|
||||
# The paths which the user may access.
|
||||
# Attempting to access anything outside will 403.
|
||||
|
@ -98,6 +99,7 @@ class RequestHandler(http.server.BaseHTTPRequestHandler):
|
|||
if isinstance(data, types.GeneratorType):
|
||||
for chunk in data:
|
||||
self.wfile.write(chunk)
|
||||
RATELIMITER.limit(len(chunk))
|
||||
else:
|
||||
self.wfile.write(data)
|
||||
|
||||
|
|
|
@ -350,16 +350,17 @@ def copy_file(
|
|||
|
||||
source = str_to_fp(source)
|
||||
|
||||
if not source.is_file:
|
||||
raise SourceNotFile(source)
|
||||
|
||||
if destination_new_root is not None:
|
||||
source.correct_case()
|
||||
destination = new_root(source, destination_new_root)
|
||||
destination = str_to_fp(destination)
|
||||
|
||||
callback = callback or do_nothing
|
||||
callback_verbose = callback_verbose or do_nothing
|
||||
|
||||
if not source.is_file:
|
||||
raise SourceNotFile(source)
|
||||
|
||||
if destination.is_dir:
|
||||
raise DestinationIsDirectory(destination)
|
||||
|
||||
|
|
65
ThreadedDL/threaded_dl.py
Normal file
65
ThreadedDL/threaded_dl.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
import os
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
sys.path.append('C:\\git\\else\\clipext'); import clipext
|
||||
sys.path.append('C:\\git\\else\\downloady'); import downloady
|
||||
|
||||
def remove_finished(threads):
    '''
    Return a new list containing only the threads whose `is_alive()` is
    true. The input list is not modified.
    '''
    return [thread for thread in threads if thread.is_alive()]
|
||||
|
||||
def download_thread(url, filename_prefix=''):
    '''
    Download a single url into the current working directory.

    The local name is `filename_prefix` followed by the basename that
    `downloady.basename_from_url` derives from the url. Blank urls are
    ignored, and nothing is downloaded if that local name already exists.
    '''
    cleaned = url.strip()
    if not cleaned:
        return

    target = filename_prefix + downloady.basename_from_url(cleaned)
    if os.path.exists(target):
        print('Skipping existing file "%s"' % target)
        return

    print('Starting "%s"' % target)
    downloady.download_file(cleaned, target)
    print('Finished "%s"' % target)
|
||||
|
||||
def listget(li, index, fallback):
    '''
    Return `li[index]`, or `fallback` if that lookup raises IndexError.
    '''
    try:
        item = li[index]
    except IndexError:
        return fallback
    return item
|
||||
|
||||
def threaded_dl(urls, thread_count=4):
|
||||
threads = []
|
||||
prefix_digits = len(str(len(urls)))
|
||||
prefix_text = '%0{digits}d_'.format(digits=prefix_digits)
|
||||
for (index, url) in enumerate(urls):
|
||||
while len(threads) == thread_count:
|
||||
threads = remove_finished(threads)
|
||||
time.sleep(0.1)
|
||||
|
||||
prefix = prefix_text % index
|
||||
t = threading.Thread(target=download_thread, args=[url, prefix])
|
||||
t.daemon = True
|
||||
threads.append(t)
|
||||
t.start()
|
||||
|
||||
while len(threads) > 0:
|
||||
threads = remove_finished(threads)
|
||||
time.sleep(0.1)
|
||||
|
||||
def main():
    '''
    Commandline entry point.

    The first commandline argument is either the path of a text file
    containing whitespace-separated urls, or a string which is passed
    through `clipext.resolve` (so "!c" reads the clipboard and "!i" reads
    stdin) and then split on whitespace.

    Exits with status 1 and a usage message if no argument was given.
    '''
    if len(sys.argv) < 2:
        print('Usage: threaded_dl.py <url_file | urls | !c | !i>', file=sys.stderr)
        raise SystemExit(1)

    source = sys.argv[1]
    if os.path.isfile(source):
        with open(source, 'r') as handle:
            text = handle.read()
    else:
        text = clipext.resolve(source)

    threaded_dl(text.split())


if __name__ == '__main__':
    main()
|
Loading…
Reference in a new issue