# else/TotalDL/totaldl.py

import json
import os
import sys
import time

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'}

DOWNLOAD_DIRECTORY = ''
# Save files to this folder.
# If blank, use the local folder.

IMGUR_ALBUM_INDV = 'Viewfullresolution<'
IMGUR_ALBUM_INDV2 = 'linkrel="image_src"'
# The HTML strings which tell us that an image link is on this line.

IMGUR_ALBUMFOLDERS = True
# If True, the individual images belonging to an album will be placed
# into a folder named after the album, like <album_id>/<img_id>.jpg
# Else, files will be named <album_id>_<img_id>.jpg and placed
# in the local folder.

GFYCAT_MP4 = True
# If True, download gfycat urls in .mp4
# Else, .webm

GFYCAT_SUBDOMAINS = ['zippy', 'fat', 'giant']

SLEEPINESS = 2
# The number of seconds to wait in between making requests.
# Similar to PRAW's ratelimit handling.
# Not required, but probably better for the environment.

VIMEO_DICT_START = '"files":{"h264":'
VIMEO_DICT_END = ',"hls"'
# The HTML strings which mark where the mp4 file info is.

VIMEO_PRIORITY = ['hd', 'sd', 'mobile']
# Download files in this priority.

LIVELEAK_YOUTUBEIFRAME = 'youtube.com/embed'
LIVELEAK_RESOLUTIONS = ['h264_base', 'h264_720p', 'h264_270p']

YOUTUBE_DL_FORMAT = 'youtube-dl "{url}" --no-playlist --force-ipv4 -o "/{dir}/{name}.%(ext)s"'
# The format for the youtube-dl shell command.
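# For example (a sketch, not output captured from this script), handle_youtube
# with customname='youtube' and DOWNLOAD_DIRECTORY = '' expands the command to:
#   youtube-dl "https://youtu.be/GjOBTstnW20" --no-playlist --force-ipv4 -o "//youtube.%(ext)s"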

DO_GENERIC = True
# If True, attempt to download whatever URL goes in
# Else, only download from the explicitly supported sites

''' End user config '''

last_request = 0

if DOWNLOAD_DIRECTORY != '':
    if DOWNLOAD_DIRECTORY[-1] not in ['/', '\\']:
        DOWNLOAD_DIRECTORY += '\\'
    if not os.path.exists(DOWNLOAD_DIRECTORY):
        os.makedirs(DOWNLOAD_DIRECTORY)
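# For example, with a hypothetical DOWNLOAD_DIRECTORY = 'C:\\downloads', the
# block above normalizes it to 'C:\\downloads\\' and creates the folder if it
# does not exist yet.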

class StatusExc(Exception):
    pass


def download_file(url, localname):
    localname = DOWNLOAD_DIRECTORY + localname
    if 'twimg' in url:
        # Strip twitter image size suffixes from the filename.
        localname = localname.replace(':large', '')
        localname = localname.replace(':small', '')
    if os.path.exists(localname):
        print('\t%s already exists!!' % localname)
        return localname
    print('\tDownloading %s' % localname)
    downloading = request_get(url, stream=True)
    with open(localname, 'wb') as localfile:
        for chunk in downloading.iter_content(chunk_size=1024):
            if chunk:
                localfile.write(chunk)
    return localname


def request_get(url, stream=False):
    # Space requests at least SLEEPINESS seconds apart.
    global last_request
    now = time.time()
    diff = now - last_request
    if diff < SLEEPINESS:
        diff = SLEEPINESS - diff
        time.sleep(diff)
    last_request = time.time()
    req = requests.get(url, stream=stream, headers=HEADERS)
    if req.status_code != 200:
        raise StatusExc("Status code %d on url %s" % (req.status_code, url))
    return req

##############################################################################
##

def handle_imgur_html(url):
    pagedata = request_get(url)
    pagedata = pagedata.text.replace(' ', '')
    pagedata = pagedata.replace('src="', 'href="')
    pagedata = pagedata.replace(IMGUR_ALBUM_INDV2, IMGUR_ALBUM_INDV)
    pagedata = pagedata.split('\n')
    pagedata = [line.strip() for line in pagedata]
    pagedata = [line for line in pagedata if IMGUR_ALBUM_INDV in line]
    pagedata = [line.split('href=')[1] for line in pagedata]
    pagedata = [line.replace('"//', '"http://') for line in pagedata]
    pagedata = [line.split('"')[1] for line in pagedata]

    links = []
    first = pagedata[0].split('.')[0]
    if [x.split('.')[0] for x in pagedata].count(first) > 1:
        # The first link is a duplicate of one of the album images; skip it.
        pagedata = pagedata[1:]
    for image in pagedata:
        image = image.split('?')[0]
        if image not in links:
            links.append(image)
    return links

def handle_imgur(url, albumid='', customname=None):
    name = url.split('/')[-1]
    if 'imgur.com' in name:
        # This link doesn't appear to have an image id
        return
    url = url.replace('/gallery/', '/a/')
    basename = name.split('.')[0]
    if '.' in name:
        # This is a direct image link
        if customname:
            # replace the imgur ID with the customname, keep ext.
            name = '%s.%s' % (customname, name.split('.')[-1])
        if albumid and albumid != basename:
            if IMGUR_ALBUMFOLDERS:
                if not os.path.exists(DOWNLOAD_DIRECTORY + albumid):
                    os.makedirs(DOWNLOAD_DIRECTORY + albumid)
                localpath = '%s\\%s' % (albumid, name)
            else:
                localpath = '%s_%s' % (albumid, name)
        else:
            localpath = name
        return download_file(url, localpath)
    else:
        # Not a direct image link, let's read the html.
        images = handle_imgur_html(url)
        if customname:
            name = customname
        print('\tFound %d images' % len(images))
        localfiles = []
        if len(images) > 1:
            for imagei in range(len(images)):
                image = images[imagei]
                iname = image.split('/')[-1]
                iname = iname.split('.')[0]
                x = handle_imgur(image, albumid=name, customname='%d_%s' % (imagei, iname))
                localfiles.append(x)
        else:
            x = handle_imgur(images[0], customname=name)
            localfiles.append(x)
        return localfiles

def handle_gfycat(url, customname=None):
    name = url.split('/')[-1]
    name = name.split('.')[0]
    if customname:
        filename = customname
    else:
        filename = name
    if GFYCAT_MP4:
        name += '.mp4'
        filename += '.mp4'
    else:
        name += '.webm'
        filename += '.webm'
    for subdomain in GFYCAT_SUBDOMAINS:
        url = 'http://%s.gfycat.com/%s' % (subdomain, name)
        try:
            return download_file(url, filename)
        except StatusExc:
            pass

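# For example (assuming the clip still exists),
# handle_gfycat('http://www.gfycat.com/RawWetFlatcoatretriever') first tries
# 'http://zippy.gfycat.com/RawWetFlatcoatretriever.mp4', then the 'fat' and
# 'giant' subdomains, until one of them answers with a 200.
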
def handle_vimeo(url, customname=None):
    name = url.split('/')[-1]
    name = name.split('?')[0]
    try:
        int(name)
    except ValueError as e:
        print('Could not identify filename of %s' % url)
        raise e
    url = 'http://player.vimeo.com/video/%s' % name
    pagedata = request_get(url)
    pagedata = pagedata.text
    # Mangle the closing tags so that splitting on '<script>' puts each
    # script body in its own chunk.
    pagedata = pagedata.replace('</script>', '<script')
    pagedata = pagedata.split('<script>')
    for chunk in pagedata:
        if VIMEO_DICT_START in chunk:
            break
    chunk = chunk.split(VIMEO_DICT_START)[1]
    chunk = chunk.split(VIMEO_DICT_END)[0]
    chunk = json.loads(chunk)
    for priority in VIMEO_PRIORITY:
        if priority in chunk:
            fileurl = chunk[priority]['url']
            break
    if customname:
        filename = customname + '.mp4'
    else:
        filename = name + '.mp4'
    return download_file(fileurl, filename)

def handle_liveleak(url, customname=None):
    if customname:
        name = customname
    else:
        name = url.split('=')[1]
    name += '.mp4'
    pagedata = request_get(url)
    pagedata = pagedata.text
    if LIVELEAK_YOUTUBEIFRAME in pagedata:
        pagedata = pagedata.split('\n')
        pagedata = [line for line in pagedata if LIVELEAK_YOUTUBEIFRAME in line]
        pagedata = pagedata[0]
        pagedata = pagedata.split('src="')[1]
        pagedata = pagedata.split('"')[0]
        print('\tFound youtube embed')
        handle_master(pagedata, customname=customname)
    else:
        pagedata = pagedata.split('file: "')[1]
        pagedata = pagedata.split('",')[0]
        original = pagedata
        # Swap the h264_* resolution token in the file url so that each
        # entry of LIVELEAK_RESOLUTIONS can be tried in order.
        pagedata = pagedata.split('.')
        for spoti in range(len(pagedata)):
            if 'h264_' in pagedata[spoti]:
                pagedata[spoti] = 'LIVELEAKRESOLUTION'
        pagedata = '.'.join(pagedata)
        for res in LIVELEAK_RESOLUTIONS:
            url = pagedata.replace('LIVELEAKRESOLUTION', res)
            try:
                return download_file(url, name)
            except StatusExc:
                pass
        # Fall back to whatever resolution the page originally served.
        return download_file(original, name)

def handle_youtube(url, customname=None):
    url = url.replace('&amp;', '&')
    url = url.replace('feature=player_embedded&', '')
    url = url.replace('&feature=player_embedded', '')
    if not customname:
        os.system(YOUTUBE_DL_FORMAT.format(url=url, dir=DOWNLOAD_DIRECTORY, name='%(title)s'))
        return
    os.system(YOUTUBE_DL_FORMAT.format(url=url, dir=DOWNLOAD_DIRECTORY, name=customname))
    # Best guess at the output path; assumes youtube-dl saved an .mp4.
    if DOWNLOAD_DIRECTORY:
        return '%s/%s.mp4' % (DOWNLOAD_DIRECTORY, customname)
    return '%s.mp4' % customname

def handle_twitter(url, customname=None):
    pagedata = request_get(url)
    pagedata = pagedata.text
    idnumber = url.split('status/')[1].split('/')[0]
    if customname:
        name = customname
    else:
        name = idnumber
        customname = idnumber
    tweetpath = '%s.html' % (DOWNLOAD_DIRECTORY + name)
    psplit = '<p class="TweetTextSize'
    tweettext = pagedata.split(psplit)[1]
    tweettext = tweettext.split('</p>')[0]
    tweettext = psplit + tweettext + '</p>'
    tweettext = '<html><body>%s</body></html>' % tweettext
    tweettext = tweettext.replace('/hashtag/', 'http://twitter.com/hashtag/')
    tweethtml = open(tweetpath, 'w', encoding='utf-8')
    tweethtml.write(tweettext)
    tweethtml.close()
    print('\tSaved tweet text')
    try:
        link = pagedata.split('data-url="')[1]
        link = link.split('"')[0]
        if link != url:
            handle_master(link, customname=customname)
        return tweetpath
    except IndexError:
        try:
            link = pagedata.split('data-expanded-url="')[1]
            link = link.split('"')[0]
            if link != url:
                handle_master(link, customname=customname)
            return tweetpath
        except IndexError:
            pass
    print('\tNo media detected')
    return tweetpath

def handle_generic(url, customname=None):
    try:
        name = url.split('/')[-1]
        ext = name.split('.')[-1]
        if ext == name:
            # The URL has no extension, so save the page as .html
            ext = 'html'
            name = '%s.%s' % (name, ext)
        if customname:
            name = '%s.%s' % (customname, ext)
        return download_file(url, name)
    except Exception:
        pass

##
##############################################################################

HANDLERS = {
    'imgur.com': handle_imgur,
    'gfycat.com': handle_gfycat,
    'vimeo.com': handle_vimeo,
    'liveleak.com': handle_liveleak,
    'youtube.com': handle_youtube,
    'youtu.be': handle_youtube,
    'twitter.com': handle_twitter
    }

def handle_master(url, customname=None):
    print('Handling %s' % url)
    for handlerkey in HANDLERS:
        if handlerkey.lower() in url.lower():
            return HANDLERS[handlerkey](url, customname=customname)
    if DO_GENERIC:
        return handle_generic(url, customname=customname)

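# Example usage (a sketch; the exact return value depends on the handler):
#   path = handle_master('https://vimeo.com/109405701', customname='myvideo')
# downloads the video through handle_vimeo and returns the local filename,
# while an imgur album returns a list of filenames and handle_youtube shells
# out to youtube-dl.
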
def test_imgur():
    # Imgur gallery album
    handle_master('http://imgur.com/gallery/s4WLG')

    # Imgur standard album with customname
    handle_master('http://imgur.com/a/s4WLG', customname='album')

    # Imgur indirect
    handle_master('http://imgur.com/gvJUct0')

    # Imgur indirect single with customname
    handle_master('http://imgur.com/gvJUct0', customname='indirect')

    # Imgur direct single
    handle_master('http://i.imgur.com/gvJUct0.jpg')

def test_gfycat():
    # Gfycat direct .gif
    handle_master('http://giant.gfycat.com/FatherlyBruisedIberianchiffchaff.gif')

    # Gfycat general link
    handle_master('http://www.gfycat.com/RawWetFlatcoatretriever')

    # Gfycat general link with customname
    handle_master('http://www.gfycat.com/RawWetFlatcoatretriever', customname='gfycatgeneral')

def test_vimeo():
    # Vimeo standard link
    handle_master('https://vimeo.com/109405701')

    # Vimeo player link with customname
    handle_master('https://player.vimeo.com/video/109405701', customname='vimeoplayer')

def test_liveleak():
    # LiveLeak standard link
    handle_master('http://www.liveleak.com/view?i=9d1_1429192014')

    # LiveLeak article with youtube embed
    handle_master('http://www.liveleak.com/view?i=ab8_1367941301')

    # LiveLeak standard link with customname
    handle_master('http://www.liveleak.com/view?i=9d1_1429192014', customname='liveleak')

def test_youtube():
    # Youtube standard link
    handle_master('https://www.youtube.com/watch?v=bEgeh5hA5ko')

    # Youtube short link with customname
    handle_master('https://youtu.be/GjOBTstnW20', customname='youtube')

    # Youtube player embed link
    handle_master('https://www.youtube.com/watch?feature=player_embedded&amp;v=bEgeh5hA5ko')

def test_twitter():
    # Twitter with twitter-image embed
    handle_master('https://twitter.com/PetoLucem/status/599493836214272000')

    # Twitter with twitter-image embed
    handle_master('https://twitter.com/Jalopnik/status/598287843128188929')

    # Twitter with twitter-image embed and customname
    handle_master('https://twitter.com/Jalopnik/status/598287843128188929', customname='twits')

    # Twitter with youtube embed
    handle_master('https://twitter.com/cp_orange_x3/status/599705117420457984')

    # Twitter plain text
    handle_master('https://twitter.com/cp_orange_x3/status/599700702382817280')

    # Twitter plain text
    handle_master('https://twitter.com/SyriacMFS/status/556513635913437184')

    # Twitter with arabic characters
    handle_master('https://twitter.com/HadiAlabdallah/status/600885154991706113')

def test_generic():
    # Some link that might work
    handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt')

    # Some link that might work with customname
    handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt', customname='sss')

    # Some link that might work
    handle_master('https://github.com/voussoir/reddit/tree/master/SubredditBirthdays/show')

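# Command-line usage (a sketch; assumes this file is saved as totaldl.py):
#   python totaldl.py <url>
# With no argument, the uncommented test calls in the block below run instead.
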
if __name__ == '__main__':
    if len(sys.argv) > 1:
        handle_master(sys.argv[1])
    else:
        #test_imgur()
        #test_gfycat()
        #test_vimeo()
        test_liveleak()
        test_youtube()
        #test_twitter()
        #test_generic()
        pass