This commit is contained in:
Voussoir 2015-06-02 20:30:35 -07:00
parent cc68bcbe06
commit 94e54c9861
3 changed files with 68 additions and 41 deletions

Binary file not shown.

View file

@ -1,6 +1,10 @@
import traceback import traceback
import sqlite3 import sqlite3
import totaldl import totaldl
import praw
r = praw.Reddit('')
r.login('', '')
sql = sqlite3.connect('!!testdata.db') sql = sqlite3.connect('!!testdata.db')
cur = sql.cursor() cur = sql.cursor()
@ -25,7 +29,13 @@ while True:
if len(title) > 35: if len(title) > 35:
title = title[:34] + '-' title = title[:34] + '-'
try: try:
totaldl.handle_master(url, customname=title) filepath = totaldl.handle_master(url, customname=title)
filepath = filepath.split('/')[-1]
if '.mp4' in filepath:
filepath = 'http://syriancivilwar.pw/Videos/' + filepath
submission = r.get_info(thing_id=item[1])
submission.add_comment('Mirror: %s' % filepath)
print(filepath)
except: except:
traceback.print_exc() traceback.print_exc()
cur2.execute('INSERT INTO totaldl_urls VALUES(?)', [url]) cur2.execute('INSERT INTO totaldl_urls VALUES(?)', [url])

View file

@ -10,7 +10,8 @@ DOWNLOAD_DIRECTORY = ''
# Save files to this folder # Save files to this folder
# If blank, it uses the local folder # If blank, it uses the local folder
IMGUR_ALBUM_INDV = '"og:image"content="htt' IMGUR_ALBUM_INDV = 'Viewfullresolution<'
IMGUR_ALBUM_INDV2 = 'linkrel="image_src"'
# The HTML string which tells us that an image link is # The HTML string which tells us that an image link is
# on this line. # on this line.
@ -42,6 +43,9 @@ LIVELEAK_YOUTUBEIFRAME = 'youtube.com/embed'
LIVELEAK_RESOLUTIONS = ['h264_base', 'h264_720p', 'h264_270p'] LIVELEAK_RESOLUTIONS = ['h264_base', 'h264_720p', 'h264_270p']
YOUTUBE_DL_FORMAT = 'youtube-dl "{url}" --no-playlist --force-ipv4 -o "/{dir}/{name}.%(ext)s"'
# The format for the youtube-dl shell command
DO_GENERIC = True DO_GENERIC = True
# If true, attempt to download whatever URL goes in # If true, attempt to download whatever URL goes in
# Else, only download from the explicitly supported sites # Else, only download from the explicitly supported sites
@ -73,7 +77,7 @@ def download_file(url, localname):
for chunk in downloading.iter_content(chunk_size=1024): for chunk in downloading.iter_content(chunk_size=1024):
if chunk: if chunk:
localfile.write(chunk) localfile.write(chunk)
return True return localname
def request_get(url, stream=False): def request_get(url, stream=False):
global last_request global last_request
@ -93,11 +97,18 @@ def request_get(url, stream=False):
def handle_imgur_html(url): def handle_imgur_html(url):
pagedata = request_get(url) pagedata = request_get(url)
pagedata = pagedata.text.replace(' ', '') pagedata = pagedata.text.replace(' ', '')
pagedata = pagedata.replace('src="', 'href="')
pagedata = pagedata.replace(IMGUR_ALBUM_INDV2, IMGUR_ALBUM_INDV)
pagedata = pagedata.split('\n') pagedata = pagedata.split('\n')
pagedata = [line.strip() for line in pagedata] pagedata = [line.strip() for line in pagedata]
pagedata = [line for line in pagedata if IMGUR_ALBUM_INDV in line] pagedata = [line for line in pagedata if IMGUR_ALBUM_INDV in line]
pagedata = [line.split('"')[-2] for line in pagedata] pagedata = [line.split('href=')[1] for line in pagedata]
pagedata = [line.replace('"//', '"http://') for line in pagedata]
pagedata = [line.split('"')[1] for line in pagedata]
links = [] links = []
first = pagedata[0].split('.')[0]
if [x.split('.')[0] for x in pagedata].count(first) > 1:
pagedata = pagedata[1:]
for image in pagedata: for image in pagedata:
image = image.split('?')[0] image = image.split('?')[0]
if image not in links: if image not in links:
@ -110,6 +121,7 @@ def handle_imgur(url, albumid='', customname=None):
# This link doesn't appear to have an image id # This link doesn't appear to have an image id
return return
url = url.replace('/gallery/', '/a/')
basename = name.split('.')[0] basename = name.split('.')[0]
if '.' in name: if '.' in name:
# This is a direct image link # This is a direct image link
@ -130,7 +142,7 @@ def handle_imgur(url, albumid='', customname=None):
else: else:
localpath = name localpath = name
download_file(url, localpath) return download_file(url, localpath)
else: else:
# Not a direct image link, let's read the html. # Not a direct image link, let's read the html.
@ -138,14 +150,19 @@ def handle_imgur(url, albumid='', customname=None):
if customname: if customname:
name = customname name = customname
print('\tFound %d images' % len(images)) print('\tFound %d images' % len(images))
localfiles = []
if len(images) > 1: if len(images) > 1:
for imagei in range(len(images)): for imagei in range(len(images)):
image = images[imagei] image = images[imagei]
iname = image.split('/')[-1] iname = image.split('/')[-1]
iname = iname.split('.')[0] iname = iname.split('.')[0]
handle_imgur(image, albumid=name, customname='%d_%s' % (imagei, iname)) x = handle_imgur(image, albumid=name, customname='%d_%s' % (imagei, iname))
localfiles.append(x)
else: else:
handle_imgur(images[0], customname=name) x = handle_imgur(images[0], customname=name)
localfiles.append(x)
return localfiles
def handle_gfycat(url, customname=None): def handle_gfycat(url, customname=None):
@ -166,8 +183,7 @@ def handle_gfycat(url, customname=None):
for subdomain in GFYCAT_SUBDOMAINS: for subdomain in GFYCAT_SUBDOMAINS:
url = 'http://%s.gfycat.com/%s' % (subdomain, name) url = 'http://%s.gfycat.com/%s' % (subdomain, name)
try: try:
download_file(url, filename) return download_file(url, filename)
break
except StatusExc: except StatusExc:
pass pass
@ -200,7 +216,7 @@ def handle_vimeo(url, customname=None):
filename = customname + '.mp4' filename = customname + '.mp4'
else: else:
filename = name + '.mp4' filename = name + '.mp4'
download_file(fileurl, filename) return download_file(fileurl, filename)
def handle_liveleak(url, customname=None): def handle_liveleak(url, customname=None):
@ -231,23 +247,24 @@ def handle_liveleak(url, customname=None):
for res in LIVELEAK_RESOLUTIONS: for res in LIVELEAK_RESOLUTIONS:
url = pagedata.replace('LIVELEAKRESOLUTION', res) url = pagedata.replace('LIVELEAKRESOLUTION', res)
try: try:
download_file(url, name) return download_file(url, name)
return
except StatusExc: except StatusExc:
pass pass
download_file(original, name) return download_file(original, name)
def handle_youtube(url, customname=None): def handle_youtube(url, customname=None):
# The customname doesn't do anything on this function
# but handle_master works better if everything uses
# the same format.
url = url.replace('&amp;', '&') url = url.replace('&amp;', '&')
url = url.replace('feature=player_embedded&', '') url = url.replace('feature=player_embedded&', '')
url = url.replace('&feature=player_embedded', '') url = url.replace('&feature=player_embedded', '')
os.system('youtube-dl "{0}" --no-playlist --force-ipv4 -o "/{1}/%(title)s.%(ext)s"'.format(url, DOWNLOAD_DIRECTORY)) if not customname:
os.system(YOUTUBE_DL_FORMAT.format(url=url, dir=DOWNLOAD_DIRECTORY, name='%(title)s'))
return
os.system(YOUTUBE_DL_FORMAT.format(url=url, dir=DOWNLOAD_DIRECTORY, name=customname))
if DOWNLOAD_DIRECTORY:
return '%s/%s.mp4' % (DOWNLOAD_DIRECTORY, customname)
return '%s.mp4' % customname
def handle_twitter(url, customname=None): def handle_twitter(url, customname=None):
pagedata = request_get(url) pagedata = request_get(url)
@ -260,34 +277,32 @@ def handle_twitter(url, customname=None):
name = idnumber name = idnumber
customname = idnumber customname = idnumber
tweetpath = '%s.html' % (DOWNLOAD_DIRECTORY + name) tweetpath = '%s.html' % (DOWNLOAD_DIRECTORY + name)
if not os.path.exists(tweetpath):
psplit = '<p class="TweetTextSize' psplit = '<p class="TweetTextSize'
tweettext = pagedata.split(psplit)[1] tweettext = pagedata.split(psplit)[1]
tweettext = tweettext.split('</p>')[0] tweettext = tweettext.split('</p>')[0]
tweettext = psplit + tweettext + '</p>' tweettext = psplit + tweettext + '</p>'
tweettext = '<html><body>%s</body></html>' % tweettext tweettext = '<html><body>%s</body></html>' % tweettext
tweettext = tweettext.replace('/hashtag/', 'http://twitter.com/hashtag/') tweettext = tweettext.replace('/hashtag/', 'http://twitter.com/hashtag/')
tweethtml = open(tweetpath, 'w') tweethtml = open(tweetpath, 'w', encoding='utf-8')
tweethtml.write(tweettext) tweethtml.write(tweettext)
tweethtml.close() tweethtml.close()
print('\tSaved tweet text') print('\tSaved tweet text')
else:
print('\tTweet text already exists')
try: try:
link = pagedata.split('data-url="')[1] link = pagedata.split('data-url="')[1]
link = link.split('"')[0] link = link.split('"')[0]
if link != url: if link != url:
handle_master(link, customname=customname) handle_master(link, customname=customname)
return return tweetpath
except IndexError: except IndexError:
try: try:
link = pagedata.split('data-expanded-url="')[1] link = pagedata.split('data-expanded-url="')[1]
link = link.split('"')[0] link = link.split('"')[0]
if link != url: if link != url:
handle_master(link, customname=customname) handle_master(link, customname=customname)
return return tweetpath
except IndexError: except IndexError:
pass pass
return tweetpath
print('\tNo media detected') print('\tNo media detected')
@ -298,7 +313,7 @@ def handle_generic(url, customname=None):
name = '%s.%s' % (customname, name.split('.')[-1]) name = '%s.%s' % (customname, name.split('.')[-1])
if '.' not in name: if '.' not in name:
name += '.html' name += '.html'
download_file(url, name) return download_file(url, name)
except: except:
pass pass
## ##
@ -318,10 +333,9 @@ def handle_master(url, customname=None):
print('Handling %s' % url) print('Handling %s' % url)
for handlerkey in HANDLERS: for handlerkey in HANDLERS:
if handlerkey.lower() in url.lower(): if handlerkey.lower() in url.lower():
HANDLERS[handlerkey](url, customname=customname) return HANDLERS[handlerkey](url, customname=customname)
return
if DO_GENERIC: if DO_GENERIC:
handle_generic(url, customname=customname) return handle_generic(url, customname=customname)
def test_imgur(): def test_imgur():
# Imgur gallery album # Imgur gallery album
@ -371,7 +385,7 @@ def test_youtube():
handle_master('https://www.youtube.com/watch?v=bEgeh5hA5ko') handle_master('https://www.youtube.com/watch?v=bEgeh5hA5ko')
# Youtube short link # Youtube short link
handle_master('https://youtu.be/GjOBTstnW20') handle_master('https://youtu.be/GjOBTstnW20', customname='youtube')
# Youtube player embed link # Youtube player embed link
handle_master('https://www.youtube.com/watch?feature=player_embedded&amp;v=bEgeh5hA5ko') handle_master('https://www.youtube.com/watch?feature=player_embedded&amp;v=bEgeh5hA5ko')
@ -395,6 +409,9 @@ def test_twitter():
# Twitter plain text # Twitter plain text
handle_master('https://twitter.com/SyriacMFS/status/556513635913437184') handle_master('https://twitter.com/SyriacMFS/status/556513635913437184')
# Twitter with arabic characters
handle_master('https://twitter.com/HadiAlabdallah/status/600885154991706113')
def test_generic(): def test_generic():
# Some link that might work # Some link that might work
handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt') handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt')
@ -412,8 +429,8 @@ if __name__ == '__main__':
#test_imgur() #test_imgur()
#test_gfycat() #test_gfycat()
#test_vimeo() #test_vimeo()
#test_liveleak() test_liveleak()
#test_youtube() test_youtube()
test_twitter() #test_twitter()
#test_generic() #test_generic()
pass pass