import bs4
import json
import requests
import os
import time
import sys

HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.152 Safari/537.36'}

DOWNLOAD_DIRECTORY = ''
# Save files to this folder
# If blank, it uses the local folder

IMGUR_ALBUM_INDV = '"og:image"content="htt'
# The HTML string which tells us that an image link is on this page
# (matched after all spaces are stripped; see handle_imgur_html)

IMGUR_ALBUMFOLDERS = True
# If True, the images belonging to an album will be placed in a folder
# named after the album, as <albumid>/<imageid>.jpg
# Else, files will be named <albumid>_<imageid>.jpg and placed
# in the local folder

GFYCAT_MP4 = False
# If True, download gfycat urls in .mp4
# Else, .webm

GFYCAT_SUBDOMAINS = ['zippy', 'fat', 'giant']

SLEEPINESS = 2
# The number of seconds to wait in between making requests
# Similar to PRAW's ratelimit handling.
# Not required, but probably better for the environment.

VIMEO_DICT_START = '"files":{"h264":'
VIMEO_DICT_END = ',"hls"'
# The HTML strings which tell us where the mp4 file is

VIMEO_PRIORITY = ['hd', 'sd', 'mobile']
# Download files in this priority

LIVELEAK_YOUTUBEIFRAME = 'youtube.com/embed'
LIVELEAK_RESOLUTIONS = ['h264_base', 'h264_720p', 'h264_270p']

YOUTUBE_DL_FORMAT = 'youtube-dl "{url}" --no-playlist --force-ipv4 -o "/{dir}/{name}.%(ext)s"'
# The format for the youtube-dl shell command

DO_GENERIC = True
# If True, attempt to download whatever URL goes in
# Else, only download from the explicitly supported sites

''' End user config '''

last_request = 0

if DOWNLOAD_DIRECTORY != '':
    if DOWNLOAD_DIRECTORY[-1] not in ['/', '\\']:
        DOWNLOAD_DIRECTORY += '\\'
    if not os.path.exists(DOWNLOAD_DIRECTORY):
        os.makedirs(DOWNLOAD_DIRECTORY)


class StatusExc(Exception):
    pass


def download_file(url, localname, headers={}):
    localname = os.path.join(DOWNLOAD_DIRECTORY, localname)
    dirname = os.path.split(localname)[0]
    if dirname != '' and not os.path.exists(dirname):
        os.makedirs(dirname)
    if 'twimg' in url:
        # Strip twitter's size suffixes so we save the canonical filename
        localname = localname.replace(':large', '')
        localname = localname.replace(':small', '')
    if os.path.exists(localname):
        print('\t%s already exists!!' % localname)
        return localname
    print('\tDownloading %s' % localname)
    downloading = request_get(url, stream=True, headers=headers)
    localfile = open(localname, 'wb')
    for chunk in downloading.iter_content(chunk_size=1024):
        if chunk:
            localfile.write(chunk)
    localfile.close()
    return localname


def request_get(url, stream=False, headers={}):
    # Wait out the remainder of the SLEEPINESS window before requesting
    global last_request
    now = time.time()
    diff = now - last_request
    if diff < SLEEPINESS:
        diff = SLEEPINESS - diff
        time.sleep(diff)
    last_request = time.time()
    h = HEADERS.copy()
    h.update(headers)
    req = requests.get(url, stream=stream, headers=h)
    if req.status_code not in [200, 206]:
        raise StatusExc("Status code %d on url %s" % (req.status_code, url))
    return req
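# A minimal usage sketch, not part of the original script; the URL and
# filename below are hypothetical. Every site handler funnels through these
# two functions: request_get() applies the spoofed User-Agent and the
# SLEEPINESS rate limit, and download_file() streams the response to disk in
# 1 KiB chunks, raising StatusExc on any status other than 200/206.
#
#     download_file('http://example.com/clip.webm', 'clip.webm')
#     # -> saves to DOWNLOAD_DIRECTORY/clip.webm and returns the local path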
##############################################################################
##

def handle_gfycat(url, customname=None):
    print('Gfycat')
    name = url.split('/')[-1]
    name = name.split('.')[0]
    if customname:
        filename = customname
    else:
        filename = name
    if GFYCAT_MP4:
        name += '.mp4'
        filename += '.mp4'
    else:
        name += '.webm'
        filename += '.webm'
    # Try each known content subdomain until one serves the file
    for subdomain in GFYCAT_SUBDOMAINS:
        url = 'http://%s.gfycat.com/%s' % (subdomain, name)
        try:
            return download_file(url, filename)
        except StatusExc:
            pass


def handle_liveleak(url, customname=None):
    print('Liveleak')
    if customname:
        name = customname
    else:
        name = url.split('=')[1]
    name += '.mp4'
    pagedata = request_get(url)
    pagedata = pagedata.text
    if LIVELEAK_YOUTUBEIFRAME in pagedata:
        pagedata = pagedata.split('\n')
        pagedata = [line for line in pagedata if LIVELEAK_YOUTUBEIFRAME in line]
        pagedata = pagedata[0]
        pagedata = pagedata.split('src="')[1]
        pagedata = pagedata.split('"')[0]
        print('\tFound youtube embed')
        # Defer to the main dispatcher (not shown in this excerpt)
        handle_master(pagedata, customname=customname)
    else:
        pagedata = pagedata.split('file: "')[1]
        pagedata = pagedata.split('",')[0]
        original = pagedata
        # Swap the resolution token in the file url for a placeholder,
        # then try each resolution in order of preference
        pagedata = pagedata.split('.')
        for spoti in range(len(pagedata)):
            if 'h264_' in pagedata[spoti]:
                pagedata[spoti] = 'LIVELEAKRESOLUTION'
        pagedata = '.'.join(pagedata)
        for res in LIVELEAK_RESOLUTIONS:
            url = pagedata.replace('LIVELEAKRESOLUTION', res)
            try:
                return download_file(url, name)
            except StatusExc:
                pass
        return download_file(original, name)


def handle_imgur_html(url):
    print('Imgur')
    pagedata = request_get(url)
    pagedata = pagedata.text.replace(' ', '')
    pagedata = pagedata.split('\n')
    pagedata = [line for line in pagedata if IMGUR_ALBUM_INDV in line]
    pagedata = [line.split('content="')[1] for line in pagedata]
    links = [line.split('"')[0] for line in pagedata]
    links = [line.split('?')[0] for line in links]
    print(links)
    return links
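# Illustrative sketch only; the album URL and image ids are hypothetical.
# handle_imgur_html() scrapes the image meta tags out of an album page and
# returns the direct image links, which handle_imgur() below then feeds back
# into itself one at a time:
#
#     handle_imgur_html('http://imgur.com/a/aaaaa')
#     # -> ['http://i.imgur.com/11111.jpg', 'http://i.imgur.com/22222.jpg']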
def handle_imgur(url, albumid='', customname=None):
    print('Imgur')
    name = url.split('/')[-1]
    if 'imgur.com' in name:
        # This link doesn't appear to have an image id
        return
    url = url.replace('/gallery/', '/a/')
    basename = name.split('.')[0]
    if '.' in name:
        # This is a direct image link
        if customname:
            # Replace the imgur ID with the customname, keep the extension
            name = '%s.%s' % (customname, name.split('.')[-1])
        if albumid and albumid != basename:
            if IMGUR_ALBUMFOLDERS:
                if not os.path.exists(DOWNLOAD_DIRECTORY + albumid):
                    os.makedirs(DOWNLOAD_DIRECTORY + albumid)
                localpath = '%s\\%s' % (albumid, name)
            else:
                localpath = '%s_%s' % (albumid, name)
        else:
            localpath = name
        return download_file(url, localpath)
    else:
        # Not a direct image link, let's read the html.
        images = handle_imgur_html(url)
        if customname:
            name = customname
        print('\tFound %d images' % len(images))
        localfiles = []
        if len(images) > 1:
            for imagei in range(len(images)):
                image = images[imagei]
                iname = image.split('/')[-1]
                iname = iname.split('.')[0]
                x = handle_imgur(image, albumid=name, customname='%d_%s' % (imagei, iname))
                localfiles.append(x)
        else:
            x = handle_imgur(images[0], customname=name)
            localfiles.append(x)
        return localfiles


def handle_twitter(url, customname=None):
    print('Twitter')
    pagedata = request_get(url)
    pagedata = pagedata.text
    idnumber = url.split('status/')[1].split('/')[0]
    if customname:
        name = customname
    else:
        name = idnumber
        customname = idnumber
    tweetpath = '%s.html' % (DOWNLOAD_DIRECTORY + name)
    psplit = '