master
unknown 2016-03-07 12:25:50 -08:00
parent 9b149ad4c1
commit 4d9871494b
3 changed files with 823 additions and 578 deletions

View File

@ -1,4 +1,16 @@
Spinal Spinal
======== ========
A couple of tools for copying files and directories. A couple of tools for copying files and directories.
2016 03 02
- Fixed issue where the copy's path casing was based on the input string and not the path's actual casing (since Windows doesn't care).
- Change the returned written_bytes to 0 if the file did not need to be copied. This is better for tracking how much actually happens during each backup.
- Fixed encode errors caused by callback_v1's print statement.
2016 03 03
- Moved directory / filename exclusion logic into the walk_generator so the caller doesn't need to worry about it.
- walk_generator now yields absolute filenames since copy_dir no longer needs to process exclusions, and that was the only reason walk_generator used to yield them in parts.
2016 03 04
- Created a FilePath class to cache os.stat data, which should reduce the number of unnecessary filesystem calls.

View File

@ -1,7 +1,11 @@
import collections
import glob
import json import json
import os import os
import ratelimiter import ratelimiter
import shutil import shutil
import stat
import string
import sys import sys
import time import time
@ -10,8 +14,17 @@ KIBIBYTE = BYTE * 1024
MIBIBYTE = KIBIBYTE * 1024 MIBIBYTE = KIBIBYTE * 1024
GIBIBYTE = MIBIBYTE * 1024 GIBIBYTE = MIBIBYTE * 1024
TEBIBYTE = GIBIBYTE * 1024 TEBIBYTE = GIBIBYTE * 1024
SIZE_UNITS = (TEBIBYTE, GIBIBYTE, MIBIBYTE, KIBIBYTE, BYTE)
CHUNK_SIZE = 64 * KIBIBYTE UNIT_STRINGS = {
BYTE: 'b',
KIBIBYTE: 'KiB',
MIBIBYTE: 'MiB',
GIBIBYTE: 'GiB',
TEBIBYTE: 'TiB',
}
CHUNK_SIZE = 128 * KIBIBYTE
# Number of bytes to read and write at a time # Number of bytes to read and write at a time
@ -33,6 +46,69 @@ class SourceNotFile(Exception):
class SpinalError(Exception): class SpinalError(Exception):
pass pass
class FilePath:
    '''
    An absolute path whose filesystem metadata is looked up lazily and cached.

    The first access to `stat`, `isdir`, `isfile`, `islink` or `size` performs
    the filesystem call; later accesses reuse the stored answer. The cache is
    never refreshed, so a FilePath created before a file is written keeps
    reporting the old state.

    Two FilePaths compare equal (and hash equal) when their absolute path
    strings are identical.
    '''
    def __init__(self, path):
        self.path = os.path.abspath(path)
        # None means "not looked up yet" for every cache slot below.
        self._stat = None
        self._isdir = None
        self._isfile = None
        self._islink = None
        self._size = None

    def __eq__(self, other):
        # Defined alongside __hash__ so that equal paths collapse to a single
        # entry in sets and dict keys.
        if not isinstance(other, FilePath):
            return NotImplemented
        return self.path == other.path

    def __hash__(self):
        return hash(self.path)

    def __repr__(self):
        return repr(self.path)

    @property
    def isdir(self):
        '''
        True if the path is a directory, False if it is not or does not exist.
        '''
        return self.type_getter('_isdir', stat.S_ISDIR)

    @property
    def isfile(self):
        '''
        True if the path is a regular file, False if it is not or does not exist.
        '''
        return self.type_getter('_isfile', stat.S_ISREG)

    @property
    def islink(self):
        '''
        True if the path itself is a symbolic link, False otherwise.
        '''
        # os.stat follows symlinks, so S_ISLNK on `self.stat` can never be
        # True. The link itself has to be inspected with os.lstat.
        if self._islink is None:
            try:
                link_stat = os.lstat(self.path)
            except FileNotFoundError:
                # Same as the other type checks: missing paths answer False
                # and are not cached.
                return False
            self._islink = stat.S_ISLNK(link_stat.st_mode)
        return self._islink

    @property
    def size(self):
        '''
        Size in bytes from the cached stat result, or None if the path does
        not exist.
        '''
        if self._size is None:
            if self.stat is False:
                self._size = None
            else:
                self._size = self.stat.st_size
        return self._size

    @property
    def stat(self):
        '''
        The cached `os.stat` result, or False if the path does not exist.
        '''
        if self._stat is None:
            try:
                self._stat = os.stat(self.path)
            except FileNotFoundError:
                # False (not None) so the failed lookup is remembered too.
                self._stat = False
        return self._stat

    def type_getter(self, attr, resolution):
        '''
        Return the cached boolean stored in `attr`, computing it on first use
        by applying `resolution` (a `stat.S_IS*` function) to the stat mode.
        Returns False, without caching, when the path does not exist.
        '''
        if getattr(self, attr) is None:
            if self.stat is False:
                return False
            else:
                setattr(self, attr, resolution(self.stat.st_mode))
        return getattr(self, attr)
def bytes_to_unit_string(bytes):
    '''
    Render a byte count as "<value> <unit>" with three decimal places, using
    the largest unit from SIZE_UNITS that does not exceed the count.
    '''
    # SIZE_UNITS is ordered largest-first, so the first unit that fits wins.
    # When nothing fits, fall back to a divisor of 1, which is also used as
    # the UNIT_STRINGS key (presumably equal to BYTE).
    chosen_unit = next((unit for unit in SIZE_UNITS if bytes >= unit), 1)
    return '%.3f %s' % ((bytes / chosen_unit), UNIT_STRINGS[chosen_unit])
def callback_exclusion(name, path_type): def callback_exclusion(name, path_type):
''' '''
@ -40,18 +116,19 @@ def callback_exclusion(name, path_type):
''' '''
print('Excluding', name) print('Excluding', name)
def callback_v1(filename, written_bytes, total_bytes): def callback_v1(fpobj, written_bytes, total_bytes):
''' '''
Example of a copy callback function. Example of a copy callback function.
Prints "filename written/total (percent%)" Prints "fpobj written/total (percent%)"
''' '''
filename = fpobj.path.encode('ascii', 'replace').decode()
if written_bytes >= total_bytes: if written_bytes >= total_bytes:
ends = '\n' ends = '\n'
else: else:
ends = '' ends = ''
percent = (100 * written_bytes) / total_bytes percent = (100 * written_bytes) / total_bytes
percent = '%03.3f' % percent percent = '%07.3f' % percent
written = '{:,}'.format(written_bytes) written = '{:,}'.format(written_bytes)
total = '{:,}'.format(total_bytes) total = '{:,}'.format(total_bytes)
written = written.rjust(len(total), ' ') written = written.rjust(len(total), ' ')
@ -60,6 +137,197 @@ def callback_v1(filename, written_bytes, total_bytes):
print(status, end=ends) print(status, end=ends)
sys.stdout.flush() sys.stdout.flush()
def copy(source, file_args=None, file_kwargs=None, dir_args=None, dir_kwargs=None):
    '''
    Dispatch to copy_file or copy_dir depending on what `source` is.

    file_args / file_kwargs:
        Extra positional and keyword arguments forwarded to `copy_file`
        when the source is a file.

    dir_args / dir_kwargs:
        Extra positional and keyword arguments forwarded to `copy_dir`
        when the source is a directory.

    Returns whatever the chosen copy function returns.
    Raises SpinalError if the source is neither a file nor a directory.
    '''
    fp = str_to_fp(source)

    if fp.isfile:
        return copy_file(fp, *(file_args or tuple()), **(file_kwargs or dict()))

    if fp.isdir:
        return copy_dir(fp, *(dir_args or tuple()), **(dir_kwargs or dict()))

    raise SpinalError('Neither file nor dir: %s' % fp)
def copy_dir(
        source,
        destination=None,
        destination_new_root=None,
        bytes_per_second=None,
        callback_directory=None,
        callback_file=None,
        callback_permission_denied=None,
        dry_run=False,
        exclude_directories=None,
        exclude_filenames=None,
        exclusion_callback=None,
        overwrite_old=True,
        precalcsize=False,
    ):
    '''
    Copy all of the contents from source to destination,
    including subdirectories.

    source:
        The directory which will be copied.

    destination:
        The directory in which copied files are placed. Alternatively, use
        destination_new_root.

    destination_new_root:
        Determine the destination path by calling
        `new_root(source, destination_new_root)`.
        Thus, this path acts as a root and the rest of the path is matched.

    bytes_per_second:
        Restrict file copying to this many bytes per second. Can be an integer
        or an existing Ratelimiter object.
        The provided BYTE, KIBIBYTE, etc constants may help.

        Default = None

    callback_directory:
        This function will be called after each file copy with three parameters:
        the first element returned by `copy_file` for that file, number of
        bytes written to destination so far, total bytes needed (from
        precalcsize).

        Default = None

    callback_file:
        Will be passed into each individual `copy_file` operation as the
        `callback` for that file.

        Default = None

    callback_permission_denied:
        Will be passed into each individual `copy_file` operation as the
        `callback_permission_denied` for that file.

        Default = None

    dry_run:
        Do everything except the actual file copying.

        Default = False

    exclude_filenames:
        Passed directly into `walk_generator`.

        Default = None

    exclude_directories:
        Passed directly into `walk_generator`.

        Default = None

    exclusion_callback:
        Passed directly into `walk_generator`.

        Default = None

    overwrite_old:
        If True, overwrite the destination file if the source file
        has a more recent "last modified" timestamp.

        Default = True

    precalcsize:
        If True, calculate the size of source before beginning the
        operation. This number can be used in the callback_directory function.
        Else, callback_directory will receive written bytes as total bytes
        (showing 100% always).
        This can take a long time.

        Default = False

    Returns: [destination path, number of bytes written to destination]
    (Written bytes is 0 if all files already existed.)

    Raises ValueError unless exactly one of `destination` and
    `destination_new_root` is given, RecursiveDirectory if the destination
    lies inside the source, SourceNotDirectory if the source is not a
    directory, DestinationIsFile if the destination is an existing file, and
    DestinationIsDirectory if a file's target path is an existing directory.
    '''

    # Prepare parameters
    if not is_xor(destination, destination_new_root):
        m = 'One and only one of `destination` and '
        m += '`destination_new_root` can be passed'
        raise ValueError(m)

    source = str_to_fp(source)
    source = get_path_casing(source)

    if destination_new_root is not None:
        destination = new_root(source, destination_new_root)
    destination = str_to_fp(destination)

    if is_subfolder(source, destination):
        raise RecursiveDirectory(source, destination)

    if not source.isdir:
        raise SourceNotDirectory(source)

    if destination.isfile:
        raise DestinationIsFile(destination)

    if precalcsize is True:
        total_bytes = get_dir_size(source)
    else:
        total_bytes = 0

    # One limiter is shared by every file so the cap applies to the whole
    # directory copy rather than to each file separately.
    if isinstance(bytes_per_second, ratelimiter.Ratelimiter):
        limiter = bytes_per_second
    elif bytes_per_second is not None:
        limiter = ratelimiter.Ratelimiter(allowance_per_period=bytes_per_second, period=1)
    else:
        limiter = None

    # Copy
    written_bytes = 0
    walker = walk_generator(
        source,
        exclude_directories=exclude_directories,
        exclude_filenames=exclude_filenames,
        exclusion_callback=exclusion_callback,
    )
    for source_abspath in walker:
        # Terminology:
        # abspath: C:\folder\subfolder\filename.txt
        # location: C:\folder\subfolder
        # base_name: filename.txt
        # folder: subfolder

        # Swap only the leading source root for the destination root. Without
        # the count of 1, a deeper part of the path that happens to repeat
        # the source root text would be rewritten as well.
        destination_abspath = source_abspath.path.replace(source.path, destination.path, 1)
        destination_abspath = str_to_fp(destination_abspath)

        if destination_abspath.isdir:
            raise DestinationIsDirectory(destination_abspath)

        destination_location = os.path.split(destination_abspath.path)[0]
        if not os.path.isdir(destination_location):
            os.makedirs(destination_location)

        copied = copy_file(
            source_abspath,
            destination_abspath,
            bytes_per_second=limiter,
            callback=callback_file,
            callback_permission_denied=callback_permission_denied,
            dry_run=dry_run,
            overwrite_old=overwrite_old,
        )

        copiedname = copied[0]
        written_bytes += copied[1]

        if callback_directory is not None:
            if precalcsize is False:
                # No precomputed total, so report progress as always complete.
                callback_directory(copiedname, written_bytes, written_bytes)
            else:
                callback_directory(copiedname, written_bytes, total_bytes)

    return [destination, written_bytes]
def copy_file( def copy_file(
source, source,
destination=None, destination=None,
@ -68,6 +336,7 @@ def copy_file(
callback=None, callback=None,
dry_run=False, dry_run=False,
overwrite_old=True, overwrite_old=True,
callback_permission_denied=None,
): ):
''' '''
Copy a file from one place to another. Copy a file from one place to another.
@ -94,11 +363,20 @@ def copy_file(
callback: callback:
If provided, this function will be called after writing If provided, this function will be called after writing
each CHUNK_SIZE bytes to destination with three parameters: each CHUNK_SIZE bytes to destination with three parameters:
name of file being copied, number of bytes written so far, the FilePath object being copied, number of bytes written so far,
total number of bytes needed. total number of bytes needed.
Default = None Default = None
callback_permission_denied:
If provided, this function will be called when a source file denies
read access, with the file path and the exception object as parameters.
THE OPERATION WILL RETURN NORMALLY.
If not provided, the PermissionError is raised.
Default = None
dry_run: dry_run:
Do everything except the actual file copying. Do everything except the actual file copying.
@ -111,6 +389,7 @@ def copy_file(
Default = True Default = True
Returns: [destination filename, number of bytes written to destination] Returns: [destination filename, number of bytes written to destination]
(Written bytes is 0 if the file already existed.)
''' '''
# Prepare parameters # Prepare parameters
if not is_xor(destination, destination_new_root): if not is_xor(destination, destination_new_root):
@ -118,16 +397,17 @@ def copy_file(
m += '`destination_new_root` can be passed' m += '`destination_new_root` can be passed'
raise ValueError(m) raise ValueError(m)
source = str_to_fp(source)
source = get_path_casing(source)
if destination_new_root is not None: if destination_new_root is not None:
destination = new_root(source, destination_new_root) destination = new_root(source, destination_new_root)
destination = str_to_fp(destination)
source = os.path.abspath(source) if not source.isfile:
destination = os.path.abspath(destination)
if not os.path.isfile(source):
raise SourceNotFile(source) raise SourceNotFile(source)
if os.path.isdir(destination): if destination.isdir:
raise DestinationIsDirectory(destination) raise DestinationIsDirectory(destination)
if isinstance(bytes_per_second, ratelimiter.Ratelimiter): if isinstance(bytes_per_second, ratelimiter.Ratelimiter):
@ -137,35 +417,47 @@ def copy_file(
else: else:
limiter = None limiter = None
source_bytes = os.path.getsize(source)
# Determine overwrite # Determine overwrite
destination_exists = os.path.exists(destination) if destination.stat is not False:
if destination_exists: destination_modtime = destination.stat.st_mtime
if overwrite_old is False:
return [destination, source_bytes]
source_modtime = os.path.getmtime(source) if overwrite_old is False:
destination_modtime = os.path.getmtime(destination) return [destination, 0]
source_modtime = source.stat.st_mtime
if source_modtime == destination_modtime: if source_modtime == destination_modtime:
return [destination, source_bytes] return [destination, 0]
# Copy # Copy
if dry_run: if dry_run:
if callback is not None: if callback is not None:
callback(destination, source_bytes, source_bytes) callback(destination, 0, 0)
return [destination, source_bytes] return [destination, 0]
source_bytes = source.size
destination_location = os.path.split(destination.path)[0]
if not os.path.exists(destination_location):
os.makedirs(destination_location)
written_bytes = 0 written_bytes = 0
source_file = open(source, 'rb')
destionation_file = open(destination, 'wb') try:
source_file = open(source.path, 'rb')
destination_file = open(destination.path, 'wb')
except PermissionError as exception:
if callback_permission_denied is not None:
callback_permission_denied(source, exception)
return [destination, 0]
else:
raise
while True: while True:
data_chunk = source_file.read(CHUNK_SIZE) data_chunk = source_file.read(CHUNK_SIZE)
data_bytes = len(data_chunk) data_bytes = len(data_chunk)
if data_bytes == 0: if data_bytes == 0:
break break
destionation_file.write(data_chunk) destination_file.write(data_chunk)
written_bytes += data_bytes written_bytes += data_bytes
if limiter is not None: if limiter is not None:
@ -176,64 +468,102 @@ def copy_file(
# Fin # Fin
source_file.close() source_file.close()
destionation_file.close() destination_file.close()
shutil.copystat(source, destination) shutil.copystat(source.path, destination.path)
return [destination, written_bytes] return [destination, written_bytes]
def copy_dir( def get_path_casing(path):
source_dir, '''
destination_dir=None, Take what is perhaps incorrectly cased input and get the path's actual
destination_new_root=None, casing according to the filesystem.
bytes_per_second=None,
callback_directory=None, Thank you
callback_file=None, Ethan Furman http://stackoverflow.com/a/7133137/5430534
dry_run=False, xvorsx http://stackoverflow.com/a/14742779/5430534
'''
p = str_to_fp(path)
path = p.path
(drive, subpath) = os.path.splitdrive(path)
pattern = ["%s[%s]" % (piece[:-1], piece[-1]) for piece in subpath.split(os.sep)[1:]]
pattern = os.sep.join(pattern)
pattern = drive.upper() + os.sep + pattern
try:
return str_to_fp(glob.glob(pattern)[0])
except IndexError:
return p
def get_dir_size(path):
    '''
    Calculate the total number of bytes across all files in this directory
    and its subdirectories.

    Raises SourceNotDirectory if `path` is not a directory.
    '''
    path = str_to_fp(path)

    if not path.isdir:
        raise SourceNotDirectory(path)

    total_bytes = 0
    # walk_generator yields one FilePath per file, not (directory, filename)
    # pairs, so each item is used directly instead of being unpacked.
    for filepath in walk_generator(path):
        filesize = filepath.size
        if filesize is None:
            # The file disappeared between being listed and being measured.
            continue
        total_bytes += filesize

    return total_bytes
def is_subfolder(parent, child):
    '''
    Determine whether parent contains child.

    Both paths are normalized and given a trailing separator before the
    prefix comparison, so "C:\\foo" is not treated as containing "C:\\foobar".
    A path counts as containing itself.
    '''
    parent_prefix, child_prefix = (
        normalize(str_to_fp(item).path) + os.sep
        for item in (parent, child)
    )
    return child_prefix.startswith(parent_prefix)
def is_xor(*args):
    '''
    Return True when exactly one of the given arguments is truthy,
    and False otherwise (including when no arguments are given).
    '''
    truthy_count = sum(1 for arg in args if arg)
    return truthy_count == 1
def new_root(filepath, root):
    '''
    Prepend `root` to `filepath`, drive letter included. For example:
    "C:\\folder\\subfolder\\file.txt" and "C:\\backups" becomes
    "C:\\backups\\C\\folder\\subfolder\\file.txt"

    I use this so that my G: drive can have backups from my C: and D: drives
    while preserving directory structure in G:\\D and G:\\C.

    Returns a FilePath.
    '''
    filepath = str_to_fp(filepath).path
    root = str_to_fp(root).path
    # Turning the colon into a separator makes the drive letter an ordinary
    # folder name.
    filepath = filepath.replace(':', os.sep)
    filepath = os.path.normpath(filepath)
    # os.path.join throws away `root` when the second argument is absolute,
    # which is the case for paths with no drive letter (POSIX or UNC paths).
    # Strip leading separators so the path is always placed under `root`.
    filepath = filepath.lstrip(os.sep)
    filepath = os.path.join(root, filepath)
    return str_to_fp(filepath)
def normalize(text):
    '''
    Return `text` passed through os.path.normcase and then os.path.normpath,
    giving a form suitable for comparing paths.
    '''
    case_folded = os.path.normcase(text)
    collapsed = os.path.normpath(case_folded)
    return collapsed
def str_to_fp(path):
    '''
    Wrap a string in a FilePath object. Anything that is not a string is
    handed back unchanged.
    '''
    return FilePath(path) if isinstance(path, str) else path
def walk_generator(
path,
exclude_directories=None, exclude_directories=None,
exclude_filenames=None, exclude_filenames=None,
exclusion_callback=None, exclusion_callback=None,
overwrite_old=True,
precalcsize=False,
): ):
''' '''
Copy all of the contents from source_dir to destination_dir, Yield (location, filename) from the file tree similar to os.walk.
including subdirectories. Example value: ('C:\\Users\\Michael\\Music', 'song.mp3')
source_dir:
The directory which will be copied.
destination_dir:
The directory in which copied files are placed. Alternatively, use
destination_new_root.
destination_new_root:
Determine the destination path by calling
`new_root(source_dir, destination_new_root)`.
Thus, this path acts as a root and the rest of the path is matched.
bytes_per_second:
Restrict file copying to this many bytes per second. Can be an integer
or an existing Ratelimiter object.
The provided BYTE, KIBIBYTE, etc constants may help.
Default = None
callback_directory:
This function will be called after each file copy with three parameters:
name of file copied, number of bytes written to destination_dir so far,
total bytes needed (from precalcsize).
Default = None
callback_file:
Will be passed into each individual copy_file() as the `callback`
for that file.
Default = None
dry_run:
Do everything except the actual file copying.
Default = False
exclude_filenames: exclude_filenames:
A set of filenames that will not be copied. Entries can be absolute A set of filenames that will not be copied. Entries can be absolute
@ -256,193 +586,62 @@ def copy_dir(
two parameters: the path, and 'file' or 'directory'. two parameters: the path, and 'file' or 'directory'.
Default = None Default = None
overwrite_old:
If True, overwrite the destination file if the source file
has a more recent "last modified" timestamp.
Default = True
precalcsize:
If True, calculate the size of source_dir before beginning the
operation. This number can be used in the callback_directory function.
Else, callback_directory will receive written bytes as total bytes
(showing 100% always).
This can take a long time.
Default = False
Returns: [destination_dir path, number of bytes written to destination_dir]
''' '''
if exclude_directories is None:
# Prepare parameters exclude_directories = set()
if not is_xor(destination_dir, destination_new_root):
m = 'One and only one of `destination_dir` and '
m += '`destination_new_root` can be passed'
raise ValueError(m)
if destination_new_root is not None:
destination_dir = new_root(source_dir, destination_new_root)
source_dir = os.path.normpath(os.path.abspath(source_dir))
destination_dir = os.path.normpath(os.path.abspath(destination_dir))
if is_subfolder(source_dir, destination_dir):
raise RecursiveDirectory(source_dir, destination_dir)
if not os.path.isdir(source_dir):
raise SourceNotDirectory(source_dir)
if os.path.isfile(destination_dir):
raise DestinationIsFile(destination_dir)
if exclusion_callback is None:
exclusion_callback = lambda *x: None
if exclude_filenames is None: if exclude_filenames is None:
exclude_filenames = set() exclude_filenames = set()
if exclude_directories is None: if exclusion_callback is None:
exclude_directories = set() exclusion_callback = lambda *x: None
exclude_filenames = {normalize(f) for f in exclude_filenames} exclude_filenames = {normalize(f) for f in exclude_filenames}
exclude_directories = {normalize(f) for f in exclude_directories} exclude_directories = {normalize(f) for f in exclude_directories}
if precalcsize is True: path = str_to_fp(path).path
total_bytes = get_dir_size(source_dir)
else:
total_bytes = 0
if isinstance(bytes_per_second, ratelimiter.Ratelimiter): if normalize(path) in exclude_directories:
limiter = bytes_per_second exclusion_callback(path, 'directory')
elif bytes_per_second is not None: return
limiter = ratelimiter.Ratelimiter(allowance_per_period=bytes_per_second, period=1)
else:
limiter = None
# Copy if normalize(os.path.split(path)[1]) in exclude_directories:
written_bytes = 0 exclusion_callback(path, 'directory')
for (source_location, base_filename) in walk_generator(source_dir): return
# Terminology:
# abspath: C:\folder\subfolder\filename.txt
# base_filename: filename.txt
# folder: subfolder
# location: C:\folder\subfolder
#source_location = normalize(source_location)
#base_filename = normalize(base_filename)
source_folder_name = os.path.split(source_location)[1] directory_queue = collections.deque()
source_abspath = os.path.join(source_location, base_filename) directory_queue.append(path)
destination_abspath = source_abspath.replace(source_dir, destination_dir) # This is a recursion-free workplace.
destination_location = os.path.split(destination_abspath)[0] # Thank you for your cooperation.
while len(directory_queue) > 0:
location = directory_queue.popleft()
contents = os.listdir(location)
if base_filename in exclude_filenames: directories = []
exclusion_callback(source_abspath, 'file') for base_name in contents:
continue absolute_name = os.path.join(location, base_name)
if source_abspath in exclude_filenames:
exclusion_callback(source_abspath, 'file')
continue
if source_location in exclude_directories:
exclusion_callback(source_location, 'directory')
continue
if source_folder_name in exclude_directories:
exclusion_callback(source_location, 'directory')
continue
if os.path.isdir(destination_abspath): if os.path.isdir(absolute_name):
raise DestinationIsDirectory(destination_abspath) if normalize(absolute_name) in exclude_directories:
exclusion_callback(absolute_name, 'directory')
continue
if not os.path.isdir(destination_location): if normalize(base_name) in exclude_directories:
os.makedirs(destination_location) exclusion_callback(absolute_name, 'directory')
continue
copied = copy_file( directories.append(absolute_name)
source_abspath,
destination_abspath,
bytes_per_second=limiter,
callback=callback_file,
dry_run=dry_run,
overwrite_old=overwrite_old,
)
copiedname = copied[0]
written_bytes += copied[1]
if callback_directory is not None:
if precalcsize is False:
callback_directory(copiedname, written_bytes, written_bytes)
else: else:
callback_directory(copiedname, written_bytes, total_bytes) if normalize(base_name) in exclude_filenames:
exclusion_callback(absolute_name, 'file')
continue
if normalize(absolute_name) in exclude_filenames:
exclusion_callback(absolute_filename, 'file')
continue
return [destination_dir, written_bytes] yield(str_to_fp(absolute_name))
def execute_spinaltask(task): directories.reverse()
''' directory_queue.extendleft(directories)
Execute a spinal task.
'''
pass
def get_dir_size(source_dir):
'''
Calculate the total number of bytes across all files in this directory
and its subdirectories.
'''
source_dir = os.path.abspath(source_dir)
if not os.path.isdir(source_dir):
raise SourceNotDirectory(source_dir)
total_bytes = 0
for (directory, filename) in walk_generator(source_dir):
filename = os.path.join(directory, filename)
filesize = os.path.getsize(filename)
total_bytes += filesize
return total_bytes
def is_subfolder(parent, child):
'''
Determine whether parent contains child.
'''
parent = normalize(os.path.abspath(parent)) + os.sep
child = normalize(os.path.abspath(child)) + os.sep
return child.startswith(parent)
def is_xor(*args):
'''
Return True if and only if one arg is truthy.
'''
return [bool(a) for a in args].count(True) == 1
def new_root(filepath, root):
'''
Prepend `root` to `filepath`, drive letter included. For example:
"C:\\folder\\subfolder\\file.txt" and "C:\\backups" becomes
"C:\\backups\\C\\folder\\subfolder\\file.txt"
I use this so that my G: drive can have backups from my C: and D: drives
while preserving directory structure in G:\\D and G:\\C.
'''
filepath = os.path.abspath(filepath)
root = os.path.abspath(root)
filepath = filepath.replace(':', os.sep)
filepath = os.path.normpath(filepath)
filepath = os.path.join(root, filepath)
return filepath
def normalize(text):
'''
Apply os.path.normpath and os.path.normcase.
'''
return os.path.normpath(os.path.normcase(text))
def walk_generator(path):
'''
Yield filenames from os.walk so the caller doesn't need to deal with the
nested for-loops.
'''
path = os.path.abspath(path)
walker = os.walk(path)
for (location, folders, files) in walker:
for filename in files:
yield (location, filename)

View File

@ -1,3 +1,4 @@
import bs4
import json import json
import requests import requests
import os import os
@ -21,7 +22,7 @@ IMGUR_ALBUMFOLDERS = True
# Else, files will be named <album_id>_<img_id>.jpg and placed # Else, files will be named <album_id>_<img_id>.jpg and placed
# in the local folder. # in the local folder.
GFYCAT_MP4 = True GFYCAT_MP4 = False
# If True, download gfycat urls in .mp4 # If True, download gfycat urls in .mp4
# Else, .webm # Else, .webm
@ -55,403 +56,436 @@ DO_GENERIC = True
last_request = 0 last_request = 0
if DOWNLOAD_DIRECTORY != '': if DOWNLOAD_DIRECTORY != '':
if DOWNLOAD_DIRECTORY[-1] not in ['/', '\\']: if DOWNLOAD_DIRECTORY[-1] not in ['/', '\\']:
DOWNLOAD_DIRECTORY += '\\' DOWNLOAD_DIRECTORY += '\\'
if not os.path.exists(DOWNLOAD_DIRECTORY): if not os.path.exists(DOWNLOAD_DIRECTORY):
os.makedirs(DOWNLOAD_DIRECTORY) os.makedirs(DOWNLOAD_DIRECTORY)
class StatusExc(Exception): class StatusExc(Exception):
pass pass
def download_file(url, localname, headers={}): def download_file(url, localname, headers={}):
localname = DOWNLOAD_DIRECTORY + localname localname = os.path.join(DOWNLOAD_DIRECTORY, localname)
if 'twimg' in url: dirname = os.path.split(localname)[0]
localname = localname.replace(':large', '') if dirname != '' and not os.path.exists(dirname):
localname = localname.replace(':small', '') os.makedirs(dirname)
if os.path.exists(localname): if 'twimg' in url:
print('\t%s already exists!!' % localname) localname = localname.replace(':large', '')
return localname localname = localname.replace(':small', '')
print('\tDownloading %s' % localname) if os.path.exists(localname):
downloading = request_get(url, stream=True, headers=headers) print('\t%s already exists!!' % localname)
localfile = open(localname, 'wb') return localname
for chunk in downloading.iter_content(chunk_size=1024): print('\tDownloading %s' % localname)
if chunk: downloading = request_get(url, stream=True, headers=headers)
localfile.write(chunk) localfile = open(localname, 'wb')
localfile.close() for chunk in downloading.iter_content(chunk_size=1024):
return localname if chunk:
localfile.write(chunk)
localfile.close()
return localname
def request_get(url, stream=False, headers={}): def request_get(url, stream=False, headers={}):
global last_request global last_request
now = time.time() now = time.time()
diff = now - last_request diff = now - last_request
if diff < SLEEPINESS: if diff < SLEEPINESS:
diff = SLEEPINESS - diff diff = SLEEPINESS - diff
time.sleep(diff) time.sleep(diff)
last_request = time.time() last_request = time.time()
h = HEADERS.copy() h = HEADERS.copy()
h.update(headers) h.update(headers)
req = requests.get(url, stream=stream, headers=h) req = requests.get(url, stream=stream, headers=h)
if req.status_code not in [200,206]: if req.status_code not in [200,206]:
raise StatusExc("Status code %d on url %s" % (req.status_code, url)) raise StatusExc("Status code %d on url %s" % (req.status_code, url))
return req return req
############################################################################## ##############################################################################
## ##
def handle_imgur_html(url):
pagedata = request_get(url)
pagedata = pagedata.text.replace(' ', '')
pagedata = pagedata.split('\n')
pagedata = [line for line in pagedata if IMGUR_ALBUM_INDV in line]
pagedata = [line.split('content="')[1] for line in pagedata]
links = [line.split('"')[0] for line in pagedata]
links = [line.split('?')[0] for line in links]
print(links)
return links
def handle_imgur(url, albumid='', customname=None):
name = url.split('/')[-1]
if 'imgur.com' in name:
# This link doesn't appear to have an image id
return
url = url.replace('/gallery/', '/a/')
basename = name.split('.')[0]
if '.' in name:
# This is a direct image link
if customname:
# replace the imgur ID with the customname, keep ext.
name = '%s.%s' % (customname, name.split('.')[-1])
if albumid and albumid != basename:
if IMGUR_ALBUMFOLDERS:
if not os.path.exists(DOWNLOAD_DIRECTORY + albumid):
os.makedirs(DOWNLOAD_DIRECTORY + albumid)
localpath = '%s\\%s' % (albumid, name)
else:
localpath = '%s_%s' % (albumid, name)
else:
localpath = name
return download_file(url, localpath)
else:
# Not a direct image link, let's read the html.
images = handle_imgur_html(url)
if customname:
name = customname
print('\tFound %d images' % len(images))
localfiles = []
if len(images) > 1:
for imagei in range(len(images)):
image = images[imagei]
iname = image.split('/')[-1]
iname = iname.split('.')[0]
x = handle_imgur(image, albumid=name, customname='%d_%s' % (imagei, iname))
localfiles.append(x)
else:
x = handle_imgur(images[0], customname=name)
localfiles.append(x)
return localfiles
def handle_gfycat(url, customname=None): def handle_gfycat(url, customname=None):
name = url.split('/')[-1] print('Gfycat')
name = name.split('.')[0] name = url.split('/')[-1]
if customname: name = name.split('.')[0]
filename = customname if customname:
else: filename = customname
filename = name else:
filename = name
if GFYCAT_MP4: if GFYCAT_MP4:
name += '.mp4' name += '.mp4'
filename += '.mp4' filename += '.mp4'
else: else:
name += '.webm' name += '.webm'
filename += '.webm' filename += '.webm'
for subdomain in GFYCAT_SUBDOMAINS:
url = 'http://%s.gfycat.com/%s' % (subdomain, name)
try:
return download_file(url, filename)
except StatusExc:
pass
def handle_vidme(url, customname=None):
if customname is None:
customname = url.split('/')[-1]+'.mp4'
pagedata = request_get(url)
pagedata = pagedata.text
pagedata = pagedata.split('\n')
pagedata = [l for l in pagedata if '.mp4' in l and 'og:video:url' in l]
pagedata = pagedata[0]
pagedata = pagedata.split('content="')[1].split('"')[0]
pagedata = pagedata.replace('&amp;', '&')
headers = {'Referer': 'https://vid.me/',
'Range':'bytes=0-',
'Host':'d1wst0behutosd.cloudfront.net',
'Cache-Control':'max-age=0'}
return download_file(pagedata, customname, headers=headers)
def handle_vimeo(url, customname=None):
name = url.split('/')[-1]
name = name.split('?')[0]
try:
int(name)
except ValueError as e:
print('Could not identify filename of %s' % url)
raise e
url = 'http://player.vimeo.com/video/%s' % name
pagedata = request_get(url)
pagedata = pagedata.text
pagedata = pagedata.replace('</script>', '<script')
pagedata = pagedata.split('<script>')
for chunk in pagedata:
if VIMEO_DICT_START in chunk:
break
chunk = chunk.split(VIMEO_DICT_START)[1]
chunk = chunk.split(VIMEO_DICT_END)[0]
chunk = json.loads(chunk)
for priority in VIMEO_PRIORITY:
if priority in chunk:
fileurl = chunk[priority]['url']
break
if customname:
filename = customname + '.mp4'
else:
filename = name + '.mp4'
return download_file(fileurl, filename)
for subdomain in GFYCAT_SUBDOMAINS:
url = 'http://%s.gfycat.com/%s' % (subdomain, name)
try:
return download_file(url, filename)
except StatusExc:
pass
def handle_liveleak(url, customname=None):
    """
    Download the video from a LiveLeak view page.

    The filename is `customname` (or the text after the first '=' in the
    URL) plus '.mp4'. If the page contains LIVELEAK_YOUTUBEIFRAME, the
    iframe's src is handed to handle_master and nothing is returned.
    Otherwise the player's `file: "..."` address is tried at each of
    LIVELEAK_RESOLUTIONS in order, falling back to the address exactly as
    it appeared on the page.

    Returns whatever download_file returns for the first download that
    does not raise StatusExc, or None for the embedded-YouTube case.
    """
    print('Liveleak')
    if customname:
        name = customname
    else:
        name = url.split('=')[1]
    name += '.mp4'
    pagedata = request_get(url).text

    if LIVELEAK_YOUTUBEIFRAME in pagedata:
        iframe_lines = [
            line for line in pagedata.split('\n')
            if LIVELEAK_YOUTUBEIFRAME in line
        ]
        embed = iframe_lines[0].split('src="')[1].split('"')[0]
        print('\tFound youtube embed')
        handle_master(embed, customname=customname)
        return

    original = pagedata.split('file: "')[1].split('",')[0]
    # Swap the dot-separated component naming the encoding for a placeholder
    # so each candidate resolution can be substituted in.
    parts = [
        'LIVELEAKRESOLUTION' if 'h264_' in part else part
        for part in original.split('.')
    ]
    template = '.'.join(parts)
    for res in LIVELEAK_RESOLUTIONS:
        url = template.replace('LIVELEAKRESOLUTION', res)
        try:
            return download_file(url, name)
        except StatusExc:
            # This resolution is not available; try the next one.
            pass
    return download_file(original, name)
def handle_imgur_html(url):
    """
    Fetch an Imgur page and return the image links found in its markup.

    All spaces are stripped from the page text. Every line containing
    IMGUR_ALBUM_INDV contributes the value of its `content="..."`
    attribute, with any query string removed. The resulting list is
    printed and returned.
    """
    print('Imgur')
    html = request_get(url).text.replace(' ', '')
    links = []
    for line in html.split('\n'):
        if IMGUR_ALBUM_INDV not in line:
            continue
        target = line.split('content="')[1]
        target = target.split('"')[0]
        links.append(target.split('?')[0])
    print(links)
    return links
def handle_imgur(url, albumid='', customname=None):
    """
    Download an Imgur image, or every image of an Imgur album/page.

    Direct image links (last URL segment contains a '.') are downloaded
    immediately and the result of download_file is returned. When
    `customname` is given it replaces the Imgur id but the extension is
    kept. When `albumid` is given and differs from the image id, the file
    goes into a folder named after the album if IMGUR_ALBUMFOLDERS is set,
    otherwise the album id is used as a filename prefix.

    Indirect links are resolved through handle_imgur_html and each image is
    downloaded recursively; a list of the individual results is returned
    (empty when the page yields no images).

    Returns None when the URL has no image id at all.
    """
    print('Imgur')
    name = url.split('/')[-1]
    if 'imgur.com' in name:
        # This link doesn't appear to have an image id
        return

    url = url.replace('/gallery/', '/a/')
    basename = name.split('.')[0]
    if '.' in name:
        # This is a direct image link
        if customname:
            # replace the imgur ID with the customname, keep ext.
            name = '%s.%s' % (customname, name.split('.')[-1])
        if albumid and albumid != basename:
            if IMGUR_ALBUMFOLDERS:
                # exist_ok avoids the check-then-create race.
                os.makedirs(DOWNLOAD_DIRECTORY + albumid, exist_ok=True)
                # NOTE(review): the backslash separator only denotes a folder
                # on Windows -- confirm how download_file treats this path.
                localpath = '%s\\%s' % (albumid, name)
            else:
                localpath = '%s_%s' % (albumid, name)
        else:
            localpath = name
        return download_file(url, localpath)

    # Not a direct image link, let's read the html.
    images = handle_imgur_html(url)
    if customname:
        name = customname
    print('\tFound %d images' % len(images))
    if not images:
        # Previously this fell through to images[0] and raised IndexError.
        return []
    if len(images) == 1:
        return [handle_imgur(images[0], customname=name)]

    localfiles = []
    for (imagei, image) in enumerate(images):
        iname = image.split('/')[-1]
        iname = iname.split('.')[0]
        # Prefix with the position so files sort in album order.
        indexed_name = '%d_%s' % (imagei, iname)
        localfiles.append(handle_imgur(image, albumid=name, customname=indexed_name))
    return localfiles
def handle_twitter(url, customname=None):
    """
    Save a tweet's text as an html file and download any linked media.

    The tweet paragraph (starting at '<p class="TweetTextSize') is wrapped
    in a minimal html document and written to
    DOWNLOAD_DIRECTORY + name + '.html', where name is `customname` or the
    status id from the URL. The first `data-url` or `data-expanded-url`
    attribute on the page, if it differs from `url`, is passed to
    handle_master.

    Returns the path of the saved html file. Raises IndexError when the URL
    has no 'status/' part or the page has no tweet paragraph.
    """
    print('Twitter')
    pagedata = request_get(url).text

    idnumber = url.split('status/')[1].split('/')[0]
    if not customname:
        customname = idnumber
    name = customname
    tweetpath = '%s.html' % (DOWNLOAD_DIRECTORY + name)

    psplit = '<p class="TweetTextSize'
    tweettext = pagedata.split(psplit)[1]
    tweettext = tweettext.split('</p>')[0]
    tweettext = psplit + tweettext + '</p>'
    tweettext = '<html><body>%s</body></html>' % tweettext
    # Hashtag links are site-relative; make them absolute so they work
    # from the saved file.
    tweettext = tweettext.replace('/hashtag/', 'http://twitter.com/hashtag/')
    with open(tweetpath, 'w', encoding='utf-8') as tweethtml:
        tweethtml.write(tweettext)
    print('\tSaved tweet text')

    for marker in ('data-url="', 'data-expanded-url="'):
        try:
            link = pagedata.split(marker)[1]
            link = link.split('"')[0]
            if link != url:
                handle_master(link, customname=customname)
            return tweetpath
        except IndexError:
            # Marker absent. As before, an IndexError escaping the nested
            # handler is also treated as "try the next marker".
            continue

    # This message used to sit after the final return and never printed.
    print('\tNo media detected')
    return tweetpath
def handle_vidble(url, customname=None):
    """
    Download a Vidble image, or every image of a Vidble album.

    Album URLs (containing '/album/') are fetched and every <img> whose src
    is site-relative and not under '/assets/' is downloaded to
    '<folder>\\<index>_<name>', where folder is `customname` or the album id
    from the URL.

    Any other URL is downloaded directly, named `customname` plus the URL's
    extension, or the URL's own last segment when no customname is given.

    Returns None; results of download_file are not propagated.
    """
    print('Vidble')
    if '/album/' in url:
        pagedata = request_get(url)
        pagedata.raise_for_status()
        soup = bs4.BeautifulSoup(pagedata.text)
        sources = [
            img.attrs['src'] for img in soup.find_all('img')
            if img.attrs.get('src', None)
        ]
        # Keep site-relative content images only; '/assets/' holds site chrome.
        sources = [
            src for src in sources
            if '/assets/' not in src and src[0] == '/'
        ]
        if customname:
            folder = customname
        else:
            folder = url.split('/album/')[1].split('/')[0]
        for (index, image) in enumerate(sources):
            name = image.split('/')[-1]
            # NOTE(review): the backslash separator only denotes a folder on
            # Windows -- confirm how download_file treats this path.
            localname = '{folder}\\{index}_{name}'.format(folder=folder, index=index, name=name)
            image = 'https://vidble.com' + image
            download_file(image, localname)
    else:
        localname = url.split('/')[-1]
        if customname:
            # Previously `customname + extension` raised TypeError when no
            # customname was supplied.
            extension = os.path.splitext(localname)[1]
            localname = customname + extension
        download_file(url, localname)
def handle_vidme(url, customname=None):
    """
    Download the mp4 advertised by a vid.me page.

    The video address is read from the first page line that mentions both
    '.mp4' and 'og:video:url'. When `customname` is None the last URL
    segment plus '.mp4' is used as the filename; a caller-supplied
    `customname` is used verbatim (no extension is appended).

    Returns whatever download_file returns. Raises IndexError when the page
    has no matching line.
    """
    print('Vidme')
    if customname is None:
        customname = url.split('/')[-1]+'.mp4'
    page_lines = request_get(url).text.split('\n')
    candidates = [
        line for line in page_lines
        if '.mp4' in line and 'og:video:url' in line
    ]
    video_url = candidates[0].split('content="')[1].split('"')[0]
    # The address comes out of an HTML attribute, so ampersands are escaped.
    video_url = video_url.replace('&amp;', '&')
    headers = {
        'Referer': 'https://vid.me/',
        'Range':'bytes=0-',
        'Host':'d1wst0behutosd.cloudfront.net',
        'Cache-Control':'max-age=0',
    }
    return download_file(video_url, customname, headers=headers)
def handle_vimeo(url, customname=None):
    """
    Download a Vimeo video given its page or player URL.

    The numeric video id is taken from the last URL segment (query string
    removed), the player page for that id is fetched, and the JSON found
    between VIMEO_DICT_START and VIMEO_DICT_END is parsed. The first key of
    VIMEO_PRIORITY present in that JSON supplies the file URL.

    The file is saved as `customname` + '.mp4', or the video id + '.mp4'
    when no customname is given. Returns whatever download_file returns.

    Raises ValueError when the last URL segment is not an integer or when
    none of the VIMEO_PRIORITY keys are present, and IndexError when the
    page does not contain VIMEO_DICT_START.
    """
    print('Vimeo')
    name = url.split('/')[-1]
    name = name.split('?')[0]
    try:
        int(name)
    except ValueError:
        print('Could not identify filename of %s' % url)
        raise
    url = 'http://player.vimeo.com/video/%s' % name
    pagedata = request_get(url).text
    # Turning every closing tag into an opener-like token lets a single
    # split on '<script>' separate the script bodies.
    pagedata = pagedata.replace('</script>', '<script')
    for chunk in pagedata.split('<script>'):
        if VIMEO_DICT_START in chunk:
            break
    # If no chunk matched, the split below raises IndexError on the last one.
    chunk = chunk.split(VIMEO_DICT_START)[1]
    chunk = chunk.split(VIMEO_DICT_END)[0]
    chunk = json.loads(chunk)
    for priority in VIMEO_PRIORITY:
        if priority in chunk:
            fileurl = chunk[priority]['url']
            break
    else:
        # Previously this fell through and failed on an unbound `fileurl`.
        raise ValueError('No known video quality found for %s' % url)
    if customname:
        filename = customname + '.mp4'
    else:
        filename = name + '.mp4'
    return download_file(fileurl, filename)
def handle_youtube(url, customname=None):
    """
    Download a YouTube video by running the YOUTUBE_DL_FORMAT command.

    HTML-escaped ampersands and the `feature=player_embedded` parameter are
    removed from the URL first. Without a `customname` the output name is
    the '%(title)s' template and None is returned; with one, the expected
    '<customname>.mp4' path (prefixed by DOWNLOAD_DIRECTORY when set) is
    returned.
    """
    print('Youtube')
    url = url.replace('&amp;', '&')
    url = url.replace('feature=player_embedded&', '')
    url = url.replace('&feature=player_embedded', '')

    output_name = customname if customname else '%(title)s'
    # NOTE(review): the URL is interpolated into a command string executed
    # by os.system; confirm YOUTUBE_DL_FORMAT quotes it safely for
    # untrusted links.
    os.system(YOUTUBE_DL_FORMAT.format(url=url, dir=DOWNLOAD_DIRECTORY, name=output_name))
    if not customname:
        return
    if DOWNLOAD_DIRECTORY:
        return '%s/%s.mp4' % (DOWNLOAD_DIRECTORY, customname)
    return '%s.mp4' % customname
def handle_generic(url, customname=None):
    """
    Best-effort download for URLs that no specific handler claims.

    The filename is built from `customname` (or the last URL segment): the
    text before the first '.' becomes the base and the text after the last
    '.' the extension. When there is no usable extension, 'html' is used.

    Returns whatever download_file returns, or None if anything fails.
    """
    print('Generic')
    try:
        name = customname if customname else url.split('/')[-1]

        base = name.split('.')[0]
        ext = name.split('.')[-1]
        # ext == base means the name had no '.' at all.
        if ext in [base, '']:
            ext = 'html'
        print(base)
        print(ext)

        name = '%s.%s' % (base, ext)
        return download_file(url, name)
    except Exception as exc:
        # Still best-effort, but no longer swallows KeyboardInterrupt /
        # SystemExit, and the failure is reported instead of vanishing.
        print('\tGeneric download failed: %r' % exc)
## ##
############################################################################## ##############################################################################
# Maps a domain fragment to the handler for URLs containing it.
# handle_master picks the first key found anywhere in the URL.
HANDLERS = {
    'gfycat.com': handle_gfycat,
    'imgur.com': handle_imgur,
    'liveleak.com': handle_liveleak,
    'vid.me': handle_vidme,
    'vidble.com': handle_vidble,
    'vimeo.com': handle_vimeo,
    'youtube.com': handle_youtube,
    'youtu.be': handle_youtube,
    'twitter.com': handle_twitter,
}
def handle_master(url, customname=None):
    """
    Dispatch `url` to the first handler in HANDLERS whose key occurs in it.

    Matching is a case-insensitive substring test against the whole URL.
    If nothing matches and DO_GENERIC is truthy, handle_generic is used.

    Returns the chosen handler's result, or None when nothing handled it.
    """
    print('Handling %s' % url)
    lowered = url.lower()
    for (handlerkey, handler) in HANDLERS.items():
        if handlerkey.lower() in lowered:
            return handler(url, customname=customname)
    if DO_GENERIC:
        return handle_generic(url, customname=customname)
def test_imgur():
    """Manual smoke test: run handle_master on sample Imgur links (hits the network)."""
    # Imgur gallery album
    handle_master('http://imgur.com/gallery/s4WLG')

    # Imgur standard album with customname
    handle_master('http://imgur.com/a/s4WLG', customname='album')

    # Imgur indirect
    handle_master('http://imgur.com/gvJUct0')

    # Imgur indirect single with customname
    handle_master('http://imgur.com/gvJUct0', customname='indirect')

    # Imgur direct single
    handle_master('http://i.imgur.com/gvJUct0.jpg')
def test_gfycat():
    """Manual smoke test: run handle_master on sample Gfycat links (hits the network)."""
    # Gfycat direct .gif
    handle_master('http://giant.gfycat.com/FatherlyBruisedIberianchiffchaff.gif')

    # Gfycat general link
    handle_master('http://www.gfycat.com/RawWetFlatcoatretriever')

    # Gfycat general link with customname
    handle_master('http://www.gfycat.com/RawWetFlatcoatretriever', customname='gfycatgeneral')
def test_vimeo():
    """Manual smoke test: run handle_master on sample Vimeo links (hits the network)."""
    # Vimeo standard link
    handle_master('https://vimeo.com/109405701')

    # Vimeo player link with customname
    handle_master('https://player.vimeo.com/video/109405701', customname='vimeoplayer')
def test_liveleak():
    """Manual smoke test: run handle_master on sample LiveLeak links (hits the network)."""
    # LiveLeak standard link
    handle_master('http://www.liveleak.com/view?i=9d1_1429192014')

    # Liveleak article with youtube embed
    handle_master('http://www.liveleak.com/view?i=ab8_1367941301')

    # LiveLeak standard link with customname
    handle_master('http://www.liveleak.com/view?i=9d1_1429192014', customname='liveleak')
def test_youtube():
    """Manual smoke test: run handle_master on sample YouTube links (hits the network)."""
    # Youtube standard link
    handle_master('https://www.youtube.com/watch?v=bEgeh5hA5ko')

    # Youtube short link
    handle_master('https://youtu.be/GjOBTstnW20', customname='youtube')

    # Youtube player embed link
    handle_master('https://www.youtube.com/watch?feature=player_embedded&amp;v=bEgeh5hA5ko')
def test_twitter():
    """Manual smoke test: run handle_master on sample tweets (hits the network)."""
    # Twitter with twitter-image embed
    handle_master('https://twitter.com/PetoLucem/status/599493836214272000')

    # Twitter with twitter-image embed
    handle_master('https://twitter.com/Jalopnik/status/598287843128188929')

    # Twitter with twitter-image embed and customname
    handle_master('https://twitter.com/Jalopnik/status/598287843128188929', customname='twits')

    # Twitter with youtube embed
    handle_master('https://twitter.com/cp_orange_x3/status/599705117420457984')

    # Twitter plain text
    handle_master('https://twitter.com/cp_orange_x3/status/599700702382817280')

    # Twitter plain text
    handle_master('https://twitter.com/SyriacMFS/status/556513635913437184')

    # Twitter with arabic characters
    handle_master('https://twitter.com/HadiAlabdallah/status/600885154991706113')
def test_generic():
    """Manual smoke test: run handle_master on links with no dedicated handler (hits the network)."""
    # Some link that might work
    handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt')

    # Some link that might work with customname
    handle_master('https://raw.githubusercontent.com/voussoir/reddit/master/SubredditBirthdays/show/statistics.txt', customname='sss')

    # Some link that might work
    handle_master('https://github.com/voussoir/reddit/tree/master/SubredditBirthdays/show')
# Script entry point: handle the URL given on the command line, or run the
# enabled manual smoke tests when no argument is supplied.
if __name__ == '__main__':
    if len(sys.argv) > 1:
        handle_master(sys.argv[1])
    else:
        #test_imgur()
        #test_gfycat()
        #test_vimeo()
        test_liveleak()
        test_youtube()
        #test_twitter()
        #test_generic()
        pass