epubfile/epubfile.py

1706 lines
59 KiB
Python
Raw Normal View History

import copy
import functools
import io
2019-08-04 05:13:27 +00:00
import mimetypes
import os
import re
import tempfile
import urllib.parse
import uuid
import zipfile
import bs4
import tinycss2
2020-03-07 21:52:29 +00:00
from voussoirkit import getpermission
2019-08-04 05:13:27 +00:00
from voussoirkit import pathclass
# Tag name -> list of attribute names on that tag.
# NOTE(review): presumably these are the attributes that can reference other
# files in the book; the code that consumes this table is not visible in this
# part of the file, so confirm against its usage.
HTML_LINK_PROPERTIES = {
'a': ['href'],
'audio': ['src'],
'image': ['href', 'xlink:href'],
'img': ['src'],
'link': ['href'],
'script': ['src'],
'source': ['src'],
'track': ['src'],
'video': ['src', 'poster'],
}
# File extension (without the dot) -> mimetype. get_mimetype_for_basename
# consults this table before falling back to the stdlib mimetypes module, so
# entries here take priority over whatever the system would guess.
EXTENSION_MIMETYPES = {
    'htm': 'application/xhtml+xml',
    'html': 'application/xhtml+xml',
    'otf': 'font/otf',
    'pls': 'application/pls+xml',
    'smi': 'application/smil+xml',
    'smil': 'application/smil+xml',
    'sml': 'application/smil+xml',
    'ttf': 'font/ttf',
    'woff': 'font/woff',
    'woff2': 'font/woff2',
    'xhtml': 'application/xhtml+xml',
    'xpgt': 'application/vnd.adobe-page-template+xml',
}
# Mimetype -> directory name, relative to the OPF's directory. Keys are either
# a full mimetype or just the major type before the slash;
# get_directory_for_mimetype tries the full type first, then the major type.
MIMETYPE_DIRECTORIES = {
    'application/font-sfnt': 'Fonts',
    'application/x-dtbncx+xml': '.',
    'application/x-font-ttf': 'Fonts',
    'application/xhtml+xml': 'Text',
    'audio': 'Audio',
    'font': 'Fonts',
    'image': 'Images',
    'text/css': 'Styles',
    'video': 'Video',
}
# Contents of the `mimetype` file at the root of the book. Written by Epub.new
# and by auto_correct_and_validate when the file is missing.
MIMETYPE_FILE_TEMPLATE = 'application/epub+zip'
# Contents of META-INF/container.xml for books created by Epub.new. It declares
# a single rootfile at OEBPS/content.opf.
CONTAINER_XML_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
</rootfiles>
</container>
'''.strip()
# Contents of OEBPS/content.opf for books created by Epub.new.
# Placeholder: {uuid} fills the dc:identifier element.
# The manifest and spine start out with only the ncx and the nav document.
OPF_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="BookId" xmlns="http://www.idpf.org/2007/opf">
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
<dc:identifier id="BookId">{uuid}</dc:identifier>
<dc:creator id="cre">author</dc:creator>
<meta scheme="marc:relators" refines="#cre" property="role">aut</meta>
<dc:title>title</dc:title>
<dc:language>und</dc:language>
</metadata>
<manifest>
<item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
<item id="nav.xhtml" href="Text/nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
</manifest>
<spine toc="ncx">
<itemref idref="nav.xhtml" linear="no"/>
</spine>
</package>
'''.strip()
# Contents of OEBPS/toc.ncx for books created by Epub.new.
# Placeholders: {uuid} for the dtb:uid meta, {title} for the docTitle text,
# and {navpoints} for the markup placed inside navMap.
NCX_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
<head>
<meta name="dtb:uid" content="{uuid}" />
</head>
<docTitle>
<text>{title}</text>
</docTitle>
<navMap>
{navpoints}
</navMap>
</ncx>
'''.strip()
# Contents of OEBPS/Text/nav.xhtml for books created by Epub.new.
# Placeholder: {toc_contents} is the markup placed inside the toc <nav>,
# after the heading.
NAV_XHTML_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
<meta charset="utf-8"/>
</head>
<body epub:type="frontmatter">
<nav epub:type="toc" id="toc">
<h1>Table of Contents</h1>
{toc_contents}
</nav>
</body>
</html>
'''.strip()
# Skeleton of an XHTML page. Placeholders: {head_content} goes inside <head>
# and {body_content} goes inside <body>. The code that fills this template is
# not visible in this part of the file.
TEXT_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
{head_content}
</head>
<body>
{body_content}
</body>
</html>
'''.strip()
# EPUB COMPRESSION
################################################################################
2019-08-04 05:13:27 +00:00
def compress_epub(directory, epub_filepath):
    '''
    Zip the contents of `directory` into an epub at `epub_filepath` and return
    the final path, which gains an .epub extension if it did not have one.

    Raises ValueError if the output file would be inside the directory being
    compressed.
    '''
    directory = pathclass.Path(directory)
    epub_filepath = pathclass.Path(epub_filepath)

    if epub_filepath in directory:
        raise ValueError('Epub inside its own directory')

    if epub_filepath.extension != 'epub':
        epub_filepath = epub_filepath.add_extension('epub')

    with zipfile.ZipFile(epub_filepath.absolute_path, 'w') as archive:
        mimetype_file = directory.with_child('mimetype')
        # The mimetype entry is written first and without a compress_type,
        # so it uses the ZipFile's default instead of deflate.
        archive.write(mimetype_file.absolute_path, arcname='mimetype')

        # mimetype is already in the archive, and sigil.cfg is left out.
        skipped = [mimetype_file, directory.with_child('sigil.cfg')]
        for member in directory.walk():
            if member in skipped:
                continue
            archive.write(
                member.absolute_path,
                arcname=member.relative_to(directory),
                compress_type=zipfile.ZIP_DEFLATED,
            )

    return epub_filepath
def extract_epub(epub_filepath, directory):
    '''
    Unzip every member of the epub at `epub_filepath` into `directory`.
    '''
    source = pathclass.Path(epub_filepath)
    destination = pathclass.Path(directory)
    with zipfile.ZipFile(source.absolute_path, 'r') as archive:
        archive.extractall(destination.absolute_path)
# XHTML TOOLS
################################################################################
2019-08-04 05:13:27 +00:00
def fix_xhtml(xhtml, return_soup=False):
    '''
    Normalize a document so that it has an <html> element with the xhtml
    namespace, a <body>, a doctype, and an xml declaration at the top.

    xhtml:
        A str / bytes of markup, or an existing BeautifulSoup object. A soup
        is modified in place.

    return_soup:
        If True, return the BeautifulSoup object instead of a string.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        soup = xhtml
    else:
        # For the text pages, html5lib is the best because html.parser and lxml
        # lowercase all attributes, breaking svg's case-sensitive viewBox etc.
        # and xml loses all of the namespaces when namespaced elements are nested
        # like <html xmlns="..."><svg xmlns:xlink="..."></svg></html>.
        # The downside of html5lib is it turns the xml declaration at the top
        # into a comment which we must undo manually.
        soup = bs4.BeautifulSoup(xhtml, 'html5lib')

    if not soup.html:
        html = soup.new_tag('html')
        for child in list(soup.contents):
            html.append(child)
        soup.append(html)

    if not soup.html.body:
        body = soup.new_tag('body')
        for child in list(soup.html.contents):
            body.append(child)
        soup.html.append(body)

    if not soup.html.get('xmlns'):
        soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    doctype = next((i for i in soup.contents if isinstance(i, bs4.Doctype)), None)
    if doctype is None:
        doctype = bs4.Doctype('html')
        soup.html.insert_before(doctype)

    # html5lib turns the xml declaration into a comment which we must revert.
    # Indexing cannot raise StopIteration, so this is an explicit emptiness
    # check rather than a try/except.
    if soup.contents and isinstance(soup.contents[0], bs4.Comment):
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        soup.insert(0, declaration)
        # The element right after the new declaration is the old comment.
        declaration.next.extract()

    declaration = next((i for i in soup.contents if isinstance(i, bs4.Declaration)), None)
    if declaration is None:
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        doctype.insert_before(declaration)

    if return_soup:
        return soup
    return str(soup)
def xhtml_replacements(xhtml, replacements, return_soup=False):
    '''
    Apply a series of regex substitutions to the document text.

    xhtml:
        A string, or a BeautifulSoup object which will be stringified first.

    replacements:
        An iterable of (pattern, replacement) pairs, applied in order with
        re.DOTALL so that `.` also matches newlines.

    return_soup:
        If True, the result is re-parsed with html5lib and returned as a
        BeautifulSoup object instead of a string.
    '''
    text = str(xhtml) if isinstance(xhtml, bs4.BeautifulSoup) else xhtml

    for (pattern, substitute) in replacements:
        text = re.sub(pattern, substitute, text, flags=re.DOTALL)

    if not return_soup:
        return text
    return bs4.BeautifulSoup(text, 'html5lib')
def demote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every <h1> into <h2>, <h2> into <h3>, and so on through <h5> into
    <h6>. Existing <h6> elements are not touched.
    '''
    # Work from the deepest level upward so that a freshly demoted header is
    # not demoted again by a later pattern.
    replacements = [
        (rf'<h{level}([^>]*?>.*?)</h{level}>', rf'<h{level + 1}\1</h{level + 1}>')
        for level in (5, 4, 3, 2, 1)
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)
def promote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every <h2> into <h1>, <h3> into <h2>, and so on through <h6> into
    <h5>. Existing <h1> elements are not touched.
    '''
    # Work from the shallowest level downward so that a freshly promoted
    # header is not promoted again by a later pattern.
    replacements = [
        (rf'<h{level}([^>]*?>.*?)</h{level}>', rf'<h{level - 1}\1</h{level - 1}>')
        for level in (2, 3, 4, 5, 6)
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)
# MIMETYPE DECISIONMAKERS
################################################################################
2019-08-04 05:13:27 +00:00
def get_directory_for_mimetype(mime):
    '''
    Return the directory name where files of this mimetype belong. The full
    mimetype is looked up first, then its major type (the part before the
    slash), and 'Misc' is the fallback.
    '''
    exact = MIMETYPE_DIRECTORIES.get(mime)
    if exact:
        return exact

    by_major_type = MIMETYPE_DIRECTORIES.get(mime.split('/')[0])
    if by_major_type:
        return by_major_type

    return 'Misc'
def get_mimetype_for_basename(basename):
    '''
    Return the mimetype for a filename. Our own EXTENSION_MIMETYPES table is
    checked first, then the stdlib mimetypes module, and
    'application/octet-stream' is the fallback.
    '''
    # The table's keys are lowercase, so normalize the extension. Otherwise
    # a name like COVER.XHTML would skip our overrides.
    extension = os.path.splitext(basename)[1].strip('.').lower()
    mime = (
        EXTENSION_MIMETYPES.get(extension) or
        mimetypes.guess_type(basename)[0] or
        'application/octet-stream'
    )
    return mime
# OPF ELEMENT GENERATORS
################################################################################
2019-08-04 05:13:27 +00:00
def make_manifest_item(id, href, mime):
    '''
    Return a new <item> tag for the OPF manifest with the given id, href, and
    media-type.
    '''
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup('<item/>', 'html.parser')
    manifest_item = soup.item
    # The attributes are assigned on the parsed tag instead of interpolated
    # into the markup, so values containing quotes, ampersands, or angle
    # brackets can not break or inject into the element.
    manifest_item['id'] = str(id)
    manifest_item['href'] = str(href)
    manifest_item['media-type'] = str(mime)
    return manifest_item
def make_meta_item(content=None, attrs=None):
    '''
    Return a new <meta> tag for the OPF metadata.

    content:
        Optional text to place inside the tag.

    attrs:
        Optional dict of attributes to set on the tag.
    '''
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup('<meta/>', 'html.parser')
    meta_item = soup.meta

    if content:
        # html.parser wants <meta> to be self-closing, so text written inside
        # the markup would land next to the tag instead of in it. Appending
        # after parsing keeps the content inside the tag.
        meta_item.append(str(content))

    if attrs:
        # These must go on the tag itself. Updating the soup's attrs would
        # leave the returned tag without them.
        meta_item.attrs.update(attrs)

    return meta_item
def make_spine_item(id):
    '''
    Return a new <itemref> tag for the OPF spine which refers to the given
    manifest id.
    '''
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup('<itemref/>', 'html.parser')
    spine_item = soup.itemref
    # Assigned on the parsed tag instead of interpolated into the markup, so
    # ids containing quotes or ampersands can not break the element.
    spine_item['idref'] = str(id)
    return spine_item
# DECORATORS
################################################################################
def writes(method):
    '''
    Decorator for methods that modify the book. If the instance's read_only
    attribute is true, the call raises ReadOnly instead of running.
    '''
    @functools.wraps(method)
    def guarded(self, *args, **kwargs):
        if not self.read_only:
            return method(self, *args, **kwargs)
        raise ReadOnly(method.__qualname__)
    return guarded
# CLASSES
################################################################################
2019-08-04 05:13:27 +00:00
class EpubfileException(Exception):
    '''
    Base class for the exceptions in this module. Subclasses provide an
    error_message template, which is formatted with the arguments given to
    the constructor.
    '''
    error_message = ''

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.given_args = args
        self.given_kwargs = kwargs
        # This instance attribute shadows the class-level template with the
        # finished text.
        formatted = self.error_message.format(*args, **kwargs)
        self.error_message = formatted
        self.args = (formatted, args, kwargs)

    def __str__(self):
        return self.error_message

class InvalidEpub(EpubfileException):
    # Arguments: the book, then a description of the problem.
    error_message = '{} is invalid: {}'

class FileExists(EpubfileException):
    # Argument: the path that is already occupied.
    error_message = 'There is already a file at {}.'

class IDExists(EpubfileException):
    # Argument: the manifest id that is already in use.
    error_message = 'There is already a file with id {}.'

class NotInManifest(EpubfileException):
    # Argument: the id that was looked up.
    error_message = '{} is not in the manifest.'

class NotInSpine(EpubfileException):
    # Argument: the id that was looked up.
    error_message = '{} is not in the spine.'

class ReadOnly(EpubfileException):
    # Argument: the name of the operation that was refused.
    error_message = 'Can\'t do {} in read-only mode.'
2019-08-04 05:13:27 +00:00
class Epub:
def __init__(self, epub_path, *, read_only=False):
    '''
    epub_path:
        The path to an .epub file, or to a directory that contains unzipped
        epub contents.

    read_only:
        If True, all write operations will be forbidden. The benefit is that
        the .epub file will not be extracted. This is recommended if you
        only need to read data from a book and don't need to write to it.

    Raises InvalidEpub if the container.xml does not list any rootfile.
    '''
    # Epub.new passes a TemporaryDirectory object, which we must hold on to.
    epub_path = self._keep_tempdir_reference(epub_path)
    epub_path = pathclass.Path(epub_path)
    self.original_path = epub_path
    self.read_only = read_only

    if epub_path.is_dir:
        self.__init_from_dir(epub_path)
    elif self.read_only:
        self.__init_from_file_read_only(epub_path)
    else:
        self.__init_from_file(epub_path)

    opfs = self.get_opfs()
    if not opfs:
        # Without this check the caller would see a bare IndexError.
        raise InvalidEpub(self.original_path, 'container.xml does not list any rootfile.')
    # Only the first rootfile is used.
    self.opf_filepath = opfs[0]
    self.opf = self.read_opf(self.opf_filepath)
2019-08-04 05:13:27 +00:00
def __init_from_dir(self, directory):
    '''
    The book is a directory of unzipped contents, so files are read straight
    off the disk.
    '''
    self.root_directory = pathclass.Path(directory)
    self.is_zip = False
def __init_from_file_read_only(self, epub_path):
    '''
    The book is an .epub file opened read-only, so files are read out of the
    zip without extracting it.
    '''
    # It may appear that is_zip is a synonym for read_only, but don't forget
    # that we can also open a directory in readonly mode. It's just that
    # readonly dirs don't need a special init, all they have to do is
    # forbid writes.
    zip_path = pathclass.Path(epub_path)
    self.is_zip = True
    self.root_directory = zip_path
    self.zip = zipfile.ZipFile(zip_path.absolute_path)
def __init_from_file(self, epub_path):
    '''
    The book is an .epub file opened for writing, so it is extracted into a
    temporary directory and then treated like a directory book.
    '''
    tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
    extract_epub(epub_path, tempdir.name)
    # Hold on to the TemporaryDirectory object so it is not cleaned up.
    self.__init_from_dir(self._keep_tempdir_reference(tempdir))
2019-08-04 05:13:27 +00:00
def __repr__(self):
if self.read_only:
return f'Epub({repr(self.root_directory.absolute_path)}, read_only=True)'
else:
return f'Epub({repr(self.root_directory.absolute_path)})'
def _fopen(self, *args, **kwargs):
'''
Not to be confused with the high level `open_file` method, this method
is the one that actually reads off the disk.
'''
if self.is_zip:
return self._fopen_zip(*args, **kwargs)
else:
return self._fopen_disk(*args, **kwargs)
def _fopen_disk(self, path, mode, *, encoding=None):
'''
If the book was opened as a directory, we can read files off disk with
Python open.
'''
return open(path, mode, encoding=encoding)
def _fopen_zip(self, path, mode, *, encoding=None):
    '''
    For books that were opened as a read-only zip, open a member of the zip.

    path:
        Either a path inside the book's root, which is converted to a
        member name relative to the root, or a member name.

    mode:
        One of r, w, rb, wb. The text modes wrap the member in a
        TextIOWrapper with the given encoding.

    Raises ValueError for any other mode.
    '''
    candidate = self.root_directory.spawn(path)
    if candidate in self.root_directory:
        path = candidate.relative_to(self.root_directory, simple=True)

    # Zip files always use forward slash internally, even on Windows.
    path = path.replace('\\', '/')

    if mode not in ('rb', 'r', 'wb', 'w'):
        raise ValueError('mode should be r, w, rb, or wb.')

    # At this time ZipFS is only used for read-only epubs anyway, so the
    # write modes are not expected to see real use.
    member = self.zip.open(path, mode[0])
    if mode.endswith('b'):
        return member
    return io.TextIOWrapper(member, encoding)
def _keep_tempdir_reference(self, p):
'''
If the given path object is actually a tempfile.TemporaryDirectory,
store that TD reference here so that it does not get cleaned up even
if the caller releases it. Then return the actual filepath.
'''
if isinstance(p, tempfile.TemporaryDirectory):
self._tempdir_reference = p
p = p.name
return p
2019-08-04 05:13:27 +00:00
def assert_file_not_exists(self, filepath):
if filepath.exists:
existing = filepath.relative_to(self.opf_filepath.parent)
raise FileExists(existing)
def assert_id_not_exists(self, id):
if self.opf.manifest.find('item', {'id': id}):
raise IDExists(id)
2019-09-06 22:44:28 +00:00
# VALIDATION
############################################################################
@writes
def auto_correct_and_validate(self):
    '''
    Fix what can be fixed automatically and raise InvalidEpub for what can
    not. Currently: a missing mimetype file is created, and every manifest
    item must exist on disk.
    '''
    # Ensure we have a mimetype file.
    mimetype_file = self.root_directory.with_child('mimetype')
    if not mimetype_file.exists:
        with self._fopen(mimetype_file.absolute_path, 'w', encoding='utf-8') as handle:
            handle.write(MIMETYPE_FILE_TEMPLATE)

    # Assert that all manifest items exist on disk.
    for item in self.get_manifest_items(soup=True):
        if self.get_filepath(item['id']).exists:
            continue
        message = f'Manifest item {item["id"]} = {item["href"]} does not exist.'
        raise InvalidEpub(self.original_path, message)
2019-09-06 22:44:28 +00:00
2019-08-04 05:13:27 +00:00
# LOADING AND SAVING
############################################################################
@classmethod
def new(cls):
    '''
    Create a new book. It will start as a temporary directory, so don't
    forget to call `save` when you are done.
    '''
    def writefile(filepath, content):
        os.makedirs(filepath.parent.absolute_path, exist_ok=True)
        # This line uses Python open instead of self._fopen because the epub
        # hasn't been instantiated yet! At this time, creating a book with
        # Epub.new always creates it as a directory. We do not support
        # creating a book directly into a fresh zip file.
        with open(filepath.absolute_path, 'w', encoding='utf-8') as handle:
            handle.write(content)

    # The same identifier goes into both the OPF and the NCX.
    uid = uuid.uuid4().urn

    tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
    root = pathclass.Path(tempdir.name)

    starting_files = [
        ('mimetype', MIMETYPE_FILE_TEMPLATE),
        ('META-INF/container.xml', CONTAINER_XML_TEMPLATE),
        ('OEBPS/content.opf', OPF_TEMPLATE.format(uuid=uid)),
        ('OEBPS/toc.ncx', NCX_TEMPLATE.format(uuid=uid, title='Unknown', navpoints='')),
        ('OEBPS/Text/nav.xhtml', NAV_XHTML_TEMPLATE.format(toc_contents='')),
    ]
    for (relative_path, content) in starting_files:
        writefile(root.join(relative_path), content)

    # The TemporaryDirectory object itself is passed so that the new
    # instance keeps it alive.
    return cls(tempdir)
@writes
def save(self, epub_filepath):
    '''
    Write the OPF to disk, validate the book, and zip the book's root
    directory into an epub at `epub_filepath`.
    '''
    # The in-memory OPF must be flushed first so that the zip contains it.
    self.write_opf()
    self.auto_correct_and_validate()
    compress_epub(directory=self.root_directory, epub_filepath=epub_filepath)
# CONTAINER & OPF
############################################################################
def get_opfs(self):
    '''
    Read the container.xml to find all available OPFs (aka rootfiles).
    Returns a list of paths, each joined onto the book's root directory.
    '''
    container = self.read_container_xml()
    return [
        self.root_directory.join(rootfile.get('full-path'))
        for rootfile in container.find_all('rootfile')
    ]
def read_container_xml(self):
    '''
    Parse META-INF/container.xml and return it as a BeautifulSoup object.
    '''
    container_xml_path = self.root_directory.join('META-INF/container.xml')
    # The with statement makes sure the handle is closed after parsing.
    with self._fopen(container_xml_path.absolute_path, 'r', encoding='utf-8') as handle:
        # 'xml' and 'html.parser' seem about even here except that html.parser
        # doesn't self-close.
        container = bs4.BeautifulSoup(handle, 'xml')
    return container
def read_opf(self, rootfile):
    '''
    Parse the OPF at the given path and return it as a BeautifulSoup object.
    '''
    rootfile = pathclass.Path(rootfile, force_sep='/')
    # The with statement makes sure the handle is closed after reading.
    with self._fopen(rootfile.absolute_path, 'r', encoding='utf-8') as handle:
        rootfile_xml = handle.read()

    # 'html.parser' preserves namespacing the best, but unfortunately it
    # botches the <meta> items because it wants them to be self-closing
    # and the string contents come out. We will fix in just a moment.
    # This is still preferable to 'xml' which handles the dc: prefixes when
    # parsing only the metadata block, but loses all namespaces when parsing
    # the whole doc. 'lxml' wraps the content in <html><body> and also
    # botches the metas so it's not any better than html.parser.
    opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser')

    # Let's fix those metas.
    for meta in opf.select('meta'):
        neighbor = meta.next
        if neighbor is None or neighbor.parent != meta.parent:
            # This happens on the last meta, where the next element is
            # outside of the meta's parent or there is no next element.
            break
        if not isinstance(neighbor, bs4.element.NavigableString):
            continue
        # Move the text that fell out of the meta back into it.
        meta.append(neighbor.extract().strip())

    return opf
2019-08-04 05:13:27 +00:00
@writes
def write_container_xml(self, container):
    '''
    Overwrite META-INF/container.xml with the given string or BeautifulSoup
    object.
    '''
    if isinstance(container, bs4.BeautifulSoup):
        container = str(container)
    container_xml_path = self.root_directory.join('META-INF/container.xml')
    # The with statement closes the handle, which guarantees the text is
    # flushed to disk before we return.
    with self._fopen(container_xml_path.absolute_path, 'w', encoding='utf-8') as handle:
        handle.write(container)
@writes
def write_opf(self):
    '''
    Overwrite the OPF file on disk with the in-memory OPF.
    '''
    opf_path = self.opf_filepath.absolute_path
    with self._fopen(opf_path, 'w', encoding='utf-8') as handle:
        handle.write(str(self.opf))
# FILE OPERATIONS
############################################################################
@writes
def add_file(self, id, basename, content):
    '''
    Write a new file into the book and add it to the manifest. XHTML files
    are passed through fix_xhtml and are also added to the end of the spine.

    id:
        The manifest id for the new item.

    basename:
        The filename. Any directory part is discarded, and the directory is
        chosen according to the file's mimetype.

    content:
        str or bytes.

    Returns the id.

    Raises IDExists or FileExists if the id or destination is taken, and
    TypeError if the content is neither str nor bytes.
    '''
    self.assert_id_not_exists(id)

    basename = os.path.basename(basename)
    mime = get_mimetype_for_basename(basename)
    is_xhtml = (mime == 'application/xhtml+xml')

    opf_directory = self.opf_filepath.parent
    directory = opf_directory.with_child(get_directory_for_mimetype(mime))
    os.makedirs(directory.absolute_path, exist_ok=True)

    filepath = directory.with_child(basename)
    self.assert_file_not_exists(filepath)

    if is_xhtml:
        # bs4 converts bytes to str so this must come before the handle choice.
        content = fix_xhtml(content)

    if isinstance(content, str):
        handle = self._fopen(filepath.absolute_path, 'w', encoding='utf-8')
    elif isinstance(content, bytes):
        handle = self._fopen(filepath.absolute_path, 'wb')
    else:
        raise TypeError(f'content should be str or bytes, not {type(content)}.')

    with handle:
        handle.write(content)

    # The href in the manifest is relative to the OPF and percent-encoded.
    href = urllib.parse.quote(filepath.relative_to(opf_directory, simple=True))
    self.opf.manifest.append(make_manifest_item(id, href, mime))

    if is_xhtml:
        self.opf.spine.append(make_spine_item(id))

    return id
@writes
def easy_add_file(self, filepath):
    '''
    Add a file from disk into the book. The manifest ID and href will be
    automatically generated: the file's basename is used for both.

    Returns the id of the new item, the same as add_file does.
    '''
    filepath = pathclass.Path(filepath)
    with self._fopen(filepath.absolute_path, 'rb') as handle:
        content = handle.read()
    return self.add_file(
        id=filepath.basename,
        basename=filepath.basename,
        content=content,
    )
@writes
def delete_file(self, id):
    '''
    Remove the item from the manifest and the spine, and delete its file
    from disk.

    Raises NotInManifest if there is no manifest item with this id.
    '''
    manifest_item = self.get_manifest_item(id)
    # The path must be resolved while the item is still in the manifest,
    # because get_filepath looks the item up by id.
    filepath = self.get_filepath(id)

    manifest_item.extract()
    spine_item = self.opf.spine.find('itemref', {'idref': id})
    if spine_item:
        spine_item.extract()

    os.remove(filepath.absolute_path)
2019-08-04 05:13:27 +00:00
def get_filepath(self, id):
    '''
    Return the path of the file that the manifest item with this id points
    to. The href is resolved relative to the OPF's directory.

    Raises NotInManifest if there is no manifest item with this id.
    '''
    href = self.get_manifest_item(id)['href']
    filepath = self.opf_filepath.parent.join(href)
    # The href may or may not be percent-encoded relative to the name on
    # disk, so the raw href is tried first and the decoded one second.
    # TODO: In the case of a read-only zipped epub, this condition will
    # definitely fail and we won't be unquoting names that need it.
    # Double-check the consequences of this and make a patch for file
    # exists inside zip check if needed.
    if not filepath.exists:
        href = urllib.parse.unquote(href)
        filepath = self.opf_filepath.parent.join(href)
    return filepath
def open_file(self, id, mode):
    '''
    Open the file belonging to the manifest item with this id and return the
    handle. Files with xhtml, ncx, or text/* media types are opened in text
    mode as utf-8, and everything else is opened in binary mode.

    mode:
        Either 'r' or 'w'. The b is added automatically when needed.

    Raises ValueError for any other mode, and ReadOnly when writing to a
    read-only book.
    '''
    if mode not in ('r', 'w'):
        raise ValueError(f'mode should be either r or w, not {mode}.')

    if self.read_only and mode == 'w':
        raise ReadOnly(self.open_file.__qualname__)

    filepath = self.get_filepath(id)
    mime = self.opf.manifest.find('item', {'id': id})['media-type']

    text_mimes = ('application/xhtml+xml', 'application/x-dtbncx+xml')
    if mime in text_mimes or mime.startswith('text/'):
        return self._fopen(filepath.absolute_path, mode, encoding='utf-8')

    return self._fopen(filepath.absolute_path, mode + 'b')
def read_file(self, id, *, soup=False):
    '''
    Return the contents of the file belonging to the manifest item with this
    id, as str or bytes depending on its media type.

    soup:
        If True and the item is xhtml, the content is passed through
        fix_xhtml and returned as a BeautifulSoup object instead.
    '''
    # text vs binary handled by open_file.
    # The with statement makes sure the handle is closed after reading.
    with self.open_file(id, 'r') as handle:
        content = handle.read()
    if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml':
        return fix_xhtml(content, return_soup=True)
    return content
@writes
def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
    '''
    Rename one or more files on disk, staying in the same directory, and
    then update references to them.

    id:
        A manifest id, or a dict of {id: new_basename} to rename several
        files at once.

    new_basename:
        The new filename, required when id is not a dict. If it has no
        extension, the old extension is kept.

    fix_interlinking:
        If True, the rename map is passed to fix_interlinking. Otherwise it
        is passed to fix_interlinking_opf.

    Returns a dict of {old_filepath: new_filepath}.
    '''
    if isinstance(id, dict):
        basename_map = id
    elif new_basename is None:
        raise TypeError('new_basename can be omitted if id is a dict.')
    else:
        basename_map = {id: new_basename}

    rename_map = {}
    for (file_id, basename) in basename_map.items():
        source = self.get_filepath(file_id)
        destination = source.parent.with_child(basename)
        if not destination.extension:
            destination = destination.add_extension(source.extension)
        self.assert_file_not_exists(destination)
        os.rename(source.absolute_path, destination.absolute_path)
        rename_map[source] = destination

    fixer = self.fix_interlinking if fix_interlinking else self.fix_interlinking_opf
    fixer(rename_map)

    return rename_map
@writes
def write_file(self, id, content):
    '''
    Overwrite the file belonging to the manifest item with this id.
    BeautifulSoup objects are stringified first.
    '''
    # text vs binary handled by open_file.
    data = str(content) if isinstance(content, bs4.BeautifulSoup) else content
    with self.open_file(id, 'w') as handle:
        handle.write(data)
# GETTING THINGS
############################################################################
def get_manifest_items(self, filter='', soup=False, spine_order=False):
query = f'item{filter}'
items = self.opf.manifest.select(query)
if spine_order:
items = {x['id']: x for x in items}
ordered_items = []
for spine_id in self.get_spine_order():
ordered_items.append(items.pop(spine_id))
ordered_items.extend(items.values())
items = ordered_items
if soup:
return items
return [x['id'] for x in items]
def get_manifest_item(self, id):
item = self.opf.manifest.find('item', {'id': id})
if not item:
raise NotInManifest(id)
return item
def get_fonts(self, *, soup=False):
    '''
    Return the manifest items whose media type starts with application/font
    or font/. See get_manifest_items for the meaning of soup.
    '''
    selector = '[media-type^="application/font"],[media-type^="font/"]'
    return self.get_manifest_items(filter=selector, soup=soup)
def get_images(self, *, soup=False):
    '''
    Return the manifest items whose media type starts with image/.
    See get_manifest_items for the meaning of soup.
    '''
    selector = '[media-type^="image/"]'
    return self.get_manifest_items(filter=selector, soup=soup)
def get_media(self, *, soup=False):
    '''
    Return the manifest items whose media type starts with video/ or audio/.
    See get_manifest_items for the meaning of soup.
    '''
    selector = '[media-type^="video/"],[media-type^="audio/"]'
    return self.get_manifest_items(filter=selector, soup=soup)
def get_nav(self, *, soup=False):
nav = self.opf.manifest.find('item', {'properties': 'nav'})
if not nav:
return None
if soup:
return nav
return nav['id']
def get_ncx(self, *, soup=False):
ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'})
if not ncx:
return None
if soup:
return ncx
return ncx['id']
def get_styles(self, *, soup=False):
    '''
    Return the manifest items whose media type is text/css.
    See get_manifest_items for the meaning of soup.
    '''
    selector = '[media-type="text/css"]'
    return self.get_manifest_items(filter=selector, soup=soup)
def get_texts(self, *, soup=False, skip_nav=False):
    '''
    Return the xhtml manifest items, with the ones in the spine first and in
    spine order.

    soup:
        If True, return the item tags. Otherwise return their ids.

    skip_nav:
        If True, leave out the item whose properties attribute is 'nav'.
    '''
    texts = self.get_manifest_items(
        filter='[media-type="application/xhtml+xml"]',
        soup=True,
        spine_order=True,
    )
    if skip_nav:
        texts = [text for text in texts if text.get('properties') != 'nav']

    return texts if soup else [text['id'] for text in texts]
# COVER
############################################################################
def get_cover_image(self, *, soup=False):
cover = self.opf.manifest.find('item', {'properties': 'cover-image'})
if cover:
return cover if soup else cover['id']
cover = self.opf.metadata.find('meta', {'name': 'cover'})
if cover:
return cover if soup else cover['content']
return None
@writes
def remove_cover_image(self):
    '''
    Unmark the current cover image, if there is one: the properties
    attribute is deleted from the tag that get_cover_image found, and the
    metadata's <meta name="cover"> is removed. The image file itself is
    not deleted.
    '''
    cover = self.get_cover_image(soup=True)
    if not cover:
        return

    del cover['properties']

    cover_meta = self.opf.metadata.find('meta', {'name': 'cover'})
    if cover_meta:
        cover_meta.extract()
@writes
def set_cover_image(self, id):
    '''
    Mark the manifest item with this id as the book's cover image, both
    with properties="cover-image" on the item and with a
    <meta name="cover"> in the metadata. Pass None to remove the cover.

    Raises NotInManifest if there is no manifest item with this id.
    '''
    if id is None:
        self.remove_cover_image()
        # Nothing else to do. Continuing would look up None in the manifest.
        return

    current_cover = self.get_cover_image(soup=True)
    if current_cover:
        # current_cover may be the <meta name="cover"> instead of a manifest
        # item, in which case it has no id attribute, hence .get.
        if current_cover.get('id') == id:
            return
        del current_cover['properties']

    manifest_item = self.get_manifest_item(id)
    manifest_item['properties'] = 'cover-image'

    current_meta = self.opf.metadata.find('meta', {'name': 'cover'})
    if current_meta:
        current_meta['content'] = id
    else:
        meta = make_meta_item(attrs={'name': 'cover', 'content': id})
        self.opf.metadata.append(meta)
# SPINE
############################################################################
def get_spine_order(self, *, linear_only=False):
2019-08-04 05:13:27 +00:00
items = self.opf.spine.find_all('itemref')
if linear_only:
2019-08-04 05:13:27 +00:00
items = [x for x in items if x.get('linear') != 'no']
return [x['idref'] for x in items]
return ids
@writes
def set_spine_order(self, ids):
    '''
    Rearrange the spine to contain exactly these ids, in this order.
    Spine entries that are not mentioned get removed.

    Raises NotInManifest if any of the ids is not in the manifest.
    '''
    manifest_ids = self.get_manifest_items()

    # Fetch the existing entries so that we can preserve their attributes
    # while rearranging, only creating new spine entries for ids that aren't
    # already present.
    existing = {itemref['idref']: itemref for itemref in self.opf.spine.select('itemref')}

    for item_id in ids:
        if item_id not in manifest_ids:
            raise NotInManifest(item_id)

        itemref = existing.pop(item_id, None)
        if itemref is None:
            itemref = make_spine_item(item_id)
        # Appending a tag that is already in the spine moves it to the end,
        # which is what puts the entries into the requested order.
        self.opf.spine.append(itemref)

    # The remainder of the current spine items were not used, so pop them out.
    for leftover in existing.values():
        leftover.extract()
def get_spine_linear(self, id):
spine_item = self.opf.spine.find('itemref', {'idref': id})
if not spine_item:
raise NotInSpine(id)
linear = spine_item.get('linear')
linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
return linear
@writes
def set_spine_linear(self, id, linear):
    '''
    Set linear to yes or no. Or pass None to remove the property.

    linear:
        'yes' or 'no', or a bool / int which is converted by truthiness, or
        None.

    Raises NotInSpine if there is no spine entry with this id, ValueError
    for any other string, and TypeError for any other type.
    '''
    spine_item = self.opf.spine.find('itemref', {'idref': id})
    if not spine_item:
        raise NotInSpine(id)

    if linear is None:
        del spine_item['linear']
        return

    if isinstance(linear, str):
        if linear != 'yes' and linear != 'no':
            raise ValueError(f'Linear must be yes or no, not {linear}.')
    elif isinstance(linear, (bool, int)):
        linear = 'yes' if linear else 'no'
    else:
        raise TypeError(linear)

    spine_item['linear'] = linear
# METADATA
############################################################################
def get_authors(self):
'''
Thank you double_j for showing how to deal with find_all not working
on namespaced tags.
https://stackoverflow.com/a/44681560
'''
creators = self.opf.metadata.find_all({'dc:creator'})
creators = [str(c.contents[0]) for c in creators if len(c.contents) == 1]
return creators
def get_languages(self):
languages = self.opf.metadata.find_all({'dc:language'})
languages = [str(l.contents[0]) for l in languages if len(l.contents) == 1]
return languages
def get_titles(self):
titles = self.opf.metadata.find_all({'dc:title'})
titles = [str(t.contents[0]) for t in titles if len(t.contents) == 1]
return titles
@writes
def remove_metadata_of_type(self, tag_name):
    '''
    Remove every metadata element with this tag name, for example
    'dc:language'. If a removed element has an id, the <meta> elements
    which refine it (refines="#id") are removed too.
    '''
    metadata = self.opf.metadata
    for element in metadata.find_all({tag_name}):
        element_id = element.get('id')
        if element_id:
            for refinement in metadata.find_all('meta', {'refines': f'#{element_id}'}):
                refinement.extract()
        element.extract()
@writes
def set_languages(self, languages):
    '''
    Replace the book's dc:language elements with one for each of the given
    languages.

    languages:
        A list like ['en', 'fr', 'ko'].
    '''
    self.remove_metadata_of_type('dc:language')
    for language in languages:
        markup = f'<dc:language>{language}</dc:language>'
        self.opf.metadata.append(bs4.BeautifulSoup(markup, 'html.parser'))
2019-08-04 05:13:27 +00:00
# UTILITIES
############################################################################