54ae615730
Per rfc5646, omitting is preferred where permitted, but in Epub that's not permitted!
859 lines
29 KiB
Python
859 lines
29 KiB
Python
import mimetypes
|
|
import os
|
|
import re
|
|
import tempfile
|
|
import urllib.parse
|
|
import uuid
|
|
import zipfile
|
|
|
|
import bs4
|
|
import tinycss2
|
|
|
|
from voussoirkit import pathclass
|
|
|
|
# Required content of the epub's 'mimetype' archive member.
MIMETYPE_CONTENT = 'application/epub+zip'

# For each html tag, the attributes that may reference other files in the
# book and therefore need rewriting when files are renamed or moved.
HTML_LINK_PROPERTIES = {
    'a': ['href'],
    'audio': ['src'],
    'image': ['href', 'xlink:href'],
    'img': ['src'],
    'link': ['href'],
    'script': ['src'],
    'source': ['src'],
    'track': ['src'],
    'video': ['src', 'poster'],
}

# Extension -> media-type overrides, consulted before mimetypes.guess_type,
# for extensions that the stdlib misses or maps to a type not conventional
# inside an epub manifest.
EXTENSION_MIMETYPES = {
    'html': 'application/xhtml+xml',
    'xhtml': 'application/xhtml+xml',
    'smi': 'application/smil+xml',
    'smil': 'application/smil+xml',
    'sml': 'application/smil+xml',
    'otf': 'application/font-sfnt',
    'ttf': 'application/font-sfnt',
    'pls': 'application/pls+xml',
    'woff': 'application/font-woff',
    'woff2': 'font/woff2',
}

# Which subdirectory (relative to the opf) files of each mimetype belong in.
# Lookup tries the full mimetype first, then just the major type, then falls
# back to 'Misc' (see get_directory_for_mimetype).
MIMETYPE_DIRECTORIES = {
    'application/x-dtbncx+xml': '.',
    'application/font-sfnt': 'Fonts',
    'application/xhtml+xml': 'Text',
    'font': 'Fonts',
    'image': 'Images',
    'text/css': 'Styles',
    'audio': 'Audio',
    'video': 'Video',
}

# Templates used by Epub.new() when building a book from scratch.
MIMETYPE_FILE_TEMPLATE = 'application/epub+zip'

CONTAINER_XML_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
    </rootfiles>
</container>
'''.strip()

# Note the explicit dc:language "und" (undetermined): per rfc5646, omitting
# is preferred where permitted, but in Epub that's not permitted!
OPF_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="BookId" xmlns="http://www.idpf.org/2007/opf">
    <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
        <dc:identifier id="BookId">{uuid}</dc:identifier>
        <dc:creator id="cre">author</dc:creator>
        <meta scheme="marc:relators" refines="#cre" property="role">aut</meta>
        <dc:title>title</dc:title>
        <dc:language>und</dc:language>
    </metadata>
    <manifest>
        <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
        <item id="nav.xhtml" href="Text/nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
    </manifest>
    <spine toc="ncx">
        <itemref idref="nav.xhtml" linear="no"/>
    </spine>
</package>
'''.strip()

NCX_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
    <head>
        <meta name="dtb:uid" content="{uuid}" />
    </head>
    <docTitle>
        <text>Unknown</text>
    </docTitle>
    <navMap>
    </navMap>
</ncx>
'''.strip()

NAV_XHTML_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
    <meta charset="utf-8"/>
</head>
<body epub:type="frontmatter">
    <nav epub:type="toc" id="toc">
        <h1>Table of Contents</h1>
        <ol>
        </ol>
    </nav>
</body>
</html>
'''.strip()

TEXT_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
{head_content}
</head>
<body>
{body_content}
</body>
</html>
'''.strip()
|
|
|
|
|
|
def compress_epub(directory, epub_filepath):
    '''
    Zip the contents of `directory` into an epub archive at `epub_filepath`.
    A .epub extension is appended if missing. Returns the final filepath.

    Raises ValueError if the output file would land inside the directory
    being compressed (the archive would try to include itself).
    '''
    directory = pathclass.Path(directory)
    epub_filepath = pathclass.Path(epub_filepath)

    if epub_filepath in directory:
        raise ValueError('Epub inside its own directory')

    if epub_filepath.extension != 'epub':
        epub_filepath = epub_filepath.add_extension('epub')

    with zipfile.ZipFile(epub_filepath.absolute_path, 'w') as z:
        # The epub spec requires the mimetype file to be the archive's first
        # member and uncompressed (ZIP_STORED is the default here).
        z.write(directory.with_child('mimetype').absolute_path, arcname='mimetype')
        for file in directory.walk():
            # mimetype was already written above; sigil.cfg is editor state,
            # not book content.
            if file in [directory.with_child('mimetype'), directory.with_child('sigil.cfg')]:
                continue
            z.write(
                file.absolute_path,
                arcname=file.relative_to(directory),
                compress_type=zipfile.ZIP_DEFLATED,
            )
    return epub_filepath
|
|
|
|
def extract_epub(epub_filepath, directory):
    '''
    Unzip the epub archive's entire contents into `directory`.
    '''
    epub_filepath = pathclass.Path(epub_filepath)
    directory = pathclass.Path(directory)

    with zipfile.ZipFile(epub_filepath.absolute_path, 'r') as z:
        z.extractall(directory.absolute_path)
|
|
|
|
def fix_xhtml(xhtml, return_soup=False):
    '''
    Normalize an xhtml document: guarantee <html> and <body> wrappers, the
    xhtml xmlns on <html>, a doctype, and an xml declaration at the top.
    Returns the serialized string, or the BeautifulSoup if return_soup=True.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        soup = xhtml
    else:
        # For the text pages, html5lib is the best because html.parser and lxml
        # lowercase all attributes, breaking svg's case-sensitive viewBox etc.
        # and xml loses all of the namespaces when namespaced elements are nested
        # like <html xmlns="..."><svg xmlns:xlink="..."></svg></html>.
        # The downside of html5lib is it turns the xml declaration at the top
        # into a comment which we must undo manually.
        soup = bs4.BeautifulSoup(xhtml, 'html5lib')

    if not soup.html:
        # Wrap every existing top-level node in a new <html>.
        html = soup.new_tag('html')
        for child in list(soup.contents):
            html.append(child)
        soup.append(html)

    if not soup.html.body:
        # Wrap every child of <html> in a new <body>.
        body = soup.new_tag('body')
        for child in list(soup.html.contents):
            body.append(child)
        soup.html.append(body)

    if not soup.html.get('xmlns'):
        soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    try:
        doctype = next(i for i in soup.contents if isinstance(i, bs4.Doctype))
    except StopIteration:
        doctype = bs4.Doctype('html')
        soup.html.insert_before(doctype)

    # html5lib turns the xml declaration into a comment which we must revert.
    try:
        if isinstance(soup.contents[0], bs4.Comment):
            declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
            soup.insert(0, declaration)
            # Remove the comment that now immediately follows the declaration.
            declaration.next.extract()
    except StopIteration:
        pass

    # Ensure an xml declaration exists even when there was no comment to fix.
    try:
        declaration = next(i for i in soup.contents if isinstance(i, bs4.Declaration))
    except StopIteration:
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        doctype.insert_before(declaration)

    if return_soup:
        return soup
    return str(soup)
|
|
|
|
def get_directory_for_mimetype(mime):
    '''
    Return the conventional epub subdirectory name for files of this
    mimetype. Tries the exact mimetype first, then its major type, and
    falls back to 'Misc'.
    '''
    exact_match = MIMETYPE_DIRECTORIES.get(mime)
    if exact_match:
        return exact_match
    major_type = mime.split('/')[0]
    return MIMETYPE_DIRECTORIES.get(major_type) or 'Misc'
|
|
|
|
def get_mimetype_for_basename(basename):
    '''
    Guess the manifest media-type for a file based on its name, preferring
    the module's own extension table over mimetypes.guess_type, and falling
    back to application/octet-stream.
    '''
    extension = os.path.splitext(basename)[1].strip('.')
    mime = EXTENSION_MIMETYPES.get(extension)
    if not mime:
        mime = mimetypes.guess_type(basename)[0]
    if not mime:
        mime = 'application/octet-stream'
    return mime
|
|
|
|
def make_manifest_item(id, href, mime):
    '''
    Build a manifest <item> Tag for this id / href / media-type.
    '''
    xml = f'<item id="{id}" href="{href}" media-type="{mime}"/>'
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup(xml, 'html.parser')
    return soup.item
|
|
|
|
def make_meta_item(content=None, attrs=None):
    '''
    Build an opf <meta> Tag with optional text content and attributes.
    '''
    if content:
        meta_item = f'<meta>{content}</meta>'
    else:
        meta_item = '<meta/>'
    # 'html.parser' just for having the simplest output.
    meta_item = bs4.BeautifulSoup(meta_item, 'html.parser')
    if attrs:
        # Bugfix: the attributes must be applied to the meta tag itself.
        # Previously they were applied to the BeautifulSoup document object
        # and silently lost from the returned tag.
        meta_item.meta.attrs.update(attrs)
    return meta_item.meta
|
|
|
|
def make_spine_item(id):
    '''
    Build a spine <itemref> Tag pointing at this manifest id.
    '''
    xml = f'<itemref idref="{id}"/>'
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup(xml, 'html.parser')
    return soup.itemref
|
|
|
|
|
|
class EpubfileException(Exception):
    '''
    Base class for this module's exceptions. Subclasses define an
    error_message template which gets filled in with the constructor's
    arguments.
    '''
    error_message = ''

    def __init__(self, *args, **kwargs):
        super().__init__()
        (self.given_args, self.given_kwargs) = (args, kwargs)
        message = self.error_message.format(*args, **kwargs)
        self.error_message = message
        # Mirror the formatted message and the raw arguments into
        # Exception.args so generic handlers and pickling see everything.
        self.args = (message, args, kwargs)

    def __str__(self):
        return self.error_message
|
|
|
|
# Raised when adding or renaming a file would overwrite an existing one.
class FileExists(EpubfileException):
    error_message = 'There is already a file at {}.'

# Raised when adding a file under an id that's already in the manifest.
class IDExists(EpubfileException):
    error_message = 'There is already a file with id {}.'

# Raised when an id has no corresponding manifest <item>.
class NotInManifest(EpubfileException):
    error_message = '{} is not in the manifest.'

# Raised when an id has no corresponding spine <itemref>.
class NotInSpine(EpubfileException):
    error_message = '{} is not in the spine.'
|
|
|
|
|
|
class Epub:
    def __init__(self, directory):
        '''
        `directory` is the root of an extracted epub (containing META-INF
        and the opf). A tempfile.TemporaryDirectory may be passed instead;
        it is kept referenced so it isn't cleaned up while this Epub lives.
        '''
        if isinstance(directory, tempfile.TemporaryDirectory):
            # Hold the TemporaryDirectory object to keep it alive.
            self._tempdir_reference = directory
            directory = directory.name

        self.root_directory = pathclass.Path(directory, force_sep='/')

        # Populated by read_opf below.
        self.opf_filepath = None
        self.opf = None

        # Load the first rootfile listed in container.xml.
        self.read_opf(self.get_opfs()[0])

    def __repr__(self):
        return f'Epub({repr(self.root_directory.absolute_path)})'

    def assert_file_not_exists(self, filepath):
        '''
        Raise FileExists if a file already exists at this path.
        '''
        if filepath.exists:
            existing = filepath.relative_to(self.opf_filepath.parent)
            raise FileExists(existing)
|
|
|
|
    # LOADING AND SAVING
    ############################################################################
    @classmethod
    def new(cls):
        '''
        Create a fresh, empty epub in a temporary directory and return it.
        '''
        def writefile(filepath, content):
            os.makedirs(filepath.parent.absolute_path, exist_ok=True)
            with open(filepath.absolute_path, 'w', encoding='utf-8') as handle:
                handle.write(content)

        # A random unique identifier for the new book.
        uid = uuid.uuid4().urn

        tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
        root = pathclass.Path(tempdir.name)
        writefile(root.join('mimetype'), MIMETYPE_FILE_TEMPLATE)
        writefile(root.join('META-INF/container.xml'), CONTAINER_XML_TEMPLATE)
        writefile(root.join('OEBPS/content.opf'), OPF_TEMPLATE.format(uuid=uid))
        writefile(root.join('OEBPS/toc.ncx'), NCX_TEMPLATE.format(uuid=uid))
        writefile(root.join('OEBPS/Text/nav.xhtml'), NAV_XHTML_TEMPLATE)

        return cls(tempdir)

    @classmethod
    def open(cls, epub_filepath):
        '''
        Extract an existing .epub into a temporary directory and load it.
        '''
        extract_to = tempfile.TemporaryDirectory(prefix='epubfile-')
        extract_epub(epub_filepath, extract_to.name)
        return cls(extract_to)

    def save(self, epub_filepath):
        '''
        Flush the in-memory opf to disk, then compress the working directory
        into an epub at `epub_filepath`.
        '''
        self.write_opf()
        compress_epub(self.root_directory, epub_filepath)
|
|
|
|
# CONTAINER & OPF
|
|
############################################################################
|
|
def get_opfs(self):
|
|
container = self.read_container_xml()
|
|
rootfiles = container.find_all('rootfile')
|
|
rootfiles = [x.get('full-path') for x in rootfiles]
|
|
rootfiles = [self.root_directory.join(x) for x in rootfiles]
|
|
return rootfiles
|
|
|
|
def read_container_xml(self):
|
|
container_xml_path = self.root_directory.join('META-INF/container.xml')
|
|
container = open(container_xml_path.absolute_path, 'r', encoding='utf-8')
|
|
# 'xml' and 'html.parser' seem about even here except that html.parser doesn't self-close.
|
|
container = bs4.BeautifulSoup(container, 'xml')
|
|
return container
|
|
|
|
def read_opf(self, rootfile):
|
|
rootfile = pathclass.Path(rootfile, force_sep='/')
|
|
rootfile_xml = open(rootfile.absolute_path, 'r', encoding='utf-8').read()
|
|
# 'html.parser' preserves namespacing the best, but unfortunately it
|
|
# botches the <meta> items because it wants them to be self-closing
|
|
# and the string contents come out. We will fix in just a moment.
|
|
# This is still preferable to 'xml' which handles the dc: prefixes when
|
|
# parsing only the metadata block, but loses all namespaces when parsing
|
|
# the whole doc. 'lxml' wraps the content in <html><body> and also
|
|
# botches the metas so it's not any better than html.parser.
|
|
self.opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser')
|
|
# Let's fix those metas.
|
|
metas = self.opf.select('meta')
|
|
for meta in metas:
|
|
neighbor = meta.next
|
|
if neighbor.parent != meta.parent:
|
|
break
|
|
if not isinstance(neighbor, bs4.element.NavigableString):
|
|
continue
|
|
meta.append(neighbor.extract().strip())
|
|
|
|
self.opf_filepath = rootfile
|
|
return self.opf
|
|
|
|
def write_container_xml(self, container):
|
|
if isinstance(container, bs4.BeautifulSoup):
|
|
container = str(container)
|
|
container_xml_path = self.root_directory.join('META-INF/container.xml')
|
|
container_xml = open(container_xml_path.absolute_path, 'w', encoding='utf-8')
|
|
container_xml.write(container)
|
|
|
|
    def write_opf(self):
        '''
        Serialize the in-memory opf soup back to its file.
        '''
        with open(self.opf_filepath.absolute_path, 'w', encoding='utf-8') as rootfile:
            rootfile.write(str(self.opf))
|
|
|
|
    # FILE OPERATIONS
    ############################################################################
    def add_file(self, id, basename, content):
        '''
        Add a new file to the book. The file is written into the directory
        conventional for its mimetype, registered in the manifest, and —
        for xhtml pages — appended to the spine. Returns the id.

        content may be str or bytes; anything else raises TypeError.
        Raises IDExists / FileExists on collisions.
        '''
        if self.opf.manifest.find('item', {'id': id}):
            raise IDExists(id)

        basename = os.path.basename(basename)
        mime = get_mimetype_for_basename(basename)
        directory = get_directory_for_mimetype(mime)
        directory = self.opf_filepath.parent.with_child(directory)
        os.makedirs(directory.absolute_path, exist_ok=True)
        filepath = directory.with_child(basename)

        self.assert_file_not_exists(filepath)

        if mime == 'application/xhtml+xml':
            # bs4 converts bytes to str so this must come before the handle choice.
            content = fix_xhtml(content)

        if isinstance(content, str):
            handle = open(filepath.absolute_path, 'w', encoding='utf-8')
        elif isinstance(content, bytes):
            handle = open(filepath.absolute_path, 'wb')
        else:
            raise TypeError(type(content))

        with handle:
            handle.write(content)

        # Manifest hrefs are url-quoted paths relative to the opf's directory.
        href = filepath.relative_to(self.opf_filepath.parent, simple=True)
        href = urllib.parse.quote(href)

        manifest_item = make_manifest_item(id, href, mime)
        self.opf.manifest.append(manifest_item)

        if mime == 'application/xhtml+xml':
            spine_item = make_spine_item(id)
            self.opf.spine.append(spine_item)

        return id
|
|
|
|
    def easy_add_file(self, filepath):
        '''
        Add a file from disk, using its basename as both the manifest id and
        the stored filename.
        '''
        filepath = pathclass.Path(filepath)
        with open(filepath.absolute_path, 'rb') as handle:
            self.add_file(
                id=filepath.basename,
                basename=filepath.basename,
                content=handle.read(),
            )
|
|
|
|
    def delete_file(self, id):
        '''
        Delete the file from disk and remove its spine and manifest entries.
        '''
        os.remove(self.get_filepath(id).absolute_path)
        # Not every file has a spine entry, so this one is conditional.
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if spine_item:
            spine_item.extract()
        manifest_item = self.opf.manifest.find('item', {'id': id})
        manifest_item.extract()
|
|
|
|
    def get_filepath(self, id):
        '''
        Return the on-disk path of the manifest item with this id.
        '''
        href = self.opf.manifest.find('item', {'id': id})['href']
        filepath = self.opf_filepath.parent.join(href)
        # Manifest hrefs are url-quoted; if the literal href doesn't exist on
        # disk, fall back to the unquoted form.
        if not filepath.exists:
            href = urllib.parse.unquote(href)
            filepath = self.opf_filepath.parent.join(href)
        return filepath
|
|
|
|
    def open_file(self, id, mode):
        '''
        Open the file belonging to this manifest id and return the handle.
        Text mimetypes (xhtml, ncx, text/*) are opened in text mode with
        utf-8; everything else in binary. mode must be 'r' or 'w'.
        '''
        if mode not in ('r', 'w'):
            raise ValueError(f'Mode {mode} should be either r or w.')

        filepath = self.get_filepath(id)
        mime = self.opf.manifest.find('item', {'id': id})['media-type']
        is_text = (
            mime in ('application/xhtml+xml', 'application/x-dtbncx+xml') or
            mime.startswith('text/')
        )

        if is_text:
            handle = open(filepath.absolute_path, mode, encoding='utf-8')
        else:
            handle = open(filepath.absolute_path, mode + 'b')

        return handle
|
|
|
|
def read_file(self, id, *, soup=False):
|
|
# text vs binary handled by open_file.
|
|
content = self.open_file(id, 'r').read()
|
|
if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml':
|
|
return fix_xhtml(content, return_soup=True)
|
|
return content
|
|
|
|
    def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
        '''
        Rename one file (id, new_basename) or several at once by passing a
        {id: new_basename} dict as the first argument. When the new basename
        has no extension, the old one is kept. Links in the opf are always
        rewritten; links in the text pages and ncx are rewritten unless
        fix_interlinking=False. Returns {old filepath: new filepath}.
        '''
        if isinstance(id, dict):
            # id is actually a {id: new_basename} mapping.
            basename_map = id
        else:
            if new_basename is None:
                raise TypeError('new_basename can be omitted if id is a dict.')
            basename_map = {id: new_basename}

        rename_map = {}
        for (id, new_basename) in basename_map.items():
            old_filepath = self.get_filepath(id)
            new_filepath = old_filepath.parent.with_child(new_basename)
            if not new_filepath.extension:
                # Preserve the old extension when none was given.
                new_filepath = new_filepath.add_extension(old_filepath.extension)
            self.assert_file_not_exists(new_filepath)
            os.rename(old_filepath.absolute_path, new_filepath.absolute_path)
            rename_map[old_filepath] = new_filepath

        if fix_interlinking:
            self.fix_interlinking(rename_map)
        else:
            # At minimum the manifest must stay consistent with the disk.
            self.fix_interlinking_opf(rename_map)

        return rename_map
|
|
|
|
def write_file(self, id, content):
|
|
# text vs binary handled by open_file.
|
|
if isinstance(content, bs4.BeautifulSoup):
|
|
content = str(content)
|
|
|
|
with self.open_file(id, 'w') as handle:
|
|
handle.write(content)
|
|
|
|
    # GETTING THINGS
    ############################################################################
    def get_manifest_items(self, filter='', soup=False, spine_order=False):
        '''
        Return manifest items, optionally narrowed by a CSS attribute
        selector suffix (`filter`), as ids or — with soup=True — as Tags.
        With spine_order=True, items that appear in the spine come first in
        spine order, followed by the remaining manifest items.
        '''
        query = f'item{filter}'
        items = self.opf.manifest.select(query)

        if spine_order:
            items = {x['id']: x for x in items}
            ordered_items = []

            for spine_id in self.get_spine_order():
                ordered_items.append(items.pop(spine_id))
            # Manifest items with no spine entry go at the end.
            ordered_items.extend(items.values())
            items = ordered_items

        if soup:
            return items

        return [x['id'] for x in items]
|
|
|
|
def get_manifest_item(self, id):
|
|
item = self.opf.manifest.find('item', {'id': id})
|
|
if not item:
|
|
raise NotInManifest(id)
|
|
return item
|
|
|
|
    def get_fonts(self, *, soup=False):
        '''
        Manifest ids (or Tags with soup=True) of all font files.
        '''
        return self.get_manifest_items(
            filter='[media-type^="application/font"],[media-type^="font/"]',
            soup=soup,
        )

    def get_images(self, *, soup=False):
        '''
        Manifest ids (or Tags with soup=True) of all image files.
        '''
        return self.get_manifest_items(
            filter='[media-type^="image/"]',
            soup=soup,
        )

    def get_media(self, *, soup=False):
        '''
        Manifest ids (or Tags with soup=True) of all audio and video files.
        '''
        return self.get_manifest_items(
            filter='[media-type^="video/"],[media-type^="audio/"]',
            soup=soup,
        )

    def get_nav(self, *, soup=False):
        '''
        The id (or Tag) of the Epub3 nav document, or None if there isn't one.
        '''
        nav = self.opf.manifest.find('item', {'properties': 'nav'})
        if not nav:
            return None
        if soup:
            return nav
        return nav['id']

    def get_ncx(self, *, soup=False):
        '''
        The id (or Tag) of the Epub2 ncx, or None if there isn't one.
        '''
        ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'})
        if not ncx:
            return None
        if soup:
            return ncx
        return ncx['id']

    def get_styles(self, *, soup=False):
        '''
        Manifest ids (or Tags with soup=True) of all css stylesheets.
        '''
        return self.get_manifest_items(
            filter='[media-type="text/css"]',
            soup=soup,
        )

    def get_texts(self, *, soup=False, skip_nav=False):
        '''
        Manifest ids (or Tags with soup=True) of all xhtml text pages, in
        spine order. With skip_nav, the nav document is excluded.
        '''
        texts = self.get_manifest_items(
            filter='[media-type="application/xhtml+xml"]',
            soup=True,
            spine_order=True,
        )
        if skip_nav:
            texts = [x for x in texts if x.get('properties') != 'nav']

        if soup:
            return texts
        return [x['id'] for x in texts]
|
|
|
|
# COVER
|
|
############################################################################
|
|
def get_cover_image(self, *, soup=False):
|
|
cover = self.opf.manifest.find('item', {'properties': 'cover-image'})
|
|
if cover:
|
|
return cover if soup else cover['id']
|
|
|
|
cover = self.opf.metadata.find('meta', {'name': 'cover'})
|
|
if cover:
|
|
return cover if soup else cover['content']
|
|
|
|
return None
|
|
|
|
    def remove_cover_image(self):
        '''
        Remove the book's cover designation: drop the Epub3 cover-image
        property and extract the Epub2 <meta name="cover"> if present.
        The image file itself is not deleted.
        '''
        current_cover = self.get_cover_image(soup=True)
        if not current_cover:
            return

        del current_cover['properties']

        meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if meta:
            meta.extract()
|
|
|
|
def set_cover_image(self, id):
|
|
if id is None:
|
|
self.remove_cover_image()
|
|
|
|
current_cover = self.get_cover_image(soup=True)
|
|
|
|
if not current_cover:
|
|
pass
|
|
elif current_cover['id'] == id:
|
|
return
|
|
else:
|
|
del current_cover['properties']
|
|
|
|
manifest_item = self.get_manifest_item(id)
|
|
manifest_item['properties'] = 'cover-image'
|
|
|
|
current_meta = self.opf.metadata.find('meta', {'name': 'cover'})
|
|
if current_meta:
|
|
current_meta[content] = id
|
|
else:
|
|
meta = make_meta_item(attrs={'name': 'cover', 'content': id})
|
|
self.opf.metadata.append(meta)
|
|
|
|
# SPINE
|
|
############################################################################
|
|
def get_spine_order(self, *, only_linear=False):
|
|
items = self.opf.spine.find_all('itemref')
|
|
if only_linear:
|
|
items = [x for x in items if x.get('linear') != 'no']
|
|
return [x['idref'] for x in items]
|
|
return ids
|
|
|
|
    def set_spine_order(self, ids):
        '''
        Rearrange the spine to match the given id sequence. Existing
        itemrefs are reused (preserving their attributes); ids with no
        current itemref get a new one; current itemrefs absent from `ids`
        are removed. Raises NotInManifest for unknown ids.
        '''
        manifest_ids = self.get_manifest_items()
        # Fetch the existing entries so that we can preserve their attributes
        # while rearranging, only creating new spine entries for ids that aren't
        # already present.
        spine_items = self.opf.spine.select('itemref')
        spine_items = {item['idref']: item for item in spine_items}
        for id in ids:
            if id not in manifest_ids:
                raise NotInManifest(id)
            if id in spine_items:
                # Appending an element that's already in the tree moves it.
                self.opf.spine.append(spine_items.pop(id))
            else:
                self.opf.spine.append(make_spine_item(id))

        # The remainder of the current spine items were not used, so pop them out.
        for spine_item in spine_items.values():
            spine_item.extract()
|
|
|
|
def get_spine_linear(self, id):
|
|
spine_item = self.opf.spine.find('itemref', {'idref': id})
|
|
if not spine_item:
|
|
raise NotInSpine(id)
|
|
linear = spine_item.get('linear')
|
|
linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
|
|
return linear
|
|
|
|
def set_spine_linear(self, id, linear):
|
|
'''
|
|
Set linear to yes or no. Or pass None to remove the property.
|
|
'''
|
|
spine_item = self.opf.spine.find('itemref', {'idref': id})
|
|
if not spine_item:
|
|
raise NotInSpine(id)
|
|
|
|
if linear is None:
|
|
del spine_item['linear']
|
|
return
|
|
|
|
if isinstance(linear, str):
|
|
if linear not in ('yes', 'no'):
|
|
raise ValueError(f'Linear must be yes or no, not {linear}.')
|
|
elif isinstance(linear, (bool, int)):
|
|
linear = {True: 'yes', False: 'no'}[bool(linear)]
|
|
else:
|
|
raise TypeError(linear)
|
|
|
|
spine_item['linear'] = linear
|
|
|
|
# UTILITIES
|
|
############################################################################
|
|
def fix_all_xhtml(self):
|
|
for id in self.get_texts():
|
|
self.write_file(id, self.read_file(id, soup=True))
|
|
|
|
    @staticmethod
    def _fix_interlinking_helper(link, rename_map, relative_to, old_relative_to=None):
        '''
        Given an href found inside a document, return the corrected href if
        its target appears in rename_map, else None (meaning: leave it).
        `relative_to` is the linking document's current directory;
        `old_relative_to` its directory before a move (defaults to the same).
        '''
        if link is None:
            return None

        link = urllib.parse.urlsplit(link)
        # External urls (http:, mailto:, ...) are never rewritten.
        if link.scheme:
            return None

        if old_relative_to is None:
            old_relative_to = relative_to

        # The renamed file may be keyed in the map by the raw path, or by the
        # path resolved against the document's old directory, quoted or not.
        new_filepath = (
            rename_map.get(link.path) or
            rename_map.get(old_relative_to.join(link.path)) or
            rename_map.get(old_relative_to.join(urllib.parse.unquote(link.path))) or
            None
        )
        if new_filepath is None:
            return None

        # Re-relativize to the document's current directory and re-quote,
        # preserving any query/fragment from the original url.
        link = link._replace(path=new_filepath.relative_to(relative_to, simple=True))
        link = link._replace(path=urllib.parse.quote(link.path))

        return link.geturl()
|
|
|
|
    @staticmethod
    def _fix_interlinking_css_helper(tag):
        '''
        Collect the css URLTokens from a <style> tag's text or from a
        style="" attribute. Returns (links, commit): mutate the tokens in
        `links`, then call commit() to serialize them back into the tag.
        For tags with no css, links is empty and commit is a no-op.
        '''
        links = []
        commit = lambda: None

        if not isinstance(tag, bs4.element.Tag):
            # NavigableStrings etc. carry no attributes or css.
            pass

        elif tag.name == 'style' and tag.contents:
            # A whole stylesheet: url() tokens live inside qualified rules.
            style = tinycss2.parse_stylesheet(tag.contents[0])
            links = [
                token
                for rule in style if isinstance(rule, tinycss2.ast.QualifiedRule)
                for token in rule.content if isinstance(token, tinycss2.ast.URLToken)
            ]
            commit = lambda: tag.contents[0].replace_with(tinycss2.serialize(style))

        elif tag.get('style'):
            # An inline style attribute: url() tokens live in declarations.
            style = tinycss2.parse_declaration_list(tag['style'])
            links = [
                token
                for declaration in style if isinstance(declaration, tinycss2.ast.Declaration)
                for token in declaration.value if isinstance(token, tinycss2.ast.URLToken)
            ]
            commit = lambda: tag.attrs.update(style=tinycss2.serialize(style))

        return (links, commit)
|
|
|
|
    def fix_interlinking_text(self, id, rename_map, old_relative_to=None):
        '''
        Rewrite links inside one text page (attribute hrefs/srcs and css
        url() tokens) so they follow the files renamed in rename_map.
        '''
        text_parent = self.get_filepath(id).parent
        soup = self.read_file(id, soup=True)
        for tag in soup.descendants:
            # Plain attribute links (href, src, poster, ...).
            for link_property in HTML_LINK_PROPERTIES.get(tag.name, []):
                link = tag.get(link_property)
                link = self._fix_interlinking_helper(link, rename_map, text_parent, old_relative_to)
                if not link:
                    continue
                tag[link_property] = link

            # url() tokens inside <style> tags and style="" attributes.
            (style_links, style_commit) = self._fix_interlinking_css_helper(tag)
            for token in style_links:
                link = token.value
                link = self._fix_interlinking_helper(link, rename_map, text_parent, old_relative_to)
                if not link:
                    continue
                token.value = link
            style_commit()

        text = str(soup)
        self.write_file(id, text)
|
|
|
|
    def fix_interlinking_ncx(self, rename_map, old_relative_to=None):
        '''
        Rewrite the navPoint content srcs in the ncx so they follow the
        files renamed in rename_map. Does nothing if the book has no ncx.
        '''
        ncx_id = self.get_ncx()
        if not ncx_id:
            return

        ncx_parent = self.get_filepath(ncx_id).parent
        ncx = self.read_file(ncx_id)
        # 'xml' because 'lxml' and 'html.parser' lowercase the navPoint tag name.
        ncx = bs4.BeautifulSoup(ncx, 'xml')
        for point in ncx.select('navPoint > content[src]'):
            link = point['src']
            link = self._fix_interlinking_helper(link, rename_map, ncx_parent, old_relative_to)
            if not link:
                continue
            point['src'] = link

        ncx = str(ncx)
        self.write_file(ncx_id, ncx)
|
|
|
|
    def fix_interlinking_opf(self, rename_map):
        '''
        Rewrite the hrefs of the opf's guide references and manifest items
        so they follow the files renamed in rename_map.
        '''
        opf_parent = self.opf_filepath.parent
        for opf_item in self.opf.select('guide > reference[href], manifest > item[href]'):
            link = opf_item['href']
            link = self._fix_interlinking_helper(link, rename_map, opf_parent)
            if not link:
                continue
            opf_item['href'] = link
|
|
|
|
    def fix_interlinking(self, rename_map):
        '''
        Rewrite renamed-file links everywhere: the opf, every text page,
        and the ncx.
        '''
        self.fix_interlinking_opf(rename_map)
        for id in self.get_texts():
            self.fix_interlinking_text(id, rename_map)
        self.fix_interlinking_ncx(rename_map)
|
|
|
|
    def move_nav_to_end(self):
        '''
        Move the nav.xhtml file to the end and set linear=no.
        Does nothing if the book has no nav document.
        '''
        nav = self.get_nav()
        if not nav:
            return

        spine = self.get_spine_order()
        for (index, id) in enumerate(spine):
            if id == nav:
                # Pop the nav's id and re-append it at the end.
                spine.append(spine.pop(index))
                break
        self.set_spine_order(spine)

        self.set_spine_linear(nav, False)
|
|
|
|
    def normalize_directory_structure(self):
        '''
        Normalize the book's layout: move the opf into OEBPS if it sits at
        the root, sort every manifest file into the directory conventional
        for its mimetype, and fix all links afterward.
        '''
        # This must come before the opf rewrite because that would affect the
        # location of all manifest item hrefs.
        manifest_items = self.get_manifest_items(soup=True)
        old_filepaths = {item['id']: self.get_filepath(item['id']) for item in manifest_items}
        old_ncx = self.get_ncx()
        try:
            old_ncx_parent = self.get_filepath(self.get_ncx()).parent
        except Exception:
            # There is no ncx, or its file could not be located.
            old_ncx_parent = None

        if self.opf_filepath.parent == self.root_directory:
            # Relocate the opf from the root into OEBPS and point
            # container.xml at the new location.
            oebps = self.root_directory.with_child('OEBPS')
            os.makedirs(oebps.absolute_path)
            self.write_opf()
            new_opf_path = oebps.with_child(self.opf_filepath.basename)
            os.rename(self.opf_filepath.absolute_path, new_opf_path.absolute_path)
            container = self.read_container_xml()
            rootfile = container.find('rootfile', {'full-path': self.opf_filepath.basename})
            rootfile['full-path'] = new_opf_path.relative_to(self.root_directory, simple=True)
            self.write_container_xml(container)
            self.opf_filepath = new_opf_path

        rename_map = {}
        for manifest_item in manifest_items:
            old_filepath = old_filepaths[manifest_item['id']]

            directory = get_directory_for_mimetype(manifest_item['media-type'])
            directory = self.opf_filepath.parent.with_child(directory)
            os.makedirs(directory.absolute_path, exist_ok=True)

            new_filepath = directory.with_child(old_filepath.basename)
            rename_map[old_filepath] = new_filepath
            os.rename(old_filepath.absolute_path, new_filepath.absolute_path)
            manifest_item['href'] = new_filepath.relative_to(self.opf_filepath.parent, simple=True)

        self.fix_interlinking_opf(rename_map)
        for id in self.get_texts():
            # Each page's links were relative to its pre-move directory.
            self.fix_interlinking_text(id, rename_map, old_relative_to=old_filepaths[id].parent)
        self.fix_interlinking_ncx(rename_map, old_relative_to=old_ncx_parent)
|