From 58c3de6c5d20619642e38a98fd5a8e7757fad3c7 Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Sat, 3 Aug 2019 22:13:27 -0700 Subject: [PATCH] Initial commit. --- LICENSE.txt | 30 ++ README.md | 40 +++ epubfile.py | 858 +++++++++++++++++++++++++++++++++++++++++++++++ requirements.txt | 4 + 4 files changed, 932 insertions(+) create mode 100644 LICENSE.txt create mode 100644 README.md create mode 100644 epubfile.py create mode 100644 requirements.txt diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..74d2d67 --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2019, Ethan Dalool +https://github.com/voussoir/epubfile +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..7753c33 --- /dev/null +++ b/README.md @@ -0,0 +1,40 @@ +epubfile +======== + +```Python +import epubfile +book = epubfile.Epub.open('mybook.epub') + +for text_id in book.get_texts(): + soup = book.read_file(text_id, soup=True) + ... + book.write_file(text_id, soup) + +for image_id in book.get_images(): + data = book.read_file(image_id) + ... + book.write_file(image_id, data) + +# Note that this does not reverse the table of contents. +book.set_spine_order(reversed(book.get_spine_order())) + +cover_id = book.get_cover_image() +if cover_id: + book.rename_file(cover_id, 'myfavoritecoverimage') + +book.save('modifiedbook.epub') +``` + +epubfile provides simple editing of epub books. epubfile attempts to keep file modifications to a minimum. It does not add, remove, or rearrange files unless you ask it to, and does not inject additional metadata. As such, it works for both epub2 and epub3 assuming you stick to supported operations for your book version. + +# Spec compliance + +epubfile does not rigorously enforce the epub spec and you can create noncompliant books with it. Basic errors are checked, and I am open to issues and comments regarding ways to improve spec-compliance without adding significant size or complexity to the library. I am prioritizing simplicity and ease of use over perfection. + +# Pairs well with... + +For advanced inter-file operations and better validation, I suggest using this library in conjunction with a good editor like [Sigil](https://github.com/Sigil-Ebook/Sigil). I wrote this library because although Sigil plugins are great for processing a single book, it is difficult to use Sigil to process multiple books, read book data for use in other programs, or do other inter-book operations. + +# What not to expect + +I do not intend to implement an object model for book metadata, beyond perhaps some basic getters and setters. You have full control over the `Epub.opf` BeautifulSoup object so you can edit the metadata however you want. diff --git a/epubfile.py b/epubfile.py new file mode 100644 index 0000000..75d970f --- /dev/null +++ b/epubfile.py @@ -0,0 +1,858 @@ +import mimetypes +import os +import re +import tempfile +import urllib.parse +import uuid +import zipfile + +import bs4 +import tinycss2 + +from voussoirkit import pathclass + +MIMETYPE_CONTENT = 'application/epub+zip' + +HTML_LINK_PROPERTIES = { + 'a': ['href'], + 'audio': ['src'], + 'image': ['href', 'xlink:href'], + 'img': ['src'], + 'link': ['href'], + 'script': ['src'], + 'source': ['src'], + 'track': ['src'], + 'video': ['src', 'poster'], +} + +EXTENSION_MIMETYPES = { + 'html': 'application/xhtml+xml', + 'xhtml': 'application/xhtml+xml', + 'smi': 'application/smil+xml', + 'smil': 'application/smil+xml', + 'sml': 'application/smil+xml', + 'otf': 'application/font-sfnt', + 'ttf': 'application/font-sfnt', + 'pls': 'application/pls+xml', + 'woff': 'application/font-woff', + 'woff2': 'font/woff2', +} + +MIMETYPE_DIRECTORIES = { + 'application/x-dtbncx+xml': '.', + 'application/font-sfnt': 'Fonts', + 'application/xhtml+xml': 'Text', + 'font': 'Fonts', + 'image': 'Images', + 'text/css': 'Styles', + 'audio': 'Audio', + 'video': 'Video', +} + +MIMETYPE_FILE_TEMPLATE = 'application/epub+zip' + +CONTAINER_XML_TEMPLATE = ''' + + + + + + +'''.strip() + +OPF_TEMPLATE = ''' + + + + {uuid} + author + aut + title + + + + + + + + + +'''.strip() + +NCX_TEMPLATE = ''' + + + + + + + Unknown + + + + +'''.strip() + +NAV_XHTML_TEMPLATE = ''' + + + + + + + + + + +'''.strip() + +TEXT_TEMPLATE = ''' + + + + +{head_content} + + +{body_content} + + +'''.strip() + + +def compress_epub(directory, epub_filepath): + directory = pathclass.Path(directory) + epub_filepath = pathclass.Path(epub_filepath) + + if epub_filepath in directory: + raise ValueError('Epub inside its own directory') + + if epub_filepath.extension != 'epub': + epub_filepath = epub_filepath.add_extension('epub') + + with zipfile.ZipFile(epub_filepath.absolute_path, 'w') as z: + z.write(directory.with_child('mimetype').absolute_path, arcname='mimetype') + for file in directory.walk(): + if file in [directory.with_child('mimetype'), directory.with_child('sigil.cfg')]: + continue + z.write( + file.absolute_path, + arcname=file.relative_to(directory), + compress_type=zipfile.ZIP_DEFLATED, + ) + return epub_filepath + +def extract_epub(epub_filepath, directory): + epub_filepath = pathclass.Path(epub_filepath) + directory = pathclass.Path(directory) + + with zipfile.ZipFile(epub_filepath.absolute_path, 'r') as z: + z.extractall(directory.absolute_path) + +def fix_xhtml(xhtml, return_soup=False): + if isinstance(xhtml, bs4.BeautifulSoup): + soup = xhtml + else: + # For the text pages, html5lib is the best because html.parser and lxml + # lowercase all attributes, breaking svg's case-sensitive viewBox etc. + # and xml loses all of the namespaces when namespaced elements are nested + # like . + # The downside of html5lib is it turns the xml declaration at the top + # into a comment which we must undo manually. + soup = bs4.BeautifulSoup(xhtml, 'html5lib') + + if not soup.html: + html = soup.new_tag('html') + for child in list(soup.contents): + html.append(child) + soup.append(html) + + if not soup.html.body: + body = soup.new_tag('body') + for child in list(soup.html.contents): + body.append(child) + soup.html.append(body) + + if not soup.html.get('xmlns'): + soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml' + + try: + doctype = next(i for i in soup.contents if isinstance(i, bs4.Doctype)) + except StopIteration: + doctype = bs4.Doctype('html') + soup.html.insert_before(doctype) + + # html5lib turns the xml declaration into a comment which we must revert. + try: + if isinstance(soup.contents[0], bs4.Comment): + declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"') + soup.insert(0, declaration) + declaration.next.extract() + except StopIteration: + pass + + try: + declaration = next(i for i in soup.contents if isinstance(i, bs4.Declaration)) + except StopIteration: + declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"') + doctype.insert_before(declaration) + + if return_soup: + return soup + return str(soup) + +def get_directory_for_mimetype(mime): + directory = ( + MIMETYPE_DIRECTORIES.get(mime) or + MIMETYPE_DIRECTORIES.get(mime.split('/')[0]) or + 'Misc' + ) + return directory + +def get_mimetype_for_basename(basename): + extension = os.path.splitext(basename)[1].strip('.') + mime = ( + EXTENSION_MIMETYPES.get(extension) or + mimetypes.guess_type(basename)[0] or + 'application/octet-stream' + ) + return mime + +def make_manifest_item(id, href, mime): + manifest_item = f'' + # 'html.parser' just for having the simplest output. + manifest_item = bs4.BeautifulSoup(manifest_item, 'html.parser') + return manifest_item.item + +def make_meta_item(content=None, attrs=None): + if content: + meta_item = f'{content}' + else: + meta_item = f'' + meta_item = bs4.BeautifulSoup(meta_item, 'html.parser') + if attrs: + meta_item.attrs.update(attrs) + return meta_item.meta + +def make_spine_item(id): + spine_item = f'' + # 'html.parser' just for having the simplest output. + spine_item = bs4.BeautifulSoup(spine_item, 'html.parser') + return spine_item.itemref + + +class EpubfileException(Exception): + error_message = '' + + def __init__(self, *args, **kwargs): + super().__init__() + self.given_args = args + self.given_kwargs = kwargs + self.error_message = self.error_message.format(*args, **kwargs) + self.args = (self.error_message, args, kwargs) + + def __str__(self): + return self.error_message + +class FileExists(EpubfileException): + error_message = 'There is already a file at {}.' + +class IDExists(EpubfileException): + error_message = 'There is already a file with id {}.' + +class NotInManifest(EpubfileException): + error_message = '{} is not in the manifest.' + +class NotInSpine(EpubfileException): + error_message = '{} is not in the spine.' + + +class Epub: + def __init__(self, directory): + if isinstance(directory, tempfile.TemporaryDirectory): + self._tempdir_reference = directory + directory = directory.name + + self.root_directory = pathclass.Path(directory, force_sep='/') + + self.opf_filepath = None + self.opf = None + + self.read_opf(self.get_opfs()[0]) + + def __repr__(self): + return f'Epub({repr(self.root_directory.absolute_path)})' + + def assert_file_not_exists(self, filepath): + if filepath.exists: + existing = filepath.relative_to(self.opf_filepath.parent) + raise FileExists(existing) + + # LOADING AND SAVING + ############################################################################ + @classmethod + def new(cls): + def writefile(filepath, content): + os.makedirs(filepath.parent.absolute_path, exist_ok=True) + with open(filepath.absolute_path, 'w', encoding='utf-8') as handle: + handle.write(content) + + uid = uuid.uuid4().urn + + tempdir = tempfile.TemporaryDirectory(prefix='epubfile-') + root = pathclass.Path(tempdir.name) + writefile(root.join('mimetype'), MIMETYPE_FILE_TEMPLATE) + writefile(root.join('META-INF/container.xml'), CONTAINER_XML_TEMPLATE) + writefile(root.join('OEBPS/content.opf'), OPF_TEMPLATE.format(uuid=uid)) + writefile(root.join('OEBPS/toc.ncx'), NCX_TEMPLATE.format(uuid=uid)) + writefile(root.join('OEBPS/Text/nav.xhtml'), NAV_XHTML_TEMPLATE) + + return cls(tempdir) + + @classmethod + def open(cls, epub_filepath): + extract_to = tempfile.TemporaryDirectory(prefix='epubfile-') + extract_epub(epub_filepath, extract_to.name) + return cls(extract_to) + + def save(self, epub_filepath): + self.write_opf() + compress_epub(self.root_directory, epub_filepath) + + # CONTAINER & OPF + ############################################################################ + def get_opfs(self): + container = self.read_container_xml() + rootfiles = container.find_all('rootfile') + rootfiles = [x.get('full-path') for x in rootfiles] + rootfiles = [self.root_directory.join(x) for x in rootfiles] + return rootfiles + + def read_container_xml(self): + container_xml_path = self.root_directory.join('META-INF/container.xml') + container = open(container_xml_path.absolute_path, 'r', encoding='utf-8') + # 'xml' and 'html.parser' seem about even here except that html.parser doesn't self-close. + container = bs4.BeautifulSoup(container, 'xml') + return container + + def read_opf(self, rootfile): + rootfile = pathclass.Path(rootfile, force_sep='/') + rootfile_xml = open(rootfile.absolute_path, 'r', encoding='utf-8').read() + # 'html.parser' preserves namespacing the best, but unfortunately it + # botches the items because it wants them to be self-closing + # and the string contents come out. We will fix in just a moment. + # This is still preferable to 'xml' which handles the dc: prefixes when + # parsing only the metadata block, but loses all namespaces when parsing + # the whole doc. 'lxml' wraps the content in and also + # botches the metas so it's not any better than html.parser. + self.opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser') + # Let's fix those metas. + metas = self.opf.select('meta') + for meta in metas: + neighbor = meta.next + if neighbor.parent != meta.parent: + break + if not isinstance(neighbor, bs4.element.NavigableString): + continue + meta.append(neighbor.extract().strip()) + + self.opf_filepath = rootfile + return self.opf + + def write_container_xml(self, container): + if isinstance(container, bs4.BeautifulSoup): + container = str(container) + container_xml_path = self.root_directory.join('META-INF/container.xml') + container_xml = open(container_xml_path.absolute_path, 'w', encoding='utf-8') + container_xml.write(container) + + def write_opf(self): + with open(self.opf_filepath.absolute_path, 'w', encoding='utf-8') as rootfile: + rootfile.write(str(self.opf)) + + # FILE OPERATIONS + ############################################################################ + def add_file(self, id, basename, content): + if self.opf.manifest.find('item', {'id': id}): + raise IDExists(id) + + basename = os.path.basename(basename) + mime = get_mimetype_for_basename(basename) + directory = get_directory_for_mimetype(mime) + directory = self.opf_filepath.parent.with_child(directory) + os.makedirs(directory.absolute_path, exist_ok=True) + filepath = directory.with_child(basename) + + self.assert_file_not_exists(filepath) + + if mime == 'application/xhtml+xml': + # bs4 converts bytes to str so this must come before the handle choice. + content = fix_xhtml(content) + + if isinstance(content, str): + handle = open(filepath.absolute_path, 'w', encoding='utf-8') + elif isinstance(content, bytes): + handle = open(filepath.absolute_path, 'wb') + else: + raise TypeError(type(content)) + + with handle: + handle.write(content) + + href = filepath.relative_to(self.opf_filepath.parent, simple=True) + href = urllib.parse.quote(href) + + manifest_item = make_manifest_item(id, href, mime) + self.opf.manifest.append(manifest_item) + + if mime == 'application/xhtml+xml': + spine_item = make_spine_item(id) + self.opf.spine.append(spine_item) + + return id + + def easy_add_file(self, filepath): + filepath = pathclass.Path(filepath) + with open(filepath.absolute_path, 'rb') as handle: + self.add_file( + id=filepath.basename, + basename=filepath.basename, + content=handle.read(), + ) + + def delete_file(self, id): + os.remove(self.get_filepath(id).absolute_path) + spine_item = self.opf.spine.find('itemref', {'idref': id}) + if spine_item: + spine_item.extract() + manifest_item = self.opf.manifest.find('item', {'id': id}) + manifest_item.extract() + + def get_filepath(self, id): + href = self.opf.manifest.find('item', {'id': id})['href'] + filepath = self.opf_filepath.parent.join(href) + if not filepath.exists: + href = urllib.parse.unquote(href) + filepath = self.opf_filepath.parent.join(href) + return filepath + + def open_file(self, id, mode): + if mode not in ('r', 'w'): + raise ValueError(f'Mode {mode} should be either r or w.') + + filepath = self.get_filepath(id) + mime = self.opf.manifest.find('item', {'id': id})['media-type'] + is_text = ( + mime in ('application/xhtml+xml', 'application/x-dtbncx+xml') or + mime.startswith('text/') + ) + + if is_text: + handle = open(filepath.absolute_path, mode, encoding='utf-8') + else: + handle = open(filepath.absolute_path, mode + 'b') + + return handle + + def read_file(self, id, *, soup=False): + # text vs binary handled by open_file. + content = self.open_file(id, 'r').read() + if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml': + return fix_xhtml(content, return_soup=True) + return content + + def rename_file(self, id, new_basename=None, *, fix_interlinking=True): + if isinstance(id, dict): + basename_map = id + else: + if new_basename is None: + raise TypeError('new_basename can be omitted if id is a dict.') + basename_map = {id: new_basename} + + rename_map = {} + for (id, new_basename) in basename_map.items(): + old_filepath = self.get_filepath(id) + new_filepath = old_filepath.parent.with_child(new_basename) + if not new_filepath.extension: + new_filepath = new_filepath.add_extension(old_filepath.extension) + self.assert_file_not_exists(new_filepath) + os.rename(old_filepath.absolute_path, new_filepath.absolute_path) + rename_map[old_filepath] = new_filepath + + if fix_interlinking: + self.fix_interlinking(rename_map) + else: + self.fix_interlinking_opf(rename_map) + + return rename_map + + def write_file(self, id, content): + # text vs binary handled by open_file. + if isinstance(content, bs4.BeautifulSoup): + content = str(content) + + with self.open_file(id, 'w') as handle: + handle.write(content) + + # GETTING THINGS + ############################################################################ + def get_manifest_items(self, filter='', soup=False, spine_order=False): + query = f'item{filter}' + items = self.opf.manifest.select(query) + + if spine_order: + items = {x['id']: x for x in items} + ordered_items = [] + + for spine_id in self.get_spine_order(): + ordered_items.append(items.pop(spine_id)) + ordered_items.extend(items.values()) + items = ordered_items + + if soup: + return items + + return [x['id'] for x in items] + + def get_manifest_item(self, id): + item = self.opf.manifest.find('item', {'id': id}) + if not item: + raise NotInManifest(id) + return item + + def get_fonts(self, *, soup=False): + return self.get_manifest_items( + filter='[media-type^="application/font"],[media-type^="font/"]', + soup=soup, + ) + + def get_images(self, *, soup=False): + return self.get_manifest_items( + filter='[media-type^="image/"]', + soup=soup, + ) + + def get_media(self, *, soup=False): + return self.get_manifest_items( + filter='[media-type^="video/"],[media-type^="audio/"]', + soup=soup, + ) + + def get_nav(self, *, soup=False): + nav = self.opf.manifest.find('item', {'properties': 'nav'}) + if not nav: + return None + if soup: + return nav + return nav['id'] + + def get_ncx(self, *, soup=False): + ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'}) + if not ncx: + return None + if soup: + return ncx + return ncx['id'] + + def get_styles(self, *, soup=False): + return self.get_manifest_items( + filter='[media-type="text/css"]', + soup=soup, + ) + + def get_texts(self, *, soup=False, skip_nav=False): + texts = self.get_manifest_items( + filter='[media-type="application/xhtml+xml"]', + soup=True, + spine_order=True, + ) + if skip_nav: + texts = [x for x in texts if x.get('properties') != 'nav'] + + if soup: + return texts + return [x['id'] for x in texts] + + # COVER + ############################################################################ + def get_cover_image(self, *, soup=False): + cover = self.opf.manifest.find('item', {'properties': 'cover-image'}) + if cover: + return cover if soup else cover['id'] + + cover = self.opf.metadata.find('meta', {'name': 'cover'}) + if cover: + return cover if soup else cover['content'] + + return None + + def remove_cover_image(self): + current_cover = self.get_cover_image(soup=True) + if not current_cover: + return + + del current_cover['properties'] + + meta = self.opf.metadata.find('meta', {'name': 'cover'}) + if meta: + meta.extract() + + def set_cover_image(self, id): + if id is None: + self.remove_cover_image() + + current_cover = self.get_cover_image(soup=True) + + if not current_cover: + pass + elif current_cover['id'] == id: + return + else: + del current_cover['properties'] + + manifest_item = self.get_manifest_item(id) + manifest_item['properties'] = 'cover-image' + + current_meta = self.opf.metadata.find('meta', {'name': 'cover'}) + if current_meta: + current_meta[content] = id + else: + meta = make_meta_item(attrs={'name': 'cover', 'content': id}) + self.opf.metadata.append(meta) + + # SPINE + ############################################################################ + def get_spine_order(self, *, only_linear=False): + items = self.opf.spine.find_all('itemref') + if only_linear: + items = [x for x in items if x.get('linear') != 'no'] + return [x['idref'] for x in items] + return ids + + def set_spine_order(self, ids): + manifest_ids = self.get_manifest_items() + # Fetch the existing entries so that we can preserve their attributes + # while rearranging, only creating new spine entries for ids that aren't + # already present. + spine_items = self.opf.spine.select('itemref') + spine_items = {item['idref']: item for item in spine_items} + for id in ids: + if id not in manifest_ids: + raise NotInManifest(id) + if id in spine_items: + self.opf.spine.append(spine_items.pop(id)) + else: + self.opf.spine.append(make_spine_item(id)) + + # The remainder of the current spine items were not used, so pop them out. + for spine_item in spine_items.values(): + spine_item.extract() + + def get_spine_linear(self, id): + spine_item = self.opf.spine.find('itemref', {'idref': id}) + if not spine_item: + raise NotInSpine(id) + linear = spine_item.get('linear') + linear = {None: None, 'yes': True, 'no': False}.get(linear, linear) + return linear + + def set_spine_linear(self, id, linear): + ''' + Set linear to yes or no. Or pass None to remove the property. + ''' + spine_item = self.opf.spine.find('itemref', {'idref': id}) + if not spine_item: + raise NotInSpine(id) + + if linear is None: + del spine_item['linear'] + return + + if isinstance(linear, str): + if linear not in ('yes', 'no'): + raise ValueError(f'Linear must be yes or no, not {linear}.') + elif isinstance(linear, (bool, int)): + linear = {True: 'yes', False: 'no'}[bool(linear)] + else: + raise TypeError(linear) + + spine_item['linear'] = linear + + # UTILITIES + ############################################################################ + def fix_all_xhtml(self): + for id in self.get_texts(): + self.write_file(id, self.read_file(id, soup=True)) + + @staticmethod + def _fix_interlinking_helper(link, rename_map, relative_to, old_relative_to=None): + if link is None: + return None + + link = urllib.parse.urlsplit(link) + if link.scheme: + return None + + if old_relative_to is None: + old_relative_to = relative_to + + new_filepath = ( + rename_map.get(link.path) or + rename_map.get(old_relative_to.join(link.path)) or + rename_map.get(old_relative_to.join(urllib.parse.unquote(link.path))) or + None + ) + if new_filepath is None: + return None + + link = link._replace(path=new_filepath.relative_to(relative_to, simple=True)) + link = link._replace(path=urllib.parse.quote(link.path)) + + return link.geturl() + + @staticmethod + def _fix_interlinking_css_helper(tag): + links = [] + commit = lambda: None + + if not isinstance(tag, bs4.element.Tag): + pass + + elif tag.name == 'style' and tag.contents: + style = tinycss2.parse_stylesheet(tag.contents[0]) + links = [ + token + for rule in style if isinstance(rule, tinycss2.ast.QualifiedRule) + for token in rule.content if isinstance(token, tinycss2.ast.URLToken) + ] + commit = lambda: tag.contents[0].replace_with(tinycss2.serialize(style)) + + elif tag.get('style'): + style = tinycss2.parse_declaration_list(tag['style']) + links = [ + token + for declaration in style if isinstance(declaration, tinycss2.ast.Declaration) + for token in declaration.value if isinstance(token, tinycss2.ast.URLToken) + ] + commit = lambda: tag.attrs.update(style=tinycss2.serialize(style)) + + return (links, commit) + + def fix_interlinking_text(self, id, rename_map, old_relative_to=None): + text_parent = self.get_filepath(id).parent + soup = self.read_file(id, soup=True) + for tag in soup.descendants: + for link_property in HTML_LINK_PROPERTIES.get(tag.name, []): + link = tag.get(link_property) + link = self._fix_interlinking_helper(link, rename_map, text_parent, old_relative_to) + if not link: + continue + tag[link_property] = link + + (style_links, style_commit) = self._fix_interlinking_css_helper(tag) + for token in style_links: + link = token.value + link = self._fix_interlinking_helper(link, rename_map, text_parent, old_relative_to) + if not link: + continue + token.value = link + style_commit() + + text = str(soup) + self.write_file(id, text) + + def fix_interlinking_ncx(self, rename_map, old_relative_to=None): + ncx_id = self.get_ncx() + if not ncx_id: + return + + ncx_parent = self.get_filepath(ncx_id).parent + ncx = self.read_file(ncx_id) + # 'xml' because 'lxml' and 'html.parser' lowercase the navPoint tag name. + ncx = bs4.BeautifulSoup(ncx, 'xml') + for point in ncx.select('navPoint > content[src]'): + link = point['src'] + link = self._fix_interlinking_helper(link, rename_map, ncx_parent, old_relative_to) + if not link: + continue + point['src'] = link + + ncx = str(ncx) + self.write_file(ncx_id, ncx) + + def fix_interlinking_opf(self, rename_map): + opf_parent = self.opf_filepath.parent + for opf_item in self.opf.select('guide > reference[href], manifest > item[href]'): + link = opf_item['href'] + link = self._fix_interlinking_helper(link, rename_map, opf_parent) + if not link: + continue + opf_item['href'] = link + + def fix_interlinking(self, rename_map): + self.fix_interlinking_opf(rename_map) + for id in self.get_texts(): + self.fix_interlinking_text(id, rename_map) + self.fix_interlinking_ncx(rename_map) + + def move_nav_to_end(self): + ''' + Move the nav.xhtml file to the end and set linear=no. + ''' + nav = self.get_nav() + if not nav: + return + + spine = self.get_spine_order() + for (index, id) in enumerate(spine): + if id == nav: + spine.append(spine.pop(index)) + break + self.set_spine_order(spine) + + self.set_spine_linear(nav, False) + + def normalize_directory_structure(self): + # This must come before the opf rewrite because that would affect the + # location of all all manifest item hrefs. + manifest_items = self.get_manifest_items(soup=True) + old_filepaths = {item['id']: self.get_filepath(item['id']) for item in manifest_items} + old_ncx = self.get_ncx() + try: + old_ncx_parent = self.get_filepath(self.get_ncx()).parent + except Exception: + old_ncx_parent = None + + if self.opf_filepath.parent == self.root_directory: + oebps = self.root_directory.with_child('OEBPS') + os.makedirs(oebps.absolute_path) + self.write_opf() + new_opf_path = oebps.with_child(self.opf_filepath.basename) + os.rename(self.opf_filepath.absolute_path, new_opf_path.absolute_path) + container = self.read_container_xml() + rootfile = container.find('rootfile', {'full-path': self.opf_filepath.basename}) + rootfile['full-path'] = new_opf_path.relative_to(self.root_directory, simple=True) + self.write_container_xml(container) + self.opf_filepath = new_opf_path + + rename_map = {} + for manifest_item in manifest_items: + old_filepath = old_filepaths[manifest_item['id']] + + directory = get_directory_for_mimetype(manifest_item['media-type']) + directory = self.opf_filepath.parent.with_child(directory) + os.makedirs(directory.absolute_path, exist_ok=True) + + new_filepath = directory.with_child(old_filepath.basename) + rename_map[old_filepath] = new_filepath + os.rename(old_filepath.absolute_path, new_filepath.absolute_path) + manifest_item['href'] = new_filepath.relative_to(self.opf_filepath.parent, simple=True) + + self.fix_interlinking_opf(rename_map) + for id in self.get_texts(): + self.fix_interlinking_text(id, rename_map, old_relative_to=old_filepaths[id].parent) + self.fix_interlinking_ncx(rename_map, old_relative_to=old_ncx_parent) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..363d5e3 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +bs4 +html5lib +tinycss2 +voussoirkit