# epubfile/epubfile.py

import argparse
import bs4
import copy
import functools
import html
import io
import mimetypes
import os
import random
import re
import string
import sys
import tempfile
import tinycss2
import urllib.parse
import uuid
import zipfile

from voussoirkit import betterhelp
from voussoirkit import interactive
from voussoirkit import pathclass
from voussoirkit import pipeable
from voussoirkit import vlogging

# Module-level logger, used by the compress / extract helpers below.
log = vlogging.get_logger(__name__, 'epubfile')

# Maps a tag name to the attributes on that tag which hold a link to another
# resource. NOTE(review): presumably consulted by the interlinking code later
# in the file -- confirm against its callers.
HTML_LINK_PROPERTIES = {
    'a': ['href'],
    'audio': ['src'],
    'image': ['href', 'xlink:href'],
    'img': ['src'],
    'link': ['href'],
    'script': ['src'],
    'source': ['src'],
    'track': ['src'],
    'video': ['src', 'poster'],
}

# Maps a file extension (without the leading dot) to the mimetype that should
# be declared in the manifest. get_mimetype_for_basename checks this table
# before falling back to the stdlib mimetypes module.
EXTENSION_MIMETYPES = {
    'htm': 'application/xhtml+xml',
    'html': 'application/xhtml+xml',
    'otf': 'font/otf',
    'pls': 'application/pls+xml',
    'smi': 'application/smil+xml',
    'smil': 'application/smil+xml',
    'sml': 'application/smil+xml',
    'ttf': 'font/ttf',
    'woff': 'font/woff',
    'woff2': 'font/woff2',
    'xhtml': 'application/xhtml+xml',
    'xpgt': 'application/vnd.adobe-page-template+xml',
}

# Maps a mimetype to the directory, relative to the directory containing the
# OPF, in which add_file stores files of that type. Keys are either a full
# mimetype or just the major type before the slash; get_directory_for_mimetype
# tries the full mimetype first.
MIMETYPE_DIRECTORIES = {
    'application/font-sfnt': 'Fonts',
    'application/x-dtbncx+xml': '.',
    'application/x-font-ttf': 'Fonts',
    'application/xhtml+xml': 'Text',
    'audio': 'Audio',
    'font': 'Fonts',
    'image': 'Images',
    'text/css': 'Styles',
    'video': 'Video',
}

# The entire content of the `mimetype` file at the root of the book.
MIMETYPE_FILE_TEMPLATE = 'application/epub+zip'

# META-INF/container.xml for a new book. It points at OEBPS/content.opf, which
# is where Epub.new writes the OPF.
CONTAINER_XML_TEMPLATE = '''
<?xml version="1.0" encoding="UTF-8"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
    <rootfiles>
        <rootfile full-path="OEBPS/content.opf" media-type="application/oebps-package+xml"/>
   </rootfiles>
</container>
'''.strip()

# The OPF (package document) for a new book. Format with `uuid`. The manifest
# and spine start out containing only the ncx and nav files that Epub.new
# creates alongside it.
OPF_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<package version="3.0" unique-identifier="BookId" xmlns="http://www.idpf.org/2007/opf">
  <metadata xmlns:dc="http://purl.org/dc/elements/1.1/">
    <dc:identifier id="BookId">{uuid}</dc:identifier>
  </metadata>
  <manifest>
    <item id="ncx" href="toc.ncx" media-type="application/x-dtbncx+xml"/>
    <item id="nav.xhtml" href="Text/nav.xhtml" media-type="application/xhtml+xml" properties="nav"/>
  </manifest>
  <spine toc="ncx">
    <itemref idref="nav.xhtml" linear="no"/>
  </spine>
</package>
'''.strip()

# toc.ncx skeleton. Format with `uuid` and `navpoints`, where navpoints is the
# markup to place inside <navMap> (Epub.new passes an empty string).
NCX_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1">
  <head>
    <meta name="dtb:uid" content="{uuid}" />
  </head>
<navMap>
{navpoints}
</navMap>
</ncx>
'''.strip()

# nav.xhtml skeleton. Format with `toc_contents`, the markup to place inside
# the toc <nav> after the heading (Epub.new passes an empty string).
NAV_XHTML_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
  <meta charset="utf-8"/>
</head>
<body epub:type="frontmatter">
  <nav epub:type="toc" id="toc">
    <h1>Table of Contents</h1>
    {toc_contents}
  </nav>
</body>
</html>
'''.strip()

# Skeleton of an xhtml text page. Format with `head_content` and
# `body_content`. NOTE(review): no caller is visible in this part of the
# file -- confirm where it is used.
TEXT_TEMPLATE = '''
<?xml version="1.0" encoding="utf-8"?>
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml" xmlns:epub="http://www.idpf.org/2007/ops">
<head>
{head_content}
</head>
<body>
{body_content}
</body>
</html>
'''.strip()

# EPUB COMPRESSION
################################################################################
def compress_epub(directory, epub_filepath):
    '''
    Zip the contents of `directory` into an epub file and return the path of
    the epub that was written.

    The `mimetype` file is added first, with ZipFile's default (stored)
    compression. Every other file is deflated, and `sigil.cfg` is left out of
    the archive. If epub_filepath does not have the epub extension, it
    is added.

    Raises ValueError if the epub path is inside the directory being zipped.
    '''
    source_dir = pathclass.Path(directory)
    destination = pathclass.Path(epub_filepath)
    log.debug('Compressing %s to %s.', source_dir.absolute_path, destination.absolute_path)

    if destination in source_dir:
        raise ValueError('Epub inside its own directory')

    if destination.extension != 'epub':
        destination = destination.add_extension('epub')

    mimetype_file = source_dir.with_child('mimetype')
    # The mimetype is written separately, and sigil.cfg is never packaged.
    skipped = [mimetype_file, source_dir.with_child('sigil.cfg')]

    with zipfile.ZipFile(destination, 'w') as archive:
        archive.write(mimetype_file, arcname='mimetype')
        for file in source_dir.walk():
            if file in skipped:
                continue
            # Zip member names always use forward slashes.
            arcname = file.relative_to(source_dir).replace('\\', '/')
            archive.write(file, arcname=arcname, compress_type=zipfile.ZIP_DEFLATED)
    return destination

def extract_epub(epub_filepath, directory):
    '''
    Unzip every member of the epub file into `directory`.
    '''
    source = pathclass.Path(epub_filepath)
    destination = pathclass.Path(directory)
    log.debug('Extracting %s to %s.', source.absolute_path, destination.absolute_path)

    with zipfile.ZipFile(source, 'r') as archive:
        archive.extractall(destination)

# XHTML TOOLS
################################################################################
def fix_xhtml(xhtml, return_soup=False):
    '''
    Normalize a text page so that it has an <html> element carrying the xhtml
    namespace, a <body>, a doctype, and an xml declaration.

    xhtml:
        Either markup to be parsed with html5lib, or an existing
        bs4.BeautifulSoup object, which is modified in place.

    return_soup:
        If True, return the BeautifulSoup object instead of its string form.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        soup = xhtml
    else:
        # For the text pages, html5lib is the best because html.parser and lxml
        # lowercase all attributes, breaking svg's case-sensitive viewBox etc.
        # and xml loses all of the namespaces when namespaced elements are nested
        # like <html xmlns="..."><svg xmlns:xlink="..."></svg></html>.
        # The downside of html5lib is it turns the xml declaration at the top
        # into a comment which we must undo manually.
        soup = bs4.BeautifulSoup(xhtml, 'html5lib')

    if not soup.html:
        # Named html_tag so that the imported `html` module is not shadowed.
        html_tag = soup.new_tag('html')
        for child in list(soup.contents):
            html_tag.append(child)
        soup.append(html_tag)

    if not soup.html.body:
        body = soup.new_tag('body')
        for child in list(soup.html.contents):
            body.append(child)
        soup.html.append(body)

    if not soup.html.get('xmlns'):
        soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    try:
        doctype = next(i for i in soup.contents if isinstance(i, bs4.Doctype))
    except StopIteration:
        doctype = bs4.Doctype('html')
        soup.html.insert_before(doctype)

    # html5lib turns the xml declaration into a comment which we must revert.
    # Only a comment that actually holds the declaration ("?xml ...?") is
    # replaced, so that an ordinary comment at the top of the document is
    # not destroyed.
    first = soup.contents[0] if soup.contents else None
    if isinstance(first, bs4.Comment) and first.strip().startswith('?xml'):
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        soup.insert(0, declaration)
        first.extract()

    try:
        declaration = next(i for i in soup.contents if isinstance(i, bs4.Declaration))
    except StopIteration:
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        doctype.insert_before(declaration)

    if return_soup:
        return soup
    return str(soup)

def xhtml_replacements(xhtml, replacements, return_soup=False):
    '''
    Apply a series of regex substitutions to the markup, in order.

    xhtml:
        A string of markup, or a bs4.BeautifulSoup object which is converted
        to a string first.

    replacements:
        An iterable of (pattern, replacement) pairs for re.sub. The patterns
        are applied with re.DOTALL.

    return_soup:
        If True, the result is parsed with html5lib and returned as a
        BeautifulSoup object. Otherwise the string is returned.
    '''
    text = str(xhtml) if isinstance(xhtml, bs4.BeautifulSoup) else xhtml

    for (pattern, substitution) in replacements:
        text = re.sub(pattern, substitution, text, flags=re.DOTALL)

    if not return_soup:
        return text

    return bs4.BeautifulSoup(text, 'html5lib')

def demote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every h1 into h2, h2 into h3, and so on down to h5 into h6.
    Existing h6 elements are left alone.
    '''
    # Deepest level first, so that a freshly demoted header is not demoted
    # again by a later substitution.
    replacements = [
        (rf'<h{level}([^>]*?>.*?)</h{level}>', rf'<h{level + 1}\1</h{level + 1}>')
        for level in (5, 4, 3, 2, 1)
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)

def promote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every h2 into h1, h3 into h2, and so on up to h6 into h5.
    Existing h1 elements are left alone.
    '''
    # Shallowest level first, so that a freshly promoted header is not
    # promoted again by a later substitution.
    replacements = [
        (rf'<h{level}([^>]*?>.*?)</h{level}>', rf'<h{level - 1}\1</h{level - 1}>')
        for level in (2, 3, 4, 5, 6)
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)

# MIMETYPE DECISIONMAKERS
################################################################################
def get_directory_for_mimetype(mime):
    '''
    Return the name of the directory in which files of this mimetype should
    be stored. The full mimetype is looked up first, then just the major type
    before the slash, and finally 'Misc' is used.
    '''
    exact = MIMETYPE_DIRECTORIES.get(mime)
    if exact:
        return exact

    major_type = mime.split('/')[0]
    by_major_type = MIMETYPE_DIRECTORIES.get(major_type)
    if by_major_type:
        return by_major_type

    return 'Misc'

def get_mimetype_for_basename(basename):
    '''
    Return the mimetype to declare for a file with this name.

    The extension is looked up in EXTENSION_MIMETYPES first, then guessed
    with the stdlib mimetypes module, and finally falls back to
    'application/octet-stream'.
    '''
    # The table's keys are lowercase. mimetypes.guess_type already falls back
    # to a case-insensitive match, so lowercase here as well; otherwise a name
    # like PAGE.XHTML would skip the table and get whatever the stdlib guesses.
    extension = os.path.splitext(basename)[1].strip('.').lower()
    mime = (
        EXTENSION_MIMETYPES.get(extension) or
        mimetypes.guess_type(basename)[0] or
        'application/octet-stream'
    )
    return mime

# OPF ELEMENT GENERATORS
################################################################################
def make_manifest_item(id, href, mime):
    '''
    Create and return a new manifest <item> tag with the given id, href, and
    media-type attributes.
    '''
    # Escape the values before interpolating them into markup, so that a
    # quote, ampersand, or entity-like text in an id or href comes out of the
    # parser as the same string that went in.
    id = html.escape(str(id), quote=True)
    href = html.escape(str(href), quote=True)
    mime = html.escape(str(mime), quote=True)
    manifest_item = f'<item id="{id}" href="{href}" media-type="{mime}"/>'
    # 'html.parser' just for having the simplest output.
    manifest_item = bs4.BeautifulSoup(manifest_item, 'html.parser')
    return manifest_item.item

def make_meta_item(content=None, attrs=None):
    '''
    Create and return a new <meta> tag.

    content:
        Optional text to place inside the tag.

    attrs:
        Optional dict of attributes to set on the tag.
    '''
    # 'html.parser' just for having the simplest output.
    soup = bs4.BeautifulSoup('<meta/>', 'html.parser')
    meta = soup.meta
    if content:
        # html.parser treats meta as a void element, so text written between
        # <meta> and </meta> would land beside the tag instead of inside it.
        # Append it to the tag explicitly.
        meta.append(str(content))
    if attrs:
        # The attributes belong on the <meta> tag itself, not on the
        # BeautifulSoup document object that wraps it.
        meta.attrs.update(attrs)
    return meta

def make_spine_item(id):
    '''
    Create and return a new spine <itemref> tag pointing at the given
    manifest id.
    '''
    # Escape the id before interpolating it into markup, so that a quote,
    # ampersand, or entity-like text comes out of the parser unchanged.
    id = html.escape(str(id), quote=True)
    spine_item = f'<itemref idref="{id}"/>'
    # 'html.parser' just for having the simplest output.
    spine_item = bs4.BeautifulSoup(spine_item, 'html.parser')
    return spine_item.itemref

# DECORATORS
################################################################################
def writes(method):
    '''
    Decorator for methods that write to files inside the book. If the book
    was opened in read-only mode, calling the decorated method raises
    ReadOnly instead of running it.
    '''
    @functools.wraps(method)
    def guarded(self, *args, **kwargs):
        if not self.read_only:
            return method(self, *args, **kwargs)
        raise ReadOnly(method.__qualname__)
    return guarded

# CLASSES
################################################################################
class EpubfileException(Exception):
    '''
    Base class for the exceptions raised by this module.

    Subclasses set `error_message` to a str.format template. The arguments
    given to the constructor are kept as given_args / given_kwargs and are
    also used to fill in the template, which becomes str(exception).
    '''
    error_message = ''

    def __init__(self, *args, **kwargs):
        super().__init__()
        formatted = self.error_message.format(*args, **kwargs)
        self.given_args = args
        self.given_kwargs = kwargs
        # This instance attribute holds the filled-in message and hides the
        # class-level template of the same name.
        self.error_message = formatted
        self.args = (formatted, args, kwargs)

    def __str__(self):
        return self.error_message

class InvalidEpub(EpubfileException):
    '''
    Raised when the book fails validation. Takes the path of the epub and a
    message describing the problem.
    '''
    error_message = '{} is invalid: {}'

class FileExists(EpubfileException):
    '''
    Raised when a file would be created at a path that is already occupied.
    Takes the path.
    '''
    error_message = 'There is already a file at {}.'

class IDExists(EpubfileException):
    '''
    Raised when a manifest id is already in use. Takes the id.
    '''
    error_message = 'There is already a file with id {}.'

class NotInManifest(EpubfileException):
    '''
    Raised when a manifest id cannot be found. Takes the id.
    '''
    error_message = '{} is not in the manifest.'

class NotInSpine(EpubfileException):
    '''
    Raised when an id has no itemref in the spine. Takes the id.
    '''
    error_message = '{} is not in the spine.'

class ReadOnly(EpubfileException):
    '''
    Raised when a write operation is attempted on a book that was opened in
    read-only mode. Takes the name of the operation.
    '''
    error_message = 'Can\'t do {} in read-only mode.'

class Epub:
    def __init__(self, epub_path, *, read_only=False):
        '''
        epub_path:
            The path to an .epub file, or to a directory that contains unzipped
            epub contents. A tempfile.TemporaryDirectory is also accepted, in
            which case a reference to it is kept on this object.

        read_only:
            If True, all write operations will be forbidden. The benefit is that
            the .epub file will not be extracted. This is recommended if you
            only need to read data from a book and don't need to write to it.

        Raises InvalidEpub if container.xml does not list any rootfile.
        '''
        epub_path = self._keep_tempdir_reference(epub_path)
        epub_path = pathclass.Path(epub_path)
        self.original_path = epub_path
        self.read_only = read_only

        if epub_path.is_dir:
            self.__init_from_dir(epub_path)
        elif self.read_only:
            self.__init_from_file_read_only(epub_path)
        else:
            self.__init_from_file(epub_path)

        opfs = self.get_opfs()
        if not opfs:
            # Without this check, the caller would get a bare IndexError from
            # the line below.
            raise InvalidEpub(epub_path, 'container.xml does not list any rootfile.')
        # Only the first rootfile is used.
        self.opf_filepath = opfs[0]
        self.opf = self.read_opf(self.opf_filepath)

    def __init_from_dir(self, directory):
        '''
        Use a directory of unzipped epub contents as the book's root.
        '''
        self.root_directory = pathclass.Path(directory)
        self.is_zip = False

    def __init_from_file_read_only(self, epub_path):
        '''
        Open the .epub file as a zip without extracting it. In this mode
        root_directory is the path of the zip file itself.
        '''
        # is_zip is not the same thing as read_only: a directory can also be
        # opened in read-only mode. A read-only directory just doesn't need
        # any special setup, because forbidding writes is all it takes.
        zip_path = pathclass.Path(epub_path)
        self.root_directory = zip_path
        self.zip = zipfile.ZipFile(zip_path)
        self.is_zip = True

    def __init_from_file(self, epub_path):
        '''
        Extract the .epub file into a new temporary directory and use that
        directory as the book's root.
        '''
        tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
        extract_epub(epub_path, tempdir.name)
        # Hold on to the TemporaryDirectory object so that the extracted
        # files stay around for as long as this book does.
        self.__init_from_dir(self._keep_tempdir_reference(tempdir))

    def __repr__(self):
        if self.read_only:
            return f'Epub({repr(self.root_directory.absolute_path)}, read_only=True)'
        else:
            return f'Epub({repr(self.root_directory.absolute_path)})'

    def _fopen(self, *args, **kwargs):
        '''
        Not to be confused with the high level `open_file` method, this method
        is the one that actually reads off the disk.
        '''
        if self.is_zip:
            return self._fopen_zip(*args, **kwargs)
        else:
            return self._fopen_disk(*args, **kwargs)

    def _fopen_disk(self, path, mode, *, encoding=None):
        '''
        If the book was opened as a directory, we can read files off disk with
        Python open.
        '''
        return open(path, mode, encoding=encoding)

    def _fopen_zip(self, path, mode, *, encoding=None):
        '''
        Opener for books that were opened as a read-only zip: the file is
        opened out of the zip.

        path may be the absolute path including the epub itself, or the
        relative path of the member inside the zip.

        Raises ValueError if mode is not one of r, w, rb, wb.
        '''
        # When reading from a zip, root_directory is the zip file itself.
        # A request for D:\book.epub\dir1\file1.html has to be converted to
        # the member name dir1\file1.html, while a path that is already
        # relative is used as given.
        as_path = path if isinstance(path, pathclass.Path) else pathclass.Path(path)

        if as_path in self.root_directory:
            # An absolute path that includes the epub.
            member = as_path.relative_to(self.root_directory, simple=True)
        else:
            # Either a relative path already inside the epub, or an absolute
            # path somewhere totally wrong.
            member = os.fspath(path)

        # Zip files always use forward slash internally, even on Windows.
        member = member.replace('\\', '/')

        if mode not in ('rb', 'r', 'wb', 'w'):
            raise ValueError('mode should be r, w, rb, or wb.')

        # ZipFile.open only takes 'r' or 'w', which is the first letter of
        # our mode. At this time fopen_zip is only used for read-only epubs
        # anyway.
        raw = self.zip.open(member, mode[0])
        if mode.endswith('b'):
            return raw
        return io.TextIOWrapper(raw, encoding)

    def _keep_tempdir_reference(self, p):
        '''
        If the given path object is actually a tempfile.TemporaryDirectory,
        store that TD reference here so that it does not get cleaned up even
        if the caller releases it. Then return the actual filepath.
        '''
        if isinstance(p, tempfile.TemporaryDirectory):
            self._tempdir_reference = p
            p = p.name
        return p

    def assert_file_not_exists(self, filepath):
        if filepath.exists:
            existing = filepath.relative_to(self.opf_filepath.parent)
            raise FileExists(existing)

    def assert_id_not_exists(self, id):
        '''
        Raise IDExists if the manifest already contains an item with this id.
        '''
        try:
            self.get_manifest_item(id)
        except NotInManifest:
            return
        raise IDExists(id)

    # VALIDATION
    ############################################################################
    @writes
    def auto_correct_and_validate(self):
        '''
        Create the mimetype file if it is missing, then check that every
        manifest item points to a file that exists.

        Raises InvalidEpub for the first manifest item whose file is missing.
        '''
        # Ensure we have a mimetype file.
        mimetype_file = self.root_directory.with_child('mimetype')
        if not mimetype_file.exists:
            with self._fopen(mimetype_file, 'w', encoding='utf-8') as handle:
                handle.write(MIMETYPE_FILE_TEMPLATE)

        # Assert that all manifest items exist on disk.
        for item in self.get_manifest_items(soup=True):
            if self.get_filepath(item['id']).exists:
                continue
            message = f'Manifest item {item["id"]} = {item["href"]} does not exist.'
            raise InvalidEpub(self.original_path, message)

    # LOADING AND SAVING
    ############################################################################
    @classmethod
    def new(cls):
        '''
        Create a new, empty book and return it. The book lives in a temporary
        directory, so don't forget to call `save` when you are done.
        '''
        uid = uuid.uuid4().urn

        tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
        root = pathclass.Path(tempdir.name)

        starting_files = {
            'mimetype': MIMETYPE_FILE_TEMPLATE,
            'META-INF/container.xml': CONTAINER_XML_TEMPLATE,
            'OEBPS/content.opf': OPF_TEMPLATE.format(uuid=uid),
            'OEBPS/toc.ncx': NCX_TEMPLATE.format(uuid=uid, navpoints=''),
            'OEBPS/Text/nav.xhtml': NAV_XHTML_TEMPLATE.format(toc_contents=''),
        }
        for (relative_path, content) in starting_files.items():
            filepath = root.join(relative_path)
            filepath.parent.makedirs(exist_ok=True)
            # The file is opened directly instead of with self._fopen because
            # the epub hasn't been instantiated yet! At this time, a book made
            # with Epub.new is always created as a directory. Creating a book
            # directly into a fresh zip file is not supported.
            with filepath.open('w', encoding='utf-8') as handle:
                handle.write(content)

        return cls(tempdir)

    @writes
    def save(self, epub_filepath):
        '''
        Write the OPF, validate the book, and zip it to epub_filepath.

        Returns the path of the epub that was written. This can differ from
        epub_filepath, because compress_epub adds the epub extension when the
        given path doesn't have it.

        Raises InvalidEpub if validation fails.
        '''
        self.write_opf()
        self.auto_correct_and_validate()
        return compress_epub(self.root_directory, epub_filepath)

    # CONTAINER & OPF
    ############################################################################
    def get_opfs(self):
        '''
        Read the container.xml and return the paths of all available OPFs
        (aka rootfiles), each joined onto the book's root directory.
        '''
        container = self.read_container_xml()
        return [
            self.root_directory.join(rootfile.get('full-path'))
            for rootfile in container.find_all('rootfile')
        ]

    def read_container_xml(self):
        '''
        Parse META-INF/container.xml and return it as a BeautifulSoup object.
        '''
        container_xml_path = self.root_directory.join('META-INF/container.xml')
        # BeautifulSoup reads the whole handle while constructing the soup, so
        # the handle can be closed as soon as that is done.
        with self._fopen(container_xml_path, 'r', encoding='utf-8') as handle:
            # 'xml' and 'html.parser' seem about even here except that
            # html.parser doesn't self-close.
            container = bs4.BeautifulSoup(handle, 'xml')
        return container

    def read_opf(self, rootfile):
        '''
        Parse the OPF file at the given path and return it as a BeautifulSoup
        object.
        '''
        rootfile = pathclass.Path(rootfile)
        with self._fopen(rootfile, 'r', encoding='utf-8') as handle:
            rootfile_xml = handle.read()
        # 'html.parser' preserves namespacing the best, but unfortunately it
        # botches the <meta> items because it wants them to be self-closing
        # and the string contents come out. We will fix in just a moment.
        # This is still preferable to 'xml' which handles the dc: prefixes when
        # parsing only the metadata block, but loses all namespaces when parsing
        # the whole doc. 'lxml' wraps the content in <html><body> and also
        # botches the metas so it's not any better than html.parser.
        opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser')

        # Let's fix those metas.
        metas = opf.select('meta')
        for meta in metas:
            neighbor = meta.next_element
            if neighbor is None:
                # The meta is the very last node of the document, so there is
                # nothing after it to pull in.
                break
            if neighbor.parent != meta.parent:
                # This happens on the last meta, neighbor is outside of the
                # meta's parent element.
                break
            if not isinstance(neighbor, bs4.element.NavigableString):
                continue
            meta.append(neighbor.extract().strip())

        return opf

    @writes
    def write_container_xml(self, container):
        '''
        Write the given markup to META-INF/container.xml.

        container:
            A string of markup, or a BeautifulSoup object which is converted
            to a string.
        '''
        if isinstance(container, bs4.BeautifulSoup):
            container = str(container)
        container_xml_path = self.root_directory.join('META-INF/container.xml')
        # Close the handle when done, so that the content is flushed to disk
        # before this method returns.
        with self._fopen(container_xml_path, 'w', encoding='utf-8') as container_xml:
            container_xml.write(container)

    @writes
    def write_opf(self):
        '''
        Write the in-memory OPF back to its file.
        '''
        handle = self._fopen(self.opf_filepath, 'w', encoding='utf-8')
        with handle:
            handle.write(str(self.opf))

    # FILE OPERATIONS
    ############################################################################
    @writes
    def add_file(self, id, basename, content):
        '''
        Write a new file into the book and add it to the manifest. xhtml files
        are also passed through fix_xhtml and appended to the spine.

        id:
            The manifest id for the new file.

        basename:
            The name for the new file. Only the basename is used; the
            directory is chosen according to the file's mimetype.

        content:
            The str or bytes to write.

        Returns the id.

        Raises IDExists or FileExists if the id or the destination path is
        already taken, and TypeError if content is neither str nor bytes.
        '''
        self.assert_id_not_exists(id)

        basename = os.path.basename(basename)
        mime = get_mimetype_for_basename(basename)
        is_xhtml = (mime == 'application/xhtml+xml')

        opf_directory = self.opf_filepath.parent
        target_directory = opf_directory.with_child(get_directory_for_mimetype(mime))
        target_directory.makedirs(exist_ok=True)
        filepath = target_directory.with_child(basename)

        self.assert_file_not_exists(filepath)

        if is_xhtml:
            # bs4 converts bytes to str so this must come before the handle choice.
            content = fix_xhtml(content)

        if isinstance(content, str):
            handle = self._fopen(filepath, 'w', encoding='utf-8')
        elif isinstance(content, bytes):
            handle = self._fopen(filepath, 'wb')
        else:
            raise TypeError(f'content should be str or bytes, not {type(content)}.')

        with handle:
            handle.write(content)

        # The manifest href is relative to the OPF, uses forward slashes, and
        # is url-quoted.
        href = filepath.relative_to(opf_directory, simple=True).replace('\\', '/')
        self.opf.manifest.append(make_manifest_item(id, urllib.parse.quote(href), mime))

        if is_xhtml:
            self.opf.spine.append(make_spine_item(id))

        return id

    @writes
    def easy_add_file(self, filepath):
        '''
        Add a file from disk into the book. The file's basename is used as
        both the manifest ID and the name inside the book. Returns the ID.
        '''
        source = pathclass.Path(filepath)
        with self._fopen(source, 'rb') as handle:
            content = handle.read()
            return self.add_file(
                id=source.basename,
                basename=source.basename,
                content=content,
            )

    @writes
    def delete_file(self, id):
        '''
        Remove the file with this id from the manifest, from the spine if it
        is there, and from disk.

        Raises NotInManifest if the id is unknown.
        '''
        manifest_entry = self.get_manifest_item(id)
        # The path must be resolved while the manifest entry still exists.
        path_on_disk = self.get_filepath(id)

        manifest_entry.extract()

        spine_entry = self.opf.spine.find('itemref', {'idref': id})
        if spine_entry:
            spine_entry.extract()

        os.remove(path_on_disk)

    def get_filepath(self, id):
        '''
        Return the path of the file with this manifest id. The href is tried
        as written first; if nothing exists there, the url-unquoted href is
        used instead.

        Raises NotInManifest if the id is unknown.
        '''
        href = self.get_manifest_item(id)['href']
        opf_directory = self.opf_filepath.parent

        literal = opf_directory.join(href)
        if literal.exists:
            return literal

        # TODO: In the case of a read-only zipped epub, the exists check will
        # definitely fail, so the unquoted name is always the one returned
        # there. Double-check the consequences of this and make a patch for
        # file exists inside zip check if needed.
        return opf_directory.join(urllib.parse.unquote(href))

    def open_file(self, id, mode):
        '''
        Open the file with this manifest id and return the handle.

        mode:
            Either 'r' or 'w'. xhtml, ncx, and text/* files are opened in text
            mode as utf-8. Everything else is opened in binary mode.

        Raises ValueError for any other mode, and ReadOnly if mode is 'w' and
        the book is read-only.
        '''
        if mode not in ('r', 'w'):
            raise ValueError(f'mode should be either r or w, not {mode}.')

        if self.read_only and mode == 'w':
            raise ReadOnly(self.open_file.__qualname__)

        filepath = self.get_filepath(id)
        mime = self.get_manifest_item(id)['media-type']

        text_mimes = ('application/xhtml+xml', 'application/x-dtbncx+xml')
        if mime in text_mimes or mime.startswith('text/'):
            return self._fopen(filepath, mode, encoding='utf-8')

        return self._fopen(filepath, mode + 'b')

    def read_file(self, id, *, soup=False):
        '''
        Return the content of the file with this manifest id, as str or bytes
        depending on its mimetype.

        soup:
            If True and the file is xhtml, the content is passed through
            fix_xhtml and returned as a BeautifulSoup object instead.
        '''
        # text vs binary handled by open_file.
        # The handle is closed as soon as the content has been read.
        with self.open_file(id, 'r') as handle:
            content = handle.read()
        if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml':
            return fix_xhtml(content, return_soup=True)
        return content

    @writes
    def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
        '''
        Rename one or more files on disk, then update the links to them.

        id:
            Either a single manifest id, or a dict of {id: new_basename} to
            rename several files at once.

        new_basename:
            The new name, required when id is a single id. If a new name has
            no extension, the old file's extension is added.

        fix_interlinking:
            If True, self.fix_interlinking is called with the rename map.
            Otherwise self.fix_interlinking_opf is called with it.

        Returns a dict of {old_filepath: new_filepath}.
        '''
        if isinstance(id, dict):
            basename_map = id
        elif new_basename is None:
            raise TypeError('new_basename can be omitted if id is a dict.')
        else:
            basename_map = {id: new_basename}

        rename_map = {}
        for (file_id, basename) in basename_map.items():
            source = self.get_filepath(file_id)
            destination = source.parent.with_child(basename)
            if not destination.extension:
                destination = destination.add_extension(source.extension)
            self.assert_file_not_exists(destination)
            os.rename(source, destination)
            rename_map[source] = destination

        fixer = self.fix_interlinking if fix_interlinking else self.fix_interlinking_opf
        fixer(rename_map)

        return rename_map

    @writes
    def write_file(self, id, content):
        '''
        Overwrite the file with this manifest id. A BeautifulSoup object is
        converted to a string before writing.
        '''
        data = str(content) if isinstance(content, bs4.BeautifulSoup) else content

        # text vs binary handled by open_file.
        with self.open_file(id, 'w') as handle:
            handle.write(data)

    # GETTING THINGS
    ############################################################################
    def get_manifest_items(self, filter='', soup=False, spine_order=False):
        query = f'item{filter}'
        items = self.opf.manifest.select(query)

        if spine_order:
            items = {x['id']: x for x in items}
            ordered_items = []

            for spine_id in self.get_spine_order():
                ordered_items.append(items.pop(spine_id))
            ordered_items.extend(items.values())
            items = ordered_items

        if soup:
            return items

        return [x['id'] for x in items]

    def get_manifest_item(self, id):
        item = self.opf.manifest.find('item', {'id': id})
        if not item:
            raise NotInManifest(id)
        return item

    def get_fonts(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type*="font"],[media-type*="opentype"]',
            soup=soup,
        )

    def get_images(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type^="image/"]',
            soup=soup,
        )

    def get_media(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type^="video/"],[media-type^="audio/"]',
            soup=soup,
        )

    def get_nav(self, *, soup=False):
        nav = self.opf.manifest.find('item', {'properties': 'nav'})
        if not nav:
            return None
        if soup:
            return nav
        return nav['id']

    def get_ncx(self, *, soup=False):
        ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'})
        if not ncx:
            return None
        if soup:
            return ncx
        return ncx['id']

    def get_styles(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type="text/css"]',
            soup=soup,
        )

    def get_texts(self, *, soup=False, skip_nav=False):
        texts = self.get_manifest_items(
            filter='[media-type="application/xhtml+xml"]',
            soup=True,
            spine_order=True,
        )
        if skip_nav:
            texts = [x for x in texts if x.get('properties') != 'nav']

        if soup:
            return texts
        return [x['id'] for x in texts]

    # COVER
    ############################################################################
    def get_cover_image(self, *, soup=False):
        cover = self.opf.manifest.find('item', {'properties': 'cover-image'})
        if cover:
            return cover if soup else cover['id']

        cover = self.opf.metadata.find('meta', {'name': 'cover'})
        if cover:
            return cover if soup else cover['content']

        return None

    @writes
    def remove_cover_image(self):
        current_cover = self.get_cover_image(soup=True)
        if not current_cover:
            return

        del current_cover['properties']

        meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if meta:
            meta.extract()

    @writes
    def set_cover_image(self, id):
        if id is None:
            self.remove_cover_image()

        current_cover = self.get_cover_image(soup=True)

        if not current_cover:
            pass
        elif current_cover['id'] == id:
            return
        else:
            del current_cover['properties']

        manifest_item = self.get_manifest_item(id)
        manifest_item['properties'] = 'cover-image'

        current_meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if current_meta:
            current_meta['content'] = id
        else:
            meta = make_meta_item(attrs={'name': 'cover', 'content': id})
            self.opf.metadata.append(meta)

    # SPINE
    ############################################################################
    def get_spine_order(self, *, linear_only=False):
        items = self.opf.spine.find_all('itemref')
        if linear_only:
            items = [x for x in items if x.get('linear') != 'no']
        return [x['idref'] for x in items]

    @writes
    def set_spine_order(self, ids):
        manifest_ids = self.get_manifest_items()
        # Fetch the existing entries so that we can preserve their attributes
        # while rearranging, only creating new spine entries for ids that aren't
        # already present.
        spine_items = self.opf.spine.select('itemref')
        spine_items = {item['idref']: item for item in spine_items}
        for id in ids:
            if id not in manifest_ids:
                raise NotInManifest(id)
            if id in spine_items:
                self.opf.spine.append(spine_items.pop(id))
            else:
                self.opf.spine.append(make_spine_item(id))

        # The remainder of the current spine items were not used, so pop them out.
        for spine_item in spine_items.values():
            spine_item.extract()

    def get_spine_linear(self, id):
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if not spine_item:
            raise NotInSpine(id)
        linear = spine_item.get('linear')
        linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
        return linear

    @writes
    def set_spine_linear(self, id, linear):
        '''
        Set linear to yes or no. Or pass None to remove the property.
        '''
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if not spine_item:
            raise NotInSpine(id)

        if linear is None:
            del spine_item['linear']
            return

        if isinstance(linear, str):
            if linear not in ('yes', 'no'):
                raise ValueError(f'Linear must be yes or no, not {linear}.')
        elif isinstance(linear, (bool, int)):
            linear = {True: 'yes', False: 'no'}[bool(linear)]
        else:
            raise TypeError(linear)

        spine_item['linear'] = linear

    # METADATA
    ############################################################################
    def get_authors(self):
        '''
        Return a list with the text of every <dc:creator> in the metadata.
        Elements that do not have exactly one child node are left out.

        Thank you double_j for showing how to deal with find_all not working
        on namespaced tags.
        https://stackoverflow.com/a/44681560
        '''
        authors = []
        for creator in self.opf.metadata.find_all({'dc:creator'}):
            if len(creator.contents) != 1:
                continue
            authors.append(str(creator.contents[0]))
        return authors

    def get_dates(self):
        '''
        Return a list with the text of every <dc:date> in the metadata.
        Elements that do not have exactly one child node are left out.
        '''
        dates = []
        for date in self.opf.metadata.find_all({'dc:date'}):
            if len(date.contents) != 1:
                continue
            dates.append(str(date.contents[0]))
        return dates

    def get_languages(self):
        '''
        Return a list with the text of every <dc:language> in the metadata.
        Elements that do not have exactly one child node are left out.
        '''
        languages = []
        for language in self.opf.metadata.find_all({'dc:language'}):
            if len(language.contents) != 1:
                continue
            languages.append(str(language.contents[0]))
        return languages

    def get_titles(self):
        '''
        Return a list with the text of every <dc:title> in the metadata.
        Elements that do not have exactly one child node are left out.
        '''
        titles = []
        for title in self.opf.metadata.find_all({'dc:title'}):
            if len(title.contents) != 1:
                continue
            titles.append(str(title.contents[0]))
        return titles

    @writes
    def remove_metadata_of_type(self, tag_name):
        '''
        Remove every metadata element with this tag name, for example
        'dc:language'. If a removed element has an id, the <meta> elements
        whose refines attribute points at that id are removed as well.
        '''
        for element in self.opf.metadata.find_all({tag_name}):
            element_id = element.get('id')
            if element_id:
                refinements = self.opf.metadata.find_all('meta', {'refines': f'#{element_id}'})
                for refinement in refinements:
                    refinement.extract()
            element.extract()

    @writes
    def set_languages(self, languages):
        '''
        Replace all of the book's <dc:language> elements with new ones.

        languages: A list like ['en', 'fr', 'ko'].
        '''
        self.remove_metadata_of_type('dc:language')
        markups = (f'<dc:language>{language}</dc:language>' for language in languages)
        for markup in markups:
            self.opf.metadata.append(bs4.BeautifulSoup(markup, 'html.parser'))

    # UTILITIES
    ############################################################################
    @writes
    def fix_all_xhtml(self):
        '''
        Read every text document as soup and write that soup straight back to
        the same id.
        '''
        for text_id in self.get_texts():
            soup = self.read_file(text_id, soup=True)
            self.write_file(text_id, soup)

    @staticmethod
    def _fix_interlinking_helper(link, rename_map, relative_to, old_relative_to=None):
        '''
        Given an old link that was found in one of the documents, and the
        rename_map, produce a new link that points to the new location.
        Returns None if the link is None, has a scheme (like http:), or does
        not correspond to anything in the rename_map.

        relative_to controls the relative pathing for the new link.
        For example, the links inside a text document usually need to step from
        Text/ to ../Images/ to link an image. But the links inside the OPF file
        start with Images/ right away.

        old_relative_to is needed when, for example, all of the files were in a
        single directory together, and now we are splitting them into Text/,
        Images/, etc. In this case, recognizing the old link requires that we
        understand the old relative location, then we can correct it using the
        new relative location. Defaults to relative_to.
        '''
        if link is None:
            return None

        parts = urllib.parse.urlsplit(link)
        if parts.scheme:
            return None

        if old_relative_to is None:
            old_relative_to = relative_to

        # Try the path exactly as written, then resolved against the old
        # directory, then resolved after percent-decoding. First hit wins.
        new_filepath = rename_map.get(parts.path)
        if not new_filepath:
            new_filepath = rename_map.get(old_relative_to.join(parts.path))
        if not new_filepath:
            unquoted = urllib.parse.unquote(parts.path)
            new_filepath = rename_map.get(old_relative_to.join(unquoted))
        if not new_filepath:
            return None

        new_path = new_filepath.relative_to(relative_to, simple=True).replace('\\', '/')
        parts = parts._replace(path=urllib.parse.quote(new_path))
        return parts.geturl()

    @staticmethod
    def _fix_interlinking_css_helper(tag):
        '''
        Given a <style> tag or a tag with a style="" attribute, collect the
        url tokens from its css, for things like `background-image: url("");`.

        Returns (links, commit) where links is a list of tinycss2 URLToken
        objects that the caller may modify, and commit is a function that
        serializes the css and stores it back on the tag. For anything else,
        links is empty and commit does nothing.
        '''
        if not isinstance(tag, bs4.element.Tag):
            return ([], lambda: None)

        if tag.name == 'style' and tag.contents:
            stylesheet = tinycss2.parse_stylesheet(tag.contents[0])
            url_tokens = []
            for rule in stylesheet:
                if not isinstance(rule, tinycss2.ast.QualifiedRule):
                    continue
                for token in rule.content:
                    if isinstance(token, tinycss2.ast.URLToken):
                        url_tokens.append(token)

            def commit_stylesheet():
                return tag.contents[0].replace_with(tinycss2.serialize(stylesheet))

            return (url_tokens, commit_stylesheet)

        if tag.get('style'):
            declarations = tinycss2.parse_declaration_list(tag['style'])
            url_tokens = []
            for declaration in declarations:
                if not isinstance(declaration, tinycss2.ast.Declaration):
                    continue
                for token in declaration.value:
                    if isinstance(token, tinycss2.ast.URLToken):
                        url_tokens.append(token)

            def commit_attribute():
                return tag.attrs.update(style=tinycss2.serialize(declarations))

            return (url_tokens, commit_attribute)

        return ([], lambda: None)

    @writes
    def fix_interlinking_text(self, id, rename_map, old_relative_to=None):
        '''
        Rewrite the links inside one text document so they follow rename_map.

        Covers the attributes listed in HTML_LINK_PROPERTIES as well as url
        tokens inside <style> tags and style="" attributes. Does nothing if
        rename_map is empty.

        old_relative_to: See _fix_interlinking_helper.
        '''
        if not rename_map:
            return

        document_directory = self.get_filepath(id).parent

        def relink(old_link):
            return self._fix_interlinking_helper(
                old_link,
                rename_map,
                document_directory,
                old_relative_to,
            )

        soup = self.read_file(id, soup=True)
        for element in soup.descendants:
            for attribute in HTML_LINK_PROPERTIES.get(element.name, []):
                new_link = relink(element.get(attribute))
                if new_link:
                    element[attribute] = new_link

            (css_tokens, css_commit) = self._fix_interlinking_css_helper(element)
            for css_token in css_tokens:
                new_link = relink(css_token.value)
                if new_link:
                    css_token.value = new_link
            css_commit()

        self.write_file(id, str(soup))

    @writes
    def fix_interlinking_ncx(self, rename_map, old_relative_to=None):
        '''
        Rewrite the src of the navPoints in the ncx file so they follow
        rename_map. Does nothing if rename_map is empty or the book has no ncx.

        old_relative_to: See _fix_interlinking_helper.
        '''
        if not rename_map:
            return

        ncx_id = self.get_ncx()
        if not ncx_id:
            return

        ncx_directory = self.get_filepath(ncx_id).parent
        # 'xml' because 'lxml' and 'html.parser' lowercase the navPoint tag name.
        soup = bs4.BeautifulSoup(self.read_file(ncx_id), 'xml')
        for content in soup.select('navPoint > content[src]'):
            new_src = self._fix_interlinking_helper(
                content['src'],
                rename_map,
                ncx_directory,
                old_relative_to,
            )
            if new_src:
                content['src'] = new_src

        self.write_file(ncx_id, str(soup))

    @writes
    def fix_interlinking_opf(self, rename_map):
        '''
        Rewrite the href of the guide references and manifest items in the
        OPF so they follow rename_map. Does nothing if rename_map is empty.
        '''
        if not rename_map:
            return

        opf_directory = self.opf_filepath.parent
        linked_items = self.opf.select('guide > reference[href], manifest > item[href]')
        for linked_item in linked_items:
            new_href = self._fix_interlinking_helper(linked_item['href'], rename_map, opf_directory)
            if new_href:
                linked_item['href'] = new_href

    @writes
    def fix_interlinking(self, rename_map):
        '''
        Apply rename_map to the links in the OPF, then every text document,
        then the ncx. Does nothing if rename_map is empty.
        '''
        if rename_map:
            self.fix_interlinking_opf(rename_map)
            for text_id in self.get_texts():
                self.fix_interlinking_text(text_id, rename_map)
            self.fix_interlinking_ncx(rename_map)

    def _set_nav_toc(self, nav_id, new_toc):
        '''
        Write the table of contents created by `generate_toc` to the nav file.

        Each <li> gets an <a> linking to its nav_anchor, and the scratch
        attributes are removed. The resulting <ol> replaces the <ol> of the
        nav's <nav epub:type="toc">, which is created at the top of <body> if
        the file doesn't have one.
        '''
        for item in new_toc.find_all('li'):
            anchor = new_toc.new_tag('a')
            anchor['href'] = item['nav_anchor']
            anchor.append(item['text'])
            item.insert(0, anchor)
            for scratch_attribute in ('nav_anchor', 'ncx_anchor', 'text'):
                del item[scratch_attribute]

        soup = self.read_file(nav_id, soup=True)
        toc = soup.find('nav', {'epub:type': 'toc'})
        if not toc:
            toc = soup.new_tag('nav')
            toc['epub:type'] = 'toc'
            soup.body.insert(0, toc)

        old_list = toc.ol
        if old_list:
            old_list.extract()

        toc.append(new_toc.ol)
        self.write_file(nav_id, soup)

    def _set_ncx_toc(self, ncx_id, new_toc):
        '''
        Write the table of contents created by `generate_toc` to the ncx file.

        Every <li> becomes a <navPoint>, numbered in document order, and the
        navPoints replace the previous children of the ncx's navMap.
        '''
        counter = {'next': 1}

        def build_navpoint(item):
            # result:
            # <navPoint id="navPoint{X}" playOrder="{X}">
            #   <navLabel>
            #     <text>{text}</text>
            #   </navLabel>
            #   <content src="{ncx_anchor}" />
            #   {children}
            # </navPoint>
            order = counter['next']
            counter['next'] = order + 1
            navpoint = new_toc.new_tag('navPoint', id=f'navPoint{order}', playOrder=order)

            text = new_toc.new_tag('text')
            text.append(item['text'])
            label = new_toc.new_tag('navLabel')
            label.append(text)
            navpoint.append(label)

            navpoint.append(new_toc.new_tag('content', src=item['ncx_anchor']))

            # Sub-entries are the <li> of an <ol> nested inside this <li>.
            sublist = item.ol
            sub_navpoints = [build_navpoint(sub) for sub in sublist.children] if sublist else []
            for sub_navpoint in sub_navpoints:
                navpoint.append(sub_navpoint)
            return navpoint

        # xml because we have to preserve the casing on navMap.
        soup = bs4.BeautifulSoup(self.read_file(ncx_id), 'xml')
        navmap = soup.navMap
        for old_child in list(navmap.children):
            old_child.extract()

        # Swap each top-level <li> for its navPoint in place, then move the
        # navPoints over to the navMap.
        for item in list(new_toc.ol.children):
            item.replace_with(build_navpoint(item))
        for navpoint in list(new_toc.ol.children):
            navmap.append(navpoint)

        self.write_file(ncx_id, soup)

    @writes
    def generate_toc(self, max_level=None, linear_only=True):
        '''
        Generate the table of contents (toc.ncx and nav.xhtml) by collecting
        <h1>..<h6> throughout all of the text documents.

        Every collected header is given an id="toc_X" attribute, numbered per
        document, and the document is written back so that the toc entries
        have anchors to point to. If the book has neither a nav file nor an
        ncx file, nothing is done.

        max_level: If provided, only collect the headers from h1..hX, inclusive.
            Values above 6 are treated as 6. Raises ValueError if less than 1.

        linear_only: Ignore spine items that are marked as linear=no.
        '''
        def new_list(root=False):
            r = bs4.BeautifulSoup('<ol></ol>', 'html.parser')
            if root:
                return r
            return r.ol

        # Official HTML headers only go up to 6.
        if max_level is None:
            max_level = 6
        elif max_level < 1:
            raise ValueError('max_level must be >= 1.')
        else:
            # The level is interpolated into a single-digit character class
            # below, so a value like 10 would produce the nonsense range
            # [1-10] instead of a wider one.
            max_level = min(max_level, 6)

        header_pattern = re.compile(rf'^h[1-{max_level}]$')

        nav_id = self.get_nav()
        if nav_id:
            nav_filepath = self.get_filepath(nav_id)

        ncx_id = self.get_ncx()
        if ncx_id:
            ncx_filepath = self.get_filepath(ncx_id)

        if not nav_id and not ncx_id:
            return

        # Note: The toc generated by the upcoming loop is in a sort of agnostic
        # format, since it needs to be converted into nav.html and toc.ncx which
        # have different structural requirements. The attributes that I'm using
        # in this initial toc object DO NOT represent any part of the epub format.
        toc = new_list(root=True)

        current_list = toc.ol
        current_list['level'] = None

        spine = self.get_spine_order(linear_only=linear_only)
        spine = [s for s in spine if s != nav_id]

        for file_id in spine:
            file_path = self.get_filepath(file_id)
            soup = self.read_file(file_id, soup=True)

            headers = soup.find_all(header_pattern)
            for (toc_line_index, header) in enumerate(headers, start=1):
                # 'hX' -> X
                level = int(header.name[1])

                header['id'] = f'toc_{toc_line_index}'

                toc_line = toc.new_tag('li')
                toc_line['text'] = header.text

                # In Lithium, the TOC drawer only remembers your position if
                # the page that you're reading corresponds to a TOC entry
                # exactly. Which is to say, if you left off on page5.html,
                # there needs to be a TOC line with href="page5.html" or else
                # the TOC drawer will be in the default position at the top of
                # the list and not highlight the current chapter. Any #anchor
                # in the href will break this feature. So, this code will make
                # the first <hX> on a given page not have an #anchor. If you
                # have a significant amount of text on the page before this
                # header, then this will look bad. But for the majority of
                # cases I expect the first header on the page will be at the
                # very top, or near enough that the Lithium fix is still
                # worthwhile.
                if toc_line_index == 1:
                    hash_anchor = ''
                else:
                    hash_anchor = f'#{header["id"]}'

                if nav_id:
                    relative = file_path.relative_to(nav_filepath.parent, simple=True).replace('\\', '/')
                    toc_line['nav_anchor'] = f'{relative}{hash_anchor}'
                if ncx_id:
                    relative = file_path.relative_to(ncx_filepath.parent, simple=True).replace('\\', '/')
                    toc_line['ncx_anchor'] = f'{relative}{hash_anchor}'

                if current_list['level'] is None:
                    current_list['level'] = level

                while level < current_list['level']:
                    # Because the sub-<ol> are actually a child of the last
                    # <li> of the previous <ol>, we must .parent twice.
                    # The second .parent is conditional because if the current
                    # list is toc.ol, then parent is a Soup document object, and
                    # parenting again would be a mistake. We'll recover from
                    # this in just a moment.
                    current_list = current_list.parent
                    if current_list.name == 'li':
                        current_list = current_list.parent
                    # If the file has headers in a non-ascending order, like the
                    # first header is an h4 and then an h1 comes later, then
                    # this while loop would keep attempting to climb the .parent
                    # which would take us too far, off the top of the tree.
                    # So, if we reach the document object then we've climbed
                    # past the root list and should stop. At that point we
                    # snap the root list's level to this header's level and
                    # use the root list again. The level has to be written to
                    # toc.ol itself, not to the document object, or else this
                    # loop's condition never becomes false.
                    # In the resulting toc, that initial h4 would have the same
                    # toc depth as the later h1 since it never had parents.
                    if current_list is toc:
                        current_list = toc.ol
                        current_list['level'] = level

                if level > current_list['level']:
                    # In order to properly render nested <ol>, you're supposed
                    # to make the new <ol> a child of the last <li> of the
                    # previous <ol>. NOT a child of the prev <ol> directly.
                    # Don't worry, .children can never be empty because on the
                    # first <li> this condition can never occur, and new <ol>s
                    # always receive a child right after being created.
                    _l = new_list()
                    _l['level'] = level
                    final_li = list(current_list.children)[-1]
                    final_li.append(_l)
                    current_list = _l

                current_list.append(toc_line)

            # We have to save the id="toc_X" that we gave to all the headers.
            self.write_file(file_id, soup)

        for ol in toc.find_all('ol'):
            del ol['level']

        if nav_id:
            self._set_nav_toc(nav_id, copy.copy(toc))

        if ncx_id:
            self._set_ncx_toc(ncx_id, copy.copy(toc))

    @writes
    def move_nav_to_end(self):
        '''
        Move the nav.xhtml file to the end of the spine and set its linear=no.
        If the nav is not in the spine yet, it is added at the end.
        Does nothing if the book has no nav file.
        '''
        nav = self.get_nav()
        if not nav:
            return

        spine = self.get_spine_order()
        if nav in spine:
            spine.append(spine.pop(spine.index(nav)))
        else:
            spine.append(nav)

        self.set_spine_order(spine)
        self.set_spine_linear(nav, False)

    @writes
    def normalize_directory_structure(self):
        '''
        Reorganize the book's files on disk into one directory per mimetype.

        If the OPF file sits directly in the root directory, it is first moved
        into an OEBPS directory and container.xml is updated to point to it.
        Then every manifest item is moved into the directory that
        get_directory_for_mimetype returns for its media-type, alongside the
        OPF. Finally the links in the OPF, the text documents, and the ncx
        are corrected to follow the moved files.
        '''
        # This must come before the opf rewrite because that would affect the
        # location of all manifest item hrefs.
        manifest_items = self.get_manifest_items(soup=True)
        old_filepaths = {item['id']: self.get_filepath(item['id']) for item in manifest_items}

        # Best effort: any failure here (presumably a book without an ncx)
        # just means there is no old ncx location to remember.
        try:
            old_ncx_parent = self.get_filepath(self.get_ncx()).parent
        except Exception:
            old_ncx_parent = None

        if self.opf_filepath.parent == self.root_directory:
            oebps = self.root_directory.with_child('OEBPS')
            oebps.makedirs(exist_ok=True)
            # Write the opf to disk before the file is renamed.
            self.write_opf()
            new_opf_path = oebps.with_child(self.opf_filepath.basename)
            os.rename(self.opf_filepath, new_opf_path)
            container = self.read_container_xml()
            rootfile = container.find('rootfile', {'full-path': self.opf_filepath.basename})
            rootfile['full-path'] = new_opf_path.relative_to(self.root_directory, simple=True).replace('\\', '/')
            self.write_container_xml(container)
            self.opf_filepath = new_opf_path

        # Maps the old filepath to the new filepath for every file that moved.
        rename_map = {}
        for manifest_item in manifest_items:
            old_filepath = old_filepaths[manifest_item['id']]

            directory = get_directory_for_mimetype(manifest_item['media-type'])
            directory = self.opf_filepath.parent.with_child(directory)
            if directory.exists:
                # On Windows, this will fix any incorrect casing.
                # On Linux it is inert.
                os.rename(directory, directory)
            else:
                directory.makedirs()

            new_filepath = directory.with_child(old_filepath.basename)
            if new_filepath.absolute_path != old_filepath.absolute_path:
                rename_map[old_filepath] = new_filepath
                os.rename(old_filepath, new_filepath)
            manifest_item['href'] = new_filepath.relative_to(self.opf_filepath.parent, simple=True).replace('\\', '/')

        # The documents still contain links written relative to where they
        # used to be, so each fix is told the old directory of its document.
        self.fix_interlinking_opf(rename_map)
        for id in self.get_texts():
            self.fix_interlinking_text(id, rename_map, old_relative_to=old_filepaths[id].parent)
        self.fix_interlinking_ncx(rename_map, old_relative_to=old_ncx_parent)

    @writes
    def normalize_opf(self):
        '''
        Remove 'opf:' from the tag names in the OPF, and set the ncx
        media-type on manifest items whose href is toc.ncx or Misc/toc.ncx.
        '''
        for element in self.opf.descendants:
            if not element.name:
                continue
            element.name = element.name.replace('opf:', '')

        ncx_hrefs = ('toc.ncx', 'Misc/toc.ncx')
        for manifest_item in self.get_manifest_items(soup=True):
            if manifest_item['href'] in ncx_hrefs:
                manifest_item['media-type'] = 'application/x-dtbncx+xml'

# COMMAND LINE TOOLS
################################################################################

def random_string(length, characters=string.ascii_lowercase):
    '''
    Return a string of `length` characters, each picked at random, with
    replacement, from `characters`.
    '''
    picks = random.choices(characters, k=length)
    return ''.join(picks)

def addfile_argparse(args):
    '''
    Command line handler: add every file matched by the patterns in
    args.files to the book at args.epub, then move the nav to the end and
    save the book over the same path.

    If a file's id or filename is already taken in the book, the file is
    added under its name plus a random three-digit suffix instead.
    '''
    book = Epub(args.epub)

    for pattern in args.files:
        for file in pathclass.glob_files(pattern):
            log.info('Adding file %s.', file.absolute_path)
            try:
                book.easy_add_file(file)
            except (IDExists, FileExists):
                rand_suffix = random_string(3, string.digits)
                base = file.replace_extension('').basename
                id = f'{base}_{rand_suffix}'
                basename = f'{base}_{rand_suffix}{file.extension.with_dot}'
                # Close the handle right away instead of leaving it for the
                # garbage collector.
                with file.open('rb') as handle:
                    content = handle.read()
                book.add_file(id, basename, content)

    book.move_nav_to_end()
    book.save(args.epub)
    return 0

def covercomesfirst(book):
    '''
    Rename the book's images so that the cover image has the
    alphabetically-first basename among them.

    The cover gets a '!' prefix unless it already has one. Any other image
    that would still sort before the cover loses its leading '!' characters,
    and if that isn't enough it gets a '_' prefix. Images that already sort
    after the cover are left alone.

    Does nothing if the book has fewer than two images, has no cover image, or
    the cover already comes first.
    '''
    basenames = {}
    for image_id in book.get_images():
        basenames[image_id] = book.get_filepath(image_id).basename
    if len(basenames) < 2:
        return

    cover_image = book.get_cover_image()
    if not cover_image:
        return

    cover_basename = book.get_filepath(cover_image).basename

    alphabetized = sorted(basenames.values())
    if alphabetized.index(cover_basename) == 0:
        return

    rename_map = dict(basenames)

    if cover_basename.startswith('!'):
        # The cover keeps its name, so it does not need a rename entry.
        rename_map.pop(cover_image)
    else:
        cover_basename = '!' + cover_basename
        rename_map[cover_image] = cover_basename

    for (image_id, basename) in list(rename_map.items()):
        if image_id == cover_image:
            continue

        if basename > cover_basename:
            # Already sorts after the cover.
            del rename_map[image_id]
            continue

        new_basename = basename
        if new_basename < cover_basename and new_basename.startswith('!'):
            new_basename = new_basename.lstrip('!')
        if new_basename < cover_basename or new_basename.startswith('.'):
            new_basename = '_' + new_basename
        rename_map[image_id] = new_basename

    book.rename_file(rename_map)

def covercomesfirst_argparse(args):
    '''
    Command line handler: run covercomesfirst on each epub matched by
    args.epubs, save it over the same path, and pass its absolute path to
    pipeable.stdout.
    '''
    for epub in pathclass.glob_many(args.epubs):
        book = Epub(epub)
        log.info('Moving %s\'s cover.', book)
        covercomesfirst(book)
        book.save(epub)
        pipeable.stdout(epub.absolute_path)
    return 0

def exec_argparse(args):
    '''
    Command line handler: for each epub matched by args.epubs, open it, run
    the Python source in args.command, save the book over the same path, and
    pass its absolute path to pipeable.stdout.

    The command is run by exec() inside this function, so it can refer to
    this function's local names, in particular `book` for the Epub that is
    currently open. Renaming these locals would break existing commands.
    '''
    epubs = pathclass.glob_many(args.epubs)
    for epub in epubs:
        book = Epub(epub)
        # NOTE(review): this executes arbitrary code given on the command
        # line. That is the purpose of the tool, but args.command must never
        # come from an untrusted source.
        exec(args.command)
        book.save(epub)
        pipeable.stdout(epub.absolute_path)
    return 0

def generate_toc_argparse(args):
    '''
    Command line handler: regenerate the table of contents of each epub
    matched by args.epubs, limited to headers up to args.max_level, and save
    the book over the same path.
    '''
    epubs = pathclass.glob_many(args.epubs)
    for epub in epubs:
        book = Epub(epub)
        book.generate_toc(max_level=args.max_level)
        book.save(epub)
    return 0

def holdit_argparse(args):
    '''
    Command line handler: open each epub matched by args.epubs, report
    where its files live through pipeable.stderr, and wait for the user to
    press Enter. Then re-read each book's opf from disk, save the book over
    the same path, and pass its absolute path to pipeable.stdout.
    '''
    held = []
    for epub in pathclass.glob_many(args.epubs):
        book = Epub(epub)
        pipeable.stderr(f'{epub} = {book.root_directory.absolute_path}')
        held.append((epub, book))

    input('Press Enter when ready.')

    for (epub, book) in held:
        # Saving re-writes the opf from memory, which might undo any manual changes.
        # So let's re-read it first.
        book.read_opf(book.opf_filepath)
        book.save(epub)
        pipeable.stdout(epub.absolute_path)
    return 0

def merge(
        input_filepaths,
        demote_headers=False,
        do_headerfile=False,
        number_headerfile=False,
    ):
    '''
    Combine the input epubs into a single new Epub and return it. The new
    book is not saved here.

    Every file taken from an input book gets an id and basename prefixed with
    a random three-digit string and the book's number, counting from 1. The
    ncx and nav of the input books are not carried over.

    input_filepaths: Glob patterns / paths for pathclass.glob_many.

    demote_headers: Files which read_file returns as soup are passed through
        demote_xhtml_headers before being added.

    do_headerfile: Add a generated html file ahead of each book's files,
        containing an <h1> with its title (falling back to the filename),
        the first date in parentheses if it has one, and a <p> with the first
        author if it has one.

    number_headerfile: Put the book's number in front of the headerfile title.
    '''
    book = Epub.new()

    input_filepaths = list(pathclass.glob_many(input_filepaths))
    index_length = len(str(len(input_filepaths)))
    rand_prefix = random_string(3, string.digits)

    # Number books from 1 for human sanity.
    for (index, input_filepath) in enumerate(input_filepaths, start=1):
        log.info('Merging %s.', input_filepath.absolute_path)
        number = f'{index:>0{index_length}}'
        stem = f'{rand_prefix}_{number}_'

        input_book = Epub(input_filepath)
        input_book.normalize_directory_structure()

        excluded = (input_book.get_ncx(), input_book.get_nav())
        manifest_ids = [
            manifest_id
            for manifest_id in input_book.get_manifest_items(spine_order=True)
            if manifest_id not in excluded
        ]

        basename_map = {}
        for manifest_id in manifest_ids:
            old_basename = input_book.get_filepath(manifest_id).basename
            basename_map[manifest_id] = f'{stem}{old_basename}'

        # Don't worry, we're not going to save over the input book!
        input_book.rename_file(basename_map)

        if do_headerfile:
            titles = input_book.get_titles()
            if titles:
                title = titles[0]
            else:
                title = input_filepath.replace_extension('').basename

            dates = input_book.get_dates()
            if dates:
                title = f'{title} ({dates[0]})'

            if number_headerfile:
                title = f'{number}. {title}'

            content = f'<h1>{html.escape(title)}</h1>'

            authors = input_book.get_authors()
            if authors:
                content += f'<p>{html.escape(authors[0])}</p>'

            book.add_file(f'{stem}headerfile', f'{stem}headerfile.html', content)

        for manifest_id in manifest_ids:
            if demote_headers:
                content = input_book.read_file(manifest_id, soup=True)
                if isinstance(content, bs4.BeautifulSoup):
                    content = demote_xhtml_headers(content)
            else:
                content = input_book.read_file(manifest_id)
            book.add_file(f'{stem}{manifest_id}', basename_map[manifest_id], content)

    book.move_nav_to_end()
    return book

def merge_argparse(args):
    '''
    Command line handler: merge the epubs from args.epubs and save the
    result to args.output. If the output already exists, the user must
    approve overwriting it, unless args.autoyes, or else ValueError is raised.
    '''
    output = pathclass.Path(args.output)

    if output.exists:
        permitted = args.autoyes or interactive.getpermission(f'Overwrite {args.output}?')
        if not permitted:
            raise ValueError(f'{args.output} exists.')

    merged = merge(
        input_filepaths=args.epubs,
        demote_headers=args.demote_headers,
        do_headerfile=args.headerfile,
        number_headerfile=args.number_headerfile,
    )
    merged.save(output)
    pipeable.stdout(output.absolute_path)
    return 0

def new_book_argparse(args):
    '''
    Command line handler: create a new Epub and save it to args.epub.
    If that path already exists, the user must approve overwriting it, unless
    args.autoyes, or else ValueError is raised.
    '''
    output = pathclass.Path(args.epub)

    if output.exists:
        permitted = args.autoyes or interactive.getpermission(f'Overwrite {args.epub}?')
        if not permitted:
            raise ValueError(f'{output.absolute_path} exists.')

    Epub.new().save(output)
    pipeable.stdout(output.absolute_path)
    return 0

def normalize_argparse(args):
    '''
    Command line handler: for each epub matched by args.epubs, normalize
    the opf and the directory structure, move the nav to the end, save the
    book over the same path, and pass its absolute path to pipeable.stdout.
    '''
    for epub in pathclass.glob_many(args.epubs):
        log.info('Normalizing %s.', epub.absolute_path)
        book = Epub(epub)
        normalization_steps = (
            book.normalize_opf,
            book.normalize_directory_structure,
            book.move_nav_to_end,
        )
        for step in normalization_steps:
            step()
        book.save(epub)
        pipeable.stdout(epub.absolute_path)
    return 0

def setfont(book, new_font, autoyes=False):
    '''
    Force the whole book to be displayed in a single font.

    A stylesheet with the id epubfile_setfont is added to the book. It declares
    the font file as a @font-face and applies that family to every element
    with !important. Every text document that doesn't link the stylesheet yet
    gets a <link> to it appended to its <head>.

    new_font: Path to the font file. If the book already has a font with the
        same basename, that one is used instead of adding the file again.

    autoyes: If the stylesheet id already exists in the book, replace it
        without asking. Otherwise the user is asked, and nothing is changed
        if they decline.
    '''
    css_id = 'epubfile_setfont'
    css_basename = 'epubfile_setfont.css'

    new_font = pathclass.Path(new_font)
    new_font.assert_is_file()

    try:
        book.assert_id_not_exists(css_id)
    except IDExists:
        permitted = autoyes or interactive.getpermission(f'Overwrite {css_id}?')
        if not permitted:
            return
        book.delete_file(css_id)

    font_path = None
    for existing_font in book.get_fonts():
        existing_path = book.get_filepath(existing_font)
        if existing_path.basename == new_font.basename:
            font_path = existing_path
            break
    if font_path is None:
        font_path = book.get_filepath(book.easy_add_file(new_font))

    # The font_path may have come from an existing font in the book, so we have
    # no guarantees about its path layout. The css file, however, is definitely
    # going to be inside OEBPS/Styles since we're the ones creating it.
    # So, we should be getting the correct number of .. in the relative path.
    styles_directory = book.opf_filepath.parent.with_child('Styles')
    relative = font_path.relative_to(styles_directory).replace('\\', '/')
    family = font_path.replace_extension('').basename

    css = f'''
    @font-face {{
    font-family: "{family}";
    font-weight: normal;
    font-style: normal;
    src: url("{relative}");
    }}

    * {{
        font-family: "{family}" !important;
    }}
    '''

    book.add_file(id=css_id, basename=css_basename, content=css)
    css_path = book.get_filepath(css_id)

    for text_id in book.get_texts():
        text_path = book.get_filepath(text_id)
        soup = book.read_file(text_id, soup=True)
        head = soup.head
        already_linked = head.find('link', {'id': css_id})
        if already_linked:
            continue

        link = soup.new_tag('link')
        link_attributes = (
            ('id', css_id),
            ('href', css_path.relative_to(text_path.parent).replace('\\', '/')),
            ('rel', 'stylesheet'),
            ('type', 'text/css'),
        )
        for (attribute, value) in link_attributes:
            link[attribute] = value
        head.append(link)
        book.write_file(text_id, soup)

def setfont_argparse(args):
    '''
    Command line handler: apply setfont with args.font to each epub matched
    by args.epubs, save it over the same path, and pass its absolute path to
    pipeable.stdout.
    '''
    for epub in pathclass.glob_many(args.epubs):
        book = Epub(epub)
        setfont(book, args.font, autoyes=args.autoyes)
        book.save(epub)
        pipeable.stdout(epub.absolute_path)
    return 0

@vlogging.main_decorator
def main(argv):
    '''
    Build the command-line parser for epubfile and hand it to betterhelp.

    One subparser is registered per command (addfile, covercomesfirst, exec,
    generate_toc, holdit, merge, new, normalize, setfont). Each subparser
    stores its handler function under `func` via set_defaults.

    argv:
        List of command-line argument strings without the program name.
        The __main__ guard passes sys.argv[1:].

    Returns whatever betterhelp.go(parser, argv) returns; the __main__ guard
    uses that value as the process exit status.
    NOTE(review): presumably betterhelp.go parses argv and calls the selected
    `func` -- confirm against voussoirkit.betterhelp.
    '''
    parser = argparse.ArgumentParser(
        description='''
        A simple Python .epub scripting tool.
        ''',
    )
    subparsers = parser.add_subparsers()

    ################################################################################################

    p_addfile = subparsers.add_parser(
        'addfile',
        description='''
        Add files into the book.
        ''',
    )
    p_addfile.add_argument(
        'epub',
    )
    p_addfile.add_argument(
        'files',
        nargs='+',
    )
    p_addfile.set_defaults(func=addfile_argparse)

    ################################################################################################

    p_covercomesfirst = subparsers.add_parser(
        'covercomesfirst',
        description='''
        Rename the cover image file so that it is the alphabetically-first image.

        I use CBXShell to get thumbnails of epub files on Windows, and because it
        is generalized for zip files and doesn't read epub metadata, alphabetized
        mode works best for getting epub covers as icons.

        In my testing, CBXShell considers the image's whole path and not just the
        basename, so you may want to consider normalizing the directory structure
        first, otherwise some /a/image.jpg will always be before /images/cover.jpg.
        ''',
    )
    p_covercomesfirst.add_argument(
        'epubs',
        nargs='+',
    )
    p_covercomesfirst.set_defaults(func=covercomesfirst_argparse)

    ################################################################################################

    p_exec = subparsers.add_parser(
        'exec',
        description='''
        Execute a snippet of Python code against the book. The book will be saved
        again after the command has finished.
        ''',
    )
    p_exec.examples = [
        ['mybook.epub', '--command', 'print(book.get_authors())'],
        ['*.epub', '--command', 'book.remove_cover_image()'],
    ]
    p_exec.add_argument(
        'epubs',
        nargs='+',
    )
    p_exec.add_argument(
        '--command',
        default=None,
        required=True,
        help='''
        The variable `book` will be the Epub object.
        ''',
    )
    p_exec.set_defaults(func=exec_argparse)

    ################################################################################################

    p_generate_toc = subparsers.add_parser(
        'generate_toc',
        description='''
        Regenerate the toc.ncx and nav.xhtml based on html <hX> headers in the text.
        ''',
    )
    p_generate_toc.add_argument(
        'epubs',
        nargs='+',
    )
    p_generate_toc.add_argument(
        '--max_level',
        '--max-level',
        type=int,
        default=None,
        help='''
        Only generate toc entries for headers up to level X.
        That is, h1, h2, ... hX.
        ''',
    )
    p_generate_toc.set_defaults(func=generate_toc_argparse)

    ################################################################################################

    p_holdit = subparsers.add_parser(
        'holdit',
        description='''
        Extract the book so that you can manually edit the files on disk, then
        compress them back into the original file.

        This is helpful when you want to do some file processing that is outside of
        epubfile's scope. epubfile will save you the effort of extracting and
        compressing the epub so you can focus on doing the file operations.
        ''',
    )
    p_holdit.add_argument(
        'epubs',
        nargs='+',
    )
    p_holdit.set_defaults(func=holdit_argparse)

    ################################################################################################

    p_merge = subparsers.add_parser(
        'merge',
        description='''
        Merge multiple books into one.
        ''',
    )
    p_merge.add_argument(
        'epubs',
        nargs='+',
    )
    p_merge.add_argument(
        '--output',
        required=True,
    )
    p_merge.add_argument(
        '--headerfile',
        action='store_true',
        help='''
        Add a file before each book with an <h1> containing its title.
        ''',
    )
    p_merge.add_argument(
        '--demote_headers',
        '--demote-headers',
        action='store_true',
        help='''
        All h1 in the book will be demoted to h2, and so forth, so that the
        headerfiles are the only h1s and the table of contents will generate
        with a good hierarchy.
        ''',
    )
    p_merge.add_argument(
        '--number_headerfile',
        '--number-headerfile',
        action='store_true',
        help='''
        In the headerfile, the <h1> will start with the book's index, like
        "01. First Book"
        ''',
    )
    p_merge.add_argument(
        '-y',
        '--yes',
        '--autoyes',
        dest='autoyes',
        action='store_true',
        help='''
        Overwrite the output file without prompting.
        ''',
    )
    p_merge.set_defaults(func=merge_argparse)

    ################################################################################################

    p_new = subparsers.add_parser(
        'new',
        description='''
        Create a new, blank epub file.
        ''',
    )
    p_new.add_argument(
        'epub',
    )
    p_new.add_argument(
        '-y',
        '--yes',
        '--autoyes',
        dest='autoyes',
        action='store_true',
        help='''
        Overwrite the file without prompting.
        ''',
    )
    p_new.set_defaults(func=new_book_argparse)

    ################################################################################################

    p_normalize = subparsers.add_parser(
        'normalize',
        description='''
        Rename files and directories in the book to match a common structure.

        Moves all book content from / into /OEBPS and sorts files into
        subdirectories by type: Text, Images, Styles, etc.
        ''',
    )
    p_normalize.add_argument(
        'epubs',
        nargs='+',
    )
    p_normalize.set_defaults(func=normalize_argparse)

    ################################################################################################

    p_setfont = subparsers.add_parser(
        'setfont',
        description='''
        Set the font for every page in the whole book.

        A stylesheet called epubfile_setfont.css will be created that sets
        * { font-family: ... !important } with a font file of your choice.
        ''',
    )
    p_setfont.add_argument(
        'epubs',
        nargs='+',
    )
    p_setfont.add_argument(
        '--font',
        required=True,
    )
    # Accept the same three spellings as the merge and new subcommands so the
    # "don't prompt" flag is uniform across the tool. --yes keeps working.
    p_setfont.add_argument(
        '-y',
        '--yes',
        '--autoyes',
        dest='autoyes',
        action='store_true',
        help='''
        Overwrite the epubfile_setfont.css without prompting.
        ''',
    )
    p_setfont.set_defaults(func=setfont_argparse)

    return betterhelp.go(parser, argv)

if __name__ == '__main__':
    # Script entry point: run the CLI on the arguments after the program name
    # and exit the interpreter with whatever main returns.
    exit_status = main(sys.argv[1:])
    raise SystemExit(exit_status)