'''.strip()
# Skeleton for new xhtml text pages, filled in with .format(head_content=...,
# body_content=...).
# NOTE(review): the surrounding markup appears to have been stripped by
# extraction — confirm the full template against upstream before relying on it.
TEXT_TEMPLATE = '''
{head_content}
{body_content}
'''.strip()
# EPUB COMPRESSION
################################################################################
def compress_epub(directory, epub_filepath):
    '''
    Zip the contents of `directory` into an epub archive at `epub_filepath`.
    Returns the final filepath, which gains a .epub extension if it lacked one.
    '''
    directory = pathclass.Path(directory)
    epub_filepath = pathclass.Path(epub_filepath)
    log.debug('Compressing %s to %s.', directory.absolute_path, epub_filepath.absolute_path)

    if epub_filepath in directory:
        raise ValueError('Epub inside its own directory')

    if epub_filepath.extension != 'epub':
        epub_filepath = epub_filepath.add_extension('epub')

    # The mimetype file goes in first (stored, not deflated); sigil.cfg is
    # editor cruft that should not be shipped.
    skip_these = (directory.with_child('mimetype'), directory.with_child('sigil.cfg'))
    with zipfile.ZipFile(epub_filepath, 'w') as archive:
        archive.write(directory.with_child('mimetype'), arcname='mimetype')
        for member in directory.walk():
            if member in skip_these:
                continue
            # Zip entries always use forward slashes, even on Windows.
            arcname = member.relative_to(directory).replace('\\', '/')
            archive.write(member, arcname=arcname, compress_type=zipfile.ZIP_DEFLATED)
    return epub_filepath
def extract_epub(epub_filepath, directory):
    '''
    Unzip the epub archive at `epub_filepath` into `directory`.
    '''
    epub_filepath = pathclass.Path(epub_filepath)
    directory = pathclass.Path(directory)
    log.debug('Extracting %s to %s.', epub_filepath.absolute_path, directory.absolute_path)
    with zipfile.ZipFile(epub_filepath, 'r') as archive:
        archive.extractall(directory)
# XHTML TOOLS
################################################################################
def fix_xhtml(xhtml, return_soup=False):
    '''
    Normalize an xhtml document: ensure <html> and <body> exist, ensure the
    xhtml namespace, and ensure the doctype and xml declaration are present.

    xhtml:
        A string of xhtml, or an already-parsed BeautifulSoup.

    return_soup:
        If True, return the BeautifulSoup instead of a string.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        soup = xhtml
    else:
        # For the text pages, html5lib is the best because html.parser and lxml
        # lowercase all attributes, breaking svg's case-sensitive viewBox etc.
        # and xml loses all of the namespaces when namespaced elements are
        # nested.
        # The downside of html5lib is it turns the xml declaration at the top
        # into a comment which we must undo manually.
        soup = bs4.BeautifulSoup(xhtml, 'html5lib')

    if not soup.html:
        html = soup.new_tag('html')
        for child in list(soup.contents):
            html.append(child)
        soup.append(html)

    if not soup.html.body:
        body = soup.new_tag('body')
        for child in list(soup.html.contents):
            body.append(child)
        soup.html.append(body)

    if not soup.html.get('xmlns'):
        soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    try:
        doctype = next(i for i in soup.contents if isinstance(i, bs4.Doctype))
    except StopIteration:
        doctype = bs4.Doctype('html')
        soup.html.insert_before(doctype)

    # html5lib turns the xml declaration into a comment which we must revert.
    try:
        if isinstance(soup.contents[0], bs4.Comment):
            declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
            soup.insert(0, declaration)
            declaration.next.extract()
    # Bugfix: indexing an empty soup.contents raises IndexError, not
    # StopIteration — the original except clause was unreachable. Catch both
    # to stay backward-compatible.
    except (IndexError, StopIteration):
        pass

    try:
        declaration = next(i for i in soup.contents if isinstance(i, bs4.Declaration))
    except StopIteration:
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        doctype.insert_before(declaration)

    if return_soup:
        return soup
    return str(soup)
def xhtml_replacements(xhtml, replacements, return_soup=False):
    '''
    Apply a sequence of (pattern, replacement) regex pairs to the document.
    Accepts a string or a BeautifulSoup; returns a string unless return_soup.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        xhtml = str(xhtml)

    for (pattern, replacement) in replacements:
        xhtml = re.sub(pattern, replacement, xhtml, flags=re.DOTALL)

    if not return_soup:
        return xhtml
    return bs4.BeautifulSoup(xhtml, 'html5lib')
def demote_xhtml_headers(xhtml, return_soup=False):
    # Demote heading levels via regex replacements over the raw markup.
    # NOTE(review): the replacement patterns below were mangled/truncated by
    # extraction and are not valid Python as-is — restore them from upstream
    # before use.
    replacements = [
        (r'
        '),
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)
# MIMETYPE DECISIONMAKERS
################################################################################
def get_directory_for_mimetype(mime):
    '''
    Return the name of the epub subdirectory where files of this mimetype
    belong. Falls back from the full mimetype to its main type, then 'Misc'.
    '''
    maintype = mime.split('/')[0]
    return MIMETYPE_DIRECTORIES.get(mime) or MIMETYPE_DIRECTORIES.get(maintype) or 'Misc'
def get_mimetype_for_basename(basename):
    '''
    Return the mimetype for a file with this basename, preferring the
    project's own extension table over the stdlib's guess.
    '''
    extension = os.path.splitext(basename)[1].strip('.')
    mime = EXTENSION_MIMETYPES.get(extension)
    if not mime:
        mime = mimetypes.guess_type(basename)[0]
    if not mime:
        mime = 'application/octet-stream'
    return mime
# OPF ELEMENT GENERATORS
################################################################################
def make_manifest_item(id, href, mime):
    # Build a single manifest <item> element as a soup tag.
    # NOTE(review): the f-string template below appears truncated by
    # extraction — it should presumably render the item with id/href/
    # media-type attributes; confirm against upstream.
    manifest_item = f''
    # 'html.parser' just for having the simplest output.
    manifest_item = bs4.BeautifulSoup(manifest_item, 'html.parser')
    return manifest_item.item
def make_meta_item(content=None, attrs=None):
    # Build a single metadata <meta> element, optionally with text content
    # and/or extra attributes.
    # NOTE(review): both f-string templates below appear truncated by
    # extraction — confirm against upstream.
    if content:
        meta_item = f'{content}'
    else:
        meta_item = f''
    # 'html.parser' just for having the simplest output.
    meta_item = bs4.BeautifulSoup(meta_item, 'html.parser')
    if attrs:
        meta_item.attrs.update(attrs)
    return meta_item.meta
def make_spine_item(id):
    # Build a single spine <itemref> element as a soup tag.
    # NOTE(review): the f-string template below appears truncated by
    # extraction — confirm against upstream.
    spine_item = f''
    # 'html.parser' just for having the simplest output.
    spine_item = bs4.BeautifulSoup(spine_item, 'html.parser')
    return spine_item.itemref
# DECORATORS
################################################################################
def writes(method):
    '''
    Indicates that the given method performs write operations to files inside
    the book. The decorated method will raise ReadOnly if the book was opened
    in read-only mode.
    '''
    @functools.wraps(method)
    def guarded_method(self, *args, **kwargs):
        if not self.read_only:
            return method(self, *args, **kwargs)
        raise ReadOnly(method.__qualname__)
    return guarded_method
# CLASSES
################################################################################
class EpubfileException(Exception):
    '''
    Base class for all epubfile exceptions. Subclasses provide an
    error_message format string, which is filled in with the constructor's
    arguments.
    '''
    error_message = ''

    def __init__(self, *args, **kwargs):
        super().__init__()
        message = self.error_message.format(*args, **kwargs)
        self.given_args = args
        self.given_kwargs = kwargs
        self.error_message = message
        # Keep args compatible with Exception pickling/repr conventions.
        self.args = (message, args, kwargs)

    def __str__(self):
        return self.error_message
# Concrete exception types raised by Epub operations. Each one only supplies
# the format string consumed by EpubfileException.__init__.

# The book failed validation; args are (path, reason).
class InvalidEpub(EpubfileException):
    error_message = '{} is invalid: {}'

# A write would clobber an existing file on disk.
class FileExists(EpubfileException):
    error_message = 'There is already a file at {}.'

# The requested manifest id is already taken.
class IDExists(EpubfileException):
    error_message = 'There is already a file with id {}.'

# Lookup of a manifest id that does not exist.
class NotInManifest(EpubfileException):
    error_message = '{} is not in the manifest.'

# Lookup of a spine idref that does not exist.
class NotInSpine(EpubfileException):
    error_message = '{} is not in the spine.'

# A @writes method was called on a read-only book.
class ReadOnly(EpubfileException):
    error_message = 'Can\'t do {} in read-only mode.'
class Epub:
def __init__(self, epub_path, *, read_only=False):
    '''
    epub_path:
        The path to an .epub file, or to a directory that contains unzipped
        epub contents. A tempfile.TemporaryDirectory is also accepted and
        will be kept alive for the life of this object.

    read_only:
        If True, all write operations will be forbidden. The benefit is that
        the .epub file will not be extracted. This is recommended if you
        only need to read data from a book and don't need to write to it.
    '''
    epub_path = self._keep_tempdir_reference(epub_path)
    epub_path = pathclass.Path(epub_path)
    self.original_path = epub_path
    self.read_only = read_only

    # Directory books are used in place; zipped books are either read
    # directly from the archive (read_only) or extracted to a tempdir.
    if epub_path.is_dir:
        self.__init_from_dir(epub_path)
    elif self.read_only:
        self.__init_from_file_read_only(epub_path)
    else:
        self.__init_from_file(epub_path)

    # When container.xml lists multiple rootfiles, the first one is treated
    # as the primary OPF.
    opfs = self.get_opfs()
    self.opf_filepath = opfs[0]
    self.opf = self.read_opf(self.opf_filepath)
def __init_from_dir(self, directory):
    # The book is an already-extracted directory on disk; no zip handling
    # is needed.
    self.is_zip = False
    self.root_directory = pathclass.Path(directory)
def __init_from_file_read_only(self, epub_path):
    # It may appear that is_zip is a synonym for read_only, but don't forget
    # that we can also open a directory in readonly mode. It's just that
    # readonly dirs don't need a special init, all they have to do is
    # forbid writes.
    self.is_zip = True
    # In zip mode, root_directory is the .epub file itself; _fopen_zip
    # translates paths under it into zip-internal names.
    self.root_directory = pathclass.Path(epub_path)
    self.zip = zipfile.ZipFile(self.root_directory)
def __init_from_file(self, epub_path):
    # Extract the book into a temporary directory and operate on that.
    # The TemporaryDirectory object is stashed on self so it isn't cleaned
    # up while this Epub is alive.
    tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
    extract_epub(epub_path, tempdir.name)
    extracted_path = self._keep_tempdir_reference(tempdir)
    self.__init_from_dir(extracted_path)
def __repr__(self):
    # Mirrors the constructor call that would reproduce this object.
    path_repr = repr(self.root_directory.absolute_path)
    if self.read_only:
        return f'Epub({path_repr}, read_only=True)'
    return f'Epub({path_repr})'
def _fopen(self, *args, **kwargs):
    '''
    Not to be confused with the high level `open_file` method, this method
    is the one that actually reads off the disk.
    '''
    # Dispatch to the zip-backed or disk-backed opener.
    opener = self._fopen_zip if self.is_zip else self._fopen_disk
    return opener(*args, **kwargs)
def _fopen_disk(self, path, mode, *, encoding=None):
    '''
    If the book was opened as a directory, we can read files off disk with
    Python open.
    '''
    # encoding is only meaningful for text modes; binary callers pass None.
    return open(path, mode, encoding=encoding)
def _fopen_zip(self, path, mode, *, encoding=None):
    '''
    If the book was opened as a read-only zip, we can read files out of
    the zip.

    Returns a binary handle for rb/wb, or a TextIOWrapper for r/w.
    Raises ValueError for any other mode.
    '''
    # When reading from a zip, root_directory is the zip file itself.
    # So if the user is trying to read a filepath called
    # D:\book.epub\dir1\file1.html, we need to convert it to the relative
    # path dir1\file1.html
    # But if they have already given us the relative path, we keep that.
    normalized = path
    if not isinstance(normalized, pathclass.Path):
        normalized = pathclass.Path(normalized)

    if normalized in self.root_directory:
        # The given path was an absolute path including the epub.
        path = normalized.relative_to(self.root_directory, simple=True)
    else:
        # The given path was either a relative path already inside the epub,
        # or an absolute path somewhere totally wrong.
        path = os.fspath(path)

    # Zip files always use forward slash internally, even on Windows.
    path = path.replace('\\', '/')

    if mode == 'rb':
        return self.zip.open(path, 'r')
    if mode == 'r':
        # ZipFile.open only yields binary handles; wrap for text callers.
        return io.TextIOWrapper(self.zip.open(path, 'r'), encoding)
    # At this time fopen_zip is only used for read-only epubs anyway.
    if mode == 'wb':
        return self.zip.open(path, 'w')
    if mode == 'w':
        return io.TextIOWrapper(self.zip.open(path, 'w'), encoding)
    raise ValueError('mode should be r, w, rb, or wb.')
def _keep_tempdir_reference(self, p):
    '''
    If the given path object is actually a tempfile.TemporaryDirectory,
    store that TD reference here so that it does not get cleaned up even
    if the caller releases it. Then return the actual filepath.
    '''
    if not isinstance(p, tempfile.TemporaryDirectory):
        return p
    self._tempdir_reference = p
    return p.name
def assert_file_not_exists(self, filepath):
    # Raise FileExists if the destination is already occupied. The message
    # shows the path relative to the OPF's directory for readability.
    if filepath.exists:
        existing = filepath.relative_to(self.opf_filepath.parent)
        raise FileExists(existing)
def assert_id_not_exists(self, id):
    # Raise IDExists if the manifest already contains this id.
    # get_manifest_item raising NotInManifest is the success case here.
    try:
        self.get_manifest_item(id)
        raise IDExists(id)
    except NotInManifest:
        pass
# VALIDATION
############################################################################
@writes
def auto_correct_and_validate(self):
    '''
    Repair small problems automatically and raise InvalidEpub for the
    problems we cannot fix.
    '''
    # Ensure we have a mimetype file.
    mimetype_file = self.root_directory.with_child('mimetype')
    if not mimetype_file.exists:
        with self._fopen(mimetype_file, 'w', encoding='utf-8') as handle:
            handle.write(MIMETYPE_FILE_TEMPLATE)

    # Assert that all manifest items exist on disk.
    for item in self.get_manifest_items(soup=True):
        filepath = self.get_filepath(item['id'])
        if not filepath.exists:
            message = f'Manifest item {item["id"]} = {item["href"]} does not exist.'
            raise InvalidEpub(self.original_path, message)
# LOADING AND SAVING
############################################################################
@classmethod
def new(cls):
    '''
    Create a new book. It will start as a temporary directory, so don't
    forget to call `save` when you are done.
    '''
    def writefile(filepath, content):
        # Helper: create parent dirs, then write a utf-8 text file.
        filepath.parent.makedirs(exist_ok=True)
        # This line uses Python open instead of self._fopen because the epub
        # hasn't been instantiated yet! At this time, creating a book with
        # Epub.new always creates it as a directory. We do not support
        # creating a book directly into a fresh zip file.
        with filepath.open('w', encoding='utf-8') as handle:
            handle.write(content)

    # A fresh UUID URN serves as the book's unique identifier.
    uid = uuid.uuid4().urn

    tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
    root = pathclass.Path(tempdir.name)
    writefile(root.join('mimetype'), MIMETYPE_FILE_TEMPLATE)
    writefile(root.join('META-INF/container.xml'), CONTAINER_XML_TEMPLATE)
    writefile(root.join('OEBPS/content.opf'), OPF_TEMPLATE.format(uuid=uid))
    writefile(root.join('OEBPS/toc.ncx'), NCX_TEMPLATE.format(uuid=uid, navpoints=''))
    writefile(root.join('OEBPS/Text/nav.xhtml'), NAV_XHTML_TEMPLATE.format(toc_contents=''))
    # Passing the TemporaryDirectory keeps it alive via _keep_tempdir_reference.
    return cls(tempdir)
@writes
def save(self, epub_filepath):
    '''
    Flush the in-memory OPF, sanity-check the book, then zip the working
    directory into the final .epub file.
    '''
    self.write_opf()
    self.auto_correct_and_validate()
    compress_epub(self.root_directory, epub_filepath)
# CONTAINER & OPF
############################################################################
def get_opfs(self):
    '''
    Read the container.xml to find all available OPFs (aka rootfiles).
    '''
    container = self.read_container_xml()
    return [
        self.root_directory.join(rootfile.get('full-path'))
        for rootfile in container.find_all('rootfile')
    ]
def read_container_xml(self):
    '''
    Parse META-INF/container.xml and return it as a BeautifulSoup.
    '''
    container_xml_path = self.root_directory.join('META-INF/container.xml')
    # Use a context manager so the handle is closed deterministically
    # (the original leaked the open file). bs4 consumes the handle during
    # construction, so closing afterward is safe.
    with self._fopen(container_xml_path, 'r', encoding='utf-8') as handle:
        # 'xml' and 'html.parser' seem about even here except that html.parser
        # doesn't self-close.
        container = bs4.BeautifulSoup(handle, 'xml')
    return container
def read_opf(self, rootfile):
    '''
    Parse the given OPF rootfile and return it as a BeautifulSoup.
    '''
    rootfile = pathclass.Path(rootfile)
    # Close the handle deterministically instead of leaking it.
    with self._fopen(rootfile, 'r', encoding='utf-8') as handle:
        rootfile_xml = handle.read()
    # 'html.parser' preserves namespacing the best, but unfortunately it
    # botches the meta items because it wants them to be self-closing
    # and the string contents come out. We will fix in just a moment.
    # This is still preferable to 'xml' which handles the dc: prefixes when
    # parsing only the metadata block, but loses all namespaces when parsing
    # the whole doc. 'lxml' wraps the content and also botches the metas so
    # it's not any better than html.parser.
    opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser')

    # Let's fix those metas: re-attach the stray text node that follows each
    # meta back inside it.
    metas = opf.select('meta')
    for meta in metas:
        neighbor = meta.next
        if neighbor.parent != meta.parent:
            # This happens on the last meta, neighbor is outside of the manifest
            break
        if not isinstance(neighbor, bs4.element.NavigableString):
            continue
        meta.append(neighbor.extract().strip())
    return opf
@writes
def write_container_xml(self, container):
    '''
    Write the container soup (or string) back to META-INF/container.xml.
    '''
    if isinstance(container, bs4.BeautifulSoup):
        container = str(container)
    container_xml_path = self.root_directory.join('META-INF/container.xml')
    # Use a context manager so the file is flushed and closed — the original
    # never closed the handle, unlike the sibling write_opf.
    with self._fopen(container_xml_path, 'w', encoding='utf-8') as handle:
        handle.write(container)
@writes
def write_opf(self):
    '''
    Write the in-memory OPF soup back to its file on disk.
    '''
    with self._fopen(self.opf_filepath, 'w', encoding='utf-8') as rootfile:
        rootfile.write(str(self.opf))
# FILE OPERATIONS
############################################################################
@writes
def add_file(self, id, basename, content):
    '''
    Add a new file into the book.

    id:
        The manifest id; must not already be taken.
    basename:
        Determines the mimetype, which in turn determines the destination
        directory (Text/, Images/, ...).
    content:
        str or bytes. xhtml content is normalized through fix_xhtml.

    Returns the id. Raises IDExists / FileExists / TypeError.
    '''
    self.assert_id_not_exists(id)

    basename = os.path.basename(basename)
    mime = get_mimetype_for_basename(basename)
    directory = get_directory_for_mimetype(mime)
    directory = self.opf_filepath.parent.with_child(directory)
    directory.makedirs(exist_ok=True)
    filepath = directory.with_child(basename)
    self.assert_file_not_exists(filepath)

    if mime == 'application/xhtml+xml':
        # bs4 converts bytes to str so this must come before the handle choice.
        content = fix_xhtml(content)

    if isinstance(content, str):
        handle = self._fopen(filepath, 'w', encoding='utf-8')
    elif isinstance(content, bytes):
        handle = self._fopen(filepath, 'wb')
    else:
        raise TypeError(f'content should be str or bytes, not {type(content)}.')
    with handle:
        handle.write(content)

    # Manifest hrefs are relative to the OPF's directory, forward-slashed
    # and percent-encoded.
    href = filepath.relative_to(self.opf_filepath.parent, simple=True).replace('\\', '/')
    href = urllib.parse.quote(href)

    manifest_item = make_manifest_item(id, href, mime)
    self.opf.manifest.append(manifest_item)

    # Text documents also get a spine entry so they appear in reading order.
    if mime == 'application/xhtml+xml':
        spine_item = make_spine_item(id)
        self.opf.spine.append(spine_item)

    return id
@writes
def easy_add_file(self, filepath):
    '''
    Add a file from disk into the book. The manifest ID and href will be
    automatically generated.
    '''
    filepath = pathclass.Path(filepath)
    # Read as bytes; add_file decides text vs binary from the mimetype.
    with self._fopen(filepath, 'rb') as handle:
        return self.add_file(
            id=filepath.basename,
            basename=filepath.basename,
            content=handle.read(),
        )
@writes
def delete_file(self, id):
    '''
    Remove the file from the manifest, the spine, and the disk.
    '''
    manifest_item = self.get_manifest_item(id)
    filepath = self.get_filepath(id)
    manifest_item.extract()
    spine_item = self.opf.spine.find('itemref', {'idref': id})
    # Non-text files have no spine entry, so this may be None.
    if spine_item:
        spine_item.extract()
    os.remove(filepath)
def get_filepath(self, id):
    '''
    Return the on-disk filepath for the file with this manifest id.
    '''
    href = self.get_manifest_item(id)['href']
    filepath = self.opf_filepath.parent.join(href)
    # TODO: In the case of a read-only zipped epub, this condition will
    # definitely fail and we won't be unquoting names that need it.
    # Double-check the consequences of this and make a patch for file
    # exists inside zip check if needed.
    if not filepath.exists:
        # The href may be percent-encoded while the on-disk name is not.
        href = urllib.parse.unquote(href)
        filepath = self.opf_filepath.parent.join(href)
    return filepath
def open_file(self, id, mode):
    '''
    Open the file belonging to this manifest id. mode is 'r' or 'w'; the
    binary flag is chosen automatically from the file's mimetype.
    '''
    if mode not in ('r', 'w'):
        raise ValueError(f'mode should be either r or w, not {mode}.')

    if mode == 'w' and self.read_only:
        raise ReadOnly(self.open_file.__qualname__)

    filepath = self.get_filepath(id)
    mime = self.get_manifest_item(id)['media-type']
    # xhtml, ncx, and text/* open as utf-8 text; everything else is binary.
    is_text = (
        mime in ('application/xhtml+xml', 'application/x-dtbncx+xml') or
        mime.startswith('text/')
    )
    if is_text:
        handle = self._fopen(filepath, mode, encoding='utf-8')
    else:
        handle = self._fopen(filepath, mode + 'b')
    return handle
def read_file(self, id, *, soup=False):
    '''
    Return the file's contents (text or bytes depending on its mimetype).
    If soup is True and the file is xhtml, return a BeautifulSoup instead.
    '''
    # text vs binary handled by open_file. Use a context manager so the
    # handle is closed — the original leaked it.
    with self.open_file(id, 'r') as handle:
        content = handle.read()
    if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml':
        return fix_xhtml(content, return_soup=True)
    return content
@writes
def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
    '''
    Rename file(s) on disk and correct links to them.

    id:
        A single manifest id (with new_basename given), or a dict mapping
        {id: new_basename} to rename several files at once.
    fix_interlinking:
        If True, rewrite links in all documents; otherwise only the OPF
        is corrected.

    Returns a dict mapping {old filepath: new filepath}.
    '''
    if isinstance(id, dict):
        basename_map = id
    else:
        if new_basename is None:
            raise TypeError('new_basename can be omitted if id is a dict.')
        basename_map = {id: new_basename}

    rename_map = {}
    for (id, new_basename) in basename_map.items():
        old_filepath = self.get_filepath(id)
        new_filepath = old_filepath.parent.with_child(new_basename)
        # Keep the old extension if the new name doesn't provide one.
        if not new_filepath.extension:
            new_filepath = new_filepath.add_extension(old_filepath.extension)
        self.assert_file_not_exists(new_filepath)
        os.rename(old_filepath, new_filepath)
        rename_map[old_filepath] = new_filepath

    if fix_interlinking:
        self.fix_interlinking(rename_map)
    else:
        self.fix_interlinking_opf(rename_map)
    return rename_map
@writes
def write_file(self, id, content):
    '''
    Replace the file's contents. Accepts str, bytes, or a BeautifulSoup
    (which is stringified first).
    '''
    # text vs binary handled by open_file.
    if isinstance(content, bs4.BeautifulSoup):
        content = str(content)
    with self.open_file(id, 'w') as handle:
        handle.write(content)
# GETTING THINGS
############################################################################
def get_manifest_items(self, filter='', soup=False, spine_order=False):
    '''
    Return manifest items, optionally filtered and/or spine-ordered.

    filter:
        A CSS attribute-selector suffix, e.g. '[media-type="text/css"]'.
    soup:
        If True, return the <item> soup elements; otherwise their ids.
    spine_order:
        If True, items referenced by the spine come first, in spine order,
        followed by the remaining items in manifest order.
    '''
    query = f'item{filter}'
    items = self.opf.manifest.select(query)

    if spine_order:
        items = {x['id']: x for x in items}
        ordered_items = []
        for spine_id in self.get_spine_order():
            # A spine idref may not match the current filter, or may be a
            # duplicate / dangling reference. The original pop() raised
            # KeyError in those cases; skip instead.
            if spine_id in items:
                ordered_items.append(items.pop(spine_id))
        ordered_items.extend(items.values())
        items = ordered_items

    if soup:
        return items
    return [x['id'] for x in items]
def get_manifest_item(self, id):
    '''
    Return the manifest <item> soup with this id, or raise NotInManifest.
    '''
    item = self.opf.manifest.find('item', {'id': id})
    if not item:
        raise NotInManifest(id)
    return item
def get_fonts(self, *, soup=False):
    '''
    Return manifest items (or ids) whose media-type looks like a font.
    '''
    return self.get_manifest_items(
        filter='[media-type*="font"],[media-type*="opentype"]',
        soup=soup,
    )
def get_images(self, *, soup=False):
    '''
    Return manifest items (or ids) with an image/* media-type.
    '''
    return self.get_manifest_items(
        filter='[media-type^="image/"]',
        soup=soup,
    )
def get_media(self, *, soup=False):
    '''
    Return manifest items (or ids) with a video/* or audio/* media-type.
    '''
    return self.get_manifest_items(
        filter='[media-type^="video/"],[media-type^="audio/"]',
        soup=soup,
    )
def get_nav(self, *, soup=False):
    '''
    Return the EPUB3 nav document's manifest item (soup=True) or its id.
    Returns None when the book has no nav.
    '''
    nav = self.opf.manifest.find('item', {'properties': 'nav'})
    if not nav:
        return None
    return nav if soup else nav['id']
def get_ncx(self, *, soup=False):
    '''
    Return the NCX table-of-contents manifest item (soup=True) or its id.
    Returns None when the book has no NCX.
    '''
    ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'})
    if not ncx:
        return None
    return ncx if soup else ncx['id']
def get_styles(self, *, soup=False):
    '''
    Return manifest items (or ids) with a text/css media-type.
    '''
    return self.get_manifest_items(
        filter='[media-type="text/css"]',
        soup=soup,
    )
def get_texts(self, *, soup=False, skip_nav=False):
    '''
    Return the xhtml documents in spine order. With skip_nav, exclude the
    nav document from the results.
    '''
    texts = self.get_manifest_items(
        filter='[media-type="application/xhtml+xml"]',
        soup=True,
        spine_order=True,
    )
    if skip_nav:
        texts = [x for x in texts if x.get('properties') != 'nav']

    if soup:
        return texts
    return [x['id'] for x in texts]
# COVER
############################################################################
def get_cover_image(self, *, soup=False):
    '''
    Return the cover image's manifest item / id, or None. Checks the EPUB3
    cover-image property first, then the EPUB2 <meta name="cover">.
    '''
    cover = self.opf.manifest.find('item', {'properties': 'cover-image'})
    if cover:
        return cover if soup else cover['id']

    # EPUB2 books point at the cover via a meta whose content is the id.
    cover = self.opf.metadata.find('meta', {'name': 'cover'})
    if cover:
        return cover if soup else cover['content']

    return None
@writes
def remove_cover_image(self):
    '''
    Strip both the EPUB3 cover-image property and the EPUB2 cover meta.
    No-op if there is no cover.
    '''
    current_cover = self.get_cover_image(soup=True)
    if not current_cover:
        return

    del current_cover['properties']

    meta = self.opf.metadata.find('meta', {'name': 'cover'})
    if meta:
        meta.extract()
@writes
def set_cover_image(self, id):
    '''
    Mark the manifest item with this id as the cover image (both the EPUB3
    cover-image property and the EPUB2 cover meta). Pass None to remove
    the current cover instead.
    '''
    if id is None:
        self.remove_cover_image()
        # Bugfix: without this return we would fall through and call
        # get_manifest_item(None) below.
        return

    current_cover = self.get_cover_image(soup=True)
    if not current_cover:
        pass
    elif current_cover['id'] == id:
        # Already the cover; nothing to do.
        return
    else:
        del current_cover['properties']

    manifest_item = self.get_manifest_item(id)
    manifest_item['properties'] = 'cover-image'

    # Keep the EPUB2-style cover meta in sync for older readers.
    current_meta = self.opf.metadata.find('meta', {'name': 'cover'})
    if current_meta:
        current_meta['content'] = id
    else:
        meta = make_meta_item(attrs={'name': 'cover', 'content': id})
        self.opf.metadata.append(meta)
# SPINE
############################################################################
def get_spine_order(self, *, linear_only=False):
    '''
    Return the spine's idrefs in order. With linear_only, skip itemrefs
    marked linear="no".
    '''
    itemrefs = self.opf.spine.find_all('itemref')
    if linear_only:
        itemrefs = [ref for ref in itemrefs if ref.get('linear') != 'no']
    return [ref['idref'] for ref in itemrefs]
@writes
def set_spine_order(self, ids):
    '''
    Rewrite the spine to contain exactly these ids, in this order.
    Raises NotInManifest for ids the manifest doesn't have.
    '''
    manifest_ids = self.get_manifest_items()
    # Fetch the existing entries so that we can preserve their attributes
    # while rearranging, only creating new spine entries for ids that aren't
    # already present.
    spine_items = self.opf.spine.select('itemref')
    spine_items = {item['idref']: item for item in spine_items}
    for id in ids:
        if id not in manifest_ids:
            raise NotInManifest(id)
        if id in spine_items:
            # Re-appending an existing element moves it to the end.
            self.opf.spine.append(spine_items.pop(id))
        else:
            self.opf.spine.append(make_spine_item(id))

    # The remainder of the current spine items were not used, so pop them out.
    for spine_item in spine_items.values():
        spine_item.extract()
def get_spine_linear(self, id):
    '''
    Return True / False for linear yes / no, None when the attribute is
    absent, or the raw string for any nonstandard value.
    Raises NotInSpine if the id has no spine entry.
    '''
    spine_item = self.opf.spine.find('itemref', {'idref': id})
    if not spine_item:
        raise NotInSpine(id)
    linear = spine_item.get('linear')
    linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
    return linear
@writes
def set_spine_linear(self, id, linear):
    '''
    Set linear to yes or no. Or pass None to remove the property.
    Booleans / ints are coerced to 'yes' / 'no'.
    Raises NotInSpine if the id has no spine entry.
    '''
    spine_item = self.opf.spine.find('itemref', {'idref': id})
    if not spine_item:
        raise NotInSpine(id)

    if linear is None:
        del spine_item['linear']
        return

    if isinstance(linear, str):
        if linear not in ('yes', 'no'):
            raise ValueError(f'Linear must be yes or no, not {linear}.')
    elif isinstance(linear, (bool, int)):
        linear = {True: 'yes', False: 'no'}[bool(linear)]
    else:
        raise TypeError(linear)

    spine_item['linear'] = linear
# METADATA
############################################################################
def get_authors(self):
    '''
    Return the dc:creator values as strings.

    Thank you double_j for showing how to deal with find_all not working
    on namespaced tags.
    https://stackoverflow.com/a/44681560
    '''
    creators = self.opf.metadata.find_all({'dc:creator'})
    # Only simple single-text-node elements are returned.
    creators = [str(c.contents[0]) for c in creators if len(c.contents) == 1]
    return creators
def get_dates(self):
    '''
    Return the dc:date values as strings.
    '''
    dates = self.opf.metadata.find_all({'dc:date'})
    dates = [str(t.contents[0]) for t in dates if len(t.contents) == 1]
    return dates
def get_languages(self):
    '''
    Return the dc:language values as strings.
    '''
    languages = self.opf.metadata.find_all({'dc:language'})
    return [str(tag.contents[0]) for tag in languages if len(tag.contents) == 1]
def get_titles(self):
    '''
    Return the dc:title values as strings.
    '''
    titles = self.opf.metadata.find_all({'dc:title'})
    titles = [str(t.contents[0]) for t in titles if len(t.contents) == 1]
    return titles
@writes
def remove_metadata_of_type(self, tag_name):
    '''
    Remove all metadata elements with this tag name (e.g. 'dc:title'),
    along with any <meta refines="#..."> that point at them.
    '''
    for meta in self.opf.metadata.find_all({tag_name}):
        if meta.get('id'):
            for refines in self.opf.metadata.find_all('meta', {'refines': f'#{meta["id"]}'}):
                refines.extract()
        meta.extract()
@writes
def set_languages(self, languages):
    '''
    A list like ['en', 'fr', 'ko'].
    Replaces all existing dc:language entries.
    '''
    self.remove_metadata_of_type('dc:language')
    for language in languages:
        # NOTE(review): this template appears truncated by extraction — it
        # should presumably produce a dc:language element; restore from
        # upstream before relying on it.
        element = f'{language}'
        element = bs4.BeautifulSoup(element, 'html.parser')
        self.opf.metadata.append(element)
# UTILITIES
############################################################################
@writes
def fix_all_xhtml(self):
    '''
    Round-trip every text document through fix_xhtml (read_file with
    soup=True normalizes it) and write the result back to disk.
    '''
    for id in self.get_texts():
        self.write_file(id, self.read_file(id, soup=True))
@staticmethod
def _fix_interlinking_helper(link, rename_map, relative_to, old_relative_to=None):
    '''
    Given an old link that was found in one of the documents, and the
    rename_map, produce a new link that points to the new location.

    relative_to controls the relative pathing for the new link.
    For example, the links inside a text document usually need to step from
    Text/ to ../Images/ to link an image. But the links inside the OPF file
    start with Images/ right away.

    old_relative_to is needed when, for example, all of the files were in a
    single directory together, and now we are splitting them into Text/,
    Images/, etc. In this case, recognizing the old link requires that we
    understand the old relative location, then we can correct it using the
    new relative location.

    Returns the new link string, or None when no rewrite is needed.
    '''
    if link is None:
        return None

    link = urllib.parse.urlsplit(link)
    # Links with a scheme (http:, mailto:, ...) are external; leave alone.
    if link.scheme:
        return None

    if old_relative_to is None:
        old_relative_to = relative_to

    # Try the path as given, resolved against the old location, and
    # percent-decoded, in that order.
    new_filepath = (
        rename_map.get(link.path) or
        rename_map.get(old_relative_to.join(link.path)) or
        rename_map.get(old_relative_to.join(urllib.parse.unquote(link.path))) or
        None
    )
    if new_filepath is None:
        return None

    # Rebuild the path portion only, preserving query/fragment, with
    # forward slashes and percent-encoding.
    link = link._replace(path=new_filepath.relative_to(relative_to, simple=True).replace('\\', '/'))
    link = link._replace(path=urllib.parse.quote(link.path))

    return link.geturl()
@staticmethod
def _fix_interlinking_css_helper(tag):
'''
Given a