import copy
import mimetypes
import os
import re
import tempfile
import urllib.parse
import uuid
import zipfile

import bs4
import tinycss2

from voussoirkit import pathclass

# Maps an html tag name to the attributes on that tag which may hold links.
HTML_LINK_PROPERTIES = {
    'a': ['href'],
    'audio': ['src'],
    'image': ['href', 'xlink:href'],
    'img': ['src'],
    'link': ['href'],
    'script': ['src'],
    'source': ['src'],
    'track': ['src'],
    'video': ['src', 'poster'],
}

# Extensions whose mimetype we decide ourselves instead of asking `mimetypes`.
EXTENSION_MIMETYPES = {
    'htm': 'application/xhtml+xml',
    'html': 'application/xhtml+xml',
    'otf': 'font/otf',
    'pls': 'application/pls+xml',
    'smi': 'application/smil+xml',
    'smil': 'application/smil+xml',
    'sml': 'application/smil+xml',
    'ttf': 'font/ttf',
    'woff': 'font/woff',
    'woff2': 'font/woff2',
    'xhtml': 'application/xhtml+xml',
    'xpgt': 'application/vnd.adobe-page-template+xml',
}

# Maps a full mimetype, or just its major type, to the directory (relative to
# the OPF file) where files of that type are stored.
MIMETYPE_DIRECTORIES = {
    'application/font-sfnt': 'Fonts',
    'application/x-dtbncx+xml': '.',
    'application/x-font-ttf': 'Fonts',
    'application/xhtml+xml': 'Text',
    'audio': 'Audio',
    'font': 'Fonts',
    'image': 'Images',
    'text/css': 'Styles',
    'video': 'Video',
}

MIMETYPE_FILE_TEMPLATE = 'application/epub+zip'

# NOTE(review): the XML / XHTML markup of the five templates below appears to
# have been lost before this file reached review. For example, Epub.new()
# formats NCX_TEMPLATE with uuid= and NAV_XHTML_TEMPLATE with toc_contents=,
# but neither placeholder is present, and an empty container.xml has no
# rootfile for Epub.__init__ to find. The literals are left exactly as found;
# restore them from a known-good copy before relying on Epub.new().
CONTAINER_XML_TEMPLATE = ''' '''.strip()

OPF_TEMPLATE = ''' {uuid} author aut title und '''.strip()

NCX_TEMPLATE = ''' {title} {navpoints} '''.strip()

NAV_XHTML_TEMPLATE = ''' '''.strip()

TEXT_TEMPLATE = ''' {head_content} {body_content} '''.strip()


# EPUB COMPRESSION
################################################################################
def compress_epub(directory, epub_filepath):
    '''
    Zip the contents of `directory` into an epub file at `epub_filepath`.

    The `mimetype` file is written first and without compression; every other
    file except `sigil.cfg` is deflated. If `epub_filepath` does not end in
    .epub, the extension is added.

    Returns the final epub path.
    Raises ValueError if the epub would be created inside `directory`.
    '''
    directory = pathclass.Path(directory)
    epub_filepath = pathclass.Path(epub_filepath)

    if epub_filepath in directory:
        raise ValueError('Epub inside its own directory')

    if epub_filepath.extension != 'epub':
        epub_filepath = epub_filepath.add_extension('epub')

    mimetype_file = directory.with_child('mimetype')
    # mimetype is written explicitly below, sigil.cfg is not part of the book.
    skip_files = [mimetype_file, directory.with_child('sigil.cfg')]

    with zipfile.ZipFile(epub_filepath.absolute_path, 'w') as z:
        # No compress_type here, so the archive default (stored) applies.
        z.write(mimetype_file.absolute_path, arcname='mimetype')
        for file in directory.walk():
            if file in skip_files:
                continue
            z.write(
                file.absolute_path,
                arcname=file.relative_to(directory),
                compress_type=zipfile.ZIP_DEFLATED,
            )
    return epub_filepath


def extract_epub(epub_filepath, directory):
    '''
    Extract every member of the epub at `epub_filepath` into `directory`.
    '''
    epub_filepath = pathclass.Path(epub_filepath)
    directory = pathclass.Path(directory)

    with zipfile.ZipFile(epub_filepath.absolute_path, 'r') as z:
        z.extractall(directory.absolute_path)


# XHTML TOOLS
################################################################################
def fix_xhtml(xhtml, return_soup=False):
    '''
    Normalize a text document so that it has html and body elements, the xhtml
    namespace on the html element, a doctype, and an xml declaration.

    xhtml:
        A str / bytes document, or an existing BeautifulSoup which will be
        modified in place.

    return_soup:
        If True, return the BeautifulSoup object instead of a string.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        soup = xhtml
    else:
        # For the text pages, html5lib is the best because html.parser and lxml
        # lowercase all attributes, breaking svg's case-sensitive viewBox etc.
        # and xml loses all of the namespaces when namespaced elements are
        # nested.
        # The downside of html5lib is it turns the xml declaration at the top
        # into a comment which we must undo manually.
        soup = bs4.BeautifulSoup(xhtml, 'html5lib')

    if not soup.html:
        html = soup.new_tag('html')
        for child in list(soup.contents):
            html.append(child)
        soup.append(html)

    if not soup.html.body:
        body = soup.new_tag('body')
        for child in list(soup.html.contents):
            body.append(child)
        soup.html.append(body)

    if not soup.html.get('xmlns'):
        soup.html['xmlns'] = 'http://www.w3.org/1999/xhtml'

    doctype = next((i for i in soup.contents if isinstance(i, bs4.Doctype)), None)
    if doctype is None:
        doctype = bs4.Doctype('html')
        soup.html.insert_before(doctype)

    # html5lib turns the xml declaration into a comment which we must revert.
    # soup.contents is never empty here because html and doctype exist.
    first = soup.contents[0]
    if isinstance(first, bs4.Comment):
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        soup.insert(0, declaration)
        first.extract()

    declaration = next((i for i in soup.contents if isinstance(i, bs4.Declaration)), None)
    if declaration is None:
        declaration = bs4.Declaration('xml version="1.0" encoding="utf-8"')
        doctype.insert_before(declaration)

    if return_soup:
        return soup
    return str(soup)


def xhtml_replacements(xhtml, replacements, return_soup=False):
    '''
    Apply each (pattern, replacement) pair of `replacements` to the document
    using re.sub with DOTALL, in order.

    xhtml:
        A str document or a BeautifulSoup, which is stringified first.

    return_soup:
        If True, re-parse the result with html5lib and return the soup.
    '''
    if isinstance(xhtml, bs4.BeautifulSoup):
        xhtml = str(xhtml)

    for (re_from, re_to) in replacements:
        xhtml = re.sub(re_from, re_to, xhtml, flags=re.DOTALL)

    if return_soup:
        return bs4.BeautifulSoup(xhtml, 'html5lib')
    return xhtml


def demote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every h1 into h2, h2 into h3, ... h5 into h6.
    '''
    # NOTE(review): these patterns were reconstructed because the markup inside
    # the original literals was missing (only `]*?>.*?)` remained, which is not
    # a valid regex). Confirm against a known-good copy.
    # The deepest level goes first so that no header is demoted twice.
    replacements = [
        (r'<h5([^>]*?>.*?)</h5>', r'<h6\1</h6>'),
        (r'<h4([^>]*?>.*?)</h4>', r'<h5\1</h5>'),
        (r'<h3([^>]*?>.*?)</h3>', r'<h4\1</h4>'),
        (r'<h2([^>]*?>.*?)</h2>', r'<h3\1</h3>'),
        (r'<h1([^>]*?>.*?)</h1>', r'<h2\1</h2>'),
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)


def promote_xhtml_headers(xhtml, return_soup=False):
    '''
    Turn every h2 into h1, h3 into h2, ... h6 into h5.
    '''
    # NOTE(review): reconstructed for the same reason as in
    # demote_xhtml_headers. Confirm against a known-good copy.
    # The shallowest level goes first so that no header is promoted twice.
    replacements = [
        (r'<h2([^>]*?>.*?)</h2>', r'<h1\1</h1>'),
        (r'<h3([^>]*?>.*?)</h3>', r'<h2\1</h2>'),
        (r'<h4([^>]*?>.*?)</h4>', r'<h3\1</h3>'),
        (r'<h5([^>]*?>.*?)</h5>', r'<h4\1</h4>'),
        (r'<h6([^>]*?>.*?)</h6>', r'<h5\1</h5>'),
    ]
    return xhtml_replacements(xhtml, replacements, return_soup=return_soup)


# MIMETYPE DECISIONMAKERS
################################################################################
def get_directory_for_mimetype(mime):
    '''
    Return the directory name where files of this mimetype are stored,
    trying the full mimetype, then its major type, then falling back to Misc.
    '''
    directory = (
        MIMETYPE_DIRECTORIES.get(mime) or
        MIMETYPE_DIRECTORIES.get(mime.split('/')[0]) or
        'Misc'
    )
    return directory


def get_mimetype_for_basename(basename):
    '''
    Return the mimetype for a filename, preferring EXTENSION_MIMETYPES, then
    the `mimetypes` module, then application/octet-stream.
    '''
    extension = os.path.splitext(basename)[1].strip('.')
    mime = (
        EXTENSION_MIMETYPES.get(extension) or
        mimetypes.guess_type(basename)[0] or
        'application/octet-stream'
    )
    return mime


# OPF ELEMENT GENERATORS
################################################################################
def make_manifest_item(id, href, mime):
    '''
    Return a new manifest `item` element with id, href, and media-type.
    '''
    # 'html.parser' just for having the simplest output.
    # The attributes are assigned on the parsed tag rather than formatted into
    # the markup so that special characters in the values are escaped by bs4.
    manifest_item = bs4.BeautifulSoup('<item/>', 'html.parser').item
    manifest_item['id'] = id
    manifest_item['href'] = href
    manifest_item['media-type'] = mime
    return manifest_item


def make_meta_item(content=None, attrs=None):
    '''
    Return a new `meta` element.

    content:
        Optional text to place inside the element.

    attrs:
        Optional dict of attributes to set on the element.
    '''
    # 'html.parser' just for having the simplest output.
    meta_item = bs4.BeautifulSoup('<meta/>', 'html.parser').meta
    if content:
        # html.parser treats meta as a void element, so content written into
        # the markup would end up outside of the tag. Set it afterwards.
        meta_item.string = str(content)
    if attrs:
        # Update the tag itself, not the soup that wraps it.
        meta_item.attrs.update(attrs)
    return meta_item


def make_spine_item(id):
    '''
    Return a new spine `itemref` element whose idref is `id`.
    '''
    # 'html.parser' just for having the simplest output.
    spine_item = bs4.BeautifulSoup('<itemref/>', 'html.parser').itemref
    spine_item['idref'] = id
    return spine_item


class EpubfileException(Exception):
    '''
    Base class for the exceptions of this module. Subclasses set
    `error_message`, which is formatted with the constructor arguments.
    '''
    error_message = ''

    def __init__(self, *args, **kwargs):
        super().__init__()
        self.given_args = args
        self.given_kwargs = kwargs
        self.error_message = self.error_message.format(*args, **kwargs)
        self.args = (self.error_message, args, kwargs)

    def __str__(self):
        return self.error_message


class InvalidEpub(EpubfileException):
    error_message = '{} is invalid: {}'


class FileExists(EpubfileException):
    error_message = 'There is already a file at {}.'


class IDExists(EpubfileException):
    error_message = 'There is already a file with id {}.'


class NotInManifest(EpubfileException):
    error_message = '{} is not in the manifest.'


class NotInSpine(EpubfileException):
    error_message = '{} is not in the spine.'


class Epub:
    '''
    An extracted epub living in a directory on disk, with its OPF file parsed
    into `self.opf`. Use Epub.new or Epub.open to get one backed by a
    temporary directory, and Epub.save to compress it back into a file.
    '''
    def __init__(self, directory, _original_epub_filepath=None):
        '''
        directory:
            The path of the extracted epub, or a tempfile.TemporaryDirectory
            object which will be kept alive for as long as this Epub.

        _original_epub_filepath:
            Only used to name the epub in InvalidEpub messages.
        '''
        if isinstance(directory, tempfile.TemporaryDirectory):
            # Hold a reference so the tempdir doesn't clean up.
            self._tempdir_reference = directory
            directory = directory.name

        self.root_directory = pathclass.Path(directory, force_sep='/')

        self._original_path = _original_epub_filepath or self.root_directory.absolute_path

        # Only the first rootfile listed in container.xml is used.
        opfs = self.get_opfs()
        self.opf_filepath = opfs[0]
        self.opf = self.read_opf(self.opf_filepath)

    def __repr__(self):
        return f'Epub({repr(self.root_directory.absolute_path)})'

    def assert_file_not_exists(self, filepath):
        '''
        Raise FileExists if there is already something at `filepath`.
        '''
        if filepath.exists:
            existing = filepath.relative_to(self.opf_filepath.parent)
            raise FileExists(existing)

    def assert_id_not_exists(self, id):
        '''
        Raise IDExists if the manifest already has an item with this id.
        '''
        if self.opf.manifest.find('item', {'id': id}):
            raise IDExists(id)

    # VALIDATION
    ############################################################################
    def auto_correct_and_validate(self):
        '''
        Create the mimetype file if it is missing, then raise InvalidEpub if
        any manifest item does not exist on disk.
        '''
        # Ensure we have a mimetype file.
        mimetype_file = self.root_directory.with_child('mimetype')
        if not mimetype_file.exists:
            with open(mimetype_file.absolute_path, 'w', encoding='utf-8') as handle:
                handle.write(MIMETYPE_FILE_TEMPLATE)

        # Assert that all manifest items exist on disk.
        for item in self.get_manifest_items(soup=True):
            filepath = self.get_filepath(item['id'])
            if not filepath.exists:
                message = f'Manifest item {item["id"]} = {item["href"]} does not exist.'
                raise InvalidEpub(self._original_path, message)

    # LOADING AND SAVING
    ############################################################################
    @classmethod
    def new(cls):
        '''
        Create a new epub in a temporary directory from the module templates.
        '''
        def writefile(filepath, content):
            os.makedirs(filepath.parent.absolute_path, exist_ok=True)
            with open(filepath.absolute_path, 'w', encoding='utf-8') as handle:
                handle.write(content)

        uid = uuid.uuid4().urn

        tempdir = tempfile.TemporaryDirectory(prefix='epubfile-')
        root = pathclass.Path(tempdir.name)
        writefile(root.join('mimetype'), MIMETYPE_FILE_TEMPLATE)
        writefile(root.join('META-INF/container.xml'), CONTAINER_XML_TEMPLATE)
        writefile(root.join('OEBPS/content.opf'), OPF_TEMPLATE.format(uuid=uid))
        writefile(root.join('OEBPS/toc.ncx'), NCX_TEMPLATE.format(uuid=uid, title='Unknown', navpoints=''))
        writefile(root.join('OEBPS/Text/nav.xhtml'), NAV_XHTML_TEMPLATE.format(toc_contents=''))

        return cls(tempdir)

    @classmethod
    def open(cls, epub_filepath):
        '''
        Extract the epub file into a temporary directory and load it.
        '''
        extract_to = tempfile.TemporaryDirectory(prefix='epubfile-')
        extract_epub(epub_filepath, extract_to.name)
        return cls(extract_to, _original_epub_filepath=epub_filepath)

    def save(self, epub_filepath):
        '''
        Write the OPF to disk, validate, and compress into `epub_filepath`.
        '''
        self.write_opf()
        self.auto_correct_and_validate()
        compress_epub(self.root_directory, epub_filepath)

    # CONTAINER & OPF
    ############################################################################
    def get_opfs(self):
        '''
        Return the paths of all rootfiles listed in container.xml.
        '''
        container = self.read_container_xml()
        rootfiles = container.find_all('rootfile')
        rootfiles = [x.get('full-path') for x in rootfiles]
        rootfiles = [self.root_directory.join(x) for x in rootfiles]
        return rootfiles

    def read_container_xml(self):
        '''
        Parse META-INF/container.xml and return the soup.
        '''
        container_xml_path = self.root_directory.join('META-INF/container.xml')
        # 'xml' and 'html.parser' seem about even here except that html.parser
        # doesn't self-close.
        with open(container_xml_path.absolute_path, 'r', encoding='utf-8') as handle:
            return bs4.BeautifulSoup(handle, 'xml')

    def read_opf(self, rootfile):
        '''
        Parse the OPF file at `rootfile` and return the soup.
        '''
        rootfile = pathclass.Path(rootfile, force_sep='/')
        with open(rootfile.absolute_path, 'r', encoding='utf-8') as handle:
            rootfile_xml = handle.read()
        # 'html.parser' preserves namespacing the best, but unfortunately it
        # botches the meta items because it wants them to be self-closing
        # and the string contents come out. We will fix in just a moment.
        # This is still preferable to 'xml' which handles the dc: prefixes when
        # parsing only the metadata block, but loses all namespaces when parsing
        # the whole doc. 'lxml' wraps the content in extra elements and also
        # botches the metas so it's not any better than html.parser.
        opf = bs4.BeautifulSoup(rootfile_xml, 'html.parser')

        # Let's fix those metas.
        for meta in opf.select('meta'):
            neighbor = meta.next
            if neighbor is None or neighbor.parent != meta.parent:
                # This happens on the last meta, neighbor is outside of the
                # element that holds the metas.
                break
            if not isinstance(neighbor, bs4.element.NavigableString):
                continue
            meta.append(neighbor.extract().strip())

        return opf

    def write_container_xml(self, container):
        '''
        Write `container`, a str or soup, to META-INF/container.xml.
        '''
        if isinstance(container, bs4.BeautifulSoup):
            container = str(container)
        container_xml_path = self.root_directory.join('META-INF/container.xml')
        with open(container_xml_path.absolute_path, 'w', encoding='utf-8') as handle:
            handle.write(container)

    def write_opf(self):
        '''
        Write the in-memory OPF back to its file.
        '''
        with open(self.opf_filepath.absolute_path, 'w', encoding='utf-8') as rootfile:
            rootfile.write(str(self.opf))

    # FILE OPERATIONS
    ############################################################################
    def add_file(self, id, basename, content):
        '''
        Write `content` into the epub and register it in the manifest.

        The file is placed in a directory chosen by its mimetype. xhtml
        documents are passed through fix_xhtml and appended to the spine.

        id:
            The manifest id for the new item.

        basename:
            The filename. Any directory components are discarded.

        content:
            str or bytes.

        Returns the id.
        Raises IDExists, FileExists, or TypeError for bad content.
        '''
        self.assert_id_not_exists(id)

        basename = os.path.basename(basename)
        mime = get_mimetype_for_basename(basename)
        directory = get_directory_for_mimetype(mime)
        directory = self.opf_filepath.parent.with_child(directory)
        os.makedirs(directory.absolute_path, exist_ok=True)
        filepath = directory.with_child(basename)

        self.assert_file_not_exists(filepath)

        if mime == 'application/xhtml+xml':
            # bs4 converts bytes to str so this must come before the handle choice.
            content = fix_xhtml(content)

        if isinstance(content, str):
            handle = open(filepath.absolute_path, 'w', encoding='utf-8')
        elif isinstance(content, bytes):
            handle = open(filepath.absolute_path, 'wb')
        else:
            raise TypeError(f'content should be str or bytes, not {type(content)}.')

        with handle:
            handle.write(content)

        href = filepath.relative_to(self.opf_filepath.parent, simple=True)
        href = urllib.parse.quote(href)

        manifest_item = make_manifest_item(id, href, mime)
        self.opf.manifest.append(manifest_item)

        if mime == 'application/xhtml+xml':
            spine_item = make_spine_item(id)
            self.opf.spine.append(spine_item)

        return id

    def easy_add_file(self, filepath):
        '''
        Add the file at `filepath`, using its basename as both the id and the
        filename. Returns the id.
        '''
        filepath = pathclass.Path(filepath)
        with open(filepath.absolute_path, 'rb') as handle:
            return self.add_file(
                id=filepath.basename,
                basename=filepath.basename,
                content=handle.read(),
            )

    def delete_file(self, id):
        '''
        Remove the item from the manifest and spine and delete it from disk.

        Raises NotInManifest if there is no item with this id.
        '''
        manifest_item = self.get_manifest_item(id)
        # The path must be resolved while the item is still in the manifest,
        # because get_filepath looks the item up by id.
        filepath = self.get_filepath(id)

        manifest_item.extract()
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if spine_item:
            spine_item.extract()
        os.remove(filepath.absolute_path)

    def get_filepath(self, id):
        '''
        Return the path on disk of the manifest item with this id.

        Raises NotInManifest if there is no item with this id.
        '''
        href = self.get_manifest_item(id)['href']
        filepath = self.opf_filepath.parent.join(href)
        if not filepath.exists:
            # The href may be percent-encoded while the name on disk is not.
            href = urllib.parse.unquote(href)
            filepath = self.opf_filepath.parent.join(href)
        return filepath

    def open_file(self, id, mode):
        '''
        Return an open handle for the item's file. xhtml, ncx, and text/*
        items are opened as utf-8 text, everything else as binary. The caller
        is responsible for closing the handle.

        mode:
            'r' or 'w'.

        Raises ValueError for any other mode, NotInManifest for unknown ids.
        '''
        if mode not in ('r', 'w'):
            raise ValueError(f'mode should be either r or w, not {mode}.')

        filepath = self.get_filepath(id)
        mime = self.get_manifest_item(id)['media-type']
        is_text = (
            mime in ('application/xhtml+xml', 'application/x-dtbncx+xml') or
            mime.startswith('text/')
        )

        if is_text:
            handle = open(filepath.absolute_path, mode, encoding='utf-8')
        else:
            handle = open(filepath.absolute_path, mode + 'b')

        return handle

    def read_file(self, id, *, soup=False):
        '''
        Return the contents of the item's file as str or bytes.

        soup:
            If True and the item is xhtml, return a fixed BeautifulSoup
            instead. Has no effect on other types.
        '''
        # text vs binary handled by open_file.
        with self.open_file(id, 'r') as handle:
            content = handle.read()
        if soup and self.get_manifest_item(id)['media-type'] == 'application/xhtml+xml':
            return fix_xhtml(content, return_soup=True)
        return content

    def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
        '''
        Rename one or more files on disk and update the links that point to
        them.

        id:
            Either a single id, in which case new_basename is required, or a
            dict of {id: new_basename}.

        new_basename:
            The new filename. If it has no extension, the old one is kept.

        fix_interlinking:
            If True, self.fix_interlinking is called with the rename map,
            otherwise self.fix_interlinking_opf is.

        Returns a dict of {old_filepath: new_filepath}.
        '''
        if isinstance(id, dict):
            basename_map = id
        else:
            if new_basename is None:
                raise TypeError('new_basename can only be omitted if id is a dict.')
            basename_map = {id: new_basename}

        rename_map = {}
        for (id, new_basename) in basename_map.items():
            old_filepath = self.get_filepath(id)
            new_filepath = old_filepath.parent.with_child(new_basename)
            if not new_filepath.extension:
                new_filepath = new_filepath.add_extension(old_filepath.extension)
            self.assert_file_not_exists(new_filepath)
            os.rename(old_filepath.absolute_path, new_filepath.absolute_path)
            rename_map[old_filepath] = new_filepath

        if fix_interlinking:
            self.fix_interlinking(rename_map)
        else:
            self.fix_interlinking_opf(rename_map)

        return rename_map

    def write_file(self, id, content):
        '''
        Overwrite the item's file with `content`, a str, bytes, or soup.
        '''
        # text vs binary handled by open_file.
        if isinstance(content, bs4.BeautifulSoup):
            content = str(content)

        with self.open_file(id, 'w') as handle:
            handle.write(content)

    # GETTING THINGS
    ############################################################################
    def get_manifest_items(self, filter='', soup=False, spine_order=False):
        '''
        Return the manifest items, as ids or as soup elements.

        filter:
            A css selector suffix appended to `item`.

        soup:
            If True, return the elements instead of their ids.

        spine_order:
            If True, items that are in the spine come first, in spine order,
            followed by the rest.
        '''
        query = f'item{filter}'
        items = self.opf.manifest.select(query)

        if spine_order:
            items = {x['id']: x for x in items}
            ordered_items = []

            for spine_id in self.get_spine_order():
                # The spine can reference ids that the filter excluded.
                item = items.pop(spine_id, None)
                if item is not None:
                    ordered_items.append(item)
            ordered_items.extend(items.values())
            items = ordered_items

        if soup:
            return items

        return [x['id'] for x in items]

    def get_manifest_item(self, id):
        '''
        Return the manifest element with this id, or raise NotInManifest.
        '''
        item = self.opf.manifest.find('item', {'id': id})
        if item is None:
            raise NotInManifest(id)
        return item

    def get_fonts(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type^="application/font"],[media-type^="font/"]',
            soup=soup,
        )

    def get_images(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type^="image/"]',
            soup=soup,
        )

    def get_media(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type^="video/"],[media-type^="audio/"]',
            soup=soup,
        )

    def get_nav(self, *, soup=False):
        '''
        Return the id (or element) of the nav document, or None.
        '''
        nav = self.opf.manifest.find('item', {'properties': 'nav'})
        if not nav:
            return None
        if soup:
            return nav
        return nav['id']

    def get_ncx(self, *, soup=False):
        '''
        Return the id (or element) of the ncx document, or None.
        '''
        ncx = self.opf.manifest.find('item', {'media-type': 'application/x-dtbncx+xml'})
        if not ncx:
            return None
        if soup:
            return ncx
        return ncx['id']

    def get_styles(self, *, soup=False):
        return self.get_manifest_items(
            filter='[media-type="text/css"]',
            soup=soup,
        )

    def get_texts(self, *, soup=False, skip_nav=False):
        '''
        Return the xhtml documents in spine order.

        skip_nav:
            If True, the item whose properties is `nav` is left out.
        '''
        texts = self.get_manifest_items(
            filter='[media-type="application/xhtml+xml"]',
            soup=True,
            spine_order=True,
        )
        if skip_nav:
            texts = [x for x in texts if x.get('properties') != 'nav']

        if soup:
            return texts
        return [x['id'] for x in texts]

    # COVER
    ############################################################################
    def get_cover_image(self, *, soup=False):
        '''
        Return the id of the cover image, or None if there isn't one.

        The manifest item with properties=cover-image is preferred, then the
        metadata `meta name="cover"` element is consulted.

        soup:
            If True, return the manifest element instead of the id. On the
            meta fallback this is None when the meta names an id that is not
            in the manifest.
        '''
        cover = self.opf.manifest.find('item', {'properties': 'cover-image'})
        if cover:
            return cover if soup else cover['id']

        meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if meta:
            cover_id = meta.get('content')
            if not soup:
                return cover_id
            # Callers treat the soup result as a manifest item, so resolve it.
            return self.opf.manifest.find('item', {'id': cover_id})

        return None

    def remove_cover_image(self):
        '''
        Remove the cover-image property from the manifest and the cover meta
        from the metadata, whichever of them exist.
        '''
        current_cover = self.get_cover_image(soup=True)
        if current_cover is not None:
            del current_cover['properties']

        meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if meta:
            meta.extract()

    def set_cover_image(self, id):
        '''
        Make the manifest item with this id the cover image, updating both the
        manifest property and the cover meta. Pass None to remove the cover.

        Raises NotInManifest if there is no item with this id.
        '''
        if id is None:
            self.remove_cover_image()
            return

        # Validate the new id before touching the current cover.
        manifest_item = self.get_manifest_item(id)

        current_cover = self.get_cover_image(soup=True)
        if current_cover is not None and current_cover.get('id') != id:
            del current_cover['properties']

        manifest_item['properties'] = 'cover-image'

        current_meta = self.opf.metadata.find('meta', {'name': 'cover'})
        if current_meta:
            current_meta['content'] = id
        else:
            meta = make_meta_item(attrs={'name': 'cover', 'content': id})
            self.opf.metadata.append(meta)

    # SPINE
    ############################################################################
    def get_spine_order(self, *, linear_only=False):
        '''
        Return the ids of the spine items in order.

        linear_only:
            If True, items marked linear="no" are left out.
        '''
        items = self.opf.spine.find_all('itemref')
        if linear_only:
            items = [x for x in items if x.get('linear') != 'no']
        return [x['idref'] for x in items]

    def set_spine_order(self, ids):
        '''
        Rearrange the spine to contain exactly `ids`, in that order.

        Raises NotInManifest, before changing anything, if any of the ids is
        not in the manifest.
        '''
        ids = list(ids)
        manifest_ids = set(self.get_manifest_items())
        for id in ids:
            if id not in manifest_ids:
                raise NotInManifest(id)

        # Fetch the existing entries so that we can preserve their attributes
        # while rearranging, only creating new spine entries for ids that aren't
        # already present.
        spine_items = self.opf.spine.select('itemref')
        spine_items = {item['idref']: item for item in spine_items}
        for id in ids:
            if id in spine_items:
                self.opf.spine.append(spine_items.pop(id))
            else:
                self.opf.spine.append(make_spine_item(id))

        # The remainder of the current spine items were not used, so pop them out.
        for spine_item in spine_items.values():
            spine_item.extract()

    def get_spine_linear(self, id):
        '''
        Return True / False for linear yes / no, None if the attribute is not
        set, or the raw value if it is anything else.

        Raises NotInSpine if there is no spine item with this id.
        '''
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if not spine_item:
            raise NotInSpine(id)
        linear = spine_item.get('linear')
        linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
        return linear

    def set_spine_linear(self, id, linear):
        '''
        Set linear to yes or no. Or pass None to remove the property.
        '''
        spine_item = self.opf.spine.find('itemref', {'idref': id})
        if not spine_item:
            raise NotInSpine(id)

        if linear is None:
            del spine_item['linear']
            return

        if isinstance(linear, str):
            if linear not in ('yes', 'no'):
                raise ValueError(f'Linear must be yes or no, not {linear}.')
        elif isinstance(linear, (bool, int)):
            linear = {True: 'yes', False: 'no'}[bool(linear)]
        else:
            raise TypeError(linear)

        spine_item['linear'] = linear

    # METADATA
    ############################################################################
    def get_authors(self):
        '''
        Thank you double_j for showing how to deal with find_all not working
        on namespaced tags.
        https://stackoverflow.com/a/44681560
        '''
        creators = self.opf.metadata.find_all({'dc:creator'})
        creators = [str(c.contents[0]) for c in creators if len(c.contents) == 1]
        return creators

    def get_languages(self):
        languages = self.opf.metadata.find_all({'dc:language'})
        languages = [str(l.contents[0]) for l in languages if len(l.contents) == 1]
        return languages

    def get_titles(self):
        titles = self.opf.metadata.find_all({'dc:title'})
        titles = [str(t.contents[0]) for t in titles if len(t.contents) == 1]
        return titles

    # UTILITIES
    ############################################################################
    def fix_all_xhtml(self):
        '''
        Read every text document through fix_xhtml and write it back.
        '''
        for id in self.get_texts():
            self.write_file(id, self.read_file(id, soup=True))

    @staticmethod
    def _fix_interlinking_helper(link, rename_map, relative_to, old_relative_to=None):
        '''
        Given an old link that was found in one of the documents, and the
        rename_map, produce a new link that points to the new location.

        relative_to controls the relative pathing for the new link.
        For example, the links inside a text document usually need to step from
        Text/ to ../Images/ to link an image. But the links inside the OPF file
        start with Images/ right away.

        old_relative_to is needed when, for example, all of the files were in a
        single directory together, and now we are splitting them into Text/,
        Images/, etc. In this case, recognizing the old link requires that we
        understand the old relative location, then we can correct it using the
        new relative location.

        Returns None if the link is None, has a scheme, or does not point to
        anything in the rename_map.
        '''
        if link is None:
            return None

        link = urllib.parse.urlsplit(link)
        if link.scheme:
            # Links with a scheme do not point into the book.
            return None

        if old_relative_to is None:
            old_relative_to = relative_to

        new_filepath = (
            rename_map.get(link.path) or
            rename_map.get(old_relative_to.join(link.path)) or
            rename_map.get(old_relative_to.join(urllib.parse.unquote(link.path))) or
            None
        )
        if new_filepath is None:
            return None

        link = link._replace(path=new_filepath.relative_to(relative_to, simple=True))
        link = link._replace(path=urllib.parse.quote(link.path))

        return link.geturl()