Remove Epub.open, use Epub('path'); support read-only mode.

Instead of Epub.open('path'), now use Epub('path'). This makes the class more intuitive to use. This also removes the need for the _original_epub_filepath argument which was simply awkward. We now support read-only books with the read_only argument. The main benefit is that we don't need to unzip the epub file, so if you are going to be reading from epubs but don't need to write, setting read_only can significately improve speed.
2020-03-07 16:18:53 -08:00 · 2020-03-07 16:18:53 -08:00 · 88fb94e84e
commit 88fb94e84e
parent 6c63444d27
2 changed files with 160 additions and 34 deletions
--- a/README.md
+++ b/README.md
@ -3,7 +3,7 @@ epubfile

 ```Python
 import epubfile
-book = epubfile.Epub.open('mybook.epub')
+book = epubfile.Epub('mybook.epub')

 for text_id in book.get_texts():
    soup = book.read_file(text_id, soup=True)
@ -15,7 +15,8 @@ for image_id in book.get_images():
    ...
    book.write_file(image_id, data)

-# Note that this does not reverse the table of contents.
+# Note that this only reverses the spine in the opf file, it does not reverse
+# the table of contents, since the toc is stored as nav.xhtml and toc.ncx.
 book.set_spine_order(reversed(book.get_spine_order()))

 cover_id = book.get_cover_image()
@ -33,7 +34,7 @@ epubfile provides simple editing of epub books. epubfile attempts to keep file m

 # Command line utilities

-This library was born out of my own needs. So there are a couple of builtin utilities.
+This library was born out of my own needs. So there are a couple of builtin utilities. You can run `epubfile.py` with no arguments to see a summary of all builtin utilities. Here are some examples:

 ```
 addfile:
--- a/epubfile.py
+++ b/epubfile.py
@ -1,4 +1,6 @@
 import copy
+import functools
+import io
 import mimetypes
 import os
 import re
@ -290,7 +292,18 @@ def make_spine_item(id):
    spine_item = bs4.BeautifulSoup(spine_item, 'html.parser')
    return spine_item.itemref

+# DECORATORS
+################################################################################
+def writes(method):
+    @functools.wraps(method)
+    def wrapped_method(self, *args, **kwargs):
+        if self.read_only:
+            raise ReadOnly(method.__qualname__)
+        return method(self, *args, **kwargs)
+    return wrapped_method

+# CLASSES
+################################################################################
 class EpubfileException(Exception):
    error_message = ''

@ -319,25 +332,114 @@ class NotInManifest(EpubfileException):
 class NotInSpine(EpubfileException):
    error_message = '{} is not in the spine.'

+class ReadOnly(EpubfileException):
+    error_message = 'Can\'t do {} in read-only mode.'
+

 class Epub:
-    def __init__(self, directory, _original_epub_filepath=None):
-        if isinstance(directory, tempfile.TemporaryDirectory):
-            # Hold a reference so the tempdir doesn't clean up.
-            self._tempdir_reference = directory
-            directory = directory.name
+    def __init__(self, epub_path, *, read_only=False):
+        '''
+        epub_path:
+            The path to an .epub file, or to a directory that contains unzipped
+             epub contents.

-        self.root_directory = pathclass.Path(directory, force_sep='/')
+        read_only:
+            If True, all write operations will be forbidden. The benefit is that
+            the .epub file will not be extracted. This is recommended if you
+            only need to read data from a book and don't need to write to it.
+        '''
+        epub_path = self._keep_tempdir_reference(epub_path)
+        epub_path = pathclass.Path(epub_path)
+        self.original_path = epub_path
+        self.read_only = read_only

-        self._original_path = _original_epub_filepath or self.root_directory.absolute_path
+        if epub_path.is_dir:
+            self.__init_from_dir(epub_path)
+        elif self.read_only:
+            self.__init_from_file_read_only(epub_path)
+        else:
+            self.__init_from_file(epub_path)

        opfs = self.get_opfs()
        self.opf_filepath = opfs[0]
        self.opf = self.read_opf(self.opf_filepath)

+    def __init_from_dir(self, directory):
+        self.is_zip = False
+        self.root_directory = pathclass.Path(directory)
+
+    def __init_from_file_read_only(self, epub_path):
+        # It may appear that is_zip is a synonym for read_only, but don't forget
+        # that we can also open a directory in readonly mode. It's just that
+        # readonly dirs don't need a special init, all they have to do is
+        # forbid writes.
+        self.is_zip = True
+        self.root_directory = pathclass.Path(epub_path)
+        self.zip = zipfile.ZipFile(self.root_directory.absolute_path)
+
+    def __init_from_file(self, epub_path):
+        extract_to = tempfile.TemporaryDirectory(prefix='epubfile-')
+        extract_epub(epub_path, extract_to.name)
+        directory = self._keep_tempdir_reference(extract_to)
+        self.__init_from_dir(directory)
+
    def __repr__(self):
+        if self.read_only:
+            return f'Epub({repr(self.root_directory.absolute_path)}, read_only=True)'
+        else:
            return f'Epub({repr(self.root_directory.absolute_path)})'

+    def _fopen(self, *args, **kwargs):
+        '''
+        Not to be confused with the high level `open_file` method, this method
+        is the one that actually reads off the disk.
+        '''
+        if self.is_zip:
+            return self._fopen_zip(*args, **kwargs)
+        else:
+            return self._fopen_disk(*args, **kwargs)
+
+    def _fopen_disk(self, path, mode, *, encoding=None):
+        '''
+        If the book was opened as a directory, we can read files off disk with
+        Python open.
+        '''
+        return open(path, mode, encoding=encoding)
+
+    def _fopen_zip(self, path, mode, *, encoding=None):
+        '''
+        If the book was opened as a read-only zip, we can read files out of
+        the zip.
+        '''
+        p_path = self.root_directory.spawn(path)
+        if p_path in self.root_directory:
+            path = p_path.relative_to(self.root_directory, simple=True)
+
+        # Zip files always use forward slash internally, even on Windows.
+        path = path.replace('\\', '/')
+
+        if mode == 'rb':
+            return self.zip.open(path, 'r')
+        if mode == 'r':
+            return io.TextIOWrapper(self.zip.open(path, 'r'), encoding)
+        # At this time ZipFS is only used for read-only epubs anyway.
+        if mode == 'wb':
+            return self.zip.open(path, 'w')
+        if mode == 'w':
+            return io.TextIOWrapper(self.zip.open(path, 'w'), encoding)
+        raise ValueError('mode should be r, w, rb, or wb.')
+
+    def _keep_tempdir_reference(self, p):
+        '''
+        If the given path object is actually a tempfile.TemporaryDirectory,
+        store that TD reference here so that it does not get cleaned up even
+        if the caller releases it. Then return the actual filepath.
+        '''
+        if isinstance(p, tempfile.TemporaryDirectory):
+            self._tempdir_reference = p
+            p = p.name
+        return p
+
    def assert_file_not_exists(self, filepath):
        if filepath.exists:
            existing = filepath.relative_to(self.opf_filepath.parent)
@ -349,11 +451,12 @@ class Epub:

    # VALIDATION
    ############################################################################
+    @writes
    def auto_correct_and_validate(self):
        # Ensure we have a mimetype file.
        mimetype_file = self.root_directory.with_child('mimetype')
        if not mimetype_file.exists:
-            with open(mimetype_file.absolute_path, 'w', encoding='utf-8') as handle:
+            with self._fopen(mimetype_file.absolute_path, 'w', encoding='utf-8') as handle:
                handle.write(MIMETYPE_FILE_TEMPLATE)

        # Assert that all manifest items exist on disk.
@ -369,6 +472,10 @@ class Epub:
    def new(cls):
        def writefile(filepath, content):
            os.makedirs(filepath.parent.absolute_path, exist_ok=True)
+            # This line uses Python open instead of self._fopen because the epub
+            # hasn't been instantiated yet! At this time, creating a book with
+            # Epub.new always creates it as a directory. We do not support
+            # creating a book directly into a fresh zip file.
            with open(filepath.absolute_path, 'w', encoding='utf-8') as handle:
                handle.write(content)

@ -384,12 +491,7 @@ class Epub:

        return cls(tempdir)

-    @classmethod
-    def open(cls, epub_filepath):
-        extract_to = tempfile.TemporaryDirectory(prefix='epubfile-')
-        extract_epub(epub_filepath, extract_to.name)
-        return cls(extract_to, _original_epub_filepath=epub_filepath)
-
+    @writes
    def save(self, epub_filepath):
        self.write_opf()
        self.auto_correct_and_validate()
@ -406,7 +508,7 @@ class Epub:

    def read_container_xml(self):
        container_xml_path = self.root_directory.join('META-INF/container.xml')
-        container = open(container_xml_path.absolute_path, 'r', encoding='utf-8')
+        container = self._fopen(container_xml_path.absolute_path, 'r', encoding='utf-8')
        # 'xml' and 'html.parser' seem about even here except that html.parser
        # doesn't self-close.
        container = bs4.BeautifulSoup(container, 'xml')
@ -414,7 +516,7 @@ class Epub:

    def read_opf(self, rootfile):
        rootfile = pathclass.Path(rootfile, force_sep='/')
-        rootfile_xml = open(rootfile.absolute_path, 'r', encoding='utf-8').read()
+        rootfile_xml = self._fopen(rootfile.absolute_path, 'r', encoding='utf-8').read()
        # 'html.parser' preserves namespacing the best, but unfortunately it
        # botches the <meta> items because it wants them to be self-closing
        # and the string contents come out. We will fix in just a moment.
@ -437,19 +539,22 @@ class Epub:

        return opf

+    @writes
    def write_container_xml(self, container):
        if isinstance(container, bs4.BeautifulSoup):
            container = str(container)
        container_xml_path = self.root_directory.join('META-INF/container.xml')
-        container_xml = open(container_xml_path.absolute_path, 'w', encoding='utf-8')
+        container_xml = self._fopen(container_xml_path.absolute_path, 'w', encoding='utf-8')
        container_xml.write(container)

+    @writes
    def write_opf(self):
-        with open(self.opf_filepath.absolute_path, 'w', encoding='utf-8') as rootfile:
+        with self._fopen(self.opf_filepath.absolute_path, 'w', encoding='utf-8') as rootfile:
            rootfile.write(str(self.opf))

    # FILE OPERATIONS
    ############################################################################
+    @writes
    def add_file(self, id, basename, content):
        self.assert_id_not_exists(id)

@ -467,9 +572,9 @@ class Epub:
            content = fix_xhtml(content)

        if isinstance(content, str):
-            handle = open(filepath.absolute_path, 'w', encoding='utf-8')
+            handle = self._fopen(filepath.absolute_path, 'w', encoding='utf-8')
        elif isinstance(content, bytes):
-            handle = open(filepath.absolute_path, 'wb')
+            handle = self._fopen(filepath.absolute_path, 'wb')
        else:
            raise TypeError(f'content should be str or bytes, not {type(content)}.')

@ -488,15 +593,17 @@ class Epub:

        return id

+    @writes
    def easy_add_file(self, filepath):
        filepath = pathclass.Path(filepath)
-        with open(filepath.absolute_path, 'rb') as handle:
+        with self._fopen(filepath.absolute_path, 'rb') as handle:
            self.add_file(
                id=filepath.basename,
                basename=filepath.basename,
                content=handle.read(),
            )

+    @writes
    def delete_file(self, id):
        manifest_item = self.opf.manifest.find('item', {'id': id})
        manifest_item.extract()
@ -517,6 +624,9 @@ class Epub:
        if mode not in ('r', 'w'):
            raise ValueError(f'mode should be either r or w, not {mode}.')

+        if mode == 'w' and self.read_only:
+            raise ReadOnly(self.open_file.__qualname__)
+
        filepath = self.get_filepath(id)
        mime = self.opf.manifest.find('item', {'id': id})['media-type']
        is_text = (
@ -525,9 +635,9 @@ class Epub:
        )

        if is_text:
-            handle = open(filepath.absolute_path, mode, encoding='utf-8')
+            handle = self._fopen(filepath.absolute_path, mode, encoding='utf-8')
        else:
-            handle = open(filepath.absolute_path, mode + 'b')
+            handle = self._fopen(filepath.absolute_path, mode + 'b')

        return handle

@ -538,6 +648,7 @@ class Epub:
            return fix_xhtml(content, return_soup=True)
        return content

+    @writes
    def rename_file(self, id, new_basename=None, *, fix_interlinking=True):
        if isinstance(id, dict):
            basename_map = id
@ -563,6 +674,7 @@ class Epub:

        return rename_map

+    @writes
    def write_file(self, id, content):
        # text vs binary handled by open_file.
        if isinstance(content, bs4.BeautifulSoup):
@ -663,6 +775,7 @@ class Epub:

        return None

+    @writes
    def remove_cover_image(self):
        current_cover = self.get_cover_image(soup=True)
        if not current_cover:
@ -674,6 +787,7 @@ class Epub:
        if meta:
            meta.extract()

+    @writes
    def set_cover_image(self, id):
        if id is None:
            self.remove_cover_image()
@ -706,6 +820,7 @@ class Epub:
        return [x['idref'] for x in items]
        return ids

+    @writes
    def set_spine_order(self, ids):
        manifest_ids = self.get_manifest_items()
        # Fetch the existing entries so that we can preserve their attributes
@ -733,6 +848,7 @@ class Epub:
        linear = {None: None, 'yes': True, 'no': False}.get(linear, linear)
        return linear

+    @writes
    def set_spine_linear(self, id, linear):
        '''
        Set linear to yes or no. Or pass None to remove the property.
@ -779,6 +895,7 @@ class Epub:

    # UTILITIES
    ############################################################################
+    @writes
    def fix_all_xhtml(self):
        for id in self.get_texts():
            self.write_file(id, self.read_file(id, soup=True))
@ -856,6 +973,7 @@ class Epub:

        return (links, commit)

+    @writes
    def fix_interlinking_text(self, id, rename_map, old_relative_to=None):
        if not rename_map:
            return
@ -881,6 +999,7 @@ class Epub:
        text = str(soup)
        self.write_file(id, text)

+    @writes
    def fix_interlinking_ncx(self, rename_map, old_relative_to=None):
        if not rename_map:
            return
@ -902,6 +1021,7 @@ class Epub:
        ncx = str(ncx)
        self.write_file(ncx_id, ncx)

+    @writes
    def fix_interlinking_opf(self, rename_map):
        if not rename_map:
            return
@ -913,6 +1033,7 @@ class Epub:
                continue
            opf_item['href'] = link

+    @writes
    def fix_interlinking(self, rename_map):
        if not rename_map:
            return
@ -990,6 +1111,7 @@ class Epub:
            navmap.append(navpoint)
        self.write_file(ncx_id, soup)

+    @writes
    def generate_toc(self, max_level=None, linear_only=True):
        '''
        Generate the table of contents (toc.nav and nav.xhtml) by collecting
@ -1110,6 +1232,7 @@ class Epub:
        if ncx_id:
            self._set_ncx_toc(ncx_id, copy.copy(toc))

+    @writes
    def move_nav_to_end(self):
        '''
        Move the nav.xhtml file to the end and set its linear=no.
@ -1129,6 +1252,7 @@ class Epub:
        self.set_spine_order(spine)
        self.set_spine_linear(nav, False)

+    @writes
    def normalize_directory_structure(self):
        # This must come before the opf rewrite because that would affect the
        # location of all all manifest item hrefs.
@ -1176,6 +1300,7 @@ class Epub:
            self.fix_interlinking_text(id, rename_map, old_relative_to=old_filepaths[id].parent)
        self.fix_interlinking_ncx(rename_map, old_relative_to=old_ncx_parent)

+    @writes
    def normalize_opf(self):
        for tag in self.opf.descendants:
            if tag.name:
@ -1307,7 +1432,7 @@ def random_string(length, characters=string.ascii_lowercase):
    return ''.join(random.choice(characters) for x in range(length))

 def addfile_argparse(args):
-    book = Epub.open(args.epub)
+    book = Epub(args.epub)

    for pattern in args.files:
        for file in winglob.glob(pattern):
@ -1368,7 +1493,7 @@ def covercomesfirst_argparse(args):
    epubs = [epub for pattern in args.epubs for epub in winglob.glob(pattern)]
    for epub in epubs:
        print(epub)
-        book = Epub.open(epub)
+        book = Epub(epub)
        covercomesfirst(book)
        book.save(args.epub)

@ -1376,7 +1501,7 @@ def exec_argparse(args):
    epubs = [epub for pattern in args.epubs for epub in winglob.glob(pattern)]
    for epub in epubs:
        print(epub)
-        book = Epub.open(epub)
+        book = Epub(epub)
        exec(args.command)
        book.save(epub)

@ -1384,7 +1509,7 @@ def generate_toc_argparse(args):
    epubs = [epub for pattern in args.epubs for epub in winglob.glob(pattern)]
    books = []
    for epub in epubs:
-        book = Epub.open(epub)
+        book = Epub(epub)
        book.generate_toc(max_level=int(args.max_level) if args.max_level else None)
        book.save(epub)

@ -1392,7 +1517,7 @@ def holdit_argparse(args):
    epubs = [epub for pattern in args.epubs for epub in winglob.glob(pattern)]
    books = []
    for epub in epubs:
-        book = Epub.open(epub)
+        book = Epub(epub)
        print(f'{epub} = {book.root_directory.absolute_path}')
        books.append((epub, book))

@ -1420,7 +1545,7 @@ def merge(
    for (index, input_filepath) in enumerate(input_filepaths, start=1):
        print(f'Merging {input_filepath.absolute_path}.')
        prefix = f'{rand_prefix}_{index:>0{index_length}}_{{}}'
-        input_book = Epub.open(input_filepath)
+        input_book = Epub(input_filepath)
        input_book.normalize_directory_structure()

        input_ncx = input_book.get_ncx()
@ -1488,7 +1613,7 @@ def normalize_argparse(args):
    epubs = [epub for pattern in args.epubs for epub in winglob.glob(pattern)]
    for epub in epubs:
        print(epub)
-        book = Epub.open(epub)
+        book = Epub(epub)
        book.normalize_opf()
        book.normalize_directory_structure()
        book.move_nav_to_end()