diff --git a/epubfile.py b/epubfile.py index c70a9bf..b46f54d 100644 --- a/epubfile.py +++ b/epubfile.py @@ -158,6 +158,26 @@ def extract_epub(epub_filepath, directory): with zipfile.ZipFile(epub_filepath.absolute_path, 'r') as z: z.extractall(directory.absolute_path) +def demote_xhtml_headers(xhtml, return_soup=False): + if isinstance(xhtml, bs4.BeautifulSoup): + xhtml = str(xhtml) + + replacements = [ + (r']*?>.*?)', r''), + (r']*?>.*?)', r''), + (r']*?>.*?)', r''), + (r']*?>.*?)', r''), + (r']*?>.*?)', r''), + ] + for (re_from, re_to) in replacements: + xhtml = re.sub(re_from, re_to, xhtml, flags=re.DOTALL) + + if return_soup: + soup = bs4.BeautifulSoup(xhtml, 'html5lib') + return soup + + return xhtml + def fix_xhtml(xhtml, return_soup=False): if isinstance(xhtml, bs4.BeautifulSoup): soup = xhtml @@ -1323,7 +1343,13 @@ def holdit_argparse(args): book.read_opf(book.opf_filepath) book.save(epub) -def merge(input_filepaths, output_filename, do_headerfile=False, number_headerfile=False): +def merge( + input_filepaths, + output_filename, + demote_headers=False, + do_headerfile=False, + number_headerfile=False, + ): book = Epub.new() input_filepaths = [pathclass.Path(p) for pattern in input_filepaths for p in winglob.glob(pattern)] @@ -1375,7 +1401,13 @@ def merge(input_filepaths, output_filename, do_headerfile=False, number_headerfi for id in manifest_ids: new_id = prefix.format(id) new_basename = basename_map[id] - book.add_file(new_id, new_basename, input_book.read_file(id)) + if demote_headers: + content = input_book.read_file(id, soup=True) + if isinstance(content, bs4.BeautifulSoup): + content = demote_xhtml_headers(content) + else: + content = input_book.read_file(id) + book.add_file(new_id, new_basename, content) book.move_nav_to_end() book.save(output_filename) @@ -1391,6 +1423,7 @@ def merge_argparse(args): return merge( input_filepaths=args.epubs, output_filename=args.output, + demote_headers=args.demote_headers, do_headerfile=args.headerfile, number_headerfile=args.number_headerfile, ) @@ -1437,6 +1470,7 @@ def main(argv): p_merge.add_argument('epubs', nargs='+', default=[]) p_merge.add_argument('--output', dest='output', default=None, required=True) p_merge.add_argument('--headerfile', dest='headerfile', action='store_true') + p_merge.add_argument('--demote_headers', dest='demote_headers', action='store_true') p_merge.add_argument('--number_headerfile', dest='number_headerfile', action='store_true') p_merge.add_argument('-y', '--autoyes', dest='autoyes', action='store_true') p_merge.set_defaults(func=merge_argparse)