"""Clean up the XHTML pages of a book.

Two kinds of cleaner are registered at import time:

* *html cleaners* take the page as a string and return a new string;
* *soup cleaners* take a ``bs4.BeautifulSoup`` tree and modify it in place.

``cleanup_page`` applies every cleaner repeatedly until the page stops
changing, and ``run`` applies ``cleanup_page`` to every text item of a book.
"""
import os
import re
import sys

import bs4

# Registries filled by the @html_cleaner / @soup_cleaner decorators, in
# definition order. cleanup_page runs them in this order.
html_cleaners = []
soup_cleaners = []

# Footnote id -> footnote <blockquote> tag. Module-level so that a footnote
# collected while cleaning one page can be injected into another page on the
# second pass made by run().
global_footnotes = {}

# Inline tags whose back-to-back runs are merged by merge_neighboring_sametag.
_MERGEABLE_TAGS = ('i', 'b')

# Typographic quotes and the plain ASCII quote each one is replaced with.
_SMART_QUOTES = str.maketrans({
    '”': '"',
    '“': '"',
    '‘': "'",
    '’': "'",
    'ʹ': "'",
})

# Class / id names matching any of these at their *start* (re.match
# semantics) are stripped by remove_calibre_classes.
_CALIBRE_NAME_PATTERNS = [
    r'calibre\d*',
    r'mbppagebreak',
    r'calibre_pb_\d+',
    r'filepos\d*',
    r'pgepubid\d*',
]
_CALIBRE_NAME_RE = re.compile('|'.join(_CALIBRE_NAME_PATTERNS))


def html_cleaner(function):
    """Register *function* as a string-level cleaner.

    The function is returned unchanged so the decorated module-level name
    stays callable (previously the decorator returned None, which rebound
    every decorated name to None).
    """
    html_cleaners.append(function)
    return function


def soup_cleaner(function):
    """Register *function* as a soup-level cleaner and return it unchanged."""
    soup_cleaners.append(function)
    return function


def _unique_by_identity(elements):
    """Return *elements* without repeated objects, keeping first-seen order.

    bs4 tags hash and compare by their rendered content, so putting them in a
    set() merges distinct tags that merely look alike. Deduplicating on id()
    only drops genuine repeats of the same object.
    """
    seen = set()
    unique = []
    for element in elements:
        if id(element) not in seen:
            seen.add(id(element))
            unique.append(element)
    return unique


def _class_list(tag):
    """Return the class list of *tag*, or None when it has no class attribute.

    A class stored as a single string is split on whitespace and written back
    to the tag as a list, so callers can always treat it as a list.
    """
    classes = tag.get('class')
    if classes is None:
        return None
    if isinstance(classes, str):
        classes = classes.split()
        tag['class'] = classes
    return classes


def _parse_footnote_id(text):
    """Return the text between the first '[' and the following ']' of *text*."""
    return text.split('[', 1)[-1].split(']', 1)[0]


def raise_children_and_delete(element):
    """Replace *element* with its own children, preserving their order."""
    # Each child is inserted directly after the element, so walking the
    # children last-to-first leaves them in their original order.
    for child in reversed(list(element.children)):
        element.insert_after(child)
    element.decompose()


def remove_class(element, cls):
    """Remove class *cls* from *element*, dropping the attribute if it empties.

    Does nothing when the element has no class attribute or does not carry
    *cls*.
    """
    # hasattr(element, 'class') is always true on a bs4 Tag (unknown
    # attributes resolve to a child lookup), so the attribute has to be
    # probed with .get() to avoid a KeyError.
    classes = _class_list(element)
    if classes is None:
        return
    remaining = [name for name in classes if name != cls]
    if remaining:
        element['class'] = remaining
    else:
        del element['class']


@html_cleaner
def remove_unwanted_stylesheets(html):
    """Strip unwanted stylesheet references from the page.

    NOTE(review): in the reviewed copy of this file the pattern is empty, so
    this substitution changes nothing. The original pattern cannot be
    inferred from the surrounding code; restore it from version control.
    """
    html = re.sub(r'', '', html)
    return html


@html_cleaner
def merge_neighboring_sametag(html):
    """Merge back-to-back inline elements of the same kind into one element.

    ``</i><i>`` is removed outright; ``</i>`` + whitespace + ``<i>`` collapses
    to a single space, so the two runs of text stay separated.

    NOTE(review): the reviewed copy had lost the markup inside these patterns
    (they read r'' and r'\\s*', the latter inserting a space between every
    character). The tags are reconstructed from the function name and from
    the tags the other cleaners handle. The original listed three tag pairs;
    the third could not be recovered - confirm against version control.
    """
    for tag in _MERGEABLE_TAGS:
        html = re.sub(r'</{tag}><{tag}>'.format(tag=tag), '', html)
        html = re.sub(r'</{tag}>\s*<{tag}>'.format(tag=tag), ' ', html)
    return html


@html_cleaner
def bring_punctuation_into_italics(html):
    """Move punctuation that touches an <i>/<b> element inside that element.

    ``.<i>`` becomes ``<i>.`` and ``</i>.`` becomes ``.</i>``.

    NOTE(review): the closing-tag half is reconstructed; in the reviewed copy
    it had degraded to replacing each punctuation mark with itself.
    """
    for tag in ['i', 'b']:
        opening = '<{tag}>'.format(tag=tag)
        closing = '</{tag}>'.format(tag=tag)
        for punct in ['.', ',', '-', '—']:
            # Plain literal replacement; no regex features are needed here.
            html = html.replace(punct + opening, opening + punct)
            html = html.replace(closing + punct, punct + closing)
    return html


@html_cleaner
def remove_space_around_br(html):
    """Drop whitespace on either side of a line break element.

    NOTE(review): the <br> markup was missing from the reviewed copy (a raw
    line break sat inside the string literals, which is not valid Python);
    the pattern is reconstructed from the function name.
    """
    html = re.sub(r'\s*<br\s*/?>\s*', '<br/>', html)
    return html


@html_cleaner
def replace_smart_quotes(html):
    """Replace typographic quote characters with plain ASCII quotes."""
    return html.translate(_SMART_QUOTES)


@html_cleaner
def remove_empty_elements(html):
    """Delete attribute-less elements that hold only blanks or line breaks.

    "Blank" means whitespace, <br>, or one of the space entities listed in
    the pattern.

    NOTE(review): the closing ``</\\1>`` and the ``<br>`` alternative are
    reconstructed; without the closing tag the reviewed pattern deleted every
    attribute-less opening tag in the document.
    """
    html = re.sub(
        r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|<br\s*/?>)*</\1>',
        '',
        html,
    )
    return html


@soup_cleaner
def inject_footnotes(soup):
    """Move each footnote body to directly after the block that cites it.

    Footnote bodies are ``<blockquote class="gcufootnote_content">`` elements
    whose first string starts with ``[id]``; they are recorded in
    ``global_footnotes``. Citations are ``<span class="gcufootnote_link">``
    elements containing ``[id]``. For every citation with a known id the
    footnote is inserted after the citation's parent, the span is unwrapped,
    and the footnote loses its marker class.
    """
    footnotes = soup.find_all('blockquote', {'class': 'gcufootnote_content'})
    for footnote in footnotes:
        first_string = next(footnote.stripped_strings, None)
        if first_string is None:
            print(footnote, 'is malformed. No string contents.')
            continue
        if not first_string.startswith('['):
            print(footnote, 'is malformed. Should start with [id].')
            continue
        global_footnotes[_parse_footnote_id(first_string)] = footnote

    footnote_links = soup.find_all('span', {'class': 'gcufootnote_link'})
    for footnote_link in reversed(footnote_links):
        contents = footnote_link.contents
        if len(contents) != 1:
            print(footnote_link, 'is malformed. Should just be >[id<.')
        if not contents:
            # Nothing to read an id from; previously this raised IndexError.
            continue

        label = contents[0]
        # A nested tag as first child has no usable text; previously calling
        # .startswith on it raised TypeError.
        if not isinstance(label, str) or not label.startswith('['):
            print(footnote_link, 'is malformed. Should start with [id].')
            continue

        footnote = global_footnotes.get(_parse_footnote_id(label))
        if footnote is None:
            continue

        footnote_link.parent.insert_after(footnote)
        # Keep the visible "[id]" text in place of the span.
        footnote_link.insert_before(label)
        footnote_link.decompose()
        remove_class(footnote, 'gcufootnote_content')


@soup_cleaner
def convert_textdivs_p(soup):
    """Rename every <div> whose only child is a text node to <p>."""
    for div in soup.find_all('div'):
        children = list(div.children)
        if len(children) == 1 and isinstance(children[0], (str, bs4.element.NavigableString)):
            div.name = 'p'


@soup_cleaner
def remove_body_br(soup):
    """Delete <br> elements that are direct children of <body>."""
    for br in soup.find_all('br'):
        if br.parent.name == 'body':
            br.decompose()


@soup_cleaner
def remove_empty_paragraphs(soup):
    """Delete every element whose children are all <br> elements."""
    # Identity-based dedupe: a set() of tags would merge look-alike parents
    # and leave all but one of them for a later pass.
    br_parents = _unique_by_identity(br.parent for br in soup.find_all('br'))
    for br_parent in br_parents:
        if all(child.name == 'br' for child in br_parent.contents):
            br_parent.decompose()


@soup_cleaner
def remove_calibre_classes(soup):
    """Strip converter-generated class names and ids from every tag.

    A name is stripped when it starts with one of _CALIBRE_NAME_PATTERNS.
    The class attribute is deleted when nothing is left, or when the first
    remaining class is the empty string.
    """
    for tag in soup.descendants:
        if not isinstance(tag, bs4.element.Tag):
            # Text nodes carry no attributes.
            continue

        classes = _class_list(tag)
        if classes is not None:
            kept = [name for name in classes if not _CALIBRE_NAME_RE.match(name)]
            if kept and kept[0] != '':
                tag['class'] = kept
            else:
                del tag['class']

        tag_id = tag.get('id')
        if tag_id is not None and _CALIBRE_NAME_RE.match(tag_id):
            del tag['id']


@soup_cleaner
def remove_header_italic_bold(soup):
    """Unwrap an <i> or <b> that is the sole child of a heading."""
    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        children = list(header.children)
        # Exactly one child is required; an empty heading used to raise
        # IndexError here.
        if len(children) != 1:
            continue
        child = children[0]
        if isinstance(child, bs4.element.Tag) and child.name in ('i', 'b'):
            raise_children_and_delete(child)


@soup_cleaner
def remove_useless_divs(soup):
    """Unwrap attribute-less <div>s that contain only tags and whitespace."""
    for div in soup.find_all('div'):
        if div.attrs:
            continue
        only_structure = all(
            isinstance(child, bs4.element.Tag) or child.isspace()
            for child in div.contents
        )
        if only_structure:
            raise_children_and_delete(div)


@soup_cleaner
def remove_useless_spans(soup):
    """Unwrap every <span> that has no attributes."""
    for span in soup.find_all('span'):
        if not span.attrs:
            raise_children_and_delete(span)


@soup_cleaner
def remove_nested_italic(soup):
    """Unwrap a <b>/<i> whose direct parent is the same kind of element."""
    for element in soup.find_all(['b', 'i']):
        if element.parent.name == element.name:
            raise_children_and_delete(element)


@soup_cleaner
def replace_italic_bold_span(soup):
    """Turn <span class="italic|italics|bold"> into real <i> / <b> elements.

    Only spans whose classes are all styling classes are converted. A span
    asking for both styles becomes ``<b><i>...</i></b>``.
    """
    tags = {'italic': 'i', 'italics': 'i', 'bold': 'b'}
    # A span carrying two styling classes is found once per class, so the
    # same object can repeat; dedupe on identity, not on content equality.
    spans = _unique_by_identity(
        span
        for cls in tags
        for span in soup.find_all('span', {'class': cls})
    )
    for span in spans:
        classes = _class_list(span)
        if not classes or not all(cls in tags for cls in classes):
            continue

        wanted = {tags[cls] for cls in classes}
        del span['class']
        if len(wanted) == 1:
            # Also covers class="italic italics", which previously was
            # treated as bold + italic.
            span.name = wanted.pop()
        else:
            bold = soup.new_tag('b')
            span.name = 'i'
            span.insert_before(bold)
            bold.insert(0, span)


@soup_cleaner
def replace_pblock_blockquote(soup):
    """Turn <p class="block|block1|blockquote"> into a plain <blockquote>.

    Only paragraphs with exactly one class are converted.
    """
    classes = ['block', 'block1', 'blockquote']
    ptags = _unique_by_identity(
        ptag
        for cls in classes
        for ptag in soup.find_all('p', {'class': cls})
    )
    for ptag in ptags:
        # The original normalised the class through an undefined name
        # (`span`), which raised NameError for string-valued classes.
        ptag_classes = _class_list(ptag)
        if ptag_classes is not None and len(ptag_classes) == 1:
            ptag.name = 'blockquote'
            # Drop the attribute rather than leaving class="" behind.
            del ptag['class']


@soup_cleaner
def strip_ptag_whitespace(soup):
    """Trim leading/trailing whitespace and <br>s inside <p> and <blockquote>.

    Walking inwards from each end of the element's descendants, <br> elements
    met before the first / after the last text node are deleted, then the
    first text node is left-stripped and the last one right-stripped.
    """
    blocks = soup.find_all('p') + soup.find_all('blockquote')
    for block in blocks:
        nodes = list(block.descendants)
        start = 0
        end = len(nodes)

        # Skip tags at the front, deleting the <br>s among them.
        while start < end and not isinstance(nodes[start], bs4.element.NavigableString):
            if nodes[start].name == 'br':
                nodes[start].decompose()
            start += 1

        # Same from the back.
        while end > start and not isinstance(nodes[end - 1], bs4.element.NavigableString):
            if nodes[end - 1].name == 'br':
                nodes[end - 1].decompose()
            end -= 1

        if start == end:
            continue

        first = nodes[start]
        last = nodes[end - 1]
        if end - start == 1:
            # A single remaining node is both the first and the last text.
            first.replace_with(first.strip())
            continue
        first.replace_with(first.lstrip())
        last.replace_with(last.rstrip())


def cleanup_page(html):
    """Return *html* after applying all cleaners until nothing changes.

    Each round runs the html cleaners on the string, parses the result with
    html.parser, runs the soup cleaners on the tree and serialises it again.
    Rounds repeat until a round leaves the string unchanged.
    """
    previous_html = None
    while previous_html != html:
        previous_html = html
        for cleaner in html_cleaners:
            html = cleaner(html)

        soup = bs4.BeautifulSoup(html, 'html.parser')
        for cleaner in soup_cleaners:
            cleaner(soup)
        html = str(soup)
    return html


def run_once(book):
    """Clean every text item of *book* in place, skipping navigation items.

    *book* must provide ``text_iter()`` yielding ``(id, href)`` pairs,
    ``readfile(id)`` and ``writefile(id, html)``.
    """
    for (item_id, href) in book.text_iter():
        if item_id in ('navid', 'nav.xhtml', 'nav.html'):
            continue
        print('Cleaning', item_id)
        html = book.readfile(item_id)
        html = cleanup_page(html)
        book.writefile(item_id, html)


def run(book):
    """Clean *book* and return 0.

    A second pass runs when any footnotes were collected, so pages cleaned
    before a footnote was seen get a chance to have it injected.
    """
    run_once(book)
    if global_footnotes:
        run_once(book)
    return 0