From 45ba419f16cb5978a849a9ae7eb7d3790d16843e Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Sun, 8 Sep 2019 21:20:32 -0700 Subject: [PATCH] Update. --- cleanerupper/plugin.py | 185 ++++++++++++++++++++++++++++++++--------- 1 file changed, 144 insertions(+), 41 deletions(-) diff --git a/cleanerupper/plugin.py b/cleanerupper/plugin.py index 87f7d6a..6327baf 100644 --- a/cleanerupper/plugin.py +++ b/cleanerupper/plugin.py @@ -37,29 +37,35 @@ def remove_class(element, cls): @html_cleaner def remove_unwanted_stylesheets(html): - html = re.sub(r'', '', html) + html = re.sub(r'', '', html) + html = re.sub(r'style="margin-top: 0px; margin-left: 0px; margin-right: 0px; margin-bottom: 0px; text-align: center;"', '', html) return html @html_cleaner def merge_neighboring_sametag(html): - html = re.sub(r'', '', html) - html = re.sub(r'\s*', ' ', html) + tags = ['i', 'b', 'em', 'strong', 'u', 'small'] + for tag in tags: + html = re.sub(r'<%s>' % (tag, tag), '', html) + html = re.sub(r'\s*<%s>' % (tag, tag), ' ', html) + html = re.sub(r'\s*
\s*<%s>' % (tag, tag), '
', html) - html = re.sub(r'', '', html) - html = re.sub(r'\s*', ' ', html) - - html = re.sub(r'', '', html) - html = re.sub(r'\s*', ' ', html) return html @html_cleaner def bring_punctuation_into_italics(html): - for tag in ['i', 'b']: + for tag in ['i', 'b', 'em', 'strong']: for punct in ['.', ',', '-', 'β€”']: html = re.sub('\\{punct}<{tag}>'.format(**locals()), '<{tag}>{punct}'.format(**locals()), html) html = re.sub('\\{punct}'.format(**locals()), '{punct}'.format(**locals()), html) return html +@html_cleaner +def remove_misc_strings(html): + html = html.replace('epub:type="pagebreak"', '') + html = re.sub(r'title="[ivx]+"', '', html) + html = re.sub(r'title="\d+"', '', html) + return html + @html_cleaner def remove_space_around_br(html): html = re.sub(r'\s*
\s*', '
', html) @@ -71,6 +77,14 @@ def replace_smart_quotes(html): html = re.sub(r'β€˜|’|ΚΉ', "'", html) return html +@html_cleaner +def remove_empty_attributes(html): + html = re.sub(r'alt="\s*"', '', html) + html = re.sub(r'class="\s*"', '', html) + html = re.sub(r'id="\s*"', '', html) + html = re.sub(r'title="\s*"', '', html) + return html + @html_cleaner def remove_empty_elements(html): html = re.sub(r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|
)*', '', html) @@ -110,6 +124,16 @@ def inject_footnotes(soup): footnote_link.decompose() remove_class(footnote, 'gcufootnote_content') +@soup_cleaner +def center_images(soup): + for img in soup.find_all('img'): + if img.parent.name == 'body': + center = soup.new_tag('center') + img.insert_before(center) + center.append(img) + elif img.parent.name in ['div', 'p'] and not img.parent.attrs: + img.parent.name = 'center' + @soup_cleaner def convert_textdivs_p(soup): divs = soup.find_all('div') @@ -133,23 +157,51 @@ def remove_empty_paragraphs(soup): br_parent.decompose() @soup_cleaner -def remove_calibre_classes(soup): +def remove_unwanted_classes_ids(soup): PATTERNS = [ - r'calibre\d*', - r'mbppagebreak', + r'big\d+', + r'blnonindent\d*', + r'c\d+', + r'calibre_?\d*', r'calibre_pb_\d+', + r'chapter', + r'div\d+', + r'dropcaps', r'filepos\d*', + r'font', + r'hanging', + r'indent\d*', + r'initial\d*', + r'initialcaps', + r'large', + r'mbp_?pagebreak', + r'morespaceabove', + r'nonindent\d*', + r'p_?[ivx]+', + r'p_?\d+', + r'page_?[ivx]+', + r'page_?\d+', + r'page_top_padding', + r'pagebreak', r'pgepubid\d*', + r'right', + r'section', + r'spaceabove', + r'squeeze(\d+)?', + r'stickupcaps', + r'title', ] for tag in soup.descendants: - try: - tag['class'] - except (TypeError, KeyError): - pass - else: + if not isinstance(tag, bs4.element.Tag): + continue + + if tag.get('class'): if isinstance(tag['class'], str): tag['class'] = tag['class'].split() + else: + tag['class'] = list(tag['class']) + # Intentional list() duplicate so we can remove from original. for cls in list(tag['class']): if any(re.match(pattern, cls) for pattern in PATTERNS): tag['class'].remove(cls) @@ -157,14 +209,9 @@ def remove_calibre_classes(soup): if len(tag['class']) == 0 or tag['class'][0] == '': del tag['class'] - try: - tag['id'] - except (TypeError, KeyError): - pass - else: + if tag.get('id'): if any(re.match(pattern, tag['id']) for pattern in PATTERNS): del tag['id'] - continue @soup_cleaner def remove_header_italic_bold(soup): @@ -173,19 +220,32 @@ def remove_header_italic_bold(soup): children = list(header.children) if len(children) > 1: continue + if len(children) == 0: + header.extract() + continue child = children[0] if isinstance(child, str): continue - if child.name in ['i', 'b']: + if child.name in ['i', 'b', 'em', 'strong']: raise_children_and_delete(child) @soup_cleaner def remove_useless_divs(soup): divs = soup.find_all('div') for div in divs: - if not div.attrs: - if all(isinstance(child, bs4.element.Tag) or child.isspace() for child in div.contents): - raise_children_and_delete(div) + if div.attrs: + continue + if all(isinstance(child, bs4.element.Tag) or child.isspace() for child in div.contents): + raise_children_and_delete(div) + +@soup_cleaner +def remove_useless_blockquote(soup): + blocks = soup.find_all('blockquote') + for block in blocks: + if block.attrs: + continue + if all(child.name == 'blockquote' or (isinstance(child, bs4.element.NavigableString) and child.isspace()) for child in block.contents): + raise_children_and_delete(block) @soup_cleaner def remove_useless_spans(soup): @@ -195,9 +255,29 @@ def remove_useless_spans(soup): continue raise_children_and_delete(span) +@soup_cleaner +def remove_useless_atags(soup): + atags = soup.find_all('a') + for atag in atags: + if atag.attrs: + continue + raise_children_and_delete(atag) + +@soup_cleaner +def remove_useless_meta(soup): + selectors = [ + 'link[type="application/vnd.adobe-page-template+xml"]', + 'meta[http-equiv="Content-Type"]', + 'meta[name="Adept.expected.resource"]', + 'meta[name="Adept.resource"]', + ] + for selector in selectors: + for item in soup.select(selector): + item.extract() + @soup_cleaner def remove_nested_italic(soup): - elements = [element for tag in ['b', 'i'] for element in soup.find_all(tag)] + elements = [element for tag in ['b', 'i', 'em', 'strong'] for element in soup.find_all(tag)] for element in elements: if element.parent.name == element.name: raise_children_and_delete(element) @@ -223,22 +303,45 @@ def replace_italic_bold_span(soup): b.insert(0, span) @soup_cleaner -def replace_pblock_blockquote(soup): - classes = ['block', 'block1', 'blockquote'] - ptags = set(ptag for cls in classes for ptag in soup.find_all('p', {'class': cls})) - for ptag in ptags: - if isinstance(ptag['class'], str): - span['class'] = span['class'].split() - if len(ptag['class']) == 1: - ptag.name = 'blockquote' - ptag['class'] = [] +def replace_classes_real_tags(soup): + replace = { + 'div.block': 'blockquote', + 'div.block1': 'blockquote', + 'div.block2': 'blockquote', + 'div.block3': 'blockquote', + 'div.blockquote': 'blockquote', + 'div.center': 'center', + 'div.center1': 'center', + 'p.block': 'blockquote', + 'p.block1': 'blockquote', + 'p.block2': 'blockquote', + 'p.block3': 'blockquote', + 'p.blockquote': 'blockquote', + 'p.center': 'center', + 'p.center1': 'center', + 'span.b': 'b', + 'span.i': 'i', + 'span.italic': 'i', + 'span.small': 'small', + 'span.small1': 'small', + 'span.smallcaps': 'small', + 'span.underline': 'u', + } + for (selector, new_name) in replace.items(): + for tag in soup.select(selector): + if isinstance(tag['class'], str): + tag['class'] = tag['class'].strip().split() + if len(tag['class']) == 1: + tag.name = new_name + tag['class'] = [] @soup_cleaner -def strip_ptag_whitespace(soup): - ps = soup.find_all('p') + soup.find_all('blockquote') +def strip_unecessary_whitespace(soup): + tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + elements = [element for tag in tags for element in soup.find_all(tag)] - for p in ps: - descendants = list(p.descendants) + for element in elements: + descendants = list(element.descendants) while descendants and not isinstance(descendants[0], bs4.element.NavigableString): if descendants[0].name == 'br': descendants[0].decompose()