From 45ba419f16cb5978a849a9ae7eb7d3790d16843e Mon Sep 17 00:00:00 2001
From: Ethan Dalool <ethan@voussoir.net>
Date: Sun, 8 Sep 2019 21:20:32 -0700
Subject: [PATCH] Update: add more cleaners and widen tag/class coverage.

---
 cleanerupper/plugin.py | 185 ++++++++++++++++++++++++++++++++---------
 1 file changed, 144 insertions(+), 41 deletions(-)
diff --git a/cleanerupper/plugin.py b/cleanerupper/plugin.py
index 87f7d6a..6327baf 100644
--- a/cleanerupper/plugin.py
+++ b/cleanerupper/plugin.py
@@ -37,29 +37,35 @@ def remove_class(element, cls):
 
 @html_cleaner
 def remove_unwanted_stylesheets(html):
-    html = re.sub(r'<style type="text/css">\s*@page { margin-bottom: 5.000000pt; margin-top: 5.000000pt; }\s*</style>', '', html)
+    html = re.sub(r'<style type="text/css">\s*@page { margin-bottom: 5\.000000pt; margin-top: 5\.000000pt; }\s*</style>', '', html)
+    html = re.sub(r'style="margin-top: 0px; margin-left: 0px; margin-right: 0px; margin-bottom: 0px; text-align: center;"', '', html)
     return html
 
 @html_cleaner
 def merge_neighboring_sametag(html):
-    html = re.sub(r'</i><i>', '', html)
-    html = re.sub(r'</i>\s*<i>', ' ', html)
+    tags = ['i', 'b', 'em', 'strong', 'u', 'small']
+    for tag in tags:
+        html = re.sub(r'</%s><%s>' % (tag, tag), '', html)
+        html = re.sub(r'</%s>\s*<%s>' % (tag, tag), ' ', html)
+        html = re.sub(r'</%s>\s*<br/?>\s*<%s>' % (tag, tag), '<br/>', html)
 
-    html = re.sub(r'</b><b>', '', html)
-    html = re.sub(r'</b>\s*<b>', ' ', html)
-
-    html = re.sub(r'</small><small>', '', html)
-    html = re.sub(r'</small>\s*<small>', ' ', html)
     return html
 
 @html_cleaner
 def bring_punctuation_into_italics(html):
-    for tag in ['i', 'b']:
+    for tag in ['i', 'b', 'em', 'strong']:
         for punct in ['.', ',', '-', '—']:
             html = re.sub('\\{punct}<{tag}>'.format(**locals()), '<{tag}>{punct}'.format(**locals()), html)
             html = re.sub('</{tag}>\\{punct}'.format(**locals()), '{punct}</{tag}>'.format(**locals()), html)
     return html
 
+@html_cleaner
+def remove_misc_strings(html):
+    # Strip epub pagebreak markers and title attributes holding only i/v/x letters or digits.
+    for pattern in [r'epub:type="pagebreak"', r'title="[ivx]+"', r'title="\d+"']:
+        html = re.sub(pattern, '', html)
+    return html
+
 @html_cleaner
 def remove_space_around_br(html):
     html = re.sub(r'\s*<br/?>\s*', '<br/>', html)
@@ -71,6 +77,14 @@ def replace_smart_quotes(html):
     html = re.sub(r'‘|’|ʹ', "'", html)
     return html
 
+@html_cleaner
+def remove_empty_attributes(html):
+    # Remove alt/class/id/title attributes whose value is empty or whitespace.
+    # The lookbehind keeps names like data-id="" or subtitle="" from being clipped.
+    for attr in ['alt', 'class', 'id', 'title']:
+        html = re.sub(r'(?<![\w:-])%s="\s*"' % attr, '', html)
+    return html
+
 @html_cleaner
 def remove_empty_elements(html):
     html = re.sub(r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|<br/?>)*</\1>', '', html)
@@ -110,6 +124,16 @@ def inject_footnotes(soup):
         footnote_link.decompose()
         remove_class(footnote, 'gcufootnote_content')
 
+@soup_cleaner
+def center_images(soup):
+    # Wrap <body>-level images in <center>; rename an attribute-less <div>/<p> parent to <center>.
+    for image in soup.find_all('img'):
+        holder = image.parent
+        if holder.name == 'body':
+            image.wrap(soup.new_tag('center'))
+        elif holder.name in ['div', 'p'] and not holder.attrs:
+            holder.name = 'center'
+
 @soup_cleaner
 def convert_textdivs_p(soup):
     divs = soup.find_all('div')
@@ -133,23 +157,51 @@ def remove_empty_paragraphs(soup):
             br_parent.decompose()
 
 @soup_cleaner
-def remove_calibre_classes(soup):
+def remove_unwanted_classes_ids(soup):
     PATTERNS = [
-        r'calibre\d*',
-        r'mbppagebreak',
+        r'big\d+',
+        r'blnonindent\d*',
+        r'c\d+',
+        r'calibre_?\d*',
         r'calibre_pb_\d+',
+        r'chapter',
+        r'div\d+',
+        r'dropcaps',
         r'filepos\d*',
+        r'font',
+        r'hanging',
+        r'indent\d*',
+        r'initial\d*',
+        r'initialcaps',
+        r'large',
+        r'mbp_?pagebreak',
+        r'morespaceabove',
+        r'nonindent\d*',
+        r'p_?[ivx]+',
+        r'p_?\d+',
+        r'page_?[ivx]+',
+        r'page_?\d+',
+        r'page_top_padding',
+        r'pagebreak',
         r'pgepubid\d*',
+        r'right',
+        r'section',
+        r'spaceabove',
+        r'squeeze(\d+)?',
+        r'stickupcaps',
+        r'title',
     ]
     for tag in soup.descendants:
-        try:
-            tag['class']
-        except (TypeError, KeyError):
-            pass
-        else:
+        if not isinstance(tag, bs4.element.Tag):
+            continue
+
+        if tag.get('class'):
             if isinstance(tag['class'], str):
                 tag['class'] = tag['class'].split()
+            else:
+                tag['class'] = list(tag['class'])
 
+            # Intentional list() duplicate so we can remove from original.
             for cls in list(tag['class']):
                 if any(re.match(pattern, cls) for pattern in PATTERNS):
                     tag['class'].remove(cls)
@@ -157,14 +209,9 @@ def remove_calibre_classes(soup):
             if len(tag['class']) == 0 or tag['class'][0] == '':
                 del tag['class']
 
-        try:
-            tag['id']
-        except (TypeError, KeyError):
-            pass
-        else:
+        if tag.get('id'):
             if any(re.match(pattern, tag['id']) for pattern in PATTERNS):
                 del tag['id']
-                continue
 
 @soup_cleaner
 def remove_header_italic_bold(soup):
@@ -173,19 +220,32 @@ def remove_header_italic_bold(soup):
         children = list(header.children)
         if len(children) > 1:
             continue
+        if len(children) == 0:
+            header.extract()
+            continue
         child = children[0]
         if isinstance(child, str):
             continue
-        if child.name in ['i', 'b']:
+        if child.name in ['i', 'b', 'em', 'strong']:
             raise_children_and_delete(child)
 
 @soup_cleaner
 def remove_useless_divs(soup):
     divs = soup.find_all('div')
     for div in divs:
-        if not div.attrs:
-            if all(isinstance(child, bs4.element.Tag) or child.isspace() for child in div.contents):
-                raise_children_and_delete(div)
+        if div.attrs:
+            continue
+        if all(isinstance(child, bs4.element.Tag) or child.isspace() for child in div.contents):
+            raise_children_and_delete(div)
+
+@soup_cleaner
+def remove_useless_blockquote(soup):
+    # Attribute-less blockquotes holding only blockquotes/whitespace go to raise_children_and_delete.
+    def is_filler(node):
+        return node.name == 'blockquote' or (isinstance(node, bs4.element.NavigableString) and node.isspace())
+    for quote in soup.find_all('blockquote'):
+        if not quote.attrs and all(is_filler(node) for node in quote.contents):
+            raise_children_and_delete(quote)
 
 @soup_cleaner
 def remove_useless_spans(soup):
@@ -195,9 +255,29 @@ def remove_useless_spans(soup):
             continue
         raise_children_and_delete(span)
 
+@soup_cleaner
+def remove_useless_atags(soup):
+    # Every <a> that carries no attributes at all is handed to
+    # raise_children_and_delete; anchors with any attribute are left alone.
+    for anchor in soup.find_all('a'):
+        if not anchor.attrs:
+            raise_children_and_delete(anchor)
+
+@soup_cleaner
+def remove_useless_meta(soup):
+    # Detach every <link>/<meta> element matched by these CSS selectors from the tree.
+    unwanted = (
+        'link[type="application/vnd.adobe-page-template+xml"]',
+        'meta[http-equiv="Content-Type"]',
+        'meta[name="Adept.expected.resource"]',
+        'meta[name="Adept.resource"]',
+    )
+    for element in (found for css in unwanted for found in soup.select(css)):
+        element.extract()
+
 @soup_cleaner
 def remove_nested_italic(soup):
-    elements = [element for tag in ['b', 'i'] for element in soup.find_all(tag)]
+    elements = [element for tag in ['b', 'i', 'em', 'strong'] for element in soup.find_all(tag)]
     for element in elements:
         if element.parent.name == element.name:
             raise_children_and_delete(element)
@@ -223,22 +303,45 @@ def replace_italic_bold_span(soup):
             b.insert(0, span)
 
 @soup_cleaner
-def replace_pblock_blockquote(soup):
-    classes = ['block', 'block1', 'blockquote']
-    ptags = set(ptag for cls in classes for ptag in soup.find_all('p', {'class': cls}))
-    for ptag in ptags:
-        if isinstance(ptag['class'], str):
-            span['class'] = span['class'].split()
-        if len(ptag['class']) == 1:
-            ptag.name = 'blockquote'
-            ptag['class'] = []
+def replace_classes_real_tags(soup):
+    replace = {  # CSS selector -> tag name that a matching element is renamed to.
+        'div.block': 'blockquote',
+        'div.block1': 'blockquote',
+        'div.block2': 'blockquote',
+        'div.block3': 'blockquote',
+        'div.blockquote': 'blockquote',
+        'div.center': 'center',
+        'div.center1': 'center',
+        'p.block': 'blockquote',
+        'p.block1': 'blockquote',
+        'p.block2': 'blockquote',
+        'p.block3': 'blockquote',
+        'p.blockquote': 'blockquote',
+        'p.center': 'center',
+        'p.center1': 'center',
+        'span.b': 'b',
+        'span.i': 'i',
+        'span.italic': 'i',
+        'span.small': 'small',
+        'span.small1': 'small',
+        'span.smallcaps': 'small',
+        'span.underline': 'u',
+    }
+    for (selector, new_name) in replace.items():
+        for tag in soup.select(selector):
+            if isinstance(tag['class'], str):  # Normalize a plain-string class to a list.
+                tag['class'] = tag['class'].strip().split()
+            if len(tag['class']) == 1:  # Rename only when the matched class is the sole class.
+                tag.name = new_name
+                del tag['class']  # Remove the attribute; an empty list would keep tag.attrs truthy.
 
 @soup_cleaner
-def strip_ptag_whitespace(soup):
-    ps = soup.find_all('p') + soup.find_all('blockquote')
+def strip_unecessary_whitespace(soup):
+    tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+    elements = [element for tag in tags for element in soup.find_all(tag)]
 
-    for p in ps:
-        descendants = list(p.descendants)
+    for element in elements:
+        descendants = list(element.descendants)
         while descendants and not isinstance(descendants[0], bs4.element.NavigableString):
             if descendants[0].name == 'br':
                 descendants[0].decompose()