This commit is contained in:
Ethan Dalool 2019-09-08 21:20:32 -07:00
parent 0bd5315e0a
commit 45ba419f16

View file

@ -37,29 +37,35 @@ def remove_class(element, cls):
@html_cleaner @html_cleaner
def remove_unwanted_stylesheets(html):
    '''
    Strip two specific pieces of boilerplate styling from the HTML text.

    Removes the <style> block whose only rule is the 5pt @page margin, and
    the inline style attribute that zeroes all four margins and centers the
    text. Everything else is left untouched.

    html: the document as a string.
    Returns the cleaned string.
    '''
    # The dots and braces are escaped so that only the literal rule text
    # matches; whitespace around the rule may vary.
    html = re.sub(
        r'<style type="text/css">\s*@page \{ margin-bottom: 5\.000000pt; margin-top: 5\.000000pt; \}\s*</style>',
        '',
        html,
    )
    # This attribute is a fixed string with no variable parts, so a plain
    # replace is sufficient.
    html = html.replace(
        'style="margin-top: 0px; margin-left: 0px; margin-right: 0px; margin-bottom: 0px; text-align: center;"',
        '',
    )
    return html
@html_cleaner @html_cleaner
def merge_neighboring_sametag(html):
    '''
    Join runs of the same inline formatting tag into a single element.

    For each of i, b, em, strong, u and small:
    - "</x><x>" is removed outright,
    - "</x>", whitespace, "<x>" collapses to a single space,
    - "</x>", a <br> (optionally surrounded by whitespace), "<x>" collapses
      to "<br/>", leaving the line break inside the merged element.

    Only attribute-less opening tags are matched.

    html: the document as a string.
    Returns the cleaned string.
    '''
    tags = ['i', 'b', 'em', 'strong', 'u', 'small']
    for tag in tags:
        closer = '</%s>' % tag
        opener = '<%s>' % tag
        # Directly adjacent close/open pairs contribute nothing.
        html = html.replace(closer + opener, '')
        # \s* (not \s+) is kept on purpose: the replace above can leave a
        # freshly adjacent pair behind, e.g. in "</i></i><i><i>".
        html = re.sub(r'%s\s*%s' % (closer, opener), ' ', html)
        html = re.sub(r'%s\s*<br/?>\s*%s' % (closer, opener), '<br/>', html)
    return html
@html_cleaner @html_cleaner
def bring_punctuation_into_italics(html):
    '''
    Move punctuation that touches an emphasis tag to the inside of the tag.

    For each of i, b, em and strong, a punctuation mark immediately before
    the opening tag or immediately after the closing tag is moved to the
    inside of the element, e.g. "<i>word</i>." becomes "<i>word.</i>".

    html: the document as a string.
    Returns the cleaned string.
    '''
    # NOTE(review): the fourth entry is an empty string in the text this was
    # recovered from, which made the pattern end in a lone backslash and
    # raise re.error. It is presumably a non-ASCII dash lost in transit; an
    # em dash is assumed here -- confirm against the repository.
    puncts = ['.', ',', '-', '\u2014']
    for tag in ['i', 'b', 'em', 'strong']:
        opener = '<%s>' % tag
        closer = '</%s>' % tag
        for punct in puncts:
            # Both sides are literal text, so plain replacement is used
            # rather than building a regex by hand-escaping the mark.
            html = html.replace(punct + opener, opener + punct)
            html = html.replace(closer + punct, punct + closer)
    return html
@html_cleaner
def remove_misc_strings(html):
    '''
    Delete a few attribute strings from the HTML text.

    Removes every occurrence of epub:type="pagebreak", and every title
    attribute whose value is made up solely of the letters i, v, x or
    solely of digits. Only the attribute text itself is removed; the
    surrounding whitespace stays.

    html: the document as a string.
    Returns the cleaned string.
    '''
    html = html.replace('epub:type="pagebreak"', '')
    # Titles consisting only of the letters i / v / x.
    html = re.sub(r'title="[ivx]+"', '', html)
    # Titles consisting only of digits.
    html = re.sub(r'title="\d+"', '', html)
    return html
@html_cleaner @html_cleaner
def remove_space_around_br(html): def remove_space_around_br(html):
html = re.sub(r'\s*<br/?>\s*', '<br/>', html) html = re.sub(r'\s*<br/?>\s*', '<br/>', html)
@ -71,6 +77,14 @@ def replace_smart_quotes(html):
html = re.sub(r'||ʹ', "'", html) html = re.sub(r'||ʹ', "'", html)
return html return html
@html_cleaner
def remove_empty_attributes(html):
    '''
    Delete alt, class, id and title attributes whose value is empty or
    whitespace-only.

    Only the attribute text itself is removed; the surrounding whitespace
    stays.

    html: the document as a string.
    Returns the cleaned string.
    '''
    # The lookbehind requires the attribute name to start a fresh token, so
    # that names merely ending in one of these words (data-id, subtitle,
    # xml:id, ...) are not cut in half.
    for attribute in ['alt', 'class', 'id', 'title']:
        html = re.sub(r'(?<![\w:-])%s="\s*"' % attribute, '', html)
    return html
@html_cleaner @html_cleaner
def remove_empty_elements(html): def remove_empty_elements(html):
html = re.sub(r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|<br/?>)*</\1>', '', html) html = re.sub(r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|<br/?>)*</\1>', '', html)
@ -110,6 +124,16 @@ def inject_footnotes(soup):
footnote_link.decompose() footnote_link.decompose()
remove_class(footnote, 'gcufootnote_content') remove_class(footnote, 'gcufootnote_content')
@soup_cleaner
def center_images(soup):
    '''
    Put images inside <center> elements.

    - An <img> that is a direct child of <body> gets wrapped in a newly
      created <center>.
    - An <img> whose parent is a <div> or <p> with no attributes has that
      parent renamed to <center>.
    - Any other <img> is left alone.

    soup: the parsed document; modified in place.
    '''
    for img in soup.find_all('img'):
        parent = img.parent
        if parent.name == 'body':
            center = soup.new_tag('center')
            # Place the wrapper where the image currently sits, then move
            # the image into it.
            img.insert_before(center)
            center.append(img)
        elif parent.name in ['div', 'p'] and not parent.attrs:
            parent.name = 'center'
@soup_cleaner @soup_cleaner
def convert_textdivs_p(soup): def convert_textdivs_p(soup):
divs = soup.find_all('div') divs = soup.find_all('div')
@ -133,23 +157,51 @@ def remove_empty_paragraphs(soup):
br_parent.decompose() br_parent.decompose()
@soup_cleaner @soup_cleaner
def remove_unwanted_classes_ids(soup):
    '''
    Drop class names and ids that match a list of known-unwanted patterns.

    For every tag in the document:
    - each class name matching one of PATTERNS is removed, and the class
      attribute is deleted entirely if nothing useful remains;
    - the id attribute is deleted if it matches one of PATTERNS.

    A name must match a pattern in full. Prefix matching would make the
    optional digit suffixes in the patterns meaningless and would also
    strip unrelated names such as "titlepage" or "rightcolumn".

    soup: the parsed document; modified in place.
    '''
    PATTERNS = [
        r'big\d+',
        r'blnonindent\d*',
        r'c\d+',
        r'calibre_?\d*',
        r'calibre_pb_\d+',
        r'chapter',
        r'div\d+',
        r'dropcaps',
        r'filepos\d*',
        r'font',
        r'hanging',
        r'indent\d*',
        r'initial\d*',
        r'initialcaps',
        r'large',
        r'mbp_?pagebreak',
        r'morespaceabove',
        r'nonindent\d*',
        r'p_?[ivx]+',
        r'p_?\d+',
        r'page_?[ivx]+',
        r'page_?\d+',
        r'page_top_padding',
        r'pagebreak',
        r'pgepubid\d*',
        r'right',
        r'section',
        r'spaceabove',
        r'squeeze(\d+)?',
        r'stickupcaps',
        r'title',
    ]
    # One alternation compiled once, instead of re-scanning the list of
    # patterns for every class of every tag.
    unwanted = re.compile('|'.join('(?:%s)' % pattern for pattern in PATTERNS))

    for tag in soup.descendants:
        # Strings, comments etc. carry no attributes.
        if not isinstance(tag, bs4.element.Tag):
            continue

        classes = tag.get('class')
        if classes:
            # The class attribute may be a single string or a list of names.
            if isinstance(classes, str):
                classes = classes.split()
            kept = [cls for cls in classes if not unwanted.fullmatch(cls)]
            if len(kept) == 0 or kept[0] == '':
                del tag['class']
            else:
                tag['class'] = kept

        tag_id = tag.get('id')
        if tag_id and unwanted.fullmatch(tag_id):
            del tag['id']
@soup_cleaner @soup_cleaner
def remove_header_italic_bold(soup): def remove_header_italic_bold(soup):
@ -173,19 +220,32 @@ def remove_header_italic_bold(soup):
children = list(header.children) children = list(header.children)
if len(children) > 1: if len(children) > 1:
continue continue
if len(children) == 0:
header.extract()
continue
child = children[0] child = children[0]
if isinstance(child, str): if isinstance(child, str):
continue continue
if child.name in ['i', 'b']: if child.name in ['i', 'b', 'em', 'strong']:
raise_children_and_delete(child) raise_children_and_delete(child)
@soup_cleaner @soup_cleaner
def remove_useless_divs(soup):
    '''
    Unwrap <div> elements that add nothing to the document.

    A div is unwrapped (via raise_children_and_delete) when it has no
    attributes and every direct child is either a tag or a whitespace-only
    string. A div with no children at all also qualifies.

    soup: the parsed document; modified in place.
    '''
    divs = soup.find_all('div')
    for div in divs:
        if div.attrs:
            continue
        only_tags_and_whitespace = all(
            isinstance(child, bs4.element.Tag) or child.isspace()
            for child in div.contents
        )
        if only_tags_and_whitespace:
            raise_children_and_delete(div)
@soup_cleaner
def remove_useless_blockquote(soup):
    '''
    Unwrap <blockquote> elements that only wrap other blockquotes.

    A blockquote is unwrapped (via raise_children_and_delete) when it has
    no attributes and every direct child is either another <blockquote> or
    a whitespace-only string. A blockquote with no children at all also
    qualifies.

    soup: the parsed document; modified in place.
    '''
    def is_filler(child):
        # The type is checked before .name is read, so strings are never
        # asked for a tag name.
        if isinstance(child, bs4.element.Tag):
            return child.name == 'blockquote'
        return isinstance(child, bs4.element.NavigableString) and child.isspace()

    blocks = soup.find_all('blockquote')
    for block in blocks:
        if block.attrs:
            continue
        if all(is_filler(child) for child in block.contents):
            raise_children_and_delete(block)
@soup_cleaner @soup_cleaner
def remove_useless_spans(soup): def remove_useless_spans(soup):
@ -195,9 +255,29 @@ def remove_useless_spans(soup):
continue continue
raise_children_and_delete(span) raise_children_and_delete(span)
@soup_cleaner
def remove_useless_atags(soup):
    '''
    Unwrap <a> elements that have no attributes at all.

    Each such element is passed to raise_children_and_delete; <a> elements
    with any attribute are left alone.

    soup: the parsed document; modified in place.
    '''
    atags = soup.find_all('a')
    for atag in atags:
        if atag.attrs:
            continue
        raise_children_and_delete(atag)
@soup_cleaner
def remove_useless_meta(soup):
    '''
    Remove specific <link> and <meta> elements from the document.

    Every element matching one of the CSS selectors below is extracted
    from the tree.

    soup: the parsed document; modified in place.
    '''
    selectors = [
        'link[type="application/vnd.adobe-page-template+xml"]',
        'meta[http-equiv="Content-Type"]',
        'meta[name="Adept.expected.resource"]',
        'meta[name="Adept.resource"]',
    ]
    for selector in selectors:
        for item in soup.select(selector):
            item.extract()
@soup_cleaner @soup_cleaner
def remove_nested_italic(soup): def remove_nested_italic(soup):
elements = [element for tag in ['b', 'i'] for element in soup.find_all(tag)] elements = [element for tag in ['b', 'i', 'em', 'strong'] for element in soup.find_all(tag)]
for element in elements: for element in elements:
if element.parent.name == element.name: if element.parent.name == element.name:
raise_children_and_delete(element) raise_children_and_delete(element)
@ -223,22 +303,45 @@ def replace_italic_bold_span(soup):
b.insert(0, span) b.insert(0, span)
@soup_cleaner @soup_cleaner
def replace_classes_real_tags(soup):
    '''
    Turn elements styled purely through a class name into the real HTML
    tag that the class stands for.

    Each key of the table is a "tag.class" CSS selector and each value is
    the tag name to switch to. An element is only converted when the
    matched class is its sole class; it is then renamed and its class list
    is emptied. Elements carrying additional classes are left unchanged.

    soup: the parsed document; modified in place.
    '''
    replace = {
        'div.block': 'blockquote',
        'div.block1': 'blockquote',
        'div.block2': 'blockquote',
        'div.block3': 'blockquote',
        'div.blockquote': 'blockquote',
        'div.center': 'center',
        'div.center1': 'center',
        'p.block': 'blockquote',
        'p.block1': 'blockquote',
        'p.block2': 'blockquote',
        'p.block3': 'blockquote',
        'p.blockquote': 'blockquote',
        'p.center': 'center',
        'p.center1': 'center',
        'span.b': 'b',
        'span.i': 'i',
        'span.italic': 'i',
        'span.small': 'small',
        'span.small1': 'small',
        'span.smallcaps': 'small',
        'span.underline': 'u',
    }
    for (selector, new_name) in replace.items():
        for tag in soup.select(selector):
            # The class attribute may be a single string or a list of names.
            if isinstance(tag['class'], str):
                tag['class'] = tag['class'].strip().split()
            if len(tag['class']) == 1:
                tag.name = new_name
                # The attribute is emptied rather than deleted, so the tag
                # still reports a (blank) class attribute afterwards.
                tag['class'] = []
@soup_cleaner @soup_cleaner
def strip_ptag_whitespace(soup): def strip_unecessary_whitespace(soup):
ps = soup.find_all('p') + soup.find_all('blockquote') tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
elements = [element for tag in tags for element in soup.find_all(tag)]
for p in ps: for element in elements:
descendants = list(p.descendants) descendants = list(element.descendants)
while descendants and not isinstance(descendants[0], bs4.element.NavigableString): while descendants and not isinstance(descendants[0], bs4.element.NavigableString):
if descendants[0].name == 'br': if descendants[0].name == 'br':
descendants[0].decompose() descendants[0].decompose()