2019-08-03 07:24:52 +00:00
|
|
|
|
import re
|
|
|
|
|
import sys
|
|
|
|
|
import bs4
|
|
|
|
|
import os
|
|
|
|
|
|
|
|
|
|
# Registries of cleaning passes. They are filled in by the @html_cleaner
# and @soup_cleaner decorators below, and cleanup_page applies them in
# registration order.
html_cleaners = []

soup_cleaners = []

# Footnote <blockquote> bodies collected by collect_footnotes, keyed by the
# id text found between [brackets]; consumed by inject_footnotes, which
# moves each body next to its reference.
global_footnotes = {}
|
|
|
|
|
|
|
|
|
|
def html_cleaner(function):
    '''
    Decorator: register `function` as an HTML-string cleaning pass.

    The function is appended to the module-level `html_cleaners` list and
    returned, so the decorated name keeps referring to the function.
    (Previously the decorator returned None, which rebound every decorated
    module-level name to None.)
    '''
    html_cleaners.append(function)
    return function
|
|
|
|
|
|
|
|
|
|
def soup_cleaner(function):
    '''
    Decorator: register `function` as a BeautifulSoup cleaning pass.

    The function is appended to the module-level `soup_cleaners` list and
    returned, so the decorated name keeps referring to the function.
    (Previously the decorator returned None, which rebound every decorated
    module-level name to None.)
    '''
    soup_cleaners.append(function)
    return function
|
|
|
|
|
|
|
|
|
|
def raise_children_and_delete(element):
    '''
    Unwrap `element`: move all of its children to sit directly after it,
    preserving their original order, then destroy the (now empty) element.
    '''
    # Inserting in reverse keeps the children in document order, since each
    # insert_after places the child immediately after `element`.
    for child in reversed(list(element.children)):
        element.insert_after(child)
    element.decompose()
|
|
|
|
|
|
2019-12-29 21:03:44 +00:00
|
|
|
|
def contains_class(element, cls):
    '''
    Return True if `cls` appears in the element's class attribute.

    Elements without a class attribute (or that don't support item access
    with a string key) yield False. Side effect: a string-valued class
    attribute is normalized into a list of class names.
    '''
    try:
        classes = element['class']
    except (AttributeError, KeyError):
        return False

    if isinstance(classes, str):
        element['class'] = classes.split()

    return cls in element['class']
|
|
|
|
|
|
2019-08-03 07:24:52 +00:00
|
|
|
|
def remove_class(element, cls):
    '''
    Remove `cls` from the element's class attribute; delete the attribute
    entirely when it becomes empty. Does nothing if the element doesn't
    carry the class. Relies on contains_class to normalize the attribute
    into a list.
    '''
    if not contains_class(element, cls):
        return

    try:
        element['class'].remove(cls)
    except ValueError:
        # BUGFIX: list.remove raises ValueError, not IndexError, when the
        # value is absent; the old IndexError handler could never fire.
        # (Unreachable in practice thanks to the contains_class guard.)
        pass

    if len(element['class']) == 0:
        del element['class']
|
|
|
|
|
|
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_unwanted_stylesheets(html):
    '''
    Strip specific stylesheet junk emitted by some ebook converters: the
    5pt page-margin <style> block and the zero-margin centering style
    attribute.
    '''
    unwanted = (
        r'<style type="text/css">\s*@page { margin-bottom: 5\.000000pt; margin-top: 5\.000000pt; }\s*</style>',
        r'style="margin-top: 0px; margin-left: 0px; margin-right: 0px; margin-bottom: 0px; text-align: center;"',
    )
    for pattern in unwanted:
        html = re.sub(pattern, '', html)
    return html
|
|
|
|
|
|
|
|
|
|
@html_cleaner
|
|
|
|
|
def merge_neighboring_sametag(html):
    '''
    Collapse adjacent identical inline-formatting tags: '</i><i>' is
    dropped, '</i> <i>' becomes a single space, and '</i><br/><i>' keeps
    just the line break.
    '''
    for tag in ('i', 'b', 'em', 'strong', 'u', 'small'):
        pair = (tag, tag)
        html = re.sub(r'</%s><%s>' % pair, '', html)
        html = re.sub(r'</%s>\s*<%s>' % pair, ' ', html)
        html = re.sub(r'</%s>\s*<br/?>\s*<%s>' % pair, '<br/>', html)
    return html
|
|
|
|
|
|
|
|
|
|
@html_cleaner
|
|
|
|
|
def bring_punctuation_into_italics(html):
    '''
    Move punctuation that borders an inline emphasis tag inside the tag:
    '.<i>' becomes '<i>.' and '</i>.' becomes '.</i>'.
    '''
    for tag in ('i', 'b', 'em', 'strong'):
        for punct in ('.', ',', '-', '—'):
            html = re.sub('\\%s<%s>' % (punct, tag), '<%s>%s' % (tag, punct), html)
            html = re.sub('</%s>\\%s' % (tag, punct), '%s</%s>' % (punct, tag), html)
    return html
|
|
|
|
|
|
2020-01-08 02:23:44 +00:00
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_header_br(html):
    '''
    Join heading text that was split across a <br/> inside <h1>..<h6>
    into a single space-separated line, e.g. '<h2>A<br/>B</h2>' becomes
    '<h2>A B</h2>'.
    '''
    # One loop replaces six copy-pasted, per-level substitutions; the
    # sequential h1..h6 passes are preserved exactly.
    for level in range(1, 7):
        pattern = r'<h%d>([^\n]+?)\s*<br/>\s*([^\n]+?)</h%d>' % (level, level)
        html = re.sub(pattern, r'<h%d>\1 \2</h%d>' % (level, level), html)
    return html
|
|
|
|
|
|
2019-09-09 04:20:32 +00:00
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_misc_strings(html):
    '''
    Delete assorted converter residue: epub pagebreak markers, Body
    comment sentinels, and page-number title attributes (roman or
    arabic numerals).
    '''
    for literal in ('epub:type="pagebreak"', '<!-- BodyStart-->', '<!-- BodyEnd-->'):
        html = html.replace(literal, '')
    for pattern in (r'title="[ivx]+"', r'title="\d+"'):
        html = re.sub(pattern, '', html)
    return html
|
|
|
|
|
|
2019-08-03 07:24:52 +00:00
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_space_around_br(html):
    '''Normalize every <br>/<br/> to '<br/>' and trim surrounding whitespace.'''
    return re.sub(r'\s*<br/?>\s*', '<br/>', html)
|
|
|
|
|
|
|
|
|
|
@html_cleaner
|
|
|
|
|
def replace_smart_quotes(html):
    '''Replace curly/typographic quote characters with plain ASCII quotes.'''
    # Single-character substitutions, so one translate pass is equivalent
    # to the alternation regexes.
    quote_table = str.maketrans({
        '”': '"',
        '“': '"',
        '‘': "'",
        '’': "'",
        'ʹ': "'",
        '`': "'",
    })
    return html.translate(quote_table)
|
|
|
|
|
|
2019-09-09 04:20:32 +00:00
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_empty_attributes(html):
    '''Delete alt/class/id/title attributes whose value is empty or whitespace.'''
    for attribute in ('alt', 'class', 'id', 'title'):
        html = re.sub(r'%s="\s*"' % attribute, '', html)
    return html
|
|
|
|
|
|
2019-08-03 07:24:52 +00:00
|
|
|
|
@html_cleaner
|
|
|
|
|
def remove_empty_elements(html):
    '''
    Remove elements whose entire content is whitespace, <br> tags, and/or
    space entities (&nbsp; and friends), e.g. '<p>&nbsp;</p>'.
    '''
    empty_element = r'(?s)<(\w+)>(&(nbsp|emsp|ensp|thinsp|#160);|\s|<br/?>)*</\1>'
    return re.sub(empty_element, '', html)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
2019-12-29 21:04:42 +00:00
|
|
|
|
def collect_footnotes(soup):
    '''
    Find gcufootnote_content blockquotes and stash them in the module's
    global_footnotes dict, keyed by the id between the leading [brackets],
    so inject_footnotes can later move each body next to its reference.
    Malformed footnotes are reported and skipped.
    '''
    for footnote in soup.find_all('blockquote', {'class': 'gcufootnote_content'}):
        try:
            first_string = next(footnote.stripped_strings)
        except StopIteration:
            print(footnote, 'is malformed. No string contents.')
            continue

        if not first_string.startswith('['):
            print(footnote, 'is malformed. Should start with [id].')
            continue

        footnote_id = first_string.split('[', 1)[-1].split(']', 1)[0]
        global_footnotes[footnote_id] = footnote
|
|
|
|
|
|
2019-12-29 21:04:42 +00:00
|
|
|
|
@soup_cleaner
|
|
|
|
|
def inject_footnotes(soup):
    '''
    Move each collected footnote body so it sits directly after the
    block-level element (<p>/<blockquote>/<div>) containing its [id]
    reference span, then unwrap the reference span itself.

    Nested footnotes are deferred: a link inside a footnote body is left
    alone so the parent body gets placed first, and the child is handled
    on the next cleanup iteration.
    '''
    footnote_links = soup.find_all('span', {'class': 'gcufootnote_link'})

    for footnote_link in reversed(footnote_links):
        if contains_class(footnote_link.parent, 'gcufootnote_content'):
            # In the case of nested footnotes, let's place the parent first
            # and come back for this child on the next go around.
            continue

        if len(footnote_link.contents) != 1:
            print(footnote_link, 'is malformed. Should just be >[id]<.')
            # BUGFIX: previously execution fell through here and indexed
            # contents[0] anyway, crashing on an empty span and mangling
            # spans with extra children. Skip like the other malformed cases.
            continue

        footnote_id = footnote_link.contents[0]

        # NOTE(review): assumes the single child is a string; a lone tag
        # child would fail startswith — confirm against real input.
        if not footnote_id.startswith('['):
            print(footnote_link, 'is malformed. Should start with [id].')
            continue

        footnote_id = footnote_id.split('[', 1)[-1].split(']', 1)[0]

        if footnote_id not in global_footnotes:
            # The body may live in a chapter not yet scanned; run() does a
            # second pass to pick these up.
            continue

        footnote = global_footnotes[footnote_id]

        # Climb to the nearest block-level ancestor; the footnote body is
        # placed immediately after it.
        parent = footnote_link.parent
        while parent and parent.name not in ['p', 'blockquote', 'div']:
            parent = parent.parent

        if parent is None:
            print(footnote_link, 'doesn\'t have a <p> or <blockquote> ancestor.')
            continue

        parent.insert_after(footnote)

        # Unwrap the link span, leaving the plain '[id]' text behind, and
        # demote the footnote body to an ordinary blockquote.
        footnote_link.insert_before(footnote_link.contents[0])
        footnote_link.decompose()
        remove_class(footnote, 'gcufootnote_content')
|
|
|
|
|
|
2019-09-09 04:20:32 +00:00
|
|
|
|
@soup_cleaner
|
|
|
|
|
def center_images(soup):
    '''
    Center images: wrap images that sit directly under <body> in a new
    <center> tag, and rename attribute-less <div>/<p> wrappers holding an
    image to <center>.
    '''
    for img in soup.find_all('img'):
        wrapper = img.parent
        if wrapper.name == 'body':
            center = soup.new_tag('center')
            img.insert_before(center)
            center.append(img)
        elif wrapper.name in ('div', 'p') and not wrapper.attrs:
            wrapper.name = 'center'
|
|
|
|
|
|
2019-08-03 07:24:52 +00:00
|
|
|
|
@soup_cleaner
|
|
|
|
|
def convert_textdivs_p(soup):
    '''
    Rename <div>s to <p> when every child is plain text or an inline
    formatting element (no block-level children).
    '''
    inline_tags = ('i', 'b', 'em', 'strong', 'a', 'span', 'small')

    def _is_texty(node):
        # Text nodes and inline formatting tags don't prevent conversion.
        if isinstance(node, bs4.element.NavigableString):
            return True
        return node.name in inline_tags

    for div in soup.find_all('div'):
        # An empty div also converts (vacuously all-inline), matching the
        # original behavior.
        if all(_is_texty(child) for child in list(div.children)):
            div.name = 'p'
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_body_br(soup):
    '''Delete <br> elements sitting directly under <body>.'''
    stray_brs = [br for br in soup.find_all('br') if br.parent.name == 'body']
    for br in stray_brs:
        br.decompose()
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_empty_paragraphs(soup):
    '''
    Remove elements whose contents are nothing but <br> tags (paragraphs
    used purely to force blank lines).
    '''
    candidates = {br.parent for br in soup.find_all('br')}
    for candidate in candidates:
        if all(child.name == 'br' for child in candidate.contents):
            candidate.decompose()
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
2019-09-09 04:20:32 +00:00
|
|
|
|
def remove_unwanted_classes_ids(soup):
    '''
    Strip presentational / converter-generated class names and ids from
    every tag in the document. A class matching any PATTERNS entry is
    removed (the attribute is deleted when emptied); an id matching any
    entry is deleted outright.

    Note: matching uses re.match, i.e. a *prefix* match — e.g. the
    'title' pattern also strips a class named 'titlepage'.
    '''
    # Regexes for class/id names produced by calibre, Kindle tooling, and
    # assorted epub converters.
    PATTERNS = [
        r'big\d+',
        r'blnonindent\d*',
        r'bodyMatter',
        r'c\d+',
        r'calibre_?\d*',
        r'calibre_pb_\d+',
        r'calibreclass\d*',
        r'chapter',
        r'div\d+',
        r'dropcaps',
        r'filepos\d*',
        r'font',
        r'hanging',
        r'indent\d*',
        r'initial\d*',
        r'initialcaps',
        r'large',
        r'mbp_?pagebreak',
        r'morespaceabove',
        r'noindent\d*',
        r'nonindent\d*',
        r'p_?[ivx]+',
        r'p_?\d+',
        r'page_?[ivx]+',
        r'page_?\d+',
        r'page_top_padding',
        r'pagebreak',
        r'para',
        r'pgepubid\d*',
        r'right',
        r'section',
        r'space[Bb]reak',
        r'spaceabove',
        r'squeeze(\d+)?',
        r'stickupcaps',
        r'title',
        r'xrefInternal',
    ]

    for tag in soup.descendants:
        # descendants yields text nodes too; only Tags carry attributes.
        if not isinstance(tag, bs4.element.Tag):
            continue

        if tag.get('class'):
            # Normalize the class attribute to a fresh mutable list.
            if isinstance(tag['class'], str):
                tag['class'] = tag['class'].split()
            else:
                tag['class'] = list(tag['class'])

            # Drop empty-string entries left by sloppy markup.
            try:
                tag['class'].remove('')
            except ValueError:
                pass

            # Intentional list() duplicate so we can remove from original.
            for cls in list(tag['class']):
                if any(re.match(pattern, cls) for pattern in PATTERNS):
                    tag['class'].remove(cls)

            # Delete the attribute entirely once nothing useful remains.
            if len(tag['class']) == 0 or tag['class'][0] == '':
                del tag['class']

        if tag.get('id'):
            if any(re.match(pattern, tag['id']) for pattern in PATTERNS):
                del tag['id']
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_header_italic_bold(soup):
    '''
    In headings (<h1>..<h6>) whose sole child is an emphasis element,
    unwrap that element — headings carry their own styling. Empty
    headings are removed outright; headings with multiple children are
    left alone.
    '''
    for level in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
        for header in soup.find_all(level):
            children = list(header.children)

            if not children:
                header.extract()
                continue

            if len(children) != 1:
                continue

            only_child = children[0]
            if isinstance(only_child, str):
                continue

            if only_child.name in ('i', 'b', 'em', 'strong'):
                raise_children_and_delete(only_child)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_useless_divs(soup):
    '''
    Unwrap attribute-less <div>s whose children are all tags or pure
    whitespace — such a div contributes no structure of its own.
    '''
    for div in soup.find_all('div'):
        if div.attrs:
            continue
        unwrappable = all(
            isinstance(child, bs4.element.Tag) or child.isspace()
            for child in div.contents
        )
        if unwrappable:
            raise_children_and_delete(div)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_useless_blockquote(soup):
    '''
    Unwrap attribute-less <blockquote>s that contain only nested
    blockquotes and whitespace, flattening pointless nesting.
    '''
    for block in soup.find_all('blockquote'):
        if block.attrs:
            continue
        only_nested = all(
            child.name == 'blockquote'
            or (isinstance(child, bs4.element.NavigableString) and child.isspace())
            for child in block.contents
        )
        if only_nested:
            raise_children_and_delete(block)
|
2019-08-03 07:24:52 +00:00
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_useless_spans(soup):
    '''Unwrap every <span> that carries no attributes.'''
    for span in soup.find_all('span'):
        if not span.attrs:
            raise_children_and_delete(span)
|
|
|
|
|
|
2019-09-09 04:20:32 +00:00
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_useless_atags(soup):
    '''Unwrap every <a> that carries no attributes (no href, no id).'''
    for anchor in soup.find_all('a'):
        if not anchor.attrs:
            raise_children_and_delete(anchor)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_useless_meta(soup):
    '''
    Remove head metadata that serves no purpose in a cleaned book:
    Adobe page-template links, Content-Type meta, and Adept DRM resource
    markers.
    '''
    selectors = (
        'link[type="application/vnd.adobe-page-template+xml"]',
        'meta[http-equiv="Content-Type"]',
        'meta[name="Adept.expected.resource"]',
        'meta[name="Adept.resource"]',
    )
    for selector in selectors:
        for element in soup.select(selector):
            element.extract()
|
|
|
|
|
|
2019-08-03 07:24:52 +00:00
|
|
|
|
@soup_cleaner
|
|
|
|
|
def remove_nested_italic(soup):
    '''Unwrap emphasis elements nested directly inside the same tag (<i><i>…).'''
    for tag_name in ('b', 'i', 'em', 'strong'):
        for element in soup.find_all(tag_name):
            if element.parent.name == element.name:
                raise_children_and_delete(element)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
|
|
|
|
def replace_italic_bold_span(soup):
    '''
    Convert spans whose classes are purely italic/bold markers into real
    <i>/<b> elements. A span carrying both markers becomes <b><i>…</i></b>.
    Spans with additional, unrelated classes are left alone.
    '''
    CLASS_TO_TAG = {'italic': 'i', 'italics': 'i', 'bold': 'b'}

    matched = set()
    for marker in CLASS_TO_TAG:
        matched.update(soup.find_all('span', {'class': marker}))

    for span in matched:
        classes = span['class']
        if isinstance(classes, str):
            classes = classes.split()
            span['class'] = classes

        if len(classes) == 1:
            new_name = CLASS_TO_TAG[classes[0]]
            del span['class']
            span.name = new_name

        elif all(marker in CLASS_TO_TAG for marker in classes):
            # Both italic and bold: turn the span into <i> and wrap it in
            # a fresh <b>.
            bold_wrapper = soup.new_tag('b')
            del span['class']
            span.name = 'i'
            span.insert_before(bold_wrapper)
            bold_wrapper.insert(0, span)
|
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
2019-09-09 04:20:32 +00:00
|
|
|
|
def replace_classes_real_tags(soup):
    '''
    Rename tags whose single class name encodes their real semantics —
    e.g. <div class="blockquote"> becomes <blockquote>, <span class="bold">
    becomes <b>. Only tags with exactly one class are converted; the class
    attribute is deleted on conversion.
    '''
    # tag name -> {class-name regex (prefix match) -> replacement tag}
    CLASSTAGS = {
        'div': {
            r'block\d*': 'blockquote',
            r'blockquote': 'blockquote',
            r'center\d*': 'center',
            r'ext': 'blockquote',
            r'extract': 'blockquote',
            r'p+': 'p',
        },
        'p': {
            r'block\d*': 'blockquote',
            r'blockquote': 'blockquote',
            r'center\d*': 'center',
            r'h2-?[abcde]': 'h2',
            r'h2-\d+': 'h2',
            r'p+': 'p',
        },
        'span': {
            r'b(old)?': 'b',
            r'i(talic)?': 'i',
            r'sc': 'small',
            r'small\d*': 'small',
            r'small[Cc]aps\d*': 'small',
            r'strike': 'strike',
            r'under(line)?': 'u',
        },
    }

    for element in soup.descendants:
        if not isinstance(element, bs4.element.Tag):
            continue

        rules = CLASSTAGS.get(element.name)
        if rules is None:
            continue

        classes = element.get('class')
        if not classes:
            continue

        # Normalize to a fresh mutable list (bs4 may hand back a string).
        if isinstance(classes, str):
            classes = classes.split()
        else:
            classes = list(classes)
        element['class'] = classes

        if len(classes) != 1:
            continue

        for (pattern, new_name) in rules.items():
            if re.match(pattern, classes[0]):
                element.name = new_name
                del element['class']
                break
|
2019-08-03 07:24:52 +00:00
|
|
|
|
|
|
|
|
|
@soup_cleaner
|
2019-09-09 04:20:32 +00:00
|
|
|
|
def strip_unecessary_whitespace(soup):
    '''
    Trim leading/trailing whitespace and <br>s inside block/header
    elements: strip the first and last text nodes, and decompose any <br>
    tags that appear before the first or after the last text node.
    '''
    tags = ['p', 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']
    elements = [element for tag in tags for element in soup.find_all(tag)]

    for element in elements:
        # Snapshot the descendants; we mutate the tree (decompose) while
        # walking this list from both ends.
        descendants = list(element.descendants)

        # Drop leading non-text descendants, destroying leading <br>s.
        while descendants and not isinstance(descendants[0], bs4.element.NavigableString):
            if descendants[0].name == 'br':
                descendants[0].decompose()
            descendants.pop(0)
        # Same from the tail: drop trailing non-text descendants / <br>s.
        while descendants and not isinstance(descendants[-1], bs4.element.NavigableString):
            if descendants[-1].name == 'br':
                descendants[-1].decompose()
            descendants.pop(-1)

        if not descendants:
            continue

        # A single text node gets stripped on both sides at once.
        if len(descendants) == 1:
            descendants[0].replace_with(descendants[0].strip())
            continue

        descendants[0].replace_with(descendants[0].lstrip())
        descendants[-1].replace_with(descendants[-1].rstrip())
|
|
|
|
|
|
|
|
|
|
def cleanup_page(html):
    '''
    Run all registered cleaners over one page of HTML until the output
    stops changing — cleaners can expose new work for one another, so we
    iterate to a fixed point.
    '''
    previous = None
    while html != previous:
        previous = html

        # String-level passes first...
        for clean in html_cleaners:
            html = clean(html)

        # ...then tree-level passes, which mutate the soup in place.
        soup = bs4.BeautifulSoup(html, 'html.parser')
        for clean in soup_cleaners:
            clean(soup)

        html = str(soup)

    return html
|
|
|
|
|
|
|
|
|
|
def run_once(book):
    '''
    Clean every text document in the book in place, skipping the
    navigation documents.
    '''
    SKIP_IDS = ('navid', 'nav.xhtml', 'nav.html')
    for (id, href) in book.text_iter():
        if id in SKIP_IDS:
            continue
        print('Cleaning', id)
        cleaned = cleanup_page(book.readfile(id))
        book.writefile(id, cleaned)
|
|
|
|
|
|
|
|
|
|
def run(book):
    '''
    Entry point: clean the whole book once, and run a second pass when
    footnotes were collected (their references may live in chapters
    processed before the bodies were seen). Returns 0.
    '''
    # First pass also populates global_footnotes as a side effect.
    run_once(book)

    if global_footnotes:
        run_once(book)

    return 0
|