Redesign replace_classes_real_tags to use regex.

I started to get sick of all the center center1 center2 business.
This commit is contained in:
Ethan Dalool 2020-01-05 23:34:14 -08:00
parent ca1c5e6373
commit 8026bb55c4

View file

@@ -349,47 +349,57 @@ def replace_italic_bold_span(soup):
@soup_cleaner
def replace_classes_real_tags(soup):
    """
    Rename elements whose single CSS class merely stands in for a real tag.

    Every div, p, and span in the soup that carries exactly one class is
    checked against the patterns registered for its tag name. On the first
    pattern that matches the class name, the element is renamed to the
    mapped tag and its class attribute is removed. Elements with zero
    classes, or more than one class, are left untouched.

    The soup is modified in place; nothing is returned by this function.
    """
    # Outer key: current tag name.
    # Inner mapping: regex for the class name -> replacement tag name.
    CLASSTAGS = {
        'div': {
            r'block\d*': 'blockquote',
            r'blockquote': 'blockquote',
            r'center\d*': 'center',
            r'ext': 'blockquote',
            r'extract': 'blockquote',
            r'p+': 'p',
        },
        'p': {
            r'block\d*': 'blockquote',
            r'blockquote': 'blockquote',
            r'center\d*': 'center',
            r'h2-?[abcde]': 'h2',
            r'h2-\d+': 'h2',
            r'p+': 'p',
        },
        'span': {
            r'b(old)?': 'b',
            r'i(talic)?': 'i',
            r'sc': 'small',
            r'small\d*': 'small',
            r'small[Cc]aps\d*': 'small',
            r'strike': 'strike',
            r'under(line)?': 'u',
        },
    }

    for tag in soup.descendants:
        # Descendants include text nodes and comments; only tags can be renamed.
        if not isinstance(tag, bs4.element.Tag):
            continue
        if tag.name not in CLASSTAGS:
            continue
        if not tag.get('class'):
            continue

        # The class attribute may be a raw string or a list of names;
        # normalize it to a plain list either way.
        if isinstance(tag['class'], str):
            tag['class'] = tag['class'].split()
        else:
            tag['class'] = list(tag['class'])

        # Only convert when the class is the element's sole class, so that
        # styling from any additional classes is never thrown away.
        if len(tag['class']) != 1:
            continue

        class_name = tag['class'][0]
        for (selector, new_name) in CLASSTAGS[tag.name].items():
            # fullmatch, not match: the patterns describe the whole class
            # name. A prefix match would let r'b(old)?' convert class="big"
            # and r'p+' convert class="pagebreak".
            if re.fullmatch(selector, class_name):
                tag.name = new_name
                del tag['class']
                break
@soup_cleaner @soup_cleaner
def strip_unecessary_whitespace(soup): def strip_unecessary_whitespace(soup):