Redesign replace_classes_real_tags to use regex.
I started to get sick of all the center, center1, center2 business.
This commit is contained in:
parent
ca1c5e6373
commit
8026bb55c4
1 changed file with 49 additions and 39 deletions
|
@@ -349,47 +349,57 @@ def replace_italic_bold_span(soup):
|
||||||
|
|
||||||
@soup_cleaner
|
@soup_cleaner
|
||||||
def replace_classes_real_tags(soup):
|
def replace_classes_real_tags(soup):
|
||||||
replace = {
|
CLASSTAGS = {
|
||||||
'div.block': 'blockquote',
|
'div': {
|
||||||
'div.block1': 'blockquote',
|
r'block\d*': 'blockquote',
|
||||||
'div.block2': 'blockquote',
|
r'blockquote': 'blockquote',
|
||||||
'div.block3': 'blockquote',
|
r'center\d*': 'center',
|
||||||
'div.blockquote': 'blockquote',
|
r'ext': 'blockquote',
|
||||||
'div.center': 'center',
|
r'extract': 'blockquote',
|
||||||
'div.center1': 'center',
|
r'p+': 'p',
|
||||||
'div.ext': 'blockquote',
|
},
|
||||||
'div.extract': 'blockquote',
|
'p': {
|
||||||
'div.p': 'p',
|
r'block\d*': 'blockquote',
|
||||||
'div.pp': 'p',
|
r'blockquote': 'blockquote',
|
||||||
'p.block': 'blockquote',
|
r'center\d*': 'center',
|
||||||
'p.block1': 'blockquote',
|
r'h2-?[abcde]': 'h2',
|
||||||
'p.block2': 'blockquote',
|
r'h2-\d+': 'h2',
|
||||||
'p.block3': 'blockquote',
|
r'p+': 'p',
|
||||||
'p.blockquote': 'blockquote',
|
},
|
||||||
'p.center': 'center',
|
'span': {
|
||||||
'p.center1': 'center',
|
r'b(old)?': 'b',
|
||||||
'p.p': 'p',
|
r'i(talic)?': 'i',
|
||||||
'p.pp': 'p',
|
r'sc': 'small',
|
||||||
'span.b': 'b',
|
r'small\d*': 'small',
|
||||||
'span.i': 'i',
|
r'small[Cc]aps\d*': 'small',
|
||||||
'span.italic': 'i',
|
r'strike': 'strike',
|
||||||
'span.sc': 'small',
|
r'under(line)?': 'u',
|
||||||
'span.small': 'small',
|
|
||||||
'span.small1': 'small',
|
|
||||||
'span.smallcaps': 'small',
|
|
||||||
'span.smallCaps': 'small',
|
|
||||||
'span.smallCaps1': 'small',
|
|
||||||
'span.strike': 'strike',
|
|
||||||
'span.under': 'u',
|
|
||||||
'span.underline': 'u',
|
|
||||||
}
|
}
|
||||||
for (selector, new_name) in replace.items():
|
}
|
||||||
for tag in soup.select(selector):
|
|
||||||
|
for tag in soup.descendants:
|
||||||
|
if not isinstance(tag, bs4.element.Tag):
|
||||||
|
continue
|
||||||
|
|
||||||
|
if tag.name not in CLASSTAGS:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not tag.get('class'):
|
||||||
|
continue
|
||||||
|
|
||||||
if isinstance(tag['class'], str):
|
if isinstance(tag['class'], str):
|
||||||
tag['class'] = tag['class'].strip().split()
|
tag['class'] = tag['class'].split()
|
||||||
if len(tag['class']) == 1:
|
else:
|
||||||
|
tag['class'] = list(tag['class'])
|
||||||
|
|
||||||
|
if len(tag['class']) != 1:
|
||||||
|
continue
|
||||||
|
|
||||||
|
for (selector, new_name) in CLASSTAGS[tag.name].items():
|
||||||
|
if re.match(selector, tag['class'][0]):
|
||||||
tag.name = new_name
|
tag.name = new_name
|
||||||
tag['class'] = []
|
del tag['class']
|
||||||
|
break
|
||||||
|
|
||||||
@soup_cleaner
|
@soup_cleaner
|
||||||
def strip_unecessary_whitespace(soup):
|
def strip_unecessary_whitespace(soup):
|
||||||
|
|
Loading…
Reference in a new issue