voussoirkit/voussoirkit/stringtools.py

83 lines
2.3 KiB
Python

import re
import unicodedata
def collapse_whitespace(text):
    '''
    Squeeze every run of whitespace down to a single space, leaving no
    whitespace at the start or end of the result.
    '''
    # split() with no arguments breaks on runs of whitespace and ignores
    # leading/trailing whitespace, so rejoining with one space is enough.
    words = text.split()
    return ' '.join(words)
def comma_space_split(text):
    '''
    Split the string by commas and spaces, discarding all extra
    whitespace and blank parts.

    'a b, c,,d' -> ['a', 'b', 'c', 'd']

    An empty or whitespace-only string produces an empty list, and
    leading or trailing commas do not produce empty entries.

    If text is None, None is returned unchanged so that callers can pass
    optional values straight through.
    '''
    if text is None:
        return None
    # re.split yields '' when the text is empty or begins/ends with a
    # separator, so those blank parts are filtered out here.
    return [part for part in re.split(r'[ ,]+', text.strip()) if part]
def excise(text, mark_left, mark_right):
    '''
    Remove the text between the left and right landmarks, including the
    landmarks themselves, and return the rest of the text.

    The span removed runs from the first occurrence of mark_left to the
    last occurrence of mark_right that comes after it. If no such pair
    exists, the text is returned unchanged.

    excise('What a wonderful day [soundtrack].mp3', ' [', ']') ->
    returns 'What a wonderful day.mp3'
    '''
    left_index = text.find(mark_left)
    if left_index == -1:
        return text

    # Only look for the right landmark after the end of the left one.
    # Otherwise a right landmark that appears earlier in the text would
    # make the kept prefix and suffix overlap and duplicate text.
    right_index = text.rfind(mark_right, left_index + len(mark_left))
    if right_index == -1:
        return text

    return text[:left_index] + text[right_index + len(mark_right):]
def pascal_to_loudsnakes(text):
    '''
    Convert a PascalCase name into LOUD_SNAKE_CASE.

    PascalCase -> PASCAL_CASE
    HTMLDocument -> HTML_DOCUMENT
    '''
    word_boundaries = (
        # A lowercase letter directly followed by an uppercase letter.
        r'([a-z])([A-Z])',
        # A run of capitals followed by a capitalized word, so that the
        # last capital of the run starts the next word.
        r'([A-Z]+)([A-Z][a-z])',
    )
    for boundary in word_boundaries:
        text = re.sub(boundary, r'\1_\2', text)
    return text.upper()
def remove_characters(text, characters):
    '''
    Return text with every occurrence of each of the given characters
    deleted.

    characters is an iterable of single-character strings, for example a
    plain string like '-_'.
    '''
    # A translation table that maps a code point to None deletes it.
    deletions = dict.fromkeys(map(ord, characters))
    return text.translate(deletions)
def remove_control_characters(text):
    '''
    Return text without any character whose Unicode general category
    begins with C.

    Technique credited to Alex Quinn:
    https://stackoverflow.com/a/19016117

    unicodedata.category(character) gives a two-character code such as
    'Lu' or 'Cc'. Codes starting with C form the "Other" group, which
    covers control characters (Cc) as well as format (Cf), surrogate
    (Cs), private-use (Co) and unassigned (Cn) code points, so all of
    those are dropped.
    '''
    def is_kept(character):
        return not unicodedata.category(character).startswith('C')

    return ''.join(filter(is_kept, text))
def title_capitalize(text):
    '''
    Return the text in title case, with a few corrections applied on top
    of str.title():

    - Short joining words (a, an, and, at, for, from, in, of, on, the, to)
      are lowercased when they follow a space, so the first word of the
      text always stays capitalized.
    - Contraction and possessive endings keep their lowercase letters:
      "don't" -> "Don't", "bob's" -> "Bob's". Names like "O'Sullivan"
      are left capitalized.
    - Roman numerals made of I, V and X are fully uppercased:
      "rocky ii" -> "Rocky II".

    Leading and trailing whitespace is stripped.
    '''
    text = text.strip().title()
    articles = (
        'a',
        'an',
        'and',
        'at',
        'for',
        'from',
        'in',
        'of',
        'on',
        'the',
        'to',
    )
    for article in articles:
        text = re.sub(rf' {article}\b', f' {article}', text, flags=re.IGNORECASE)

    # str.title() treats an apostrophe as a word boundary and capitalizes
    # the letter after it ("Don'T", "We'Re", "Bob'S"). Undo that only for
    # known endings that finish the word and follow a word character, so
    # that "O'Sullivan" and quoted words like 'Sound' are not touched.
    text = re.sub(
        r"(?<=\w)'(?:S|T|D|M|Ll|Re|Ve)\b",
        lambda m: m.group(0).lower(),
        text,
    )

    # Roman numerals. Not handling L, M yet because I don't want to mess up
    # real words like "mix", but let's take a look at expanding this in
    # the future.
    text = re.sub(r'(\b[ivx]+\b)', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)
    return text