voussoirkit/voussoirkit/stringtools.py

import re
import unicodedata

def collapse_whitespace(text):
    '''
    Trim leading and trailing whitespace, then squeeze every remaining run
    of whitespace characters down to one space.

    '  a \t\n b  ' -> 'a b'
    '''
    trimmed = text.strip()
    return re.sub(r'\s+', ' ', trimmed)

def comma_space_split(text):
    '''
    Split the string by commas and spaces, discarding all extra
    whitespace and blank parts.

    'a b, c,,d' -> ['a', 'b', 'c', 'd']
    ',a,' -> ['a']
    '' -> []

    If text is None, None is returned instead of a list.
    '''
    if text is None:
        return text
    # re.split yields empty strings when the text is empty or begins/ends
    # with a comma, so those have to be filtered out to honor the promise
    # of discarding blank parts.
    parts = re.split(r'[ ,]+', text.strip())
    return [part for part in parts if part]

def excise(text, mark_left, mark_right):
    '''
    Remove the text between the left and right landmarks, including the
    landmarks themselves, and return the rest of the text.

    The cut starts at the first occurrence of mark_left and ends at the
    last occurrence of mark_right. The right landmark must appear after the
    left landmark; if either landmark is missing, or the right one only
    occurs before the left one, the text is returned unchanged.

    excise('What a wonderful day [soundtrack].mp3', ' [', ']') ->
    returns 'What a wonderful day.mp3'

    excise('a] b [c', '[', ']') ->
    returns 'a] b [c'
    '''
    left_index = text.find(mark_left)
    if left_index == -1:
        return text

    # Only look for the right landmark past the end of the left one.
    # Otherwise the kept prefix and kept suffix can overlap and part of the
    # text would be duplicated in the result.
    right_index = text.rfind(mark_right, left_index + len(mark_left))
    if right_index == -1:
        return text

    return text[:left_index] + text[right_index + len(mark_right):]

def pascal_to_loudsnakes(text):
    '''
    Convert a PascalCase identifier to LOUD_SNAKE_CASE.

    PascalCase -> PASCAL_CASE
    HTMLDocument -> HTML_DOCUMENT
    '''
    word_boundaries = (
        # A lowercase letter followed directly by an uppercase letter.
        r'([a-z])([A-Z])',
        # A run of capitals followed by a capitalized word: HTML|Document.
        r'([A-Z]+)([A-Z][a-z])',
    )
    for boundary in word_boundaries:
        text = re.sub(boundary, r'\1_\2', text)
    return text.upper()

def remove_characters(text, characters):
    '''
    Return text with every occurrence of each of the given characters
    deleted.

    remove_characters('hello world', 'lo') -> 'he wrd'
    '''
    # Code points mapped to None are dropped by str.translate.
    deletions = dict.fromkeys(ord(character) for character in characters)
    return text.translate(deletions)

def remove_control_characters(text):
    '''
    Return text without any character whose Unicode general category
    begins with C.

    unicodedata.category(character) returns a two-character code such as
    'Lu' or 'Cc'. Every code starting with C is dropped, which covers the
    control characters (Cc) as well as the other C categories, for example
    format characters (Cf) like the zero-width space.

    Thanks Alex Quinn
    https://stackoverflow.com/a/19016117
    '''
    kept = [
        character
        for character in text
        if not unicodedata.category(character).startswith('C')
    ]
    return ''.join(kept)

def title_capitalize(text):
    '''
    Strip the text and title-case it, then tidy up the result:

    - Short joining words (a, an, and, at, for, from, in, of, on, the, to)
      are lowercased when they follow a space, so the first word of the
      text keeps its capital.
    - Letters that str.title() wrongly capitalizes after an apostrophe in
      possessives and common contractions are lowered again.
    - Roman numerals made only of I, V, X are uppercased.

    'the lord of the rings' -> 'The Lord of the Rings'
    "o'sullivan's march" -> "O'Sullivan's March"
    "don't look back" -> "Don't Look Back"
    'rocky iv' -> 'Rocky IV'
    '''
    text = text.strip().title()
    articles = (
        'a',
        'an',
        'and',
        'at',
        'for',
        'from',
        'in',
        'of',
        'on',
        'the',
        'to',
    )
    # One alternation does the work of a substitution per article. The
    # trailing \b makes the regex backtrack from 'a' to 'an' to 'and' until
    # a whole word is matched.
    article_pattern = r' (?:' + '|'.join(articles) + r')\b'
    text = re.sub(article_pattern, lambda m: m.group(0).lower(), text, flags=re.IGNORECASE)

    # str.title() capitalizes the letter after an apostrophe. Only repair 'S
    # when it ends a word, so that names like O'Sullivan keep their capital.
    text = re.sub(r"'S\b", "'s", text)
    # Same repair for common contraction endings (Don'T, I'M, We'Ll, ...).
    # A letter is required before the apostrophe so that quoted single
    # letters such as 'T' are left alone.
    text = re.sub(
        r"(?<=[^\W\d_])'(?:T|Ll|Re|Ve|D|M)\b",
        lambda m: m.group(0).lower(),
        text,
    )

    # Roman numerals. Not handling L, M yet because I don't want to mess up
    # real words like "mix", but let's take a look at expanding this in
    # the future.
    text = re.sub(r'(\b[ivx]+\b)', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)
    return text
Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00			`import re`
			`import unicodedata`

Rename variable s -> text. 2020-11-27 02:18:26 +00:00			`def collapse_whitespace(text):`
Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00			`'''`
			`Replace all whitespace sequences with a single space and strip the ends.`
			`'''`
Rename variable s -> text. 2020-11-27 02:18:26 +00:00			`text = re.sub(r'\s+', ' ', text.strip())`
			`return text`
Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00
Rename variable s -> text. 2020-11-27 02:18:26 +00:00			`def comma_space_split(text):`
Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00			`'''`
			`Split the string by commas and spaces, discarding all extra`
			`whitespace and blank parts.`

			`'a b, c,,d' -> ['a', 'b', 'c', 'd']`
			`'''`
Rename variable s -> text. 2020-11-27 02:18:26 +00:00			`if text is None:`
			`return text`
			`return re.split(r'[ ,]+', text.strip())`
Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00
Add function excise. 2020-11-27 02:18:47 +00:00			`def excise(text, mark_left, mark_right):`
			`'''`
			`Remove the text between the left and right landmarks, including the`
			`landmarks themselves, and return the rest of the text.`

			`excise('What a wonderful day [soundtrack].mp3', ' [', ']') ->`
			`returns 'What a wonderful day.mp3'`
			`'''`
			`if mark_left in text and mark_right in text:`
			`return text.split(mark_left, 1)[0] + text.rsplit(mark_right, 1)[-1]`
			`return text`

Add stringtools.py, gentools.py. 2020-11-16 04:58:54 +00:00			`def pascal_to_loudsnakes(text):`
			`'''`
			`PascalCase -> PASCAL_CASE`
			`HTMLDocument -> HTML_DOCUMENT`
			`'''`
			`text = re.sub(r'([a-z])([A-Z])', r'\1_\2', text)`
			`text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', text)`
			`text = text.upper()`
			`return text`

			`def remove_characters(text, characters):`
			`translator = {ord(c): None for c in characters}`
			`text = text.translate(translator)`
			`return text`

			`def remove_control_characters(text):`
			`'''`
			`Thanks Alex Quinn`
			`https://stackoverflow.com/a/19016117`

			`unicodedata.category(character) returns some two-character string`
			`where if [0] is a C then the character is a control character.`
			`'''`
			`return ''.join(c for c in text if unicodedata.category(c)[0] != 'C')`

			`def title_capitalize(text):`
			`text = text.strip().title()`
			`articles = [`
			`'a',`
			`'an',`
			`'and',`
			`'at',`
			`'for',`
			`'from',`
			`'in',`
			`'of',`
			`'on',`
			`'the',`
			`'to',`
			`]`
			`for article in articles:`
			`text = re.sub(rf' {article}\b', f' {article.lower()}', text, flags=re.IGNORECASE)`

			`text = text.replace('\'S', '\'s')`

			`# Roman numerals. Not handling L, M yet because I don't want to mess up`
			`# real words like "mix", but let's take a look at expanding this in`
			`# the future.`
			`text = re.sub(r'(\b[ivx]+\b)', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)`
			`return text`