Add stringtools.py, gentools.py.
This commit is contained in:
parent
2cbb93386e
commit
29541d2570
2 changed files with 86 additions and 0 deletions
15
voussoirkit/gentools.py
Normal file
15
voussoirkit/gentools.py
Normal file
|
@ -0,0 +1,15 @@
|
|||
import itertools
|
||||
|
||||
def chunk_generator(sequence, chunk_length):
    '''
    Walk over `sequence` and yield its items grouped into lists that hold
    at most `chunk_length` items each. Only the final list can be shorter.

    Note: every chunk is a list regardless of the input type, so a string
    input produces lists of single characters rather than substrings.
    '''
    source = iter(sequence)

    def take_next_chunk():
        return list(itertools.islice(source, chunk_length))

    # iter(callable, sentinel) keeps calling until an empty list comes back,
    # which happens as soon as the source has nothing left to give.
    for piece in iter(take_next_chunk, []):
        yield piece
|
71
voussoirkit/stringtools.py
Normal file
71
voussoirkit/stringtools.py
Normal file
|
@ -0,0 +1,71 @@
|
|||
import re
|
||||
import unicodedata
|
||||
|
||||
def collapse_whitespace(s):
    '''
    Trim the ends of the string, then squeeze every run of whitespace
    inside it down to one space character.
    '''
    trimmed = s.strip()
    return re.sub(r'\s+', ' ', trimmed)
|
||||
|
||||
def comma_space_split(s):
    '''
    Split the string by commas and spaces, discarding all extra
    whitespace and blank parts.

    'a b, c,,d' -> ['a', 'b', 'c', 'd']
    ',a, ' -> ['a']
    '' -> []

    If `s` is None, None is returned unchanged.
    '''
    if s is None:
        return s

    parts = re.split(r'[ ,]+', s)
    # Other whitespace (tabs, newlines) is not a separator, but it should not
    # cling to the parts either.
    parts = [part.strip() for part in parts]
    # re.split produces empty strings when the input is empty or begins/ends
    # with a separator. Those are the "blank parts" that must be discarded.
    return [part for part in parts if part]
|
||||
|
||||
def pascal_to_loudsnakes(text):
    '''
    Convert a PascalCase name to LOUD_SNAKE_CASE.

    PascalCase -> PASCAL_CASE
    HTMLDocument -> HTML_DOCUMENT
    '''
    word_boundaries = (
        # A lowercase letter directly followed by an uppercase one.
        r'([a-z])([A-Z])',
        # A run of capitals followed by a capitalized word, so that an
        # acronym is separated from the word that comes after it.
        r'([A-Z]+)([A-Z][a-z])',
    )
    for boundary in word_boundaries:
        text = re.sub(boundary, r'\1_\2', text)
    return text.upper()
|
||||
|
||||
def remove_characters(text, characters):
    '''
    Return `text` with every occurrence of each character in `characters`
    deleted.
    '''
    # Mapping a code point to None tells str.translate to drop it.
    deletions = dict.fromkeys(map(ord, characters))
    return text.translate(deletions)
|
||||
|
||||
def remove_control_characters(text):
    '''
    Return `text` without any character whose Unicode category
    begins with C.

    Thanks Alex Quinn
    https://stackoverflow.com/a/19016117

    unicodedata.category(character) gives back a two-character code, and a
    leading C in that code is what marks the character for removal here.
    '''
    survivors = [
        character
        for character in text
        if not unicodedata.category(character).startswith('C')
    ]
    return ''.join(survivors)
|
||||
|
||||
def title_capitalize(text):
    '''
    Strip the text and convert it to title case, with a few corrections on
    top of str.title():

    - The small words listed below are lowercased when they follow a space,
      so the first word of the text always keeps its capital.
    - Contraction and possessive suffixes are lowercased again, because
      str.title() capitalizes the letter after every apostrophe
      ("don't" -> "Don'T").
    - Standalone words made only of i, v, x are treated as roman numerals
      and uppercased.

    'the lord of the rings' -> 'The Lord of the Rings'
    "don't stop" -> "Don't Stop"
    'rocky iv' -> 'Rocky IV'
    '''
    text = text.strip().title()

    small_words = (
        'a',
        'an',
        'and',
        'at',
        'for',
        'from',
        'in',
        'of',
        'on',
        'the',
        'to',
    )
    # One alternation instead of one substitution per word. The trailing \b
    # makes the engine backtrack from 'a' to 'an' to 'and' as needed, so the
    # order of the alternatives does not matter.
    small_word_pattern = r' (?:' + '|'.join(small_words) + r')\b'
    text = re.sub(
        small_word_pattern,
        lambda m: m.group(0).lower(),
        text,
        flags=re.IGNORECASE,
    )

    text = text.replace('\'S', '\'s')
    # Repair the remaining common suffixes ('t, 'd, 'm, 'll, 're, 've) for
    # both the straight and the curly apostrophe. The \b keeps names such as
    # O'Donnell intact since their capital is followed by more letters.
    text = re.sub(
        r"(?<=\w)['\u2019](?:S|T|D|M|Ll|Re|Ve)\b",
        lambda m: m.group(0).lower(),
        text,
    )

    # Roman numerals. Not handling L, M yet because I don't want to mess up
    # real words like "mix", but let's take a look at expanding this in
    # the future.
    text = re.sub(
        r'\b[ivx]+\b',
        lambda m: m.group(0).upper(),
        text,
        flags=re.IGNORECASE,
    )
    return text
|
Loading…
Reference in a new issue