Add stringtools.py, gentools.py.

2020-11-15 20:58:54 -08:00 · 2020-11-15 20:58:54 -08:00 · 29541d2570
commit 29541d2570
parent 2cbb93386e
2 changed files with 86 additions and 0 deletions
--- a/voussoirkit/gentools.py
+++ b/voussoirkit/gentools.py
@ -0,0 +1,15 @@
+import itertools
+
+def chunk_generator(sequence, chunk_length):
+    '''
+    Yield successive lists of up to `chunk_length` items taken from `sequence`.
+
+    Every chunk is a list regardless of the input type (a string input
+    produces lists of characters), so callers never have to special-case
+    the type of what they get back.
+    '''
+    source = iter(sequence)
+
+    def take_chunk():
+        return list(itertools.islice(source, chunk_length))
+
+    # Two-argument iter() keeps calling take_chunk until it returns the
+    # sentinel, an empty list, which means the input has run dry.
+    for chunk in iter(take_chunk, []):
+        yield chunk
--- a/voussoirkit/stringtools.py
+++ b/voussoirkit/stringtools.py
@ -0,0 +1,71 @@
+import re
+import unicodedata
+
+def collapse_whitespace(s):
+    '''
+    Strip the ends of the string, then squeeze every remaining run of
+    whitespace down to one space character.
+    '''
+    trimmed = s.strip()
+    return re.sub(r'\s+', ' ', trimmed)
+
+def comma_space_split(s):
+    '''
+    Split the string on runs of commas and/or spaces, discarding all extra
+    whitespace and blank parts.
+
+    'a b, c,,d' -> ['a', 'b', 'c', 'd']
+    ',a, ' -> ['a']
+    '' -> []
+
+    If `s` is None, None is returned unchanged.
+    '''
+    if s is None:
+        return s
+    # re.split emits an empty string when the input is empty or when a
+    # delimiter run sits at either edge (a leading comma survives strip()),
+    # so filter those out to honor the "no blank parts" contract.
+    return [part for part in re.split(r'[ ,]+', s.strip()) if part]
+
+def pascal_to_loudsnakes(text):
+    '''
+    Convert a PascalCase name into LOUD_SNAKE_CASE.
+
+    PascalCase -> PASCAL_CASE
+    HTMLDocument -> HTML_DOCUMENT
+    '''
+    boundaries = (
+        # A lowercase letter followed directly by an uppercase letter.
+        r'([a-z])([A-Z])',
+        # A run of capitals followed by a capitalized word: the last capital
+        # of the run belongs to the new word.
+        r'([A-Z]+)([A-Z][a-z])',
+    )
+    for boundary in boundaries:
+        text = re.sub(boundary, r'\1_\2', text)
+    return text.upper()
+
+def remove_characters(text, characters):
+    '''
+    Return `text` with every occurrence of each character in `characters`
+    deleted.
+    '''
+    # Mapping a code point to None tells str.translate to drop it.
+    deletions = dict.fromkeys(map(ord, characters))
+    return text.translate(deletions)
+
+def remove_control_characters(text):
+    '''
+    Return `text` without any character whose Unicode general category
+    starts with 'C'.
+
+    unicodedata.category(character) returns a two-character code such as
+    'Lu' or 'Cc'. The 'C' group holds the control characters (Cc) as well
+    as the other C* categories, such as format characters (Cf).
+
+    Thanks Alex Quinn
+    https://stackoverflow.com/a/19016117
+    '''
+    kept = []
+    for character in text:
+        if unicodedata.category(character).startswith('C'):
+            continue
+        kept.append(character)
+    return ''.join(kept)
+
+def title_capitalize(text):
+    '''
+    Title-case `text`, then undo the parts that str.title gets wrong.
+
+    - Articles, conjunctions and short prepositions ('a', 'of', 'the', ...)
+      are lowercased unless they are the first word.
+    - The possessive "'S" that str.title produces is restored to "'s".
+    - Words made up only of the letters I, V and X are treated as Roman
+      numerals and uppercased.
+
+    'the lord of the rings' -> 'The Lord of the Rings'
+    "john's 'super' game ii" -> "John's 'Super' Game II"
+    '''
+    text = text.strip().title()
+    articles = [
+        'a',
+        'an',
+        'and',
+        'at',
+        'for',
+        'from',
+        'in',
+        'of',
+        'on',
+        'the',
+        'to',
+    ]
+    for article in articles:
+        # The required leading space keeps the first word of the title
+        # capitalized even when it is one of these words.
+        text = re.sub(rf' {article}\b', f' {article.lower()}', text, flags=re.IGNORECASE)
+
+    # Only undo a possessive suffix: it must follow a word character and end
+    # the word. A blanket replace of "'S" would also lowercase quoted or
+    # elided words that begin with S, such as 'Super' or D'Souza.
+    text = re.sub(r"(?<=\w)'S\b", "'s", text)
+
+    # Roman numerals. Not handling L, M yet because I don't want to mess up
+    # real words like "mix", but let's take a look at expanding this in
+    # the future.
+    text = re.sub(r'(\b[ivx]+\b)', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)
+    return text