# ---- voussoirkit/gentools.py ----
import itertools

def chunk_generator(sequence, chunk_length):
    '''
    Given any iterable input, yield lists of length <= `chunk_length`.

    Every chunk except possibly the last holds exactly `chunk_length` items;
    the last one holds whatever remains. An empty input yields nothing.

    Note: this generator always yields lists, even if the input was a string.
    I don't want to deal with special cases of types that return differently.
    '''
    # A single shared iterator lets each islice call resume where the
    # previous chunk stopped.
    iterator = iter(sequence)
    while True:
        chunk = list(itertools.islice(iterator, chunk_length))
        if not chunk:
            # An empty chunk means the iterator is exhausted.
            break
        yield chunk


# ---- voussoirkit/stringtools.py ----
import re
import unicodedata

# Small words that title_capitalize keeps lowercase when they are not the
# first word of the text.
_TITLE_ARTICLES = (
    'a',
    'an',
    'and',
    'at',
    'for',
    'from',
    'in',
    'of',
    'on',
    'the',
    'to',
)

# The leading space is part of the match so that the very first word of the
# text is never lowercased.
_TITLE_ARTICLE_PATTERN = re.compile(
    r' (?:%s)\b' % '|'.join(_TITLE_ARTICLES),
    flags=re.IGNORECASE,
)

def collapse_whitespace(s):
    '''
    Replace all whitespace sequences with a single space and strip the ends.

    '  a \\t\\n b  ' -> 'a b'
    '''
    s = re.sub(r'\s+', ' ', s.strip())
    return s

def comma_space_split(s):
    '''
    Split the string by commas and spaces, discarding all extra
    whitespace and blank parts.

    'a b, c,,d' -> ['a', 'b', 'c', 'd']
    ',a, b,' -> ['a', 'b']
    '' -> []

    If `s` is None, None is returned unchanged.
    '''
    if s is None:
        return s
    parts = (part.strip() for part in re.split(r'[ ,]+', s.strip()))
    # re.split produces empty strings when the input is empty or starts/ends
    # with a comma. Those are the "blank parts" this function promises to
    # discard.
    return [part for part in parts if part]

def pascal_to_loudsnakes(text):
    '''
    PascalCase -> PASCAL_CASE
    HTMLDocument -> HTML_DOCUMENT
    '''
    # Boundary between a lowercase letter and the capital that follows it.
    text = re.sub(r'([a-z])([A-Z])', r'\1_\2', text)
    # Boundary between a run of capitals (an acronym) and the capitalized
    # word that follows it.
    text = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', text)
    text = text.upper()
    return text

def remove_characters(text, characters):
    '''
    Return `text` with every character found in `characters` removed.

    `characters` is iterated one element at a time and each element is
    passed to ord(), so it can be a string or any iterable of
    single-character strings.
    '''
    # Mapping a code point to None tells str.translate to delete it.
    translator = {ord(c): None for c in characters}
    text = text.translate(translator)
    return text

def remove_control_characters(text):
    '''
    Thanks Alex Quinn
    https://stackoverflow.com/a/19016117

    unicodedata.category(character) returns some two-character string
    where if [0] is a C then the character is a control character.

    Note that this drops every character whose category starts with C,
    which includes ordinary newlines and tabs.
    '''
    return ''.join(c for c in text if unicodedata.category(c)[0] != 'C')

def title_capitalize(text):
    '''
    Strip the text and convert it to title case, then apply some touch-ups
    that str.title gets wrong:

    - Small words like "of" and "the" are lowercased unless they are the
      first word.
    - A trailing 'S that str.title capitalized is put back to 's.
    - Roman numerals made of I, V, X are fully uppercased.

    'the lord of the rings part ii' -> 'The Lord of the Rings Part II'
    "o'sullivan's pub" -> "O'Sullivan's Pub"
    '''
    text = text.strip().title()
    text = _TITLE_ARTICLE_PATTERN.sub(lambda m: m.group(0).lower(), text)

    # str.title capitalizes the letter after an apostrophe, so "bob's"
    # becomes "Bob'S". Only fix 'S when it ends the word, otherwise names
    # like "O'Sullivan" would be turned into "O'sullivan".
    text = re.sub(r"'S\b", "'s", text)

    # Roman numerals. Not handling L, M yet because I don't want to mess up
    # real words like "mix", but let's take a look at expanding this in
    # the future.
    text = re.sub(r'(\b[ivx]+\b)', lambda m: m.group(1).upper(), text, flags=re.IGNORECASE)
    return text