From dfca0e96b6bdfb8c49319a5e5095540c12e3d9cb Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Sat, 12 Feb 2022 19:54:34 -0800 Subject: [PATCH] Use new betterhelp. --- timesearch.py | 785 ++++++++++++++++++++++++++++++-------------------- 1 file changed, 465 insertions(+), 320 deletions(-) diff --git a/timesearch.py b/timesearch.py index d7434e1..dd5ff9b 100644 --- a/timesearch.py +++ b/timesearch.py @@ -18,270 +18,6 @@ from timesearch_modules import exceptions # importing those modules, which will subsequently import PRAW and a whole lot # of other things. This made TS very slow to load which is okay when you're # actually using it but really terrible when you're just viewing the help text. -DOCSTRING = ''' -Timesearch -The subreddit archiver - -The basics: -1. Collect a subreddit's submissions - python timesearch.py get_submissions -r subredditname - -2. Collect the comments for those submissions - python timesearch.py get_comments -r subredditname - -3. Stay up to date - python timesearch.py livestream -r subredditname - -Commands for collecting: - -{get_submissions} - -{get_comments} - -{livestream} - -{get_styles} - -{get_wiki} - -Commands for processing: - -{breakdown} - -{index} - -{merge_db} - -{offline_reading} - -TO SEE DETAILS ON EACH COMMAND, RUN -> python timesearch.py --help -'''.lstrip() - -SUB_DOCSTRINGS = dict( -breakdown=''' -breakdown: - Give the comment / submission counts for users in a subreddit, or - the subreddits that a user posts to. - - Automatically dumps into a _breakdown.json file - in the same directory as the database. - - python timesearch.py breakdown -r subredditname - python timesearch.py breakdown -u username - - flags: - -r "test" | --subreddit "test": - The subreddit database to break down. - - -u "test" | --username "test": - The username database to break down. - - --sort "name" | "submissions" | "comments" | "total_posts" - Sort the output. -'''.strip(), - -get_comments=''' -get_comments: - Collect comments on a subreddit or comments made by a user. - - python timesearch.py get_comments -r subredditname - python timesearch.py get_comments -u username - - flags: - -s "t3_xxxxxx" | --specific "t3_xxxxxx": - Given a submission ID, t3_xxxxxx, scan only that submission. - - -l "update" | --lower "update": - If a number - the unix timestamp to start at. - If "update" - continue from latest comment in db. - WARNING: If at some point you collected comments for a particular - submission which was ahead of the rest of your comments, using "update" - will start from that later submission, and you will miss the stuff in - between that specific post and the past. - Default: update - - -up 1467460221 | --upper 1467460221: - If a number - the unix timestamp to stop at. - If not provided - stop at current time. - Default: current time - - --dont_supplement: - If provided, trust the pushshift data and do not fetch live copies - from reddit. -'''.strip(), - -get_styles=''' -get_styles: - Collect the stylesheet, and css images. - - python timesearch.py get_styles -r subredditname -'''.strip(), - -get_submissions=''' -get_submissions: - Collect submissions from the subreddit across all of history, or - Collect submissions by a user (as many as possible). - - python timesearch.py get_submissions -r subredditname - python timesearch.py get_submissions -u username - - -r "test" | --subreddit "test": - The subreddit to scan. Mutually exclusive with username. - - -u "test" | --username "test": - The user to scan. Mutually exclusive with subreddit. 
- - -l "update" | --lower "update": - If a number - the unix timestamp to start at. - If "update" - continue from latest submission in db. - Default: update - - -up 1467460221 | --upper 1467460221: - If a number - the unix timestamp to stop at. - If not provided - stop at current time. - Default: current time - - --dont_supplement: - If provided, trust the pushshift data and do not fetch live copies - from reddit. -'''.strip(), - -get_wiki=''' -get_wiki: - Collect all available wiki pages. - - python timesearch.py get_wiki -r subredditname -'''.strip(), - -index=''' -index: - Dump submission listings to a plaintext or HTML file. - - python timesearch.py index -r subredditname - python timesearch.py index -u username - - flags: - -r "test" | --subreddit "test": - The subreddit database to dump - - -u "test" | --username "test": - The username database to dump - - --html: - Write HTML files instead of plain text. - - --offline: - The links in the index will point to the files generated by - offline_reading. That is, `../offline_reading/fullname.html` instead - of `http://redd.it/id`. This will NOT trigger offline_reading to - generate the files now, so you must run that tool separately. - - -st 50 | --score_threshold 50: - Only index posts with at least this many points. - Applies to ALL indexes! - - --all: - Perform all of the indexes listed below. - - --date: - Perform a index sorted by date. - - --title: - Perform a index sorted by title. - - --score: - Perform a index sorted by score. - - --author: - For subreddit databases only. - Perform a index sorted by author. - - --sub: - For username databases only. - Perform a index sorted by subreddit. - - --flair: - Perform a index sorted by flair. - - examples: - `timesearch index -r botwatch --date` - does only the date file. - - `timesearch index -r botwatch --score --title` - does both the score and title files. - - `timesearch index -r botwatch --score --score_threshold 50` - only shows submissions with >= 50 points. - - `timesearch index -r botwatch --all` - performs all of the different mashes. -'''.strip(), - -livestream=''' -livestream: - Continously collect submissions and/or comments. - - python timesearch.py livestream -r subredditname - python timesearch.py livestream -u username - - flags: - -r "test" | --subreddit "test": - The subreddit to collect from. - - -u "test" | --username "test": - The redditor to collect from. - - -s | --submissions: - If provided, do collect submissions. Otherwise don't. - - -c | --comments: - If provided, do collect comments. Otherwise don't. - - If submissions and comments are BOTH left unspecified, then they will - BOTH be collected. - - -w 30 | --wait 30: - The number of seconds to wait between cycles. - - -1 | --once: - If provided, only do a single loop. Otherwise go forever. -'''.strip(), - -merge_db=''' -merge_db: - Copy all new posts from one timesearch database into another. - - python timesearch.py merge_db --from redditdev1.db --to redditdev2.db - - flags: - --from: - The database file containing the posts you wish to copy. - - --to: - The database file to which you will copy the posts. - The database is modified in-place. - Existing posts will be ignored and not updated. -'''.strip(), - -offline_reading=''' -offline_reading: - Render submissions and comment threads to HTML via Markdown. 
- - python timesearch.py offline_reading -r subredditname - python timesearch.py offline_reading -u username - - flags: - -s "t3_xxxxxx" | --specific "t3_xxxxxx": - Given a submission ID, t3_xxxxxx, render only that submission. - Otherwise render every submission in the database. -'''.strip(), -) - -DOCSTRING = betterhelp.add_previews(DOCSTRING, SUB_DOCSTRINGS) - -#################################################################################################### -#################################################################################################### def breakdown_gateway(args): from timesearch_modules import breakdown @@ -321,83 +57,492 @@ def get_submissions_gateway(args): @vlogging.main_decorator def main(argv): - parser = argparse.ArgumentParser(description=__doc__) + parser = argparse.ArgumentParser( + description=''' + The subreddit archiver + + The basics: + 1. Collect a subreddit's submissions + timesearch get_submissions -r subredditname + + 2. Collect the comments for those submissions + timesearch get_comments -r subredditname + + 3. Stay up to date + timesearch livestream -r subredditname + ''', + ) subparsers = parser.add_subparsers() - p_breakdown = subparsers.add_parser('breakdown') - p_breakdown.add_argument('--sort', dest='sort', default=None) - p_breakdown.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_breakdown.add_argument('-u', '--user', dest='username', default=None) + # BREAKDOWN + p_breakdown = subparsers.add_parser( + 'breakdown', + description=''' + Generate the comment / submission counts for users in a subreddit, or + the subreddits that a user posts to. + + Automatically dumps into a _breakdown.json file + in the same directory as the database. + ''', + ) + p_breakdown.add_argument( + '--sort', + dest='sort', + type=str, + default=None, + help=''' + Sort the output by one property. + Should be one of "name", "submissions", "comments", "total_posts". + ''', + ) + p_breakdown.add_argument( + '-r', + '--subreddit', + dest='subreddit', + default=None, + help=''' + The subreddit database to break down. + ''', + ) + p_breakdown.add_argument( + '-u', + '--user', + dest='username', + default=None, + help=''' + The username database to break down. + ''', + ) p_breakdown.set_defaults(func=breakdown_gateway) - p_get_comments = subparsers.add_parser('get_comments', aliases=['commentaugment']) - p_get_comments.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_get_comments.add_argument('-s', '--specific', dest='specific_submission', default=None) - p_get_comments.add_argument('-u', '--user', dest='username', default=None) - p_get_comments.add_argument('--dont_supplement', '--dont-supplement', dest='do_supplement', action='store_false') - p_get_comments.add_argument('-l', '--lower', dest='lower', default='update') - p_get_comments.add_argument('-up', '--upper', dest='upper', default=None) + # GET_COMMENTS + p_get_comments = subparsers.add_parser( + 'get_comments', + aliases=['commentaugment'], + description=''' + Collect comments on a subreddit or comments made by a user. + ''', + ) + p_get_comments.add_argument( + '-r', + '--subreddit', + dest='subreddit', + default=None, + ) + p_get_comments.add_argument( + '-s', + '--specific', + dest='specific_submission', + default=None, + help=''' + Given a submission ID like t3_xxxxxx, scan only that submission. 
+ ''', + ) + p_get_comments.add_argument( + '-u', + '--user', + dest='username', + default=None, + ) + p_get_comments.add_argument( + '--dont_supplement', + '--dont-supplement', + dest='do_supplement', + action='store_false', + help=''' + If provided, trust the pushshift data and do not fetch live copies + from reddit. + ''', + ) + p_get_comments.add_argument( + '--lower', + dest='lower', + default='update', + help=''' + If a number - the unix timestamp to start at. + If "update" - continue from latest comment in db. + WARNING: If at some point you collected comments for a particular + submission which was ahead of the rest of your comments, using "update" + will start from that later submission, and you will miss the stuff in + between that specific post and the past. + ''', + ) + p_get_comments.add_argument( + '--upper', + dest='upper', + default=None, + help=''' + If a number - the unix timestamp to stop at. + If not provided - stop at current time. + ''', + ) p_get_comments.set_defaults(func=get_comments_gateway) - p_get_styles = subparsers.add_parser('get_styles', aliases=['getstyles']) - p_get_styles.add_argument('-r', '--subreddit', dest='subreddit') + # GET_STYLES + p_get_styles = subparsers.add_parser( + 'get_styles', + aliases=['getstyles'], + help=''' + Collect the stylesheet, and css images. + ''', + ) + p_get_styles.add_argument( + '-r', + '--subreddit', + dest='subreddit', + ) p_get_styles.set_defaults(func=get_styles_gateway) - p_get_wiki = subparsers.add_parser('get_wiki', aliases=['getwiki']) - p_get_wiki.add_argument('-r', '--subreddit', dest='subreddit') + # GET_WIKI + p_get_wiki = subparsers.add_parser( + 'get_wiki', + aliases=['getwiki'], + description=''' + Collect all available wiki pages. + ''', + ) + p_get_wiki.add_argument( + '-r', + '--subreddit', + dest='subreddit', + ) p_get_wiki.set_defaults(func=get_wiki_gateway) - p_livestream = subparsers.add_parser('livestream') - p_livestream.add_argument('-1', '--once', dest='once', action='store_true') - p_livestream.add_argument('-c', '--comments', dest='comments', action='store_true') - p_livestream.add_argument('-l', '--limit', dest='limit', default=None) - p_livestream.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_livestream.add_argument('-s', '--submissions', dest='submissions', action='store_true') - p_livestream.add_argument('-u', '--user', dest='username', default=None) - p_livestream.add_argument('-w', '--wait', dest='sleepy', default=30) + # LIVESTREAM + p_livestream = subparsers.add_parser( + 'livestream', + description=''' + Continously collect submissions and/or comments. + ''', + ) + p_livestream.add_argument( + '--once', + dest='once', + action='store_true', + help=''' + If provided, only do a single loop. Otherwise go forever. + ''', + ) + p_livestream.add_argument( + '-c', + '--comments', + dest='comments', + action='store_true', + help=''' + If provided, do collect comments. Otherwise don't. + + If submissions and comments are BOTH left unspecified, then they will + BOTH be collected. + ''', + ) + p_livestream.add_argument( + '--limit', + dest='limit', + type=int, + default=None, + help=''' + Number of items to fetch per request. + ''', + ) + p_livestream.add_argument( + '-r', + '--subreddit', + dest='subreddit', + default=None, + help=''' + The subreddit to collect from. + ''', + ) + p_livestream.add_argument( + '-s', + '--submissions', + dest='submissions', + action='store_true', + help=''' + If provided, do collect submissions. Otherwise don't. 
+ + If submissions and comments are BOTH left unspecified, then they will + BOTH be collected. + ''', + ) + p_livestream.add_argument( + '-u', + '--user', + dest='username', + default=None, + help=''' + The redditor to collect from. + ''', + ) + p_livestream.add_argument( + '-w', + '--wait', + dest='sleepy', + default=30, + help=''' + The number of seconds to wait between cycles. + ''', + ) p_livestream.set_defaults(func=livestream_gateway) - p_merge_db = subparsers.add_parser('merge_db', aliases=['mergedb']) - p_merge_db.add_argument('--from', dest='from_db_path', required=True) - p_merge_db.add_argument('--to', dest='to_db_path', required=True) + # MERGEDB' + p_merge_db = subparsers.add_parser( + 'merge_db', + aliases=['mergedb'], + description=''' + Copy all new posts from one timesearch database into another. + ''', + ) + p_merge_db.examples = [ + '--from redditdev1.db --to redditdev2.db', + ] + p_merge_db.add_argument( + '--from', + dest='from_db_path', + required=True, + help=''' + The database file containing the posts you wish to copy. + ''', + ) + p_merge_db.add_argument( + '--to', + dest='to_db_path', + required=True, + help=''' + The database file to which you will copy the posts. + The database is modified in-place. + Existing posts will be ignored and not updated. + ''', + ) p_merge_db.set_defaults(func=merge_db_gateway) - p_offline_reading = subparsers.add_parser('offline_reading') - p_offline_reading.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_offline_reading.add_argument('-s', '--specific', dest='specific_submission', default=None) - p_offline_reading.add_argument('-u', '--user', dest='username', default=None) + # OFFLINE_READING + p_offline_reading = subparsers.add_parser( + 'offline_reading', + description=''' + Render submissions and comment threads to HTML via Markdown. + ''', + ) + p_offline_reading.add_argument( + '-r', + '--subreddit', + dest='subreddit', + default=None, + ) + p_offline_reading.add_argument( + '-s', + '--specific', + dest='specific_submission', + default=None, + type=str, + help=''' + Given a submission ID like t3_xxxxxx, render only that submission. + Otherwise render every submission in the database. + ''', + ) + p_offline_reading.add_argument( + '-u', + '--user', + dest='username', + default=None, + ) p_offline_reading.set_defaults(func=offline_reading_gateway) - p_index = subparsers.add_parser('index', aliases=['redmash']) - p_index.add_argument('--all', dest='do_all', action='store_true') - p_index.add_argument('--author', dest='do_author', action='store_true') - p_index.add_argument('--date', dest='do_date', action='store_true') - p_index.add_argument('--flair', dest='do_flair', action='store_true') - p_index.add_argument('--html', dest='html', action='store_true') - p_index.add_argument('--score', dest='do_score', action='store_true') - p_index.add_argument('--sub', dest='do_subreddit', action='store_true') - p_index.add_argument('--title', dest='do_title', action='store_true') - p_index.add_argument('--offline', dest='offline', action='store_true') - p_index.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_index.add_argument('-st', '--score_threshold', '--score-threshold', dest='score_threshold', default=0) - p_index.add_argument('-u', '--user', dest='username', default=None) + # INDEX + p_index = subparsers.add_parser( + 'index', + aliases=['redmash'], + description=''' + Dump submission listings to a plaintext or HTML file. 
+ ''', + ) + p_index.examples = [ + { + 'args': '-r botwatch --date', + 'comment': 'Does only the date file.' + }, + { + 'args': '-r botwatch --score --title', + 'comment': 'Does both the score and title files.' + }, + { + 'args': '-r botwatch --score --score_threshold 50', + 'comment': 'Only shows submissions with >= 50 points.' + }, + { + 'args': '-r botwatch --all', + 'comment': 'Performs all of the different mashes.' + }, + ] + p_index.add_argument( + '-r', + '--subreddit', + dest='subreddit', + default=None, + help=''' + The subreddit database to dump. + ''', + ) + p_index.add_argument( + '-u', + '--user', + dest='username', + default=None, + help=''' + The username database to dump. + ''', + ) + p_index.add_argument( + '--all', + dest='do_all', + action='store_true', + help=''' + Perform all of the indexes listed below. + ''', + ) + p_index.add_argument( + '--author', + dest='do_author', + action='store_true', + help=''' + For subreddit databases only. + Perform an index sorted by author. + ''', + ) + p_index.add_argument( + '--date', + dest='do_date', + action='store_true', + help=''' + Perform an index sorted by date. + ''', + ) + p_index.add_argument( + '--flair', + dest='do_flair', + action='store_true', + help=''' + Perform an index sorted by flair. + ''', + ) + p_index.add_argument( + '--html', + dest='html', + action='store_true', + help=''' + Write HTML files instead of plain text. + ''', + ) + p_index.add_argument( + '--score', + dest='do_score', + action='store_true', + help=''' + Perform an index sorted by score. + ''', + ) + p_index.add_argument( + '--sub', + dest='do_subreddit', + action='store_true', + help=''' + For username databases only. + Perform an index sorted by subreddit. + ''', + ) + p_index.add_argument( + '--title', + dest='do_title', + action='store_true', + help=''' + Perform an index sorted by title. + ''', + ) + p_index.add_argument( + '--offline', + dest='offline', + action='store_true', + help=''' + The links in the index will point to the files generated by + offline_reading. That is, `../offline_reading/fullname.html` instead + of `http://redd.it/id`. This will NOT trigger offline_reading to + generate the files now, so you must run that tool separately. + ''', + ) + p_index.add_argument( + '--score_threshold', + '--score-threshold', + dest='score_threshold', + type=int, + default=0, + help=''' + Only index posts with at least this many points. + Applies to ALL indexes! + ''', + ) p_index.set_defaults(func=index_gateway) - p_get_submissions = subparsers.add_parser('get_submissions', aliases=['timesearch']) - p_get_submissions.add_argument('-l', '--lower', dest='lower', default='update') - p_get_submissions.add_argument('-r', '--subreddit', dest='subreddit', default=None) - p_get_submissions.add_argument('-u', '--user', dest='username', default=None) - p_get_submissions.add_argument('-up', '--upper', dest='upper', default=None) - p_get_submissions.add_argument('--dont_supplement', '--dont-supplement', dest='do_supplement', action='store_false') + # GET_SUBMISSIONS + p_get_submissions = subparsers.add_parser( + 'get_submissions', + aliases=['timesearch'], + description=''' + Collect submissions from the subreddit across all of history, or + Collect submissions by a user (as many as possible). + ''', + ) + p_get_submissions.add_argument( + '--lower', + dest='lower', + default='update', + help=''' + If a number - the unix timestamp to start at. + If "update" - continue from latest submission in db. 
+ ''', + ) + p_get_submissions.add_argument( + '-r', + '--subreddit', + dest='subreddit', + type=str, + default=None, + help=''' + The subreddit to scan. Mutually exclusive with username. + ''', + ) + p_get_submissions.add_argument( + '-u', + '--user', + dest='username', + type=str, + default=None, + help=''' + The user to scan. Mutually exclusive with subreddit. + ''', + ) + p_get_submissions.add_argument( + '--upper', + dest='upper', + default=None, + help=''' + If a number - the unix timestamp to stop at. + If not provided - stop at current time. + ''', + ) + p_get_submissions.add_argument( + '--dont_supplement', + '--dont-supplement', + dest='do_supplement', + action='store_false', + help=''' + If provided, trust the pushshift data and do not fetch live copies + from reddit. + ''', + ) p_get_submissions.set_defaults(func=get_submissions_gateway) try: - return betterhelp.subparser_main( - argv, - parser, - main_docstring=DOCSTRING, - sub_docstrings=SUB_DOCSTRINGS, - ) + return betterhelp.go(parser, argv) except exceptions.DatabaseNotFound as exc: message = str(exc) message += '\nHave you used any of the other utilities to collect data?'