From e6b7ed739bcf59b54ed8314be5cc6ba04fe026d9 Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Mon, 9 Apr 2018 19:53:53 -0700 Subject: [PATCH] Integrate with Pushshift.io to restore timesearch. And improve commentaugment. --- README.md | 14 ++- timesearch/__init__.py | 46 +++----- timesearch/commentaugment.py | 211 +++++++++++---------------------- timesearch/common.py | 7 ++ timesearch/livestream.py | 35 +++--- timesearch/pushshift.py | 218 +++++++++++++++++++++++++++++++++++ timesearch/timesearch.py | 134 +++++++-------------- timesearch/tsdb.py | 12 +- 8 files changed, 394 insertions(+), 283 deletions(-) create mode 100644 timesearch/pushshift.py diff --git a/README.md b/README.md index e3268f7..cdc2ed8 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,13 @@ timesearch ========== -# CRITICAL (2018 01 29): +# NEWS (2018 04 09): -[Reddit is in the process of removing the timestamp search feature which makes timesearch possible](https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0/). Please message them and let them know that this feature is important to you. Thank you. +[Reddit has removed the timestamp search feature which timesearch was built off of](https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0/). Please message the admins by [sending a PM to /r/reddit.com](https://www.reddit.com/message/compose?to=%2Fr%2Freddit.com&subject=Timestamp+search). Let them know that this feature is important to you, and you would like them to restore it on the new search stack. + +Thankfully, Jason Baumgartner aka [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix/overview), owner of [Pushshift.io](https://github.com/pushshift/api), has made it easy to interact with his dataset. Timesearch now queries his API to get post data, and then uses reddit's /api/info to get up-to-date information about those posts (scores, edited text bodies, ...). While we're at it, this also gives us the ability to speed up commentaugment. In addition, we can get all of a user's comments which was not possible through reddit alone. + +NOTE: Because Pushshift is an independent dataset run by a regular person, it does not contain posts from private subreddits. Without the timestamp search parameter, scanning private subreddits is now impossible. I urge once again that you contact ~~your senator~~ the admins to have this feature restored. --- @@ -24,8 +28,7 @@ Timesearch is a collection of utilities for archiving subreddits. `> timesearch.py timesearch -r subredditname ` `> timesearch.py timesearch -u username ` -- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comment tree for each submission. - Note: commentaugment only gets the comments attached to the submissions that you found in your timesearch scan. If you're trying to commentaugment on a user, you're going to get comments that were made on their submissions, **not** comments they made on other people's submissions. Therefore, comprehensively collecting a user's activity is not possible. You will have to use someone else's dataset like that of [/u/Stuck_in_the_Matrix](https://reddit.com/u/Stuck_in_the_Matrix) at [pushshift.io](https://pushshift.io). +- **commentaugment**: Although we can search for submissions, we cannot search for comments. After performing a timesearch, you can use commentaugment to download the comments on a subreddit, or the comments made by a user. 
`> timesearch.py commentaugment -r subredditname ` `> timesearch.py commentaugment -u username ` @@ -62,6 +65,9 @@ You will need both the `timesearch` package (folder) and the external `timesearc I recommend [sqlitebrowser](https://github.com/sqlitebrowser/sqlitebrowser/releases) if you want to inspect the database yourself. ### Changelog +- 2018 04 09 + - Integrated with Pushshift to restore timesearch functionality, speed up commentaugment, and get user comments. + - 2017 11 13 - Gave timesearch its own Github repository so that (1) it will be easier for people to download it and (2) it has a cleaner, more independent URL. [voussoir/timesearch](https://github.com/voussoir/timesearch) diff --git a/timesearch/__init__.py b/timesearch/__init__.py index 39b4be0..4e13eae 100644 --- a/timesearch/__init__.py +++ b/timesearch/__init__.py @@ -65,37 +65,21 @@ breakdown: 'commentaugment': ''' commentaugment: - Collect comments for the submissions in the database. - NOTE - if you did a timesearch scan on a username, this function is - mostly useless. It collects comments that were made on OP's submissions - but it does not find OP's comments on other people's submissions which - is what you probably wanted. Unfortunately that's not possible. + Collect comments on a subreddit or comments made by a user. > timesearch.py commentaugment -r subredditname > timesearch.py commentaugment -u username flags: - -l 18 | --limit 18: - The number of MoreComments objects to replace. - Default: No limit - - -t 5 | --threshold 5: - The number of comments a MoreComments object must claim to have - for us to open it. - Actual number received may be lower. - Default: >= 0 - - -n 4 | --num_thresh 4: - The number of comments a submission must claim to have for us to - scan it at all. - Actual number received may be lower. - Default: >= 1 - -s "t3_xxxxxx" | --specific "t3_xxxxxx": Given a submission ID, t3_xxxxxx, scan only that submission. + --dont_supplement: + If provided, trust the pushshift data and do not fetch live copies + from reddit. + -v | --verbose: - If provided, print more stuff while working. + If provided, print extra information to the screen. ''', 'getstyles': ''' @@ -256,11 +240,12 @@ timesearch: If not provided - stop at current time. Default: current time - -i 86400 | --interval 86400: - The initial interval for the scanning window, in seconds. - This is only a starting value. The window will shrink and stretch - as necessary based on received submission counts. - Default: 86400 + --dont_supplement: + If provided, trust the pushshift data and do not fetch live copies + from reddit. + + -v | --verbose: + If provided, print extra information to the screen. 
''', } @@ -339,13 +324,11 @@ p_breakdown.add_argument('-u', '--user', dest='username', default=None) p_breakdown.set_defaults(func=breakdown_gateway) p_commentaugment = subparsers.add_parser('commentaugment') -p_commentaugment.add_argument('-l', '--limit', dest='limit', default=None) -p_commentaugment.add_argument('-n', '--num_thresh', dest='num_thresh', default=1) p_commentaugment.add_argument('-r', '--subreddit', dest='subreddit', default=None) p_commentaugment.add_argument('-s', '--specific', dest='specific_submission', default=None) -p_commentaugment.add_argument('-t', '--threshold', dest='threshold', default=0) p_commentaugment.add_argument('-u', '--user', dest='username', default=None) p_commentaugment.add_argument('-v', '--verbose', dest='verbose', action='store_true') +p_commentaugment.add_argument('--dont_supplement', dest='do_supplement', action='store_false') p_commentaugment.set_defaults(func=commentaugment_gateway) p_getstyles = subparsers.add_parser('getstyles') @@ -393,11 +376,12 @@ p_redmash.add_argument('-u', '--user', dest='username', default=None) p_redmash.set_defaults(func=redmash_gateway) p_timesearch = subparsers.add_parser('timesearch') -p_timesearch.add_argument('-i', '--interval', dest='interval', default=86400) p_timesearch.add_argument('-l', '--lower', dest='lower', default='update') p_timesearch.add_argument('-r', '--subreddit', dest='subreddit', default=None) p_timesearch.add_argument('-u', '--user', dest='username', default=None) p_timesearch.add_argument('-up', '--upper', dest='upper', default=None) +p_timesearch.add_argument('-v', '--verbose', dest='verbose', action='store_true') +p_timesearch.add_argument('--dont_supplement', dest='do_supplement', action='store_false') p_timesearch.set_defaults(func=timesearch_gateway) def main(argv): diff --git a/timesearch/commentaugment.py b/timesearch/commentaugment.py index 541141f..6352290 100644 --- a/timesearch/commentaugment.py +++ b/timesearch/commentaugment.py @@ -2,27 +2,22 @@ import traceback from . import common from . import exceptions +from . import pushshift; print('Thank you Jason Baumgartner, owner of Pushshift.io!') from . import tsdb def commentaugment( subreddit=None, username=None, - limit=0, - num_thresh=0, specific_submission=None, - threshold=0, - verbose=0, + do_supplement=True, ): - ''' - Take the IDs of collected submissions, and gather comments from those threads. - Please see the global DOCSTRING_COMMENTAUGMENT variable. - ''' - if specific_submission is None: - if not common.is_xor(subreddit, username): - raise exceptions.NotExclusive(['subreddit', 'username']) + if not specific_submission and not common.is_xor(subreddit, username): + raise exceptions.NotExclusive(['subreddit', 'username']) + if username and specific_submission: + raise exceptions.NotExclusive(['username', 'specific_submission']) - common.r = common.bot.login(common.r) + common.login() if specific_submission is not None: specific_submission = common.t3_prefix(specific_submission)[3:] specific_submission_obj = common.r.submission(specific_submission) @@ -35,144 +30,78 @@ def commentaugment( (database, username) = tsdb.TSDB.for_user(username, do_create=False, fix_name=True) cur = database.sql.cursor() - if limit == 0: - limit = None - - if specific_submission is None: - query = ''' - SELECT idstr FROM submissions - WHERE idstr IS NOT NULL - AND augmented_at IS NULL - AND num_comments >= ? 
- ORDER BY num_comments DESC - ''' - bindings = [num_thresh] - cur.execute(query, bindings) - fetchall = [item[0] for item in cur.fetchall()] - else: - # Make sure the object we're augmenting is in the table too! + if specific_submission is not None: database.insert(specific_submission_obj) - fetchall = [specific_submission] - totalthreads = len(fetchall) + lower = 0 + query_latest = 'SELECT created FROM comments ORDER BY created DESC LIMIT 1' + if subreddit: + # Instead of blindly taking the highest timestamp currently in the db, + # we must consider the case that the user has previously done a + # specific_submission scan and now wants to do a general scan, which + # would trick the latest timestamp into missing anything before that + # specific submission. + query = ''' + SELECT created FROM comments WHERE NOT EXISTS ( + SELECT 1 FROM submissions + WHERE submissions.idstr == comments.submission + AND submissions.augmented_at IS NOT NULL + ) + ORDER BY created DESC LIMIT 1 + ''' + unaugmented = cur.execute(query).fetchone() + if unaugmented: + lower = unaugmented[0] - 1 + else: + latest = cur.execute(query_latest).fetchone() + if latest: + lower = latest[0] - 1 + if username: + latest = cur.execute(query_latest).fetchone() + if latest: + lower = latest[0] - 1 - if verbose: - spacer = '\n\t' - else: - spacer = ' ' + if specific_submission: + comments = pushshift.get_comments_from_submission(specific_submission_obj) + elif subreddit: + comments = pushshift.get_comments_from_subreddit(subreddit, lower=lower) + elif username: + comments = pushshift.get_comments_from_user(username, lower=lower) - scannedthreads = 0 - get_submission = common.nofailrequest(get_submission_immediately) - while len(fetchall) > 0: - id_batch = fetchall[:100] - fetchall = fetchall[100:] + form = '{lower} - {upper} +{gain}' - for submission in id_batch: - submission = get_submission(submission.split('_')[-1]) - message = 'Processing {fullname}{spacer}expecting {num_comments} | ' - message = message.format( - fullname=submission.fullname, - spacer=spacer, - num_comments=submission.num_comments, - ) - - print(message, end='', flush=True) - if verbose: - print() - - comments = get_comments_for_thread(submission, limit, threshold, verbose) - - database.insert(comments, commit=False) - query = ''' - UPDATE submissions - set augmented_at = ?, - augmented_count = ? - WHERE idstr == ? - ''' - bindings = [common.get_now(), len(comments), submission.fullname] - cur.execute(query, bindings) - database.sql.commit() - - scannedthreads += 1 - if verbose: - print('\t', end='') - message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}' - message = message.format( - count=len(comments), - spacer=spacer, - scannedthreads=scannedthreads, - totalthreads=totalthreads, - ) - print(message) - -def get_comments_for_thread(submission, limit, threshold, verbose): - comments = common.nofailrequest(lambda x: x.comments)(submission) - # PRAW4 flatten is just list(). - comments = manually_replace_comments(comments, limit, threshold, verbose) - return comments - -def get_submission_immediately(submission_id): - submission = common.r.submission(submission_id) - # force the lazyloader - submission.title = submission.title - return submission - -def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False): - ''' - PRAW's replace_more_comments method cannot continue - where it left off in the case of an Ow! screen. 
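The new commentaugment flow above resumes from the newest comment already stored, backing the window up by one second so a comment that shares the boundary timestamp is not silently skipped. A minimal sketch of that resume step, assuming a `comments` table with a `created` column (names taken from the queries above; the database path is a placeholder):

```python
import sqlite3

def get_resume_timestamp(db_path):
    '''Return the timestamp to use as the Pushshift `after` cursor.'''
    db = sqlite3.connect(db_path)
    cur = db.cursor()
    cur.execute('SELECT created FROM comments ORDER BY created DESC LIMIT 1')
    row = cur.fetchone()
    if row is None:
        # Empty database: start from the beginning of time.
        return 0
    # Subtract one second so a comment sharing this exact timestamp
    # is re-fetched on the next scan rather than missed.
    return row[0] - 1
```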
- So I'm writing my own function to get each MoreComments item individually - - Furthermore, this function will maximize the number of retrieved comments by - sorting the MoreComments objects and getting the big chunks before worrying - about the tail ends. - ''' - incomments = incomments.list() - comments = [] - morecomments = [] - while len(incomments) > 0: - item = incomments.pop() - if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold: - morecomments.append(item) - elif isinstance(item, common.praw.models.Comment): - comments.append(item) - - while True: - try: - if limit is not None and limit <= 0: - break - if len(morecomments) == 0: - break - morecomments.sort(key=lambda x: x.count) - mc = morecomments.pop() - additional = common.nofailrequest(mc.comments)() - additionals = 0 - if limit is not None: - limit -= 1 - for item in additional: - if isinstance(item, common.praw.models.MoreComments) and item.count >= threshold: - morecomments.append(item) - elif isinstance(item, common.praw.models.Comment): - comments.append(item) - additionals += 1 - if verbose: - s = '\tGot %d more, %d so far.' % (additionals, len(comments)) - if limit is not None: - s += ' Can perform %d more replacements' % limit - print(s) - except KeyboardInterrupt: - raise - except Exception: - traceback.print_exc() - return comments + if do_supplement: + comments = pushshift.supplement_reddit_data(comments, chunk_size=100) + comments = common.generator_chunker(comments, 200) + for chunk in comments: + step = database.insert(chunk) + message = form.format( + lower=common.human(chunk[0].created_utc), + upper=common.human(chunk[-1].created_utc), + gain=step['new_comments'], + ) + print(message) + if specific_submission: + query = ''' + UPDATE submissions + set augmented_at = ? + WHERE idstr == ? + ''' + bindings = [common.get_now(), specific_submission_obj.fullname] + cur.execute(query, bindings) + database.sql.commit() def commentaugment_argparse(args): + if args.verbose: + common.log.setLevel(common.logging.DEBUG) + return commentaugment( subreddit=args.subreddit, username=args.username, - limit=common.int_none(args.limit), - threshold=common.int_none(args.threshold), - num_thresh=common.int_none(args.num_thresh), - verbose=args.verbose, + #limit=common.int_none(args.limit), + #threshold=common.int_none(args.threshold), + #num_thresh=common.int_none(args.num_thresh), + #verbose=args.verbose, specific_submission=args.specific_submission, + do_supplement=args.do_supplement, ) diff --git a/timesearch/common.py b/timesearch/common.py index dbb5aaa..81b1872 100644 --- a/timesearch/common.py +++ b/timesearch/common.py @@ -4,6 +4,8 @@ import os import time import traceback +VERSION = '2018.04.09.0' + try: import praw except ImportError: @@ -93,6 +95,11 @@ def is_xor(*args): ''' return [bool(a) for a in args].count(True) == 1 +def login(): + global r + log.debug('Logging in to reddit.') + r = bot.login(r) + def nofailrequest(function): ''' Creates a function that will retry until it succeeds. diff --git a/timesearch/livestream.py b/timesearch/livestream.py index 1187b7c..e98bfa6 100644 --- a/timesearch/livestream.py +++ b/timesearch/livestream.py @@ -8,11 +8,14 @@ from . 
import tsdb def generator_printer(generator): + prev_message_length = 0 for step in generator: newtext = '%s: +%ds, %dc' % (step['tsdb'].filepath.basename, step['new_submissions'], step['new_comments']) totalnew = step['new_submissions'] + step['new_comments'] status = '{now} {new}'.format(now=common.human(common.get_now()), new=newtext) - print(status, end='') + clear_prev = (' ' * prev_message_length) + '\r' + print(clear_prev + status, end='') + prev_message_length = len(status) if totalnew == 0 and common.log.level != common.logging.DEBUG: # Since there were no news, allow the next line to overwrite status print('\r', end='', flush=True) @@ -20,6 +23,14 @@ def generator_printer(generator): print() yield None +def cycle_generators(generators, only_once, sleepy): + while True: + for (index, generator) in enumerate(generators): + yield next(generator) + if only_once: + break + time.sleep(sleepy) + def livestream( subreddit=None, username=None, @@ -66,19 +77,15 @@ def livestream( return generators[0] return generators - generators = [generator_printer(generator) for generator in generators] + generator = cycle_generators(generators, only_once, sleepy) + generator = generator_printer(generator) - while True: - try: - for generator in generators: - step = next(generator) - if only_once: - break - time.sleep(sleepy) - - except KeyboardInterrupt: - print() - return + try: + while True: + step = next(generator) + except KeyboardInterrupt: + print() + return hangman = lambda: livestream( username='gallowboob', @@ -101,7 +108,7 @@ def _livestream_as_a_generator( if not any([do_submissions, do_comments]): raise TypeError('Required do_submissions and/or do_comments parameter') - common.r = common.bot.login(common.r) + common.login() if subreddit: common.log.debug('Getting subreddit %s', subreddit) diff --git a/timesearch/pushshift.py b/timesearch/pushshift.py new file mode 100644 index 0000000..ef1dbea --- /dev/null +++ b/timesearch/pushshift.py @@ -0,0 +1,218 @@ +''' +On January 29, 2018, reddit announced the death of the ?timestamp cloudsearch +parameter for submissions. RIP. +https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0 + +This module interfaces with api.pushshift.io to restore this functionality. +It also provides new features previously impossible through reddit alone, such +as scanning all of a user's comments. +''' +import html +import requests +import time + +from . import common + +from voussoirkit import ratelimiter + + +USERAGENT = 'Timesearch ({version}) ({contact})' +API_URL = 'https://api.pushshift.io/reddit/' + +DEFAULT_PARAMS = { + 'size': 1000, + 'sort': 'asc', + 'sort_type': 'created_utc', +} + +# Pushshift does not supply attributes that are null. So we fill them back in. +FALLBACK_ATTRIBUTES = { + 'distinguished': None, + 'edited': False, + 'link_flair_css_class': None, + 'link_flair_text': None, + 'score': 0, + 'selftext': '', +} + +contact_info_message = ''' +Please add a CONTACT_INFO string variable to your bot.py file. +This will be added to your pushshift useragent. 
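For reference, the module-level constants above amount to a plain Pushshift request like the following sketch. The endpoint and parameter names are the ones this module uses; the subreddit name and contact string are placeholders:

```python
import requests

# Placeholder useragent; the real module builds this from bot.py's CONTACT_INFO.
session = requests.Session()
session.headers.update({'User-Agent': 'Timesearch (2018.04.09.0) (your@contact.here)'})

# One page of comments from a subreddit, oldest first, starting at timestamp 0.
response = session.get(
    'https://api.pushshift.io/reddit/comment/search/',
    params={
        'subreddit': 'learnpython',  # placeholder subreddit
        'size': 1000,
        'sort': 'asc',
        'sort_type': 'created_utc',
        'after': 0,
    },
)
response.raise_for_status()
comments = response.json()['data']  # Pushshift wraps results in a 'data' list.
if comments:
    print(len(comments), 'comments, earliest at', comments[0]['created_utc'])
```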
+'''.strip() +if not getattr(common.bot, 'CONTACT_INFO', ''): + raise ValueError(contact_info_message) + +useragent = USERAGENT.format(version=common.VERSION, contact=common.bot.CONTACT_INFO) +ratelimit = ratelimiter.Ratelimiter(allowance=60, period=60) +session = requests.Session() +session.headers.update({'User-Agent': useragent}) + + +class DummyObject: + ''' + These classes are used to convert the JSON data we get from pushshift into + objects so that the rest of timesearch can operate transparently. + This requires a bit of whack-a-mole including: + - Fleshing out the attributes which PS did not include because they were + null (we use FALLBACK_ATTRIBUTES to replace them). + - Providing the convenience methods and @properties that PRAW provides. + - Mimicking the rich attributes like author and subreddit. + ''' + def __init__(self, **attributes): + for (key, val) in attributes.items(): + if key == 'author': + val = DummyObject(name=val) + elif key == 'subreddit': + val = DummyObject(display_name=val) + elif key in ['body', 'selftext']: + val = html.unescape(val) + + setattr(self, key, val) + + for (key, val) in FALLBACK_ATTRIBUTES.items(): + if not hasattr(self, key): + setattr(self, key, val) +# This seems to occur in rare cases such as promo posts. +FALLBACK_ATTRIBUTES['subreddit'] = DummyObject(display_name=None) + +class DummySubmission(DummyObject): + @property + def fullname(self): + return 't3_' + self.id + +class DummyComment(DummyObject): + @property + def fullname(self): + return 't1_' + self.id + + +def _normalize_subreddit(subreddit): + if isinstance(subreddit, str): + return subreddit + else: + return subreddit.display_name + +def _normalize_user(user): + if isinstance(user, str): + return user + else: + return user.display_name + +def _pagination_core(url, params, dummy_type, lower=None, upper=None): + if upper is not None: + params['before'] = upper + if lower is not None: + params['after'] = lower + + setify = lambda items: set(item['id'] for item in items) + prev_batch_ids = set() + while True: + batch = get(url, params) + batch_ids = setify(batch) + if len(batch_ids) == 0 or batch_ids.issubset(prev_batch_ids): + break + submissions = [dummy_type(**x) for x in batch if x['id'] not in prev_batch_ids] + submissions.sort(key=lambda x: x.created_utc) + # Take the latest-1 to avoid the lightning strike chance that two posts + # have the same timestamp and this occurs at a page boundary. + # Since ?after=latest would cause us to miss that second one. 
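The pagination loop here advances the `after` cursor to the newest timestamp seen minus one second, and stops once a page contains no ids it has not already seen, which also guards against Pushshift repeating the same page forever. A condensed sketch of that cursor pattern, assuming a `fetch_page(params)` helper that returns one page of JSON dicts (a hypothetical stand-in for the `get()` function defined below):

```python
def paginate(params, fetch_page):
    '''Yield Pushshift records oldest-to-newest until the results are exhausted.'''
    prev_ids = set()
    while True:
        batch = fetch_page(params)
        batch_ids = {item['id'] for item in batch}
        if not batch_ids or batch_ids.issubset(prev_ids):
            # Nothing new on this page: we have reached the end.
            break
        new_items = [item for item in batch if item['id'] not in prev_ids]
        new_items.sort(key=lambda item: item['created_utc'])
        yield from new_items
        # Back up one second so an item sharing the boundary timestamp
        # on the next page is not skipped.
        params['after'] = new_items[-1]['created_utc'] - 1
        prev_ids = batch_ids
```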
+ params['after'] = submissions[-1].created_utc - 1 + yield from submissions + + prev_batch_ids = batch_ids + ratelimit.limit() + +def get(url, params=None): + if not url.startswith('https://'): + url = API_URL + url.lstrip('/') + + if params is None: + params = {} + + for (key, val) in DEFAULT_PARAMS.items(): + params.setdefault(key, val) + + common.log.debug('Requesting %s with %s', url, params) + response = session.get(url, params=params) + response.raise_for_status() + response = response.json() + data = response['data'] + return data + +def get_comments_from_submission(submission): + if isinstance(submission, str): + submission_id = common.t3_prefix(submission)[3:] + else: + submission_id = submission.id + + params = {'link_id': submission_id} + comments = _pagination_core( + url='comment/search/', + params=params, + dummy_type=DummyComment, + ) + yield from comments + +def get_comments_from_subreddit(subreddit, **kwargs): + subreddit = _normalize_subreddit(subreddit) + params = {'subreddit': subreddit} + comments = _pagination_core( + url='comment/search/', + params=params, + dummy_type=DummyComment, + **kwargs + ) + yield from comments + +def get_comments_from_user(user, **kwargs): + user = _normalize_user(user) + params = {'author': user} + comments = _pagination_core( + url='comment/search/', + params=params, + dummy_type=DummyComment, + **kwargs + ) + yield from comments + +def get_submissions_from_subreddit(subreddit, **kwargs): + subreddit = _normalize_subreddit(subreddit) + params = {'subreddit': subreddit} + submissions = _pagination_core( + url='submission/search/', + params=params, + dummy_type=DummySubmission, + **kwargs + ) + yield from submissions + +def get_submissions_from_user(user, **kwargs): + user = _normalize_user(user) + params = {'author': user} + submissions = _pagination_core( + url='submission/search/', + params=params, + dummy_type=DummySubmission, + **kwargs + ) + yield from submissions + +def supplement_reddit_data(dummies, chunk_size=100): + ''' + Given an iterable of the Dummy Pushshift objects, yield them back and also + yield the live Reddit objects they refer to according to reddit's /api/info. + The live object will always come after the corresponding dummy object. + By doing this, we enjoy the strengths of both data sources: Pushshift + will give us deleted or removed objects that reddit would not, and reddit + gives us up-to-date scores and text bodies. + ''' + chunks = common.generator_chunker(dummies, chunk_size) + for chunk in chunks: + ids = [item.fullname for item in chunk] + live_copies = list(common.r.info(ids)) + live_copies = {item.fullname: item for item in live_copies} + for item in chunk: + yield item + live_item = live_copies.get(item.fullname, None) + if live_item: + yield live_item diff --git a/timesearch/timesearch.py b/timesearch/timesearch.py index beb05ed..f302f66 100644 --- a/timesearch/timesearch.py +++ b/timesearch/timesearch.py @@ -3,20 +3,34 @@ import traceback from . import common from . import exceptions +from . import pushshift; print('Thank you Jason Baumgartner, owner of Pushshift.io!') from . import tsdb -# The maximum amount by which it can multiply the interval -# when not enough posts are found. 
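`supplement_reddit_data` above is the piece that marries the two data sources: each Pushshift dummy is yielded first, and if reddit still serves the item, the live PRAW object follows so the database ends up with current scores and bodies. A rough sketch of one chunk of that interleave, mirroring the `common.r.info(ids)` call above; the PRAW credentials shown are placeholders, not this project's login flow:

```python
import praw

# Placeholder credentials; timesearch reads its own bot.py config instead.
reddit = praw.Reddit(client_id='...', client_secret='...', user_agent='timesearch example')

def supplement_chunk(dummies, reddit):
    '''Yield each Pushshift dummy followed by its live reddit copy, when one exists.'''
    fullnames = [item.fullname for item in dummies]
    # /api/info silently omits items reddit no longer serves, so build a lookup dict.
    live = {thing.fullname: thing for thing in reddit.info(fullnames=fullnames)}
    for item in dummies:
        yield item
        if item.fullname in live:
            yield live[item.fullname]
```

Yielding the dummy first means the deleted-or-removed Pushshift text always reaches the database, and the live copy, when present, overwrites only the fields reddit still exposes.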
-MAXIMUM_EXPANSION_MULTIPLIER = 2 +def _normalize_subreddit(subreddit): + if subreddit is None: + pass + elif isinstance(subreddit, str): + subreddit = common.r.subreddit(subreddit) + elif not isinstance(subreddit, common.praw.models.Subreddit): + raise TypeError(type(subreddit)) + return subreddit +def _normalize_user(user): + if user is None: + pass + elif isinstance(user, str): + user = common.r.redditor(user) + elif not isinstance(user, common.praw.models.Redditor): + raise TypeError(type(user)) + return user def timesearch( subreddit=None, username=None, lower=None, upper=None, - interval=86400, + do_supplement=True, ): ''' Collect submissions across time. @@ -25,120 +39,60 @@ def timesearch( if not common.is_xor(subreddit, username): raise exceptions.NotExclusive(['subreddit', 'username']) - common.r = common.bot.login(common.r) + common.login() if subreddit: (database, subreddit) = tsdb.TSDB.for_subreddit(subreddit, fix_name=True) - else: - # When searching, we'll take the user's submissions from anywhere. - subreddit = 'all' + elif username: (database, username) = tsdb.TSDB.for_user(username, fix_name=True) cur = database.sql.cursor() + subreddit = _normalize_subreddit(subreddit) + user = _normalize_user(username) + if lower == 'update': # Start from the latest submission - cur.execute('SELECT * FROM submissions ORDER BY idint DESC LIMIT 1') - f = cur.fetchone() - if f: - lower = f[tsdb.SQL_SUBMISSION['created']] - print(f[tsdb.SQL_SUBMISSION['idstr']], common.human(lower), lower) + cur.execute('SELECT created FROM submissions ORDER BY created DESC LIMIT 1') + fetch = cur.fetchone() + if fetch is not None: + lower = fetch[0] else: lower = None + if lower is None: + lower = 0 - if not isinstance(subreddit, common.praw.models.Subreddit): - subreddit = common.r.subreddit(subreddit) + if upper is None: + upper = common.get_now() + 86400 - if subreddit != 'all': - if isinstance(subreddit, common.praw.models.Subreddit): - creation = subreddit.created_utc - else: - subreddits = subreddit.split('+') - subreddits = [common.r.subreddit(sr) for sr in subreddits] - creation = min([sr.created_utc for sr in subreddits]) - else: - if not isinstance(username, common.praw.models.Redditor): - user = common.r.redditor(username) - creation = user.created_utc + form = '{lower} - {upper} +{gain}' - if lower is None or lower < creation: - lower = creation - - maxupper = upper - if maxupper is None: - maxupper = common.get_now() + 86400 - - form = '{upper} - {lower} +{gain}' if username: - query = 'author:"%s"' % username + submissions = pushshift.get_submissions_from_user(username, lower=lower, upper=upper) else: - query = None + submissions = pushshift.get_submissions_from_subreddit(subreddit, lower=lower, upper=upper) - submissions = subreddit.submissions(start=lower, end=maxupper, extra_query=query) - submissions = common.generator_chunker(submissions, 100) + if do_supplement: + submissions = pushshift.supplement_reddit_data(submissions, chunk_size=100) + submissions = common.generator_chunker(submissions, 200) for chunk in submissions: - chunk.sort(key=lambda x: x.created_utc, reverse=True) + chunk.sort(key=lambda x: x.created_utc) new_count = database.insert(chunk)['new_submissions'] message = form.format( - upper=common.human(chunk[0].created_utc), - lower=common.human(chunk[-1].created_utc), + lower=common.human(chunk[0].created_utc), + upper=common.human(chunk[-1].created_utc), gain=new_count, ) print(message) - #upper = lower + interval - #toomany_inarow = 0 - # while lower < maxupper: - # 
print('\nCurrent interval:', interval, 'seconds') - # print('Lower:', common.human(lower), lower) - # print('Upper:', common.human(upper), upper) - # if username: - # query = '(and author:"%s" (and timestamp:%d..%d))' % (username, lower, upper) - # else: - # query = 'timestamp:%d..%d' % (lower, upper) - - # try: - # searchresults = subreddit.search( - # query, - # sort='new', - # limit=100, - # syntax='cloudsearch' - # ) - # searchresults = list(searchresults) - # except Exception: - # traceback.print_exc() - # print('resuming in 5...') - # time.sleep(5) - # continue - - # searchresults.sort(key=lambda x: x.created_utc) - # print([i.id for i in searchresults]) - - # itemsfound = len(searchresults) - # print('Found', itemsfound, 'items.') - # if itemsfound < 50: - # print('Too few results, increasing interval', end='') - # diff = (1 - (itemsfound / 75)) + 1 - # diff = min(MAXIMUM_EXPANSION_MULTIPLIER, diff) - # interval = int(interval * diff) - # if itemsfound > 99: - # #Intentionally not elif - # print('Too many results, reducing interval', end='') - # interval = int(interval * (0.8 - (0.05 * toomany_inarow))) - # upper = lower + interval - # toomany_inarow += 1 - # else: - # lower = upper - # upper = lower + interval - # toomany_inarow = max(0, toomany_inarow-1) - # print(database.insert(searchresults)) - # print() - cur.execute('SELECT COUNT(idint) FROM submissions') itemcount = cur.fetchone()[0] print('Ended with %d items in %s' % (itemcount, database.filepath.basename)) def timesearch_argparse(args): + if args.verbose: + common.log.setLevel(common.logging.DEBUG) + if args.lower == 'update': lower = 'update' else: @@ -149,5 +103,5 @@ def timesearch_argparse(args): username=args.username, lower=lower, upper=common.int_none(args.upper), - interval=common.int_none(args.interval), + do_supplement=args.do_supplement, ) diff --git a/timesearch/tsdb.py b/timesearch/tsdb.py index eddacaa..2f1c6af 100644 --- a/timesearch/tsdb.py +++ b/timesearch/tsdb.py @@ -5,6 +5,7 @@ import types from . import common from . import exceptions +from . import pushshift from voussoirkit import pathclass @@ -136,6 +137,8 @@ SQL_EDITS_COLUMNS = [ SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)} SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)} +SUBMISSION_TYPES = (common.praw.models.Submission, pushshift.DummySubmission) +COMMENT_TYPES = (common.praw.models.Comment, pushshift.DummyComment) class TSDB: def __init__(self, filepath, do_create=True): @@ -247,7 +250,7 @@ class TSDB: Then, if the database is configured to store edited text, do so. Finally, return the body that we want to store in the main table. ''' - if isinstance(obj, common.praw.models.Submission): + if isinstance(obj, SUBMISSION_TYPES): existing_body = existing_entry[SQL_SUBMISSION['selftext']] body = obj.selftext else: @@ -275,6 +278,9 @@ class TSDB: common.praw.models.Submission: (self.insert_submission, 'new_submissions'), common.praw.models.Comment: (self.insert_comment, 'new_comments'), } + methods[pushshift.DummySubmission] = methods[common.praw.models.Submission] + methods[pushshift.DummyComment] = methods[common.praw.models.Comment] + for obj in objects: (method, key) = methods.get(type(obj), (None, None)) if method is None: @@ -295,7 +301,7 @@ class TSDB: the appropriate *_edits table containing the text that is being replaced. 
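The tsdb changes above make the database layer source-agnostic: isinstance checks now use tuples that accept both the PRAW models and the Pushshift dummies, and `insert()` dispatches on the exact type through a dict. A stripped-down sketch of that dispatch pattern; the class and function names here are illustrative stand-ins, not the real tsdb API:

```python
class Submission: pass
class DummySubmission: pass
class Comment: pass
class DummyComment: pass

SUBMISSION_TYPES = (Submission, DummySubmission)
COMMENT_TYPES = (Comment, DummyComment)

def insert_submission(obj): ...
def insert_comment(obj): ...

# Map every concrete type to the handler and counter key it shares.
METHODS = {
    Submission: (insert_submission, 'new_submissions'),
    Comment: (insert_comment, 'new_comments'),
}
METHODS[DummySubmission] = METHODS[Submission]
METHODS[DummyComment] = METHODS[Comment]

def insert(objects):
    counts = {'new_submissions': 0, 'new_comments': 0}
    for obj in objects:
        (method, key) = METHODS.get(type(obj), (None, None))
        if method is None:
            raise TypeError('Unsupported type %s' % type(obj))
        method(obj)
        counts[key] += 1
    return counts
```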
''' - if isinstance(obj, common.praw.models.Submission): + if isinstance(obj, SUBMISSION_TYPES): table = 'submission_edits' else: table = 'comment_edits' @@ -477,7 +483,7 @@ def should_keep_existing_text(obj): This function puts away the work I would otherwise have to duplicate for both submissions and comments. ''' - body = obj.selftext if isinstance(obj, common.praw.models.Submission) else obj.body + body = obj.selftext if isinstance(obj, SUBMISSION_TYPES) else obj.body if obj.author is None and body in ['[removed]', '[deleted]']: return True
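This check is what lets the Pushshift copy win when reddit's live copy has been wiped: if the author is gone and the body is just a removal placeholder, the text already stored (usually the Pushshift version) is kept. A self-contained sketch of the same rule, with the isinstance test replaced by a plain `is_submission` flag for brevity:

```python
def should_keep_existing_text(obj, is_submission):
    '''Return True if the live copy was wiped and the stored text should stay.'''
    body = obj.selftext if is_submission else obj.body
    return obj.author is None and body in ('[removed]', '[deleted]')

class WipedComment:
    author = None
    body = '[removed]'

print(should_keep_existing_text(WipedComment(), is_submission=False))  # True
```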