'''
On January 29, 2018, reddit announced the death of the ?timestamp cloudsearch
parameter for submissions. RIP.
https://www.reddit.com/r/changelog/comments/7tus5f/update_to_search_api/dtfcdn0

This module interfaces with api.pushshift.io to restore this functionality.
It also provides new features previously impossible through reddit alone, such
as scanning all of a user's comments.
'''
import html
import requests
import time
import traceback

from . import common

from voussoirkit import ratelimiter
from voussoirkit import vlogging

log = vlogging.get_logger(__name__)

print('Thank you Jason Baumgartner of Pushshift.io!')

USERAGENT = 'Timesearch ({version}) ({contact})'
API_URL = 'https://api.pushshift.io/reddit/'

DEFAULT_PARAMS = {
    'size': 1000,
    'order': 'asc',
    'sort': 'created_utc',
}

# Pushshift does not supply attributes that are null. So we fill them back in.
FALLBACK_ATTRIBUTES = {
    'distinguished': None,
    'edited': False,
    'link_flair_css_class': None,
    'link_flair_text': None,
    'score': 0,
    'selftext': '',
}

contact_info_message = '''
Please add a CONTACT_INFO string variable to your bot.py file.
This will be added to your pushshift useragent.
'''.strip()
if not getattr(common.bot, 'CONTACT_INFO', ''):
    raise ValueError(contact_info_message)

useragent = USERAGENT.format(version=common.VERSION, contact=common.bot.CONTACT_INFO)

session = requests.Session()
session.headers.update({'User-Agent': useragent})

ratelimit = ratelimiter.Ratelimiter(allowance=120, period=60)

class DummyObject:
    '''
    These classes are used to convert the JSON data we get from pushshift into
    objects so that the rest of timesearch can operate transparently.
    This requires a bit of whack-a-mole including:
    - Fleshing out the attributes which PS did not include because they were
      null (we use FALLBACK_ATTRIBUTES to replace them).
    - Providing the convenience methods and @properties that PRAW provides.
    - Mimicking the rich attributes like author and subreddit.
    '''
    def __init__(self, **attributes):
        for (key, val) in attributes.items():
            if key == 'author':
                val = DummyObject(name=val)
            elif key == 'subreddit':
                val = DummyObject(display_name=val)
            elif key in ['body', 'selftext']:
                val = html.unescape(val)
            elif key == 'parent_id':
                if val is None:
                    val = attributes['link_id']
                elif isinstance(val, int):
                    val = 't1_' + common.b36(val)
            setattr(self, key, val)

        for (key, val) in FALLBACK_ATTRIBUTES.items():
            if not hasattr(self, key):
                setattr(self, key, val)

# In rare cases, things sometimes don't have a subreddit.
# Promo posts seem to be one example.
FALLBACK_ATTRIBUTES['subreddit'] = DummyObject(display_name=None)

class DummySubmission(DummyObject):
    @property
    def fullname(self):
        return 't3_' + self.id

class DummyComment(DummyObject):
    @property
    def fullname(self):
        return 't1_' + self.id

def _normalize_subreddit(subreddit):
    if isinstance(subreddit, str):
        return subreddit
    else:
        return subreddit.display_name

def _normalize_user(user):
    if isinstance(user, str):
        return user
    else:
        return user.name

def _pagination_core(url, params, dummy_type, lower=None, upper=None):
    if upper is not None:
        params['before'] = upper
    if lower is not None:
        params['after'] = lower

    setify = lambda items: set(item['id'] for item in items)

    prev_batch_ids = set()
    while True:
        for retry in range(5):
            try:
                batch = get(url, params)
            except requests.exceptions.HTTPError as exc:
                last_exception = exc
                traceback.print_exc()
                print('Retrying in 5...')
                time.sleep(5)
            else:
                break
        else:
            # All five attempts failed. Don't fall through with `batch`
            # undefined; re-raise the most recent error instead.
            raise last_exception

        log.debug('Got batch of %d items.', len(batch))
        batch_ids = setify(batch)
        # Stop when pushshift returns nothing we haven't already seen.
        if len(batch_ids) == 0 or batch_ids.issubset(prev_batch_ids):
            break
        submissions = [dummy_type(**x) for x in batch if x['id'] not in prev_batch_ids]
        submissions.sort(key=lambda x: x.created_utc)
        # Take the latest-1 to avoid the lightning strike chance that two posts
        # have the same timestamp and this occurs at a page boundary.
        # Since ?after=latest would cause us to miss that second one.
        params['after'] = submissions[-1].created_utc - 1
        yield from submissions
        prev_batch_ids = batch_ids
        ratelimit.limit()

def get(url, params=None):
    # Relative URLs are resolved against the pushshift API root.
    if not url.startswith('https://'):
        url = API_URL + url.lstrip('/')

    if params is None:
        params = {}

    for (key, val) in DEFAULT_PARAMS.items():
        params.setdefault(key, val)

    log.debug('Requesting %s with %s', url, params)
    ratelimit.limit()
    response = session.get(url, params=params)
    response.raise_for_status()
    response = response.json()
    data = response['data']
    return data

def get_comments_from_submission(submission):
    if isinstance(submission, str):
        submission_id = common.t3_prefix(submission)[3:]
    else:
        submission_id = submission.id

    params = {'link_id': submission_id}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
    )
    yield from comments

def get_comments_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_comments_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    comments = _pagination_core(
        url='comment/search/',
        params=params,
        dummy_type=DummyComment,
        **kwargs
    )
    yield from comments

def get_submissions_from_subreddit(subreddit, **kwargs):
    subreddit = _normalize_subreddit(subreddit)
    params = {'subreddit': subreddit}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions

def get_submissions_from_user(user, **kwargs):
    user = _normalize_user(user)
    params = {'author': user}
    submissions = _pagination_core(
        url='submission/search/',
        params=params,
        dummy_type=DummySubmission,
        **kwargs
    )
    yield from submissions

def supplement_reddit_data(dummies, chunk_size=100):
    '''
    Given an iterable of the Dummy Pushshift objects, yield them back and also
    yield the live Reddit objects they refer to according to reddit's
    /api/info. The live object will always come after the corresponding dummy
    object.
    By doing this, we enjoy the strengths of both data sources: Pushshift will
    give us deleted or removed objects that reddit would not, and reddit gives
    us up-to-date scores and text bodies.
    '''
    chunks = common.generator_chunker(dummies, chunk_size)
    for chunk in chunks:
        log.debug('Supplementing %d items with live reddit data.', len(chunk))
        ids = [item.fullname for item in chunk]
        live_copies = list(common.r.info(ids))
        live_copies = {item.fullname: item for item in live_copies}
        for item in chunk:
            yield item
            live_item = live_copies.get(item.fullname, None)
            if live_item:
                yield live_item
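
# Illustrative usage sketch, kept behind a __main__ guard so importing this
# module is unaffected. This mirrors how the generators above are meant to be
# consumed together; the subreddit name and `lower` timestamp are placeholder
# values, not anything taken from the rest of timesearch. Note that because of
# the relative `from . import common` above, this file must be invoked with
# `python -m <package>.pushshift` rather than run directly as a script.
if __name__ == '__main__':
    # Pull a subreddit's submissions oldest-first from pushshift, then
    # interleave the live reddit copies wherever reddit still has them.
    dummies = get_submissions_from_subreddit('learnpython', lower=1500000000)
    for item in supplement_reddit_data(dummies):
        print(item.fullname, item.created_utc)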