From ed1b7dc4eb15eca5ac07b9f928a793e8f63759ac Mon Sep 17 00:00:00 2001
From: Ethan Dalool
Date: Sun, 25 Jun 2023 13:24:39 -0700
Subject: [PATCH] Add module ingest_jsonfile.

---
 timesearch.py                         | 39 +++++++++++
 timesearch_modules/ingest_jsonfile.py | 97 +++++++++++++++++++++++++++
 timesearch_modules/tsdb.py            |  6 +-
 3 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 timesearch_modules/ingest_jsonfile.py

diff --git a/timesearch.py b/timesearch.py
index f848f4a..e2470cd 100644
--- a/timesearch.py
+++ b/timesearch.py
@@ -35,6 +35,10 @@ def get_wiki_gateway(args):
     from timesearch_modules import get_wiki
     get_wiki.get_wiki_argparse(args)
 
+def ingest_jsonfile_gateway(args):
+    from timesearch_modules import ingest_jsonfile
+    ingest_jsonfile.ingest_jsonfile_argparse(args)
+
 def livestream_gateway(args):
     from timesearch_modules import livestream
     livestream.livestream_argparse(args)
@@ -208,6 +212,41 @@ def main(argv):
     )
     p_get_wiki.set_defaults(func=get_wiki_gateway)
 
+    # INGEST_JSONFILE
+    p_ingest_jsonfile = subparsers.add_parser(
+        'ingest_jsonfile',
+        description='''
+        This module was added after reddit's June 2023 API changes which
+        resulted in pushshift losing API access, and pushshift's own API was
+        disabled. The community has made archive files available for download.
+        These archive files contain 1 object (a submission or a comment) per
+        line in a JSON format.
+
+        You can ingest these into timesearch so that you can continue to use
+        timesearch's offline_reading or index features.
+        ''',
+    )
+    p_ingest_jsonfile.add_argument(
+        'json_file',
+        help='''
+        Path to a file containing 1 json object per line. Each object must be
+        either a submission or a comment.
+        ''',
+    )
+    p_ingest_jsonfile.add_argument(
+        '-r',
+        '--subreddit',
+        dest='subreddit',
+        default=None,
+    )
+    p_ingest_jsonfile.add_argument(
+        '-u',
+        '--user',
+        dest='username',
+        default=None,
+    )
+    p_ingest_jsonfile.set_defaults(func=ingest_jsonfile_gateway)
+
     # LIVESTREAM
     p_livestream = subparsers.add_parser(
         'livestream',
diff --git a/timesearch_modules/ingest_jsonfile.py b/timesearch_modules/ingest_jsonfile.py
new file mode 100644
index 0000000..bc4aede
--- /dev/null
+++ b/timesearch_modules/ingest_jsonfile.py
@@ -0,0 +1,97 @@
+import json
+import time
+import traceback
+
+from voussoirkit import pathclass
+
+from . import common
+from . import exceptions
+from . import pushshift
+from . import tsdb
+
+def is_submission(obj):
+    '''
+    Return True if the decoded JSON object looks like a submission: its
+    name field starts with t3_, or it has a non-null over_18 field.
+    '''
+    return (
+        obj.get('name', '').startswith('t3_')
+        or obj.get('over_18') is not None
+    )
+
+def is_comment(obj):
+    '''
+    Return True if the decoded JSON object looks like a comment: its
+    name field starts with t1_, or its parent_id or link_id starts with t3_.
+    '''
+    return (
+        obj.get('name', '').startswith('t1_')
+        or obj.get('parent_id', '').startswith('t3_')
+        or obj.get('link_id', '').startswith('t3_')
+    )
+
+def jsonfile_to_objects(filepath):
+    '''
+    Yield a pushshift.DummySubmission or pushshift.DummyComment for each
+    non-blank line of the file, which must hold one JSON object per line.
+
+    Raises ValueError if an object is neither a submission nor a comment.
+    '''
+    filepath = pathclass.Path(filepath)
+    filepath.assert_is_file()
+
+    with filepath.open('r', encoding='utf-8') as handle:
+        for line in handle:
+            line = line.strip()
+            if not line:
+                # A blank line is not the end of the file. Skip it instead
+                # of silently dropping every object that comes after it.
+                continue
+            obj = json.loads(line)
+            if is_submission(obj):
+                yield pushshift.DummySubmission(**obj)
+            elif is_comment(obj):
+                yield pushshift.DummyComment(**obj)
+            else:
+                raise ValueError(f'Could not recognize object type {obj}.')
+
+def ingest_jsonfile(
+        filepath,
+        subreddit=None,
+        username=None,
+    ):
+    '''
+    Insert every object from the json file into the database of the given
+    subreddit or user, then print how many submissions and comments that
+    database now holds.
+
+    Raises exceptions.NotExclusive when common.is_xor rejects the pair
+    (presumably unless exactly one of subreddit / username is given).
+    '''
+    if not common.is_xor(subreddit, username):
+        raise exceptions.NotExclusive(['subreddit', 'username'])
+
+    if subreddit:
+        (database, subreddit) = tsdb.TSDB.for_subreddit(subreddit, fix_name=True)
+    elif username:
+        (database, username) = tsdb.TSDB.for_user(username, fix_name=True)
+    cur = database.sql.cursor()
+
+    objects = jsonfile_to_objects(filepath)
+    database.insert(objects)
+
+    cur.execute('SELECT COUNT(idint) FROM submissions')
+    submissioncount = cur.fetchone()[0]
+    cur.execute('SELECT COUNT(idint) FROM comments')
+    commentcount = cur.fetchone()[0]
+
+    print('Ended with %d submissions and %d comments in %s' % (submissioncount, commentcount, database.filepath.basename))
+
+def ingest_jsonfile_argparse(args):
+    '''
+    Unpack the argparse namespace into a call to ingest_jsonfile.
+    '''
+    return ingest_jsonfile(
+        subreddit=args.subreddit,
+        username=args.username,
+        filepath=args.json_file,
+    )
diff --git a/timesearch_modules/tsdb.py b/timesearch_modules/tsdb.py
index 798d0e7..48e332b 100644
--- a/timesearch_modules/tsdb.py
+++ b/timesearch_modules/tsdb.py
@@ -326,7 +326,11 @@ class TSDB:
     def insert(self, objects, commit=True):
         if not isinstance(objects, (list, tuple, types.GeneratorType)):
             objects = [objects]
-        log.debug('Trying to insert %d objects.', len(objects))
+
+        if isinstance(objects, types.GeneratorType):
+            log.debug('Trying to insert a generator of objects.')
+        else:
+            log.debug('Trying to insert %d objects.', len(objects))
 
         new_values = {
             'tsdb': self,