Add module ingest_jsonfile.
This commit is contained in:
parent
643386b88c
commit
ed1b7dc4eb
3 changed files with 115 additions and 1 deletions
|
@ -35,6 +35,10 @@ def get_wiki_gateway(args):
|
||||||
from timesearch_modules import get_wiki
|
from timesearch_modules import get_wiki
|
||||||
get_wiki.get_wiki_argparse(args)
|
get_wiki.get_wiki_argparse(args)
|
||||||
|
|
||||||
|
def ingest_jsonfile_gateway(args):
    # Gateway for the `ingest_jsonfile` subcommand.  The import is deferred
    # (matching the other *_gateway functions) so that launching the CLI does
    # not pay the import cost of every subcommand's module.
    from timesearch_modules import ingest_jsonfile
    ingest_jsonfile.ingest_jsonfile_argparse(args)
|
||||||
|
|
||||||
def livestream_gateway(args):
    # Gateway for the `livestream` subcommand.  The import is deferred
    # (matching the other *_gateway functions) so that launching the CLI does
    # not pay the import cost of every subcommand's module.
    from timesearch_modules import livestream
    livestream.livestream_argparse(args)
|
||||||
|
@ -208,6 +212,41 @@ def main(argv):
|
||||||
)
|
)
|
||||||
p_get_wiki.set_defaults(func=get_wiki_gateway)
|
p_get_wiki.set_defaults(func=get_wiki_gateway)
|
||||||
|
|
||||||
|
# INGEST_JSONFILE
|
||||||
|
p_ingest_jsonfile = subparsers.add_parser(
|
||||||
|
'ingest_jsonfile',
|
||||||
|
description='''
|
||||||
|
This module was added after reddit's June 2023 API changes which
|
||||||
|
resulted in pushshift losing API access, and pushshift's own API was
|
||||||
|
disabled. The community has made archive files available for download.
|
||||||
|
These archive files contain 1 object (a submission or a comment) per
|
||||||
|
line in a JSON format.
|
||||||
|
|
||||||
|
You can ingest these into timesearch so that you can continue to use
|
||||||
|
timesearch's offline_reading or index features.
|
||||||
|
''',
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'json_file',
|
||||||
|
help='''
|
||||||
|
Path to a file containing 1 json object per line. Each object must be
|
||||||
|
either a submission or a comment.
|
||||||
|
''',
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'-r',
|
||||||
|
'--subreddit',
|
||||||
|
dest='subreddit',
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'-u',
|
||||||
|
'--user',
|
||||||
|
dest='username',
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.set_defaults(func=ingest_jsonfile_gateway)
|
||||||
|
|
||||||
# LIVESTREAM
|
# LIVESTREAM
|
||||||
p_livestream = subparsers.add_parser(
|
p_livestream = subparsers.add_parser(
|
||||||
'livestream',
|
'livestream',
|
||||||
|
|
71
timesearch_modules/ingest_jsonfile.py
Normal file
71
timesearch_modules/ingest_jsonfile.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from voussoirkit import pathclass
|
||||||
|
|
||||||
|
from . import common
|
||||||
|
from . import exceptions
|
||||||
|
from . import pushshift
|
||||||
|
from . import tsdb
|
||||||
|
|
||||||
|
def is_submission(obj):
    '''
    Return True if this JSON object looks like a reddit submission.

    A submission is recognized by a fullname prefixed with 't3_', or by the
    presence of the submission-only 'over_18' attribute.
    '''
    fullname = obj.get('name', '')
    if fullname.startswith('t3_'):
        return True
    return obj.get('over_18') is not None
|
||||||
|
|
||||||
|
def is_comment(obj):
    '''
    Return True if this JSON object looks like a reddit comment.

    A comment is recognized by a fullname prefixed with 't1_', or by a
    'parent_id' / 'link_id' pointing at a submission ('t3_' prefix).
    '''
    if obj.get('name', '').startswith('t1_'):
        return True
    if obj.get('parent_id', '').startswith('t3_'):
        return True
    return obj.get('link_id', '').startswith('t3_')
|
||||||
|
|
||||||
|
def jsonfile_to_objects(filepath):
    '''
    Yield pushshift.DummySubmission / pushshift.DummyComment objects from a
    file that contains one JSON object per line.

    filepath:
        Path to the archive file.  Asserted to be an existing file.

    Raises ValueError if a line decodes to an object that is recognized as
    neither a submission nor a comment.
    '''
    filepath = pathclass.Path(filepath)
    filepath.assert_is_file()

    with filepath.open('r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines instead of stopping.  Previously this was
                # `break`, which silently truncated the ingest at the first
                # blank line in the middle of a file.
                continue
            obj = json.loads(line)
            if is_submission(obj):
                yield pushshift.DummySubmission(**obj)
            elif is_comment(obj):
                yield pushshift.DummyComment(**obj)
            else:
                raise ValueError(f'Could not recognize object type {obj}.')
|
||||||
|
|
||||||
|
def ingest_jsonfile(
        filepath,
        subreddit=None,
        username=None,
    ):
    '''
    Ingest a newline-delimited JSON archive file into a timesearch database.

    Exactly one of subreddit / username must be provided; the objects from
    the file are inserted into that subreddit's or user's database.  Prints
    the resulting submission and comment counts when finished.
    '''
    if not common.is_xor(subreddit, username):
        raise exceptions.NotExclusive(['subreddit', 'username'])

    if subreddit:
        (database, subreddit) = tsdb.TSDB.for_subreddit(subreddit, fix_name=True)
    elif username:
        (database, username) = tsdb.TSDB.for_user(username, fix_name=True)

    database.insert(jsonfile_to_objects(filepath))

    cursor = database.sql.cursor()
    cursor.execute('SELECT COUNT(idint) FROM submissions')
    submissioncount = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(idint) FROM comments')
    commentcount = cursor.fetchone()[0]

    print('Ended with %d submissions and %d comments in %s' % (submissioncount, commentcount, database.filepath.basename))
|
||||||
|
|
||||||
|
def ingest_jsonfile_argparse(args):
    '''
    Command-line entry point for the `ingest_jsonfile` subcommand: unpack
    the parsed argparse namespace and delegate to ingest_jsonfile.
    '''
    return ingest_jsonfile(
        filepath=args.json_file,
        subreddit=args.subreddit,
        username=args.username,
    )
|
|
@ -326,6 +326,10 @@ class TSDB:
|
||||||
def insert(self, objects, commit=True):
|
def insert(self, objects, commit=True):
|
||||||
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
||||||
objects = [objects]
|
objects = [objects]
|
||||||
|
|
||||||
|
if isinstance(objects, types.GeneratorType):
|
||||||
|
log.debug('Trying to insert a generator of objects.')
|
||||||
|
else:
|
||||||
log.debug('Trying to insert %d objects.', len(objects))
|
log.debug('Trying to insert %d objects.', len(objects))
|
||||||
|
|
||||||
new_values = {
|
new_values = {
|
||||||
|
|
Loading…
Reference in a new issue