timesearch/timesearch_modules/ingest_jsonfile.py

72 lines
2.0 KiB
Python

import json
import time
import traceback
from voussoirkit import pathclass
from . import common
from . import exceptions
from . import pushshift
from . import tsdb
def is_submission(obj):
    '''
    Return True if the decoded JSON object looks like a submission.

    obj:
        A dict, as produced by json.loads on one line of the input file.

    The object is considered a submission when its 'name' is a string
    starting with 't3_', or when it carries an 'over_18' value that is not
    None.

    A 'name' that is missing, null, or not a string is treated as "no match"
    rather than raising AttributeError, so the 'over_18' check still gets a
    chance to decide.
    '''
    name = obj.get('name')
    # dict.get's default only applies to a missing key; an explicit null in
    # the JSON arrives here as None, so the type has to be checked.
    if isinstance(name, str) and name.startswith('t3_'):
        return True
    return obj.get('over_18') is not None
def is_comment(obj):
    '''
    Return True if the decoded JSON object looks like a comment.

    obj:
        A dict, as produced by json.loads on one line of the input file.

    The object is considered a comment when any of these hold:
    - 'name' is a string starting with 't1_'
    - 'parent_id' is a string starting with 't3_'
    - 'link_id' is a string starting with 't3_'

    Fields that are missing, null, or not strings are skipped instead of
    raising AttributeError, so one unusable field does not prevent the
    remaining fields from identifying the object.
    '''
    checks = (
        ('name', 't1_'),
        ('parent_id', 't3_'),
        ('link_id', 't3_'),
    )
    for (key, prefix) in checks:
        value = obj.get(key)
        # dict.get's default only covers a missing key, so a null or
        # numeric value must be filtered out before calling startswith.
        if isinstance(value, str) and value.startswith(prefix):
            return True
    return False
def jsonfile_to_objects(filepath):
    '''
    Generate pushshift dummy objects from a file of line-delimited JSON.

    filepath:
        Path to a UTF-8 text file holding one JSON object per line. It is
        wrapped in pathclass.Path and assert_is_file() is called on it.

    Yields:
        pushshift.DummySubmission for objects accepted by is_submission,
        pushshift.DummyComment for objects accepted by is_comment. The decoded
        object's keys are passed to the constructor as keyword arguments.

    Raises:
        ValueError if a decoded object is neither a submission nor a comment.
        json.loads may also raise if a line is not valid JSON.

    Because this is a generator, nothing (including the file check) runs until
    the result is iterated.

    Blank lines are skipped. Previously the first blank line ended the whole
    read, silently discarding every record after it.
    '''
    filepath = pathclass.Path(filepath)
    filepath.assert_is_file()

    with filepath.open('r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Iterating a file never yields '' at EOF, so an empty line
                # here is just a blank line in the data, not the end of it.
                continue

            obj = json.loads(line)
            if is_submission(obj):
                yield pushshift.DummySubmission(**obj)
            elif is_comment(obj):
                yield pushshift.DummyComment(**obj)
            else:
                raise ValueError(f'Could not recognize object type {obj}.')
def ingest_jsonfile(filepath, subreddit=None, username=None):
    '''
    Insert the objects from a line-delimited JSON file into a timesearch
    database, then print the resulting row counts.

    filepath:
        Passed to jsonfile_to_objects to produce the objects to insert.

    subreddit, username:
        Select the database via tsdb.TSDB.for_subreddit or tsdb.TSDB.for_user
        respectively, whichever one is truthy.

    Raises exceptions.NotExclusive when common.is_xor(subreddit, username)
    is false (presumably meaning that not exactly one of them was given;
    confirm against common.is_xor).

    Returns None.
    '''
    exactly_one = common.is_xor(subreddit, username)
    if not exactly_one:
        raise exceptions.NotExclusive(['subreddit', 'username'])

    # The second element of each result is not used again in this function.
    if subreddit:
        (database, _) = tsdb.TSDB.for_subreddit(subreddit, fix_name=True)
    elif username:
        (database, _) = tsdb.TSDB.for_user(username, fix_name=True)

    cursor = database.sql.cursor()
    database.insert(jsonfile_to_objects(filepath))

    # Count submissions first, then comments, reusing the one cursor.
    count_queries = (
        'SELECT COUNT(idint) FROM submissions',
        'SELECT COUNT(idint) FROM comments',
    )
    totals = []
    for query in count_queries:
        cursor.execute(query)
        row = cursor.fetchone()
        totals.append(row[0])

    summary = 'Ended with %d submissions and %d comments in %s' % (
        totals[0],
        totals[1],
        database.filepath.basename,
    )
    print(summary)
def ingest_jsonfile_argparse(args):
    '''
    Call ingest_jsonfile using values taken from a parsed-arguments object.

    args:
        Object exposing the attributes subreddit, username and json_file
        (presumably an argparse.Namespace, going by the function name).

    Returns whatever ingest_jsonfile returns.
    '''
    kwargs = {
        'subreddit': args.subreddit,
        'username': args.username,
        'filepath': args.json_file,
    }
    return ingest_jsonfile(**kwargs)