Add module ingest_jsonfile.
This commit is contained in:
parent
643386b88c
commit
ed1b7dc4eb
3 changed files with 115 additions and 1 deletions
|
@ -35,6 +35,10 @@ def get_wiki_gateway(args):
|
||||||
from timesearch_modules import get_wiki
|
from timesearch_modules import get_wiki
|
||||||
get_wiki.get_wiki_argparse(args)
|
get_wiki.get_wiki_argparse(args)
|
||||||
|
|
||||||
|
def ingest_jsonfile_gateway(args):
    # Gateway for the `ingest_jsonfile` subcommand.  The import is deferred
    # (matching the other *_gateway functions) so that launching the CLI does
    # not pay the import cost of every subcommand's module.
    from timesearch_modules import ingest_jsonfile
    ingest_jsonfile.ingest_jsonfile_argparse(args)
|
||||||
|
|
||||||
def livestream_gateway(args):
    # Gateway for the `livestream` subcommand.  The import is deferred
    # (matching the other *_gateway functions) so that launching the CLI does
    # not pay the import cost of every subcommand's module.
    from timesearch_modules import livestream
    livestream.livestream_argparse(args)
|
||||||
|
@ -208,6 +212,41 @@ def main(argv):
|
||||||
)
|
)
|
||||||
p_get_wiki.set_defaults(func=get_wiki_gateway)
|
p_get_wiki.set_defaults(func=get_wiki_gateway)
|
||||||
|
|
||||||
|
# INGEST_JSONFILE
|
||||||
|
p_ingest_jsonfile = subparsers.add_parser(
|
||||||
|
'ingest_jsonfile',
|
||||||
|
description='''
|
||||||
|
This module was added after reddit's June 2023 API changes which
|
||||||
|
resulted in pushshift losing API access, and pushshift's own API was
|
||||||
|
disabled. The community has made archive files available for download.
|
||||||
|
These archive files contain 1 object (a submission or a comment) per
|
||||||
|
line in a JSON format.
|
||||||
|
|
||||||
|
You can ingest these into timesearch so that you can continue to use
|
||||||
|
timesearch's offline_reading or index features.
|
||||||
|
''',
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'json_file',
|
||||||
|
help='''
|
||||||
|
Path to a file containing 1 json object per line. Each object must be
|
||||||
|
either a submission or a comment.
|
||||||
|
''',
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'-r',
|
||||||
|
'--subreddit',
|
||||||
|
dest='subreddit',
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.add_argument(
|
||||||
|
'-u',
|
||||||
|
'--user',
|
||||||
|
dest='username',
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
p_ingest_jsonfile.set_defaults(func=ingest_jsonfile_gateway)
|
||||||
|
|
||||||
# LIVESTREAM
|
# LIVESTREAM
|
||||||
p_livestream = subparsers.add_parser(
|
p_livestream = subparsers.add_parser(
|
||||||
'livestream',
|
'livestream',
|
||||||
|
|
71
timesearch_modules/ingest_jsonfile.py
Normal file
71
timesearch_modules/ingest_jsonfile.py
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from voussoirkit import pathclass
|
||||||
|
|
||||||
|
from . import common
|
||||||
|
from . import exceptions
|
||||||
|
from . import pushshift
|
||||||
|
from . import tsdb
|
||||||
|
|
||||||
|
def is_submission(obj):
    '''
    Return True if this JSON object looks like a reddit submission.

    A submission is recognized by a fullname prefixed with 't3_', or by the
    presence of the submission-only 'over_18' attribute.
    '''
    fullname = obj.get('name', '')
    if fullname.startswith('t3_'):
        return True
    return obj.get('over_18') is not None
|
||||||
|
|
||||||
|
def is_comment(obj):
    '''
    Return True if this JSON object looks like a reddit comment.

    A comment is recognized by a fullname prefixed with 't1_', or by a
    'parent_id' / 'link_id' pointing at a submission ('t3_' prefix).
    '''
    if obj.get('name', '').startswith('t1_'):
        return True
    if obj.get('parent_id', '').startswith('t3_'):
        return True
    return obj.get('link_id', '').startswith('t3_')
|
||||||
|
|
||||||
|
def jsonfile_to_objects(filepath):
    '''
    Yield pushshift.DummySubmission / pushshift.DummyComment objects from a
    file that contains one JSON object per line.

    filepath:
        Path to the archive file.  Asserted to be an existing file.

    Raises ValueError if a line decodes to an object that is recognized as
    neither a submission nor a comment.
    '''
    filepath = pathclass.Path(filepath)
    filepath.assert_is_file()

    with filepath.open('r', encoding='utf-8') as handle:
        for line in handle:
            line = line.strip()
            if not line:
                # Skip blank lines instead of stopping.  Previously this was
                # `break`, which silently truncated the ingest at the first
                # blank line in the middle of a file.
                continue
            obj = json.loads(line)
            if is_submission(obj):
                yield pushshift.DummySubmission(**obj)
            elif is_comment(obj):
                yield pushshift.DummyComment(**obj)
            else:
                raise ValueError(f'Could not recognize object type {obj}.')
|
||||||
|
|
||||||
|
def ingest_jsonfile(
        filepath,
        subreddit=None,
        username=None,
    ):
    '''
    Ingest a newline-delimited JSON archive file into a timesearch database.

    Exactly one of subreddit / username must be provided; the objects from
    the file are inserted into that subreddit's or user's database.  Prints
    the resulting submission and comment counts when finished.
    '''
    if not common.is_xor(subreddit, username):
        raise exceptions.NotExclusive(['subreddit', 'username'])

    if subreddit:
        (database, subreddit) = tsdb.TSDB.for_subreddit(subreddit, fix_name=True)
    elif username:
        (database, username) = tsdb.TSDB.for_user(username, fix_name=True)

    database.insert(jsonfile_to_objects(filepath))

    cursor = database.sql.cursor()
    cursor.execute('SELECT COUNT(idint) FROM submissions')
    submissioncount = cursor.fetchone()[0]
    cursor.execute('SELECT COUNT(idint) FROM comments')
    commentcount = cursor.fetchone()[0]

    print('Ended with %d submissions and %d comments in %s' % (submissioncount, commentcount, database.filepath.basename))
|
||||||
|
|
||||||
|
def ingest_jsonfile_argparse(args):
    '''
    Command-line entry point for the `ingest_jsonfile` subcommand: unpack
    the parsed argparse namespace and delegate to ingest_jsonfile.
    '''
    return ingest_jsonfile(
        filepath=args.json_file,
        subreddit=args.subreddit,
        username=args.username,
    )
|
|
@ -326,6 +326,10 @@ class TSDB:
|
||||||
def insert(self, objects, commit=True):
|
def insert(self, objects, commit=True):
|
||||||
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
||||||
objects = [objects]
|
objects = [objects]
|
||||||
|
|
||||||
|
if isinstance(objects, types.GeneratorType):
|
||||||
|
log.debug('Trying to insert a generator of objects.')
|
||||||
|
else:
|
||||||
log.debug('Trying to insert %d objects.', len(objects))
|
log.debug('Trying to insert %d objects.', len(objects))
|
||||||
|
|
||||||
new_values = {
|
new_values = {
|
||||||
|
|
Loading…
Reference in a new issue