timesearch/timesearch/commentaugment.py
2017-12-13 14:35:36 -08:00

180 lines
6.1 KiB
Python

import traceback
from . import common
from . import exceptions
from . import tsdb
def commentaugment(
    subreddit=None,
    username=None,
    limit=0,
    num_thresh=0,
    specific_submission=None,
    threshold=0,
    verbose=0,
):
    '''
    Take the IDs of collected submissions, and gather comments from those threads.
    Please see the global DOCSTRING_COMMENTAUGMENT variable.

    subreddit, username:
        Checked with common.is_xor; exceptions.NotExclusive is raised when
        that check fails. Selects which existing database is opened.

    limit:
        Passed through to get_comments_for_thread. 0 is converted to None
        before being passed along.

    num_thresh:
        Only un-augmented submissions with num_comments >= num_thresh are
        selected from the database. None is treated as 0.

    specific_submission:
        A submission ID, with or without the 't3_' prefix. When given, only
        that submission is scanned, the database of the submission's own
        subreddit is used (created if necessary), and the submission itself
        is inserted into the database first.

    threshold:
        Passed through to get_comments_for_thread. None is treated as 0.

    verbose:
        When truthy, progress output is spread over multiple lines and the
        flag is forwarded to get_comments_for_thread.

    Each thread is committed to the database as soon as it is finished.
    '''
    if not common.is_xor(subreddit, username):
        raise exceptions.NotExclusive(['subreddit', 'username'])

    # NOTE(review): commentaugment_argparse feeds these through
    # common.int_none, which presumably yields None for a missing value.
    # A None binding makes `num_comments >= ?` compare against NULL, which
    # matches no rows at all, and a None threshold cannot be compared with
    # an integer count. Fall back to the signature defaults instead.
    if num_thresh is None:
        num_thresh = 0
    if threshold is None:
        threshold = 0

    common.bot.login(common.r)

    if specific_submission is not None:
        if not specific_submission.startswith('t3_'):
            specific_submission = 't3_' + specific_submission
        specific_submission_obj = common.r.submission(specific_submission[3:])
        # The submission decides which subreddit database is used.
        subreddit = specific_submission_obj.subreddit.display_name

    if subreddit:
        # Only a specifically requested submission may create a new database;
        # otherwise the database must already exist.
        do_create = specific_submission is not None
        database = tsdb.TSDB.for_subreddit(subreddit, do_create=do_create)
    else:
        database = tsdb.TSDB.for_user(username, do_create=False)
    cur = database.sql.cursor()

    if limit == 0:
        limit = None

    if specific_submission is None:
        query = '''
            SELECT idstr FROM submissions
            WHERE idstr IS NOT NULL
            AND augmented_at IS NULL
            AND num_comments >= ?
            ORDER BY num_comments DESC
        '''
        cur.execute(query, [num_thresh])
        fullnames = [row[0] for row in cur.fetchall()]
    else:
        # Make sure the object we're augmenting is in the table too!
        database.insert(specific_submission_obj)
        fullnames = [specific_submission]

    totalthreads = len(fullnames)
    spacer = '\n\t' if verbose else ' '

    get_submission = common.nofailrequest(get_submission_immediately)

    for (scannedthreads, fullname) in enumerate(fullnames, start=1):
        # Stored IDs are fullnames like t3_xxxxxx; the API wants the bare ID.
        submission = get_submission(fullname.split('_')[-1])
        message = 'Processing {fullname}{spacer}expecting {num_comments} | '
        message = message.format(
            fullname=submission.fullname,
            spacer=spacer,
            num_comments=submission.num_comments,
        )
        print(message, end='', flush=True)
        if verbose:
            print()

        comments = get_comments_for_thread(submission, limit, threshold, verbose)

        # The comments and the augmented marker are committed together so a
        # thread is never marked as augmented without its comments.
        database.insert(comments, commit=False)
        query = '''
            UPDATE submissions
            set augmented_at = ?,
            augmented_count = ?
            WHERE idstr == ?
        '''
        bindings = [common.get_now(), len(comments), submission.fullname]
        cur.execute(query, bindings)
        database.sql.commit()

        if verbose:
            print('\t', end='')
        message = 'Found {count} |{spacer}{scannedthreads} / {totalthreads}'
        message = message.format(
            count=len(comments),
            spacer=spacer,
            scannedthreads=scannedthreads,
            totalthreads=totalthreads,
        )
        print(message)
def get_comments_for_thread(submission, limit, threshold, verbose):
    '''
    Read the submission's `comments` attribute through common.nofailrequest,
    then hand the result to manually_replace_comments and return the list of
    comments it produces.

    limit, threshold and verbose are forwarded unchanged.
    '''
    fetch_forest = common.nofailrequest(lambda thread: thread.comments)
    forest = fetch_forest(submission)
    # PRAW4 flatten is just list(), which manually_replace_comments takes
    # care of itself.
    return manually_replace_comments(forest, limit, threshold, verbose)
def get_submission_immediately(submission_id):
    '''
    Return the submission object for this ID after touching one of its
    attributes, so that the lazily-loaded object is populated before the
    caller receives it.
    '''
    lazy_submission = common.r.submission(submission_id)
    # Reading any attribute forces the lazyloader; the value is written back
    # unchanged.
    title = lazy_submission.title
    lazy_submission.title = title
    return lazy_submission
def manually_replace_comments(incomments, limit=None, threshold=0, verbose=False):
    '''
    PRAW's replace_more_comments method cannot continue
    where it left off in the case of an Ow! screen.
    So I'm writing my own function to get each MoreComments item individually

    Furthermore, this function will maximize the number of retrieved comments by
    sorting the MoreComments objects and getting the big chunks before worrying
    about the tail ends.

    incomments:
        An object whose .list() method returns the Comment and MoreComments
        items of a thread.

    limit:
        The maximum number of MoreComments items to expand. None means no
        maximum; a value <= 0 means none are expanded.

    threshold:
        MoreComments items whose count is below this number are discarded
        instead of being expanded. None is treated as 0.

    verbose:
        When truthy, print a progress line after each expansion.

    Returns a list of the Comment objects collected. If expanding one
    MoreComments item raises an Exception, the traceback is printed, that item
    is abandoned, and the remaining items are still processed.
    '''
    # A None threshold cannot be compared against item.count, so fall back to
    # the default rather than raising TypeError on the first MoreComments.
    if threshold is None:
        threshold = 0

    more_class = common.praw.models.MoreComments
    comment_class = common.praw.models.Comment

    comments = []
    morecomments = []

    def sift(item):
        '''
        File the item into morecomments or comments.
        Return True only when it was stored as a Comment.
        '''
        if isinstance(item, more_class) and item.count >= threshold:
            morecomments.append(item)
            return False
        if isinstance(item, comment_class):
            comments.append(item)
            return True
        # Anything else, including MoreComments below the threshold, is dropped.
        return False

    # Walk the flattened list from the end, the same order as popping it.
    for item in reversed(incomments.list()):
        sift(item)

    while morecomments and (limit is None or limit > 0):
        # Largest chunk first. The item is removed from the queue before any
        # request is made, so every iteration makes progress even on failure.
        morecomments.sort(key=lambda x: x.count)
        mc = morecomments.pop()
        try:
            additional = common.nofailrequest(mc.comments)()
            if limit is not None:
                limit -= 1

            additionals = 0
            for item in additional:
                if sift(item):
                    additionals += 1

            if verbose:
                s = '\tGot %d more, %d so far.' % (additionals, len(comments))
                if limit is not None:
                    s += ' Can perform %d more replacements' % limit
                print(s)
        except Exception:
            # Best effort: report the problem and carry on with the remaining
            # items. KeyboardInterrupt is not an Exception and still
            # propagates to the caller.
            traceback.print_exc()

    return comments
def commentaugment_argparse(args):
    '''
    Adapter between the parsed command-line namespace and commentaugment.

    limit, threshold and num_thresh are passed through common.int_none first;
    the other attributes are forwarded as they are. Returns whatever
    commentaugment returns.
    '''
    kwargs = {
        'subreddit': args.subreddit,
        'username': args.username,
        'limit': common.int_none(args.limit),
        'threshold': common.int_none(args.threshold),
        'num_thresh': common.int_none(args.num_thresh),
        'verbose': args.verbose,
        'specific_submission': args.specific_submission,
    }
    return commentaugment(**kwargs)