Add tables submission_edits, comment_edits. Somewhat experimental.

But as search API reaches EOL anyway it should be fine.
master
Ethan Dalool 2018-03-22 17:08:57 -07:00
parent 5eccfb18af
commit c7dd46d6ec
1 changed files with 94 additions and 11 deletions

View File

@ -1,5 +1,6 @@
import os import os
import sqlite3 import sqlite3
import time
import types import types
from . import common from . import common
@ -71,9 +72,24 @@ CREATE TABLE IF NOT EXISTS comments(
textlen INT textlen INT
); );
CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr); CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr);
----------------------------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS submission_edits(
idstr TEXT,
previous_selftext TEXT,
replaced_at INT
);
CREATE INDEX IF NOT EXISTS submission_edits_index ON submission_edits(idstr);
----------------------------------------------------------------------------------------------------
CREATE TABLE IF NOT EXISTS comment_edits(
idstr TEXT,
previous_body TEXT,
replaced_at INT
);
CREATE INDEX IF NOT EXISTS comment_edits_index ON comment_edits(idstr);
'''.format(user_version=DATABASE_VERSION) '''.format(user_version=DATABASE_VERSION)
DEFAULT_CONFIG = { DEFAULT_CONFIG = {
'store_edits': True,
} }
SQL_SUBMISSION_COLUMNS = [ SQL_SUBMISSION_COLUMNS = [
@ -111,6 +127,12 @@ SQL_COMMENT_COLUMNS = [
'textlen', 'textlen',
] ]
# Binding keys shared by both the submission_edits and comment_edits tables.
# Note that 'text' is a generic key and not a real column name: the tables
# call that column previous_selftext / previous_body respectively. This is
# fine because insert_edited writes rows with a positional
# `INSERT INTO <table> VALUES(...)`, so only the ORDER of this list has to
# match the column order of the CREATE TABLE statements.
SQL_EDITS_COLUMNS = [
'idstr',
'text',
'replaced_at',
]
SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)} SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)}
SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)} SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)}
@ -216,12 +238,36 @@ class TSDB:
path_formats=DB_FORMATS_USER, path_formats=DB_FORMATS_USER,
) )
def check_for_edits(self, obj, existing_entry):
    '''
    Compare the item's live text against the text stored in the database
    and return whichever one should be written to the main table.

    obj:
        The praw Submission or Comment that was just fetched.
    existing_entry:
        The row currently stored for this item, indexed positionally via
        SQL_SUBMISSION / SQL_COMMENT.

    - If the text is unchanged, the live text is returned as-is.
    - If the text changed but should_keep_existing_text says the new text
      should not replace what we have (e.g. deleted items), the stored
      text is returned.
    - Otherwise the live text is returned, and if the 'store_edits' config
      option is enabled, the stored text is first archived via
      insert_edited.
    '''
    if isinstance(obj, common.praw.models.Submission):
        stored_text = existing_entry[SQL_SUBMISSION['selftext']]
        current_text = obj.selftext
    else:
        stored_text = existing_entry[SQL_COMMENT['body']]
        current_text = obj.body

    # Nothing changed, so there is nothing to decide.
    if current_text == stored_text:
        return current_text

    # The new text is something we do not want to replace our copy with.
    if should_keep_existing_text(obj):
        return stored_text

    # A genuine edit: archive the old text before it gets replaced.
    if self.config['store_edits']:
        self.insert_edited(obj, old_text=stored_text)

    return current_text
def insert(self, objects, commit=True): def insert(self, objects, commit=True):
if not isinstance(objects, (list, tuple, types.GeneratorType)): if not isinstance(objects, (list, tuple, types.GeneratorType)):
objects = [objects] objects = [objects]
common.log.debug('Trying to insert %d objects.', len(objects)) common.log.debug('Trying to insert %d objects.', len(objects))
new_values = { new_values = {
'tsdb': self,
'new_submissions': 0, 'new_submissions': 0,
'new_comments': 0, 'new_comments': 0,
} }
@ -243,6 +289,32 @@ class TSDB:
common.log.debug('Done inserting.') common.log.debug('Done inserting.')
return new_values return new_values
def insert_edited(self, obj, old_text):
    '''
    Having already detected that the item has been edited, add a record to
    the appropriate *_edits table containing the text that is being
    replaced.

    obj:
        The praw Submission or Comment whose text changed. Submissions go
        to submission_edits, everything else goes to comment_edits.
    old_text:
        The previously stored text which is about to be replaced.

    The row is written through a cursor on self.sql; no commit is issued
    here.
    '''
    if isinstance(obj, common.praw.models.Submission):
        table = 'submission_edits'
    else:
        table = 'comment_edits'

    # reddit reports `edited` as False when the item was never edited, as
    # an epoch timestamp when it was, and as a bare True for some old
    # edited items. Neither boolean carries a usable time, and int(True)
    # would silently store 1, so fall back to the current time for both.
    edited = obj.edited
    if isinstance(edited, bool):
        replaced_at = int(time.time())
    else:
        replaced_at = int(edited)

    # The key 'text' is generic; the insert below is positional, so it
    # lands in previous_selftext / previous_body depending on the table.
    postdata = {
        'idstr': obj.fullname,
        'text': old_text,
        'replaced_at': replaced_at,
    }
    cur = self.sql.cursor()
    (qmarks, bindings) = binding_filler(SQL_EDITS_COLUMNS, postdata, require_all=True)
    # `table` is one of the two literals chosen above, never external
    # input, so interpolating it into the query is safe. The values
    # themselves are passed as bindings.
    query = 'INSERT INTO %s VALUES(%s)' % (table, qmarks)
    cur.execute(query, bindings)
def insert_submission(self, submission): def insert_submission(self, submission):
cur = self.sql.cursor() cur = self.sql.cursor()
cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname]) cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname])
@ -285,12 +357,7 @@ class TSDB:
cur.execute(query, bindings) cur.execute(query, bindings)
else: else:
if submission.author is None: selftext = self.check_for_edits(submission, existing_entry=existing_entry)
# This post is deleted, therefore its text probably says [deleted] or [removed].
# Discard that, and keep the data we already had here.
selftext = existing_entry[SQL_SUBMISSION['selftext']]
else:
selftext = submission.selftext
query = ''' query = '''
UPDATE submissions SET UPDATE submissions SET
@ -346,11 +413,7 @@ class TSDB:
cur.execute(query, bindings) cur.execute(query, bindings)
else: else:
greasy = ['has been overwritten', 'pastebin.com/64GuVi2F'] body = self.check_for_edits(comment, existing_entry=existing_entry)
if comment.author is None or any(grease in comment.body for grease in greasy):
body = existing_entry[SQL_COMMENT['body']]
else:
body = comment.body
query = ''' query = '''
UPDATE comments SET UPDATE comments SET
@ -403,3 +466,23 @@ def name_from_path(filepath):
name = os.path.splitext(filepath)[0] name = os.path.splitext(filepath)[0]
name = name.strip('@') name = name.strip('@')
return name return name
def should_keep_existing_text(obj):
    '''
    Decide whether the text already stored in the database should be
    preserved instead of being overwritten by the item's current text.

    Returns True when:
    - the item has no author and its text is exactly '[removed]' or
      '[deleted]', or
    - the text contains one of the known overwrite-script markers.

    Returns False otherwise. Works for both submissions (selftext) and
    comments (body) so the two insert paths can share the same rules.
    '''
    if isinstance(obj, common.praw.models.Submission):
        text = obj.selftext
    else:
        text = obj.body

    # Deleted / removed placeholder: what we already have is more useful.
    if obj.author is None and text in ('[removed]', '[deleted]'):
        return True

    # Text left behind by comment-overwriting scripts.
    for marker in ('has been overwritten', 'pastebin.com/64GuVi2F'):
        if marker in text:
            return True

    return False