Add tables submission_edits, comment_edits. Somewhat experimental.
But as search API reaches EOL anyway it should be fine.
This commit is contained in:
parent
5eccfb18af
commit
c7dd46d6ec
1 changed files with 94 additions and 11 deletions
|
@ -1,5 +1,6 @@
|
||||||
import os
|
import os
|
||||||
import sqlite3
|
import sqlite3
|
||||||
|
import time
|
||||||
import types
|
import types
|
||||||
|
|
||||||
from . import common
|
from . import common
|
||||||
|
@ -71,9 +72,24 @@ CREATE TABLE IF NOT EXISTS comments(
|
||||||
textlen INT
|
textlen INT
|
||||||
);
|
);
|
||||||
CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr);
|
CREATE INDEX IF NOT EXISTS comment_index ON comments(idstr);
|
||||||
|
----------------------------------------------------------------------------------------------------
|
||||||
|
CREATE TABLE IF NOT EXISTS submission_edits(
|
||||||
|
idstr TEXT,
|
||||||
|
previous_selftext TEXT,
|
||||||
|
replaced_at INT
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS submission_edits_index ON submission_edits(idstr);
|
||||||
|
----------------------------------------------------------------------------------------------------
|
||||||
|
CREATE TABLE IF NOT EXISTS comment_edits(
|
||||||
|
idstr TEXT,
|
||||||
|
previous_body TEXT,
|
||||||
|
replaced_at INT
|
||||||
|
);
|
||||||
|
CREATE INDEX IF NOT EXISTS comment_edits_index ON comment_edits(idstr);
|
||||||
'''.format(user_version=DATABASE_VERSION)
|
'''.format(user_version=DATABASE_VERSION)
|
||||||
|
|
||||||
DEFAULT_CONFIG = {
|
DEFAULT_CONFIG = {
|
||||||
|
'store_edits': True,
|
||||||
}
|
}
|
||||||
|
|
||||||
SQL_SUBMISSION_COLUMNS = [
|
SQL_SUBMISSION_COLUMNS = [
|
||||||
|
@ -111,6 +127,12 @@ SQL_COMMENT_COLUMNS = [
|
||||||
'textlen',
|
'textlen',
|
||||||
]
|
]
|
||||||
|
|
||||||
|
SQL_EDITS_COLUMNS = [
|
||||||
|
'idstr',
|
||||||
|
'text',
|
||||||
|
'replaced_at',
|
||||||
|
]
|
||||||
|
|
||||||
SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)}
|
SQL_SUBMISSION = {key:index for (index, key) in enumerate(SQL_SUBMISSION_COLUMNS)}
|
||||||
SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)}
|
SQL_COMMENT = {key:index for (index, key) in enumerate(SQL_COMMENT_COLUMNS)}
|
||||||
|
|
||||||
|
@ -216,12 +238,36 @@ class TSDB:
|
||||||
path_formats=DB_FORMATS_USER,
|
path_formats=DB_FORMATS_USER,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def check_for_edits(self, obj, existing_entry):
|
||||||
|
'''
|
||||||
|
If the item's current text doesn't match the stored text, decide what
|
||||||
|
to do.
|
||||||
|
|
||||||
|
Firstly, make sure to ignore deleted comments.
|
||||||
|
Then, if the database is configured to store edited text, do so.
|
||||||
|
Finally, return the body that we want to store in the main table.
|
||||||
|
'''
|
||||||
|
if isinstance(obj, common.praw.models.Submission):
|
||||||
|
existing_body = existing_entry[SQL_SUBMISSION['selftext']]
|
||||||
|
body = obj.selftext
|
||||||
|
else:
|
||||||
|
existing_body = existing_entry[SQL_COMMENT['body']]
|
||||||
|
body = obj.body
|
||||||
|
|
||||||
|
if body != existing_body:
|
||||||
|
if should_keep_existing_text(obj):
|
||||||
|
body = existing_body
|
||||||
|
elif self.config['store_edits']:
|
||||||
|
self.insert_edited(obj, old_text=existing_body)
|
||||||
|
return body
|
||||||
|
|
||||||
def insert(self, objects, commit=True):
|
def insert(self, objects, commit=True):
|
||||||
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
if not isinstance(objects, (list, tuple, types.GeneratorType)):
|
||||||
objects = [objects]
|
objects = [objects]
|
||||||
common.log.debug('Trying to insert %d objects.', len(objects))
|
common.log.debug('Trying to insert %d objects.', len(objects))
|
||||||
|
|
||||||
new_values = {
|
new_values = {
|
||||||
|
'tsdb': self,
|
||||||
'new_submissions': 0,
|
'new_submissions': 0,
|
||||||
'new_comments': 0,
|
'new_comments': 0,
|
||||||
}
|
}
|
||||||
|
@ -243,6 +289,32 @@ class TSDB:
|
||||||
common.log.debug('Done inserting.')
|
common.log.debug('Done inserting.')
|
||||||
return new_values
|
return new_values
|
||||||
|
|
||||||
|
def insert_edited(self, obj, old_text):
|
||||||
|
'''
|
||||||
|
Having already detected that the item has been edited, add a record to
|
||||||
|
the appropriate *_edits table containing the text that is being
|
||||||
|
replaced.
|
||||||
|
'''
|
||||||
|
if isinstance(obj, common.praw.models.Submission):
|
||||||
|
table = 'submission_edits'
|
||||||
|
else:
|
||||||
|
table = 'comment_edits'
|
||||||
|
|
||||||
|
if obj.edited is False:
|
||||||
|
replaced_at = int(time.time())
|
||||||
|
else:
|
||||||
|
replaced_at = int(obj.edited)
|
||||||
|
|
||||||
|
postdata = {
|
||||||
|
'idstr': obj.fullname,
|
||||||
|
'text': old_text,
|
||||||
|
'replaced_at': replaced_at,
|
||||||
|
}
|
||||||
|
cur = self.sql.cursor()
|
||||||
|
(qmarks, bindings) = binding_filler(SQL_EDITS_COLUMNS, postdata, require_all=True)
|
||||||
|
query = 'INSERT INTO %s VALUES(%s)' % (table, qmarks)
|
||||||
|
cur.execute(query, bindings)
|
||||||
|
|
||||||
def insert_submission(self, submission):
|
def insert_submission(self, submission):
|
||||||
cur = self.sql.cursor()
|
cur = self.sql.cursor()
|
||||||
cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname])
|
cur.execute('SELECT * FROM submissions WHERE idstr == ?', [submission.fullname])
|
||||||
|
@ -285,12 +357,7 @@ class TSDB:
|
||||||
cur.execute(query, bindings)
|
cur.execute(query, bindings)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if submission.author is None:
|
selftext = self.check_for_edits(submission, existing_entry=existing_entry)
|
||||||
# This post is deleted, therefore its text probably says [deleted] or [removed].
|
|
||||||
# Discard that, and keep the data we already had here.
|
|
||||||
selftext = existing_entry[SQL_SUBMISSION['selftext']]
|
|
||||||
else:
|
|
||||||
selftext = submission.selftext
|
|
||||||
|
|
||||||
query = '''
|
query = '''
|
||||||
UPDATE submissions SET
|
UPDATE submissions SET
|
||||||
|
@ -346,11 +413,7 @@ class TSDB:
|
||||||
cur.execute(query, bindings)
|
cur.execute(query, bindings)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
greasy = ['has been overwritten', 'pastebin.com/64GuVi2F']
|
body = self.check_for_edits(comment, existing_entry=existing_entry)
|
||||||
if comment.author is None or any(grease in comment.body for grease in greasy):
|
|
||||||
body = existing_entry[SQL_COMMENT['body']]
|
|
||||||
else:
|
|
||||||
body = comment.body
|
|
||||||
|
|
||||||
query = '''
|
query = '''
|
||||||
UPDATE comments SET
|
UPDATE comments SET
|
||||||
|
@ -403,3 +466,23 @@ def name_from_path(filepath):
|
||||||
name = os.path.splitext(filepath)[0]
|
name = os.path.splitext(filepath)[0]
|
||||||
name = name.strip('@')
|
name = name.strip('@')
|
||||||
return name
|
return name
|
||||||
|
|
||||||
|
def should_keep_existing_text(obj):
|
||||||
|
'''
|
||||||
|
Under certain conditions we do not want to update the entry in the db
|
||||||
|
with the most recent copy of the text. For example, if the post has
|
||||||
|
been deleted and the text now shows '[deleted]' we would prefer to
|
||||||
|
keep whatever we already have.
|
||||||
|
|
||||||
|
This function puts away the work I would otherwise have to duplicate
|
||||||
|
for both submissions and comments.
|
||||||
|
'''
|
||||||
|
body = obj.selftext if isinstance(obj, common.praw.models.Submission) else obj.body
|
||||||
|
if obj.author is None and body in ['[removed]', '[deleted]']:
|
||||||
|
return True
|
||||||
|
|
||||||
|
greasy = ['has been overwritten', 'pastebin.com/64GuVi2F']
|
||||||
|
if any(grease in body for grease in greasy):
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
Loading…
Reference in a new issue