Improve MMF search speed by pre-filtering photo IDs

Not perfected yet, but it's an improvement.
Instead of wasting time checking every photo, pre-search for matching photo IDs via the photo_tag_rel table.
This commit is contained in:
voussoir 2017-05-01 21:27:54 -07:00
parent fbf73ac515
commit 5b7c05c39d
3 changed files with 153 additions and 71 deletions

View file

@ -321,6 +321,9 @@ def select_generator(sql, query, bindings=None):
break break
yield fetch yield fetch
def sql_listify(items):
    '''
    Render `items` as a parenthesized, comma-separated list of SQL string
    literals, suitable for use after `IN` / `NOT IN`.

    Each item is converted with str() and emitted as a single-quoted literal,
    with any embedded single quote doubled. Double quotes are deliberately
    avoided: in SQL they delimit identifiers, so a value that happens to match
    a column name would be read as that column instead of as a string.

    An empty iterable yields '()'.
    '''
    literals = ("'%s'" % str(item).replace("'", "''") for item in items)
    return '(%s)' % ', '.join(literals)
def truthystring(s): def truthystring(s):
''' '''
Convert strings to True, False, or None based on the options presented Convert strings to True, False, or None based on the options presented

View file

@ -707,38 +707,11 @@ class PDBPhotoMixin:
notnulls.append('bytes') notnulls.append('bytes')
if duration: if duration:
notnulls.append('duration') notnulls.append('duration')
query = searchhelpers.build_query(orderby, notnulls)
print(query)
generator = helpers.select_generator(self.sql, query)
if orderby is None: if orderby is None:
giveback_orderby = None giveback_orderby = None
else: else:
giveback_orderby = [term.replace('RANDOM()', 'random') for term in orderby] giveback_orderby = [term.replace('RANDOM()', 'random') for term in orderby]
if give_back_parameters:
parameters = {
'area': area,
'width': width,
'height': height,
'ratio': ratio,
'bytes': bytes,
'duration': duration,
'authors': authors,
'created': created,
'extension': extension,
'extension_not': extension_not,
'filename': filename,
'has_tags': has_tags,
'mimetype': mimetype,
'tag_musts': tag_musts,
'tag_mays': tag_mays,
'tag_forbids': tag_forbids,
'tag_expression': tag_expression,
'limit': limit,
'offset': offset,
'orderby': giveback_orderby,
}
yield parameters
# FROZEN CHILDREN # FROZEN CHILDREN
# To lighten the amount of database reading here, `frozen_children` is a dict where # To lighten the amount of database reading here, `frozen_children` is a dict where
@ -773,6 +746,44 @@ class PDBPhotoMixin:
filename_tree = expressionmatch.ExpressionTree.parse(filename) filename_tree = expressionmatch.ExpressionTree.parse(filename)
filename_tree.map(lambda x: x.lower()) filename_tree.map(lambda x: x.lower())
if give_back_parameters:
parameters = {
'area': area,
'width': width,
'height': height,
'ratio': ratio,
'bytes': bytes,
'duration': duration,
'authors': authors,
'created': created,
'extension': extension,
'extension_not': extension_not,
'filename': filename,
'has_tags': has_tags,
'mimetype': mimetype,
'tag_musts': tag_musts,
'tag_mays': tag_mays,
'tag_forbids': tag_forbids,
'tag_expression': tag_expression,
'limit': limit,
'offset': offset,
'orderby': giveback_orderby,
}
yield parameters
if is_must_may_forbid:
mmf_results = searchhelpers.mmf_photoids(self, tag_musts, tag_mays, tag_forbids, frozen_children)
#print('mmf accept:', mmf_results)
else:
mmf_results = None
if mmf_results is not None and mmf_results['photoids'] == set():
generator = []
else:
query = searchhelpers.build_query(orderby, notnulls, minimums, maximums, mmf_results=mmf_results)
print(query[:200])
generator = helpers.select_generator(self.sql, query)
photos_received = 0 photos_received = 0
# LET'S GET STARTED # LET'S GET STARTED
@ -813,21 +824,21 @@ class PDBPhotoMixin:
#print('Failed filename') #print('Failed filename')
continue continue
if any( # if any(
fetch[constants.SQL_PHOTO[key]] is None or # fetch[constants.SQL_PHOTO[key]] is None or
fetch[constants.SQL_PHOTO[key]] > value # fetch[constants.SQL_PHOTO[key]] > value
for (key, value) in maximums.items() # for (key, value) in maximums.items()
): # ):
#print('Failed maximums') # #print('Failed maximums')
continue # continue
if any( # if any(
fetch[constants.SQL_PHOTO[key]] is None or # fetch[constants.SQL_PHOTO[key]] is None or
fetch[constants.SQL_PHOTO[key]] < value # fetch[constants.SQL_PHOTO[key]] < value
for (key, value) in minimums.items() # for (key, value) in minimums.items()
): # ):
#print('Failed minimums') # #print('Failed minimums')
continue # continue
if (has_tags is not None) or is_tagsearch: if (has_tags is not None) or is_tagsearch:
photo_tags = set(photo.tags()) photo_tags = set(photo.tags())
@ -848,17 +859,22 @@ class PDBPhotoMixin:
if not success: if not success:
#print('Failed tag expression') #print('Failed tag expression')
continue continue
elif is_must_may_forbid: elif is_must_may_forbid:
success = searchfilter_must_may_forbid( pass
photo_tags=photo_tags, # if photo.id not in mmf_results:
tag_musts=tag_musts, # #print('Failed tag mmf')
tag_mays=tag_mays, # continue
tag_forbids=tag_forbids, # success = searchfilter_must_may_forbid(
frozen_children=frozen_children, # photo_tags=photo_tags,
) # tag_musts=tag_musts,
if not success: # tag_mays=tag_mays,
#print('Failed tag mmf') # tag_forbids=tag_forbids,
continue # frozen_children=frozen_children,
# )
# if not success:
# #print('Failed tag mmf')
# continue
if offset > 0: if offset > 0:
offset -= 1 offset -= 1

View file

@ -5,34 +5,45 @@ from . import objects
from voussoirkit import expressionmatch from voussoirkit import expressionmatch
def build_query(orderby, notnulls, minimums, maximums, mmf_results=None):
    '''
    Assemble the `SELECT * FROM photos ...` statement used by photo search.

    orderby:
        List of 'column-direction' strings, or a falsy value to fall back to
        'created DESC'. Every ordered column other than 'RANDOM()' is also
        required to be non-null.
    notnulls:
        Column names that must be IS NOT NULL. This list is not modified.
    minimums, maximums:
        Dicts of {column: value} rendered as `column >= value` and
        `column <= value` respectively.
    mmf_results:
        Optional dict with keys 'operator' and 'photoids'; when truthy it is
        rendered as `id <operator> (<photoids>)`.

    Returns the query as a single string.
    '''
    query = ['SELECT * FROM photos']
    wheres = []

    if mmf_results:
        id_list = helpers.sql_listify(mmf_results['photoids'])
        wheres.append('id %s %s' % (mmf_results['operator'], id_list))

    if orderby:
        orderby = [o.split('-') for o in orderby]
    else:
        orderby = [('created', 'DESC')]

    # Work on a copy so the caller's list does not grow with every call.
    notnull_columns = list(notnulls or ())
    for (column, direction) in orderby:
        if column != 'RANDOM()' and column not in notnull_columns:
            notnull_columns.append(column)

    wheres.extend(column + ' IS NOT NULL' for column in notnull_columns)
    wheres.extend(
        column + ' >= ' + str(value)
        for (column, value) in (minimums or {}).items()
    )
    wheres.extend(
        column + ' <= ' + str(value)
        for (column, value) in (maximums or {}).items()
    )

    ## Assemble
    if wheres:
        query.append('WHERE ' + ' AND '.join(wheres))

    # orderby is never empty here because of the 'created DESC' fallback.
    query.append('ORDER BY ' + ', '.join(' '.join(o) for o in orderby))

    return ' '.join(query)
def get_user(photodb, username_or_id): def get_user(photodb, username_or_id):
@ -90,6 +101,58 @@ def minmax(key, value, minimums, maximums, warning_bag=None):
if high is not None: if high is not None:
maximums[key] = high maximums[key] = high
def mmf_photoids(photodb, tag_musts, tag_mays, tag_forbids, frozen_children):
    '''
    Pre-compute which photo IDs can satisfy a must / may / forbid tag search
    by reading the photo_tag_rel table, so the caller can restrict its main
    photo query instead of testing every photo.

    tag_musts:
        Every one of these tags (or an entry of its frozen_children) must be
        on the photo.
    tag_mays:
        At least one of these tags (or an entry of its frozen_children) must
        be on the photo.
    tag_forbids:
        None of these tags (or entries of their frozen_children) may be on
        the photo.
    frozen_children:
        Dict mapping each tag to an iterable of tag objects with an `id`
        attribute. Presumably the tag itself plus its descendants -- confirm
        against the caller.

    Returns None when there is nothing to restrict: either no tags were given,
    or only forbids were given and no photo carries any of them.
    Otherwise returns {'operator': ..., 'photoids': set}:
    - operator 'IN': the search is limited to exactly these photo IDs. An
      empty set means no photo can match.
    - operator 'NOT IN': only forbids were given; these photo IDs must be
      excluded.
    '''
    if not (tag_musts or tag_mays or tag_forbids):
        return None

    cur = photodb.sql.cursor()
    # Stay well under SQLite's historical default of 999 bound variables per
    # statement. NOTE(review): assumes photodb.sql is a sqlite3 connection
    # (qmark placeholders) -- confirm.
    chunk_size = 500

    def photos_tagged(tag):
        # IDs of the photos related to any tag listed in frozen_children[tag].
        # IDs are bound as text, as they were when interpolated with '%s'.
        tag_ids = [str(child.id) for child in frozen_children[tag]]
        photo_ids = set()
        for start in range(0, len(tag_ids), chunk_size):
            chunk = tag_ids[start:start + chunk_size]
            placeholders = ', '.join('?' * len(chunk))
            query = 'SELECT photoid FROM photo_tag_rel WHERE tagid IN (%s)' % placeholders
            cur.execute(query, chunk)
            photo_ids.update(fetch[0] for fetch in cur.fetchall())
        return photo_ids

    # None means "no positive (may / must) filter has been applied yet", which
    # is different from an empty set ("a positive filter matched nothing").
    results = None

    if tag_mays:
        results = set()
        for tag in tag_mays:
            results.update(photos_tagged(tag))

    for tag in (tag_musts or ()):
        matched = photos_tagged(tag)
        results = matched if results is None else results.intersection(matched)
        if not results:
            # Nothing can satisfy the remaining musts either.
            break

    if results is None:
        # Only forbids were given: describe the search as an exclusion list.
        excluded = set()
        for tag in tag_forbids:
            excluded.update(photos_tagged(tag))
        if not excluded:
            # Nothing to exclude, so the search is effectively unrestricted.
            return None
        return {'operator': 'NOT IN', 'photoids': excluded}

    for tag in (tag_forbids or ()):
        if not results:
            break
        results = results.difference(photos_tagged(tag))

    return {'operator': 'IN', 'photoids': results}
def normalize_authors(authors, photodb, warning_bag=None): def normalize_authors(authors, photodb, warning_bag=None):
''' '''
Either: Either: