Completely rewrite search to use more SQL and less application.

To implement tag_musts, each must tag gets its own EXISTS query that
matches the tag itself or any of its descendant tags. Those queries
are then combined with INTERSECT, and finally the other filtering and
ordering are applied as usual.
This commit is contained in:
voussoir 2018-03-21 19:20:43 -07:00
parent db827d17ec
commit 5be174d1b3
3 changed files with 284 additions and 339 deletions

View file

@ -28,41 +28,6 @@ from voussoirkit import sqlhelpers
logging.basicConfig()
def _helper_filenamefilter(subject, terms):
basename = subject.lower()
return all(term in basename for term in terms)
def searchfilter_must_may_forbid(photo_tags, tag_musts, tag_mays, tag_forbids, frozen_children):
    '''
    Decide whether a photo's tags satisfy the must / may / forbid sets.

    photo_tags:
        Container of the tags on the photo. Only membership tests are used.
    tag_musts:
        Every tag here needs at least one of its options in photo_tags.
    tag_mays:
        At least one tag here needs one of its options in photo_tags.
    tag_forbids:
        No tag here may have any of its options in photo_tags.
    frozen_children:
        Mapping of each tag to the options that count as a match for it.

    A falsy tag set is skipped entirely. Returns True or False.
    '''
    def has_option(tag):
        return any(option in photo_tags for option in frozen_children[tag])

    # Fail when ANY of the musts fails to find an option.
    if tag_musts and not all(has_option(must) for must in tag_musts):
        return False

    # Fail when ALL of the mays fail to find an option.
    if tag_mays and not any(has_option(may) for may in tag_mays):
        return False

    # Fail as soon as a single forbidden option is present.
    if tag_forbids and any(has_option(forbid) for forbid in tag_forbids):
        return False

    return True
####################################################################################################
####################################################################################################
@ -461,53 +426,6 @@ class PDBPhotoMixin:
'''
start_time = time.time()
# MINMAXERS
has_tags = searchhelpers.normalize_has_tags(has_tags)
if has_tags is False:
tag_musts = None
tag_mays = None
tag_forbids = None
tag_expression = None
else:
_helper = lambda tagset: searchhelpers.normalize_tag_mmf(
photodb=self,
tags=tagset,
warning_bag=warning_bag,
)
tag_musts = _helper(tag_musts)
tag_mays = _helper(tag_mays)
tag_forbids = _helper(tag_forbids)
tag_expression = searchhelpers.normalize_tag_expression(tag_expression)
#print(tag_musts, tag_mays, tag_forbids)
if (tag_musts or tag_mays or tag_forbids) and tag_expression:
exc = exceptions.NotExclusive(['tag_musts+mays+forbids', 'tag_expression'])
if warning_bag:
warning_bag.add(exc.error_message)
tag_musts = None
tag_mays = None
tag_forbids = None
tag_expression = None
else:
raise exc
extension = searchhelpers.normalize_extensions(extension)
extension_not = searchhelpers.normalize_extensions(extension_not)
mimetype = searchhelpers.normalize_extensions(mimetype)
authors = searchhelpers.normalize_authors(authors, photodb=self, warning_bag=warning_bag)
filename = searchhelpers.normalize_filename(filename)
limit = searchhelpers.normalize_limit(limit, warning_bag=warning_bag)
has_thumbnail = searchhelpers.normalize_has_thumbnail(has_thumbnail)
is_searchhidden = searchhelpers.normalize_is_searchhidden(is_searchhidden)
offset = searchhelpers.normalize_offset(offset)
if offset is None:
offset = 0
maximums = {}
minimums = {}
searchhelpers.minmax('area', area, minimums, maximums, warning_bag=warning_bag)
@ -518,52 +436,44 @@ class PDBPhotoMixin:
searchhelpers.minmax('bytes', bytes, minimums, maximums, warning_bag=warning_bag)
searchhelpers.minmax('duration', duration, minimums, maximums, warning_bag=warning_bag)
orderby = searchhelpers.normalize_orderby(orderby, warning_bag=warning_bag)
authors = searchhelpers.normalize_authors(authors, photodb=self, warning_bag=warning_bag)
extension = searchhelpers.normalize_extensions(extension)
extension_not = searchhelpers.normalize_extensions(extension_not)
filename = searchhelpers.normalize_filename(filename)
has_tags = searchhelpers.normalize_has_tags(has_tags)
has_thumbnail = searchhelpers.normalize_has_thumbnail(has_thumbnail)
is_searchhidden = searchhelpers.normalize_is_searchhidden(is_searchhidden)
mimetype = searchhelpers.normalize_extensions(mimetype)
notnulls = set()
yesnulls = set()
wheres = []
if extension or mimetype:
notnulls.add('extension')
if width or height or ratio or area:
notnulls.add('width')
if bytes:
notnulls.add('bytes')
if duration:
notnulls.add('duration')
if has_thumbnail is True:
notnulls.add('thumbnail')
elif has_thumbnail is False:
yesnulls.add('thumbnail')
if is_searchhidden is True:
wheres.append('searchhidden == 1')
elif is_searchhidden is False:
wheres.append('searchhidden == 0')
if orderby is None:
giveback_orderby = None
if has_tags is False:
tag_musts = None
tag_mays = None
tag_forbids = None
tag_expression = None
else:
giveback_orderby = [term.replace('RANDOM()', 'random') for term in orderby]
tag_musts = searchhelpers.normalize_tagset(self, tag_musts, warning_bag=warning_bag)
tag_mays = searchhelpers.normalize_tagset(self, tag_mays, warning_bag=warning_bag)
tag_forbids = searchhelpers.normalize_tagset(self, tag_forbids, warning_bag=warning_bag)
tag_expression = searchhelpers.normalize_tag_expression(tag_expression)
# FROZEN CHILDREN
# To lighten the amount of database reading here, `frozen_children` is a dict where
# EVERY tag in the db is a key, and the value is a list of ALL ITS NESTED CHILDREN.
# This representation is memory inefficient, but it is faster than repeated
# database lookups
is_must_may_forbid = bool(tag_musts or tag_mays or tag_forbids)
is_tagsearch = is_must_may_forbid or tag_expression
if is_tagsearch:
if self._cached_frozen_children:
frozen_children = self._cached_frozen_children
else:
frozen_children = tag_export.flat_dict(self.get_tags())
self._cached_frozen_children = frozen_children
else:
frozen_children = None
if extension is not None and extension_not is not None:
extension = extension.difference(extension_not)
mmf_expression_noconflict = searchhelpers.check_mmf_expression_exclusive(
tag_musts,
tag_mays,
tag_forbids,
tag_expression,
warning_bag
)
if not mmf_expression_noconflict:
tag_musts = None
tag_mays = None
tag_forbids = None
tag_expression = None
if tag_expression:
frozen_children = self.get_cached_frozen_children()
tag_expression_tree = searchhelpers.tag_expression_tree_builder(
tag_expression=tag_expression,
photodb=self,
@ -571,10 +481,22 @@ class PDBPhotoMixin:
warning_bag=warning_bag,
)
if tag_expression_tree is None:
giveback_tag_expression = None
tag_expression = None
else:
print(tag_expression_tree)
giveback_tag_expression = str(tag_expression_tree)
print(giveback_tag_expression)
tag_match_function = searchhelpers.tag_expression_matcher_builder(frozen_children)
else:
giveback_tag_expression = None
if has_tags is True and (tag_musts or tag_mays):
# has_tags check is redundant then, so disable it.
has_tags = None
limit = searchhelpers.normalize_positive_integer(limit, warning_bag=warning_bag)
offset = searchhelpers.normalize_positive_integer(offset, warning_bag=warning_bag)
orderby = searchhelpers.normalize_orderby(orderby, warning_bag=warning_bag)
if filename:
try:
@ -585,6 +507,14 @@ class PDBPhotoMixin:
else:
filename_tree = None
giveback_orderby = [
'%s-%s' % (column.replace('RANDOM()', 'random'), direction)
for (column, direction) in orderby
]
if not orderby:
orderby = [('created', 'desc')]
if give_back_parameters:
parameters = {
'area': area,
@ -595,74 +525,119 @@ class PDBPhotoMixin:
'duration': duration,
'authors': authors,
'created': created,
'extension': extension,
'extension_not': extension_not,
'filename': filename,
'extension': extension or None,
'extension_not': extension_not or None,
'filename': filename or None,
'has_tags': has_tags,
'has_thumbnail': has_thumbnail,
'mimetype': mimetype,
'tag_musts': tag_musts,
'tag_mays': tag_mays,
'tag_forbids': tag_forbids,
'tag_expression': tag_expression,
'mimetype': mimetype or None,
'tag_musts': tag_musts or None,
'tag_mays': tag_mays or None,
'tag_forbids': tag_forbids or None,
'tag_expression': giveback_tag_expression or None,
'limit': limit,
'offset': offset,
'offset': offset or None,
'orderby': giveback_orderby,
}
yield parameters
if is_must_may_forbid:
mmf_results = searchhelpers.mmf_photo_ids(
self,
photo_tag_rel_intersections = searchhelpers.photo_tag_rel_intersections(
tag_musts,
tag_mays,
tag_forbids,
frozen_children,
)
else:
mmf_results = None
if mmf_results is not None and mmf_results['photo_ids'] == set():
generator = []
else:
query = searchhelpers.build_query(
author_ids=authors,
maximums=maximums,
minimums=minimums,
mmf_results=mmf_results,
notnulls=notnulls,
yesnulls=yesnulls,
orderby=orderby,
wheres=wheres,
)
print(query[:200])
generator = helpers.select_generator(self.sql, query)
notnulls = set()
yesnulls = set()
wheres = []
bindings = []
if authors:
wheres.append('author_id IN %s' % helpers.sql_listify(authors))
if extension:
if '*' in extension:
wheres.append('extension != ""')
else:
binders = ', '.join('?' * len(extension))
wheres.append('extension IN (%s)' % binders)
bindings.extend(extension)
if extension_not:
if '*' in extension_not:
wheres.append('extension == ""')
else:
binders = ', '.join('?' * len(extension_not))
wheres.append('extension NOT IN (%s)' % binders)
bindings.extend(extension_not)
if mimetype:
notnulls.add('extension')
if has_tags is True:
wheres.append('EXISTS (SELECT 1 FROM photo_tag_rel WHERE photoid == photos.id)')
if has_tags is False:
wheres.append('NOT EXISTS (SELECT 1 FROM photo_tag_rel WHERE photoid == photos.id)')
if has_thumbnail is True:
notnulls.add('thumbnail')
elif has_thumbnail is False:
yesnulls.add('thumbnail')
for (column, direction) in orderby:
if column != 'RANDOM()':
notnulls.add(column)
if is_searchhidden is True:
wheres.append('searchhidden == 1')
elif is_searchhidden is False:
wheres.append('searchhidden == 0')
for column in notnulls:
wheres.append(column + ' IS NOT NULL')
for column in yesnulls:
wheres.append(column + ' IS NULL')
for (column, value) in minimums.items():
wheres.append(column + ' >= ' + str(value))
for (column, value) in maximums.items():
wheres.append(column + ' <= ' + str(value))
# In order to use ORDER BY RANDOM(), we must place all of the intersect
# tag searches into a subquery. If we simply try to do
# SELECT * ... INTERSECT SELECT * ... ORDER BY RANDOM()
# we get an error that random is not a column. But placing all of the
# selects into a named subquery fixes that.
query = ['SELECT * FROM']
if photo_tag_rel_intersections:
intersections = '(%s) photos' % '\nINTERSECT\n'.join(photo_tag_rel_intersections)
query.append(intersections)
else:
query.append('photos')
if wheres:
wheres = 'WHERE ' + ' AND '.join(wheres)
query.append(wheres)
if orderby:
orderby = ['%s %s' % (column, direction) for (column, direction) in orderby]
orderby = ', '.join(orderby)
orderby = 'ORDER BY ' + orderby
query.append(orderby)
query = ' '.join(query)
query = '%s\n%s\n%s' % ('-' * 80, query, '-' * 80)
print(query, bindings)
#cur = self.sql.cursor()
#cur.execute('EXPLAIN QUERY PLAN ' + query, bindings)
#print('\n'.join(str(x) for x in cur.fetchall()))
generator = helpers.select_generator(self.sql, query, bindings)
photos_received = 0
# LET'S GET STARTED
for fetch in generator:
photo = objects.Photo(self, fetch)
ext_okay = (
not extension or
(
('*' in extension and photo.extension) or
photo.extension in extension
)
)
if not ext_okay:
continue
ext_fail = (
extension_not and
(
('*' in extension_not and photo.extension) or
photo.extension in extension_not
)
)
if ext_fail:
continue
for row in generator:
photo = objects.Photo(self, row)
if mimetype and photo.simple_mimetype not in mimetype:
continue
@ -670,16 +645,8 @@ class PDBPhotoMixin:
if filename_tree and not filename_tree.evaluate(photo.basename.lower()):
continue
if (has_tags is not None) or is_tagsearch:
photo_tags = set(photo.get_tags())
if has_tags is False and len(photo_tags) > 0:
continue
if has_tags is True and len(photo_tags) == 0:
continue
if tag_expression:
photo_tags = set(photo.get_tags())
success = tag_expression_tree.evaluate(
photo_tags,
match_function=tag_match_function,
@ -687,7 +654,7 @@ class PDBPhotoMixin:
if not success:
continue
if offset > 0:
if offset is not None and offset > 0:
offset -= 1
continue
@ -701,7 +668,7 @@ class PDBPhotoMixin:
yield warning_bag
end_time = time.time()
print('Search results took:', end_time - start_time)
print('Search took:', end_time - start_time)
class PDBSQLMixin:
@ -884,7 +851,7 @@ class PDBTagMixin:
self.log.debug('New Tag: %s', tagname)
tagid = self.generate_id('tags')
self._cached_frozen_children = None
self._uncache()
author_id = self.get_user_id_or_none(author)
data = {
'id': tagid,
@ -1352,6 +1319,7 @@ class PhotoDB(
# OTHER
self._cached_frozen_children = None
self._cached_qualname_map = None
self._album_cache.maxlen = self.config['cache_size']['album']
self._bookmark_cache.maxlen = self.config['cache_size']['bookmark']
@ -1398,6 +1366,7 @@ class PhotoDB(
def _uncache(self):
self._cached_frozen_children = None
self._cached_qualname_map = None
def generate_id(self, table):
'''
@ -1429,6 +1398,16 @@ class PhotoDB(
cur.execute('UPDATE id_numbers SET last_id = ? WHERE tab == ?', [new_id, table])
return new_id
def get_cached_frozen_children(self):
    '''
    Return the result of tag_export.flat_dict over this database's tags.

    The value is computed only when nothing is cached (the cache attribute
    is None) and is stored on the instance for later calls.
    '''
    frozen_children = self._cached_frozen_children
    if frozen_children is None:
        frozen_children = tag_export.flat_dict(self.get_tags())
        self._cached_frozen_children = frozen_children
    return frozen_children
def get_cached_qualname_map(self):
    '''
    Return the result of tag_export.qualified_names over this database's tags.

    The value is computed only when nothing is cached (the cache attribute
    is None) and is stored on the instance for later calls.
    '''
    qualname_map = self._cached_qualname_map
    if qualname_map is None:
        qualname_map = tag_export.qualified_names(self.get_tags())
        self._cached_qualname_map = qualname_map
    return qualname_map
def get_thing_by_id(self, thing_type, thing_id):
thing_map = _THING_CLASSES[thing_type]

View file

@ -12,79 +12,71 @@ from . import objects
from voussoirkit import expressionmatch
def build_query(
author_ids=None,
maximums=None,
minimums=None,
mmf_results=None,
notnulls=None,
yesnulls=None,
orderby=None,
wheres=None,
def check_mmf_expression_exclusive(
tag_musts,
tag_mays,
tag_forbids,
tag_expression,
warning_bag=None
):
if notnulls is None:
notnulls = set()
if yesnulls is None:
yesnulls = set()
if wheres is None:
wheres = set()
if (tag_musts or tag_mays or tag_forbids) and tag_expression:
exc = exceptions.NotExclusive(['tag_musts+mays+forbids', 'tag_expression'])
if warning_bag:
warning_bag.add(exc.error_message)
else:
wheres = set(wheres)
raise exc
query = ['SELECT * FROM photos']
return False
return True
if author_ids:
notnulls.add('author_id')
wheres.add('author_id in %s' % helpers.sql_listify(author_ids))
def expand_mmf(tag_musts, tag_mays, tag_forbids):
def _set(x):
if x is None:
return set()
return set(x)
if mmf_results:
# "id IN/NOT IN (1, 2, 3)"
operator = mmf_results['operator']
photo_ids = helpers.sql_listify(mmf_results['photo_ids'])
wheres.add('id %s %s' % (operator, photo_ids))
tag_musts = _set(tag_musts)
tag_mays = _set(tag_mays)
tag_forbids = _set(tag_forbids)
if orderby:
orderby = [o.split('-') for o in orderby]
else:
orderby = [('created', 'DESC')]
forbids_expanded = set()
for (column, direction) in orderby:
if column != 'RANDOM()':
notnulls.add(column)
def _expand_flat(tagset):
'''
I am not using tag.walk_children because if the user happens to give us
two tags in the same lineage, we have the opportunity to bail early,
which walk_children won't know about. So instead I'm doing the queue
popping and pushing myself.
'''
expanded = set()
while len(tagset) > 0:
tag = tagset.pop()
if tag in forbids_expanded:
continue
if tag in expanded:
continue
expanded.add(tag)
tagset.update(tag.get_children())
return expanded
if minimums:
for (column, value) in minimums.items():
wheres.add(column + ' >= ' + str(value))
if maximums:
for (column, value) in maximums.items():
wheres.add(column + ' <= ' + str(value))
## Assemble
for column in notnulls:
wheres.add(column + ' IS NOT NULL')
for column in yesnulls:
wheres.add(column + ' IS NULL')
if wheres:
wheres = 'WHERE ' + ' AND '.join(wheres)
query.append(wheres)
if orderby:
orderby = [' '.join(o) for o in orderby]
orderby = ', '.join(orderby)
orderby = 'ORDER BY ' + orderby
query.append(orderby)
query = ' '.join(query)
return query
def _expand_nested(tagset):
expanded = []
total = set()
for tag in tagset:
if tag in total:
continue
this_expanded = _expand_flat(set([tag]))
total.update(this_expanded)
expanded.append(this_expanded)
return expanded
# forbids must come first so that musts and mays don't waste their time
# expanding the forbidden subtrees.
forbids_expanded = _expand_flat(tag_forbids)
musts_expanded = _expand_nested(tag_musts)
mays_expanded = _expand_flat(tag_mays)
return (musts_expanded, mays_expanded, forbids_expanded)
def minmax(key, value, minimums, maximums, warning_bag=None):
'''
@ -130,58 +122,6 @@ def minmax(key, value, minimums, maximums, warning_bag=None):
if high is not None:
maximums[key] = high
# Resolve tag_musts / tag_mays / tag_forbids into a set of photo ids by
# querying photo_tag_rel once per tag, using frozen_children[tag] as the list
# of tags whose ids count as a match.
#
# Returns None when all three tag sets are falsy. Otherwise returns a dict
# {'operator': 'IN' or 'NOT IN', 'photo_ids': set} that tells the caller
# whether the ids are the photos to keep or the photos to exclude.
def mmf_photo_ids(photodb, tag_musts, tag_mays, tag_forbids, frozen_children):
if not(tag_musts or tag_mays or tag_forbids):
return None
cur = photodb.sql.cursor()
operator = 'IN'
# first_time is True until some query has contributed to `results`.
first_time = True
no_results = False
results = set()
# MAYS: union of the photos matched by any of the may tags.
if tag_mays:
for tag in tag_mays:
choices = helpers.sql_listify(t.id for t in frozen_children[tag])
query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices
cur.execute(query)
results.update(fetch[0] for fetch in cur.fetchall())
first_time = False
# MUSTS: the first query seeds `results` only if the mays did not already
# do so; every later query is intersected with what has been collected.
if tag_musts:
for tag in tag_musts:
choices = helpers.sql_listify(t.id for t in frozen_children[tag])
query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices
cur.execute(query)
photo_ids = (fetch[0] for fetch in cur.fetchall())
if first_time:
results.update(photo_ids)
first_time = False
else:
results = results.intersection(photo_ids)
# Nothing can match once the running set is empty, so stop querying.
if not results:
no_results = True
break
# FORBIDS: skipped when the musts already proved there are no results.
if tag_forbids and not no_results:
# With nothing collected so far, report the forbidden photos as a NOT IN
# list instead of subtracting them from an IN list.
# NOTE(review): `results` is also empty when tag_mays matched no photos
# (the mays branch never sets no_results), and this still switches to
# NOT IN in that case -- confirm that is intended.
if not results:
operator = 'NOT IN'
for tag in tag_forbids:
choices = helpers.sql_listify(t.id for t in frozen_children[tag])
query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices
cur.execute(query)
photo_ids = (fetch[0] for fetch in cur.fetchall())
if operator == 'IN':
results = results.difference(photo_ids)
if not results:
no_results = True
break
else:
results.update(photo_ids)
return {'operator': operator, 'photo_ids': results}
def normalize_authors(authors, photodb, warning_bag=None):
'''
Either:
@ -223,21 +163,14 @@ def normalize_authors(authors, photodb, warning_bag=None):
return user_ids
def normalize_extensions(extensions):
if not extensions:
return None
if extensions is None:
extensions = set()
if isinstance(extensions, str):
elif isinstance(extensions, str):
extensions = helpers.comma_space_split(extensions)
if len(extensions) == 0:
return None
extensions = [e.lower().strip('.').strip() for e in extensions]
extensions = set(extensions)
extensions = {e for e in extensions if e}
if len(extensions) == 0:
return None
extensions = set(e for e in extensions if e)
return extensions
@ -280,16 +213,13 @@ def normalize_offset(offset, warning_bag=None):
return normalize_positive_integer(limit, warning_bag)
def normalize_orderby(orderby, warning_bag=None):
if not orderby:
return None
if orderby is None:
orderby = []
if isinstance(orderby, str):
orderby = orderby.replace('-', ' ')
orderby = orderby.split(',')
if not orderby:
return None
final_orderby = []
for requested_order in orderby:
requested_order = requested_order.lower().strip()
@ -334,12 +264,15 @@ def normalize_orderby(orderby, warning_bag=None):
raise ValueError(message)
direction = 'desc'
requested_order = '%s-%s' % (column, direction)
requested_order = (column, direction)
final_orderby.append(requested_order)
return final_orderby
def normalize_positive_integer(number, warning_bag=None):
if number is None:
return None
if not number:
number = 0
@ -383,7 +316,39 @@ def normalize_tag_expression(expression):
return expression
def normalize_tag_mmf(tags, photodb, warning_bag=None):
# Template for one SELECT in the tag search. photo_tag_rel_intersections fills
# {operator} with 'EXISTS' or 'NOT EXISTS' and {tagset} with the output of
# helpers.sql_listify over the ids of the tags to match.
INTERSECT_FORMAT = '''
SELECT * FROM photos WHERE {operator} (
SELECT 1 FROM photo_tag_rel WHERE photos.id == photo_tag_rel.photoid
AND tagid IN {tagset}
)
'''.strip()
def photo_tag_rel_intersections(tag_musts, tag_mays, tag_forbids):
    '''
    Build the SELECT statements for a must / may / forbid tag search.

    The three tag sets are first expanded with expand_mmf. Each expanded
    must group becomes its own EXISTS query, the expanded mays (if any)
    become a single EXISTS query, and the expanded forbids (if any) become
    a single NOT EXISTS query.

    Returns a list of SQL strings made from INTERSECT_FORMAT, in the order
    musts, mays, forbids. The list is empty when expand_mmf yields no groups.
    '''
    (musts_expanded, mays_expanded, forbids_expanded) = expand_mmf(
        tag_musts,
        tag_mays,
        tag_forbids,
    )

    groups = [('EXISTS', must_group) for must_group in musts_expanded]
    if mays_expanded:
        groups.append(('EXISTS', mays_expanded))
    if forbids_expanded:
        groups.append(('NOT EXISTS', forbids_expanded))

    return [
        INTERSECT_FORMAT.format(
            operator=operator,
            tagset=helpers.sql_listify(tag.id for tag in tagset),
        )
        for (operator, tagset) in groups
    ]
def normalize_tagset(photodb, tags, warning_bag=None):
if not tags:
return None
@ -413,10 +378,6 @@ def normalize_tag_mmf(tags, photodb, warning_bag=None):
else:
raise exc
tagset.add(tag)
if len(tagset) == 0:
return None
return tagset
def tag_expression_tree_builder(
@ -425,6 +386,8 @@ def tag_expression_tree_builder(
frozen_children,
warning_bag=None
):
if not tag_expression:
return None
try:
expression_tree = expressionmatch.ExpressionTree.parse(tag_expression)
except expressionmatch.NoTokens:
@ -464,7 +427,7 @@ def tag_expression_matcher_builder(frozen_children):
'''
Used as the `match_function` for the ExpressionTree evaluation.
photo:
photo_tags:
The set of tag names owned by the photo in question.
tagname:
The tag which the ExpressionTree wants it to have.

View file

@ -367,7 +367,7 @@ def get_search_core():
def get_search_html():
search_results = get_search_core()
search_kwargs = search_results['search_kwargs']
qualname_map = etiquette.tag_export.qualified_names(common.P.get_tags())
qualname_map = common.P.get_cached_qualname_map()
session = session_manager.get(request)
response = flask.render_template(
'search.html',
@ -389,4 +389,7 @@ def get_search_json():
search_results['photos'] = [
etiquette.jsonify.photo(photo, include_albums=False) for photo in search_results['photos']
]
search_results['total_tags'] = [
etiquette.jsonify.tag(tag, minimal=True) for tag in search_results['total_tags']
]
return jsonify.make_json_response(search_results)