From 5b7c05c39d1841491c8b137a2310d7c83aeffd01 Mon Sep 17 00:00:00 2001 From: Ethan Dalool Date: Mon, 1 May 2017 21:27:54 -0700 Subject: [PATCH] Improve MMF search speed by pre-filtering photo IDs Not perfected yet but it's an improvement. Waste less time checking all photos, presearch for matching photos via photo_tag_rel table. --- etiquette/helpers.py | 3 + etiquette/photodb.py | 118 +++++++++++++++++++++---------------- etiquette/searchhelpers.py | 103 +++++++++++++++++++++++++------- 3 files changed, 153 insertions(+), 71 deletions(-) diff --git a/etiquette/helpers.py b/etiquette/helpers.py index a348f6f..9b2fd1e 100644 --- a/etiquette/helpers.py +++ b/etiquette/helpers.py @@ -321,6 +321,9 @@ def select_generator(sql, query, bindings=None): break yield fetch +def sql_listify(items): + return '(%s)' % ', '.join('"%s"' % item for item in items) + def truthystring(s): ''' Convert strings to True, False, or None based on the options presented diff --git a/etiquette/photodb.py b/etiquette/photodb.py index a2f8f5d..009e9fa 100644 --- a/etiquette/photodb.py +++ b/etiquette/photodb.py @@ -707,38 +707,11 @@ class PDBPhotoMixin: notnulls.append('bytes') if duration: notnulls.append('duration') - query = searchhelpers.build_query(orderby, notnulls) - print(query) - generator = helpers.select_generator(self.sql, query) if orderby is None: giveback_orderby = None else: giveback_orderby = [term.replace('RANDOM()', 'random') for term in orderby] - if give_back_parameters: - parameters = { - 'area': area, - 'width': width, - 'height': height, - 'ratio': ratio, - 'bytes': bytes, - 'duration': duration, - 'authors': authors, - 'created': created, - 'extension': extension, - 'extension_not': extension_not, - 'filename': filename, - 'has_tags': has_tags, - 'mimetype': mimetype, - 'tag_musts': tag_musts, - 'tag_mays': tag_mays, - 'tag_forbids': tag_forbids, - 'tag_expression': tag_expression, - 'limit': limit, - 'offset': offset, - 'orderby': giveback_orderby, - } - yield parameters # FROZEN CHILDREN # To lighten the amount of database reading here, `frozen_children` is a dict where @@ -773,6 +746,44 @@ class PDBPhotoMixin: filename_tree = expressionmatch.ExpressionTree.parse(filename) filename_tree.map(lambda x: x.lower()) + if give_back_parameters: + parameters = { + 'area': area, + 'width': width, + 'height': height, + 'ratio': ratio, + 'bytes': bytes, + 'duration': duration, + 'authors': authors, + 'created': created, + 'extension': extension, + 'extension_not': extension_not, + 'filename': filename, + 'has_tags': has_tags, + 'mimetype': mimetype, + 'tag_musts': tag_musts, + 'tag_mays': tag_mays, + 'tag_forbids': tag_forbids, + 'tag_expression': tag_expression, + 'limit': limit, + 'offset': offset, + 'orderby': giveback_orderby, + } + yield parameters + + if is_must_may_forbid: + mmf_results = searchhelpers.mmf_photoids(self, tag_musts, tag_mays, tag_forbids, frozen_children) + #print('mmf accept:', mmf_results) + else: + mmf_results = None + + if mmf_results is not None and mmf_results['photoids'] == set(): + generator = [] + else: + query = searchhelpers.build_query(orderby, notnulls, minimums, maximums, mmf_results=mmf_results) + print(query[:200]) + generator = helpers.select_generator(self.sql, query) + photos_received = 0 # LET'S GET STARTED @@ -813,21 +824,21 @@ class PDBPhotoMixin: #print('Failed filename') continue - if any( - fetch[constants.SQL_PHOTO[key]] is None or - fetch[constants.SQL_PHOTO[key]] > value - for (key, value) in maximums.items() - ): - #print('Failed maximums') - continue + # if any( + # fetch[constants.SQL_PHOTO[key]] is None or + # fetch[constants.SQL_PHOTO[key]] > value + # for (key, value) in maximums.items() + # ): + # #print('Failed maximums') + # continue - if any( - fetch[constants.SQL_PHOTO[key]] is None or - fetch[constants.SQL_PHOTO[key]] < value - for (key, value) in minimums.items() - ): - #print('Failed minimums') - continue + # if any( + # fetch[constants.SQL_PHOTO[key]] is None or + # fetch[constants.SQL_PHOTO[key]] < value + # for (key, value) in minimums.items() + # ): + # #print('Failed minimums') + # continue if (has_tags is not None) or is_tagsearch: photo_tags = set(photo.tags()) @@ -848,17 +859,22 @@ class PDBPhotoMixin: if not success: #print('Failed tag expression') continue + elif is_must_may_forbid: - success = searchfilter_must_may_forbid( - photo_tags=photo_tags, - tag_musts=tag_musts, - tag_mays=tag_mays, - tag_forbids=tag_forbids, - frozen_children=frozen_children, - ) - if not success: - #print('Failed tag mmf') - continue + pass + # if photo.id not in mmf_results: + # #print('Failed tag mmf') + # continue + # success = searchfilter_must_may_forbid( + # photo_tags=photo_tags, + # tag_musts=tag_musts, + # tag_mays=tag_mays, + # tag_forbids=tag_forbids, + # frozen_children=frozen_children, + # ) + # if not success: + # #print('Failed tag mmf') + # continue if offset > 0: offset -= 1 diff --git a/etiquette/searchhelpers.py b/etiquette/searchhelpers.py index 2eaed38..5495680 100644 --- a/etiquette/searchhelpers.py +++ b/etiquette/searchhelpers.py @@ -5,34 +5,45 @@ from . import objects from voussoirkit import expressionmatch -def build_query(orderby, notnulls): - query = 'SELECT * FROM photos' + +def build_query(orderby, notnulls, minimums, maximums, mmf_results=None): + query = ['SELECT * FROM photos'] + wheres = [] + + if mmf_results: + wheres.append('id %s %s' % (mmf_results['operator'], helpers.sql_listify(mmf_results['photoids']))) if orderby: orderby = [o.split('-') for o in orderby] - orderby_columns = [column for (column, sorter) in orderby if column != 'RANDOM()'] else: - orderby_columns = [] + orderby = [('created', 'DESC')] - if notnulls: - notnulls.extend(orderby_columns) - elif orderby_columns: - notnulls = orderby_columns + for (column, direction) in orderby: + if column != 'RANDOM()': + notnulls.append(column) - if notnulls: - notnulls = [x + ' IS NOT NULL' for x in notnulls] - notnulls = ' AND '.join(notnulls) - query += ' WHERE ' + notnulls - if not orderby: - query += ' ORDER BY created DESC' - return query + for column in notnulls: + wheres.append(column + ' IS NOT NULL') - # Combine each column+sorter - orderby = [' '.join(o) for o in orderby] + for (column, value) in minimums.items(): + wheres.append(column + ' >= ' + str(value)) - # Combine everything - orderby = ', '.join(orderby) - query += ' ORDER BY %s' % orderby + for (column, value) in maximums.items(): + wheres.append(column + ' <= ' + str(value)) + + ## Assemble + + if wheres: + wheres = 'WHERE ' + ' AND '.join(wheres) + query.append(wheres) + + if orderby: + orderby = [' '.join(o) for o in orderby] + orderby = ', '.join(orderby) + orderby = 'ORDER BY ' + orderby + query.append(orderby) + + query = ' '.join(query) return query def get_user(photodb, username_or_id): @@ -90,6 +101,58 @@ def minmax(key, value, minimums, maximums, warning_bag=None): if high is not None: maximums[key] = high +def mmf_photoids(photodb, tag_musts, tag_mays, tag_forbids, frozen_children): + if not(tag_musts or tag_mays or tag_forbids): + return None + + cur = photodb.sql.cursor() + + operator = 'IN' + first_time = True + no_results = False + results = set() + + if tag_mays: + for tag in tag_mays: + choices = helpers.sql_listify(tag.id for tag in frozen_children[tag]) + query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices + cur.execute(query) + results.update(fetch[0] for fetch in cur.fetchall()) + first_time = False + + if tag_musts: + for tag in tag_musts: + choices = helpers.sql_listify(tag.id for tag in frozen_children[tag]) + query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices + cur.execute(query) + photo_ids = (fetch[0] for fetch in cur.fetchall()) + if first_time: + results.update(photo_ids) + first_time = False + else: + results = results.intersection(photo_ids) + if not results: + no_results = True + break + + if tag_forbids and not no_results: + if not results: + operator = 'NOT IN' + for tag in tag_forbids: + choices = helpers.sql_listify(tag.id for tag in frozen_children[tag]) + query = 'SELECT photoid FROM photo_tag_rel WHERE tagid in %s' % choices + cur.execute(query) + photo_ids = (fetch[0] for fetch in cur.fetchall()) + if operator == 'IN': + results = results.difference(photo_ids) + if not results: + no_results = True + break + else: + results.update(photo_ids) + + return {'operator': operator, 'photoids': results} + def normalize_authors(authors, photodb, warning_bag=None): ''' Either: