Optimise the Story browsing query

The `story_get_all` function currently constructs a query which perfoms
very poorly, making search in StoryBoard painful to the point of being
almost unusable with only ~30000 stories in the database. There are a
number of reasons for the poor performance, but the most impactful is
the fact that we join two subqueries, one of which contains the entire
stories table, and the other which contains a filtered list of story
IDs to implement most of the filtering.

This commit removes all of the subquerying, and just does simple
filtering in a single WHERE clause. Basic comparison of the two queries
shows around an 85% - 90% speedup in the execution time using the new
approach, from ~100ms for the current query, and ~10-15ms for the new
query (using a test dataset containing 10000 stories).

This approach adds a bit of code complexity around filtering stories by
story status, but this is a cost worth paying for the performance
improvement. The new approach will also allow extra flexibility in the
meaning of the story statuses, making it easier to have per-project
statuses for example.

Story: 2005845
Task: 33627
Change-Id: I7c19372506017726e4fc61ffb09d3cfb9c6b36d3
This commit is contained in:
Adam Coldrick 2020-07-21 00:49:25 +01:00
parent 983bb8e2d7
commit 63a003af7f
9 changed files with 349 additions and 204 deletions

View File

@ -1,7 +1,7 @@
version: '2'
services:
mysql:
image: "mysql:latest"
image: "mysql:5.7"
ports:
- "3306:3306"
environment:

View File

@ -14,7 +14,6 @@
# limitations under the License.
from oslo_db.exception import DBError
from sqlalchemy.orm import subqueryload
from sqlalchemy_fulltext import FullTextSearch
import sqlalchemy_fulltext.modes as FullTextMode
@ -74,33 +73,16 @@ class SqlAlchemySearchImpl(search_engine.SearchEngine):
return query.all()
def _story_fulltext_query(self, query, q, status, marker, offset,
def _story_fulltext_query(self, query, q, status, offset,
limit, mode, sort_field, sort_dir):
clean_query = self._build_fulltext_search(
query = self._build_fulltext_search(
models.Story, query, q, mode=mode)
count = query.count()
# Turn the whole shebang into a subquery.
clean_query = clean_query.subquery('filtered_stories')
query = stories_api._order_and_paginate(
query, offset, limit, [sort_field], sort_dir)
# Return the story summary.
query = api_base.model_query(models.StorySummary)\
.options(subqueryload(models.StorySummary.tags))
id_col = tuple(clean_query.c)[0]
query = query.join(clean_query,
models.StorySummary.id == id_col)
if status:
query = query.filter(models.StorySummary.status.in_(status))
query = self._apply_pagination(models.StorySummary,
query,
marker,
offset,
limit,
sort_field=sort_field,
sort_dir=sort_dir)
return query.all()
return [stories_api._StorySummary(row) for row in query.all()], count
def stories_query(self, q, status=None, assignee_id=None,
creator_id=None, project_group_id=None, project_id=None,
@ -108,43 +90,29 @@ class SqlAlchemySearchImpl(search_engine.SearchEngine):
marker=None, offset=None,
limit=None, tags_filter_type="all", sort_field='id',
sort_dir='asc', current_user=None):
session = api_base.get_session()
subquery = stories_api._story_build_query(
query = stories_api._story_build_query(
status=status,
assignee_id=assignee_id,
creator_id=creator_id,
project_group_id=project_group_id,
project_id=project_id,
subscriber_id=subscriber_id,
tags=tags,
updated_since=updated_since,
tags_filter_type=tags_filter_type,
current_user=current_user,
session=session)
# Filter by subscriber ID
if subscriber_id is not None:
subs = api_base.model_query(models.Subscription)
subs = api_base.apply_query_filters(query=subs,
model=models.Subscription,
target_type='story',
user_id=subscriber_id)
subs = subs.subquery()
subquery = subquery.join(subs, subs.c.target_id == models.Story.id)
# Make a query that isn't full of aliases so that fulltext works
query = api_base.model_query(models.Story)
query = api_base.apply_query_filters(
query=query,
model=models.Story,
id=[story.id for story in subquery.all()])
current_user=current_user)
# Attempt a boolean search first, and if there's an error
# try a natural language search instead. This saves having to
# validate whether a search query is a valid boolean search, and
# makes failure less common.
try:
return self._story_fulltext_query(
query, q, status, marker, offset, limit, FullTextMode.BOOLEAN,
query, q, status, offset, limit, FullTextMode.BOOLEAN,
sort_field, sort_dir)
except DBError:
return self._story_fulltext_query(
query, q, status, marker, offset, limit, FullTextMode.NATURAL,
query, q, status, offset, limit, FullTextMode.NATURAL,
sort_field, sort_dir)
def tasks_query(self, q, story_id=None, assignee_id=None, project_id=None,

View File

@ -535,7 +535,7 @@ class StoriesController(rest.RestController):
"""
user = request.current_user_id
stories = SEARCH_ENGINE.stories_query(
stories, count = SEARCH_ENGINE.stories_query(
q,
status=status,
assignee_id=assignee_id,
@ -552,6 +552,13 @@ class StoriesController(rest.RestController):
sort_dir=sort_dir,
current_user=user)
# Apply the query response headers.
if limit:
response.headers['X-Limit'] = str(limit)
response.headers['X-Total'] = str(count)
if offset is not None:
response.headers['X-Offset'] = str(offset)
return [create_story_wmodel(story) for story in stories]
@expose()

View File

@ -114,7 +114,7 @@ class TagsController(rest.RestController):
"""
tags = (tags or []) + (body or [])
story = stories_api.story_get(
story = stories_api.story_get_simple(
story_id, current_user=request.current_user_id)
if not story:
raise exc.NotFound("Story %s not found" % story_id)
@ -122,8 +122,6 @@ class TagsController(rest.RestController):
for tag in tags:
stories_api.story_add_tag(
story_id, tag, current_user=request.current_user_id)
# For some reason the story gets cached and the tags do not appear.
stories_api.api_base.get_session().expunge(story)
story = stories_api.story_get(
story_id, current_user=request.current_user_id)

View File

@ -28,4 +28,4 @@ class NameType(types.StringType):
_name_regex = r'^[a-zA-Z0-9]+([_\-\./]?[a-zA-Z0-9]+)*$'
def __init__(self):
super(NameType, self).__init__(min_length=3, pattern=self._name_regex)
super(NameType, self).__init__(min_length=1, pattern=self._name_regex)

View File

@ -387,16 +387,16 @@ def filter_private_stories(query, current_user, story_model=models.Story):
query = query.filter(
or_(
story_model.permissions.any(
or_(
models.Permission.users.any(
models.User.id == current_user
)
),
story_model.permissions.any(
models.Permission.teams.any(
models.Team.users.any(
models.User.id == current_user
)
)
)
),
story_model.private == false(),
story_model.id.is_(None)
@ -422,6 +422,9 @@ def filter_private_worklists(query, current_user, hide_lanes=True):
are lanes in a board.
"""
# TODO(SotK): This query doesn't need to have a UNION, we can just
# filter more sensibly.
# Lanes inherit board permissions and ignore their own, so need to be
# handled separately. If we're not hiding all lanes from the result,
# these two queries are combined at the end.
@ -436,16 +439,16 @@ def filter_private_worklists(query, current_user, hide_lanes=True):
models.Worklist.boards.any(
or_(
models.Board.permissions.any(
or_(
models.Permission.users.any(
models.User.id == current_user
)
),
models.Board.permissions.any(
models.Permission.teams.any(
models.Team.users.any(
models.User.id == current_user
)
)
)
),
models.Board.private == false(),
models.Board.id.is_(None)
@ -455,16 +458,16 @@ def filter_private_worklists(query, current_user, hide_lanes=True):
lists = lists.filter(
or_(
models.Worklist.permissions.any(
or_(
models.Permission.users.any(
models.User.id == current_user
)
),
models.Worklist.permissions.any(
models.Permission.teams.any(
models.Team.users.any(
models.User.id == current_user
)
)
)
),
models.Worklist.private == false(),
models.Worklist.id.is_(None)
@ -505,16 +508,16 @@ def filter_private_boards(query, current_user):
query = query.filter(
or_(
models.Board.permissions.any(
or_(
models.Permission.users.any(
models.User.id == current_user
)
),
models.Board.permissions.any(
models.Permission.teams.any(
models.Team.users.any(
models.User.id == current_user
)
)
)
),
models.Board.private == false(),
models.Board.id.is_(None)

View File

@ -19,6 +19,9 @@ import pytz
from oslo_db import api as oslo_db_api
from sqlalchemy.orm import subqueryload
import sqlalchemy.sql.expression as expr
import sqlalchemy.sql.functions as func
from sqlalchemy import Integer, and_, asc, desc, or_
from wsme.exc import ClientSideError
from storyboard._i18n import _
@ -32,6 +35,50 @@ from storyboard.db.api import users as users_api
from storyboard.db import models
STORY_STATUSES = {
'active': ['todo', 'inprogress', 'review'],
'merged': ['merged'],
'invalid': ['invalid']
}
class _StorySummary(object):
"""Abstraction for a Story's calculated status and task counts.
Takes a row of results from a query for both the Story model and some
extra data about its status and count of tasks per-status. The status
is calculated at query time, hence needing this abstraction.
This was previously implemented as a different database model, but
that approach was inflexible about status calculations, and caused
queries with very poor performance at scale to be generated.
"""
def __init__(self, row):
self._summary_dict = {}
story_obj = row.Story
if story_obj:
self._summary_dict.update(story_obj.as_dict())
self._summary_dict['status'] = row.status
for task_status in models.Task.TASK_STATUSES:
self._summary_dict[task_status] = getattr(row, task_status)
def __getattr__(self, attr):
if attr in self._summary_dict:
return self._summary_dict[attr]
raise AttributeError(attr)
def __getitem__(self, key):
return self._summary_dict[key]
def as_dict(self):
return self._summary_dict
def story_get_simple(story_id, session=None, current_user=None,
no_permissions=False):
"""Return a story with the given ID.
@ -59,15 +106,88 @@ def story_get_simple(story_id, session=None, current_user=None,
def story_get(story_id, session=None, current_user=None):
query = api_base.model_query(models.StorySummary, session)
query = query.options(subqueryload(models.StorySummary.tags))\
.filter_by(id=story_id)
"""Return a summary object for the story with the given ID.
The object returned from this function isn't a SQLAlchemy model, and
so shouldn't be used with an expectation of modifications being reflected
in the underlying database.
If trying to query for an actual ``models.Story`` object,
``story_get_simple`` should be used instead.
:param story_id: ID of the story to return.
:param session: DB session to use.
:param current_user: The ID of the user making the request.
:return: A summary object for the story, or None if nothing found.
"""
query = _story_query_base()
query = query.filter(models.Story.id == story_id)
query = query.options(subqueryload(models.Story.due_dates))
query = query.options(subqueryload(models.Story.tags))
query = query.options(subqueryload(models.Story.permissions))
# Filter out stories that the current user can't see
query = api_base.filter_private_stories(query, current_user,
story_model=models.StorySummary)
query = api_base.filter_private_stories(query, current_user)
return query.first()
print(query.statement.compile(compile_kwargs={"literal_binds": True}))
story = query.first()
if story and story[0] is not None:
story = _StorySummary(story)
else:
story = None
return story
def _order_and_paginate(query, offset, limit, sort_keys, sort_dir=None,
sort_dirs=None):
# NOTE(SotK): This is inspired by the `paginate_query` function in
# oslo_db/sqlalchemy/utils.py, however it implements offset/limit
# pagination rather than using a marker.
#
# Even if we supported marker-based pagination we'd still need a
# custom implementation here, since we want to be able to do
# pagination on fields that aren't part of the database model
# (namely `status`). The oslo.db implementation requires that
# all the sort keys are part of the ORM model, and also implements
# comparison in a way that won't work for our use-case.
if sort_dir and sort_dirs:
raise AssertionError('Only one of sort_dir and sort_dirs can be set')
if sort_dir is None and sort_dirs is None:
sort_dir = 'asc'
if sort_dirs is None:
sort_dirs = [sort_dir for _key in sort_keys]
if len(sort_dirs) != len(sort_keys):
raise AssertionError(
'sort_dirs and sort_keys must have the same length')
for current_key, current_dir in zip(sort_keys, sort_dirs):
try:
sort_key_attr = getattr(models.Story, current_key)
except AttributeError:
# Fallback to ordering by the raw sort_key provided.
# TODO(SotK): make sure that this is only an expected field?
sort_key_attr = current_key
sort_dir_func = {
'asc': asc,
'desc': desc
}[current_dir]
query = query.order_by(sort_dir_func(sort_key_attr))
# TODO(SotK): We should switch to using marker-based pagination
# as soon as we can, it just needs the web client to understand
# what to do with a marker id.
if offset is not None:
query = query.offset(offset)
if limit is not None:
query = query.limit(limit)
return query
def story_get_all(title=None, description=None, status=None, assignee_id=None,
@ -84,105 +204,91 @@ def story_get_all(title=None, description=None, status=None, assignee_id=None,
if not isinstance(status, list) and status is not None:
status = [status]
# Build the query.
subquery = _story_build_query(title=title,
query = _story_build_query(title=title,
description=description,
status=status,
assignee_id=assignee_id,
creator_id=creator_id,
project_group_id=project_group_id,
project_id=project_id,
subscriber_id=subscriber_id,
tags=tags,
updated_since=updated_since,
board_id=board_id,
worklist_id=worklist_id,
current_user=current_user,
tags_filter_type=tags_filter_type)
tags_filter_type=tags_filter_type,
current_user=current_user)
# Filter by subscriber ID
if subscriber_id is not None:
subs = api_base.model_query(models.Subscription)
subs = api_base.apply_query_filters(query=subs,
model=models.Subscription,
target_type='story',
user_id=subscriber_id)
subs = subs.subquery()
subquery = subquery.join(subs, subs.c.target_id == models.Story.id)
query = _order_and_paginate(
query, offset, limit, [sort_field], sort_dir=sort_dir)
# Turn the whole shebang into a subquery.
subquery = subquery.subquery('filtered_stories')
query = query.options(subqueryload(models.Story.due_dates))
query = query.options(subqueryload(models.Story.tags))
query = query.options(subqueryload(models.Story.permissions))
# Return the story summary.
query = api_base.model_query(models.StorySummary)\
.options(subqueryload(models.StorySummary.tags))
id_col = tuple(subquery.c)[0]
query = query.join(subquery,
models.StorySummary.id == id_col)
if status:
query = query.filter(models.StorySummary.status.in_(status))
# paginate the query
query = api_base.paginate_query(query=query,
model=models.StorySummary,
limit=limit,
sort_key=sort_field,
marker=marker,
offset=offset,
sort_dir=sort_dir)
raw_stories = query.all()
return raw_stories
results = query.all()
return [_StorySummary(row) for row in results]
def story_get_count(title=None, description=None, status=None,
assignee_id=None, creator_id=None,
project_group_id=None, project_id=None,
subscriber_id=None, tags=None, updated_since=None,
board_id=None, worklist_id=None,
tags_filter_type="all", current_user=None):
query = _story_build_query(title=title,
description=description,
status=status,
assignee_id=assignee_id,
creator_id=creator_id,
project_group_id=project_group_id,
project_id=project_id,
updated_since=updated_since,
subscriber_id=subscriber_id,
tags=tags,
updated_since=updated_since,
board_id=board_id,
worklist_id=worklist_id,
tags_filter_type=tags_filter_type,
current_user=current_user)
# Filter by subscriber ID
if subscriber_id is not None:
subs = api_base.model_query(models.Subscription)
subs = api_base.apply_query_filters(query=subs,
model=models.Subscription,
target_type='story',
user_id=subscriber_id)
subs = subs.subquery()
query = query.join(subs, subs.c.target_id == models.Story.id)
# Turn the whole shebang into a subquery.
subquery = query.subquery('filtered_stories')
# Return the story summary.
query = api_base.model_query(models.StorySummary)\
.options(subqueryload(models.StorySummary.tags))
id_col = tuple(subquery.c)[0]
query = query.join(subquery,
models.StorySummary.id == id_col)
if status:
query = query.filter(models.StorySummary.status.in_(status))
return query.count()
def _story_build_query(title=None, description=None, assignee_id=None,
creator_id=None, project_group_id=None,
project_id=None, updated_since=None, tags=None,
board_id=None, worklist_id=None, tags_filter_type='all',
current_user=None, session=None):
# First build a standard story query.
query = api_base.model_query(models.Story.id, session=session).distinct()
def _story_query_base():
session = api_base.get_session()
select_items = []
select_items.append(models.Story)
select_items.append(
expr.case(
[(func.sum(models.Task.status.in_(STORY_STATUSES['active'])) > 0,
'active'),
((func.sum(models.Task.status == 'merged')) > 0, 'merged')],
else_='invalid'
).label('status')
)
for task_status in models.Task.TASK_STATUSES:
select_items.append(expr.cast(
func.sum(models.Task.status == task_status), Integer
).label(task_status))
query = session.query(*select_items)
# This join is needed to be able to do the status calculations
query = query.outerjoin(
(models.Task, models.Task.story_id == models.Story.id))
return query
def _story_build_query(title=None, description=None, status=None,
assignee_id=None, creator_id=None,
project_group_id=None, project_id=None,
subscriber_id=None, tags=None, updated_since=None,
board_id=None, worklist_id=None,
tags_filter_type="all", current_user=None):
# Get a basic story query, containing task summaries
query = _story_query_base()
# Get rid of anything the requester can't see
query = api_base.filter_private_stories(query, current_user)
# Apply basic filters
query = api_base.apply_query_filters(query=query,
@ -190,12 +296,11 @@ def _story_build_query(title=None, description=None, assignee_id=None,
title=title,
description=description,
creator_id=creator_id)
# Only get stories that have been updated since a given datetime
if updated_since:
query = query.filter(models.Story.updated_at > updated_since)
# Filter out stories that the current user can't see
query = api_base.filter_private_stories(query, current_user)
# Filtering by tags
if tags:
if tags_filter_type == 'all':
@ -207,47 +312,97 @@ def _story_build_query(title=None, description=None, assignee_id=None,
else:
raise exc.NotFound("Tags filter not found.")
# Are we filtering by project group?
# Filtering by project group
if project_group_id:
query = query.join(
(models.Task, models.Task.story_id == models.Story.id))
query = query.join(models.Project,
models.project_group_mapping,
models.ProjectGroup)
query = query.filter(models.ProjectGroup.id == project_group_id)
# The project_group_mapping table contains all the information we need
# here; namely the project id (to map to tasks) and the project group
# id (which we're actually filtering by). There's no need to actually
# join either of the projects or project_groups tables.
pgm = models.project_group_mapping
query = query.join((pgm, pgm.c.project_id == models.Task.project_id))
query = query.filter(pgm.c.project_group_id == project_group_id)
# Are we filtering by task?
if assignee_id or project_id:
if not project_group_id: # We may already have joined this table
query = query.join(
(models.Task, models.Task.story_id == models.Story.id))
# Filtering by task assignee
if assignee_id:
query = query.filter(models.Task.assignee_id == assignee_id)
# Filtering by project
if project_id:
query = query.filter(models.Task.project_id == project_id)
# Filtering by subscriber
if subscriber_id:
on_clause = and_(
models.Subscription.target_id == models.Story.id,
models.Subscription.target_type == 'story'
)
query = query.join((models.Subscription, on_clause))
query = query.filter(models.Subscription.user_id == subscriber_id)
# Filtering by either worklist or board requires joining the worklists
# table, via worklist_items
if worklist_id or board_id:
query = query.outerjoin(
(models.WorklistItem,
models.WorklistItem.item_id == models.Story.id))
query = query.filter(models.WorklistItem.item_type == "story")
query = query.outerjoin(models.Worklist)
on_clause = and_(
models.WorklistItem.item_id == models.Story.id,
models.WorklistItem.item_type == 'story'
)
query = query.join((models.WorklistItem, on_clause), models.Worklist)
# Filter by worklist
if worklist_id:
query = query.filter(models.Worklist.id == worklist_id)
query = api_base.filter_private_worklists(
query, current_user, hide_lanes=False)
query = query.filter(models.Worklist.id == worklist_id)
# Filter by board
if board_id:
query = query.outerjoin(models.BoardWorklist, models.Board)
query = query.join(models.BoardWorklist, models.Board)
query = api_base.filter_private_boards(query, current_user)
query = query.filter(models.Board.id == board_id)
query = api_base.filter_private_worklists(
query, current_user, hide_lanes=False)
return query.distinct()
# We need to do GROUP BY to allow the status calculation to work correctly
query = query.group_by(models.Story.id)
# Filter by story status. This duplicates the definition of the different
# story statuses from the SELECT clause of the query, but means we can
# do all the filtering without a subquery.
if status is not None:
if any(s not in STORY_STATUSES for s in status):
raise ValueError(
f'Story status must be in {STORY_STATUSES.keys()}')
criteria = []
if 'active' in status:
# A story is active if it has at least one unmerged valid task,
# so the check is simple
criteria.append(func.sum(
models.Task.status.in_(STORY_STATUSES['active'])) > 0
)
if 'merged' in status:
# A story is merged if it has at least one merged task, and also
# doesn't meet the criteria to be active
criteria.append(and_(
func.sum(models.Task.status.in_(STORY_STATUSES['merged'])) > 0,
func.sum(models.Task.status.in_(STORY_STATUSES['active'])) == 0
))
if 'invalid' in status:
# A story is invalid if it only has invalid tasks, or no tasks
criteria.append(or_(
and_(
func.sum(
models.Task.status.in_(STORY_STATUSES['invalid'])
) >= 0,
func.sum(
models.Task.status.in_(
STORY_STATUSES['active']
+ STORY_STATUSES['merged']
)
) == 0
),
func.sum(models.Task.id) == None # noqa
))
query = query.having(or_(*criteria))
return query
def story_create(values):
@ -344,7 +499,7 @@ def story_remove_tag(story_id, tag_name, current_user=None):
def story_delete(story_id, current_user=None):
story = story_get(story_id, current_user=current_user)
story = story_get_simple(story_id, current_user=current_user)
if story:
api_base.entity_hard_delete(models.Story, story_id)
@ -417,9 +572,7 @@ def story_can_mutate(story, new_story_type_id):
def create_permission(story, users, teams, session=None):
story = api_base.model_query(models.Story, session) \
.options(subqueryload(models.Story.tags)) \
.filter_by(id=story.id).first()
story = story_get_simple(story.id, session=session, no_permissions=True)
permission_dict = {
'name': 'view_story_%d' % story.id,
'codename': 'view_story'
@ -438,9 +591,7 @@ def create_permission(story, users, teams, session=None):
def update_permission(story, users, teams, session=None):
story = api_base.model_query(models.Story, session) \
.options(subqueryload(models.Story.tags)) \
.filter_by(id=story.id).first()
story = story_get_simple(story.id, session=session, no_permissions=True)
if not story.permissions:
raise exc.NotFound(_("Permissions for story %d not found.")
% story.id)

View File

@ -34,6 +34,7 @@ from sqlalchemy.ext import declarative
from sqlalchemy import ForeignKey
from sqlalchemy import Integer
from sqlalchemy.orm.collections import attribute_mapped_collection
from sqlalchemy.orm.exc import DetachedInstanceError
from sqlalchemy.orm import backref, relationship
from sqlalchemy import schema
from sqlalchemy import select
@ -333,6 +334,23 @@ class Story(FullText, ModelBuilder, Base):
_public_fields = ["id", "creator_id", "title", "description", "is_bug",
"tasks", "events", "tags"]
def as_dict(self):
d = super(Story, self).as_dict()
try:
d["due_dates"] = [due_date for due_date in self.due_dates]
except DetachedInstanceError:
pass
try:
d["permissions"] = [permission for permission in self.permissions]
except DetachedInstanceError:
pass
try:
d["tags"] = [t.name for t in self.tags]
except DetachedInstanceError:
pass
return d
class Task(FullText, ModelBuilder, Base):
__fulltext_columns__ = ['title']

View File

@ -418,16 +418,16 @@ class TestStorySearch(base.FunctionalTest):
result = results.json[0]
self.assertEqual(1, result['id'])
def test_search_marker(self):
def test_search_offset(self):
url = self.build_search_url({
'title': 'foo',
'marker': 1 # Last item in previous list.
'offset': 1
})
results = self.get_json(url, expect_errors=True)
self.assertEqual(1, len(results.json))
self.assertEqual('2', results.headers['X-Total'])
self.assertEqual('1', results.headers['X-Marker'])
self.assertEqual('1', results.headers['X-Offset'])
result = results.json[0]
self.assertEqual(3, result['id'])