Correctly limit buildsets with multiple refs

The current SQL query will not correctly limit the number of buildsets
when some of the buildsets are related to multiple refs (circular
dependencies). The problem is that LIMTI works on number of rows, but we
want to limit only on the number of buildsets.

This corrects the problem by using a subquery to identify the distinct
buildsets, limiting that, and then querying for all of the information
about those buildsets.

This also updates the methods which perform the same function for builds,
even though we are not yet seeing an issue in practice.  It is
theoretically possible to call the getBuilds method with 'provides' and
'limit' arguments, which would produce the same problem as the buildsets
query.  That is not possible in practice, as the REST API doesn't support
provides, and the scheduler which does pass the provides argument doesn't
pass limit.  However, that could easily change in the future.  Additionally,
this future-proofs us if we add more queryable one-to-many relationships
to builds in the future (such as if we linked builds to multiple refs).
Also, it's easier to maintain these methods if they follow the same pattern.

There does not appear to be a performance loss in either mysql or postgres
with local testing on large data sets.  There may actually be an improvement
(but it's within the margin of error, so hard to say).

The index hints previously needed for mysql appear to no longer be
necessary and are removed.

Change-Id: Ib19e4cb8171f5d4d2873fb6b9c0301eb5d4ee43d
Co-Authored-By: James E. Blair <jim@acmegating.com>
This commit is contained in:
Simon Westphahl 2024-03-21 10:11:10 +01:00 committed by James E. Blair
parent 3a6208f342
commit 8bcfb8bc4a
1 changed files with 97 additions and 74 deletions

View File

@ -173,10 +173,9 @@ class DatabaseSession(object):
buildset_table = self.connection.zuul_buildset_table
provides_table = self.connection.zuul_provides_table
# contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike
# joinedload).
q = self.session().query(self.connection.buildModel).\
q = self.session().query(
self.connection.buildModel.id.label('inner_id')).\
distinct().\
join(self.connection.buildSetModel).\
join(self.connection.refModel)
# Avoid joining the provides table unless necessary; postgres
@ -186,33 +185,6 @@ class DatabaseSession(object):
# the query.
if provides:
q = q.outerjoin(self.connection.providesModel)
q = q.options(orm.contains_eager(self.connection.buildModel.buildset),
orm.contains_eager(self.connection.buildModel.ref),
orm.selectinload(self.connection.buildModel.provides),
orm.selectinload(self.connection.buildModel.artifacts))
# If the query planner isn't able to reduce either the number
# of rows returned by the buildset or build tables, then it
# tends to produce a very slow query. This hint produces
# better results, but only in those cases. When we can narrow
# things down with indexes, it's better to omit the hint.
# job_name is a tricky one. It is indexed, but if there are a
# lot of rows, it is better to include the hint, but if there
# are few, it is better to not include it. We include the hint
# regardless of whether job_name is specified (optimizing for
# the more common case).
if not (project or change or uuid):
q = q.with_hint(build_table, 'USE INDEX (PRIMARY)', 'mysql')
if query_timeout:
# For MySQL, we can add a query hint directly.
q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
q = self.listFilter(q, buildset_table.c.tenant, tenant)
q = self.listFilterFuzzy(q, buildset_table.c.pipeline, pipeline)
@ -254,10 +226,43 @@ class DatabaseSession(object):
q = q.order_by(buildset_table.c.id.desc())
else:
q = q.order_by(build_table.c.id.desc())
q = q.limit(limit).offset(offset)
q = q.limit(limit).offset(offset).subquery()
# contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike
# joinedload).
bq = self.session().query(self.connection.buildModel).\
join(self.connection.buildSetModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildModel.buildset),
orm.contains_eager(self.connection.buildModel.ref),
orm.selectinload(self.connection.buildModel.provides),
orm.selectinload(self.connection.buildModel.artifacts))
if sort_by_buildset:
# If we don't need the builds to be strictly ordered, this
# query can be much faster as it may avoid the use of a
# temporary table.
bq = bq.order_by(buildset_table.c.id.desc())
else:
bq = bq.order_by(build_table.c.id.desc())
bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
if query_timeout:
# For MySQL, we can add a query hint directly.
bq = bq.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
bq = bq.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
try:
return q.all()
return bq.all()
except sqlalchemy.orm.exc.NoResultFound:
return []
@ -306,25 +311,12 @@ class DatabaseSession(object):
build_table = self.connection.zuul_build_table
buildset_table = self.connection.zuul_buildset_table
# contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike
# joinedload).
q = self.session().query(self.connection.buildModel).\
q = self.session().query(
self.connection.buildModel.id.label('inner_id'),
self.connection.buildModel.end_time.label('inner_end_time')).\
distinct().\
join(self.connection.buildSetModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildModel.buildset),
orm.contains_eager(self.connection.buildModel.ref))
if query_timeout:
# For MySQL, we can add a query hint directly.
q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
join(self.connection.refModel)
q = self.listFilter(q, buildset_table.c.tenant, tenant)
q = self.listFilter(q, buildset_table.c.pipeline, pipeline)
@ -345,11 +337,36 @@ class DatabaseSession(object):
if result is None:
q = q.filter(build_table.c.result != None) # noqa
q = q.order_by(build_table.c.end_time.desc())
q = q.limit(limit).offset(offset)
q = q.order_by(build_table.c.end_time.desc()).\
limit(limit).\
offset(offset).\
subquery()
# contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike
# joinedload).
bq = self.session().query(self.connection.buildModel).\
join(self.connection.buildSetModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildModel.buildset),
orm.contains_eager(self.connection.buildModel.ref))
bq = bq.order_by(build_table.c.id.desc())
if query_timeout:
# For MySQL, we can add a query hint directly.
bq = bq.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
bq = bq.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
try:
return q.all()
return bq.all()
except sqlalchemy.orm.exc.NoResultFound:
return []
@ -394,24 +411,11 @@ class DatabaseSession(object):
buildset_table = self.connection.zuul_buildset_table
ref_table = self.connection.zuul_ref_table
# See note above about the hint.
q = self.session().query(self.connection.buildSetModel).\
q = self.session().query(
self.connection.buildSetModel.id.label('inner_id')).\
distinct().\
join(self.connection.buildSetRefModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildSetModel.refs))
if not (project or change or uuid):
q = q.with_hint(buildset_table, 'USE INDEX (PRIMARY)', 'mysql')
if query_timeout:
# For MySQL, we can add a query hint directly.
q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
join(self.connection.refModel)
q = self.listFilter(q, buildset_table.c.tenant, tenant)
q = self.listFilterFuzzy(q, buildset_table.c.pipeline, pipeline)
@ -435,13 +439,32 @@ class DatabaseSession(object):
if updated_max:
q = q.filter(buildset_table.c.updated < updated_max)
q = q.order_by(buildset_table.c.id.desc()).\
limit(limit).\
offset(offset)
offset(offset).\
subquery()
bq = self.session().query(self.connection.buildSetModel).\
join(self.connection.buildSetRefModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildSetModel.refs))
bq = bq.order_by(buildset_table.c.id.desc())
if query_timeout:
# For MySQL, we can add a query hint directly.
bq = bq.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
bq = bq.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
bq = bq.join(q, self.connection.buildSetModel.id == q.c.inner_id)
try:
return q.all()
return bq.all()
except sqlalchemy.orm.exc.NoResultFound:
return []