Split build/buildset list queries into two

The only way to get reasonable performance out of all three databases
(mariadb, mysql, postgres) appears to be to make the following changes:

* Use group-by instead of distinct.
  Some versions of mysql produce the wrong output with distinct;
  this is more descriptive of what we want anyway.
* Split the query into two statements instead of joining on a subquery.
  Mariadb seems to be unable to produce a good query plan for the
  buildset list query.  Neither mariadb nor mysql support using
  "IN" with a subquery (so the idea of using IN instead of JOIN for
  the subquery is out).

These methods now perform a query to get the ids of the builds or
buildsets that match the criteria, then perform a second query to load
the ORM objects that match those ids.  This appears to be quite
fast for all three queries with the latest versions of all three database
systems.

Change-Id: I30bb3214807dfa8b26a848f85bb7a7bc660c6c1d
This commit is contained in:
James E. Blair 2024-04-08 16:02:40 -07:00
parent 014717bde6
commit 95bd778fcc
2 changed files with 96 additions and 52 deletions

View File

@ -2477,6 +2477,45 @@ class TestGerritCircularDependencies(ZuulTestCase):
def test_job_deduplication_semaphore_resources_first(self): def test_job_deduplication_semaphore_resources_first(self):
self._test_job_deduplication_semaphore() self._test_job_deduplication_semaphore()
def _merge_noop_pair(self):
A = self.fake_gerrit.addFakeChange('org/project1', 'master', 'A')
B = self.fake_gerrit.addFakeChange('org/project1', 'master', 'B')
# A <-> B
A.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
A.subject, B.data["url"]
)
B.data["commitMessage"] = "{}\n\nDepends-On: {}\n".format(
B.subject, A.data["url"]
)
A.addApproval('Code-Review', 2)
B.addApproval('Code-Review', 2)
self.fake_gerrit.addEvent(A.addApproval('Approved', 1))
self.fake_gerrit.addEvent(B.addApproval('Approved', 1))
self.waitUntilSettled()
self.assertEqual(A.data['status'], 'MERGED')
self.assertEqual(B.data['status'], 'MERGED')
@simple_layout('layouts/job-dedup-noop.yaml')
def test_job_deduplication_buildsets(self):
self._merge_noop_pair()
self._merge_noop_pair()
self._merge_noop_pair()
buildsets = self.scheds.first.connections.connections[
'database'].getBuildsets()
self.assertEqual(len(buildsets), 3)
# If we accidentally limit by rows, we should get fewer than 2
# buildsets returned here.
buildsets = self.scheds.first.connections.connections[
'database'].getBuildsets(limit=2)
self.assertEqual(len(buildsets), 2)
# Start check tests # Start check tests
def _test_job_deduplication_check(self): def _test_job_deduplication_check(self):

View File

@ -174,10 +174,10 @@ class DatabaseSession(object):
provides_table = self.connection.zuul_provides_table provides_table = self.connection.zuul_provides_table
q = self.session().query( q = self.session().query(
self.connection.buildModel.id.label('inner_id')).\ self.connection.buildModel.id).\
distinct().\
join(self.connection.buildSetModel).\ join(self.connection.buildSetModel).\
join(self.connection.refModel) join(self.connection.refModel).\
group_by(self.connection.buildModel.id)
# If the query planner isn't able to reduce either the number # If the query planner isn't able to reduce either the number
# of rows returned by the buildset or build tables, then it # of rows returned by the buildset or build tables, then it
@ -241,8 +241,26 @@ class DatabaseSession(object):
else: else:
q = q.order_by(build_table.c.id.desc()) q = q.order_by(build_table.c.id.desc())
q = q.limit(limit).offset(offset).subquery() q = q.limit(limit).offset(offset)
if query_timeout:
# For MySQL, we can add a query hint directly.
q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
ids = [x[0] for x in q.all()]
# We use two queries here, principally because mariadb does
# not produce a reasonable plan if we combine these into one
# query with a join, and neither mysql nor mariadb support
# using limit inside a subquery that is used with the "in"
# operator. Therefore, we run a query to get the ids, then
# run a second to load the data.
# contains_eager allows us to perform eager loading on the # contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike # buildset *and* use that table in filters (unlike
# joinedload). # joinedload).
@ -254,6 +272,8 @@ class DatabaseSession(object):
orm.selectinload(self.connection.buildModel.provides), orm.selectinload(self.connection.buildModel.provides),
orm.selectinload(self.connection.buildModel.artifacts)) orm.selectinload(self.connection.buildModel.artifacts))
bq = bq.filter(self.connection.buildModel.id.in_(ids))
if sort_by_buildset: if sort_by_buildset:
# If we don't need the builds to be strictly ordered, this # If we don't need the builds to be strictly ordered, this
# query can be much faster as it may avoid the use of a # query can be much faster as it may avoid the use of a
@ -262,19 +282,6 @@ class DatabaseSession(object):
else: else:
bq = bq.order_by(build_table.c.id.desc()) bq = bq.order_by(build_table.c.id.desc())
bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
if query_timeout:
# For MySQL, we can add a query hint directly.
bq = bq.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
bq = bq.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
try: try:
return bq.all() return bq.all()
except sqlalchemy.orm.exc.NoResultFound: except sqlalchemy.orm.exc.NoResultFound:
@ -326,11 +333,11 @@ class DatabaseSession(object):
buildset_table = self.connection.zuul_buildset_table buildset_table = self.connection.zuul_buildset_table
q = self.session().query( q = self.session().query(
self.connection.buildModel.id.label('inner_id'), self.connection.buildModel.id,
self.connection.buildModel.end_time.label('inner_end_time')).\ self.connection.buildModel.end_time).\
distinct().\
join(self.connection.buildSetModel).\ join(self.connection.buildSetModel).\
join(self.connection.refModel) join(self.connection.refModel).\
group_by(self.connection.buildModel.id)
# See note above about the hint. # See note above about the hint.
if not (project): if not (project):
@ -357,9 +364,20 @@ class DatabaseSession(object):
q = q.order_by(build_table.c.end_time.desc()).\ q = q.order_by(build_table.c.end_time.desc()).\
limit(limit).\ limit(limit).\
offset(offset).\ offset(offset)
subquery()
if query_timeout:
# For MySQL, we can add a query hint directly.
q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
ids = [x[0] for x in q.all()]
# contains_eager allows us to perform eager loading on the # contains_eager allows us to perform eager loading on the
# buildset *and* use that table in filters (unlike # buildset *and* use that table in filters (unlike
# joinedload). # joinedload).
@ -367,21 +385,9 @@ class DatabaseSession(object):
join(self.connection.buildSetModel).\ join(self.connection.buildSetModel).\
join(self.connection.refModel).\ join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildModel.buildset), options(orm.contains_eager(self.connection.buildModel.buildset),
orm.contains_eager(self.connection.buildModel.ref)) orm.contains_eager(self.connection.buildModel.ref)).\
bq = bq.order_by(build_table.c.id.desc()) filter(self.connection.buildModel.id.in_(ids)).\
order_by(build_table.c.end_time.desc())
if query_timeout:
# For MySQL, we can add a query hint directly.
bq = bq.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql')
# For Postgres, we add a comment that we parse in our
# event handler.
bq = bq.with_statement_hint(
f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql')
bq = bq.join(q, self.connection.buildModel.id == q.c.inner_id)
try: try:
return bq.all() return bq.all()
@ -430,10 +436,10 @@ class DatabaseSession(object):
ref_table = self.connection.zuul_ref_table ref_table = self.connection.zuul_ref_table
q = self.session().query( q = self.session().query(
self.connection.buildSetModel.id.label('inner_id')).\ self.connection.buildSetModel.id).\
distinct().\
join(self.connection.buildSetRefModel).\ join(self.connection.buildSetRefModel).\
join(self.connection.refModel) join(self.connection.refModel).\
group_by(self.connection.buildSetModel.id)
q = self.listFilter(q, buildset_table.c.tenant, tenant) q = self.listFilter(q, buildset_table.c.tenant, tenant)
q = self.listFilterFuzzy(q, buildset_table.c.pipeline, pipeline) q = self.listFilterFuzzy(q, buildset_table.c.pipeline, pipeline)
@ -459,27 +465,26 @@ class DatabaseSession(object):
q = q.filter(buildset_table.c.updated < updated_max) q = q.filter(buildset_table.c.updated < updated_max)
q = q.order_by(buildset_table.c.id.desc()).\ q = q.order_by(buildset_table.c.id.desc()).\
limit(limit).\ limit(limit).\
offset(offset).\ offset(offset)
subquery()
bq = self.session().query(self.connection.buildSetModel).\
join(self.connection.buildSetRefModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildSetModel.refs))
bq = bq.order_by(buildset_table.c.id.desc())
if query_timeout: if query_timeout:
# For MySQL, we can add a query hint directly. # For MySQL, we can add a query hint directly.
bq = bq.prefix_with( q = q.prefix_with(
f'/*+ MAX_EXECUTION_TIME({query_timeout}) */', f'/*+ MAX_EXECUTION_TIME({query_timeout}) */',
dialect='mysql') dialect='mysql')
# For Postgres, we add a comment that we parse in our # For Postgres, we add a comment that we parse in our
# event handler. # event handler.
bq = bq.with_statement_hint( q = q.with_statement_hint(
f'/* statement_timeout={query_timeout} */', f'/* statement_timeout={query_timeout} */',
dialect_name='postgresql') dialect_name='postgresql')
bq = bq.join(q, self.connection.buildSetModel.id == q.c.inner_id) ids = [x[0] for x in q.all()]
bq = self.session().query(self.connection.buildSetModel).\
join(self.connection.buildSetRefModel).\
join(self.connection.refModel).\
options(orm.contains_eager(self.connection.buildSetModel.refs)).\
filter(self.connection.buildSetModel.id.in_(ids)).\
order_by(buildset_table.c.id.desc())
try: try:
return bq.all() return bq.all()