Fix prune-database command

This command had two problems:

* It would only delete the first 50 buildsets
* Depending on DB configuration, it may not have deleted anything
  or left orphan data.

We did not tell sqlalchemy to cascade delete operations, meaning that
when we deleted the buildset, we didn't delete anything else.

If the database enforces foreign keys (innodb, psql) then the command
would have failed.  If it doesn't (myisam) then it would have deleted
the buildset rows but not anything else.

The tests use myisam, so they ran without error and without deleting
the builds.  They check that the builds are deleted, but only through
the ORM via a joined load with the buildsets, and since the buildsets
are gone, the builds weren't returned.

To address this shortcoming, the tests now use distinct ORM methods
which return objects without any joins.  This would have caught
the error had it been in place before.

Additionally, the delete operation retained the default limit of 50
rows (set in place for the web UI), meaning that when it did run,
it would only delete the most recent 50 matching builds.

We now explicitly set the limit to a user-configurable batch size
(by default, 10,000 buildsets) so that we keep transaction sizes
manageable and avoid monopolizing database locks.  We continue deleting
buildsets in batches as long as any matching buildsets remain. This
should allow users to remove very large amounts of data without
affecting ongoing operations too much.

Change-Id: I4c678b294eeda25589b75ab1ce7c5d0b93a07df3
This commit is contained in:
James E. Blair 2023-02-24 15:19:25 -08:00
parent 48ad958bb4
commit 7153505cd5
4 changed files with 170 additions and 27 deletions

View File

@ -0,0 +1,16 @@
---
fixes:
- |
The `zuul-admin prune-database` command did not completely delete
expected data from the database. It may not have deleted all of
the buildsets older than the specified cutoff time, and it may
have left orphaned data in ancillary tables. This has been
corrected and it should now work as expected. Additionally, a
`--batch-size` argument has been added so that it may delete data
in multiple transactions which can facilitate smoother operation
when run while Zuul is operational.
Users who have previously run the command may need to manually
delete rows from the `zuul_build`, `zuul_build_event`,
`zuul_artifact`, and `zuul_provides` tables which do not have
corresponding entries in the `zuul_buildset` table.

View File

@ -21,10 +21,12 @@ import time
import configparser import configparser
import datetime import datetime
import dateutil.tz import dateutil.tz
import uuid
import fixtures import fixtures
import jwt import jwt
import testtools import testtools
import sqlalchemy
from zuul.zk import ZooKeeperClient from zuul.zk import ZooKeeperClient
from zuul.cmd.client import parse_cutoff from zuul.cmd.client import parse_cutoff
@ -467,27 +469,107 @@ class TestDBPruneParse(BaseTestCase):
class DBPruneTestCase(ZuulTestCase): class DBPruneTestCase(ZuulTestCase):
tenant_config_file = 'config/single-tenant/main.yaml' tenant_config_file = 'config/single-tenant/main.yaml'
# This should be larger than the limit size in sqlconnection
num_buildsets = 55
def _createBuildset(self, update_time):
    """Insert a synthetic buildset directly into the SQL database.

    Bypasses the scheduler entirely so tests can create many rows
    quickly.  Each buildset receives 2 builds, and each build receives
    2 artifacts, 2 provides rows, and 2 build events.

    :param update_time: datetime used for the buildset's ``updated``
        column (and, offset by one second, the build start times).
    """
    connection = self.scheds.first.sched.sql.connection
    buildset_uuid = uuid.uuid4().hex
    event_id = uuid.uuid4().hex
    with connection.getSession() as db:
        start_time = update_time - datetime.timedelta(seconds=1)
        end_time = update_time
        db_buildset = db.createBuildSet(
            uuid=buildset_uuid,
            tenant='tenant-one',
            pipeline='check',
            project='org/project',
            change='1',
            patchset='1',
            ref='refs/changes/1',
            oldrev='',
            newrev='',
            branch='master',
            zuul_ref='Zref',
            ref_url='http://gerrit.example.com/1',
            event_id=event_id,
            event_timestamp=update_time,
            updated=update_time,
            first_build_start_time=start_time,
            last_build_end_time=end_time,
            result='SUCCESS',
        )
        for build_num in range(2):
            build_uuid = uuid.uuid4().hex
            db_build = db_buildset.createBuild(
                uuid=build_uuid,
                job_name=f'job{build_num}',
                start_time=start_time,
                end_time=end_time,
                result='SUCCESS',
                voting=True,
            )
            # Ancillary rows let the tests verify cascade deletion.
            for art_num in range(2):
                db_build.createArtifact(
                    name=f'artifact{art_num}',
                    url='http://example.com',
                )
            for provides_num in range(2):
                db_build.createProvides(
                    name=f'item{provides_num}',
                )
            for event_num in range(2):
                db_build.createBuildEvent(
                    event_type=f'event{event_num}',
                    event_time=start_time,
                )
def _query(self, db, model):
    """Return every row of *model*, newest id first, with no joins.

    Querying each table directly (instead of via joined loads on the
    buildset) ensures orphaned rows are visible to the tests.
    """
    table = model.__table__
    query = db.session().query(model).order_by(table.c.id.desc())
    try:
        return query.all()
    except sqlalchemy.orm.exc.NoResultFound:
        # NOTE(review): Query.all() returns a (possibly empty) list and
        # is not expected to raise NoResultFound; this guard is likely
        # defensive dead code — confirm before removing.
        return []
def _getBuildsets(self, db):
    # Direct table read (no joined load) so orphaned rows are counted.
    return self._query(db, db.connection.buildSetModel)

def _getBuilds(self, db):
    # Direct table read rather than via the buildset relationship.
    return self._query(db, db.connection.buildModel)

def _getProvides(self, db):
    return self._query(db, db.connection.providesModel)

def _getArtifacts(self, db):
    return self._query(db, db.connection.artifactModel)

def _getBuildEvents(self, db):
    return self._query(db, db.connection.buildEventModel)
def _setup(self): def _setup(self):
config_file = os.path.join(self.test_root, 'zuul.conf') config_file = os.path.join(self.test_root, 'zuul.conf')
with open(config_file, 'w') as f: with open(config_file, 'w') as f:
self.config.write(f) self.config.write(f)
A = self.fake_gerrit.addFakeChange('org/project', 'master', 'A') update_time = (datetime.datetime.utcnow() -
self.fake_gerrit.addEvent(A.getPatchsetCreatedEvent(1)) datetime.timedelta(minutes=self.num_buildsets))
self.waitUntilSettled() for x in range(self.num_buildsets):
update_time = update_time + datetime.timedelta(minutes=1)
time.sleep(1) self._createBuildset(update_time)
B = self.fake_gerrit.addFakeChange('org/project', 'master', 'B')
self.fake_gerrit.addEvent(B.getPatchsetCreatedEvent(1))
self.waitUntilSettled()
connection = self.scheds.first.sched.sql.connection connection = self.scheds.first.sched.sql.connection
buildsets = connection.getBuildsets() with connection.getSession() as db:
builds = connection.getBuilds() buildsets = self._getBuildsets(db)
self.assertEqual(len(buildsets), 2) builds = self._getBuilds(db)
self.assertEqual(len(builds), 6) artifacts = self._getArtifacts(db)
provides = self._getProvides(db)
events = self._getBuildEvents(db)
self.assertEqual(len(buildsets), self.num_buildsets)
self.assertEqual(len(builds), 2 * self.num_buildsets)
self.assertEqual(len(artifacts), 4 * self.num_buildsets)
self.assertEqual(len(provides), 4 * self.num_buildsets)
self.assertEqual(len(events), 4 * self.num_buildsets)
for build in builds: for build in builds:
self.log.debug("Build %s %s %s", self.log.debug("Build %s %s %s",
build, build.start_time, build.end_time) build, build.start_time, build.end_time)
@ -503,6 +585,7 @@ class DBPruneTestCase(ZuulTestCase):
start_time = buildsets[0].first_build_start_time start_time = buildsets[0].first_build_start_time
self.log.debug("Cutoff %s", start_time) self.log.debug("Cutoff %s", start_time)
# Use the default batch size (omit --batch-size arg)
p = subprocess.Popen( p = subprocess.Popen(
[os.path.join(sys.prefix, 'bin/zuul-admin'), [os.path.join(sys.prefix, 'bin/zuul-admin'),
'-c', config_file, '-c', config_file,
@ -513,13 +596,20 @@ class DBPruneTestCase(ZuulTestCase):
out, _ = p.communicate() out, _ = p.communicate()
self.log.debug(out.decode('utf8')) self.log.debug(out.decode('utf8'))
buildsets = connection.getBuildsets() with connection.getSession() as db:
builds = connection.getBuilds() buildsets = self._getBuildsets(db)
self.assertEqual(len(buildsets), 1) builds = self._getBuilds(db)
self.assertEqual(len(builds), 3) artifacts = self._getArtifacts(db)
provides = self._getProvides(db)
events = self._getBuildEvents(db)
for build in builds: for build in builds:
self.log.debug("Build %s %s %s", self.log.debug("Build %s %s %s",
build, build.start_time, build.end_time) build, build.start_time, build.end_time)
self.assertEqual(len(buildsets), 1)
self.assertEqual(len(builds), 2)
self.assertEqual(len(artifacts), 4)
self.assertEqual(len(provides), 4)
self.assertEqual(len(events), 4)
def test_db_prune_older_than(self): def test_db_prune_older_than(self):
# Test pruning buildsets older than a relative time # Test pruning buildsets older than a relative time
@ -535,15 +625,23 @@ class DBPruneTestCase(ZuulTestCase):
'-c', config_file, '-c', config_file,
'prune-database', 'prune-database',
'--older-than', '0d', '--older-than', '0d',
'--batch-size', '5',
], ],
stdout=subprocess.PIPE) stdout=subprocess.PIPE)
out, _ = p.communicate() out, _ = p.communicate()
self.log.debug(out.decode('utf8')) self.log.debug(out.decode('utf8'))
buildsets = connection.getBuildsets() with connection.getSession() as db:
builds = connection.getBuilds() buildsets = self._getBuildsets(db)
builds = self._getBuilds(db)
artifacts = self._getArtifacts(db)
provides = self._getProvides(db)
events = self._getBuildEvents(db)
self.assertEqual(len(buildsets), 0) self.assertEqual(len(buildsets), 0)
self.assertEqual(len(builds), 0) self.assertEqual(len(builds), 0)
self.assertEqual(len(artifacts), 0)
self.assertEqual(len(provides), 0)
self.assertEqual(len(events), 0)
class TestDBPruneMysql(DBPruneTestCase): class TestDBPruneMysql(DBPruneTestCase):

View File

@ -542,6 +542,10 @@ class Client(zuul.cmd.ZuulApp):
cmd_prune_database.add_argument( cmd_prune_database.add_argument(
'--older-than', '--older-than',
help='relative time (e.g., "24h" or "180d")') help='relative time (e.g., "24h" or "180d")')
cmd_prune_database.add_argument(
'--batch-size',
default=10000,
help='transaction batch size')
cmd_prune_database.set_defaults(func=self.prune_database) cmd_prune_database.set_defaults(func=self.prune_database)
return parser return parser
@ -1061,7 +1065,7 @@ class Client(zuul.cmd.ZuulApp):
cutoff = parse_cutoff(now, args.before, args.older_than) cutoff = parse_cutoff(now, args.before, args.older_than)
self.configure_connections(source_only=False, require_sql=True) self.configure_connections(source_only=False, require_sql=True)
connection = self.connections.getSqlConnection() connection = self.connections.getSqlConnection()
connection.deleteBuildsets(cutoff) connection.deleteBuildsets(cutoff, args.batch_size)
sys.exit(0) sys.exit(0)

View File

@ -247,12 +247,25 @@ class DatabaseSession(object):
except sqlalchemy.orm.exc.MultipleResultsFound: except sqlalchemy.orm.exc.MultipleResultsFound:
raise Exception("Multiple buildset found with uuid %s", uuid) raise Exception("Multiple buildset found with uuid %s", uuid)
def deleteBuildsets(self, cutoff, batch_size):
    """Delete all buildsets updated before the cutoff.

    Buildsets (and, via the ORM delete-orphan cascades, their builds,
    artifacts, provides, and build events) are removed in batches of at
    most ``batch_size`` rows per transaction, so that pruning very
    large amounts of data does not hold database locks for long
    periods.  Batching repeats until no matching buildsets remain.

    :param cutoff: datetime; buildsets with ``updated`` before this
        are deleted.
    :param batch_size: maximum number of buildsets deleted per
        transaction.  May arrive as a string from the command line
        (the ``--batch-size`` argument is defined without ``type=int``),
        so it is coerced here.
    """
    # Coerce defensively: argparse delivers CLI values as str.
    batch_size = int(batch_size)
    deleted = True
    while deleted:
        deleted = False
        oldest = None
        for buildset in self.getBuildsets(
                updated_max=cutoff, limit=batch_size):
            deleted = True
            # Track the oldest row in this batch purely for logging.
            if oldest is None:
                oldest = buildset.updated
            else:
                oldest = min(oldest, buildset.updated)
            self.session().delete(buildset)
        # Commit per batch to keep transactions small.
        self.session().commit()
        if deleted:
            self.log.info("Deleted from %s to %s", oldest, cutoff)
class SQLConnection(BaseConnection): class SQLConnection(BaseConnection):
@ -409,7 +422,10 @@ class SQLConnection(BaseConnection):
final = sa.Column(sa.Boolean) final = sa.Column(sa.Boolean)
held = sa.Column(sa.Boolean) held = sa.Column(sa.Boolean)
nodeset = sa.Column(sa.String(255)) nodeset = sa.Column(sa.String(255))
buildset = orm.relationship(BuildSetModel, backref="builds") buildset = orm.relationship(BuildSetModel,
backref=orm.backref(
"builds",
cascade="all, delete-orphan"))
sa.Index(self.table_prefix + 'job_name_buildset_id_idx', sa.Index(self.table_prefix + 'job_name_buildset_id_idx',
job_name, buildset_id) job_name, buildset_id)
@ -468,7 +484,10 @@ class SQLConnection(BaseConnection):
name = sa.Column(sa.String(255)) name = sa.Column(sa.String(255))
url = sa.Column(sa.TEXT()) url = sa.Column(sa.TEXT())
meta = sa.Column('metadata', sa.TEXT()) meta = sa.Column('metadata', sa.TEXT())
build = orm.relationship(BuildModel, backref="artifacts") build = orm.relationship(BuildModel,
backref=orm.backref(
"artifacts",
cascade="all, delete-orphan"))
class ProvidesModel(Base): class ProvidesModel(Base):
__tablename__ = self.table_prefix + PROVIDES_TABLE __tablename__ = self.table_prefix + PROVIDES_TABLE
@ -476,7 +495,10 @@ class SQLConnection(BaseConnection):
build_id = sa.Column(sa.Integer, sa.ForeignKey( build_id = sa.Column(sa.Integer, sa.ForeignKey(
self.table_prefix + BUILD_TABLE + ".id")) self.table_prefix + BUILD_TABLE + ".id"))
name = sa.Column(sa.String(255)) name = sa.Column(sa.String(255))
build = orm.relationship(BuildModel, backref="provides") build = orm.relationship(BuildModel,
backref=orm.backref(
"provides",
cascade="all, delete-orphan"))
class BuildEventModel(Base): class BuildEventModel(Base):
__tablename__ = self.table_prefix + BUILD_EVENTS_TABLE __tablename__ = self.table_prefix + BUILD_EVENTS_TABLE
@ -486,7 +508,10 @@ class SQLConnection(BaseConnection):
event_time = sa.Column(sa.DateTime) event_time = sa.Column(sa.DateTime)
event_type = sa.Column(sa.String(255)) event_type = sa.Column(sa.String(255))
description = sa.Column(sa.TEXT()) description = sa.Column(sa.TEXT())
build = orm.relationship(BuildModel, backref="build_events") build = orm.relationship(BuildModel,
backref=orm.backref(
"build_events",
cascade="all, delete-orphan"))
self.buildEventModel = BuildEventModel self.buildEventModel = BuildEventModel
self.zuul_build_event_table = self.buildEventModel.__table__ self.zuul_build_event_table = self.buildEventModel.__table__