Support deleting DIB images while builder is offline

We don't currently support deleting a DIB image while the builder
that built it is offline.  The reason for this is to ensure that
we actually remove the files from disk on the builder.  The mechanism
is for all other builders to defer handling "DELETING" image nodes
in ZK to the builder which built them.

This can be problematic if the builder is offline for an extended
period, or permanently.

To address this case without compromising the original goal, we now
let any builder delete the uploads and ZK nodes for a DIB build.
Additionally, every builder will now look for DIB manifest directories
within its images-dir, and if it finds one that does not have a
corresponding ZK node, it garbage-collects that image from disk.
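For reference, a minimal standalone sketch of that garbage-collection
scan, under stated assumptions: the regex mirrors the MANIFEST_RE added
in the diff below, while zk_get_build and delete_local_build are
hypothetical stand-ins for the builder's ZooKeeper lookup and local
cleanup helpers, and the 'deleting' state string stands in for
zk.DELETING.

    import os
    import re

    # Matches on-disk DIB manifest directories like "fake-image-0000000001.d"
    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')

    def gc_orphaned_builds(images_dir, zk_get_build, delete_local_build):
        """Remove on-disk DIB builds whose ZK build node is gone or deleting."""
        for entry in os.scandir(images_dir):
            m = MANIFEST_RE.match(entry.name)
            if not (m and entry.is_dir()):
                continue
            image_name, build_id = m.groups()
            build = zk_get_build(image_name, build_id)
            if build is None or build.state == 'deleting':
                # Nothing in ZK claims this build (or it is marked for
                # deletion), so the local image files can be removed.
                delete_local_build(images_dir, image_name, build_id)

The deletion itself is still triggered the usual way, e.g.
"nodepool dib-image-delete fake-image-0000000001", as exercised by the
new test added below.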

Change-Id: I65efb31ca02cea4bcf7ef8f962a00b5263ccf69c
Author: James E. Blair
Date: 2022-06-20 15:17:30 -07:00
Commit: 6386170914 (parent: d01f19d762)
2 changed files with 82 additions and 40 deletions


@@ -15,6 +15,7 @@
import fcntl
import logging
import os
import re
import select
import shutil
import socket
@@ -226,7 +227,7 @@ class CleanupWorker(BaseWorker):
return False
@staticmethod
def deleteLocalBuild(images_dir, image, build, log):
def deleteLocalBuild(images_dir, image_name, build_id, log):
'''
Remove expired image build from local disk.
@@ -247,12 +248,12 @@ class CleanupWorker(BaseWorker):
if e.errno != 2: # No such file or directory
raise e
base = "-".join([image, build.id])
base = "-".join([image_name, build_id])
files = DibImageFile.from_image_id(images_dir, base)
if not files:
return
log.info("Doing cleanup for %s:%s" % (image, build.id))
log.info("Doing cleanup for %s:%s" % (image_name, build_id))
manifest_dir = None
@@ -271,9 +272,9 @@ class CleanupWorker(BaseWorker):
if e.errno != 2: # No such file or directory
raise e
def _deleteLocalBuild(self, image, build):
def _deleteLocalBuild(self, image_name, build_id):
CleanupWorker.deleteLocalBuild(
self._config.images_dir, image, build, self.log)
self._config.images_dir, image_name, build_id, self.log)
def _cleanupProvider(self, provider, image, build_id):
all_uploads = self._zk.getUploads(image, build_id, provider.name)
@@ -377,6 +378,27 @@ class CleanupWorker(BaseWorker):
except Exception:
self.log.exception("Exception cleaning up image %s:", image)
for local_build in self._getLocalBuilds():
try:
image_name, build_id = local_build
build = self._zk.getBuild(image_name, build_id)
if not build or build.state == zk.DELETING:
self.log.info("Deleting on-disk build: "
"%s-%s", image_name, build_id)
self._deleteLocalBuild(image_name, build_id)
except Exception:
self.log.exception("Exception cleaning up local build %s:",
local_build)
MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')
def _getLocalBuilds(self):
for entry in os.scandir(self._config.images_dir):
m = self.MANIFEST_RE.match(entry.name)
if m and entry.is_dir():
image_name, build_id = m.groups()
yield (image_name, build_id)
def _emitBuildRequestStats(self):
'''Emit general build request stats
@@ -395,15 +417,6 @@ class CleanupWorker(BaseWorker):
pipeline.gauge(key, count)
pipeline.send()
def _filterLocalBuilds(self, image, builds):
'''Return the subset of builds that are local'''
ret = []
for build in builds:
is_local = build.builder_id == self._builder_id
if is_local:
ret.append(build)
return ret
def _cleanupCurrentProviderUploads(self, provider, image, build_id):
'''
Remove cruft from a current build.
@@ -453,21 +466,16 @@ class CleanupWorker(BaseWorker):
# Get the list of all builds, then work from that so that we
# have a consistent view of the data.
all_builds = self._zk.getBuilds(image)
builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
key=lambda y: y.state_time)
if b.state == zk.READY][:2])
local_builds = set(self._filterLocalBuilds(image, all_builds))
diskimage = self._config.diskimages.get(image)
if not diskimage and not local_builds:
# This builder is not configured to build this image and was not
# responsible for this image build so ignore it.
return
# Remove any local builds that are not in use.
if not diskimage or (diskimage and not diskimage.image_types):
builds_to_keep -= local_builds
builds_to_keep = set()
# TODO(jeblair): When all builds for an image which is not
# in use are deleted, the image znode should be deleted as
# well.
else:
builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
key=lambda y: y.state_time)
if b.state == zk.READY][:2])
for build in all_builds:
# Start by deleting any uploads that are no longer needed
@@ -518,13 +526,13 @@ class CleanupWorker(BaseWorker):
for p in self._zk.getBuildProviders(image, build.id):
uploads += self._zk.getUploads(image, build.id, p)
# If we own this build, delete the local DIB files as soon as all
# provider uploads are in a deleting state. This prevents us from
# keeping local files around while we wait on clouds to remove
# the image on their side (which can be very slow).
# Mark the image as deleting (which will permit deleting
# local DIB files) as soon as all provider uploads are in
# a deleting state. This prevents us from keeping local
# files around while we wait on clouds to remove the image
# on their side (which can be very slow).
all_deleting = all(map(lambda x: x.state == zk.DELETING, uploads))
is_local = build.builder_id == self._builder_id
if (not uploads or all_deleting) and is_local:
if (not uploads or all_deleting):
# If we got here, it's either an explicit delete (user
# activated), or we're deleting because we have newer
# images. We're about to start deleting files, so
@@ -536,16 +544,10 @@ class CleanupWorker(BaseWorker):
build.state = zk.DELETING
self._zk.storeBuild(image, build, build.id)
self._deleteLocalBuild(image, build)
if not uploads:
if is_local:
# Only remove the db build record from the builder that
# built this image. This prevents us from removing the db
# record without first removing the local image files.
if not self._zk.deleteBuild(image, build.id):
self.log.error("Unable to delete build %s because"
" uploads still remain.", build)
if not self._zk.deleteBuild(image, build.id):
self.log.error("Unable to delete build %s because"
" uploads still remain.", build)
def run(self):
'''
@@ -729,7 +731,7 @@ class BuildWorker(BaseWorker):
bnum, diskimage.name)
data.id = bnum
CleanupWorker.deleteLocalBuild(
self._config.images_dir, diskimage.name, data, self.log)
self._config.images_dir, diskimage.name, data.id, self.log)
data.state = zk.FAILED
return data


@@ -257,6 +257,46 @@ class TestNodepoolCMD(tests.DBTestCase):
self.assert_listed(
configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0)
def test_dib_image_delete_two_builders(self):
# This tests deleting an image when its builder is offline
# 1. Build the image with builder1
configfile1 = self.setup_config('node.yaml')
builder1 = self.useBuilder(configfile1)
self.waitForImage('fake-provider', 'fake-image')
# Check the image exists
builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY)
# 2. Stop builder1; start builder2
for worker in builder1._upload_workers:
worker.shutdown()
worker.join()
builder1.stop()
# setup_config() makes a new images_dir each time, so this
# acts as a different builder.
configfile2 = self.setup_config('node.yaml')
builder2 = self.useBuilder(configfile2)
# 3. Set image to 'deleted' in ZK
self.patch_argv('-c', configfile1, 'dib-image-delete',
'fake-image-%s' % (builds[0].id))
nodepoolcmd.main()
# 4. Verify builder2 deleted the ZK records, but image is still on disk
self.waitForBuildDeletion('fake-image', '0000000001')
self.assertTrue(
os.path.exists(os.path.join(builder1._config.images_dir,
'fake-image-0000000001.d')))
self.assertFalse(
os.path.exists(os.path.join(builder2._config.images_dir,
'fake-image-0000000001.d')))
# 5. Start builder1 and verify it deletes image on disk
builder1 = self.useBuilder(configfile1)
for _ in iterate_timeout(30, AssertionError, 'image file delete'):
if not os.path.exists(os.path.join(builder1._config.images_dir,
'fake-image-0000000001.d')):
break
def test_delete(self):
configfile = self.setup_config('node.yaml')
pool = self.useNodepool(configfile, watermark_sleep=1)