Support deleting DIB images while builder is offline

We don't currently support deleting a DIB image while the builder
that built it is offline.  The reason for this is to ensure that
we actually remove the files from disk on the builder.  The mechanism
is for all other builders to defer handling "DELETING" image nodes
in ZK to the builder which built them.
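
For reference, the removed per-builder guard looked roughly like this
(a condensed sketch of the old logic; the exact lines appear as
removals in the builder.py diff below):

    # Old behavior: only the builder that created the build ("is_local")
    # would delete the local files and the ZK build record.
    is_local = build.builder_id == self._builder_id
    if (not uploads or all_deleting) and is_local:
        build.state = zk.DELETING
        self._zk.storeBuild(image, build, build.id)
        self._deleteLocalBuild(image, build)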

This can be problematic if the builder is offline for an extended
period, or permanently.

To address this case without compromising the original goal, we now
let any builder delete the uploads and ZK nodes for a DIB build.
Additionally, every builder now scans its images-dir for DIB manifest
directories; if it finds one whose ZK node is missing or marked
DELETING, it garbage-collects that build from disk.
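
As a minimal standalone sketch of that scan (condensed from the new
_getLocalBuilds() code below; DIB leaves a "<image>-<build_id>.d"
manifest directory next to each image it builds):

    import os
    import re

    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')

    def local_builds(images_dir):
        # Recover (image_name, build_id) pairs from on-disk manifest dirs.
        for entry in os.scandir(images_dir):
            m = MANIFEST_RE.match(entry.name)
            if m and entry.is_dir():
                yield m.groups()

A build found this way is garbage-collected when its ZK node is gone
or is marked DELETING, so a builder that was offline during the delete
cleans up its own files the next time it runs.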

Change-Id: I65efb31ca02cea4bcf7ef8f962a00b5263ccf69c
Author: James E. Blair
Date: 2022-06-20 15:17:30 -07:00
Parent: d01f19d762
Commit: 6386170914
2 changed files with 82 additions and 40 deletions

nodepool/builder.py

@@ -15,6 +15,7 @@
 import fcntl
 import logging
 import os
+import re
 import select
 import shutil
 import socket
@@ -226,7 +227,7 @@ class CleanupWorker(BaseWorker):
         return False

     @staticmethod
-    def deleteLocalBuild(images_dir, image, build, log):
+    def deleteLocalBuild(images_dir, image_name, build_id, log):
         '''
         Remove expired image build from local disk.
@@ -247,12 +248,12 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:  # No such file or directory
                 raise e

-        base = "-".join([image, build.id])
+        base = "-".join([image_name, build_id])
         files = DibImageFile.from_image_id(images_dir, base)
         if not files:
             return

-        log.info("Doing cleanup for %s:%s" % (image, build.id))
+        log.info("Doing cleanup for %s:%s" % (image_name, build_id))

         manifest_dir = None
@@ -271,9 +272,9 @@ class CleanupWorker(BaseWorker):
             if e.errno != 2:  # No such file or directory
                 raise e

-    def _deleteLocalBuild(self, image, build):
+    def _deleteLocalBuild(self, image_name, build_id):
         CleanupWorker.deleteLocalBuild(
-            self._config.images_dir, image, build, self.log)
+            self._config.images_dir, image_name, build_id, self.log)

     def _cleanupProvider(self, provider, image, build_id):
         all_uploads = self._zk.getUploads(image, build_id, provider.name)
@@ -377,6 +378,27 @@ class CleanupWorker(BaseWorker):
             except Exception:
                 self.log.exception("Exception cleaning up image %s:", image)

+        for local_build in self._getLocalBuilds():
+            try:
+                image_name, build_id = local_build
+                build = self._zk.getBuild(image_name, build_id)
+                if not build or build.state == zk.DELETING:
+                    self.log.info("Deleting on-disk build: "
+                                  "%s-%s", image_name, build_id)
+                    self._deleteLocalBuild(image_name, build_id)
+            except Exception:
+                self.log.exception("Exception cleaning up local build %s:",
+                                   local_build)
+
+    MANIFEST_RE = re.compile(r'(.*)-(\d+)\.d')
+
+    def _getLocalBuilds(self):
+        for entry in os.scandir(self._config.images_dir):
+            m = self.MANIFEST_RE.match(entry.name)
+            if m and entry.is_dir():
+                image_name, build_id = m.groups()
+                yield (image_name, build_id)
+
     def _emitBuildRequestStats(self):
         '''Emit general build request stats
@@ -395,15 +417,6 @@ class CleanupWorker(BaseWorker):
             pipeline.gauge(key, count)
         pipeline.send()

-    def _filterLocalBuilds(self, image, builds):
-        '''Return the subset of builds that are local'''
-        ret = []
-        for build in builds:
-            is_local = build.builder_id == self._builder_id
-            if is_local:
-                ret.append(build)
-        return ret
-
     def _cleanupCurrentProviderUploads(self, provider, image, build_id):
         '''
         Remove cruft from a current build.
@@ -453,21 +466,16 @@ class CleanupWorker(BaseWorker):
         # Get the list of all builds, then work from that so that we
         # have a consistent view of the data.
         all_builds = self._zk.getBuilds(image)
-        builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
-                                                key=lambda y: y.state_time)
-                              if b.state == zk.READY][:2])
-        local_builds = set(self._filterLocalBuilds(image, all_builds))
         diskimage = self._config.diskimages.get(image)
-        if not diskimage and not local_builds:
-            # This builder is not configured to build this image and was not
-            # responsible for this image build so ignore it.
-            return
-
-        # Remove any local builds that are not in use.
         if not diskimage or (diskimage and not diskimage.image_types):
-            builds_to_keep -= local_builds
+            builds_to_keep = set()
             # TODO(jeblair): When all builds for an image which is not
             # in use are deleted, the image znode should be deleted as
             # well.
+        else:
+            builds_to_keep = set([b for b in sorted(all_builds, reverse=True,
+                                                    key=lambda y: y.state_time)
+                                  if b.state == zk.READY][:2])

         for build in all_builds:
             # Start by deleting any uploads that are no longer needed
@@ -518,13 +526,13 @@ class CleanupWorker(BaseWorker):
         for p in self._zk.getBuildProviders(image, build.id):
             uploads += self._zk.getUploads(image, build.id, p)

-        # If we own this build, delete the local DIB files as soon as all
-        # provider uploads are in a deleting state. This prevents us from
-        # keeping local files around while we wait on clouds to remove
-        # the image on their side (which can be very slow).
+        # Mark the image as deleting (which will permit deleting
+        # local DIB files) as soon as all provider uploads are in
+        # a deleting state. This prevents us from keeping local
+        # files around while we wait on clouds to remove the image
+        # on their side (which can be very slow).
         all_deleting = all(map(lambda x: x.state == zk.DELETING, uploads))
-        is_local = build.builder_id == self._builder_id
-        if (not uploads or all_deleting) and is_local:
+        if (not uploads or all_deleting):
             # If we got here, it's either an explicit delete (user
             # activated), or we're deleting because we have newer
             # images. We're about to start deleting files, so
@@ -536,16 +544,10 @@ class CleanupWorker(BaseWorker):
             build.state = zk.DELETING
             self._zk.storeBuild(image, build, build.id)
-            self._deleteLocalBuild(image, build)

         if not uploads:
-            if is_local:
-                # Only remove the db build record from the builder that
-                # built this image. This prevents us from removing the db
-                # record without first removing the local image files.
-                if not self._zk.deleteBuild(image, build.id):
-                    self.log.error("Unable to delete build %s because"
-                                   " uploads still remain.", build)
+            if not self._zk.deleteBuild(image, build.id):
+                self.log.error("Unable to delete build %s because"
+                               " uploads still remain.", build)

     def run(self):
         '''
@@ -729,7 +731,7 @@ class BuildWorker(BaseWorker):
                               bnum, diskimage.name)
                 data.id = bnum
                 CleanupWorker.deleteLocalBuild(
-                    self._config.images_dir, diskimage.name, data, self.log)
+                    self._config.images_dir, diskimage.name, data.id, self.log)
                 data.state = zk.FAILED
                 return data

nodepool/tests/unit/test_commands.py

@@ -257,6 +257,46 @@ class TestNodepoolCMD(tests.DBTestCase):
         self.assert_listed(
             configfile, ['dib-image-list'], 0, 'fake-image-0000000001', 0)

+    def test_dib_image_delete_two_builders(self):
+        # This tests deleting an image when its builder is offline
+        # 1. Build the image with builder1
+        configfile1 = self.setup_config('node.yaml')
+        builder1 = self.useBuilder(configfile1)
+        self.waitForImage('fake-provider', 'fake-image')
+        # Check the image exists
+        builds = self.zk.getMostRecentBuilds(1, 'fake-image', zk.READY)
+
+        # 2. Stop builder1; start builder2
+        for worker in builder1._upload_workers:
+            worker.shutdown()
+            worker.join()
+        builder1.stop()
+        # setup_config() makes a new images_dir each time, so this
+        # acts as a different builder.
+        configfile2 = self.setup_config('node.yaml')
+        builder2 = self.useBuilder(configfile2)
+
+        # 3. Set image to 'deleted' in ZK
+        self.patch_argv('-c', configfile1, 'dib-image-delete',
+                        'fake-image-%s' % (builds[0].id))
+        nodepoolcmd.main()
+
+        # 4. Verify builder2 deleted the ZK records, but image is still on disk
+        self.waitForBuildDeletion('fake-image', '0000000001')
+        self.assertTrue(
+            os.path.exists(os.path.join(builder1._config.images_dir,
+                                        'fake-image-0000000001.d')))
+        self.assertFalse(
+            os.path.exists(os.path.join(builder2._config.images_dir,
+                                        'fake-image-0000000001.d')))
+
+        # 5. Start builder1 and verify it deletes image on disk
+        builder1 = self.useBuilder(configfile1)
+        for _ in iterate_timeout(30, AssertionError, 'image file delete'):
+            if not os.path.exists(os.path.join(builder1._config.images_dir,
+                                               'fake-image-0000000001.d')):
+                break
+
     def test_delete(self):
         configfile = self.setup_config('node.yaml')
         pool = self.useNodepool(configfile, watermark_sleep=1)