Delete dib images when all uploads set to deleting

We have noticed that in the real world that sometimes images refuse to
delete. Boot from volume with leaked volumes can cause this as well as
other unknown issues. When this happens we keep old dib builds around
longer than expected which eventually leads to full builder disks and
sadness.

Once we've set all provider uploads to the delete state we know we no
longer what that image at all either locally or in the cloud. We can
delete the local copies of the files as soon as we reach this state so
that we aren't waiting for the provider image uploads to go away first.

This should make the builders more reliable as they'll clean their disks
as soon as is possible.

Change-Id: I7549dca5172bc82f143463994ac8578cc98a4375
This commit is contained in:
Clark Boylan 2020-01-10 16:55:18 -08:00
parent b791b4932c
commit 4ac8bf0cd9
3 changed files with 86 additions and 18 deletions

View File

@ -388,9 +388,12 @@ class CleanupWorker(BaseWorker):
'''Return the subset of builds that are local'''
ret = []
for build in builds:
base = "-".join([image, build.id])
files = DibImageFile.from_image_id(self._config.imagesdir, base)
if files:
# To maintain backward compatibility with builders that didn't
# use unique builder IDs before, but do now, always compare to
# hostname as well since some ZK data may still reference that.
is_local = build.builder_id == self._builder_id or \
build.builder == self._hostname
if is_local:
ret.append(build)
return ret
@ -444,8 +447,8 @@ class CleanupWorker(BaseWorker):
local_builds = set(self._filterLocalBuilds(image, all_builds))
diskimage = self._config.diskimages.get(image)
if not diskimage and not local_builds:
# This builder is and was not responsible for this image,
# so ignore it.
# This builder is not configured to build this image and was not
# responsible for this image build so ignore it.
return
# Remove any local builds that are not in use.
if not diskimage or (diskimage and not diskimage.image_types):
@ -499,13 +502,26 @@ class CleanupWorker(BaseWorker):
"of image %s in provider %s:",
build, image, provider)
uploads_exist = False
uploads = []
for p in self._zk.getBuildProviders(image, build.id):
if self._zk.getImageUploadNumbers(image, build.id, p):
uploads_exist = True
break
uploads += self._zk.getUploads(image, build.id, p)
if not uploads_exist:
# If we own this build, delete the local DIB files as soon as all
# provider uploads are in a deleting state. This prevents us from
# keeping local files around while we wait on clouds to remove
# the image on their side (which can be very slow).
# To maintain backward compatibility with builders that didn't
# use unique builder IDs before, but do now, always compare to
# hostname as well since some ZK data may still reference that.
all_deleting = all(map(lambda x: x.state == zk.DELETING, uploads))
is_local = build.builder_id == self._builder_id or \
build.builder == self._hostname
if (not uploads or all_deleting) and is_local:
self._deleteLocalBuild(image, build)
if not uploads:
# Finally when the clouds catch up we can clean up our db
# records to reflect there is nothing else to cleanup.
if build.state != zk.DELETING:
with self._zk.imageBuildNumberLock(
image, build.id, blocking=False
@ -514,14 +530,10 @@ class CleanupWorker(BaseWorker):
self._zk.storeBuild(image, build, build.id)
# Release the lock here so we can delete the build znode
# If we own this build, delete the local DIB files.
# To maintain backward compatibility with builders that didn't
# use unique builder IDs before, but do now, always compare to
# hostname as well since some ZK data may still reference that.
if (build.builder_id == self._builder_id or
build.builder == self._hostname
):
self._deleteLocalBuild(image, build)
if is_local:
# Only remove the db build record from the builder that
# built this image. This prevents us from removing the db
# record without first removing the local image files.
if not self._zk.deleteBuild(image, build.id):
self.log.error("Unable to delete build %s because"
" uploads still remain.", build)

View File

@ -383,6 +383,21 @@ class FakeLaunchAndDeleteFailCloud(FakeOpenStackCloud):
self).delete_server(*args, **kwargs)
class FakeDeleteImageFailCloud(FakeOpenStackCloud):
log = logging.getLogger("nodepool.FakeDeleteImageFailCloud")
def __init__(self):
super().__init__()
self._fail = True
def delete_image(self, *args, **kwargs):
if self._fail:
raise Exception('Induced failure for testing')
else:
return super(FakeDeleteImageFailCloud,
self).delete_image(*args, **kwargs)
class FakeProvider(OpenStackProvider):
fake_cloud = FakeOpenStackCloud

View File

@ -187,6 +187,47 @@ class TestNodePoolBuilder(tests.DBTestCase):
self.waitForImageDeletion('fake-provider', 'fake-image2')
self.waitForBuildDeletion('fake-image2', '0000000001')
def test_image_removal_dib_deletes_first(self):
# Break cloud image deleting
fake_client = fakeprovider.FakeDeleteImageFailCloud()
def get_fake_client(*args, **kwargs):
return fake_client
self.useFixture(fixtures.MockPatchObject(
fakeprovider.FakeProvider, '_getClient',
get_fake_client))
configfile = self.setup_config('node_two_image.yaml')
self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image')
img = self.waitForImage('fake-provider', 'fake-image2')
# Ask nodepool to delete the image build and uploads
self.replace_config(configfile, 'node_two_image_remove.yaml')
# Wait for image files on disk to be deleted.
for _ in iterate_timeout(10, Exception,
'DIB disk files did not delete first'):
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, 'fake-image2-0000000001')
if not files:
break
# Check image is still in fake-provider cloud
img.state = zk.DELETING
self.assertEqual(
self.zk.getImageUpload('fake-image2', '0000000001',
'fake-provider', '0000000001'),
img)
# Release things by unbreaking image deleting. This allows cloud
# and zk records to be removed.
fake_client._fail = False
# Check image is removed from cloud and zk
self.waitForImageDeletion('fake-provider', 'fake-image2', match=img)
# Check build is removed from zk
self.waitForBuildDeletion('fake-image2', '0000000001')
def test_image_rebuild_age(self):
self._test_image_rebuild_age()