Fix race with two builders deleting images

In a situation with multiple builders, each configured with different
providers, it is possible for one builder to delete the ZK ImageBuild
record for a build from another builder between the time that the build
is completed but before the first upload starts.

This is because every builder looks for images to delete from ZK.  It
keeps the 2 most recent ready images (this should normally cover the
time period between a build and upload), unless the image is not
configured for any provider this builder knows about.  This is where
the disjoint providers come into play -- builder1 in our scenario
is not expected to have a configuration for provider2.

To correct this, we adjust this check so that the only time we
bypass the 2-most-recent-ready-images check is if the diskimage is
not configured at all.

That means that we still expect all builders to have a "diskimage"
entry for every image, but we don't need those to be configured
for any providers which this builder is not expected to handle.

Change-Id: Ic2fefda293fa0bcbc98ee7313198b37df0576299
This commit is contained in:
James E. Blair 2022-07-25 13:06:25 -07:00
parent 9641b719f9
commit 6a56940275
5 changed files with 165 additions and 3 deletions

View File

@ -467,7 +467,13 @@ class CleanupWorker(BaseWorker):
# have a consistent view of the data.
all_builds = self._zk.getBuilds(image)
diskimage = self._config.diskimages.get(image)
if not diskimage or (diskimage and not diskimage.image_types):
if not diskimage:
# If the diskimage does not appear in the config at all,
# we interpret that as the user intending to remove it.
# If it appears in the config, even if it's not used by a
# provider, we will assume there is some other builder
# responsible for it, so we fall through to the else
# condition below and rely on data in ZK.
builds_to_keep = set()
# TODO(jeblair): When all builds for an image which is not
# in use are deleted, the image znode should be deleted as

View File

@ -508,7 +508,8 @@ class DBTestCase(BaseTestCase):
break
self.wait_for_threads()
def waitForBuild(self, image_name, build_id, states=None):
def waitForBuild(self, image_name, build_id, states=None,
check_files=True):
if states is None:
states = (zk.READY,)
@ -523,7 +524,7 @@ class DBTestCase(BaseTestCase):
break
# We should only expect a dib manifest with a successful build.
while build.state == zk.READY:
while check_files and build.state == zk.READY:
self.wait_for_threads()
files = builder.DibImageFile.from_image_id(
self._config_images_dir.path, base)

View File

@ -0,0 +1,64 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
build-log-retention: 1
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: fake-label1
- name: fake-label2
providers:
- name: fake-provider1
cloud: fake
driver: fake
region-name: fake-region
rate: 0.0001
diskimages:
- name: fake-image1
pools:
- name: main
max-servers: 96
availability-zones:
- az1
networks:
- net-name
labels:
- name: fake-label1
diskimage: fake-image1
min-ram: 8192
flavor-name: 'Fake'
diskimages:
- name: fake-image1
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
- name: fake-image2
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -0,0 +1,64 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
build-log-retention: 1
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
zookeeper-tls:
ca: {zookeeper_ca}
cert: {zookeeper_cert}
key: {zookeeper_key}
labels:
- name: fake-label1
- name: fake-label2
providers:
- name: fake-provider2
cloud: fake
driver: fake
region-name: fake-region
rate: 0.0001
diskimages:
- name: fake-image2
pools:
- name: main
max-servers: 96
availability-zones:
- az1
networks:
- net-name
labels:
- name: fake-label2
diskimage: fake-image2
min-ram: 8192
flavor-name: 'Fake'
diskimages:
- name: fake-image1
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
- name: fake-image2
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -235,6 +235,33 @@ class TestNodePoolBuilder(tests.DBTestCase):
self.waitForImageDeletion('fake-provider', 'fake-image2')
self.waitForBuildDeletion('fake-image2', '0000000001')
def test_image_removal_two_builders(self):
# This tests the gap between building and uploading an image.
# We build an image on one builder, then delete any uploads
# (to simulate the time between when the builder completed a
# build and started the first upload). Then we start a second
# builder which is not configured with the same provider as
# the first builder and ensure that it doesn't delete the
# ready-but-not-yet-uploaded build from the other builder.
configfile1 = self.setup_config('builder_image1.yaml')
builder1 = self.useBuilder(configfile1)
self.waitForImage('fake-provider1', 'fake-image1')
self.waitForBuild('fake-image1', '0000000001')
for worker in builder1._upload_workers:
worker.shutdown()
worker.join()
builder1.stop()
self.zk.deleteUpload('fake-image1', '0000000001',
'fake-provider1', '0000000001')
configfile2 = self.setup_config('builder_image2.yaml')
self.useBuilder(configfile2)
self.waitForImage('fake-provider2', 'fake-image2')
# Don't check files because the image path switched to the
# second builder; we're really only interested in ZK.
self.waitForBuild('fake-image1', '0000000001', check_files=False)
def test_image_removal_dib_deletes_first(self):
# Break cloud image deleting
fake_client = fakeprovider.FakeDeleteImageFailCloud()