From 6a569402755a2dd307675d02f25819cf590b37f4 Mon Sep 17 00:00:00 2001 From: "James E. Blair" Date: Mon, 25 Jul 2022 13:06:25 -0700 Subject: [PATCH] Fix race with two builders deleting images In a situation with multiple builders, each configured with different providers, it is possible for one builder to delete the ZK ImageBuild record for a build from another builder between the time that the build is completed but before the first upload starts. This is because every builder looks for images to delete from ZK. It keeps the 2 most recent ready images (this should normally cover the time period between a build and upload), unless the image is not configured for any provider this builder knows about. This is where the disjoint providers come into play -- builder1 in our scenario is not expected to have a configuration for provider2. To correct this, we adjust this check so that the only time we bypass the 2-most-recent-ready-images check is if the diskimage is not configured at all. That means that we still expect all builders to have a "diskimage" entry for every image, but we don't need those to be configured for any providers which this builder is not expected to handle. Change-Id: Ic2fefda293fa0bcbc98ee7313198b37df0576299 --- nodepool/builder.py | 8 ++- nodepool/tests/__init__.py | 5 +- nodepool/tests/fixtures/builder_image1.yaml | 64 +++++++++++++++++++++ nodepool/tests/fixtures/builder_image2.yaml | 64 +++++++++++++++++++++ nodepool/tests/unit/test_builder.py | 27 +++++++++ 5 files changed, 165 insertions(+), 3 deletions(-) create mode 100644 nodepool/tests/fixtures/builder_image1.yaml create mode 100644 nodepool/tests/fixtures/builder_image2.yaml diff --git a/nodepool/builder.py b/nodepool/builder.py index e712a8e43..0e737141a 100644 --- a/nodepool/builder.py +++ b/nodepool/builder.py @@ -467,7 +467,13 @@ class CleanupWorker(BaseWorker): # have a consistent view of the data. all_builds = self._zk.getBuilds(image) diskimage = self._config.diskimages.get(image) - if not diskimage or (diskimage and not diskimage.image_types): + if not diskimage: + # If the diskimage does not appear in the config at all, + # we interpret that as the user intending to remove it. + # If it appears in the config, even if it's not used by a + # provider, we will assume there is some other builder + # responsible for it, so we fall through to the else + # condition below and rely on data in ZK. builds_to_keep = set() # TODO(jeblair): When all builds for an image which is not # in use are deleted, the image znode should be deleted as diff --git a/nodepool/tests/__init__.py b/nodepool/tests/__init__.py index fed7001e4..925cb8b20 100644 --- a/nodepool/tests/__init__.py +++ b/nodepool/tests/__init__.py @@ -508,7 +508,8 @@ class DBTestCase(BaseTestCase): break self.wait_for_threads() - def waitForBuild(self, image_name, build_id, states=None): + def waitForBuild(self, image_name, build_id, states=None, + check_files=True): if states is None: states = (zk.READY,) @@ -523,7 +524,7 @@ class DBTestCase(BaseTestCase): break # We should only expect a dib manifest with a successful build. - while build.state == zk.READY: + while check_files and build.state == zk.READY: self.wait_for_threads() files = builder.DibImageFile.from_image_id( self._config_images_dir.path, base) diff --git a/nodepool/tests/fixtures/builder_image1.yaml b/nodepool/tests/fixtures/builder_image1.yaml new file mode 100644 index 000000000..6f35c1d15 --- /dev/null +++ b/nodepool/tests/fixtures/builder_image1.yaml @@ -0,0 +1,64 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' +build-log-retention: 1 + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: fake-label1 + - name: fake-label2 + +providers: + - name: fake-provider1 + cloud: fake + driver: fake + region-name: fake-region + rate: 0.0001 + diskimages: + - name: fake-image1 + pools: + - name: main + max-servers: 96 + availability-zones: + - az1 + networks: + - net-name + labels: + - name: fake-label1 + diskimage: fake-image1 + min-ram: 8192 + flavor-name: 'Fake' + +diskimages: + - name: fake-image1 + elements: + - fedora + - vm + release: 21 + dib-cmd: nodepool/tests/fake-image-create + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 + + - name: fake-image2 + elements: + - fedora + - vm + release: 21 + dib-cmd: nodepool/tests/fake-image-create + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/fixtures/builder_image2.yaml b/nodepool/tests/fixtures/builder_image2.yaml new file mode 100644 index 000000000..0c3fc5653 --- /dev/null +++ b/nodepool/tests/fixtures/builder_image2.yaml @@ -0,0 +1,64 @@ +elements-dir: . +images-dir: '{images_dir}' +build-log-dir: '{build_log_dir}' +build-log-retention: 1 + +zookeeper-servers: + - host: {zookeeper_host} + port: {zookeeper_port} + chroot: {zookeeper_chroot} + +zookeeper-tls: + ca: {zookeeper_ca} + cert: {zookeeper_cert} + key: {zookeeper_key} + +labels: + - name: fake-label1 + - name: fake-label2 + +providers: + - name: fake-provider2 + cloud: fake + driver: fake + region-name: fake-region + rate: 0.0001 + diskimages: + - name: fake-image2 + pools: + - name: main + max-servers: 96 + availability-zones: + - az1 + networks: + - net-name + labels: + - name: fake-label2 + diskimage: fake-image2 + min-ram: 8192 + flavor-name: 'Fake' + +diskimages: + - name: fake-image1 + elements: + - fedora + - vm + release: 21 + dib-cmd: nodepool/tests/fake-image-create + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 + + - name: fake-image2 + elements: + - fedora + - vm + release: 21 + dib-cmd: nodepool/tests/fake-image-create + env-vars: + TMPDIR: /opt/dib_tmp + DIB_IMAGE_CACHE: /opt/dib_cache + DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/ + BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2 diff --git a/nodepool/tests/unit/test_builder.py b/nodepool/tests/unit/test_builder.py index d6159509d..4566f7dfc 100644 --- a/nodepool/tests/unit/test_builder.py +++ b/nodepool/tests/unit/test_builder.py @@ -235,6 +235,33 @@ class TestNodePoolBuilder(tests.DBTestCase): self.waitForImageDeletion('fake-provider', 'fake-image2') self.waitForBuildDeletion('fake-image2', '0000000001') + def test_image_removal_two_builders(self): + # This tests the gap between building and uploading an image. + # We build an image on one builder, then delete any uploads + # (to simulate the time between when the builder completed a + # build and started the first upload). Then we start a second + # builder which is not configured with the same provider as + # the first builder and ensure that it doesn't delete the + # ready-but-not-yet-uploaded build from the other builder. + configfile1 = self.setup_config('builder_image1.yaml') + builder1 = self.useBuilder(configfile1) + self.waitForImage('fake-provider1', 'fake-image1') + self.waitForBuild('fake-image1', '0000000001') + for worker in builder1._upload_workers: + worker.shutdown() + worker.join() + builder1.stop() + + self.zk.deleteUpload('fake-image1', '0000000001', + 'fake-provider1', '0000000001') + + configfile2 = self.setup_config('builder_image2.yaml') + self.useBuilder(configfile2) + self.waitForImage('fake-provider2', 'fake-image2') + # Don't check files because the image path switched to the + # second builder; we're really only interested in ZK. + self.waitForBuild('fake-image1', '0000000001', check_files=False) + def test_image_removal_dib_deletes_first(self): # Break cloud image deleting fake_client = fakeprovider.FakeDeleteImageFailCloud()