Do not overwrite image upload ZK data on delete

This code is setting the ImageUpload state field to DELETING
and updating the data in ZooKeeper. However, it was using a new
ImageUpload object to do so, and we lose all other data, like the
external_id, external_name, etc. If the delete should fail in the
provider on the first attempt, the next attempt to delete will not
have these attributes, some of which (external_name) are needed to
actually delete the image. This means we leak the image in the
provider because the 2nd attempt will not try to delete in the
provider and will remove the ZooKeeper record.

During review of this fix, I realized that we were not locking
specific image uploads before modifying them or deleting them.
Although this likely would not cause any problems, it may have
caused the odd error in competing builder processes. This adds
a locking mechanism for that data, and tests to validate the lock.

Change-Id: I3fbf1c1a3ca10cc397facd4e939cbf4cbb2d8305
This commit is contained in:
David Shrewsbury 2019-09-12 13:36:10 -04:00
parent 5c605b3240
commit 91dc050c3c
5 changed files with 253 additions and 13 deletions

View File

@ -303,18 +303,24 @@ class CleanupWorker(BaseWorker):
self._deleteUpload(upload)
def _deleteUpload(self, upload):
deleted = False
if upload.state != zk.DELETING:
if not self._inProgressUpload(upload):
data = zk.ImageUpload()
data.state = zk.DELETING
self._zk.storeImageUpload(upload.image_name, upload.build_id,
upload.provider_name, data,
upload.id)
deleted = True
try:
with self._zk.imageUploadNumberLock(upload,
blocking=False):
upload.state = zk.DELETING
self._zk.storeImageUpload(upload.image_name,
upload.build_id,
upload.provider_name,
upload,
upload.id)
except exceptions.ZKLockException:
# If we can't get a lock, we'll try again later.
self.log.debug("Unable to get lock on image upload: %s",
upload)
return
if upload.state == zk.DELETING or deleted:
if upload.state == zk.DELETING:
manager = self._config.provider_managers[upload.provider_name]
try:
# It is possible we got this far, but don't actually have an
@ -331,8 +337,16 @@ class CleanupWorker(BaseWorker):
upload.external_name, upload.provider_name)
else:
self.log.debug("Deleting image upload: %s", upload)
self._zk.deleteUpload(upload.image_name, upload.build_id,
upload.provider_name, upload.id)
try:
with self._zk.imageUploadNumberLock(upload,
blocking=False):
self._zk.deleteUpload(upload.image_name,
upload.build_id,
upload.provider_name, upload.id)
except exceptions.ZKLockException:
# If we can't get a lock, we'll try again later.
self.log.debug("Unable to get lock on image upload: %s",
upload)
def _inProgressBuild(self, build, image):
'''

View File

@ -0,0 +1,64 @@
elements-dir: .
images-dir: '{images_dir}'
build-log-dir: '{build_log_dir}'
build-log-retention: 1
zookeeper-servers:
- host: {zookeeper_host}
port: {zookeeper_port}
chroot: {zookeeper_chroot}
labels:
- name: fake-label1
- name: fake-label2
providers:
- name: fake-provider
cloud: fake
driver: fake
region-name: fake-region
rate: 0.0001
diskimages:
- name: fake-image1
- name: fake-image2
pools:
- name: main
max-servers: 96
availability-zones:
- az1
networks:
- net-name
labels:
- name: fake-label1
diskimage: fake-image1
min-ram: 8192
flavor-name: 'Fake'
- name: fake-label2
diskimage: fake-image2
min-ram: 8192
flavor-name: 'Fake'
diskimages:
- name: fake-image1
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
- name: fake-image2
elements:
- fedora
- vm
release: 21
dib-cmd: nodepool/tests/fake-image-create
env-vars:
TMPDIR: /opt/dib_tmp
DIB_IMAGE_CACHE: /opt/dib_cache
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2

View File

@ -24,6 +24,7 @@ from pathlib import Path
from nodepool import builder, exceptions, tests
from nodepool.driver.fake import provider as fakeprovider
from nodepool import zk
from nodepool.nodeutils import iterate_timeout
class TestNodepoolBuilderDibImage(tests.BaseTestCase):
@ -406,3 +407,43 @@ class TestNodePoolBuilder(tests.DBTestCase):
time.sleep(.1)
image_files = builder.DibImageFile.from_image_id(
images_dir, 'fake-image-0000000001')
def test_upload_removal_retries_until_success(self):
'''
If removing an image from a provider fails on the first attempt, make
sure that we retry until successful.
This test starts with two images uploaded to the provider. It then
removes one of the images by setting the state to FAILED which should
begin the process to delete the image from the provider.
'''
configfile = self.setup_config('builder_2_diskimages.yaml')
bldr = self.useBuilder(configfile)
self.waitForImage('fake-provider', 'fake-image1')
image = self.waitForImage('fake-provider', 'fake-image2')
# Introduce a failure in the upload deletion process by replacing
# the cleanup thread's deleteImage() call with one that fails.
cleanup_thd = bldr._janitor
cleanup_mgr = cleanup_thd._config.provider_managers['fake-provider']
saved_method = cleanup_mgr.deleteImage
cleanup_mgr.deleteImage = mock.Mock(side_effect=Exception('Conflict'))
# Manually cause the image to be deleted from the provider. Note that
# we set it to FAILED instead of DELETING because that bypasses the
# bit of code we want to test here in the CleanupWorker._deleteUpload()
# method.
image.state = zk.FAILED
bldr.zk.storeImageUpload(image.image_name, image.build_id,
image.provider_name, image, image.id)
# Pick a call count > 1 to verify we make multiple attempts at
# deleting the image in the provider.
for _ in iterate_timeout(10, Exception, 'call count to increase'):
if cleanup_mgr.deleteImage.call_count >= 5:
break
# Remove the failure to verify deletion.
cleanup_mgr.deleteImage = saved_method
self.waitForUploadRecordDeletion(image.provider_name, image.image_name,
image.build_id, image.id)

View File

@ -91,6 +91,67 @@ class TestZooKeeper(tests.DBTestCase):
):
pass
def test_imageUploadNumberLock_delete(self):
'''
Test that deleting an image upload number while we hold the lock
on it works properly.
'''
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"
upload.build_id = "0000000003"
upload.provider_name = "providerA"
upload.id = "0000000001"
path = self.zk._imageUploadNumberLockPath(upload.image_name,
upload.build_id,
upload.provider_name,
upload.id)
with self.zk.imageUploadNumberLock(upload, blocking=False):
self.assertIsNotNone(self.zk.client.exists(path))
self.zk.deleteUpload(upload.image_name,
upload.build_id,
upload.provider_name,
upload.id)
self.assertIsNone(self.zk.client.exists(path))
def test_imageUploadNumberLock(self):
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"
upload.build_id = "0000000003"
upload.provider_name = "providerA"
upload.id = "0000000001"
path = self.zk._imageUploadNumberLockPath(upload.image_name,
upload.build_id,
upload.provider_name,
upload.id)
with self.zk.imageUploadNumberLock(upload, blocking=False):
self.assertIsNotNone(self.zk.client.exists(path))
def test_imageUploadNumberLock_exception_nonblocking(self):
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"
upload.build_id = "0000000003"
upload.provider_name = "providerA"
upload.id = "0000000001"
with self.zk.imageUploadNumberLock(upload, blocking=False):
with testtools.ExpectedException(
npe.ZKLockException, "Did not get lock on .*"
):
with self.zk.imageUploadNumberLock(upload, blocking=False):
pass
def test_imageUploadNumberLock_exception_blocking(self):
upload = zk.ImageUpload()
upload.image_name = "ubuntu-trusty"
upload.build_id = "0000000003"
upload.provider_name = "providerA"
upload.id = "0000000001"
with self.zk.imageUploadNumberLock(upload, blocking=False):
with testtools.ExpectedException(npe.TimeoutException):
with self.zk.imageUploadNumberLock(
upload, blocking=True, timeout=1
):
pass
def test_imageUploadLock(self):
path = self.zk._imageUploadLockPath("ubuntu-trusty", "0000", "prov1")
with self.zk.imageUploadLock("ubuntu-trusty", "0000", "prov1",

View File

@ -781,6 +781,12 @@ class ZooKeeper(object):
return "%s/lock" % self._imageUploadPath(image, build_number,
provider)
def _imageUploadNumberLockPath(self, image, build_number, provider,
upload_number):
return "%s/%s/lock" % (
self._imageUploadPath(image, build_number, provider),
upload_number)
def _launcherPath(self, launcher):
return "%s/%s" % (self.LAUNCHER_ROOT, launcher)
@ -839,6 +845,29 @@ class ZooKeeper(object):
return lock
def _getImageUploadNumberLock(self, image, build_number, provider,
upload_number, blocking=True, timeout=None):
lock_path = self._imageUploadNumberLockPath(image, build_number,
provider, upload_number)
try:
lock = Lock(self.client, lock_path)
have_lock = lock.acquire(blocking, timeout)
except kze.LockTimeout:
raise npe.TimeoutException(
"Timeout trying to acquire lock %s" % lock_path)
except kze.NoNodeError:
have_lock = False
self.log.error("Image upload number %s not found for locking: "
"%s, %s, %s",
upload_number, build_number, provider, image)
# If we aren't blocking, it's possible we didn't get the lock
# because someone else has it.
if not have_lock:
raise npe.ZKLockException("Did not get lock on %s" % lock_path)
return lock
def _getImageUploadLock(self, image, build_number, provider,
blocking=True, timeout=None):
lock_path = self._imageUploadLockPath(image, build_number, provider)
@ -1040,7 +1069,7 @@ class ZooKeeper(object):
def imageUploadLock(self, image, build_number, provider,
blocking=True, timeout=None):
'''
Context manager to use for locking image builds.
Context manager to use for locking image uploads.
Obtains a write lock for the specified image upload.
@ -1065,6 +1094,36 @@ class ZooKeeper(object):
if lock:
lock.release()
@contextmanager
def imageUploadNumberLock(self, image_upload, blocking=True, timeout=None):
'''
Context manager to use for locking _specific_ image uploads.
Obtains a write lock for the specified image upload number.
:param ImageUpload image_upload: The object describing the upload
to lock.
:param bool blocking: Whether or not to block on trying to
acquire the lock
:param int timeout: When blocking, how long to wait for the lock
to get acquired. None, the default, waits forever.
:raises: TimeoutException if we failed to acquire the lock when
blocking with a timeout. ZKLockException if we are not blocking
and could not get the lock, or a lock is already held.
'''
lock = None
try:
lock = self._getImageUploadNumberLock(image_upload.image_name,
image_upload.build_id,
image_upload.provider_name,
image_upload.id,
blocking, timeout)
yield
finally:
if lock:
lock.release()
def getImageNames(self):
'''
Retrieve the image names in Zookeeper.
@ -1500,7 +1559,8 @@ class ZooKeeper(object):
path = self._imageUploadPath(image, build_number, provider)
path = path + "/%s" % upload_number
try:
self.client.delete(path)
# NOTE: Need to do recursively to remove lock znodes
self.client.delete(path, recursive=True)
except kze.NoNodeError:
pass