Do not overwrite image upload ZK data on delete
This code is setting the ImageUpload state field to DELETING and updating the data in ZooKeeper. However, it was using a new ImageUpload object to do so, and we lose all other data, like the external_id, external_name, etc. If the delete should fail in the provider on the first attempt, the next attempt to delete will not have these attributes, some of which (external_name) are needed to actually delete the image. This means we leak the image in the provider because the 2nd attempt will not try to delete in the provider and will remove the ZooKeeper record. During review of this fix, I realized that we were not locking specific image uploads before modifying them or deleting them. Although this likely would not cause any problems, it may have caused the odd error in competing builder processes. This adds a locking mechanism for that data, and tests to validate the lock. Change-Id: I3fbf1c1a3ca10cc397facd4e939cbf4cbb2d8305
This commit is contained in:
parent
5c605b3240
commit
91dc050c3c
|
@ -303,18 +303,24 @@ class CleanupWorker(BaseWorker):
|
|||
self._deleteUpload(upload)
|
||||
|
||||
def _deleteUpload(self, upload):
|
||||
deleted = False
|
||||
|
||||
if upload.state != zk.DELETING:
|
||||
if not self._inProgressUpload(upload):
|
||||
data = zk.ImageUpload()
|
||||
data.state = zk.DELETING
|
||||
self._zk.storeImageUpload(upload.image_name, upload.build_id,
|
||||
upload.provider_name, data,
|
||||
upload.id)
|
||||
deleted = True
|
||||
try:
|
||||
with self._zk.imageUploadNumberLock(upload,
|
||||
blocking=False):
|
||||
upload.state = zk.DELETING
|
||||
self._zk.storeImageUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload,
|
||||
upload.id)
|
||||
except exceptions.ZKLockException:
|
||||
# If we can't get a lock, we'll try again later.
|
||||
self.log.debug("Unable to get lock on image upload: %s",
|
||||
upload)
|
||||
return
|
||||
|
||||
if upload.state == zk.DELETING or deleted:
|
||||
if upload.state == zk.DELETING:
|
||||
manager = self._config.provider_managers[upload.provider_name]
|
||||
try:
|
||||
# It is possible we got this far, but don't actually have an
|
||||
|
@ -331,8 +337,16 @@ class CleanupWorker(BaseWorker):
|
|||
upload.external_name, upload.provider_name)
|
||||
else:
|
||||
self.log.debug("Deleting image upload: %s", upload)
|
||||
self._zk.deleteUpload(upload.image_name, upload.build_id,
|
||||
upload.provider_name, upload.id)
|
||||
try:
|
||||
with self._zk.imageUploadNumberLock(upload,
|
||||
blocking=False):
|
||||
self._zk.deleteUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name, upload.id)
|
||||
except exceptions.ZKLockException:
|
||||
# If we can't get a lock, we'll try again later.
|
||||
self.log.debug("Unable to get lock on image upload: %s",
|
||||
upload)
|
||||
|
||||
def _inProgressBuild(self, build, image):
|
||||
'''
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
elements-dir: .
|
||||
images-dir: '{images_dir}'
|
||||
build-log-dir: '{build_log_dir}'
|
||||
build-log-retention: 1
|
||||
|
||||
zookeeper-servers:
|
||||
- host: {zookeeper_host}
|
||||
port: {zookeeper_port}
|
||||
chroot: {zookeeper_chroot}
|
||||
|
||||
labels:
|
||||
- name: fake-label1
|
||||
- name: fake-label2
|
||||
|
||||
providers:
|
||||
- name: fake-provider
|
||||
cloud: fake
|
||||
driver: fake
|
||||
region-name: fake-region
|
||||
rate: 0.0001
|
||||
diskimages:
|
||||
- name: fake-image1
|
||||
- name: fake-image2
|
||||
pools:
|
||||
- name: main
|
||||
max-servers: 96
|
||||
availability-zones:
|
||||
- az1
|
||||
networks:
|
||||
- net-name
|
||||
labels:
|
||||
- name: fake-label1
|
||||
diskimage: fake-image1
|
||||
min-ram: 8192
|
||||
flavor-name: 'Fake'
|
||||
- name: fake-label2
|
||||
diskimage: fake-image2
|
||||
min-ram: 8192
|
||||
flavor-name: 'Fake'
|
||||
|
||||
diskimages:
|
||||
- name: fake-image1
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
dib-cmd: nodepool/tests/fake-image-create
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
||||
|
||||
- name: fake-image2
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
dib-cmd: nodepool/tests/fake-image-create
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
|
@ -24,6 +24,7 @@ from pathlib import Path
|
|||
from nodepool import builder, exceptions, tests
|
||||
from nodepool.driver.fake import provider as fakeprovider
|
||||
from nodepool import zk
|
||||
from nodepool.nodeutils import iterate_timeout
|
||||
|
||||
|
||||
class TestNodepoolBuilderDibImage(tests.BaseTestCase):
|
||||
|
@ -406,3 +407,43 @@ class TestNodePoolBuilder(tests.DBTestCase):
|
|||
time.sleep(.1)
|
||||
image_files = builder.DibImageFile.from_image_id(
|
||||
images_dir, 'fake-image-0000000001')
|
||||
|
||||
def test_upload_removal_retries_until_success(self):
|
||||
'''
|
||||
If removing an image from a provider fails on the first attempt, make
|
||||
sure that we retry until successful.
|
||||
|
||||
This test starts with two images uploaded to the provider. It then
|
||||
removes one of the images by setting the state to FAILED which should
|
||||
begin the process to delete the image from the provider.
|
||||
'''
|
||||
configfile = self.setup_config('builder_2_diskimages.yaml')
|
||||
bldr = self.useBuilder(configfile)
|
||||
self.waitForImage('fake-provider', 'fake-image1')
|
||||
image = self.waitForImage('fake-provider', 'fake-image2')
|
||||
|
||||
# Introduce a failure in the upload deletion process by replacing
|
||||
# the cleanup thread's deleteImage() call with one that fails.
|
||||
cleanup_thd = bldr._janitor
|
||||
cleanup_mgr = cleanup_thd._config.provider_managers['fake-provider']
|
||||
saved_method = cleanup_mgr.deleteImage
|
||||
cleanup_mgr.deleteImage = mock.Mock(side_effect=Exception('Conflict'))
|
||||
|
||||
# Manually cause the image to be deleted from the provider. Note that
|
||||
# we set it to FAILED instead of DELETING because that bypasses the
|
||||
# bit of code we want to test here in the CleanupWorker._deleteUpload()
|
||||
# method.
|
||||
image.state = zk.FAILED
|
||||
bldr.zk.storeImageUpload(image.image_name, image.build_id,
|
||||
image.provider_name, image, image.id)
|
||||
|
||||
# Pick a call count > 1 to verify we make multiple attempts at
|
||||
# deleting the image in the provider.
|
||||
for _ in iterate_timeout(10, Exception, 'call count to increase'):
|
||||
if cleanup_mgr.deleteImage.call_count >= 5:
|
||||
break
|
||||
|
||||
# Remove the failure to verify deletion.
|
||||
cleanup_mgr.deleteImage = saved_method
|
||||
self.waitForUploadRecordDeletion(image.provider_name, image.image_name,
|
||||
image.build_id, image.id)
|
||||
|
|
|
@ -91,6 +91,67 @@ class TestZooKeeper(tests.DBTestCase):
|
|||
):
|
||||
pass
|
||||
|
||||
def test_imageUploadNumberLock_delete(self):
|
||||
'''
|
||||
Test that deleting an image upload number while we hold the lock
|
||||
on it works properly.
|
||||
'''
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
path = self.zk._imageUploadNumberLockPath(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
self.zk.deleteUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
self.assertIsNone(self.zk.client.exists(path))
|
||||
|
||||
def test_imageUploadNumberLock(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
path = self.zk._imageUploadNumberLockPath(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
|
||||
def test_imageUploadNumberLock_exception_nonblocking(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
with testtools.ExpectedException(
|
||||
npe.ZKLockException, "Did not get lock on .*"
|
||||
):
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
pass
|
||||
|
||||
def test_imageUploadNumberLock_exception_blocking(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
with testtools.ExpectedException(npe.TimeoutException):
|
||||
with self.zk.imageUploadNumberLock(
|
||||
upload, blocking=True, timeout=1
|
||||
):
|
||||
pass
|
||||
|
||||
def test_imageUploadLock(self):
|
||||
path = self.zk._imageUploadLockPath("ubuntu-trusty", "0000", "prov1")
|
||||
with self.zk.imageUploadLock("ubuntu-trusty", "0000", "prov1",
|
||||
|
|
|
@ -781,6 +781,12 @@ class ZooKeeper(object):
|
|||
return "%s/lock" % self._imageUploadPath(image, build_number,
|
||||
provider)
|
||||
|
||||
def _imageUploadNumberLockPath(self, image, build_number, provider,
|
||||
upload_number):
|
||||
return "%s/%s/lock" % (
|
||||
self._imageUploadPath(image, build_number, provider),
|
||||
upload_number)
|
||||
|
||||
def _launcherPath(self, launcher):
|
||||
return "%s/%s" % (self.LAUNCHER_ROOT, launcher)
|
||||
|
||||
|
@ -839,6 +845,29 @@ class ZooKeeper(object):
|
|||
|
||||
return lock
|
||||
|
||||
def _getImageUploadNumberLock(self, image, build_number, provider,
|
||||
upload_number, blocking=True, timeout=None):
|
||||
lock_path = self._imageUploadNumberLockPath(image, build_number,
|
||||
provider, upload_number)
|
||||
try:
|
||||
lock = Lock(self.client, lock_path)
|
||||
have_lock = lock.acquire(blocking, timeout)
|
||||
except kze.LockTimeout:
|
||||
raise npe.TimeoutException(
|
||||
"Timeout trying to acquire lock %s" % lock_path)
|
||||
except kze.NoNodeError:
|
||||
have_lock = False
|
||||
self.log.error("Image upload number %s not found for locking: "
|
||||
"%s, %s, %s",
|
||||
upload_number, build_number, provider, image)
|
||||
|
||||
# If we aren't blocking, it's possible we didn't get the lock
|
||||
# because someone else has it.
|
||||
if not have_lock:
|
||||
raise npe.ZKLockException("Did not get lock on %s" % lock_path)
|
||||
|
||||
return lock
|
||||
|
||||
def _getImageUploadLock(self, image, build_number, provider,
|
||||
blocking=True, timeout=None):
|
||||
lock_path = self._imageUploadLockPath(image, build_number, provider)
|
||||
|
@ -1040,7 +1069,7 @@ class ZooKeeper(object):
|
|||
def imageUploadLock(self, image, build_number, provider,
|
||||
blocking=True, timeout=None):
|
||||
'''
|
||||
Context manager to use for locking image builds.
|
||||
Context manager to use for locking image uploads.
|
||||
|
||||
Obtains a write lock for the specified image upload.
|
||||
|
||||
|
@ -1065,6 +1094,36 @@ class ZooKeeper(object):
|
|||
if lock:
|
||||
lock.release()
|
||||
|
||||
@contextmanager
|
||||
def imageUploadNumberLock(self, image_upload, blocking=True, timeout=None):
|
||||
'''
|
||||
Context manager to use for locking _specific_ image uploads.
|
||||
|
||||
Obtains a write lock for the specified image upload number.
|
||||
|
||||
:param ImageUpload image_upload: The object describing the upload
|
||||
to lock.
|
||||
:param bool blocking: Whether or not to block on trying to
|
||||
acquire the lock
|
||||
:param int timeout: When blocking, how long to wait for the lock
|
||||
to get acquired. None, the default, waits forever.
|
||||
|
||||
:raises: TimeoutException if we failed to acquire the lock when
|
||||
blocking with a timeout. ZKLockException if we are not blocking
|
||||
and could not get the lock, or a lock is already held.
|
||||
'''
|
||||
lock = None
|
||||
try:
|
||||
lock = self._getImageUploadNumberLock(image_upload.image_name,
|
||||
image_upload.build_id,
|
||||
image_upload.provider_name,
|
||||
image_upload.id,
|
||||
blocking, timeout)
|
||||
yield
|
||||
finally:
|
||||
if lock:
|
||||
lock.release()
|
||||
|
||||
def getImageNames(self):
|
||||
'''
|
||||
Retrieve the image names in Zookeeper.
|
||||
|
@ -1500,7 +1559,8 @@ class ZooKeeper(object):
|
|||
path = self._imageUploadPath(image, build_number, provider)
|
||||
path = path + "/%s" % upload_number
|
||||
try:
|
||||
self.client.delete(path)
|
||||
# NOTE: Need to do recursively to remove lock znodes
|
||||
self.client.delete(path, recursive=True)
|
||||
except kze.NoNodeError:
|
||||
pass
|
||||
|
||||
|
|
Loading…
Reference in New Issue