Merge "Do not overwrite image upload ZK data on delete"
This commit is contained in:
commit
c4db2a168d
|
@ -303,18 +303,24 @@ class CleanupWorker(BaseWorker):
|
|||
self._deleteUpload(upload)
|
||||
|
||||
def _deleteUpload(self, upload):
|
||||
deleted = False
|
||||
|
||||
if upload.state != zk.DELETING:
|
||||
if not self._inProgressUpload(upload):
|
||||
data = zk.ImageUpload()
|
||||
data.state = zk.DELETING
|
||||
self._zk.storeImageUpload(upload.image_name, upload.build_id,
|
||||
upload.provider_name, data,
|
||||
upload.id)
|
||||
deleted = True
|
||||
try:
|
||||
with self._zk.imageUploadNumberLock(upload,
|
||||
blocking=False):
|
||||
upload.state = zk.DELETING
|
||||
self._zk.storeImageUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload,
|
||||
upload.id)
|
||||
except exceptions.ZKLockException:
|
||||
# If we can't get a lock, we'll try again later.
|
||||
self.log.debug("Unable to get lock on image upload: %s",
|
||||
upload)
|
||||
return
|
||||
|
||||
if upload.state == zk.DELETING or deleted:
|
||||
if upload.state == zk.DELETING:
|
||||
manager = self._config.provider_managers[upload.provider_name]
|
||||
try:
|
||||
# It is possible we got this far, but don't actually have an
|
||||
|
@ -331,8 +337,16 @@ class CleanupWorker(BaseWorker):
|
|||
upload.external_name, upload.provider_name)
|
||||
else:
|
||||
self.log.debug("Deleting image upload: %s", upload)
|
||||
self._zk.deleteUpload(upload.image_name, upload.build_id,
|
||||
upload.provider_name, upload.id)
|
||||
try:
|
||||
with self._zk.imageUploadNumberLock(upload,
|
||||
blocking=False):
|
||||
self._zk.deleteUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name, upload.id)
|
||||
except exceptions.ZKLockException:
|
||||
# If we can't get a lock, we'll try again later.
|
||||
self.log.debug("Unable to get lock on image upload: %s",
|
||||
upload)
|
||||
|
||||
def _inProgressBuild(self, build, image):
|
||||
'''
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
elements-dir: .
|
||||
images-dir: '{images_dir}'
|
||||
build-log-dir: '{build_log_dir}'
|
||||
build-log-retention: 1
|
||||
|
||||
zookeeper-servers:
|
||||
- host: {zookeeper_host}
|
||||
port: {zookeeper_port}
|
||||
chroot: {zookeeper_chroot}
|
||||
|
||||
labels:
|
||||
- name: fake-label1
|
||||
- name: fake-label2
|
||||
|
||||
providers:
|
||||
- name: fake-provider
|
||||
cloud: fake
|
||||
driver: fake
|
||||
region-name: fake-region
|
||||
rate: 0.0001
|
||||
diskimages:
|
||||
- name: fake-image1
|
||||
- name: fake-image2
|
||||
pools:
|
||||
- name: main
|
||||
max-servers: 96
|
||||
availability-zones:
|
||||
- az1
|
||||
networks:
|
||||
- net-name
|
||||
labels:
|
||||
- name: fake-label1
|
||||
diskimage: fake-image1
|
||||
min-ram: 8192
|
||||
flavor-name: 'Fake'
|
||||
- name: fake-label2
|
||||
diskimage: fake-image2
|
||||
min-ram: 8192
|
||||
flavor-name: 'Fake'
|
||||
|
||||
diskimages:
|
||||
- name: fake-image1
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
dib-cmd: nodepool/tests/fake-image-create
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
||||
|
||||
- name: fake-image2
|
||||
elements:
|
||||
- fedora
|
||||
- vm
|
||||
release: 21
|
||||
dib-cmd: nodepool/tests/fake-image-create
|
||||
env-vars:
|
||||
TMPDIR: /opt/dib_tmp
|
||||
DIB_IMAGE_CACHE: /opt/dib_cache
|
||||
DIB_CLOUD_IMAGES: http://download.fedoraproject.org/pub/fedora/linux/releases/test/21-Beta/Cloud/Images/x86_64/
|
||||
BASE_IMAGE_FILE: Fedora-Cloud-Base-20141029-21_Beta.x86_64.qcow2
|
|
@ -24,6 +24,7 @@ from pathlib import Path
|
|||
from nodepool import builder, exceptions, tests
|
||||
from nodepool.driver.fake import provider as fakeprovider
|
||||
from nodepool import zk
|
||||
from nodepool.nodeutils import iterate_timeout
|
||||
|
||||
|
||||
class TestNodepoolBuilderDibImage(tests.BaseTestCase):
|
||||
|
@ -406,3 +407,43 @@ class TestNodePoolBuilder(tests.DBTestCase):
|
|||
time.sleep(.1)
|
||||
image_files = builder.DibImageFile.from_image_id(
|
||||
images_dir, 'fake-image-0000000001')
|
||||
|
||||
def test_upload_removal_retries_until_success(self):
|
||||
'''
|
||||
If removing an image from a provider fails on the first attempt, make
|
||||
sure that we retry until successful.
|
||||
|
||||
This test starts with two images uploaded to the provider. It then
|
||||
removes one of the images by setting the state to FAILED which should
|
||||
begin the process to delete the image from the provider.
|
||||
'''
|
||||
configfile = self.setup_config('builder_2_diskimages.yaml')
|
||||
bldr = self.useBuilder(configfile)
|
||||
self.waitForImage('fake-provider', 'fake-image1')
|
||||
image = self.waitForImage('fake-provider', 'fake-image2')
|
||||
|
||||
# Introduce a failure in the upload deletion process by replacing
|
||||
# the cleanup thread's deleteImage() call with one that fails.
|
||||
cleanup_thd = bldr._janitor
|
||||
cleanup_mgr = cleanup_thd._config.provider_managers['fake-provider']
|
||||
saved_method = cleanup_mgr.deleteImage
|
||||
cleanup_mgr.deleteImage = mock.Mock(side_effect=Exception('Conflict'))
|
||||
|
||||
# Manually cause the image to be deleted from the provider. Note that
|
||||
# we set it to FAILED instead of DELETING because that bypasses the
|
||||
# bit of code we want to test here in the CleanupWorker._deleteUpload()
|
||||
# method.
|
||||
image.state = zk.FAILED
|
||||
bldr.zk.storeImageUpload(image.image_name, image.build_id,
|
||||
image.provider_name, image, image.id)
|
||||
|
||||
# Pick a call count > 1 to verify we make multiple attempts at
|
||||
# deleting the image in the provider.
|
||||
for _ in iterate_timeout(10, Exception, 'call count to increase'):
|
||||
if cleanup_mgr.deleteImage.call_count >= 5:
|
||||
break
|
||||
|
||||
# Remove the failure to verify deletion.
|
||||
cleanup_mgr.deleteImage = saved_method
|
||||
self.waitForUploadRecordDeletion(image.provider_name, image.image_name,
|
||||
image.build_id, image.id)
|
||||
|
|
|
@ -91,6 +91,67 @@ class TestZooKeeper(tests.DBTestCase):
|
|||
):
|
||||
pass
|
||||
|
||||
def test_imageUploadNumberLock_delete(self):
|
||||
'''
|
||||
Test that deleting an image upload number while we hold the lock
|
||||
on it works properly.
|
||||
'''
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
path = self.zk._imageUploadNumberLockPath(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
self.zk.deleteUpload(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
self.assertIsNone(self.zk.client.exists(path))
|
||||
|
||||
def test_imageUploadNumberLock(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
path = self.zk._imageUploadNumberLockPath(upload.image_name,
|
||||
upload.build_id,
|
||||
upload.provider_name,
|
||||
upload.id)
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
self.assertIsNotNone(self.zk.client.exists(path))
|
||||
|
||||
def test_imageUploadNumberLock_exception_nonblocking(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
with testtools.ExpectedException(
|
||||
npe.ZKLockException, "Did not get lock on .*"
|
||||
):
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
pass
|
||||
|
||||
def test_imageUploadNumberLock_exception_blocking(self):
|
||||
upload = zk.ImageUpload()
|
||||
upload.image_name = "ubuntu-trusty"
|
||||
upload.build_id = "0000000003"
|
||||
upload.provider_name = "providerA"
|
||||
upload.id = "0000000001"
|
||||
with self.zk.imageUploadNumberLock(upload, blocking=False):
|
||||
with testtools.ExpectedException(npe.TimeoutException):
|
||||
with self.zk.imageUploadNumberLock(
|
||||
upload, blocking=True, timeout=1
|
||||
):
|
||||
pass
|
||||
|
||||
def test_imageUploadLock(self):
|
||||
path = self.zk._imageUploadLockPath("ubuntu-trusty", "0000", "prov1")
|
||||
with self.zk.imageUploadLock("ubuntu-trusty", "0000", "prov1",
|
||||
|
|
|
@ -781,6 +781,12 @@ class ZooKeeper(object):
|
|||
return "%s/lock" % self._imageUploadPath(image, build_number,
|
||||
provider)
|
||||
|
||||
def _imageUploadNumberLockPath(self, image, build_number, provider,
|
||||
upload_number):
|
||||
return "%s/%s/lock" % (
|
||||
self._imageUploadPath(image, build_number, provider),
|
||||
upload_number)
|
||||
|
||||
def _launcherPath(self, launcher):
|
||||
return "%s/%s" % (self.LAUNCHER_ROOT, launcher)
|
||||
|
||||
|
@ -839,6 +845,29 @@ class ZooKeeper(object):
|
|||
|
||||
return lock
|
||||
|
||||
def _getImageUploadNumberLock(self, image, build_number, provider,
|
||||
upload_number, blocking=True, timeout=None):
|
||||
lock_path = self._imageUploadNumberLockPath(image, build_number,
|
||||
provider, upload_number)
|
||||
try:
|
||||
lock = Lock(self.client, lock_path)
|
||||
have_lock = lock.acquire(blocking, timeout)
|
||||
except kze.LockTimeout:
|
||||
raise npe.TimeoutException(
|
||||
"Timeout trying to acquire lock %s" % lock_path)
|
||||
except kze.NoNodeError:
|
||||
have_lock = False
|
||||
self.log.error("Image upload number %s not found for locking: "
|
||||
"%s, %s, %s",
|
||||
upload_number, build_number, provider, image)
|
||||
|
||||
# If we aren't blocking, it's possible we didn't get the lock
|
||||
# because someone else has it.
|
||||
if not have_lock:
|
||||
raise npe.ZKLockException("Did not get lock on %s" % lock_path)
|
||||
|
||||
return lock
|
||||
|
||||
def _getImageUploadLock(self, image, build_number, provider,
|
||||
blocking=True, timeout=None):
|
||||
lock_path = self._imageUploadLockPath(image, build_number, provider)
|
||||
|
@ -1040,7 +1069,7 @@ class ZooKeeper(object):
|
|||
def imageUploadLock(self, image, build_number, provider,
|
||||
blocking=True, timeout=None):
|
||||
'''
|
||||
Context manager to use for locking image builds.
|
||||
Context manager to use for locking image uploads.
|
||||
|
||||
Obtains a write lock for the specified image upload.
|
||||
|
||||
|
@ -1065,6 +1094,36 @@ class ZooKeeper(object):
|
|||
if lock:
|
||||
lock.release()
|
||||
|
||||
@contextmanager
|
||||
def imageUploadNumberLock(self, image_upload, blocking=True, timeout=None):
|
||||
'''
|
||||
Context manager to use for locking _specific_ image uploads.
|
||||
|
||||
Obtains a write lock for the specified image upload number.
|
||||
|
||||
:param ImageUpload image_upload: The object describing the upload
|
||||
to lock.
|
||||
:param bool blocking: Whether or not to block on trying to
|
||||
acquire the lock
|
||||
:param int timeout: When blocking, how long to wait for the lock
|
||||
to get acquired. None, the default, waits forever.
|
||||
|
||||
:raises: TimeoutException if we failed to acquire the lock when
|
||||
blocking with a timeout. ZKLockException if we are not blocking
|
||||
and could not get the lock, or a lock is already held.
|
||||
'''
|
||||
lock = None
|
||||
try:
|
||||
lock = self._getImageUploadNumberLock(image_upload.image_name,
|
||||
image_upload.build_id,
|
||||
image_upload.provider_name,
|
||||
image_upload.id,
|
||||
blocking, timeout)
|
||||
yield
|
||||
finally:
|
||||
if lock:
|
||||
lock.release()
|
||||
|
||||
def getImageNames(self):
|
||||
'''
|
||||
Retrieve the image names in Zookeeper.
|
||||
|
@ -1500,7 +1559,8 @@ class ZooKeeper(object):
|
|||
path = self._imageUploadPath(image, build_number, provider)
|
||||
path = path + "/%s" % upload_number
|
||||
try:
|
||||
self.client.delete(path)
|
||||
# NOTE: Need to do recursively to remove lock znodes
|
||||
self.client.delete(path, recursive=True)
|
||||
except kze.NoNodeError:
|
||||
pass
|
||||
|
||||
|
|
Loading…
Reference in New Issue