Mitigate config disagreement issues
Because the config file can change on us at any point, and each thread carries around its own copy of it, it's possible that a thread can be using an outdated version while in the middle of its work. This can cause unexpected behavior, like uploading an image to a provider when it is currently being deleted (leaving an undeleted upload). Until we can solve the deeper issue of config file syncing, we can reduce probability of this delete/upload problem by adding a check for the build status _just_ before an upload attempt, and by reloading our config after we spent time trying to upload to a provider (which can take quite a while). Also removes a TODO that was TODONE. Change-Id: Id066361792d356cf7f22b9157092e364a848c8e1
This commit is contained in:
parent
fd296c868b
commit
76f3617e24
|
@ -797,6 +797,19 @@ class UploadWorker(BaseWorker):
|
|||
self.log = logging.getLogger("nodepool.builder.UploadWorker.%s" % name)
|
||||
self.name = 'UploadWorker.%s' % name
|
||||
|
||||
def _reloadConfig(self):
|
||||
'''
|
||||
Reload the nodepool configuration file.
|
||||
'''
|
||||
new_config = nodepool_config.loadConfig(self._config_path)
|
||||
if not self._config:
|
||||
self._config = new_config
|
||||
|
||||
self._checkForZooKeeperChanges(new_config)
|
||||
provider_manager.ProviderManager.reconfigure(self._config, new_config,
|
||||
use_taskmanager=False)
|
||||
self._config = new_config
|
||||
|
||||
def _uploadImage(self, build_id, upload_id, image_name, images, provider):
|
||||
'''
|
||||
Upload a local DIB image build to a provider.
|
||||
|
@ -891,17 +904,26 @@ class UploadWorker(BaseWorker):
|
|||
'''
|
||||
for provider in self._config.providers.values():
|
||||
for image in provider.images.values():
|
||||
uploaded = False
|
||||
|
||||
# Check if we've been told to shutdown
|
||||
# or if ZK connection is suspended
|
||||
if not self.running or self._zk.suspended or self._zk.lost:
|
||||
return
|
||||
try:
|
||||
self._checkProviderImageUpload(provider, image)
|
||||
uploaded = self._checkProviderImageUpload(provider, image)
|
||||
except Exception:
|
||||
self.log.exception("Error uploading image %s "
|
||||
"to provider %s:",
|
||||
image.name, provider.name)
|
||||
|
||||
# NOTE: Due to the configuration file disagreement issue
|
||||
# (the copy we have may not be current), if we took the time
|
||||
# to attempt to upload an image, let's short-circuit this loop
|
||||
# to give us a chance to reload the configuration file.
|
||||
if uploaded:
|
||||
return
|
||||
|
||||
def _checkProviderImageUpload(self, provider, image):
|
||||
'''
|
||||
The main body of _checkForProviderUploads. This encapsulates
|
||||
|
@ -909,18 +931,18 @@ class UploadWorker(BaseWorker):
|
|||
and performing the upload. It is a separate function so that
|
||||
exception handling can treat all provider-image uploads
|
||||
indepedently.
|
||||
'''
|
||||
# TODO(jeblair): check for pause here
|
||||
|
||||
:returns: True if an upload was attempted, False otherwise.
|
||||
'''
|
||||
# Check if image uploads are paused.
|
||||
if provider.images.get(image.name).pause:
|
||||
return
|
||||
return False
|
||||
|
||||
# Search for the most recent 'ready' image build
|
||||
builds = self._zk.getMostRecentBuilds(1, image.name,
|
||||
zk.READY)
|
||||
if not builds:
|
||||
return
|
||||
return False
|
||||
|
||||
build = builds[0]
|
||||
|
||||
|
@ -929,17 +951,17 @@ class UploadWorker(BaseWorker):
|
|||
local_images = DibImageFile.from_image_id(
|
||||
self._config.imagesdir, "-".join([image.name, build.id]))
|
||||
if not local_images:
|
||||
return
|
||||
return False
|
||||
|
||||
# See if this image has already been uploaded
|
||||
upload = self._zk.getMostRecentBuildImageUploads(
|
||||
1, image.name, build.id, provider.name, zk.READY)
|
||||
if upload:
|
||||
return
|
||||
return False
|
||||
|
||||
# See if this provider supports the available image formats
|
||||
if provider.image_type not in build.formats:
|
||||
return
|
||||
return False
|
||||
|
||||
try:
|
||||
with self._zk.imageUploadLock(
|
||||
|
@ -951,7 +973,15 @@ class UploadWorker(BaseWorker):
|
|||
upload = self._zk.getMostRecentBuildImageUploads(
|
||||
1, image.name, build.id, provider.name, zk.READY)
|
||||
if upload:
|
||||
return
|
||||
return False
|
||||
|
||||
# NOTE: Due to the configuration file disagreement issue
|
||||
# (the copy we have may not be current), we try to verify
|
||||
# that another thread isn't trying to delete this build just
|
||||
# before we upload.
|
||||
b = self._zk.getBuild(image.name, build.id)
|
||||
if b.state == zk.DELETING:
|
||||
return False
|
||||
|
||||
# New upload number with initial state 'uploading'
|
||||
data = zk.ImageUpload()
|
||||
|
@ -965,9 +995,10 @@ class UploadWorker(BaseWorker):
|
|||
# Set final state
|
||||
self._zk.storeImageUpload(image.name, build.id,
|
||||
provider.name, data, upnum)
|
||||
return True
|
||||
except exceptions.ZKLockException:
|
||||
# Lock is already held. Skip it.
|
||||
pass
|
||||
return False
|
||||
|
||||
def run(self):
|
||||
|
||||
|
@ -982,7 +1013,8 @@ class UploadWorker(BaseWorker):
|
|||
time.sleep(SUSPEND_WAIT_TIME)
|
||||
|
||||
try:
|
||||
self._run()
|
||||
self._reloadConfig()
|
||||
self._checkForProviderUploads()
|
||||
except Exception:
|
||||
self.log.exception("Exception in UploadWorker:")
|
||||
time.sleep(10)
|
||||
|
@ -991,21 +1023,6 @@ class UploadWorker(BaseWorker):
|
|||
|
||||
provider_manager.ProviderManager.stopProviders(self._config)
|
||||
|
||||
def _run(self):
|
||||
'''
|
||||
Body of run method for exception handling purposes.
|
||||
'''
|
||||
new_config = nodepool_config.loadConfig(self._config_path)
|
||||
if not self._config:
|
||||
self._config = new_config
|
||||
|
||||
self._checkForZooKeeperChanges(new_config)
|
||||
provider_manager.ProviderManager.reconfigure(self._config, new_config,
|
||||
use_taskmanager=False)
|
||||
self._config = new_config
|
||||
|
||||
self._checkForProviderUploads()
|
||||
|
||||
|
||||
class NodePoolBuilder(object):
|
||||
'''
|
||||
|
|
Loading…
Reference in New Issue