Merge "Remediate releases stuck in non-DEPLOYED statuses"

This commit is contained in:
Zuul 2019-01-28 18:15:50 +00:00 committed by Gerrit Code Review
commit 52f29ddf73
4 changed files with 89 additions and 43 deletions

View File

@ -31,14 +31,15 @@ class ArmadaTimeoutException(ArmadaException):
class ProtectedReleaseException(ArmadaException):
'''
Exception that occurs when Armada encounters a FAILED release that is
designated `protected` in the Chart and `continue_processing` is False.
Exception that occurs when Armada encounters a release with status other
than DEPLOYED that is designated `protected` in the Chart and
`continue_processing` is False.
'''
def __init__(self, reason):
def __init__(self, release, status):
self._message = (
'Armada encountered protected release %s in FAILED status' %
reason)
'Armada encountered protected release {} in {} status'.format(
release, status))
super(ProtectedReleaseException, self).__init__(self._message)
@ -88,13 +89,16 @@ class WaitException(ArmadaException):
super(WaitException, self).__init__(message)
class UnexpectedReleaseStatusException(ArmadaException):
class DeploymentLikelyPendingException(ArmadaException):
'''
Exception that occurs when armada encounters an existing release for a
chart with an unexpected status which armada does not know what to do with.
Exception that occurs when it is detected that an existing release
operation (e.g. install, update, rollback, delete) is likely still pending.
'''
def __init__(self, release_name, status):
self._message = "Found release {} in unexpected status {}".format(
release_name, status)
super(UnexpectedReleaseStatusException, self).__init__(self._message)
def __init__(self, release, status, last_deployment_age, timeout):
self._message = (
'Existing deployment likely pending '
'release={}, status={}, '
'(last deployment age={}s) < (chart wait timeout={}s)'.format(
release, status, last_deployment_age, timeout))
super(DeploymentLikelyPendingException, self).__init__(self._message)

View File

@ -52,19 +52,12 @@ class ChartDeploy(object):
result = {}
protected = chart.get('protected', {})
p_continue = protected.get('continue_processing', False)
old_release = self.find_chart_release(known_releases, release_name)
status = None
if old_release:
status = r.get_release_status(old_release)
if status not in [const.STATUS_FAILED, const.STATUS_DEPLOYED]:
raise armada_exceptions.UnexpectedReleaseStatusException(
release_name, status)
chart_wait = ChartWait(
self.tiller.k8s,
release_name,
@ -82,29 +75,6 @@ class ChartDeploy(object):
chartbuilder = ChartBuilder(chart)
new_chart = chartbuilder.get_helm_chart()
# Check for existing FAILED release, and purge
if status == const.STATUS_FAILED:
LOG.info('Purging FAILED release %s before deployment.',
release_name)
if protected:
if p_continue:
LOG.warn(
'Release %s is `protected`, '
'continue_processing=True. Operator must '
'handle FAILED release manually.', release_name)
result['protected'] = release_name
return result
else:
LOG.error(
'Release %s is `protected`, '
'continue_processing=False.', release_name)
raise armada_exceptions.ProtectedReleaseException(
release_name)
else:
# Purge the release
self.tiller.uninstall_release(release_name)
result['purge'] = release_name
# TODO(mark-burnett): It may be more robust to directly call
# tiller status to decide whether to install/upgrade rather
# than checking for list membership.
@ -181,6 +151,62 @@ class ChartDeploy(object):
tiller_result.__dict__)
result['upgrade'] = release_name
else:
# Check for release with status other than DEPLOYED
if status:
if status != const.STATUS_FAILED:
LOG.warn(
'Unexpected release status encountered '
'release=%s, status=%s', release_name, status)
# Make best effort to determine whether a deployment is
# likely pending, by checking if the last deployment
# was started within the timeout window of the chart.
last_deployment_age = r.get_last_deployment_age(
old_release)
wait_timeout = chart_wait.get_timeout()
likely_pending = last_deployment_age <= wait_timeout
if likely_pending:
# Give up if a deployment is likely pending, we do not
# want to have multiple operations going on for the
# same release at the same time.
raise armada_exceptions.\
DeploymentLikelyPendingException(
release_name, status, last_deployment_age,
wait_timeout)
else:
# Release is likely stuck in an unintended (by tiller)
# state. Log and continue on with remediation steps
# below.
LOG.info(
'Old release %s likely stuck in status %s, '
'(last deployment age=%ss) >= '
'(chart wait timeout=%ss)', release, status,
last_deployment_age, wait_timeout)
protected = chart.get('protected', {})
if protected:
p_continue = protected.get('continue_processing', False)
if p_continue:
LOG.warn(
'Release %s is `protected`, '
'continue_processing=True. Operator must '
'handle %s release manually.', release_name,
status)
result['protected'] = release_name
return result
else:
LOG.error(
'Release %s is `protected`, '
'continue_processing=False.', release_name)
raise armada_exceptions.ProtectedReleaseException(
release_name, status)
else:
# Purge the release
LOG.info('Purging release %s with status %s', release_name,
status)
self.tiller.uninstall_release(release_name)
result['purge'] = release_name
timer = int(round(deadline - time.time()))
LOG.info(
"Installing release %s in namespace %s, wait=%s, "

View File

@ -14,6 +14,8 @@
from armada.handlers.test import Test
import time
def release_prefixer(prefix, release):
'''
@ -53,3 +55,17 @@ def get_last_test_result(release):
if not status.HasField('last_test_suite_run'):
return None
return Test.get_test_suite_run_success(status.last_test_suite_run)
def get_last_deployment_age(release):
    """
    Compute how long ago the given release was last deployed.

    The age is the current wall-clock time, truncated to whole seconds,
    minus the ``seconds`` value read from ``release.info.last_deployed``.

    :param release: protobuf release object
    :return: age in seconds of last deployment of release
    """
    deployed_at = release.info.last_deployed.seconds
    return int(time.time()) - deployed_at

View File

@ -47,7 +47,7 @@ Armada Exceptions
:show-inheritance:
:undoc-members:
.. autoexception:: armada.exceptions.armada_exceptions.UnexpectedReleaseStatusException
.. autoexception:: armada.exceptions.armada_exceptions.DeploymentLikelyPendingException
:members:
:show-inheritance:
:undoc-members: