Merge "Recover from pre-upgrade hook timeout"

This commit is contained in:
Zuul 2023-03-23 16:10:16 +00:00 committed by Gerrit Code Review
commit 46c009d8d7
2 changed files with 51 additions and 0 deletions

View File

@ -1772,6 +1772,9 @@ FLUXCD_CRD_HELM_CHART_PLURAL = 'helmcharts'
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS = [
'Helm upgrade failed: another operation (install/upgrade/rollback) is in progress'
]
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED = [
'Helm upgrade failed: pre-upgrade hooks failed: timed out waiting for the condition'
]
FLUXCD_RECOVERABLE_HELM_RELEASE_STATUS = [
'pending-install',
'pending-upgrade',
@ -2299,3 +2302,6 @@ NOT_CONFIGURABLE = 'not-configurable'
# apparmor states
APPARMOR_STATE_ENABLED = 'enabled'
APPARMOR_STATE_DISABLED = 'disabled'
# Method callers
RECOVER_VIA_REMOVAL = 'recover_via_removal'

View File

@ -1806,6 +1806,42 @@ class AppOperator(object):
return False
def _recover_via_removal(release_name, release_err_msg):
""" Verify if a given helm release error can be recovered
by removing the app and applying it again.
This leverages the retry mechanism triggered when an
ApplicationApplyFailure exception is raised.
:param release_name: helm release name
:param released_err_msg: helm error message
"""
for error_string in \
constants.FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED:
if release_err_msg.startswith(error_string):
LOG.info("For helm release {} found a matching error string. "
"Application removal is required to recover from: {}"
"".format(release_name, release_err_msg))
lifecycle_hook_info_app_remove = LifecycleHookInfo()
lifecycle_hook_info_app_remove.operation = constants.APP_REMOVE_OP
self.perform_app_remove(app._kube_app, lifecycle_hook_info_app_remove)
progress_str = "Recovering from: {}.".format(error_string)
self._update_app_status(app,
constants.APP_RECOVER_IN_PROGRESS,
progress_str
)
lifecycle_hook_info_app_apply = LifecycleHookInfo()
lifecycle_hook_info_app_apply.operation = constants.APP_APPLY_OP
self.perform_app_apply(app._kube_app, mode=None,
lifecycle_hook_info_app_apply=lifecycle_hook_info_app_apply,
caller=constants.RECOVER_VIA_REMOVAL)
raise exception.ApplicationApplyFailure(name=app.name)
def _check_progress():
tadjust = 0
last_successful_chart = None
@ -1886,6 +1922,10 @@ class AppOperator(object):
flux_error_message=err_msg)
if not attempt:
# Handle corner cases in which application removal
# and apply are required to recover from failure
_recover_via_removal(release_name, err_msg)
LOG.exception("Application {}: release {}: Failed during {} :{}"
"".format(app.name, release_name, request, err_msg))
return False
@ -1926,6 +1966,8 @@ class AppOperator(object):
# check progress only for apply for now
if rc and request == constants.APP_APPLY_OP:
rc = _check_progress()
except (exception.ApplicationApplyFailure):
raise
except Exception as e:
# timeout or subprocess error
LOG.exception(e)
@ -3051,6 +3093,9 @@ class AppOperator(object):
if AppOperator.is_app_aborted(app.name):
raise exception.KubeAppAbort()
if caller == constants.RECOVER_VIA_REMOVAL:
return True
if self._make_app_request(app, constants.APP_APPLY_OP, overrides_str):
self._update_app_releases_version(app.name)
self._update_app_status(app,