Recover from pre-upgrade hook timeout
Implement logic to recover from Helm pre-upgrade hook timeout. In order to successfully recover in this scenario, applications need to be removed prior to retrying. A new constant was created to store Helm error messages that can trigger this recovery logic. More errors can be added to this constant in the future if needed. The recovery logic leverages the already existing retry mechanism triggered when an ApplicationApplyFailure exception is raised. Test Plan: PASS: build-pkgs -p sysinv PASS: Add pre-upgrade hook to platform-integ-apps running "sleep 300" and set the Helm release timeout to 3 minutes. Then rebuild package, update app, observe the hook timeout being triggered, delete flux pods, and watch the recovery logic successfully finish updating the app. PASS: upload/apply/remove/delete unmodified snmp app Closes-Bug: 2011850 Signed-off-by: Igor Soares <Igor.PiresSoares@windriver.com> Change-Id: Ib2cf97ea728e8a9bec4559de04d3731f34f35f1b
This commit is contained in:
parent
e948a02f29
commit
2cba79c276
|
@ -1772,6 +1772,9 @@ FLUXCD_CRD_HELM_CHART_PLURAL = 'helmcharts'
|
|||
# Helm release error messages treated as recoverable.
# NOTE(review): the consumer of this list is not shown in this hunk; the
# description above is taken from the constant's name -- confirm.
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS = [
|
||||
'Helm upgrade failed: another operation (install/upgrade/rollback) is in progress'
|
||||
]
|
||||
# Helm release error messages that can only be recovered from by removing
# the application before retrying the apply (currently the pre-upgrade hook
# timeout). Entries are matched as prefixes of the reported error message,
# and more messages can be appended here if other failures need the same
# treatment.
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED = [
|
||||
'Helm upgrade failed: pre-upgrade hooks failed: timed out waiting for the condition'
|
||||
]
|
||||
FLUXCD_RECOVERABLE_HELM_RELEASE_STATUS = [
|
||||
'pending-install',
|
||||
'pending-upgrade',
|
||||
|
@ -2299,3 +2302,6 @@ NOT_CONFIGURABLE = 'not-configurable'
|
|||
# apparmor states
|
||||
APPARMOR_STATE_ENABLED = 'enabled'
|
||||
APPARMOR_STATE_DISABLED = 'disabled'
|
||||
|
||||
# Method callers: identifiers passed as the `caller` argument so that the
# callee can tell which code path invoked it.
|
||||
# Passed by the recover-via-removal logic when it re-applies an application.
RECOVER_VIA_REMOVAL = 'recover_via_removal'
|
||||
|
|
|
@ -1806,6 +1806,42 @@ class AppOperator(object):
|
|||
|
||||
return False
|
||||
|
||||
def _recover_via_removal(release_name, release_err_msg):
|
||||
""" Verify if a given helm release error can be recovered
|
||||
by removing the app and applying it again.
|
||||
|
||||
This leverages the retry mechanism triggered when an
|
||||
ApplicationApplyFailure exception is raised.
|
||||
|
||||
:param release_name: helm release name
|
||||
:param release_err_msg: helm error message
:raises ApplicationApplyFailure: raised to hand control to the retry
mechanism. NOTE(review): indentation is not preserved in this view;
presumably only raised after a matching error has been handled --
confirm against the original file.
|
||||
"""
|
||||
|
||||
# `self` and `app` are not parameters of this helper; both are taken from
# the enclosing scope.
for error_string in \
|
||||
constants.FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED:
|
||||
# Prefix match: the reported message only has to start with a known
# error string.
if release_err_msg.startswith(error_string):
|
||||
LOG.info("For helm release {} found a matching error string. "
|
||||
"Application removal is required to recover from: {}"
|
||||
"".format(release_name, release_err_msg))
|
||||
|
||||
# Step 1: remove the application.
lifecycle_hook_info_app_remove = LifecycleHookInfo()
|
||||
lifecycle_hook_info_app_remove.operation = constants.APP_REMOVE_OP
|
||||
self.perform_app_remove(app._kube_app, lifecycle_hook_info_app_remove)
|
||||
|
||||
# Step 2: flag the application as recovering, quoting the matched
# (constant) error string rather than the full reported message.
progress_str = "Recovering from: {}.".format(error_string)
|
||||
self._update_app_status(app,
|
||||
constants.APP_RECOVER_IN_PROGRESS,
|
||||
progress_str
|
||||
)
|
||||
|
||||
# Step 3: call the apply path tagged with RECOVER_VIA_REMOVAL.
# NOTE(review): the apply-path hunk in this diff returns True when
# caller == RECOVER_VIA_REMOVAL before reaching _make_app_request, so
# this call presumably only performs the preparation steps -- confirm.
lifecycle_hook_info_app_apply = LifecycleHookInfo()
|
||||
lifecycle_hook_info_app_apply.operation = constants.APP_APPLY_OP
|
||||
self.perform_app_apply(app._kube_app, mode=None,
|
||||
lifecycle_hook_info_app_apply=lifecycle_hook_info_app_apply,
|
||||
caller=constants.RECOVER_VIA_REMOVAL)
|
||||
|
||||
# Step 4: raise so that the existing retry mechanism re-runs the apply.
raise exception.ApplicationApplyFailure(name=app.name)
|
||||
|
||||
def _check_progress():
|
||||
tadjust = 0
|
||||
last_successful_chart = None
|
||||
|
@ -1886,6 +1922,10 @@ class AppOperator(object):
|
|||
flux_error_message=err_msg)
|
||||
|
||||
if not attempt:
|
||||
# Handle corner cases in which application removal
|
||||
# and apply are required to recover from failure
|
||||
_recover_via_removal(release_name, err_msg)
|
||||
|
||||
LOG.exception("Application {}: release {}: Failed during {} :{}"
|
||||
"".format(app.name, release_name, request, err_msg))
|
||||
return False
|
||||
|
@ -1926,6 +1966,8 @@ class AppOperator(object):
|
|||
# check progress only for apply for now
|
||||
if rc and request == constants.APP_APPLY_OP:
|
||||
rc = _check_progress()
|
||||
except (exception.ApplicationApplyFailure):
|
||||
raise
|
||||
except Exception as e:
|
||||
# timeout or subprocess error
|
||||
LOG.exception(e)
|
||||
|
@ -3051,6 +3093,9 @@ class AppOperator(object):
|
|||
if AppOperator.is_app_aborted(app.name):
|
||||
raise exception.KubeAppAbort()
|
||||
|
||||
if caller == constants.RECOVER_VIA_REMOVAL:
|
||||
return True
|
||||
|
||||
if self._make_app_request(app, constants.APP_APPLY_OP, overrides_str):
|
||||
self._update_app_releases_version(app.name)
|
||||
self._update_app_status(app,
|
||||
|
|
Loading…
Reference in New Issue