Recover from pre-upgrade hook timeout

Implement logic to recover from Helm pre-upgrade hook timeout.

In order to successfully recover in this scenario, applications
need to be removed prior to retrying. A new constant was created
to store Helm error messages that can trigger this recovery logic.
More errors can be added to this constant in the future if needed.

The recovery logic leverages the already existing retry mechanism
triggered when an ApplicationApplyFailure exception is raised.

Test Plan:
PASS: build-pkgs -p sysinv
PASS: Add pre-upgrade hook to platform-integ-apps running
      "sleep 300" and set the Helm release timeout to 3
      minutes. Then rebuild package, update app, observe the
      hook timeout being triggered, delete flux pods, and
      watch the recovery logic successfully finish updating
      the app.
PASS: upload/apply/remove/delete unmodified snmp app

Closes-Bug: 2011850
Signed-off-by: Igor Soares <Igor.PiresSoares@windriver.com>
Change-Id: Ib2cf97ea728e8a9bec4559de04d3731f34f35f1b
This commit is contained in:
Igor Soares 2023-03-15 19:55:18 -04:00
parent e948a02f29
commit 2cba79c276
2 changed files with 51 additions and 0 deletions

View File

@ -1772,6 +1772,9 @@ FLUXCD_CRD_HELM_CHART_PLURAL = 'helmcharts'
# Helm release status error messages treated as recoverable.
# NOTE(review): the consumer of this list is not visible here - confirm
# how the entries are matched before adding new ones.
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS = [
    'Helm upgrade failed: another operation (install/upgrade/rollback) is in progress',
]

# Helm release status error messages that can only be recovered from by
# removing the application and applying it again. Entries are compared
# against the beginning of the reported error message.
FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED = [
    'Helm upgrade failed: pre-upgrade hooks failed: timed out waiting for the condition',
]
FLUXCD_RECOVERABLE_HELM_RELEASE_STATUS = [
'pending-install',
'pending-upgrade',
@ -2299,3 +2302,6 @@ NOT_CONFIGURABLE = 'not-configurable'
# apparmor states
APPARMOR_STATE_ENABLED = 'enabled'
APPARMOR_STATE_DISABLED = 'disabled'
# Method callers: identifiers passed as the "caller" argument of a method
# so that it can tell which code path invoked it.
# RECOVER_VIA_REMOVAL is passed to perform_app_apply() by the logic that
# recovers a failed helm release by removing and re-applying the app.
RECOVER_VIA_REMOVAL = 'recover_via_removal'

View File

@ -1806,6 +1806,42 @@ class AppOperator(object):
return False
def _recover_via_removal(release_name, release_err_msg):
    """Recover from a helm release error that requires app removal.

    The error message is compared against the entries of
    constants.FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED.
    For the first entry the message starts with, the app is removed,
    its status is set to APP_RECOVER_IN_PROGRESS, perform_app_apply is
    called with caller=RECOVER_VIA_REMOVAL and ApplicationApplyFailure
    is raised, which leverages the already existing retry mechanism.
    When no entry matches, nothing is done and the function returns.

    :param release_name: helm release name
    :param release_err_msg: helm error message
    :raises ApplicationApplyFailure: after a matching error was handled
    """

    def _hook_info_for(operation):
        # Build a lifecycle hook info object tagged with the operation
        hook_info = LifecycleHookInfo()
        hook_info.operation = operation
        return hook_info

    removal_required_errors = \
        constants.FLUXCD_RECOVERY_HELM_RELEASE_STATUS_ERRORS_REMOVAL_REQUIRED

    for known_error in removal_required_errors:
        if not release_err_msg.startswith(known_error):
            continue

        LOG.info("For helm release {} found a matching error string. "
                 "Application removal is required to recover from: {}"
                 "".format(release_name, release_err_msg))

        # The application has to be removed before it is applied again
        remove_hook_info = _hook_info_for(constants.APP_REMOVE_OP)
        self.perform_app_remove(app._kube_app, remove_hook_info)

        recovery_progress = "Recovering from: {}.".format(known_error)
        self._update_app_status(
            app, constants.APP_RECOVER_IN_PROGRESS, recovery_progress)

        apply_hook_info = _hook_info_for(constants.APP_APPLY_OP)
        self.perform_app_apply(
            app._kube_app,
            mode=None,
            lifecycle_hook_info_app_apply=apply_hook_info,
            caller=constants.RECOVER_VIA_REMOVAL)

        # Raising hands control over to the existing retry mechanism
        raise exception.ApplicationApplyFailure(name=app.name)
def _check_progress():
tadjust = 0
last_successful_chart = None
@ -1886,6 +1922,10 @@ class AppOperator(object):
flux_error_message=err_msg)
if not attempt:
# Handle corner cases in which application removal
# and apply are required to recover from failure
_recover_via_removal(release_name, err_msg)
LOG.exception("Application {}: release {}: Failed during {} :{}"
"".format(app.name, release_name, request, err_msg))
return False
@ -1926,6 +1966,8 @@ class AppOperator(object):
# check progress only for apply for now
if rc and request == constants.APP_APPLY_OP:
rc = _check_progress()
except (exception.ApplicationApplyFailure):
raise
except Exception as e:
# timeout or subprocess error
LOG.exception(e)
@ -3051,6 +3093,9 @@ class AppOperator(object):
if AppOperator.is_app_aborted(app.name):
raise exception.KubeAppAbort()
if caller == constants.RECOVER_VIA_REMOVAL:
return True
if self._make_app_request(app, constants.APP_APPLY_OP, overrides_str):
self._update_app_releases_version(app.name)
self._update_app_status(app,