Retry applying platform-integ-apps

In rare cases, platform-integ-apps is automatically applied
or re-applied before the kubernetes core system pods are
ready, causing the apply to fail.

To fix this issue, we wrap the function that applies applications
in a retry decorator that retries only when a platform-integ-apps
specific exception is raised.

Change-Id: I6b0bf996658079e0c10871254c75045662ad9db4
Closes-bug: 1850189
Signed-off-by: Stefan Dinescu <stefan.dinescu@windriver.com>
This commit is contained in:
Stefan Dinescu 2019-11-27 16:10:42 +02:00
parent 04c338016d
commit 638487f67b
2 changed files with 25 additions and 0 deletions

View File

@ -1394,6 +1394,10 @@ class InvalidHelmNamespace(Invalid):
class InvalidHelmDockerImageSource(Invalid):
message = _("Invalid docker image source: %(source)s. Must be one of %(valid_srcs)s")
# Raised when applying a platform application fails; the message is
# formatted with the application name passed as the "name" keyword.
class PlatformApplicationApplyFailure(SysinvException):
message = _("Failed to apply %(name)s application.")
#
# Kubernetes related exceptions
#

View File

@ -38,6 +38,7 @@ from sysinv.common import constants
from sysinv.common import exception
from sysinv.common import kubernetes
from sysinv.common import image_versions
from sysinv.common.retrying import retry
from sysinv.common import utils as cutils
from sysinv.common.storage_backend_conf import K8RbdProvisioner
from sysinv.conductor import openstack
@ -1359,6 +1360,8 @@ class AppOperator(object):
"Chart %s from version %s" % (to_app.name, to_app.version,
chart.name, from_app.version))
@retry(retry_on_exception=lambda x: isinstance(x, exception.PlatformApplicationApplyFailure),
stop_max_attempt_number=5, wait_fixed=30 * 1000)
def _make_armada_request_with_monitor(self, app, request, overrides_str=None):
"""Initiate armada request with monitoring
@ -1460,6 +1463,13 @@ class AppOperator(object):
pass
# Body of the outer method
# This check is for cases where an abort is issued while
# this function waits between retries. In such cases, it
# should just return False
if AppOperator.is_app_aborted(app.name):
return False
mqueue = queue.Queue()
rc = True
logname = time.strftime(app.name + '-' + request + '_%Y-%m-%d-%H-%M-%S.log')
@ -1480,6 +1490,15 @@ class AppOperator(object):
_cleanup_armada_log(ARMADA_HOST_LOG_LOCATION, app.name, request)
mqueue.put('done')
monitor.kill()
# In case platform-integ-apps apply fails, we raise a specific exception
# to be caught by the retry decorator and attempt a re-apply
if (not rc and request == constants.APP_APPLY_OP and
app.name == constants.HELM_APP_PLATFORM and
not AppOperator.is_app_aborted(app.name)):
LOG.info("%s app failed applying. Retrying." % str(app.name))
raise exception.PlatformApplicationApplyFailure(name=app.name)
return rc
def _create_app_specific_resources(self, app_name):
@ -1608,6 +1627,8 @@ class AppOperator(object):
else:
rc = False
except exception.PlatformApplicationApplyFailure:
rc = False
except Exception as e:
# ie. patch report error, cleanup application files error
# helm release delete failure