From 5edd3bdbe588e2c2e7a58cb839f030305613c30f Mon Sep 17 00:00:00 2001 From: Andrei Grosu Date: Tue, 13 Apr 2021 08:52:40 +0000 Subject: [PATCH] Check for connectivity to the tiller postgres backend. The existing code checks that the pod(s) are 'Running' but that might not be enough as the service inside the pod (postgres) might not be able to accept connections. Closes-Bug: 1923587 Signed-off-by: Andrei Grosu Change-Id: Ide49e4a38b805d5fc41d9f06d94393c69c6ed9d2 --- .../sysinv/sysinv/conductor/kube_app.py | 48 ++++++++++++++----- sysinv/sysinv/sysinv/sysinv/helm/utils.py | 2 +- 2 files changed, 38 insertions(+), 12 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py index a60118977a..e7e7039f69 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py @@ -3193,8 +3193,12 @@ class ArmadaHelper(object): # Wait for armada to be ready for cmd execution. # NOTE: make_armada_requests() also has retry mechanism - timeout = 30 - while True: + TIMEOUT_DELTA = 5 + TIMEOUT_SLEEP = 5 + TIMEOUT_START_VALUE = 30 + + timeout = TIMEOUT_START_VALUE + while timeout > 0: try: pods = self._kube.kube_get_pods_by_selector( ARMADA_NAMESPACE, @@ -3222,20 +3226,42 @@ class ArmadaHelper(object): LOG.error("Failed to copy %s to %s, error: %s", src, dest_dir, stderr) raise RuntimeError('armada pod not ready') - else: - return True - return True + break except Exception as e: LOG.info("Could not get Armada service : %s " % e) - if timeout <= 0: - break - time.sleep(5) - timeout -= 5 + time.sleep(TIMEOUT_SLEEP) + timeout -= TIMEOUT_DELTA - LOG.error("Failed to get Armada service after 30 seconds.") - return False + if timeout <= 0: + LOG.error("Failed to get Armada service after {seconds} seconds.". + format(seconds=TIMEOUT_START_VALUE)) + return False + + # We don't need to loop through the code that checks the pod's status + # again. Once the previous loop exits with pod 'Running' we can test + # the connectivity to the tiller postgres backend: + timeout = TIMEOUT_START_VALUE + while timeout > 0: + try: + _ = helm_utils.retrieve_helm_releases() + break + except exception.HelmTillerFailure: + LOG.warn("Could not query Helm/Tiller releases") + time.sleep(TIMEOUT_SLEEP) + timeout -= TIMEOUT_DELTA + continue + except Exception as ex: + LOG.error("Unhandled exception : {error}".format(error=str(ex))) + return False + + if timeout <= 0: + LOG.error("Failed to query Helm/Tiller for {seconds} seconds.". + format(seconds=TIMEOUT_START_VALUE)) + return False + + return True def stop_armada_request(self): """A simple way to cancel an on-going manifest apply/rollback/delete diff --git a/sysinv/sysinv/sysinv/sysinv/helm/utils.py b/sysinv/sysinv/sysinv/sysinv/helm/utils.py index 424b203e3d..cd256a0b73 100644 --- a/sysinv/sysinv/sysinv/sysinv/helm/utils.py +++ b/sysinv/sysinv/sysinv/sysinv/helm/utils.py @@ -1,6 +1,6 @@ # sim: tabstop=4 shiftwidth=4 softtabstop=4 # -# Copyright (c) 2019 Wind River Systems, Inc. +# Copyright (c) 2019-2021 Wind River Systems, Inc. # # SPDX-License-Identifier: Apache-2.0 #