Check for connectivity to the tiller postgres backend.

The existing code only checks that the pod(s) are 'Running', but that
might not be enough: the service inside the pod (postgres) might not
yet be able to accept connections.

Closes-Bug: 1923587
Signed-off-by: Andrei Grosu <andrei.grosu@windriver.com>
Change-Id: Ide49e4a38b805d5fc41d9f06d94393c69c6ed9d2
This commit is contained in:
Andrei Grosu 2021-04-13 08:52:40 +00:00
parent a40a3bd892
commit 5edd3bdbe5
2 changed files with 38 additions and 12 deletions

View File

@ -3193,8 +3193,12 @@ class ArmadaHelper(object):
# Wait for armada to be ready for cmd execution. # Wait for armada to be ready for cmd execution.
# NOTE: make_armada_requests() also has retry mechanism # NOTE: make_armada_requests() also has retry mechanism
timeout = 30 TIMEOUT_DELTA = 5
while True: TIMEOUT_SLEEP = 5
TIMEOUT_START_VALUE = 30
timeout = TIMEOUT_START_VALUE
while timeout > 0:
try: try:
pods = self._kube.kube_get_pods_by_selector( pods = self._kube.kube_get_pods_by_selector(
ARMADA_NAMESPACE, ARMADA_NAMESPACE,
@ -3222,20 +3226,42 @@ class ArmadaHelper(object):
LOG.error("Failed to copy %s to %s, error: %s", LOG.error("Failed to copy %s to %s, error: %s",
src, dest_dir, stderr) src, dest_dir, stderr)
raise RuntimeError('armada pod not ready') raise RuntimeError('armada pod not ready')
else: break
return True
return True
except Exception as e: except Exception as e:
LOG.info("Could not get Armada service : %s " % e) LOG.info("Could not get Armada service : %s " % e)
if timeout <= 0: time.sleep(TIMEOUT_SLEEP)
break timeout -= TIMEOUT_DELTA
time.sleep(5)
timeout -= 5
LOG.error("Failed to get Armada service after 30 seconds.") if timeout <= 0:
return False LOG.error("Failed to get Armada service after {seconds} seconds.".
format(seconds=TIMEOUT_START_VALUE))
return False
# We don't need to loop through the code that checks the pod's status
# again. Once the previous loop exits with pod 'Running' we can test
# the connectivity to the tiller postgres backend:
timeout = TIMEOUT_START_VALUE
while timeout > 0:
try:
_ = helm_utils.retrieve_helm_releases()
break
except exception.HelmTillerFailure:
LOG.warn("Could not query Helm/Tiller releases")
time.sleep(TIMEOUT_SLEEP)
timeout -= TIMEOUT_DELTA
continue
except Exception as ex:
LOG.error("Unhandled exception : {error}".format(error=str(ex)))
return False
if timeout <= 0:
LOG.error("Failed to query Helm/Tiller for {seconds} seconds.".
format(seconds=TIMEOUT_START_VALUE))
return False
return True
def stop_armada_request(self): def stop_armada_request(self):
"""A simple way to cancel an on-going manifest apply/rollback/delete """A simple way to cancel an on-going manifest apply/rollback/delete

View File

@ -1,6 +1,6 @@
# sim: tabstop=4 shiftwidth=4 softtabstop=4 # sim: tabstop=4 shiftwidth=4 softtabstop=4
# #
# Copyright (c) 2019 Wind River Systems, Inc. # Copyright (c) 2019-2021 Wind River Systems, Inc.
# #
# SPDX-License-Identifier: Apache-2.0 # SPDX-License-Identifier: Apache-2.0
# #