Browse Source

Check for connectivity to the tiller postgres backend.

The existing code only checks that the pod(s) are 'Running', but that
may not be enough: the service inside the pod (postgres) might not yet
be able to accept connections.

Closes-Bug: 1923587
Signed-off-by: Andrei Grosu <andrei.grosu@windriver.com>
Change-Id: Ide49e4a38b805d5fc41d9f06d94393c69c6ed9d2
changes/21/786021/5
Andrei Grosu 2 months ago
parent
commit
5edd3bdbe5
2 changed files with 37 additions and 11 deletions
  1. +36
    -10
      sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py
  2. +1
    -1
      sysinv/sysinv/sysinv/sysinv/helm/utils.py

+ 36
- 10
sysinv/sysinv/sysinv/sysinv/conductor/kube_app.py View File

@ -3193,8 +3193,12 @@ class ArmadaHelper(object):
# Wait for armada to be ready for cmd execution.
# NOTE: make_armada_requests() also has retry mechanism
timeout = 30
while True:
TIMEOUT_DELTA = 5
TIMEOUT_SLEEP = 5
TIMEOUT_START_VALUE = 30
timeout = TIMEOUT_START_VALUE
while timeout > 0:
try:
pods = self._kube.kube_get_pods_by_selector(
ARMADA_NAMESPACE,
@ -3222,20 +3226,42 @@ class ArmadaHelper(object):
LOG.error("Failed to copy %s to %s, error: %s",
src, dest_dir, stderr)
raise RuntimeError('armada pod not ready')
else:
return True
return True
break
except Exception as e:
LOG.info("Could not get Armada service : %s " % e)
if timeout <= 0:
time.sleep(TIMEOUT_SLEEP)
timeout -= TIMEOUT_DELTA
if timeout <= 0:
LOG.error("Failed to get Armada service after {seconds} seconds.".
format(seconds=TIMEOUT_START_VALUE))
return False
# We don't need to loop through the code that checks the pod's status
# again. Once the previous loop exits with pod 'Running' we can test
# the connectivity to the tiller postgres backend:
timeout = TIMEOUT_START_VALUE
while timeout > 0:
try:
_ = helm_utils.retrieve_helm_releases()
break
time.sleep(5)
timeout -= 5
except exception.HelmTillerFailure:
LOG.warn("Could not query Helm/Tiller releases")
time.sleep(TIMEOUT_SLEEP)
timeout -= TIMEOUT_DELTA
continue
except Exception as ex:
LOG.error("Unhandled exception : {error}".format(error=str(ex)))
return False
LOG.error("Failed to get Armada service after 30 seconds.")
return False
if timeout <= 0:
LOG.error("Failed to query Helm/Tiller for {seconds} seconds.".
format(seconds=TIMEOUT_START_VALUE))
return False
return True
def stop_armada_request(self):
"""A simple way to cancel an on-going manifest apply/rollback/delete


+ 1
- 1
sysinv/sysinv/sysinv/sysinv/helm/utils.py View File

@ -1,6 +1,6 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
#
# Copyright (c) 2019 Wind River Systems, Inc.
# Copyright (c) 2019-2021 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#


Loading…
Cancel
Save