add health query timeout for HA
This change prevents vault-manager from pausing for long periods when a configured vault server is not responsive. Use curl --connect-timeout for queries to vault server /sys/health. During HA recovery it is known that the server is non-responsive, so vault-manager should not wait the default time, which is reported as either 60s or 5m depending on the source consulted. It is observed that vault-manager appears to hang for long periods during HA recovery. Watching the $PVCDIR/pods.txt confirms that vault-manager is inactive for minutes at a time. This changes the default behavior to time out within 2 seconds during the HA recovery scenario. In addition to not waiting, the vault-manager log will show the 'sealed' status as an empty string when the query times out. Test Plan: PASS - vault ha 3 replicas PASS - vault 1 replica PASS - kubectl exec kill vault process PASS - kubectl delete vault pod PASS - short network downtime PASS - long network downtime PASS - rates including 1, 5 PASS - wait intervals including 0, 1, 3, 15 PASS - kubectl delete 2 vault pods PASS - kubectl delete 3 (all) vault pods Story: 2010393 Task: 47701 Change-Id: I4fd916033f6dd5210078126abb065393d25851cd Signed-off-by: Michel Thebeau <michel.thebeau@windriver.com>
This commit is contained in:
parent
02184560c5
commit
dc79220541
@ -1,17 +1,17 @@
|
||||
From ef953bc9a8f961c40fd8c6a051b04232ca1849c2 Mon Sep 17 00:00:00 2001
|
||||
From 3a687b597f91a7f344a22d5f63b24159880ee74f Mon Sep 17 00:00:00 2001
|
||||
From: Greg Waines <greg.waines@windriver.com>
|
||||
Date: Sat, 5 Nov 2022 20:14:58 -0400
|
||||
Subject: [PATCH] Add vault manager repository to values.yaml
|
||||
|
||||
---
|
||||
values.yaml | 24 ++++++++++++++++++++++++
|
||||
1 file changed, 24 insertions(+)
|
||||
values.yaml | 37 +++++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 37 insertions(+)
|
||||
|
||||
diff --git a/values.yaml b/values.yaml
|
||||
index 61af7b2..15f5287 100644
|
||||
index 61af7b2..aff058b 100644
|
||||
--- a/values.yaml
|
||||
+++ b/values.yaml
|
||||
@@ -24,6 +24,30 @@ global:
|
||||
@@ -24,6 +24,43 @@ global:
|
||||
seccomp.security.alpha.kubernetes.io/defaultProfileName: runtime/default
|
||||
apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
|
||||
|
||||
@ -38,6 +38,19 @@ index 61af7b2..15f5287 100644
|
||||
+ # Default is 5 s/interval * 3 intervals == 15 seconds.
|
||||
+ #
|
||||
+ unsealWaitIntervals: 3
|
||||
+
|
||||
+ # Network timeout for queries to vault server /sys/health endpoint
|
||||
+ #
|
||||
+ # The maximum time in seconds to wait for a server to respond to
|
||||
+ # health query. This applies for the HA recovery situations, not the
|
||||
+ # initialization of vault cluster. Leaving the value unset is not
|
||||
+ # recommended; curl then uses its default --connect-timeout of 60 seconds.
|
||||
+ #
|
||||
+ # vault-manager will appear to hang if healthQueryTimeout is
|
||||
+ # over-large. This setting affects the logs, since vault-manager will
|
||||
+ # issue a log when the 'sealed' status toggles between true/false and
|
||||
+ # the 'unknown' value.
|
||||
+ healthQueryTimeout: 2
|
||||
+
|
||||
injector:
|
||||
# True if you want to enable vault agent injection.
|
||||
|
@ -19,6 +19,9 @@ data:
|
||||
PODREC_F="$WORKDIR/previous_pods_status.txt"
|
||||
PODREC_TMP_F="$WORKDIR/new_pods_status.txt"
|
||||
|
||||
# Vault server health query timeout during HA recovery scenario
|
||||
QUERY_TMOUT={{ .Values.manager.healthQueryTimeout }}
|
||||
|
||||
STATEFULSET_RATE=5
|
||||
INIT_CONVERGE_TIME=10
|
||||
JOIN_RATE=5
|
||||
@ -42,6 +45,11 @@ data:
|
||||
echo "$(date +%Y-%m-%dT%H-%M-%S) $@"
|
||||
}
|
||||
|
||||
if ! [[ "$QUERY_TMOUT" =~ ^[0-9]+$ ]]; then
|
||||
log ".Values.manager.healthQueryTimeout not an integer"
|
||||
QUERY_TMOUT=""
|
||||
fi
|
||||
|
||||
# Creates a list of all k8s vault pods and stores in text file.
|
||||
# Converts ips from X.X.X.X or a:b:c::d to X-X-X-X for use as pod
|
||||
# dns names
|
||||
@ -130,7 +138,13 @@ data:
|
||||
# Simply calls the status check of a vault, used to check if it is
|
||||
# initialized, unsealed, or part of raft cluster
|
||||
function vaultServerStatus {
|
||||
curl --cacert $CERT -s https://$row.$DOMAIN:8200/v1/sys/health
|
||||
local tmout=""
|
||||
|
||||
if [ -n "$1" ]; then
|
||||
tmout="--connect-timeout $1"
|
||||
fi
|
||||
curl $tmout --cacert $CERT -s \
|
||||
https://$row.$DOMAIN:8200/v1/sys/health
|
||||
}
|
||||
|
||||
function runStateMachine {
|
||||
@ -277,7 +291,7 @@ data:
|
||||
log "pod list has empty data: [$host] [$row]"
|
||||
continue
|
||||
fi
|
||||
vaultServerStatus > $WORKDIR/healthcheck.txt
|
||||
vaultServerStatus $QUERY_TMOUT > $WORKDIR/healthcheck.txt
|
||||
TEMP=$(cat $WORKDIR/healthcheck.txt | jq -r .sealed)
|
||||
|
||||
# Decide when to unseal the vault server; includes
|
||||
|
Loading…
Reference in New Issue
Block a user