add health query timeout for HA
This change causes vault-manager to not pause for long periods when a configured vault server is not responsive. Use curl --connect-timeout for queries to vault server /sys/health. During HA recovery it is known that the server is non-responsive, so vault-manager should not wait the default time, which is 60s or 5m depending on the Google search result. It is observed that vault-manager appears to hang for long periods during HA recovery. Watching the $PVCDIR/pods.txt confirms that vault-manager is inactive for minutes at a time. This changes the default behavior to time out within 2 seconds during the HA recovery scenario. In addition to not waiting, the vault-manager log will show the 'sealed' status as an empty string when the query times out. Test Plan: PASS - vault ha 3 replicas PASS - vault 1 replica PASS - kubectl exec kill vault process PASS - kubectl delete vault pod PASS - short network downtime PASS - long network downtime PASS - rates including 1, 5 PASS - wait intervals including 0, 1, 3, 15 PASS - kubectl delete 2 vault pods PASS - kubectl delete 3 (all) vault pods Story: 2010393 Task: 47701 Change-Id: I4fd916033f6dd5210078126abb065393d25851cd Signed-off-by: Michel Thebeau <michel.thebeau@windriver.com>
This commit is contained in:
parent
02184560c5
commit
dc79220541
@ -1,17 +1,17 @@
|
|||||||
From ef953bc9a8f961c40fd8c6a051b04232ca1849c2 Mon Sep 17 00:00:00 2001
|
From 3a687b597f91a7f344a22d5f63b24159880ee74f Mon Sep 17 00:00:00 2001
|
||||||
From: Greg Waines <greg.waines@windriver.com>
|
From: Greg Waines <greg.waines@windriver.com>
|
||||||
Date: Sat, 5 Nov 2022 20:14:58 -0400
|
Date: Sat, 5 Nov 2022 20:14:58 -0400
|
||||||
Subject: [PATCH] Add vault manager repository to values.yaml
|
Subject: [PATCH] Add vault manager repository to values.yaml
|
||||||
|
|
||||||
---
|
---
|
||||||
values.yaml | 24 ++++++++++++++++++++++++
|
values.yaml | 37 +++++++++++++++++++++++++++++++++++++
|
||||||
1 file changed, 24 insertions(+)
|
1 file changed, 37 insertions(+)
|
||||||
|
|
||||||
diff --git a/values.yaml b/values.yaml
|
diff --git a/values.yaml b/values.yaml
|
||||||
index 61af7b2..15f5287 100644
|
index 61af7b2..aff058b 100644
|
||||||
--- a/values.yaml
|
--- a/values.yaml
|
||||||
+++ b/values.yaml
|
+++ b/values.yaml
|
||||||
@@ -24,6 +24,30 @@ global:
|
@@ -24,6 +24,43 @@ global:
|
||||||
seccomp.security.alpha.kubernetes.io/defaultProfileName: runtime/default
|
seccomp.security.alpha.kubernetes.io/defaultProfileName: runtime/default
|
||||||
apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
|
apparmor.security.beta.kubernetes.io/defaultProfileName: runtime/default
|
||||||
|
|
||||||
@ -38,6 +38,19 @@ index 61af7b2..15f5287 100644
|
|||||||
+ # Default is 5 s/interval * 3 intervals == 15 seconds.
|
+ # Default is 5 s/interval * 3 intervals == 15 seconds.
|
||||||
+ #
|
+ #
|
||||||
+ unsealWaitIntervals: 3
|
+ unsealWaitIntervals: 3
|
||||||
|
+
|
||||||
|
+ # Network timeout for queries to vault server /sys/health endpoint
|
||||||
|
+ #
|
||||||
|
+ # The maximum time in seconds to wait for a server to respond to
|
||||||
|
+ # health query. This applies for the HA recovery situations, not the
|
||||||
|
+ # initialization of vault cluster. Unsetting the value is not
|
||||||
|
+ # recommended; curl then uses its default connect timeout of 300 seconds.
|
||||||
|
+ #
|
||||||
|
+ # vault-manager will appear to hang if healthQueryTimeout is
|
||||||
|
+ # over-large. This setting affects the logs, since vault-manager will
|
||||||
|
+ # issue a log when the 'sealed' status toggles between true/false and
|
||||||
|
+ # the 'unknown' value.
|
||||||
|
+ healthQueryTimeout: 2
|
||||||
+
|
+
|
||||||
injector:
|
injector:
|
||||||
# True if you want to enable vault agent injection.
|
# True if you want to enable vault agent injection.
|
||||||
|
@ -19,6 +19,9 @@ data:
|
|||||||
PODREC_F="$WORKDIR/previous_pods_status.txt"
|
PODREC_F="$WORKDIR/previous_pods_status.txt"
|
||||||
PODREC_TMP_F="$WORKDIR/new_pods_status.txt"
|
PODREC_TMP_F="$WORKDIR/new_pods_status.txt"
|
||||||
|
|
||||||
|
# Vault server health query timeout during HA recovery scenario
|
||||||
|
QUERY_TMOUT={{ .Values.manager.healthQueryTimeout }}
|
||||||
|
|
||||||
STATEFULSET_RATE=5
|
STATEFULSET_RATE=5
|
||||||
INIT_CONVERGE_TIME=10
|
INIT_CONVERGE_TIME=10
|
||||||
JOIN_RATE=5
|
JOIN_RATE=5
|
||||||
@ -42,6 +45,11 @@ data:
|
|||||||
echo "$(date +%Y-%m-%dT%H-%M-%S) $@"
|
echo "$(date +%Y-%m-%dT%H-%M-%S) $@"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if ! [[ "$QUERY_TMOUT" =~ ^[0-9]+$ ]]; then
|
||||||
|
log ".Values.manager.healthQueryTimeout not an integer"
|
||||||
|
QUERY_TMOUT=""
|
||||||
|
fi
|
||||||
|
|
||||||
# Creates a list of all k8s vault pods and stores in text file.
|
# Creates a list of all k8s vault pods and stores in text file.
|
||||||
# Converts ips from X.X.X.X or a:b:c::d to X-X-X-X for use as pod
|
# Converts ips from X.X.X.X or a:b:c::d to X-X-X-X for use as pod
|
||||||
# dns names
|
# dns names
|
||||||
@ -130,7 +138,13 @@ data:
|
|||||||
# Simply calls the status check of a vault, used to check if it is
|
# Simply calls the status check of a vault, used to check if it is
|
||||||
# initialized, unsealed, or part of raft cluster
|
# initialized, unsealed, or part of raft cluster
|
||||||
function vaultServerStatus {
|
function vaultServerStatus {
|
||||||
curl --cacert $CERT -s https://$row.$DOMAIN:8200/v1/sys/health
|
local tmout=""
|
||||||
|
|
||||||
|
if [ -n "$1" ]; then
|
||||||
|
tmout="--connect-timeout $1"
|
||||||
|
fi
|
||||||
|
curl $tmout --cacert $CERT -s \
|
||||||
|
https://$row.$DOMAIN:8200/v1/sys/health
|
||||||
}
|
}
|
||||||
|
|
||||||
function runStateMachine {
|
function runStateMachine {
|
||||||
@ -277,7 +291,7 @@ data:
|
|||||||
log "pod list has empty data: [$host] [$row]"
|
log "pod list has empty data: [$host] [$row]"
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
vaultServerStatus > $WORKDIR/healthcheck.txt
|
vaultServerStatus $QUERY_TMOUT > $WORKDIR/healthcheck.txt
|
||||||
TEMP=$(cat $WORKDIR/healthcheck.txt | jq -r .sealed)
|
TEMP=$(cat $WORKDIR/healthcheck.txt | jq -r .sealed)
|
||||||
|
|
||||||
# Decide when to unseal the vault server; includes
|
# Decide when to unseal the vault server; includes
|
||||||
|
Loading…
x
Reference in New Issue
Block a user