Make aux etcd more conservative

- Currently, the auxiliary etcd instances remove themselves
  as soon as a single non-genesis member joins the cluster. This
  leaves the cluster susceptible to non-recoverable disruption
  until a third member joins. This change makes the auxiliary control
  script wait for a configurable number of non-auxiliary members
  (`auxiliary_threshold`, default 3) to join before removing the
  auxiliary members.

Change-Id: Ib4968b533e8433e3c40a845d086c7078e807c3e2
This commit is contained in:
Scott Hussey 2019-07-03 11:38:51 -05:00
parent fe60268244
commit 8649fbd3f5
5 changed files with 22 additions and 6 deletions

View File

@ -32,6 +32,8 @@ data:
- --v=3
armada:
target_manifest: cluster-bootstrap
etcd:
auxiliary_threshold: 3
labels:
dynamic:
- calico-etcd=enabled

View File

@ -13,6 +13,8 @@ data:
external_ip: 192.168.77.10
armada:
target_manifest: cluster-bootstrap
etcd:
auxiliary_threshold: 3
labels:
dynamic:
- calico-etcd=enabled

View File

@ -32,6 +32,8 @@ data:
- --v=3
armada:
target_manifest: cluster-bootstrap
etcd:
auxiliary_threshold: 3
labels:
dynamic:
- calico-etcd=enabled
@ -73,4 +75,4 @@ data:
- type: Server
qps: 1000
burst: 10000
...
...

View File

@ -91,6 +91,15 @@ data:
additionalProperties: true
additionalProperties: false
etcd:
type: object
properties:
# The number of non-auxiliary etcd members that must have joined
# before the auxiliary members will self-terminate
auxiliary_threshold:
type: integer
additionalProperties: false
files:
type: array
items:

View File

@ -30,7 +30,7 @@ spec:
function external_member_count() {
etcdctl member list \
| grep '\bstarted\b' \
| grep -Ev "\\b({{ config['Genesis:hostname'] }}|auxiliary-0|auxiliary-1)\\b" \
| grep -Ev "\\b(auxiliary-0|auxiliary-1)\\b" \
| wc -l
}
@ -42,10 +42,11 @@ spec:
fi
}
# NOTE(mark-burnett): If there are any non-genesis members, then we are ready to
# remove the auxiliary members. Otherwise, wait.
while [ ! "$(external_member_count)" -gt 0 ]; do
sleep 10
auxiliary_threshold="{{ config.get_first('Genesis:etcd.auxiliary_threshold', default=3) }}"
# NOTE(sh8121att): If there are enough (a fully resilient contingent) non-auxiliary members,
# then we are ready to remove the auxiliary members. Otherwise, wait.
while [ ! "$(external_member_count)" -ge "$auxiliary_threshold" ]; do
sleep 30
done
# NOTE(mark-burnett): Failures beyond this point are unexpected, but