From 8649fbd3f529ce6afb00e45e98eee5aa6b249641 Mon Sep 17 00:00:00 2001 From: Scott Hussey Date: Wed, 3 Jul 2019 11:38:51 -0500 Subject: [PATCH] Make aux etcd more conservative - Currently the auxiliary etcd instances remove themselves after a single non-genesis member joins the cluster. This leaves the cluster susceptible to non-recoverable disruption until a 3rd member joins. This change makes the auxiliary control script wait for a configurable number of non-auxiliary members to join before removing the auxiliary members. Change-Id: Ib4968b533e8433e3c40a845d086c7078e807c3e2 --- examples/basic/Genesis.yaml | 2 ++ examples/complete/Genesis.yaml | 2 ++ examples/gate/Genesis.yaml | 4 +++- promenade/schemas/Genesis.yaml | 9 +++++++++ .../manifests/auxiliary-kubernetes-etcd.yaml | 11 ++++++----- 5 files changed, 22 insertions(+), 6 deletions(-) diff --git a/examples/basic/Genesis.yaml b/examples/basic/Genesis.yaml index c1f66e23..d32952c2 100644 --- a/examples/basic/Genesis.yaml +++ b/examples/basic/Genesis.yaml @@ -32,6 +32,8 @@ data: - --v=3 armada: target_manifest: cluster-bootstrap + etcd: + auxiliary_threshold: 3 labels: dynamic: - calico-etcd=enabled diff --git a/examples/complete/Genesis.yaml b/examples/complete/Genesis.yaml index aacf3247..01eda3cf 100644 --- a/examples/complete/Genesis.yaml +++ b/examples/complete/Genesis.yaml @@ -13,6 +13,8 @@ data: external_ip: 192.168.77.10 armada: target_manifest: cluster-bootstrap + etcd: + auxiliary_threshold: 3 labels: dynamic: - calico-etcd=enabled diff --git a/examples/gate/Genesis.yaml b/examples/gate/Genesis.yaml index c5a98cfa..90fbc8ea 100644 --- a/examples/gate/Genesis.yaml +++ b/examples/gate/Genesis.yaml @@ -32,6 +32,8 @@ data: - --v=3 armada: target_manifest: cluster-bootstrap + etcd: + auxiliary_threshold: 3 labels: dynamic: - calico-etcd=enabled @@ -73,4 +75,4 @@ data: - type: Server qps: 1000 burst: 10000 -... \ No newline at end of file +... 
diff --git a/promenade/schemas/Genesis.yaml b/promenade/schemas/Genesis.yaml index 704f00c4..c69a0cce 100644 --- a/promenade/schemas/Genesis.yaml +++ b/promenade/schemas/Genesis.yaml @@ -91,6 +91,15 @@ data: additionalProperties: true additionalProperties: false + etcd: + type: object + properties: + # What number of non-auxiliary etcd members are needed + # before the auxiliary members will self-terminate + auxiliary_threshold: + type: integer + additionalProperties: false + files: type: array items: diff --git a/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml b/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml index ad1ecb8b..674ad319 100644 --- a/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml +++ b/promenade/templates/roles/genesis/etc/kubernetes/manifests/auxiliary-kubernetes-etcd.yaml @@ -30,7 +30,7 @@ spec: function external_member_count() { etcdctl member list \ | grep '\bstarted\b' \ - | grep -Ev "\\b({{ config['Genesis:hostname'] }}|auxiliary-0|auxiliary-1)\\b" \ + | grep -Ev "\\b(auxiliary-0|auxiliary-1)\\b" \ | wc -l } @@ -42,10 +42,11 @@ spec: fi } - # NOTE(mark-burnett): If there are any non-genesis members, then we are ready to - # remove the auxiliary members. Otherwise, wait. - while [ ! "$(external_member_count)" -gt 0 ]; do - sleep 10 + auxiliary_threshold="{{ config.get_first('Genesis:etcd.auxiliary_threshold', default=3) }}" + # NOTE(sh8121att): If there are enough (a fully resilient contingent) non-auxiliary members, + # then we are ready to remove the auxiliary members. Otherwise, wait. + while [ ! "$(external_member_count)" -ge "$auxiliary_threshold" ]; do + sleep 30 done # NOTE(mark-burnett): Failures beyond this point are unexpected, but