Fixes etcd race condition bug
- During genesis there was a race condition between the genesis node leaving and other nodes joining. - Updated the etcd anchor to update the config when a host is not healthy. fixes #54 Change-Id: I0ba2c831c73cc3136ee635e7d0c0efcc8b009858
This commit is contained in:
parent
a3b79eabc0
commit
cf0037597d
@ -1,5 +1,5 @@
|
|||||||
#!/bin/sh
|
#!/bin/sh
|
||||||
# Copyright 2017 AT&T Intellectual Property. All other rights reserved.
|
# Copyright 2018 AT&T Intellectual Property. All other rights reserved.
|
||||||
#
|
#
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
# you may not use this file except in compliance with the License.
|
# you may not use this file except in compliance with the License.
|
||||||
@ -12,37 +12,40 @@
|
|||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
# See the License for the specific language governing permissions and
|
# See the License for the specific language governing permissions and
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
|
|
||||||
set -x
|
set -x
|
||||||
|
TEMP_MANIFEST=/tmp/etcd.yaml
|
||||||
function copy_certificates {
|
function sync_file {
|
||||||
ETCD_NAME=$1
|
if ! cmp "$1" "$2"; then
|
||||||
|
cp -f "$1" "$2"
|
||||||
set -e
|
fi
|
||||||
|
}
|
||||||
|
function sync_certificates {
|
||||||
mkdir -p /etcd-etc/tls
|
mkdir -p /etcd-etc/tls
|
||||||
# Copy CA Certificates in place
|
sync_file /etc/etcd/tls/certs/client-ca.pem /etcd-etc/tls/client-ca.pem
|
||||||
cp \
|
sync_file /etc/etcd/tls/certs/peer-ca.pem /etcd-etc/tls/peer-ca.pem
|
||||||
/etc/etcd/tls/certs/client-ca.pem \
|
sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-client.pem" /etcd-etc/tls/etcd-client.pem
|
||||||
/etc/etcd/tls/certs/peer-ca.pem \
|
sync_file "/etc/etcd/tls/certs/${ETCD_NAME}-etcd-peer.pem" /etcd-etc/tls/etcd-peer.pem
|
||||||
/etcd-etc/tls
|
sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-client-key.pem" /etcd-etc/tls/etcd-client-key.pem
|
||||||
|
sync_file "/etc/etcd/tls/keys/${ETCD_NAME}-etcd-peer-key.pem" /etcd-etc/tls/etcd-peer-key.pem
|
||||||
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-client.pem /etcd-etc/tls/etcd-client.pem
|
|
||||||
cp /etc/etcd/tls/certs/$ETCD_NAME-etcd-peer.pem /etcd-etc/tls/etcd-peer.pem
|
|
||||||
|
|
||||||
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-client-key.pem /etcd-etc/tls/etcd-client-key.pem
|
|
||||||
cp /etc/etcd/tls/keys/$ETCD_NAME-etcd-peer-key.pem /etcd-etc/tls/etcd-peer-key.pem
|
|
||||||
|
|
||||||
set +e
|
|
||||||
}
|
}
|
||||||
|
|
||||||
function create_manifest {
|
function create_manifest {
|
||||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' /anchor-etcd/{{ .Values.service.name }}.yaml
|
WIP=/tmp/wip-manifest.yaml
|
||||||
sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' /anchor-etcd/{{ .Values.service.name }}.yaml
|
cp -f /anchor-etcd/{{ .Values.service.name }}.yaml $WIP
|
||||||
|
sed -i -e 's#_ETCD_INITIAL_CLUSTER_STATE_#'$2'#g' $WIP
|
||||||
cp /anchor-etcd/{{ .Values.service.name }}.yaml $MANIFEST_PATH
|
sed -i -e 's#_ETCD_INITIAL_CLUSTER_#'$1'#g' $WIP
|
||||||
|
mv -f "$WIP" "$3"
|
||||||
}
|
}
|
||||||
|
function sync_configuration {
|
||||||
|
sync_certificates
|
||||||
|
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT "$1" \
|
||||||
|
| awk -F ', ' '{ print $3 "=" $4 }' \
|
||||||
|
| tr '\n' ',' \
|
||||||
|
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
|
||||||
|
ETCD_INITIAL_CLUSTER_STATE=existing
|
||||||
|
create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$TEMP_MANIFEST"
|
||||||
|
sync_file "${TEMP_MANIFEST}" "${MANIFEST_PATH}"
|
||||||
|
}
|
||||||
|
firstrun=true
|
||||||
while true; do
|
while true; do
|
||||||
# TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
|
# TODO(mark-burnett) Need to monitor a file(s) when shutting down/starting
|
||||||
# up so I don't try to take two actions on the node at once.
|
# up so I don't try to take two actions on the node at once.
|
||||||
@ -67,62 +70,56 @@ while true; do
|
|||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
|
if [ -e /bootstrapping/{{ .Values.bootstrapping.filename }} ]; then
|
||||||
# Bootstrap the first node
|
# Bootstrap the first node
|
||||||
copy_certificates ${ETCD_NAME}
|
sync_certificates
|
||||||
ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }}
|
ETCD_INITIAL_CLUSTER=${ETCD_NAME}=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }}
|
||||||
ETCD_INITIAL_CLUSTER_STATE=new
|
ETCD_INITIAL_CLUSTER_STATE=new
|
||||||
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE
|
create_manifest "$ETCD_INITIAL_CLUSTER" "$ETCD_INITIAL_CLUSTER_STATE" "$MANIFEST_PATH"
|
||||||
|
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
{{- end }}
|
{{- end }}
|
||||||
|
|
||||||
sleep {{ .Values.anchor.period }}
|
sleep {{ .Values.anchor.period }}
|
||||||
|
|
||||||
if [ -e /tmp/stopped ]; then
|
if [ -e /tmp/stopped ]; then
|
||||||
echo Stopping
|
echo Stopping
|
||||||
break
|
break
|
||||||
fi
|
fi
|
||||||
|
|
||||||
if [ -e /tmp/stopping ]; then
|
if [ -e /tmp/stopping ]; then
|
||||||
echo Waiting to stop..
|
echo Waiting to stop..
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
etcdctl member list > /tmp/members
|
||||||
if [ ! -e $MANIFEST_PATH ]; then
|
# if never started or (ever started and not currently started); then
|
||||||
if ! etcdctl member list > /tmp/members; then
|
# resync
|
||||||
echo Failed to locate existing cluster
|
# fi
|
||||||
continue
|
|
||||||
fi
|
|
||||||
|
|
||||||
if ! grep $PEER_ENDPOINT /tmp/members; then
|
if ! grep $PEER_ENDPOINT /tmp/members; then
|
||||||
|
# If this member is not in the cluster, try to add it.
|
||||||
if grep -v '\bstarted\b' /tmp/members; then
|
if grep -v '\bstarted\b' /tmp/members; then
|
||||||
echo Cluster does not appear fully online, waiting.
|
echo Cluster does not appear fully online, waiting.
|
||||||
continue
|
continue
|
||||||
fi
|
fi
|
||||||
|
|
||||||
# Add this member to the cluster
|
# Add this member to the cluster
|
||||||
etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT
|
if ! etcdctl member add $HOSTNAME --peer-urls $PEER_ENDPOINT; then
|
||||||
|
echo Failed to add $HOSTNAME to member list. Waiting.
|
||||||
|
continue
|
||||||
fi
|
fi
|
||||||
|
echo Successfully added $HOSTNAME to cluster members.
|
||||||
# If needed, drop the file in place
|
# Refresh member list so we start with the right configuration.
|
||||||
if [ ! -e FILE ]; then
|
|
||||||
# Refresh member list
|
|
||||||
etcdctl member list > /tmp/members
|
etcdctl member list > /tmp/members
|
||||||
|
|
||||||
if grep $PEER_ENDPOINT /tmp/members; then
|
|
||||||
copy_certificates ${ETCD_NAME}
|
|
||||||
|
|
||||||
ETCD_INITIAL_CLUSTER=$(grep -v $PEER_ENDPOINT /tmp/members \
|
|
||||||
| awk -F ', ' '{ print $3 "=" $4 }' \
|
|
||||||
| tr '\n' ',' \
|
|
||||||
| sed "s;\$;$ETCD_NAME=https://\$\(POD_IP\):{{ .Values.network.service_peer.target_port }};")
|
|
||||||
ETCD_INITIAL_CLUSTER_STATE=existing
|
|
||||||
|
|
||||||
create_manifest $ETCD_INITIAL_CLUSTER $ETCD_INITIAL_CLUSTER_STATE
|
|
||||||
fi
|
fi
|
||||||
|
if $firstrun; then
|
||||||
|
sync_configuration /tmp/members
|
||||||
|
firstrun=false
|
||||||
|
fi
|
||||||
|
if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
|
||||||
|
# If not healthy, sleep before checking again and then updating configs.
|
||||||
|
echo Member is not healthy, sleeping before checking again.
|
||||||
|
sleep {{ .Values.anchor.health_wait_period }}
|
||||||
|
if ! ETCDCTL_ENDPOINTS=$CLIENT_ENDPOINT etcdctl endpoint health; then
|
||||||
|
# If still not healthy, update the configs.
|
||||||
|
echo Member is not healthy, syncing configurations.
|
||||||
|
sync_configuration /tmp/members
|
||||||
|
continue
|
||||||
fi
|
fi
|
||||||
fi
|
fi
|
||||||
done
|
done
|
||||||
|
@ -32,6 +32,7 @@ anchor:
|
|||||||
manifest_path: /etc/kubernetes/manifests
|
manifest_path: /etc/kubernetes/manifests
|
||||||
|
|
||||||
period: 15
|
period: 15
|
||||||
|
health_wait_period: 60
|
||||||
|
|
||||||
etcd:
|
etcd:
|
||||||
host_etc_path: /etc/etcd-example
|
host_etc_path: /etc/etcd-example
|
||||||
|
@ -47,7 +47,7 @@
|
|||||||
"-l", "kubernetes-etcd=enabled",
|
"-l", "kubernetes-etcd=enabled",
|
||||||
"-l", "kubernetes-scheduler=enabled",
|
"-l", "kubernetes-scheduler=enabled",
|
||||||
"-l", "ucp-control-plane=enabled",
|
"-l", "ucp-control-plane=enabled",
|
||||||
"-e", "kubernetes n0 genesis n1 n2 n3",
|
"-e", "kubernetes n0 n0 n1 n2 n3",
|
||||||
"-e", "calico n0 n0 n1 n2 n3"
|
"-e", "calico n0 n0 n1 n2 n3"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
Loading…
Reference in New Issue
Block a user