Run etcd on all controllers for redundancy

Change-Id: If8d8d10691d80f5d46dfb8dcf23ae1779b028309
This commit is contained in:
Emma Gordon 2016-01-07 10:31:25 +00:00
parent f8ae3ffa13
commit 8f0d8d149b
5 changed files with 129 additions and 19 deletions

View File

@ -6,28 +6,33 @@ import yaml
from pluginutils import NODES_CONFIG
RECONFIGURE_ROUTE_REFLECTOR = "##REPLACE_ON_INSTALL##/calico_route_reflector.sh"
SCRIPTS_LOCATION="##REPLACE_ON_INSTALL##/"
RECONFIGURE_ROUTE_REFLECTOR = SCRIPTS_LOCATION + "calico_route_reflector.sh"
UPDATE_ETCD_CLUSTER = SCRIPTS_LOCATION + "update_etcd_cluster.sh"
def _get_configured_nodes():
def _get_configured_nodes(roles):
with open(NODES_CONFIG, "r") as f:
config = yaml.safe_load(f)
nodes = [node for node in config["nodes"] if node["role"] in [
"compute", "controller", "primary-controller"]]
return [node for node in config["nodes"] if node["role"] in roles]
# There is no need to reconfigure the route reflector for a change in
# primary controller, so we don't keep track of which controller is the
# current primary.
primary_controller_index = None
def _get_compute_nodes():
return _get_configured_nodes(["compute"])
def _get_control_nodes():
nodes = _get_configured_nodes(["controller", "primary-controller"])
primary_index = None
for (index, node) in enumerate(nodes):
if node["role"] == "primary-controller":
primary_controller_index = index
primary_index = index
break
# Note the index could be 0 - hence 'if x is not None' rather than 'if x'
if primary_controller_index is not None:
nodes[primary_controller_index]["role"] = "controller"
if primary_index is not None:
nodes[primary_index]["role"] = "controller"
return nodes
@ -35,13 +40,22 @@ def _get_configured_nodes():
class DeploymentChangeHandler(pyinotify.ProcessEvent):
def __init__(self):
super(DeploymentChangeHandler, self).__init__()
self.nodes = _get_configured_nodes()
self.compute_nodes = _get_compute_nodes()
self.control_nodes = _get_control_nodes()
def process_IN_MODIFY(self, event):
current_nodes = _get_configured_nodes()
if current_nodes != self.nodes:
current_compute_nodes = _get_compute_nodes()
current_control_nodes = _get_control_nodes()
if current_control_nodes != self.control_nodes:
subprocess.call(RECONFIGURE_ROUTE_REFLECTOR)
self.nodes = current_nodes
subprocess.call(UPDATE_ETCD_CLUSTER)
elif current_compute_nodes != self.compute_nodes:
subprocess.call(RECONFIGURE_ROUTE_REFLECTOR)
self.compute_nodes = current_compute_nodes
self.control_nodes = current_control_nodes
if __name__ == "__main__":

View File

@ -52,6 +52,11 @@ apt-get update
apt-get -y install etcd
for controller_address in ${controller_node_addresses[@]};do
initial_cluster+="${controller_address}=http://${controller_address}:2380,"
done
initial_cluster=${initial_cluster::-1} # remove trailing comma
service etcd stop
rm -rf /var/lib/etcd/*
awk '/exec \/usr\/bin\/etcd/{while(getline && $0 != ""){}}1' /etc/init/etcd.conf > tmp
@ -60,7 +65,7 @@ cat << EXEC_CMD >> /etc/init/etcd.conf
exec /usr/bin/etcd -proxy on \\
-listen-client-urls http://127.0.0.1:4001 \\
-advertise-client-urls http://127.0.0.1:7001 \\
-initial-cluster controller=http://${controller_node_addresses}:2380
-initial-cluster ${initial_cluster}
EXEC_CMD
service etcd start

View File

@ -10,6 +10,7 @@ set -x
echo "Hi, I'm a controller node!"
this_node_address=$(python get_node_ip.py `hostname`)
controller_node_addresses=$(python get_node_ips_by_role.py controller)
# Get APT key for binaries.projectcalico.org.
@ -51,19 +52,25 @@ apt-get update
apt-get -y install etcd
for controller_address in ${controller_node_addresses[@]};do
initial_cluster+="${controller_address}=http://${controller_address}:2380,"
done
initial_cluster=${initial_cluster::-1} # remove trailing comma
service etcd stop
rm -rf /var/lib/etcd/*
awk '/exec \/usr\/bin\/etcd/{while(getline && $0 != ""){}}1' /etc/init/etcd.conf > tmp
mv tmp /etc/init/etcd.conf
cat << EXEC_CMD >> /etc/init/etcd.conf
exec /usr/bin/etcd -name controller \\
exec /usr/bin/etcd -name ${this_node_address} \\
-advertise-client-urls "http://${this_node_address}:2379,http://${this_node_address}:4001" \\
-listen-client-urls "http://0.0.0.0:2379,http://0.0.0.0:4001" \\
-listen-peer-urls "http://0.0.0.0:2380" \\
-initial-advertise-peer-urls "http://${this_node_address}:2380" \\
-initial-cluster-token fuel-cluster-1 \\
-initial-cluster controller=http://${this_node_address}:2380 \\
-initial-cluster ${initial_cluster} \\
-initial-cluster-state new
EXEC_CMD
service etcd start

View File

@ -0,0 +1,43 @@
#!/bin/bash
# Copyright 2015 Metaswitch Networks
# Rewrite this node's etcd upstart configuration so that etcd restarts as a
# member of an existing cluster, making sure only one caller does the work
# when several invoke this script at about the same time.
#
# Arguments:
#   $1  identifier of the caller (only used for the tiebreak below)
#   $2  address of this node; used as the etcd member name and in its URLs
#   $3  value handed to etcd's -initial-cluster option
caller=$1
node_address=$2
initial_cluster=$3

# Records, one per line, every caller that has attempted the update.
CALLED_BY=/tmp/etcd_cfg_modifiers
touch "${CALLED_BY}"

num_callers=$(wc -l < "${CALLED_BY}")
if [[ $num_callers -ne 0 ]]; then
  # Someone else has already run this script - exit.
  exit
fi

echo "${caller}" >> "${CALLED_BY}"
# Give any concurrent caller a chance to register itself before re-counting.
sleep 1

num_callers=$(wc -l < "${CALLED_BY}")
if [[ $num_callers -gt 1 ]]; then
  # Someone else is also trying to run this script, back off unless the caller
  # wins an arbitrary tiebreak of an alphabetical sort.
  # NOTE: the sorted list is a plain string, not an array, so the winner has to
  # be picked with head; indexing it with [0] would yield the whole list and
  # make every contender back off.
  first_caller=$(sort "${CALLED_BY}" | head -n 1)
  if [[ "$caller" != "$first_caller" ]]; then
    exit
  fi
fi

service etcd stop
rm -rf /var/lib/etcd/*

# Strip the current "exec /usr/bin/etcd ..." stanza (the exec line and the
# lines that follow it, up to the next empty line) before appending a new one.
awk '/exec \/usr\/bin\/etcd/{while(getline && $0 != ""){}}1' /etc/init/etcd.conf > tmp
mv tmp /etc/init/etcd.conf

# The doubled backslashes below come out as single line-continuation
# backslashes in the generated configuration file.
cat << EXEC_CMD >> /etc/init/etcd.conf
exec /usr/bin/etcd -name ${node_address} \\
-advertise-client-urls "http://${node_address}:2379,http://${node_address}:4001" \\
-listen-client-urls "http://0.0.0.0:2379,http://0.0.0.0:4001" \\
-listen-peer-urls "http://0.0.0.0:2380" \\
-initial-advertise-peer-urls "http://${node_address}:2380" \\
-initial-cluster-token fuel-cluster-1 \\
-initial-cluster ${initial_cluster} \\
-initial-cluster-state existing
EXEC_CMD

service etcd start

View File

@ -0,0 +1,41 @@
#!/bin/bash
# Copyright 2015 Metaswitch Networks
# Reconfigure etcd on this node as a member of a brand new cluster made up of
# every node reported for the "controller" role, then restart etcd until the
# cluster reports healthy (giving up after a handful of attempts).
this_node_address=$(python get_node_ip.py "$(hostname)")
controller_node_addresses=$(python get_node_ips_by_role.py controller)

# Build the "name=peer-url,name=peer-url,..." list for -initial-cluster.
# Start from empty so a value inherited from the environment cannot leak in.
initial_cluster=""
# The address list is deliberately unquoted so it is split on whitespace.
for node_address in ${controller_node_addresses}; do
  initial_cluster+="${node_address}=http://${node_address}:2380,"
done
# Remove the trailing comma. Unlike a negative-length substring, this form is
# a harmless no-op when the list is empty.
initial_cluster=${initial_cluster%,}

service etcd stop
rm -rf /var/lib/etcd/*

# Strip the current "exec /usr/bin/etcd ..." stanza (the exec line and the
# lines that follow it, up to the next empty line) before appending a new one.
awk '/exec \/usr\/bin\/etcd/{while(getline && $0 != ""){}}1' /etc/init/etcd.conf > tmp
mv tmp /etc/init/etcd.conf

# The doubled backslashes below come out as single line-continuation
# backslashes in the generated configuration file.
cat << EXEC_CMD >> /etc/init/etcd.conf
exec /usr/bin/etcd -name ${this_node_address} \\
-advertise-client-urls "http://${this_node_address}:2379,http://${this_node_address}:4001" \\
-listen-client-urls "http://0.0.0.0:2379,http://0.0.0.0:4001" \\
-listen-peer-urls "http://0.0.0.0:2380" \\
-initial-advertise-peer-urls "http://${this_node_address}:2380" \\
-initial-cluster-token fuel-cluster-1 \\
-initial-cluster ${initial_cluster} \\
-initial-cluster-state new
EXEC_CMD

service etcd start

# Poll cluster health, restarting etcd between attempts. The comparison is
# numeric (-lt); "<" inside [[ ]] would compare the operands as strings.
retry_count=0
while [[ $retry_count -lt 5 ]]; do
  if etcdctl cluster-health; then
    break
  fi
  ((retry_count++))
  service etcd restart
  sleep 2
done