tripleo-heat-templates/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh
Ian Pilcher 6e65c8fc0a Disable VIPs before stopping cluster during version upgrade
If "pcs cluster stop --all" is executed on a controller that
happens to have a VIP on the internal network, pcs may use the
VIP as the source address for communication with another cluster
node.  When pacemaker is stopped this VIP goes away, and pcs never
receives a response from the other node.  This causes pcs to hang
indefinitely; eventually the upgrade times out and fails.

Disabling the VIPs before stopping the cluster avoids this
situation.

Change-Id: I6bc59120211af28456018640033ce3763c373bbb
Closes-Bug: 1577570
2016-05-02 16:26:49 -05:00

60 lines
2.2 KiB
Bash
Executable File

#!/bin/bash
# Controller major-upgrade step for pacemaker-managed nodes: verifies the
# cluster is fully online before any service is stopped or package updated.
set -eu

# Fail fast: upgrade_level_nova_compute is only expanded near the end of this
# script, after the cluster has been stopped and packages have been updated.
# Under `set -u` an unset value would abort at that point and leave the node
# half-upgraded, so verify it is set before anything is touched.
: "${upgrade_level_nova_compute?must be set before starting the controller upgrade}"

# Maximum time, in seconds, to wait for pacemaker to stop on this node.
cluster_sync_timeout=600

# Refuse to start unless every cluster node is online.
# NOTE: pipefail is intentionally not enabled; only grep's exit status decides
# this check, so a non-zero exit from `pcs status` itself cannot mask a match.
# The matching lines are left on stdout so they show up in the upgrade log.
if pcs status 2>&1 | grep -E '(cluster is not currently running)|(OFFLINE:)'; then
    echo_error "ERROR: upgrade cannot start with some cluster nodes being offline"
    exit 1
fi
# Disable one pacemaker resource and wait until it is reported as stopped.
#   $1 - pacemaker resource name
#   $2 - timeout handed to check_resource (presumably seconds; confirm
#        against the check_resource definition)
disable_and_wait_stopped() {
    pcs resource disable "$1"
    check_resource "$1" stopped "$2"
}

# Only the node whose hostname matches the hiera bootstrap_nodeid drives the
# cluster-wide shutdown; every other node skips this section.
if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then
    # Resources are stopped one at a time, in this fixed order.
    disable_and_wait_stopped httpd 1800
    disable_and_wait_stopped openstack-core 1800
    disable_and_wait_stopped redis 600
    disable_and_wait_stopped mongod 600
    disable_and_wait_stopped rabbitmq 600
    disable_and_wait_stopped memcached 600
    disable_and_wait_stopped galera 600

    # Disable all VIPs before stopping the cluster, so that pcs doesn't use one as a source address:
    # https://bugzilla.redhat.com/show_bug.cgi?id=1330688
    # The list of started IPaddr2 resources is captured once, up front; the
    # command substitution is deliberately left unquoted so that each resource
    # name (first column of the pcs output) becomes one loop item.
    for vip in $(pcs resource show | awk '/ocf::heartbeat:IPaddr2/ && /Started/ { print $1 }'); do
        disable_and_wait_stopped "$vip" 60
    done

    pcs cluster stop --all
fi
# Swift isn't controlled by pacemaker, so it has to be stopped separately.
systemctl_swift stop

# Poll every 5 seconds until pacemaker is no longer active on this node,
# giving up once more than cluster_sync_timeout seconds have elapsed.
shutdown_began=$(date +%s)
while systemctl is-active pacemaker; do
    sleep 5
    elapsed=$(( $(date +%s) - shutdown_began ))
    if [ "$elapsed" -gt "$cluster_sync_timeout" ]; then
        echo_error "ERROR: cluster shutdown timed out"
        exit 1
    fi
done
# python-zaqarclient is needed for os-collect-config; install it explicitly
# before updating every installed package.
yum -y install python-zaqarclient
yum -y -q update

# Pin messages sent to compute nodes to kilo, these will be upgraded later
nova_conf=/etc/nova/nova.conf
crudini --set "$nova_conf" upgrade_levels compute "$upgrade_level_nova_compute"

# https://bugzilla.redhat.com/show_bug.cgi?id=1284047
# Change-Id: Ib3f6c12ff5471e1f017f28b16b1e6496a4a4b435
ceilometer_conf=/etc/ceilometer/ceilometer.conf
crudini --set "$ceilometer_conf" DEFAULT rpc_backend rabbit

# https://bugzilla.redhat.com/show_bug.cgi?id=1284058
# Ifd1861e3df46fad0e44ff9b5cbd58711bbc87c97 Swift Ceilometer middleware no longer exists
swift_proxy_conf=/etc/swift/proxy-server.conf
swift_pipeline="catch_errors healthcheck cache ratelimit tempurl formpost authtoken keystone staticweb proxy-logging proxy-server"
crudini --set "$swift_proxy_conf" pipeline:main pipeline "$swift_pipeline"