From f9e6a26f32aea4d3c40178f87b61efb924f81f97 Mon Sep 17 00:00:00 2001 From: Michele Baldessari Date: Sun, 25 Sep 2016 14:10:31 +0200 Subject: [PATCH] A few major-upgrade issues This commit does the following: 1. We now explicitly disable/stop and then remove the resources that are moving to systemd. We do this because we want to make sure they are all stopped before doing a yum upgrade, which otherwise would take ages due to rabbitmq and galera being down. It is best if we do this via pcs while we do the HA Full -> HA NG migration because it is simpler to make sure all the services are stopped at that stage. For extra safety we can still do a check by hand. By doing it via pacemaker we have the guarantee that all the migrated services are down already when we stop the cluster (which happens to be a syncronization point between all controller nodes). That way we can be certain that they are all down on all nodes before starting the yum upgrade process. 2. We actually need to start the systemd services in major_upgrade_controller_pacemaker_2.sh and not stop them. 3. We need to use the proper bash variable name 4. Use is_bootstrap_node everywhere to make the code more consistent Change-Id: Ic565c781b80357bed9483df45a4a94ec0423487c Closes-Bug: #1627490 --- .../major_upgrade_controller_pacemaker_1.sh | 27 ++++++++++--- .../major_upgrade_controller_pacemaker_2.sh | 4 +- .../major_upgrade_pacemaker_migrations.sh | 40 ++++++++++--------- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh index 2490ce27ac..4ceedb9b98 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_1.sh @@ -19,7 +19,7 @@ STONITH_STATE=$(pcs property show stonith-enabled | grep "stonith-enabled" | awk pcs property set stonith-enabled=false # Migrate to HA NG -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +if [[ -n $(is_bootstrap_node) ]]; then migrate_full_to_ng_ha fi @@ -29,9 +29,26 @@ fi # is going to take a long time because rabbit is down. By having the service stopped # systemctl try-restart is a noop -for $service in $(services_to_migrate); do +for service in $(services_to_migrate); do manage_systemd_service stop "${service%%-clone}" - check_resource_systemd "${service%%-clone}" stopped 600 + # So the reason for not reusing check_resource_systemd is that + # I have observed systemctl is-active returning unknown with at least + # one service that was stopped (See LP 1627254) + timeout=600 + tstart=$(date +%s) + tend=$(( $tstart + $timeout )) + check_interval=3 + while (( $(date +%s) < $tend )); do + if [[ "$(systemctl is-active ${service%%-clone})" = "active" ]]; then + echo "$service still active, sleeping $check_interval seconds." + sleep $check_interval + else + # we do not care if it is inactive, unknown or failed as long as it is + # not running + break + fi + + done done # In case the mysql package is updated, the database on disk must be @@ -46,7 +63,7 @@ done # on mysql package versionning, but this can be overriden manually # to support specific upgrade scenario -if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname)" ]; then +if [[ -n $(is_bootstrap_node) ]]; then if [ $DO_MYSQL_UPGRADE -eq 1 ]; then mysqldump $backup_flags > "$MYSQL_BACKUP_DIR/openstack_database.sql" cp -rdp /etc/my.cnf* "$MYSQL_BACKUP_DIR" @@ -68,7 +85,7 @@ if [ "$(hiera -c /etc/puppet/hiera.yaml bootstrap_nodeid)" = "$(facter hostname) fi -# Swift isn't controled by pacemaker +# Swift isn't controlled by pacemaker systemctl_swift stop tstart=$(date +%s) diff --git a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh index 6bb2fa735e..7e814f5f81 100755 --- a/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh +++ b/extraconfig/tasks/major_upgrade_controller_pacemaker_2.sh @@ -68,7 +68,7 @@ systemctl_swift start # We need to start the systemd services we explicitely stopped at step _1.sh # FIXME: Should we let puppet during the convergence step do the service enabling or # should we add it here? -for $service in $(services_to_migrate); do - manage_systemd_service stop "${service%%-clone}" +for service in $(services_to_migrate); do + manage_systemd_service start "${service%%-clone}" check_resource_systemd "${service%%-clone}" started 600 done diff --git a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh index b8c5321b45..d974bb7965 100644 --- a/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh +++ b/extraconfig/tasks/major_upgrade_pacemaker_migrations.sh @@ -109,10 +109,11 @@ function services_to_migrate { # during the conversion # 2. Remove all the colocation constraints and then the ordering constraints, except the # ones related to haproxy/VIPs which exist in Newton as well -# 3. Remove all the resources that won't be managed by pacemaker in newton. Note that they -# will show up as ORPHANED but they will keep running normally via systemd. They will be -# enabled to start at boot by puppet during the converge step -# 4. Take the cluster out of maintenance-mode and do a resource cleanup +# 3. Take the cluster out of maintenance-mode and do a resource cleanup +# 4. Remove all the resources that won't be managed by pacemaker in newton. The +# outcome will be +# that they are stopped and removed from pacemakers control +# 5. Do a resource cleanup to make sure the cluster is in a clean state function migrate_full_to_ng_ha { if [[ -n $(pcmk_running) ]]; then pcs property set maintenance-mode=true @@ -135,32 +136,35 @@ function migrate_full_to_ng_ha { log_debug "Deleting ordering constraint $constraint from CIB" pcs constraint remove "$constraint" done + # At this stage all the pacemaker resources are removed from the CIB. + # Once we remove the maintenance-mode those systemd resources will keep + # on running. They shall be systemd enabled via the puppet converge + # step later on + pcs property set maintenance-mode=false # At this stage there are no constraints whatsoever except the haproxy/ip ones - # which we want to keep. We now delete each resource that will move to systemd - # Note that the corresponding systemd resource will stay running, which means that - # later when we do the "yum update", things will be a bit slower because each - # "systemctl try-restart " is not a no-op any longer because the service is up - # and running and it will be restarted with rabbitmq being down. + # which we want to keep. We now disable and then delete each resource + # that will move to systemd. + # We want the systemd resources be stopped before doing "yum update", + # that way "systemctl try-restart " is no-op because the + # service was down already PCS_STATUS_OUTPUT="$(pcs status)" for resource in $(services_to_migrate) "delay-clone" "openstack-core-clone"; do if echo "$PCS_STATUS_OUTPUT" | grep "$resource"; then log_debug "Deleting $resource from the CIB" - - # We need to add --force because the cluster is in maintenance mode and the resource - # is unmanaged. The if serves to make this idempotent + if ! pcs resource disable "$resource" --wait=600; then + echo_error "ERROR: resource $resource failed to be disabled" + exit 1 + fi pcs resource delete --force "$resource" else log_debug "Service $service not found as a pacemaker resource, not trying to delete." fi done - # At this stage all the pacemaker resources are removed from the CIB. Once we remove the - # maintenance-mode those systemd resources will keep on running. They shall be systemd enabled - # via the puppet converge step later on - pcs property set maintenance-mode=false - # We need to do a pcs resource cleanup here + crm_resource --wait to make sure the - # cluster is in a clean state before we stop everything, upgrade and restart everything + # We need to do a pcs resource cleanup here + crm_resource --wait to + # make sure the cluster is in a clean state before we stop everything, + # upgrade and restart everything pcs resource cleanup # We are making sure here that the cluster is stable before proceeding if ! timeout -k 10 600 crm_resource --wait; then