From fb88ed98a2e4b46161ab5193d03338c3eb6b306a Mon Sep 17 00:00:00 2001
From: Damien Ciabrini <dciabrin@redhat.com>
Date: Fri, 1 Apr 2022 13:37:20 +0200
Subject: [PATCH] Check whether an HA resource already exists explicitly

With ephemeral heat we lost the meaning of the 'stack_action' hiera key
which we previously used to distinguish between fresh deployment and
pre-existing deployment (aka redeploy).
Since this hiera key is not available anymore, in ansible we added a
TRIPLEO_HA_WRAPPER_RESOURCE_EXISTS env variable which will be true
when the resource existed even before calling puppet.

This way we can restore the previous behaviour (which relied on the
'stack_action' hiera key) of restarting an HA bundle on the bootstrap
node in case of a configuration change.

While we're at it we make sure that the logging takes place via logger
so we're sure to capture these events in the journal.

Tested as follows:
1) Initial deploy:
[root@controller-0 ~]# journalctl |grep pcmkres
Sep 01 10:23:35 controller-0.alejandro.ftw pcmkrestart[47636]: Initial deployment, skipping the restart of haproxy-bundle
Sep 01 10:24:25 controller-0.alejandro.ftw pcmkrestart[49735]: Initial deployment, skipping the restart of galera-bundle
Sep 01 10:25:15 controller-0.alejandro.ftw pcmkrestart[53052]: Initial deployment, skipping the restart of rabbitmq-bundle
Sep 01 10:37:35 controller-0.alejandro.ftw pcmkrestart[148651]: Initial deployment, skipping the restart of openstack-cinder-volume

2) Redeploy changing only the haproxy config via a hiera key change:
Sep 01 11:12:29 controller-0.alejandro.ftw pcmkrestart[438507]: Wed Sep Restarting haproxy-bundle globally. Stopping:
Sep 01 11:12:37 controller-0.alejandro.ftw pcmkrestart[439271]: Wed Sep Restarting haproxy-bundle globally. Starting:

Depends-On: https://review.opendev.org/c/openstack/tripleo-ansible/+/836134

Closes-Bug: #1942309

Change-Id: I90ea2287b5ab32c8dc6bbf5f91927e7488326dcd
(cherry picked from commit ad2a13ab47b6ad6c948d8c229dde8b9f1386ea29)
(cherry picked from commit 79791ce30d3203d03e9a4db4b3c18991d5102ff2)
---
 .../pacemaker_restart_bundle.sh               | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/container_config_scripts/pacemaker_restart_bundle.sh b/container_config_scripts/pacemaker_restart_bundle.sh
index 0e924e2589..b3baa43cc4 100755
--- a/container_config_scripts/pacemaker_restart_bundle.sh
+++ b/container_config_scripts/pacemaker_restart_bundle.sh
@@ -12,7 +12,7 @@ BUNDLE_NAME=$3
 WAIT_TARGET_LOCAL=$4
 WAIT_TARGET_ANYWHERE=${5:-_}
 TRIPLEO_MINOR_UPDATE="${TRIPLEO_MINOR_UPDATE:-false}"
-
+TRIPLEO_HA_WRAPPER_RESOURCE_EXISTS="${TRIPLEO_HA_WRAPPER_RESOURCE_EXISTS:-false}"
 
 bundle_can_be_restarted() {
     local bundle=$1
@@ -24,11 +24,20 @@ bundle_can_be_restarted() {
     [ "$(crm_resource --meta -r $1 -g target-role 2>/dev/null)" != "Stopped" ]
 }
 
+log() {
+    # Log all arguments joined by spaces: some callers split a long message over several words.
+    logger -t pcmkrestart "$*"
+}
 
+HOSTNAME=$(/bin/hostname -s)
 if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ]; then
-    if hiera -c /etc/puppet/hiera.yaml stack_action | grep -q -x CREATE; then
+    if [ x"${TRIPLEO_HA_WRAPPER_RESOURCE_EXISTS,,}" = x"false" ]; then
         # Do not restart during initial deployment, as the resource
         # has just been created.
+        SERVICE_NODEID=$(/bin/hiera -c /etc/puppet/hiera.yaml "${TRIPLEO_SERVICE}_short_bootstrap_node_name")
+        if [[ "${HOSTNAME,,}" == "${SERVICE_NODEID,,}" ]]; then
+            log "Initial deployment, skipping the restart of ${BUNDLE_NAME}"
+	fi
         exit 0
     else
         # During a stack update, this script is called in parallel on
@@ -36,26 +45,25 @@ if [ x"${TRIPLEO_MINOR_UPDATE,,}" != x"true" ]; then
         # have been updated on all nodes. So we need to run pcs only
         # once (e.g. on the service's boostrap node).
         if bundle_can_be_restarted ${BUNDLE_NAME}; then
-            HOSTNAME=$(/bin/hostname -s)
             SERVICE_NODEID=$(/bin/hiera -c /etc/puppet/hiera.yaml "${TRIPLEO_SERVICE}_short_bootstrap_node_name")
             if [[ "${HOSTNAME,,}" == "${SERVICE_NODEID,,}" ]]; then
                 replicas_running=$(crm_resource -Q -r $BUNDLE_NAME --locate 2>&1 | wc -l)
                 if [ "$replicas_running" != "0" ]; then
-                    echo "$(date -u): Restarting ${BUNDLE_NAME} globally. Stopping:"
+                    log "Restarting ${BUNDLE_NAME} globally. Stopping:"
                     /sbin/pcs resource disable --wait=__PCMKTIMEOUT__ $BUNDLE_NAME
-                    echo "$(date -u): Restarting ${BUNDLE_NAME} globally. Starting:"
+                    log "Restarting ${BUNDLE_NAME} globally. Starting:"
                     /sbin/pcs resource enable --wait=__PCMKTIMEOUT__ $BUNDLE_NAME
                 else
-                    echo "$(date -u): ${BUNDLE_NAME} is not running anywhere," \
+                    log "${BUNDLE_NAME} is not running anywhere," \
                          "cleaning up to restart it globally if necessary"
                     /sbin/pcs resource cleanup $BUNDLE_NAME
                 fi
             else
-                echo "$(date -u): Skipping global restart of ${BUNDLE_NAME} on ${HOSTNAME} it will be restarted by node ${SERVICE_NODEID}"
+                log "Skipping global restart of ${BUNDLE_NAME} on ${HOSTNAME} it will be restarted by node ${SERVICE_NODEID}"
             fi
 
         else
-            echo "$(date -u): No global restart needed for ${BUNDLE_NAME}."
+            log "No global restart needed for ${BUNDLE_NAME}."
         fi
     fi
 else
@@ -70,7 +78,7 @@ else
     if bundle_can_be_restarted ${BUNDLE_NAME}; then
 	# if the resource is running locally, restart it
 	if crm_resource -r $BUNDLE_NAME --locate 2>&1 | grep -w -q "${HOST}"; then
-            echo "$(date -u): Restarting ${BUNDLE_NAME} locally on '${HOST}'"
+            log "Restarting ${BUNDLE_NAME} locally on '${HOST}'"
             /sbin/pcs resource restart $BUNDLE_NAME "${HOST}"
 
 	else
@@ -80,7 +88,7 @@ else
 	    # By cleaning up resource, we ensure that a) it will try to
 	    # restart, or b) it won't do anything if the resource is
 	    # already running elsewhere.
-            echo "$(date -u): ${BUNDLE_NAME} is currently not running on '${HOST}'," \
+            log "${BUNDLE_NAME} is currently not running on '${HOST}'," \
                  "cleaning up its state to restart it if necessary"
             /sbin/pcs resource cleanup $BUNDLE_NAME node="${HOST}"
 	fi
@@ -91,6 +99,6 @@ else
             "$WAIT_TARGET_LOCAL" "$WAIT_TARGET_ANYWHERE" \
 	    "${HOST}" __PCMKTIMEOUT__
     else
-        echo "$(date -u): No restart needed for ${BUNDLE_NAME}."
+        log "No restart needed for ${BUNDLE_NAME}."
     fi
 fi