#!/bin/bash
# ----
# Wait for an OCF resource or a bundle to be restarted
# ----
# e.g.:
# M/S OCF:      $0 galera galera-bundle Master
# clone OCF:    $0 rabbitmq rabbitmq-bundle Started
# A/P M/S OCF:  $0 redis redis-bundle Slave Master
# A/P bundle:   $0 openstack-cinder-volume openstack-cinder-volume _ Started
# clone bundle: $0 haproxy-bundle haproxy-bundle Started

# design note 1:
# - this script is called during a minor update; it is called
#   once per node that hosts a service replica.
# - the purpose of this script is to ensure that restarting the
#   service replica locally won't disrupt the service availability
#   for the end user. To reach that goal, the script waits until the
#   service is restarted locally or globally and reaches a given
#   target state (i.e. Started, Slave or Master).
# design note 2:
# - we don't want to track restart errors: our only job is to ensure
#   service restart synchronization, not service health.
# - in particular, we don't want to error out in case the resource
#   cannot be restarted locally, because that would make the minor
#   update fail, even if potentially other replicas still provide
#   the service.
# design note 3:
# - we can bail out early if we determine that the resource can't
#   be restarted automatically by pacemaker (e.g. it's "blocked",
#   unmanaged or disabled).

log() {
    local msg=$1
    echo "$(date -u): ${msg}"
}

usage() {
    echo >&2 "Usage: $0 NAME BUNDLE_NAME ROLE_LOCAL [ROLE_ANYWHERE] [HOST] [TIMEOUT]"
    exit 1
}

pacemaker_supports_promoted() {
    # The Promoted token is only matched in recent pacemaker versions
    grep -wq "Promoted" /usr/share/pacemaker/resources-*.rng
}

#
# Utility functions to detect stuck resources
#

bundle_failures_locally() {
    local host=$1
    local engine=$BUNDLE_CONTAINER_ENGINE
    local replicas=$BUNDLE_REPLICAS
    local last=$(($replicas - 1))
    local replica_name
    for i in $(seq 0 $last); do
        replica_name=${BUNDLE_NAME}-${engine}-${i}
        crm_failcount -q -G -r $replica_name -N $host
    done
}

bundle_failures_globally() {
    local engine=$BUNDLE_CONTAINER_ENGINE
    local replicas=$BUNDLE_REPLICAS
    local last=$(($replicas - 1))
    for i in $(seq 0 $last); do
        crm_failcount -q -G -r ${BUNDLE_NAME}-${engine}-${i}
    done
}

bundle_running_globally() {
    local engine=$BUNDLE_CONTAINER_ENGINE
    # return the number of running bundle replicas, i.e. the number of
    # docker/podman resource replicas currently running in the cluster
    crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:heartbeat:${engine}']/node)" -
}

ocf_failures_globally() {
    local replicas=$BUNDLE_REPLICAS
    local last=$(($replicas - 1))
    local bundle_node
    for i in $(seq 0 $last); do
        bundle_node=${BUNDLE_NAME}-${i}
        crm_failcount -q -G -r $NAME -N $bundle_node
    done
}

did_resource_failed_locally() {
    local failures
    local running
    local remotehost
    if [ "${NAME}" != "${BUNDLE_NAME}" ]; then
        # if we're dealing with an ocf resource, it is running on a
        # pacemaker_remote rather than on the real host, and the
        # failcounts are thus associated with the pcmk remote. Replace
        # the host's name with the pcmk remote's name.
        remotehost=$(crm_mon --as-xml | xmllint --xpath "string(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource/node[@name='${HOST}']/../../resource[@resource_agent='${OCF}:pacemaker:remote']/@id)" -)
        if [ -n "${remotehost}" ]; then
            crm_failcount -q -G -r $NAME -N $remotehost | grep -q -w INFINITY
            return $?
        fi
        # If no pcmk remote is currently running, the failcount from
        # the ocf resource is useless; fall back to the bundle
        # failcount computed below.
    fi
    # for bundles, pacemaker can run any bundle replica locally
    # (e.g. galera-bundle-docker-{0,1,2}), and a failure happens when
    # there are no more replicas to try.
    # That is, when _at least_ one replica failed locally, and all the
    # others either failed or are currently running elsewhere.
    failures=$(bundle_failures_locally $HOST | grep -c -w INFINITY)
    running=$(bundle_running_globally)
    test $failures -gt 0 && \
        test $(( $failures + $running )) -ge $BUNDLE_REPLICAS
}
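
# Worked example of the arithmetic above, with illustrative numbers
# (not taken from a real cluster): assume BUNDLE_REPLICAS=3. If
# galera-bundle-podman-0 has a failcount of INFINITY on this host
# (failures=1) and the two other replicas are running on other hosts
# (running=2), then failures > 0 and failures + running = 3 >= 3:
# pacemaker has no replica left to try locally, so the resource is
# considered failed locally. If instead only one other replica were
# running (running=1), 1 + 1 = 2 < 3 and pacemaker could still try a
# replica locally, so we would keep waiting.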

did_resource_failed_globally() {
    local remotecount
    local failures
    if [ "${NAME}" != "${BUNDLE_NAME}" ]; then
        # we check the state of an ocf resource only if the
        # pcmkremotes are started
        remotecount=$(crm_mon --as-xml | xmllint --xpath "count(//resources/bundle[@id='${BUNDLE_NAME}']/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node)" -)
        if [ "${remotecount}" = "0" ]; then
            # no pcmkremote is running, so check the bundle state
            # instead of checking the ocf resource:
            # the bundle failed if all ${BUNDLE_REPLICAS} replicas failed
            failures=$(bundle_failures_globally | grep -c -w INFINITY)
            test $failures -eq $BUNDLE_REPLICAS
        else
            # the ocf resource failed if it failed to start on
            # all $BUNDLE_REPLICAS bundle nodes
            failures=$(ocf_failures_globally | grep -c -w INFINITY)
            test $failures -eq $BUNDLE_REPLICAS
        fi
    else
        # the bundle failed if all ${BUNDLE_REPLICAS} replicas failed
        failures=$(bundle_failures_globally | grep -c -w INFINITY)
        test $failures -eq $BUNDLE_REPLICAS
    fi
}

# Input validation
#
NAME=$1
if [ -z "${NAME}" ]; then
    echo >&2 "Error: argument NAME must not be empty"
    exit 1
fi

BUNDLE_NAME=$2
if [ -z "${BUNDLE_NAME}" ]; then
    echo >&2 "Error: argument BUNDLE_NAME must not be empty"
    exit 1
fi

ROLE_LOCAL=$3
if [ "${ROLE_LOCAL}" = "_" ]; then
    ROLE_LOCAL=""
fi

ROLE_ANYWHERE=$4
if [ "${ROLE_ANYWHERE}" = "_" ]; then
    ROLE_ANYWHERE=""
fi

if [ -z "${ROLE_LOCAL}" ]; then
    if [ -z "${ROLE_ANYWHERE}" ]; then
        echo >&2 "Error: either ROLE_LOCAL or ROLE_ANYWHERE must be non-empty"
        exit 1
    fi
elif ! echo "${ROLE_LOCAL}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"; then
    echo >&2 "Error: argument ROLE_LOCAL must be one of 'Started', 'Slave', 'Master', 'Unpromoted' or 'Promoted'"
    exit 1
fi

if [ -n "${ROLE_ANYWHERE}" ] && ! echo "${ROLE_ANYWHERE}" | grep -q -x -E "(Started|Slave|Master|Unpromoted|Promoted)"; then
    echo >&2 "Error: argument ROLE_ANYWHERE must be one of 'Started', 'Slave', 'Master', 'Unpromoted' or 'Promoted'"
    exit 1
fi

# Ensure compatibility with pacemaker 2.1: starting with 2.1, roles are
# reported as Promoted/Unpromoted rather than Master/Slave, and resource
# agents are reported as "ocf:heartbeat:..." rather than
# "ocf::heartbeat:...", hence the ${OCF} prefix used in XPath matches.
if pacemaker_supports_promoted; then
    ROLE_LOCAL=$(echo "$ROLE_LOCAL" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
    ROLE_ANYWHERE=$(echo "$ROLE_ANYWHERE" | sed -e 's/Master/Promoted/' -e 's/Slave/Unpromoted/')
    OCF="ocf"
else
    OCF="ocf:"
fi

HOST=${5:-$(facter hostname)}
TIMEOUT=${6:-__PCMKTIMEOUT__}

# Configure the search
# ----
# Note: we can't use crm_resource in all searches because we can't
# easily extract the host the OCF resources run on (crm_resource
# returns the pcmk-remote nodes rather than the hosts).
# So instead, we implement the various searches with XPath directly.
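
# For reference, crm_mon --as-xml describes bundles with a structure
# along these lines (abridged and illustrative; the exact attributes
# vary across pacemaker versions):
#
#   <resources>
#     <bundle id="galera-bundle" type="podman" ...>
#       <replica id="0">
#         <resource id="galera-bundle-podman-0" resource_agent="${OCF}:heartbeat:podman" role="Started" ...>
#           <node name="controller-0" .../>
#         </resource>
#         <resource id="galera-bundle-0" resource_agent="${OCF}:pacemaker:remote" role="Started" ...>
#           <node name="controller-0" .../>
#         </resource>
#         <resource id="galera" resource_agent="${OCF}:heartbeat:galera" role="Master" ...>
#           <node name="galera-bundle-0" .../>
#         </resource>
#       </replica>
#     </bundle>
#   </resources>
#
# The XPath patterns below walk this tree: the pcmk remote's <node>
# element ties a replica to the host it runs on, and a resource's
# state is read from its role="..." and blocked="..." attributes.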
if [ "${BUNDLE_NAME}" != "${NAME}" ]; then # ocf resource local_resource_xpath="//bundle/replica/resource[@resource_agent='${OCF}:pacemaker:remote']/node[@name='${HOST}']/../../resource[@id='${NAME}']" any_resource_xpath="//bundle//resource[@id='${NAME}']" replicas_xpath="//bundle/primitive[@id='${BUNDLE_NAME}']/../*[boolean(@image) and boolean(@replicas)]" else # bundle resource local_resource_xpath="//bundle[@id='${NAME}']/replica/resource/node[@name='${HOST}']/../../resource" any_resource_xpath="//bundle[@id='${NAME}']//resource" replicas_xpath="//bundle[@id='${BUNDLE_NAME}']/*[boolean(@image) and boolean(@replicas)]" fi bundle_def_xpath="//bundle[@id='${BUNDLE_NAME}']/*[boolean(@image) and boolean(@replicas)]" BUNDLE_CONTAINER_ENGINE=$(cibadmin -Q | xmllint --xpath "name(${bundle_def_xpath})" -) BUNDLE_REPLICAS=$(cibadmin -Q | xmllint --xpath "string(${bundle_def_xpath}/@replicas)" -) # The wait algorithm follows a two-stage approach # 1. Depending on how the script is called, we first check whether # the resource is restarted locally. An A/P resource may be # restarted elsewhere in the cluster. # 2. If needed, check whether the A/P resource has restarted # elsewhere. For A/P M/S resources, in case the resource is # restarted as Slave locally, ensure a Master is available. success=1 bailout=1 timeout=$TIMEOUT role="" # Stage 1: local check if [ -n "$ROLE_LOCAL" ]; then log "Waiting until ${NAME} has restarted on ${HOST} and is in state ${ROLE_LOCAL}" log "Will probe resource state with the following XPath pattern: ${local_resource_xpath}" while [ $timeout -gt 0 ] && [ $bailout -ne 0 ] && [ $success -ne 0 ]; do resource=$(crm_mon -r --as-xml | xmllint --xpath "${local_resource_xpath}" - 2>/dev/null) role=$(echo "${resource}" | sed -ne 's/.*\Wrole="\([^"]*\)".*/\1/p') if [ "$(crm_resource --meta -r ${NAME} -g is-managed 2>/dev/null)" = "false" ]; then log "${NAME} is unmanaged, will never reach target role. Bailing out" bailout=0 continue elif [ "$(crm_resource --meta -r ${NAME} -g target-role 2>/dev/null)" = "Stopped" ]; then log "${NAME} is disabled, will never reach target role. Bailing out" bailout=0 continue elif echo "${resource}" | grep -q -w "\Wblocked=\"true\""; then log "${NAME} is blocked, will never reach target role. Bailing out" bailout=0 continue elif did_resource_failed_locally; then log "${NAME} is in failed state, will never reach target role. Bailing out" bailout=0 continue elif [ "$role" = "$ROLE_LOCAL" ]; then success=0 continue elif [ -n "$ROLE_ANYWHERE" ] && [ "$role" = "$ROLE_ANYWHERE" ]; then # A/P: we are restarted in the expected state success=0 continue else log "Waiting for ${NAME} to transition to role ${ROLE_LOCAL} on ${HOST}" fi if [ $bailout -ne 0 ] && [ $success -ne 0 ]; then sleep 4 timeout=$((timeout-4)) fi done fi # Stage 2: global check if [ $timeout -gt 0 ] && [ -n "$ROLE_ANYWHERE" ] && [ "$role" != "$ROLE_ANYWHERE" ]; then log "Wait until ${NAME} is restarted anywhere in the cluster in state ${ROLE_ANYWHERE}" log "Will probe resource state with the following XPath pattern: ${any_resource_xpath}" success=1 bailout=1 while [ $timeout -gt 0 ] && [ $bailout -ne 0 ] && [ $success -ne 0 ]; do resources=$(crm_mon -r --as-xml | xmllint --xpath "${any_resource_xpath}" - 2>/dev/null) if [ "$(crm_resource --meta -r ${NAME} -g is-managed 2>/dev/null)" = "false" ]; then log "${NAME} is unmanaged, will never reach target role. 
Bailing out" bailout=0 continue elif [ "$(crm_resource --meta -r ${NAME} -g target-role 2>/dev/null)" = "Stopped" ]; then log "${NAME} is disabled, will never reach target role. Bailing out" bailout=0 continue elif ! (echo "${resources}" | grep -q -w "\Wblocked=\"false\""); then log "${NAME} blocked, will never reach target role. Bailing out" bailout=0 continue elif did_resource_failed_globally; then log "${NAME} is in failed state, will never reach target role. Bailing out" bailout=0 continue elif echo "${resources}" | grep -q -w "\Wrole=\"${ROLE_ANYWHERE}\""; then success=0 continue else log "Waiting for ${NAME} to transition to role ${ROLE_ANYWHERE} anywhere in the cluster" fi if [ $bailout -ne 0 ] && [ $success -ne 0 ]; then sleep 4 timeout=$((timeout-4)) fi done fi if [ $timeout -le 0 ]; then log "Timeout reached after ${TIMEOUT}s while waiting for ${NAME} to be restarted" elif [ $bailout -le 0 ]; then log "Restart monitoring for ${NAME} cancelled" fi if [ $success -eq 0 ]; then log "${NAME} successfully restarted" else log "${NAME} was not restarted properly" fi # Don't block minor update or stack update if the wait was unsuccessful exit 0