#!/bin/bash
#
# Copyright (c) 2016 Wind River Systems, Inc.
#
# SPDX-License-Identifier: Apache-2.0
#

#
# The patching subsystem provides a patch-functions bash source file
# with useful function and variable definitions.
#
. /etc/patching/patch-functions

#
# We can now check to see what type of node we're on, if it's locked, etc,
# and act accordingly
#

#
# Declare an overall script return code
#
declare -i GLOBAL_RC=$PATCH_STATUS_OK

#
# Handle restarting Cinder services
#

# Syntax: ("<service> <timeout> <initialize_interval> <after_timeout>" ...)
SERVICES=("cinder-volume 30 30 kill"
          "cinder-scheduler 50 20 kill"
          "cinder-api 30 20 kill")
# where:
#    <service>             = name of executable file reported by ps
#    <timeout>             = how much to wait for the process to gracefully shutdown
#    <initialize_interval> = how much to wait before the new process can be
#                            considered available.
#    <after_timeout>       = either 'kill' the process with SIGKILL or 'leave' it running
#                            the idea is to avoid leaving behind processes that are
#                            degraded, better have new ones re-spawn
#
# The processes are restarted by sm by running the ocf scripts at
# /usr/lib/ocf/resource.d/openstack/cinder-*. These scripts have a very good service init
# monitoring routine. We just have to make sure that they don't hang while restarting.
# The values are taken from SM, but we don't wait for any retry.
# Note: cinder-volume timeout is set to 180 secs in sm which is too much for our controlled
# restart

# Print the PID recorded for a service in its resource-agents pid file.
# Arguments: $1 - service name (pid file is /var/run/resource-agents/<name>.pid)
# Outputs:   the pid file content on stdout; an empty line when the file
#            is missing or unreadable
function get_pid {
    local service=$1
    local pid_file="/var/run/resource-agents/${service}.pid"
    local pid=""

    if [[ -r "$pid_file" ]]; then
        pid=$(cat "$pid_file")
    fi
    echo "$pid"
}

# Succeed when sm-query reports the service as 'enabled-active'.
# Arguments: $1 - service name
function service_is_active {
    sm-query service "$1" | grep -q 'enabled-active'
}

if is_controller; then
    # Cinder services only run on the controller

    if [[ ! -f "$PATCH_FLAGDIR/cinder.restarted" ]]; then
        touch "$PATCH_FLAGDIR/cinder.restarted"
        # cinder-volume uses this to know that its new process was restarted by in-service patching
        touch "$PATCH_FLAGDIR/cinder.restarting"

        for s in "${SERVICES[@]}"; do
            # Split the table entry into its four whitespace separated fields
            read -r service timeout initialize_interval after_timeout <<< "$s"
            new_process="false"

            # Check SM to see if service is running; nothing to restart otherwise
            if ! service_is_active "$service"; then
                continue
            fi

            loginfo "$0: Restarting $service"

            # Get PID
            PID=$(get_pid "$service")

            # Send restart signal to process (only if we know which one it is)
            if [[ -n "$PID" ]]; then
                kill -s TERM "$PID"
            else
                loginfo "$0: No PID recorded for $service, unable to signal it"
            fi

            # Wait up to $timeout seconds for service to gracefully recover
            UNTIL=$(( SECONDS + timeout ))
            while (( UNTIL >= SECONDS )); do
                # Check to see if we have a new process. An empty value only
                # means the pid file is gone, not that a new process is up.
                NEW_PID=$(get_pid "$service")
                if [[ -n "$NEW_PID" && "$PID" != "$NEW_PID" ]]; then
                    # We have a new process
                    new_process="true"
                    break
                fi

                # Still old process? Let's wait 5 seconds and check again
                sleep 5
            done

            # Do a hard restart of the process if we still have the old one
            NEW_PID=$(get_pid "$service")
            if [[ -n "$PID" && "$PID" == "$NEW_PID" ]]; then
                # we have the old process still running!
                if [[ "$after_timeout" == "kill" ]]; then
                    loginfo "$0: Old process of $service failed to gracefully terminate in $timeout, killing it!"
                    # kill the old process
                    kill -s KILL "$PID"
                    # wait for a new process to be restarted by sm
                    UNTIL=$(( SECONDS + 10 ))
                    while (( UNTIL >= SECONDS )); do
                        sleep 1
                        # Check to see if we have a new process
                        NEW_PID=$(get_pid "$service")
                        if [[ -n "$NEW_PID" && "$PID" != "$NEW_PID" ]]; then
                            loginfo "$0: New process of $service started"
                            new_process="true"
                            break
                        fi
                    done
                fi
            fi

            # Wait for the new process to complete initialisation
            if [[ "$new_process" == "true" ]]; then
                UNTIL=$(( SECONDS + initialize_interval ))
                while (( UNTIL >= SECONDS )); do
                    # Note: Services are restarted by sm which runs the ocf start script.
                    # Sm reports enabled-active only *after* those scripts return success
                    if service_is_active "$service"; then
                        loginfo "$0: New process of $service started correctly"
                        break
                    fi
                    sleep 1
                done
            fi

            if ! service_is_active "$service"; then
                # Still not running! Clear the flag and mark the RC as failed.
                # The flag created above is cinder.restarted, so that is the
                # one to remove for a later run to retry the restart.
                loginfo "$0: Failed to restart $service"
                rm -f "$PATCH_FLAGDIR/cinder.restarted"
                GLOBAL_RC=$PATCH_STATUS_FAILED
                sm-query service "$service"
                break
                # Note: break if any process in the SERVICES list fails
            fi
        done
    fi
fi

#
# Exit the script with the overall return code
#
rm -f "$PATCH_FLAGDIR/cinder.restarting"
exit $GLOBAL_RC