#!/bin/bash
#
# Authors:  Bartosz Kupidura (Mirantis): Rewrite RA to support mysql/galera
#           Sergii Golovatiuk (Mirantis): Rewrite RA to support mysql/galera
#           Alan Robertson: DB2 Script
#           Jakub Janczak: rewrite as MySQL
#           Andrew Beekhof: cleanup and import
#           Sebastian Reitenbach: add OpenBSD defaults, more cleanup
#           Narayan Newton: add Gentoo/Debian defaults
#           Marian Marinov, Florian Haas: add replication capability
#           Yves Trudeau, Baron Schwartz: add VIP support and improve replication
#
# Support:  openstack@lists.launchpad.net
# License:  GNU General Public License (GPL)
#
# (c) 2002-2005 International Business Machines, Inc.
#     2005-2010 Linux-HA contributors
#     2014 Mirantis Inc.

#######################################################################
# Initialization:

: "${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}"
. "${OCF_FUNCTIONS_DIR}/ocf-shellfuncs"

#######################################################################
# Fill in some defaults if no values are specified

OCF_RESKEY_binary_default="/usr/bin/mysqld_safe"
OCF_RESKEY_client_binary_default="/usr/bin/mysql"
OCF_RESKEY_config_default="/etc/my.cnf"
OCF_RESKEY_datadir_default="/var/lib/mysql"
OCF_RESKEY_user_default="mysql"
OCF_RESKEY_group_default="mysql"
OCF_RESKEY_pid_default="${HA_RSCTMP}/${__SCRIPT_NAME}/${__SCRIPT_NAME}.pid"
OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock"
OCF_RESKEY_test_user_default="root"
OCF_RESKEY_test_passwd_default=""
OCF_RESKEY_additional_parameters_default=""
OCF_RESKEY_master_timeout_default="300"

: "${HA_LOGTAG="ocf-mysql-wss"}"
: "${HA_LOGFACILITY="daemon"}"

# "${VAR=default}" (no colon) only fills in variables that are unset; a
# parameter explicitly set to the empty string is left empty.
: "${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}"
MYSQL_BINDIR="$(dirname "${OCF_RESKEY_binary}")"

: "${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}}"
: "${OCF_RESKEY_config=${OCF_RESKEY_config_default}}"
: "${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}}"
: "${OCF_RESKEY_user=${OCF_RESKEY_user_default}}"
: "${OCF_RESKEY_group=${OCF_RESKEY_group_default}}"
: "${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}"
: "${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}}"
: "${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}}"
: "${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}}"
: "${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}}"
: "${OCF_RESKEY_master_timeout=${OCF_RESKEY_master_timeout_default}}"

#######################################################################
# Convenience variables

MYSQL=$OCF_RESKEY_client_binary
HOSTNAME=$(uname -n)
# Both option strings are expanded unquoted at the call sites so that they
# split into individual client arguments.
MYSQL_OPTIONS_LOCAL="-S $OCF_RESKEY_socket --connect_timeout=10"
MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd"

#######################################################################

# Print a short usage summary on stdout.
# NOTE(review): the original here-document text was not recoverable; the
# wording below lists the actions handled by this script's dispatcher and
# should be checked against the project's canonical copy.
usage() {
    cat <<UEND
usage: $0 (start|stop|monitor|validate-all|meta-data)

$0 manages a MySQL/Galera database as an HA resource.

The 'start' operation starts the database.
The 'stop' operation stops the database.
The 'monitor' operation reports whether the database seems to be working.
The 'validate-all' operation reports whether the parameters are valid.
UEND
}

# Print the OCF resource-agent meta-data XML on stdout.
# The description texts are the ones that survived in the file; parameter
# names and defaults come from the OCF_RESKEY_* variables defined above.
# NOTE(review): the XML markup itself (agent name, unique/required flags,
# content types and the action timeouts/interval) had to be reconstructed and
# must be confirmed. The start timeout is documented as having to exceed
# master_timeout.
meta_data() {
    cat <<END
<?xml version="1.0"?>
<!DOCTYPE resource-agent SYSTEM "ra-api-1.dtd">
<resource-agent name="mysql-wss">
<version>0.1</version>

<longdesc lang="en">
Resource script for MySQL
</longdesc>
<shortdesc lang="en">Resource script for MySQL</shortdesc>

<parameters>

<parameter name="binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL server binary
</longdesc>
<shortdesc lang="en">MySQL server binary</shortdesc>
<content type="string" default="${OCF_RESKEY_binary_default}" />
</parameter>

<parameter name="client_binary" unique="0" required="0">
<longdesc lang="en">
Location of the MySQL client binary
</longdesc>
<shortdesc lang="en">MySQL client binary</shortdesc>
<content type="string" default="${OCF_RESKEY_client_binary_default}" />
</parameter>

<parameter name="config" unique="0" required="0">
<longdesc lang="en">
Configuration file
</longdesc>
<shortdesc lang="en">MySQL config</shortdesc>
<content type="string" default="${OCF_RESKEY_config_default}" />
</parameter>

<parameter name="datadir" unique="0" required="0">
<longdesc lang="en">
Directory containing databases
</longdesc>
<shortdesc lang="en">Data directory</shortdesc>
<content type="string" default="${OCF_RESKEY_datadir_default}" />
</parameter>

<parameter name="user" unique="0" required="0">
<longdesc lang="en">
User running MySQL daemon
</longdesc>
<shortdesc lang="en">MySQL user</shortdesc>
<content type="string" default="${OCF_RESKEY_user_default}" />
</parameter>

<parameter name="group" unique="0" required="0">
<longdesc lang="en">
Group running MySQL daemon (for logfile and directory permissions)
</longdesc>
<shortdesc lang="en">MySQL group</shortdesc>
<content type="string" default="${OCF_RESKEY_group_default}" />
</parameter>

<parameter name="pid" unique="0" required="0">
<longdesc lang="en">
The pidfile to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL pid file</shortdesc>
<content type="string" default="${OCF_RESKEY_pid_default}" />
</parameter>

<parameter name="socket" unique="0" required="0">
<longdesc lang="en">
The socket to be used for mysqld.
</longdesc>
<shortdesc lang="en">MySQL socket</shortdesc>
<content type="string" default="${OCF_RESKEY_socket_default}" />
</parameter>

<parameter name="test_user" unique="0" required="0">
<longdesc lang="en">
MySQL test user, must have select privilege on 'show status'
</longdesc>
<shortdesc lang="en">MySQL test user</shortdesc>
<content type="string" default="${OCF_RESKEY_test_user_default}" />
</parameter>

<parameter name="test_passwd" unique="0" required="0">
<longdesc lang="en">
MySQL test user password
</longdesc>
<shortdesc lang="en">MySQL test user password</shortdesc>
<content type="string" default="${OCF_RESKEY_test_passwd_default}" />
</parameter>

<parameter name="additional_parameters" unique="0" required="0">
<longdesc lang="en">
Additional parameters which are passed to the mysqld on startup.
(e.g. --skip-external-locking or --skip-grant-tables)
</longdesc>
<shortdesc lang="en">Additional parameters to pass to mysqld</shortdesc>
<content type="string" default="${OCF_RESKEY_additional_parameters_default}" />
</parameter>

<parameter name="master_timeout" unique="0" required="0">
<longdesc lang="en">
How long we should wait for galera master.
If master not come up before timeout, RA will choose new master from already running nodes.
This value can by changed by crm_attribute:
# crm_attribute --name galera_master_timeout --update 500
Remember to remove this after maintenance.
USE WITH CAUTION!
Remember to change timeout for start operation.
Start timeout should be bigger than master_timeout
</longdesc>
<shortdesc lang="en">Galera master timeout</shortdesc>
<content type="integer" default="${OCF_RESKEY_master_timeout_default}" />
</parameter>

</parameters>

<actions>
<action name="start" timeout="475" />
<action name="stop" timeout="175" />
<action name="monitor" depth="0" timeout="30" interval="20" />
<action name="validate-all" timeout="5" />
<action name="meta-data" timeout="5" />
</actions>
</resource-agent>
END
}

# Convenience functions
#######################################################################

# Print the nodes reported by `crm_node --partition` as one space-separated
# line (an empty line when there are none).
nodes_in_cluster_online() {
    local nodes
    nodes=$(crm_node --partition | sed -e '/(null)/d')
    # Unquoted on purpose: folds newlines/blank runs into single spaces.
    # shellcheck disable=SC2086
    echo $nodes
}

# Print the second column of every `crm_node --list` row that starts with an
# alphanumeric character, as one space-separated line.
nodes_in_cluster() {
    local nodes
    #Ubuntu doesn't like \w
    nodes=$(crm_node --list | awk '/^[a-zA-Z0-9]/ {print $2}' | sed -e '/(null)/d')
    # Unquoted on purpose: folds newlines/blank runs into single spaces.
    # shellcheck disable=SC2086
    echo $nodes
}

# Print the value (last output column) of one server status variable queried
# from the local instance with the test credentials.
#   $1 - status variable name, e.g. wsrep_ready
get_wsrep_status() {
    # $MYSQL_OPTIONS_TEST must split into separate client options.
    # shellcheck disable=SC2086
    "$MYSQL" $MYSQL_OPTIONS_TEST -s -N \
        -e "SHOW STATUS LIKE '$1'" | awk '{print $NF}'
}

#Validate if GTID have correct format (return 0), else return 1
#valid values are:
#XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX:123 - standard cluster-id:commit-id
#XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX:-1 - standard non initialized cluster, 00000000-0000-0000-0000-000000000000:-1
#
# The whole string must match: the commit-id part is later compared with
# integer operators, so trailing garbage is rejected here.
validate_gtid() {
    local gtid=$1
    local status_loglevel="err"
    local gtid_re='^[[:xdigit:]]{8}(-[[:xdigit:]]{4}){3}-[[:xdigit:]]{12}:(-1|[[:digit:]]+)$'

    if [[ -z "$gtid" ]]; then
        ocf_log "$status_loglevel" "No GTID provided"
        return 1
    fi

    if [[ "$gtid" =~ $gtid_re ]]; then
        ocf_log info "GTID OK: $gtid"
        return 0
    fi

    ocf_log "$status_loglevel" "GTID have wrong format: $gtid"
    return 1
}

#Get galera GTID from local mysql instance
#
# Source of the GTID, in order:
#   1. the running server (wsrep_local_state_uuid:wsrep_last_committed);
#   2. grastate.dat in the data directory when the server is not running;
#   3. `<binary> --wsrep-recover` when the server is not running and
#      grastate.dat did not yield both a uuid and a seqno.
# A valid GTID is stored in the node attribute "gtid" (lifetime reboot); an
# invalid one causes the attribute to be deleted.
update_node_gtid() {
    local status_loglevel="err"
    local gtid=""
    local cluster_id commit_id
    local grastate="${OCF_RESKEY_datadir}/grastate.dat"

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    if mysql_status "$status_loglevel" 1; then
        cluster_id=$(get_wsrep_status wsrep_local_state_uuid)
        commit_id=$(get_wsrep_status wsrep_last_committed)
        gtid="${cluster_id}:${commit_id}"
    else
        if [[ -r "$grastate" ]]; then
            gtid=$(awk '/uuid/ { uuid = $NF } /seqno/ { seqno = $NF } END { print uuid":"seqno }' "$grastate")
        fi

        #Final try to recover
        # awk always prints at least ":", so "empty" has to mean "uuid or
        # seqno missing". Recovery is attempted only while the server is
        # down, never against a running instance.
        if [[ "$gtid" != ?*:?* ]]; then
            gtid=$("${OCF_RESKEY_binary}" --wsrep-recover \
                --log-error=/dev/stdout 2>&1 \
                | grep 'Recovered position' | awk '{print $NF}')
        fi
    fi

    if validate_gtid "$gtid"; then
        ocf_log info "Galera GTID: ${gtid}"
        crm_attribute --quiet --node "$HOSTNAME" --lifetime reboot --name gtid \
            --update "$gtid"
    else
        ocf_log info "Wrong GTID: ${gtid}, Removing"
        crm_attribute --quiet --node "$HOSTNAME" --lifetime reboot --name gtid -D
    fi
}

# Print the number of seconds to wait for a galera master: the cluster
# attribute galera_master_timeout when it holds a non-negative integer,
# otherwise OCF_RESKEY_master_timeout.
get_master_timeout() {
    local default=$OCF_RESKEY_master_timeout
    local timeout

    timeout=$(crm_attribute --quiet --name galera_master_timeout \
        --query -d "$default" | sed -e '/(null)/d')

    # The pipeline status is sed's, so judge success by the value itself.
    if [[ "$timeout" =~ ^[0-9]+$ ]]; then
        echo "$timeout"
    else
        echo "$default"
    fi
}

#Get gtid attribute for $1 node, "0" means no GTID set or wrong format for GTID
get_node_gtid() {
    local node=$1
    local gtid

    gtid=$(crm_attribute --quiet --node "$node" --lifetime reboot --query \
        --name gtid 2>/dev/null | sed -e '/(null)/d')

    if [[ -n "$gtid" ]] && validate_gtid "$gtid"; then
        ocf_log info "Galera GTID: ${gtid}"
        echo "$gtid"
    else
        ocf_log info "No GTID for $node"
        echo 0
    fi
}

# Decide whether a new master election is required.
# Returns 1 (election needed) when this partition reports quorum via
# `crm_node -q`, or the cluster lists exactly one node, AND crm_resource
# locates no running instance of this resource. Returns 0 otherwise.
check_if_reelection_needed() {
    local has_quorum resource_name node_count running_instances

    has_quorum=$(crm_node -q | sed -e '/(null)/d')
    # Strip the clone instance suffix ("name:N" -> "name").
    resource_name=${OCF_RESOURCE_INSTANCE%%:*}
    node_count=$(nodes_in_cluster | wc -w)

    if [[ "$has_quorum" == "1" ]] || [ "$node_count" -eq 1 ]; then
        running_instances=$(crm_resource \
            --quiet --locate --resource "$resource_name" \
            | sed -e '/(null)/d' | wc -l)
        if [ "$running_instances" -lt 1 ]; then
            return 1
        fi
    fi
    return 0
}

# Pick one node out of the space-separated list in $1 and print it.
# The node whose "name\n" md5 digest sorts first wins. An empty list prints
# an empty line.
choose_master() {
    local nodes=$1
    local node node_id master
    local -A node_by_hash=()

    for node in $nodes; do
        # The trailing newline is part of the hashed input; keep it so the
        # digest stays the same as the one produced by `echo name | md5sum`.
        node_id=$(printf '%s\n' "$node" | md5sum | awk '{print $1}')
        node_by_hash[$node_id]=$node
    done

    if [ "${#node_by_hash[@]}" -eq 0 ]; then
        ocf_log info "Choosed master: "
        echo
        return 0
    fi

    master=$(printf -- '%s\n' "${!node_by_hash[@]}" | LC_ALL=C sort | head -1)
    ocf_log info "Choosed master: ${node_by_hash[$master]}"
    echo "${node_by_hash[$master]}"
}

# Print, space-separated, those of the given nodes whose stored GTID carries
# the highest commit id. Nodes without a usable GTID count as commit id 0.
get_possible_masters() {
    local nodes=$*
    local possible_masters=""
    local -A seqno_of=()
    local master_seqno gtid node

    for node in $nodes; do
        gtid=$(get_node_gtid "$node")
        # Part after the last ':' (the whole value when there is no ':').
        seqno_of[$node]=${gtid##*:}
    done

    if [ "${#seqno_of[@]}" -gt 0 ]; then
        # Numeric sort: commit ids are integers, and lexically "9" > "10".
        master_seqno=$(printf -- '%s\n' "${seqno_of[@]}" | sort -n -r | head -1)
        for node in $nodes; do
            if [ "$master_seqno" -eq "${seqno_of[$node]}" ]; then
                possible_masters="$possible_masters $node"
            fi
        done
    fi

    ocf_log info "Possible masters: $possible_masters"
    echo "${possible_masters# }"
}

# Wait up to the master timeout to learn whether this node must bootstrap
# the cluster.
# Returns 1 when this host is the chosen master (it is the Primary
# Component); returns 0 when another node is already running or the timeout
# expired. Polls every 10 seconds.
check_if_galera_pc() {
    local nodes masters master
    local timeout

    timeout=$(get_master_timeout)
    ocf_log info "Checking if Primary Component"

    while [ "$timeout" -gt 0 ]; do
        nodes=$(nodes_in_cluster_online)
        masters=$(get_possible_masters "$nodes")
        master=$(choose_master "$masters")

        if [[ "$master" == "$HOSTNAME" ]]; then
            ocf_log info "I'm Primary Component. Join me!"
            return 1
        fi

        if check_if_reelection_needed; then
            ocf_log info "My neighbour is Primary Component"
            return 0
        fi

        sleep 10
        timeout=$(( timeout - 10 ))
        ocf_log info "Waiting for master. ${timeout} seconds left"
    done

    ocf_log info "${HOSTNAME} is not Primary Component"
    return 0
}

# Functions invoked by resource manager actions

# Check that the binaries, config file, data directory, user and group named
# by the resource parameters exist.
# Returns OCF_SUCCESS, or OCF_ERR_INSTALLED for the first missing item.
mysql_validate() {
    check_binary "$OCF_RESKEY_binary"
    check_binary "$OCF_RESKEY_client_binary"

    if [[ ! -f "$OCF_RESKEY_config" ]]; then
        ocf_log err "Config $OCF_RESKEY_config doesn't exist"
        return $OCF_ERR_INSTALLED
    fi

    if [[ ! -d "$OCF_RESKEY_datadir" ]]; then
        ocf_log err "Datadir $OCF_RESKEY_datadir doesn't exist"
        return $OCF_ERR_INSTALLED
    fi

    if ! getent passwd "$OCF_RESKEY_user" >/dev/null 2>&1; then
        ocf_log err "User $OCF_RESKEY_user doesn't exist"
        return $OCF_ERR_INSTALLED
    fi

    if ! getent group "$OCF_RESKEY_group" >/dev/null 2>&1; then
        ocf_log err "Group $OCF_RESKEY_group doesn't exist"
        return $OCF_ERR_INSTALLED
    fi

    return $OCF_SUCCESS
}

# Report whether the process named in the pid file is alive.
#   $1 - ocf_log level used for the "not running" message
#   $2 - how many times to look for the pid file (default 3)
#   $3 - seconds to sleep between looks (default 5)
# Returns OCF_SUCCESS when the process exists, OCF_NOT_RUNNING when the pid
# file never appeared, OCF_ERR_GENERIC when the pid file exists but the
# process does not.
mysql_status() {
    local loglevel=$1
    local retries=${2:-3}
    local sleeptime=${3:-5}
    local pid

    while [ "$retries" -gt 0 ]; do
        if [[ -f "$OCF_RESKEY_pid" ]]; then
            break
        fi
        sleep "$sleeptime"
        retries=$(( retries - 1 ))
        ocf_log info "PIDFile ${OCF_RESKEY_pid} of MySQL server not found. Sleeping for $sleeptime seconds. ${retries} retries left"
    done

    if [ "$retries" -eq 0 ]; then
        ocf_log "$loglevel" "MySQL is not running"
        return $OCF_NOT_RUNNING
    fi

    pid=$(cat "$OCF_RESKEY_pid" 2>/dev/null)
    # Prefer /proc when it is mounted; otherwise probe with signal 0.
    if [[ -d /proc && -d /proc/1 ]]; then
        [[ -n "$pid" && -d "/proc/$pid" ]]
    else
        [[ -n "$pid" ]] && kill -s 0 "$pid" >/dev/null 2>&1
    fi

    if [ $? -eq 0 ]; then
        return $OCF_SUCCESS
    fi

    ocf_log "$loglevel" "MySQL is not running"
    return $OCF_ERR_GENERIC
}

# Monitor action.
# Succeeds immediately while <datadir>/sst_in_progress exists. Otherwise the
# process must be alive, wsrep_connected must be ON and, when the local state
# comment matches Synced/Donor/Desync, wsrep_ready must be ON too. The state
# "Initialized" is reported as an error. Also refreshes this node's gtid
# attribute.
mysql_monitor() {
    local rc
    local status_loglevel="err"
    local wsrep_connected wsrep_local_state_comment wsrep_ready

    # Set loglevel to info during probe
    if ocf_is_probe; then
        status_loglevel="info"
    fi

    if [[ -f "${OCF_RESKEY_datadir}/sst_in_progress" ]]; then
        ocf_log info 'SST is in progress'
        return $OCF_SUCCESS
    fi

    mysql_status "$status_loglevel"
    rc=$?
    if [ "$rc" -ne "$OCF_SUCCESS" ]; then
        return "$rc"
    fi

    update_node_gtid

    # A failed client call yields an empty value, which fails the "ON" test.
    wsrep_connected=$(get_wsrep_status wsrep_connected)
    if [[ "$wsrep_connected" != "ON" ]]; then
        return $OCF_ERR_GENERIC
    fi

    wsrep_local_state_comment=$(get_wsrep_status wsrep_local_state_comment)
    #Synced|Donor|Desync
    if [[ "$wsrep_local_state_comment" =~ Synced|Donor|Desync ]]; then
        wsrep_ready=$(get_wsrep_status wsrep_ready)
        #Synced but wsrep not ready
        if [[ "$wsrep_ready" != "ON" ]]; then
            return $OCF_ERR_GENERIC
        fi
    elif [[ "$wsrep_local_state_comment" == 'Initialized' ]]; then
        ocf_log err 'MySQL lost quorum or uninitialized'
        return $OCF_ERR_GENERIC
    fi

    ocf_log debug "MySQL monitor succeeded"
    return $OCF_SUCCESS
}

# Start action.
# Prepares the socket and pid directories, publishes this node's GTID, adds
# --wsrep-new-cluster when this node is elected to bootstrap, launches the
# server in the background and then waits until mysql_status succeeds.
# Exits with OCF_ERR_PERM when a runtime directory is not writable by the
# configured user.
mysql_start() {
    local socket_dir pid_dir dir
    local mysql_extra_params=""
    local rc

    if mysql_status info 1; then
        ocf_log info "MySQL already running"
        return $OCF_SUCCESS
    fi

    socket_dir="$(dirname "${OCF_RESKEY_socket}")"
    if [[ ! -d "${socket_dir}" ]]; then
        ocf_log info "Create socket dir: ${socket_dir} and chown to ${OCF_RESKEY_user}:${OCF_RESKEY_group}"
        mkdir -p "${socket_dir}"
        chown "${OCF_RESKEY_user}:${OCF_RESKEY_group}" "${socket_dir}"
        chmod 755 "${socket_dir}"
    fi

    # check and make PID file dir
    pid_dir="$(dirname "${OCF_RESKEY_pid}")"
    if [[ ! -d "${pid_dir}" ]]; then
        ocf_log info "Create PID dir: ${pid_dir} and chown to ${OCF_RESKEY_user}:${OCF_RESKEY_group}"
        mkdir -p "${pid_dir}"
        chown -R "${OCF_RESKEY_user}:${OCF_RESKEY_group}" "${pid_dir}"
        chmod 755 "${pid_dir}"
    fi

    # Regardless of whether we just created the directory or it
    # already existed, check whether it is writable by the configured
    # user
    for dir in "$pid_dir" "$socket_dir"; do
        if ! su -s /bin/sh - "$OCF_RESKEY_user" -c "test -w '$dir'"; then
            ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user"
            exit $OCF_ERR_PERM
        fi
    done

    if [[ -f /tmp/wsrep-init-file ]]; then
        mysql_extra_params="--init-file=/tmp/wsrep-init-file"
    fi

    update_node_gtid

    # Both helpers signal "yes" with status 1.
    check_if_reelection_needed
    rc=$?
    if [ "$rc" -eq 1 ]; then
        check_if_galera_pc
        rc=$?
        if [ "$rc" -eq 1 ]; then
            mysql_extra_params="$mysql_extra_params --wsrep-new-cluster"
        fi
    fi

    ocf_log info "Starting MySQL"
    # The two parameter strings are expanded unquoted so they split into
    # separate options. The launch status is not checked: "$?" after "&" is
    # always 0, so start-up is verified by the wait loop below instead.
    # shellcheck disable=SC2086
    "${OCF_RESKEY_binary}" \
        --pid-file="$OCF_RESKEY_pid" \
        --socket="$OCF_RESKEY_socket" \
        --datadir="$OCF_RESKEY_datadir" \
        --user="$OCF_RESKEY_user" $OCF_RESKEY_additional_parameters \
        $mysql_extra_params >/dev/null 2>&1 &

    # Spin waiting for the server to come up.
    # Let the CRM/LRM time us out if required.
    until mysql_status info 1; do
        sleep 3
    done

    ocf_log info "MySQL started"
    return $OCF_SUCCESS
}

# Remove the subsys lock file, the socket file and the pid file.
mysql_cleanup() {
    ocf_log debug "Delete lock file: /var/lock/subsys/mysqld"
    rm -f /var/lock/subsys/mysqld
    ocf_log debug "Delete sock file: ${OCF_RESKEY_socket}"
    rm -f "$OCF_RESKEY_socket"
    ocf_log debug "Delete pid file: ${OCF_RESKEY_pid} with content $(cat "${OCF_RESKEY_pid}" 2>/dev/null)"
    rm -f "$OCF_RESKEY_pid"
}

# Stop action.
# Sends SIGTERM to the pid from the pid file, polls for up to 20 seconds,
# then SIGKILLs the server's process group. Always cleans up the lock,
# socket and pid files and always returns OCF_SUCCESS.
mysql_stop() {
    local rc pid pgrp pids
    local shutdown_timeout=20

    if ! mysql_status info 1; then
        mysql_cleanup
        return $OCF_SUCCESS
    fi

    pid=$(cat "$OCF_RESKEY_pid" 2>/dev/null)
    # ps pads the pgid column with blanks; strip them before reuse.
    pgrp=$(ps -o pgid= "$pid" | tr -d '[:space:]')
    pids=""
    if [[ -n "$pgrp" ]]; then
        pids=$(pgrep -g "$pgrp" | xargs)
    fi

    ocf_log info "Sending SIGTERM to PID: ${pid}"
    if ! /bin/kill -TERM "$pid" >/dev/null; then
        ocf_log warn "MySQL has failed to terminate gracefully"
    fi

    # stop waiting
    while [ "$shutdown_timeout" -gt 0 ]; do
        if ! mysql_status info 1; then
            # Main process is gone; reap whatever is left of its group.
            if [[ -n "$pgrp" ]]; then
                /usr/bin/pkill -9 -g "$pgrp" >/dev/null
            fi
            mysql_cleanup
            return $OCF_SUCCESS
        fi
        sleep 2
        shutdown_timeout=$(( shutdown_timeout - 2 ))
        ocf_log info "MySQL still hasn't stopped yet. ${shutdown_timeout} seconds left "
    done

    mysql_status info 1
    rc=$?
    if [ "$rc" -ne "$OCF_NOT_RUNNING" ]; then
        ocf_log info "MySQL failed to stop using SIGTERM. Sending SIGKILL to PIDS: ${pids}"
        if [[ -n "$pgrp" ]]; then
            /usr/bin/pkill -9 -g "$pgrp" >/dev/null
        fi
    fi

    mysql_cleanup
    return $OCF_SUCCESS
}

##########################################################################
# If DEBUG_LOG is set, make this resource agent easy to debug: set up the
# debug log and direct all output to it. Otherwise, redirect to /dev/null.
# The log directory must be a directory owned by root, with permissions 0700,
# and the log must be writable and not a symlink.
##########################################################################

# Debug logging is enabled only when the log file already exists, is
# writable, and is not a symlink. In that case stdout and stderr of this
# script are appended to it and command tracing is switched on.
DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log"
if [[ -n "${DEBUG_LOG}" && -w "${DEBUG_LOG}" && ! -L "${DEBUG_LOG}" ]]; then
    DEBUG_LOG_DIR="${DEBUG_LOG%/*}"
    if [[ -d "${DEBUG_LOG_DIR}" ]]; then
        exec 9>>"$DEBUG_LOG"
        exec 1>&9 2>&9
        date '+%Y%m%d %H:%M:%S' >&9
        echo "$*" >&9
        env | grep OCF_ | sort >&9
        set -x
    else
        exec 9>/dev/null
    fi
fi

# Actions that must work even when the installation does not validate.
case "$1" in
    meta-data)
        meta_data
        exit "$OCF_SUCCESS"
        ;;
    usage|help)
        usage
        exit "$OCF_SUCCESS"
        ;;
esac

mysql_validate
rc=$?
if [ "$rc" -ne 0 ]; then
    # On a broken installation, stop reports success and monitor reports
    # "not running"; every other action returns the validation error.
    case "$1" in
        stop)    exit "$OCF_SUCCESS" ;;
        monitor) exit "$OCF_NOT_RUNNING" ;;
        *)       exit "$rc" ;;
    esac
fi

# What kind of method was invoked?
case "$1" in
    start)
        mysql_start
        ;;
    stop)
        mysql_stop
        ;;
    monitor)
        mysql_monitor
        ;;
    validate-all)
        exit "$OCF_SUCCESS"
        ;;
    *)
        usage
        exit "$OCF_ERR_UNIMPLEMENTED"
        ;;
esac
# Hand the status of the action that just ran back to the cluster manager.
exit $?

# vim: set ts=4 sw=4 tw=0 et :