From 990e6b83f278b0a9943719f20a5cc13dd342e676 Mon Sep 17 00:00:00 2001
From: Felipe Reyes
Date: Mon, 2 Mar 2015 17:52:44 -0300
Subject: [PATCH 01/22] Add percona ocf

---
 hooks/percona_hooks.py |   25 +-
 hooks/percona_utils.py |   12 +
 ocf/percona/mysql      | 2193 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 2228 insertions(+), 2 deletions(-)
 create mode 100755 ocf/percona/mysql

diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py
index 212f2f4..d269a7c 100755
--- a/hooks/percona_hooks.py
+++ b/hooks/percona_hooks.py
@@ -50,6 +50,7 @@ from percona_utils import (
     assert_charm_supports_ipv6,
     unit_sorted,
     get_db_helper,
+    install_mysql_ocf,
 )
 from charmhelpers.contrib.database.mysql import (
     PerconaClusterHelper,
@@ -72,6 +73,20 @@ from charmhelpers.contrib.network.ip import (
 
 hooks = Hooks()
 LEADER_RES = 'grp_percona_cluster'
+RES_MYSQL_PARAMS = ('params config="/etc/my.cnf" '
+                    'pid="/var/lib/mysql/mysqld.pid" '
+                    'socket="/var/run/mysqld/mysqld.sock" '
+                    'replication_user="sstuser" '
+                    'replication_passwd="%(sstpsswd)s" '
+                    'max_slave_lag="60" '
+                    'evict_outdated_slaves="false" '
+                    'binary="/usr/libexec/mysqld" '
+                    'op monitor interval="5s" role="Master" '
+                    'OCF_CHECK_LEVEL="1" '
+                    'op monitor interval="2s" role="Slave" '
+                    'OCF_CHECK_LEVEL="1" '
+                    'op start interval="0" timeout="60s" '
+                    'op stop interval="0" timeout="60s" ')
 
 
 @hooks.hook('install')
@@ -167,6 +182,8 @@ def cluster_joined(relation_id=None):
         relation_set(relation_id=relation_id,
                      relation_settings=relation_settings)
 
+    install_mysql_ocf()
+
 
 @hooks.hook('cluster-relation-departed')
 @hooks.hook('cluster-relation-changed')
@@ -387,8 +404,12 @@ def ha_relation_joined():
     vip_params = 'params ip="%s" cidr_netmask="%s" nic="%s"' % \
         (vip, vip_cidr, vip_iface)
 
-    resources = {'res_mysql_vip': res_mysql_vip}
-    resource_params = {'res_mysql_vip': vip_params}
+    resources = {'res_mysql_vip': res_mysql_vip,
+                 'res_mysql': 'ocf:percona:mysql'}
+    db_helper = get_db_helper()
+    sstpsswd = db_helper.get_mysql_password(username='sstuser')
+    resource_params = {'res_mysql_vip': vip_params,
+                       'res_mysql': RES_MYSQL_PARAMS % {'sstpsswd': sstpsswd}}
     groups = {'grp_percona_cluster': 'res_mysql_vip'}
 
     for rel_id in relation_ids('ha'):
diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py
index f7c92bf..70f951c 100644
--- a/hooks/percona_utils.py
+++ b/hooks/percona_utils.py
@@ -4,6 +4,7 @@ from subprocess import Popen, PIPE
 import socket
 import tempfile
 import os
+import shutil
 from charmhelpers.core.host import (
     lsb_release
 )
@@ -229,3 +230,14 @@ def unit_sorted(units):
     """Return a sorted list of unit names."""
     return sorted(
         units, lambda a, b: cmp(int(a.split('/')[-1]), int(b.split('/')[-1])))
+
+
+def install_mysql_ocf():
+    """Install the bundled mysql OCF agent if not already present."""
+    dest_file = '/usr/lib/ocf/resource.d/percona/mysql'
+    src_file = 'ocf/percona/mysql'
+
+    if not os.path.isdir(os.path.dirname(dest_file)):
+        os.makedirs(os.path.dirname(dest_file))
+    if not os.path.exists(dest_file):
+        shutil.copy(src_file, dest_file)
diff --git a/ocf/percona/mysql b/ocf/percona/mysql
new file mode 100755
index 0000000..9ef84de
--- /dev/null
+++ b/ocf/percona/mysql
@@ -0,0 +1,2193 @@
+#!/bin/bash
+#
+#
+# MySQL
+#
+# Description:  Manages a MySQL database as Linux-HA resource
+#
+# Authors:      Alan Robertson:               DB2 Script
+#               Jakub Janczak:                rewrite as MySQL
+#               Andrew Beekhof:               cleanup and import
+#               Sebastian Reitenbach:         add OpenBSD defaults, more cleanup
+#               Narayan Newton:               add Gentoo/Debian defaults
+#               Marian Marinov, Florian Haas: add replication capability
+#               Yves Trudeau, Baron Schwartz: add VIP support and improve replication
+#               Jervin Real, Kenny Gryp:      Booth Compatibility Improvements
+#
+# Support:      linux-ha@lists.linux-ha.org
+# License:      GNU General Public License (GPL)
+#
+# (c) 2002-2005 International Business Machines, Inc.
+#               2005-2010 Linux-HA contributors
+#
+# An example usage in /etc/ha.d/haresources:
+#       node1  10.0.0.170 mysql
+#
+# Version: 20141112131457
+#
+# See usage() function below for more details...
+#
+# OCF instance parameters:
+#   OCF_RESKEY_binary
+#   OCF_RESKEY_client_binary
+#   OCF_RESKEY_config
+#   OCF_RESKEY_datadir
+#   OCF_RESKEY_user
+#   OCF_RESKEY_group
+#   OCF_RESKEY_test_table
+#   OCF_RESKEY_test_user
+#   OCF_RESKEY_test_passwd
+#   OCF_RESKEY_enable_creation
+#   OCF_RESKEY_additional_parameters
+#   OCF_RESKEY_log
+#   OCF_RESKEY_pid
+#   OCF_RESKEY_socket
+#   OCF_RESKEY_replication_user
+#   OCF_RESKEY_replication_passwd
+#   OCF_RESKEY_replication_port
+#   OCF_RESKEY_replication_options
+#   OCF_RESKEY_max_slave_lag
+#   OCF_RESKEY_evict_outdated_slaves
+#   OCF_RESKEY_reader_attribute
+#   OCF_RESKEY_reader_failcount
+#   OCF_RESKEY_backup_lockfile
+#   OCF_RESKEY_geo_remote_IP
+#   OCF_RESKEY_booth_master_ticket
+#   OCF_RESKEY_post_promote_script
+#   OCF_RESKEY_prm_binlog_parser_path
+#   OCF_RESKEY_try_restart_crashed_master
+
+#######################################################################
+# Initialization:
+
+: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat}
+. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs
+
+#######################################################################
+
+# Fill in some defaults if no values are specified
+HOSTOS=`uname`
+if [ "X${HOSTOS}" = "XOpenBSD" ];then
+    OCF_RESKEY_binary_default="/usr/local/bin/mysqld_safe"
+    OCF_RESKEY_config_default="/etc/my.cnf"
+    OCF_RESKEY_datadir_default="/var/mysql"
+    OCF_RESKEY_user_default="_mysql"
+    OCF_RESKEY_group_default="_mysql"
+    OCF_RESKEY_log_default="/var/log/mysqld.log"
+    OCF_RESKEY_pid_default="/var/mysql/mysqld.pid"
+    OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock"
+else
+    OCF_RESKEY_binary_default="/usr/bin/safe_mysqld"
+    OCF_RESKEY_config_default="/etc/my.cnf"
+    OCF_RESKEY_datadir_default="/var/lib/mysql"
+    OCF_RESKEY_user_default="mysql"
+    OCF_RESKEY_group_default="mysql"
+    OCF_RESKEY_log_default="/var/log/mysqld.log"
+    OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid"
+    OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock"
+fi
+OCF_RESKEY_client_binary_default="mysql"
+OCF_RESKEY_test_user_default="root"
+OCF_RESKEY_test_table_default="mysql.user"
+OCF_RESKEY_test_passwd_default=""
+OCF_RESKEY_enable_creation_default=0
+OCF_RESKEY_additional_parameters_default=""
+OCF_RESKEY_replication_port_default="3306"
+OCF_RESKEY_max_slave_lag_default="3600"
+OCF_RESKEY_evict_outdated_slaves_default="false"
+OCF_RESKEY_reader_attribute_default="readable"
+OCF_RESKEY_reader_failcount_default="1"
+OCF_RESKEY_backup_lockfile_default="/var/lock/innobackupex"
+OCF_RESKEY_booth_master_ticket_default="ticketMaster"
+OCF_RESKEY_async_stop_default=0
+OCF_RESKEY_try_restart_crashed_master_default=1
+
+: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}}
+MYSQL_SBINDIR=`dirname ${OCF_RESKEY_binary}`
+
+: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}}
+
+: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}}
+: ${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}}
+
+: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}}
+: ${OCF_RESKEY_group=${OCF_RESKEY_group_default}}
+
+: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}}
+: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}}
+: ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}}
+
+: ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}}
+: ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}}
+: ${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}}
+
+: ${OCF_RESKEY_enable_creation=${OCF_RESKEY_enable_creation_default}}
+: ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}}
+
+: ${OCF_RESKEY_replication_user=${OCF_RESKEY_replication_user_default}}
+: ${OCF_RESKEY_replication_passwd=${OCF_RESKEY_replication_passwd_default}}
+: ${OCF_RESKEY_replication_port=${OCF_RESKEY_replication_port_default}}
+: ${OCF_RESKEY_replication_options=${OCF_RESKEY_replication_options_default}}
+
+: ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}}
+: ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}}
+
+: ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}}
+: ${OCF_RESKEY_reader_failcount=${OCF_RESKEY_reader_failcount_default}}
+
+: ${OCF_RESKEY_backup_lockfile=${OCF_RESKEY_backup_lockfile_default}}
+
+: ${OCF_RESKEY_geo_remote_IP=""}  # "=" must sit inside the braces to assign a default when unset
+: ${OCF_RESKEY_booth_master_ticket=${OCF_RESKEY_booth_master_ticket_default}}
+: ${OCF_RESKEY_post_promote_script=""}
+: ${OCF_RESKEY_prm_binlog_parser_path=$(which prm_binlog_parser 2> /dev/null)}
+
+: ${OCF_RESKEY_async_stop=${OCF_RESKEY_async_stop_default}}
+: ${OCF_RESKEY_try_restart_crashed_master=${OCF_RESKEY_try_restart_crashed_master_default}}
+
+#######################################################################
+# Convenience variables
+
+MYSQL=$OCF_RESKEY_client_binary
+MYSQL_BINDIR=`dirname ${OCF_RESKEY_client_binary}`
+
+MYSQL_OPTIONS_LOCAL="-A -S $OCF_RESKEY_socket --connect_timeout=10"
+MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd"
+MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" +MYSQL_LAST_ERR=0 +MYSQL_TOO_MANY_CONN_ERR=1040 + +CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot " +HOSTNAME=`uname -n` +CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME -q" +INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` +CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication -q " +CRM_ATTR_REPL_STATUS="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_STATUS -s mysql_replication -q " +CRM_ATTR_LAST_TRX="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_NEW_MASTER_LAST_TRX -s mysql_replication -q" +CRM_ATTR_MASTER_CRASHED_TS="$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_master_crashed" +CRM_ATTR_NODE_LAST_TRX_MD5="$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_trx_md5" +CRM_RES="${HA_SBIN_DIR}/crm_resource" +CRM_TICKET="${HA_SBIN_DIR}/crm_ticket" +SSH="/usr/bin/ssh " +MAX_BINLOG_SIZE_CACHE="${HA_RSCTMP}/max_binlog_size_cache" +ASYNC_STOP_WITNESS_FILE="${HA_RSCTMP}/stop_${INSTANCE_ATTR_NAME}" +OCF_STOPPING=100 #custom error code for async_stop + +####################################################################### + +usage() { + cat < + + +1.0 + + +Resource script for MySQL. +May manage a standalone MySQL database, a clone set with externally +managed replication, or a complete master/slave replication setup. + +While managing replication, the default behavior is to use uname -n +values in the change master to command. Other IPs can be specified +manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP +giving the IP to use for replication. For example, if the mysql primitive +you are using is p_mysql, the attribute to set will be +p_mysql_mysql_master_IP. 
+ +Manages a MySQL database instance + + + + +Location of the MySQL server binary + +MySQL server binary + + + + + +Location of the MySQL client binary + +MySQL client binary + + + + + +Configuration file + +MySQL config + + + + + +Directory containing databases + +MySQL datadir + + + + + +User running MySQL daemon + +MySQL user + + + + + +Group running MySQL daemon (for logfile and directory permissions) + +MySQL group + + + + + +The logfile to be used for mysqld. + +MySQL log file + + + + + +The pidfile to be used for mysqld. + +MySQL pid file + + + + + +The socket to be used for mysqld. + +MySQL socket + + + + + +Table to be tested in monitor statement (in database.table notation) + +MySQL test table + + + + + +MySQL test user, must have select privilege on test_table + +MySQL test user + + + + + +MySQL test user password + +MySQL test user password + + + + + +If the MySQL database does not exist, it will be created + +Create the database if it does not exist + + + + + +Additional parameters which are passed to the mysqld on startup. +(e.g. --skip-external-locking or --skip-grant-tables) + +Additional parameters to pass to mysqld + + + + + +MySQL replication user. This user is used for starting and stopping +MySQL replication, for setting and resetting the master host, and for +setting and unsetting read-only mode. Because of that, this user must +have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, and PROCESS +privileges on all nodes within the cluster. Mandatory if you define +a master-slave resource. + +MySQL replication user + + + + + +MySQL replication password. Used for replication client and slave. +Mandatory if you define a master-slave resource. + +MySQL replication user password + + + + + +The port on which the Master MySQL instance is listening. + +MySQL replication port + + + + + +Extra options to pass to CHANGE MASTER, be sure to pass a preceeding comma. 
Handy for SSL, for example: +replication_options=", MASTER_SSL=1, MASTER_SSL_CA='/path/to/ca.crt'" + +MySQL replication options + + + + + +The maximum number of seconds a replication slave is allowed to lag +behind its master. Do not set this to zero. What the cluster manager +does in case a slave exceeds this maximum lag is determined by the +evict_outdated_slaves parameter. + +Maximum time (seconds) a MySQL slave is allowed +to lag behind a master + + + + + +If set to true, any slave which is more than max_slave_lag seconds +behind the master has its MySQL instance shut down. If this parameter +is set to false in a primitive or clone resource, it is simply +ignored. If set to false in a master/slave resource, then exceeding +the maximum slave lag will merely push down the master preference so +the lagging slave is never promoted to the new master. + +Determines whether to shut down badly lagging +slaves + + + + + +An attribute that the RA can manage to specify whether a node +can be read from. This node attribute will be 1 if it's fine to +read from the node, and 0 otherwise (for example, when a slave +has lagged too far behind the master). + +A typical example for the use of this attribute would be to tie +a set of IP addresses to MySQL slaves that can be read from. + +This parameter is only meaningful in master/slave set configurations. + +Sets the node attribute that determines +whether a node is usable for clients to read from. + + + + +The number of times a monitor operation can find the slave +to be unsuitable for reader VIP before failing. Useful if +there are short intermittent issues like clock adjustments in VMs. + +Allowed failcount for reader + + + + + +The path to a file that will be exclusively locked by any backup +process. The lockfile serves to provide a reliable way of determining +whether to restart the slave process or not. If a thirdparty process +locks this file, the agent will fail to lock the file and will not +start the slave. 
When this agent is able to lock the file, it is +assumed that backups are finished and the slave thread should be +running and will start it. + +A typical cron command example would be like: +flock -xn /var/lock/innobackupex innobackupex --safe-slave-backup /tmp/mysqlbackup + +This example will use innobackupex's ability to stop the slave when necessary +to ensure backup consistency. During this time the agent will not start the +slave. Once the backup is complete, the lock will automatically expire and +the agent can start the slave if it isn't already. + +Path to backup lockfile + + + + + +In case multiple Geo redundant sites are connected with the booth protocol +this is the IP to use to connect to the remote cluster to query replication info. +Normally this would be the writer VIP on the remote cluster. Also, ssh is used +for communication so make sure keys are exchanged and that ssh options are set +in a way that connection doesn't take many seconds. If empty, the booth type +behavior is not triggered. + +IP of the remote cluster + + + + + +In case multiple Geo redundant sites are connected with the booth protocol +this is the name of the ticket used to identify the master side. + +Booth ticket name + + + + + +Allows to run custom code following a promotion. An application of this is to +prevent fail-back of the master role after an initial failover. + +Post promote script + + + + + +Path to the prm_binlog_parser tool that is used to publish the last trx of a new +master after a hard crash of the previous master. The tool can be downloaded from +https://github.com/percona/percona-pacemaker-agents/tree/master/tools/ybinlogp + +Path to the prm_binlog_parser tool + + + + + +If set to true, PRM will not wait for MySQL to stop after sending the +SIGTERM signal. This can be useful to speed up failover when a server has a +large number of dirty pages and takes a long time to shutdown, or worse, receives +a SIGKILL after the stop timeout. 
The main drawback is that if PRM wants to restart +MySQL before it completed its shutdown, the operation will error out. + +Asynchronous stop of MySQL + + + + + +If set to true, PRM will try to restart a failed master in place instead of promoting +another node. This can help recover untransmitted binary logs. However, if you have +a large database that takes a long time to recovery, this may not be a good option +for you. + +Try restarting a crashed master + + + + + + + + + + + + + + + + + + + +END + +} + +# Convenience functions + +set_read_only() { + # Sets or unsets read-only mode. Accepts one boolean as its + # optional argument. If invoked without any arguments, defaults to + # enabling read only mode. Should only be set in master/slave + # setups. + # Returns $OCF_SUCCESS if the operation succeeds, or + # $OCF_ERR_GENERIC if it fails. + local ro_val + if ocf_is_true $1; then + ro_val="on" + else + ro_val="off" + fi + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "SET GLOBAL read_only=${ro_val}" +} + +get_read_only() { + # Check if read-only is set + local read_only_state + + read_only_state=`mysql_run -Q -sw -O $MYSQL -N $MYSQL_OPTIONS_REPL \ + -e "SHOW VARIABLES like 'read_only'" | awk '{print $2}'` + + if [ "$read_only_state" = "ON" ]; then + return 0 + else + return 1 + fi +} + +is_slave() { + # Determine whether the machine is currently running as a MySQL + # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW + # SLAVE STATUS creates an empty result set, 0 otherwise. + local rc + local tmpfile + + # Check whether this machine should be slave + if ! ocf_is_ms || ! get_read_only; then + return 1 + fi + + get_slave_info + rc=$? + + if [ $rc -eq 0 ]; then + # show slave status is not empty + # Is there a master_log_file defined? 
(master_log_file is deleted + # by reset slave + if [ "$master_log_file" ]; then + return 0 + else + return 1 + fi + else + # "SHOW SLAVE STATUS" returns an empty set if instance is not a + # replication slave + return 1 + fi + +} + +parse_slave_info() { + # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 + sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 +} + +# get the current max_binlog_size. Since this value rarely change, we cache +# it for 1h +get_max_binlog_size() { + if [ -e $MAX_BINLOG_SIZE_CACHE ]; then + cat $MAX_BINLOG_SIZE_CACHE + if [ `date +%s` -gt "$((`stat -c %Z $MAX_BINLOG_SIZE_CACHE`+3600))" ]; then + rm $MAX_BINLOG_SIZE_CACHE + fi + else + mysql_run -Q -sw -O $MYSQL -N $MYSQL_OPTIONS_REPL \ + -e "Show global variables like 'max_binlog_size';" | \ + awk '{ print $2 }' > $MAX_BINLOG_SIZE_CACHE + cat $MAX_BINLOG_SIZE_CACHE + fi +} + +get_slave_info() { + + local mysql_options tmpfile + + if [ "$master_log_file" -a "$master_host" ]; then + # variables are already defined, get_slave_info has been run before + return $OCF_SUCCESS + else + tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW SLAVE STATUS\G' > $tmpfile + + if [ -s $tmpfile ]; then + master_host=`parse_slave_info Master_Host $tmpfile` + master_user=`parse_slave_info Master_User $tmpfile` + master_port=`parse_slave_info Master_Port $tmpfile` + master_log_file=`parse_slave_info Master_Log_File $tmpfile` + relay_log_file=`parse_slave_info Relay_Log_File $tmpfile` + master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` + slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` + slave_io=`parse_slave_info Slave_IO_Running $tmpfile` + slave_io_state=`parse_slave_info Slave_IO_State $tmpfile` + last_errno=`parse_slave_info Last_Errno $tmpfile` + secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` + ocf_log debug "MySQL instance has a non empty slave status" + else + # Instance 
produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + + ocf_log err "check_slave invoked on an instance that is not a replication slave." + rm -f $tmpfile + return $OCF_ERR_GENERIC + fi + rm -f $tmpfile + return $OCF_SUCCESS + fi +} + +check_slave() { + # Checks slave status + local rc new_master + + get_slave_info + rc=$? + + if [ $rc -eq 0 ]; then + # Did we receive an error other than max_connections? + if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then + # Whoa. Replication ran into an error. This slave has + # diverged from its master. Make sure this resource + # doesn't restart in place. + ocf_log err "MySQL instance configured for replication, but replication has failed." + + # Just pull the reader VIP away, killing MySQL here would be pretty evil + # on a loaded server + set_reader_attr 0 + + #Since replication is broken, not suitable to be a master + $CRM_MASTER -v -INF + + exit $OCF_SUCCESS + + fi + + # If we got max_connections, let's only remove the vip + if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then + set_reader_attr 0 + exit $OCF_SUCCESS + fi + + if [ "$slave_io" != 'Yes' ]; then + # Not necessarily a bad thing. The master may have + # temporarily shut down, and the slave may just be + # reconnecting. A warning can't hurt, though. + ocf_log warn "MySQL Slave IO threads currently not running." 
+ + # Sanity check, are we at least on the right master + if [ "$master_host" != "$glb_cib_master" ]; then + # Not pointing to the right master + + # Is this a recent master failover on the remote side + if [ "${#glb_remote_info}" -gt "0" -a "$slave_sql" = 'Yes' ]; then + # looks like, the sql thread is still running, no need + # to remove the vip, doing nothing + : + else + + set_reader_attr 0 + fi + + # try setting up the slave with the new master + set_master + exit $OCF_SUCCESS + + elif [ "$slave_sql" == 'Yes' ]; then + # If the slq thread is running, it is an issue with the io thread + # let's try to restart it + + if [ "$slave_io_state" != "" ]; then + # The io thread is running but is not connected, let's restart it. + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE IO_THREAD" + fi + + # At this point, the io_thread should be stopped. + # let's try to start it again. + + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE IO_THREAD" + + # We give some time to connect + sleep 2 + + get_slave_info + rc=$? + if [ $rc -eq 0 -a "$slave_io" == 'Yes' ]; then + ocf_log info "MySQL Slave IO thread started succesfully." + else + ocf_log warn "We could not start the MySQL Slave IO thread." + fi + fi + + + fi + + if [ "$slave_sql" != 'Yes' ]; then + # We don't have a replication SQL thread running. Not a + # good thing. Try to recoved by restarting the SQL thread + # and remove reader vip. Prevent MySQL restart. + ocf_log err "MySQL Slave SQL threads currently not running." + + # Remove reader vip + set_reader_attr 0 + + # If sql is not running, can't be a master + $CRM_MASTER -v -INF + + # Check that the flock tool exists first + if type flock &>/dev/null; then + ( + flock -xn 8 + if [ $? -eq 0 ]; then + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" + else + ocf_log info "Unable to lock $OCF_RESKEY_backup_lockfile. Not starting slave." 
+ fi + ) 8>$OCF_RESKEY_backup_lockfile + else + # try to restart slave + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" + fi + + # Return success to prevent a restart + exit $OCF_SUCCESS + fi + + if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then + # We're supposed to bail out if we lag too far + # behind. Let's check our lag. + if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + ocf_log err "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." + + # Remove reader vip + set_reader_attr 0 + exit $OCF_ERR_INSTALLED + fi + elif ocf_is_ms; then + # Even if we're not set to evict lagging slaves, we can + # still use the seconds behind master value to set our + # master preference. + local master_pref + master_pref=$((${OCF_RESKEY_max_slave_lag}-${secs_behind})) + if [ $master_pref -lt 0 ]; then + # Sanitize a below-zero preference to just zero + master_pref=0 + fi + + # Is the datadir almost full + if check_datadir_state; then + $CRM_MASTER -v $master_pref + else + # full so not good for a master + $CRM_MASTER -v -2147483640 + fi + fi + + # is the slave ok to have a VIP on it + test $secs_behind -eq 0 2>/dev/null + if [ $? 
-eq 2 ]; then + set_reader_attr 0 + else + if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + set_reader_attr 0 + else + set_reader_attr 1 + + #Edge case verification, check if on the right master + set_master nologging + fi + fi + + ocf_log debug "MySQL instance running as a replication slave" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # instance is not a slave + # TODO: Needs to handle when get_slave_info will return too many connections error + + if [ $MYSQL_LAST_ERR -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then + # Remove the vip + set_reader_attr 0 + return $OCF_SUCCESS + fi + + # An empty status could happen when a master is demote in a + # geo DR setup, let's check + if [ $MYSQL_LAST_ERR -eq 0 -a $glb_master_exists -eq 1 ]; then + # This is not the master side, let's try to setup the slave + # No need to unset the master since slave status is empty + set_reader_attr 0 + set_master + return $OCF_SUCCESS + fi + + ocf_log err "check_slave invoked on an instance that is not a replication slave." 
+ exit $OCF_ERR_GENERIC + fi +} + +set_master() { + local new_master master_log_file master_log_pos new_master_info + local master_params new_master_log_file new_master_log_pos + + if [ "$glb_master_exists" ]; then + if [ "${#glb_remote_info}" -gt "0" ]; then + # geo_remote_IP is defined, let's do the booth part + + if [ $glb_master_side -ne 0 ]; then + # this is _not_ the side with the token + new_master_info=`echo $glb_remote_info | awk '{ print $1 }'` + new_master=`echo $new_master_info | cut -d'|' -f1` + new_master_log_file=`echo $new_master_info | cut -d'|' -f2` + new_master_log_pos=`echo $new_master_info | cut -d'|' -f3` + fi + fi + + if [ "${#new_master_info}" -eq "0" ]; then + new_master=`echo $glb_local_info | cut -d'|' -f1` + new_master_log_file=`echo $glb_local_info | cut -d'|' -f2` + new_master_log_pos=`echo $glb_local_info | cut -d'|' -f3` + fi + + # Keep replication position + get_slave_info + + if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then + # master_params=", MASTER_LOG_FILE='$master_log_file', \ + # MASTER_LOG_POS=$master_log_pos" + if [ "$1" = "nologging" ]; then + : + else + ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" + fi + return + else + if [ -n "$new_master_log_file" -a -n "$new_master_log_pos" ]; then + master_params=", MASTER_LOG_FILE='$new_master_log_file', \ + MASTER_LOG_POS=$new_master_log_pos" + ocf_log info "Restored master pos for $new_master : $new_master_log_file:$new_master_log_pos" + fi + fi + + # Informs the MySQL server of the master to replicate + # from. Accepts one mandatory argument which must contain the host + # name of the new master host. The master must either be unchanged + # from the laste master the slave replicated from, or freshly + # reset with RESET MASTER. 
+ + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE;CHANGE MASTER TO MASTER_HOST='$new_master', \ + MASTER_PORT=$OCF_RESKEY_replication_port, \ + MASTER_USER='$OCF_RESKEY_replication_user', \ + MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' \ + $OCF_RESKEY_replication_options $master_params;START SLAVE;" + fi +} + +unset_master(){ + # Instructs the MySQL server to stop replicating from a master + # host. + + # If we're currently not configured to be replicating from any + # host, then there's nothing to do. But we do log a warning as + # no-one but the CRM should be touching the MySQL master/slave + # configuration. + + is_slave + rc=$? + if [ $rc -ne 0 ]; then + ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" + return $OCF_SUCCESS + fi + + local tmpfile + tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + # At this point, the master is read only so there should not be much binlogs to transfer + # Let's wait for the last bits + while true; do + get_slave_info + rc=$? + + # Is the slave_io thread running? + if [ "$slave_io" != 'Yes' ]; then + ocf_log info "Slave IO thread not running, master likely dead or stopped" + break; + fi + + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + + if grep -i 'Master has sent all binlog to slave' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished reading master binary log" + break + fi + if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished reading master binary log" + break + fi + if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if grep -i 'Connecting to master' $tmpfile >/dev/null; then + ocf_log info "Master is down, no more binary logs to come" + break + fi + if ! 
grep 'system user' $tmpfile >/dev/null; then + ocf_log info "Slave is not running - not waiting to finish" + break + fi + + sleep 1 + done + + # Now, stop the slave I/O thread and wait for relay log + # processing to complete + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE IO_THREAD" + if [ $? -gt 0 ]; then + ocf_log err "Error stopping slave IO thread" + rm -f $tmpfile + exit $OCF_ERR_GENERIC + fi + + while true; do + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW PROCESSLIST\G' > $tmpfile + + # Of course, slave messages changed over MySQL versions... + if grep -i 'Has read all relay log' $tmpfile >/dev/null; then + ocf_log info "MySQL slave has finished processing relay log" + break + fi + if ! grep -q 'system user' $tmpfile; then + ocf_log info "Slave not runnig - not waiting to finish" + break + fi + ocf_log info "Waiting for MySQL slave to finish processing relay log" + sleep 1 + done + rm -f $tmpfile + + # Now, stop all slave activity and unset the master host + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "STOP SLAVE" + if [ $? -gt 0 ]; then + ocf_log err "Error stopping rest slave threads" + exit $OCF_ERR_GENERIC + fi + + # a last get_slave_info to save the status in variables may be needed + # by pre-promote notification + unset master_host # need to unset for get_slave_info to run + get_slave_info + + if [ "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then + + # First, where are the relay logs? That will be easier when the using_multi_config + # branch will be merged. If the path is not defined, the output will be "." + relaylog_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^relay-log ' | awk '{ print $2 }'` + relaylog_path=`dirname $relaylog_path` + + if [ "$relaylog_path" == "." 
]; then + relaylog_path=$OCF_RESKEY_datadir + fi + + last_relaylog_file="$relay_log_file" + + #ok now we need to find the md5 of the last trx + last_trx_md5=`$OCF_RESKEY_prm_binlog_parser_path ${relaylog_path}/${last_relaylog_file} | tail -n 1 | cut -d',' -f2` + + if [ ! -z "$last_trx_md5" ]; then + $CRM_ATTR_NODE_LAST_TRX_MD5 -v $last_trx_md5 + fi + fi + + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "RESET SLAVE /*!50516 ALL */;" + if [ $? -gt 0 ]; then + ocf_log err "Failed to reset slave" + exit $OCF_ERR_GENERIC + fi +} + +# Start replication as slave +start_slave() { + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "START SLAVE" +} + +# Set the attribute controlling the readers VIP +set_reader_attr() { + local curr_attr_value + + curr_attr_value=$(get_reader_attr) + + if [ "$1" -eq "0" ]; then + if [ "$curr_attr_value" -gt "0" ]; then + curr_attr_value=$((${curr_attr_value}-1)) + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $curr_attr_value + else + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v 0 + fi + else + if [ "$curr_attr_value" -ne "$OCF_RESKEY_reader_failcount" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $OCF_RESKEY_reader_failcount + fi + fi + +} + +is_master_side() { + #Returns true (0) if this cluster has a grant for the booth ticket OCF_RESKEY_booth_master_ticket + local ticket crmTicketRet + + if [ "${#OCF_RESKEY_geo_remote_IP}" -gt "0" ]; then + #Try the new format + crmTicketRet=`file $CRM_TICKET | grep -c 'Bourne-Again shell script'` + if [ "$crmTicketRet" -eq "1" ]; then + # got an error, we assume the old format + ticket=`$CRM_TICKET -t $OCF_RESKEY_booth_master_ticket -Q | grep -c 'true'` + else + ticket=`$CRM_TICKET --info | grep $OCF_RESKEY_booth_master_ticket | awk '{ print $2 }' | grep -c granted` + fi + + if [ "$ticket" -eq "1" ]; then + return 0 + else + return 1 + fi + else + return 0 + fi +} + +# get the attribute controlling the readers VIP +get_reader_attr() { + 
local attr_value + local rc + + attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` + rc=$? + if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi + +} + +# Stores data for MASTER STATUS from MySQL +update_data_master_status() { + + master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" + + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file +} + + +# Returns the specified value from the stored copy of SHOW MASTER STATUS. +# should be call after update_data_master_status for tmpfile +# Arguments: +# $1 The value to get. +get_master_status() { + awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" +} + +# Determines what IP address is attached to the current host. The output of the +# crm_attribute command looks like this: +# scope=nodes name=IP value=10.2.2.161 +# If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n +# The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the +# change master to command. +get_local_ip() { + local IP + IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` + if [ ! $? -eq 0 ]; then + uname -n + else + echo $IP + fi +} + +# Determine if the datadir is full or almost full, the threshold is 97% +check_datadir_state() { + # Get the free space of the binlogdir + FREE_SPC_PCT=`/bin/df $OCF_RESKEY_datadir | /bin/grep -v Filesystem \ + | /bin/sed -e 's/ \+/ /g' | /usr/bin/cut -d' ' -f 5 \ + | /usr/bin/tr -d '%'` + + if [ "$FREE_SPC_PCT" -ge "97" ]; then + ocf_log warn "Partition $OCF_RESKEY_datadir usage is at " \ + "or more than 97, " \ + "unsuitable for master..." 
+ return 1 + fi + + return 0 +} + +####################################################################### + +# Functions invoked by resource manager actions + +mysql_validate() { + check_binary $OCF_RESKEY_binary + check_binary $OCF_RESKEY_client_binary + + if [ ! -f $OCF_RESKEY_config ]; then + ocf_log err "Config $OCF_RESKEY_config doesn't exist"; + return $OCF_ERR_INSTALLED; + fi + + if [ ! -d $OCF_RESKEY_datadir ]; then + ocf_log err "Datadir $OCF_RESKEY_datadir doesn't exist"; + return $OCF_ERR_INSTALLED; + fi + + getent passwd $OCF_RESKEY_user >/dev/null 2>&1 + if [ ! $? -eq 0 ]; then + ocf_log err "User $OCF_RESKEY_user doesn't exit"; + return $OCF_ERR_INSTALLED; + fi + + getent group $OCF_RESKEY_group >/dev/null 2>&1 + if [ ! $? -eq 0 ]; then + ocf_log err "Group $OCF_RESKEY_group doesn't exist"; + return $OCF_ERR_INSTALLED; + fi + + true +} + +# Return the status of mysqld +# $1 the loglevel to use (mandatory) +# $2 Override async_stop if 1, default to 0 +mysql_status() { + local last_restart_ts + local kill_exit_code + local witness_pid + local override_async_stop + + override_async_stop=0 + + if [ -n $2 ]; then + override_async_stop=$2 + fi + + if [ ! -e $OCF_RESKEY_pid ]; then + ocf_log $1 "MySQL is not running" + + if [ -e $ASYNC_STOP_WITNESS_FILE ]; then + # MySQL is stopped and there's a witness file, cleanup + rm -f $ASYNC_STOP_WITNESS_FILE + fi + return $OCF_NOT_RUNNING; + fi + + pid=`cat $OCF_RESKEY_pid`; + if [ -d /proc -a -d /proc/1 ]; then + [ "u$pid" != "u" -a -d /proc/$pid ] + else + kill -s 0 $pid >/dev/null 2>&1 + fi + + kill_exit_code=$? 
+ + if [ "$OCF_RESKEY_async_stop" -eq "1" -a \ + -e $ASYNC_STOP_WITNESS_FILE ]; then + + # Async stop seems to be in progress + witness_pid=`cat $ASYNC_STOP_WITNESS_FILE | grep pid | cut -d':' -f2` + + if [ $kill_exit_code -eq 0 -a $witness_pid -eq $pid ]; then + + # Should we lie or tell the truth + if [ "$override_async_stop" -eq "0" ]; then + # we lie + # still running but because of async, we report stopped + return $OCF_NOT_RUNNING + else + # we tell the truth + return $OCF_STOPPING #custom error code + fi + else + # That shouldn't happen execpt if SIGKILL, cleanup + rm -f $ASYNC_STOP_WITNESS_FILE + fi + fi + + if [ $kill_exit_code -eq 0 ]; then + return $OCF_SUCCESS; + else + ocf_log $1 "MySQL not running: removing old PID file" + rm -f $OCF_RESKEY_pid + + # This is abnormal, is this host the master defined in the cib? + # Also confirm it succeed in starting with the socket file + if [ "$glb_master_exists" -eq "1" -a "$glb_cib_master" = $(get_local_ip) \ + -a -e "$OCF_RESKEY_socket" ]; then + + #This is a crashed master + if [ "$OCF_RESKEY_try_restart_crashed_master" -eq "1" ]; then + # This is the master, let's give it a change to restart + # that will allow the slaves a better chance to sync but we + # need to avoid letting it restart forever. Has it tried to + # restart within the last hour + last_crash_ts=`$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash --query -q` + + if [[ ! -z $last_crash_ts ]]; then + if [ `date +%s` -lt "$((${last_crash_ts}+3600))" ]; then + # too soon, multiple crash, let's error out + return $OCF_NOT_RUNNING; + fi + fi + + $CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash -v `date +%s` + + mysql_start_low + rc=$? 
+ + if [ "$rc" -eq "0" ]; then + set_read_only OFF + fi + + return $rc + else + $CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash -v `date +%s` + # OCF_ERR_ARGS is a hard error, won't wait for restart + return $OCF_ERR_ARGS + fi + fi + return $OCF_NOT_RUNNING + fi +} + +mysql_monitor() { + local rc + local status_loglevel="err" + local master_resource + local master_status_attr + local new_master_status_attr + + : ${OCF_RESKEY_CRM_meta_interval=0} + + # Set loglevel to info during probe + if ocf_is_probe; then + status_loglevel="info" + fi + + mysql_status $status_loglevel + + rc=$? + + # TODO: check max connections error + + # If status returned an error, return that immediately + if [ $rc -ne $OCF_SUCCESS ]; then + return $rc + fi + + + if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then + # Check if this instance is configured as a slave, and if so + # check slave status + + # Are we currently having a master? + if [ "$glb_master_exists" -ne "0" ]; then + is_slave + rc=$? + if [ $rc -eq 0 -o "$OCF_RESKEY_CRM_meta_role" = "Slave" ]; then + check_slave + else + update_data_master_status + master_status_attr=`$CRM_ATTR_REPL_STATUS --query -q` + new_master_status_attr="$(get_master_status File)|$(get_master_status Position)|$(get_max_binlog_size)" + rm -f $master_status_file + if [ "$master_status_attr" != "$new_master_status_attr" ]; then + # Doing in bg, no need to wait and that can hang if a node is lost at the same time + $CRM_ATTR_REPL_STATUS -v "$new_master_status_attr" & + fi + + # Is this following a recent master crash? + master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` + + if [ ! -z $master_crashed_ts ]; then + if [ `date +%s` -gt "$((${master_crashed_ts}+3600))" ]; then + #Let's cleanup the cib + $CRM_ATTR_MASTER_CRASHED_TS -D + $CRM_ATTR_LAST_TRX -D + fi + fi + fi + else + is_slave + rc=$? + # Need to cover for crashed master... no unset_master... or always set + # last trx md5... 
+ if [ $rc -eq 0 -o "$OCF_RESKEY_CRM_meta_role" = "Slave" ]; then + unset_master + set_reader_attr 0 + fi + fi + + + # Check for test table + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_TEST \ + -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" + + + if [ $MYSQL_LAST_ERR -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then + if [ $MYSQL_LAST_ERR -ne 0 ]; then + ocf_log err "Failed to select from $OCF_RESKEY_test_table"; + return $OCF_ERR_GENERIC; + fi + else + ocf_log info "Master hit max_connections" + fi + fi + + if ocf_is_ms && ! get_read_only; then + ocf_log debug "MySQL monitor succeeded (master)"; + if [ "$OCF_RESKEY_CRM_meta_interval" -eq "0" ]; then + # this is a probe and this server is a master so need to set master_score + $CRM_MASTER -v 2147483647 + fi + if ! check_datadir_state; then + $CRM_MASTER -v -2147483640 + fi + return $OCF_RUNNING_MASTER + else + ocf_log debug "MySQL monitor succeeded"; + return $OCF_SUCCESS + fi +} + +# Start MySQL in the master-slave context +mysql_start() { + local current_status + + if ocf_is_ms; then + # Initialize the ReaderVIP attribute, monitor will enable it + set_reader_attr 0 + + # set master_score to 0 in case mysql crashes on startup + $CRM_MASTER -v 0 + fi + + mysql_status info 1 # Adding 2nd param here to get the true state in case + # async_stop is used + current_status=$? + if [ "$current_status" = "$OCF_SUCCESS" ]; then + ocf_log info "MySQL already running" + return $OCF_SUCCESS + fi + + # Is MySQL still stopping, OCF_STOPPING is a custom error code + if [ "$current_status" = "$OCF_STOPPING" ]; then + ocf_log err "MySQL asked to start while still stopping" + # TODO, wait for stop (or timeout). For now just a sleep + sleep 5 + return $OCF_ERR_GENERIC + fi + + mysql_start_low + rc=$? + + if [ $rc != $OCF_SUCCESS ]; then + ocf_log err "Wasn't able to start MySQL, stopping 'start'." + return $rc + fi + + if ocf_is_ms; then + # We're configured as a stateful resource. We must start as + # slave by default. 
At this point we don't know if the CRM has + # already promoted a master. So, we simply start in read only + # mode. Should already be from command line. + set_read_only on + + # Now, let's see whether there is a master. We might be a new + # node that is just joining the cluster, and the CRM may have + # promoted a master before. + + if [ "$glb_master_exists" -ne 0 -a "$glb_cib_master" != $(get_local_ip) ]; then + ocf_log info "Changing MySQL configuration to replicate from $master_host." + set_master + start_slave + if [ $? -ne 0 ]; then + ocf_log err "Failed to start slave" + return $OCF_ERR_GENERIC + fi + else + ocf_log info "No MySQL master present - clearing replication state" + unset_master + fi + + # We also need to set a master preference, otherwise Pacemaker + # won't ever promote us in the absence of any explicit + # preference set by the administrator. We choose a low + # greater-than-zero preference. + $CRM_MASTER -v 1 + + fi + + # Initial monitor action + if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" \ + -a -n "$OCF_RESKEY_test_passwd" ]; then + OCF_CHECK_LEVEL=10 + fi + mysql_monitor + rc=$? + if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then + ocf_log err "Failed initial monitor action" + return $rc + fi + + ocf_log info "MySQL started" + return $OCF_SUCCESS +} + +# low level MySQL start +mysql_start_low() { + touch $OCF_RESKEY_log + chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log + chmod 0640 $OCF_RESKEY_log + [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log + + if ocf_is_true "$OCF_RESKEY_enable_creation" && [ ! -d $OCF_RESKEY_datadir/mysql ] ; then + ocf_log info "Initializing MySQL database: " + $MYSQL_SBINDIR/mysql_install_db --datadir=$OCF_RESKEY_datadir + rc=$? + if [ $rc -ne 0 ] ; then + ocf_log err "Initialization failed: $rc"; + exit $OCF_ERR_GENERIC + fi + chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_datadir + fi + + pid_dir=`dirname $OCF_RESKEY_pid` + if [ ! 
-d $pid_dir ] ; then + ocf_log info "Creating PID dir: $pid_dir" + mkdir -p $pid_dir + chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir + fi + + socket_dir=`dirname $OCF_RESKEY_socket` + if [ ! -d $socket_dir ] ; then + ocf_log info "Creating socket dir: $socket_dir" + mkdir -p $socket_dir + chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir + fi + + # Regardless of whether we just created the directory or it + # already existed, check whether it is writable by the configured + # user + for dir in $pid_dir $socket_dir; do + # needed to wrap around su a bit, sssd causing issue + if [ `su - $OCF_RESKEY_user -s /bin/bash -c "if test -w $dir; then echo yes; else echo no; fi" 2> /dev/null` != "yes" ]; then + ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user" + exit $OCF_ERR_PERM; + fi + done + + # Uncomment to perform permission clensing + # - not convinced this should be enabled by default + # + #chmod 0755 $OCF_RESKEY_datadir + #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir + #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir + mysql_extra_params= + if ocf_is_ms; then + mysql_extra_params="$mysql_extra_params --skip-slave-start --read-only" + fi + + ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ + --pid-file=$OCF_RESKEY_pid \ + --socket=$OCF_RESKEY_socket \ + --datadir=$OCF_RESKEY_datadir \ + --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ + $mysql_extra_params >/dev/null 2>&1 & + rc=$? + + # we also get the process id from $! because the PID file is only + # created by mysql as soon as mysql is fully up and running + # for example, when recovery is busy, the pid file does not exist yet + # this part already creates the PID file as the mysql user + # so that other PRM checks know + # When recovery happens, the PID file does not exist yet. + process_pid=$! 
+ # mysql_status expects that if the pid is there and it's running + # that mysql is completely active + #su $OCF_RESKEY_user -c "echo '$process_pid' > $OCF_RESKEY_pid" + echo "$process_pid" > ${OCF_RESKEY_pid}.starting + + if [ $rc != 0 ]; then + ocf_log err "MySQL start command failed: $rc" + return $rc + fi + + # Spin waiting for the server to come up. + # Let the CRM/LRM time us out if required. + start_wait=1 + while [ $start_wait = 1 ]; do + mysql_status info + rc=$? + if [ $rc = $OCF_SUCCESS ]; then + start_wait=0 + + elif [ $rc != $OCF_NOT_RUNNING ]; then + ocf_log info "MySQL start failed: $rc" + return $rc + fi + + # if mysql died in the meantime, we shall not wait + # until the timeout is reached. + kill -s 0 $process_pid > /dev/null + mysqld_pid_status=$? + + if [ "$mysqld_pid_status" -ne "0" ]; then + ocf_log err "MySQL daemon died during start, giving up." + return $OCF_ERR_GENERIC + fi + + sleep 2 + done + + return $OCF_SUCCESS +} + +mysql_stop() { + + if ocf_is_ms; then + # clear preference for becoming master + $CRM_MASTER -D + + # Remove VIP capability + set_reader_attr 0 + fi + + # we rely only on ${OCF_RESKEY_pid}.starting + # as this certainly contains the file we need with the PID + if [ ! -f ${OCF_RESKEY_pid}.starting ]; then + ocf_log info "MySQL is not running" + return $OCF_SUCCESS + fi + + pid=`cat ${OCF_RESKEY_pid}.starting 2> /dev/null` + /bin/kill $pid > /dev/null + rc=$? 
+ if [ $rc != 0 ]; then + ocf_log err "MySQL couldn't be stopped" + return $OCF_ERR_GENERIC + fi + + if [ "$OCF_RESKEY_async_stop" -eq "1" ]; then + #Ok, MySQL is stopping and the async_stop option is set, just put the + #pid and a timestamp in the witness file and return + + echo "pid:$pid" > $ASYNC_STOP_WITNESS_FILE + echo "ts:`date +%s`" >> $ASYNC_STOP_WITNESS_FILE + + #Don't know yet why the ts, just seems useful for debugging for now + ocf_log info "MySQL async stopped"; + return $OCF_SUCCESS + fi + + # stop waiting + shutdown_timeout=15 + if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then + shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) + fi + + #Normally, pacemaker handles timeout but here we want to be able to do + #a SIGKILL (-9) before the timeout occurs. + count=0 + while [ $count -lt $shutdown_timeout ] + do + kill -s 0 $pid + rc=$? + if [ $rc -ne 0 ]; then + break + fi + count=`expr $count + 1` + sleep 1 + ocf_log debug "MySQL still hasn't stopped yet. Waiting..." + done + + kill -s 0 $pid + if [ $? -eq 0 ]; then + ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." + /bin/kill -KILL $pid > /dev/null + fi + + rm ${OCF_RESKEY_pid}.starting + ocf_log info "MySQL stopped"; + rm -f /var/lock/subsys/mysqld + rm -f $OCF_RESKEY_socket + return $OCF_SUCCESS +} + +mysql_promote() { + local master_info + local master_crashed_ts + local log_bin_path + local tmpfiletrx + local nb_trx + local last_binlog_number + + if ( ! mysql_status err ); then + return $OCF_NOT_RUNNING + fi + + unset_master + + # Set Master Info in CIB, cluster level attribute + update_data_master_status + master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)" + ${CRM_ATTR_REPL_INFO} -v "$master_info" + rm -f $master_status_file + + master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` + + if [ ! 
-z "$master_crashed_ts" -a "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then + if [ `date +%s` -lt "$((${master_crashed_ts}+3600))" ]; then + # Master crashed less than 1h ago, let's publish the last trx + + # First, where are the binlogs? That will be easier when the using_multi_config + # branch will be merged. If the path is not defined, the output will be "." + log_bin_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^log-bin ' | awk '{ print $2 }'` + log_bin_path=`dirname $log_bin_path` + + if [ "$log_bin_path" == "." ]; then + log_bin_path=$OCF_RESKEY_datadir + fi + + # Let's find the last binlog file + update_data_master_status + last_binlog_file="$(get_master_status File)" + + tmpfiletrx=`mktemp ${HA_RSCTMP}/trx.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + ( echo -n "${last_binlog_file}@";$OCF_RESKEY_prm_binlog_parser_path ${log_bin_path}/${last_binlog_file} | tail -n 3000 ) > $tmpfiletrx + + # Do we have 3000 trx? + nb_trx=`cat $tmpfiletrx | wc -l` + if [ "$nb_trx" -lt "3000" ]; then + # we have less than 3000, let's try the previous file + nb_trx=$((3000-$nb_trx)) #remaining + + last_binlog_number=`echo $last_binlog_file | cut -d'.' -f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` + last_binlog_number=$(($last_binlog_number-1)) + + # re-adding the 0 padding + while [ "${#last_binlog_number}" -lt "6" ]; do + last_binlog_number="0${last_binlog_number}" + done + + last_binlog_file="`echo $last_binlog_file | cut -d'.' 
-f1`.${last_binlog_number}" + if [ -e "${log_bin_path}/$last_binlog_file" ]; then + ( echo -n "@${last_binlog_file}@";$OCF_RESKEY_prm_binlog_parser_path ${log_bin_path}/${last_binlog_file} | tail -n $nb_trx ) >> $tmpfiletrx + fi + fi + + #now we load all that to the cib so that it reaches the other nodes + $CRM_ATTR_LAST_TRX -v "`cat $tmpfiletrx | tr '\n' '|'`" + rm -f $tmpfiletrx + + fi + fi + + set_read_only off || return $OCF_ERR_GENERIC + + # Existing master gets a higher-than-default master preference, so + # the cluster manager does not shuffle the master role around + # unnecessarily + $CRM_MASTER -v 2147483647 + + # A master can accept reads + set_reader_attr 1 + + if [ "${#OCF_RESKEY_post_promote_script}" -gt "0" -a \ + -x "${OCF_RESKEY_post_promote_script}" -a \ + ! -L "${OCF_RESKEY_post_promote_script}" ]; then + ${OCF_RESKEY_post_promote_script} + fi + + return $OCF_SUCCESS +} + +mysql_demote() { + if ! mysql_status err; then + $CRM_MASTER -v 0 + exit $OCF_SUCCESS + else + # Return master preference to default, so the cluster manager gets + # a chance to select a new master + $CRM_MASTER -v 1 + exit $OCF_SUCCESS + fi +} + +mysql_notify() { + local master_crashed_ts last_reported_master_file_number master_status_attr + local last_reported_master_file last_reported_master_pos master_max_binlog_size + local master_score notify_resource my_resource master_crashed_ts + local relaylog_path last_relaylog_file last_trx_md5 binlog_file binlog_pos + local strip_last_reported_master_file_number strip_master_log_file_number + + # If not configured as a Stateful resource, we make no sense of + # notifications. + if ! ocf_is_ms; then + ocf_log info "This agent makes no use of notifications unless running in master/slave mode." + return $OCF_SUCCESS + fi + + local type_op + type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" + + ocf_log debug "Received $type_op notification." 
+ + case "$type_op" in + 'pre-promote') + + # Master-score is normally calculated from slave-lag but it is better to use + # binlog offset position to pick the most up to date slave + master_status_attr=`$CRM_ATTR_REPL_STATUS --query -q` + + if [ $? -eq 0 ]; then + # There's a master status entry although we don't know if it is + # a valid one + last_reported_master_file=`echo $master_status_attr | cut -d'|' -f1` + last_reported_master_pos=`echo $master_status_attr | cut -d'|' -f2` + master_max_binlog_size=`echo $master_status_attr | cut -d'|' -f3` + + get_slave_info + if [ $? -eq "$OCF_SUCCESS" ]; then + # We'll be here only if the master crashed. In the event of + # a graceful demote, a post-demote notification event would have occurred. + # The post-demote include an unset-master that + # resets the slave after the completion of the IO and SQL + # threads. The post-demote doesn't run if the master host + # crashed. + + # Let's establish the master score based on the following + # rule. + # score = ((file number diff)*master_max_binlog_size + # + fileposdiff/10 + constante + # + # All events are at least 10 bytes so dividing by 10 doesn't reduce + # the resolution and increases the span. + # + # Since the master publishes its status only once per few + # seconds, the fileposdiff is likely positive. + # We'll cap all values to int signed range et target + # 1B as the value if a slave is fully in sync with the master + + #First unset_master to allow the application of the relay-log if any + #The unset_master will return the last slave status variables. + unset_master + + last_reported_master_file_number=`echo $last_reported_master_file | cut -d'.' -f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` + master_log_file_number=`echo $master_log_file | cut -d'.' 
-f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` + + master_score=$(((100000000+\ + ($master_log_file_number-$last_reported_master_file_number)*\ + $master_max_binlog_size+$master_log_pos-$last_reported_master_pos)/10)) + + # now, the caps, the upper cap is unlikely + if [ $master_score -gt 2147483647 ]; then + master_score=2147483647 + fi + + # the lower cap could happened if a slave lags behind by + # more then 30GB of binlog. In that case... do we really care + # if we floor the value + + if [ $master_score -lt -2147483647 ]; then + master_score=-2147483647 + fi + + $CRM_MASTER -v $master_score + + # Next, we need a reminder that the master crashed and when, that will be + # used to publish the last trx in the promote event if we are picket + # as the new master. + + $CRM_ATTR_MASTER_CRASHED_TS -v `date +%s` + + fi + fi + ;; + + 'post-promote') + # The master has completed its promotion. Now is a good + # time to check whether our replication slave is working + # correctly. + + # Is the notification for our set + notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_promote_resource|cut -d: -f1` + my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` + if [ $notify_resource != ${my_resource} ]; then + ocf_log debug "Notification is not for us" + return $OCF_SUCCESS + fi + + master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "` + if [ "$master_host" = ${HOSTNAME} ]; then + ocf_log info "This will be the new master, ignoring post-promote notification." + else + + # Is this following a recent master crash? + master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` + + if [ ! -z "$master_crashed_ts" -a "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then + if [ `date +%s` -lt "$((${master_crashed_ts}+3600))" ]; then + # Master crashed less than 1h ago, let's see what our last trx was. 
+ # Since the master crashed, we didn't have the post-demote notification + # so the slave may still configured if no monitor ops has run + + # if a monitor ops occurred, it should be saved in the cib + last_trx_md5=`$CRM_ATTR_NODE_LAST_TRX_MD5 --query` + + if [ -z "$last_trx_md5" ]; then + # no last_trx_md5 set in the cib, let's try to find it + + # First, where are the relay logs? That will be easier when the using_multi_config + # branch will be merged. If the path is not defined, the output will be "." + relaylog_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^relay-log ' | awk '{ print $2 }'` + relaylog_path=`dirname $relaylog_path` + + if [ "$log_bin_path" == "." ]; then + relaylog_path=$OCF_RESKEY_datadir + fi + + # Let's find the last binlog file + get_slave_info + last_relaylog_file="$relay_log_file" + + #ok now we need to find the md5 of the last trx + + last_trx_md5=`$OCF_RESKEY_prm_binlog_parser_path ${relaylog_path}/${last_relaylog_file} | tail -n 1 | cut -d',' -f2` + fi + + if [ ! -z "$last_trx_md5" ]; then + # now, let's try to find this md5 in the NEW_MASTER_LAST_TRX attribute + # There maybe up to 2 binlog files in the attribute + + #1st file + binlog_file=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f1` + binlog_pos=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f2 | tr '|' '\n' | grep -A1 $last_trx_md5 | tail -n 1 | cut -d',' -f1` + + #found? + if [ -z "$binlog_pos" ]; then + #no, let's try if there's a 2nd file + binlog_file=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f3` + + if [ -z "$binlog_file" ]; then + binlog_pos=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f4 | tr '|' '\n' | grep -A1 $last_trx_md5 | tail -n 1 | cut -d',' -f1` + fi + fi + + # TODO: we could be at the end of the 2nd file so we should point to the first entry of the first + # file. Edge case, will deal with it later. + + # have we found something? + if [ ! -z "$binlog_file" -a ! 
-z "$binlog_pos" ]; then + # Let's overwrite the glb_local_info variable + glb_local_info="`echo $glb_local_info | cut -d'|' -f1`|$binlog_file|$binlog_pos" + fi + fi + fi + fi + + ocf_log info "Resetting replication" + unset_master #Should be unset already execpt if master crashed + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + ocf_log info "Changing MySQL configuration to replicate from $master_host" + set_master + if [ $? -ne 0 ]; then + return $OCF_ERR_GENERIC + fi + + start_slave + if [ $? -ne 0 ]; then + ocf_log err "Failed to start slave" + return $OCF_ERR_GENERIC + fi + fi + return $OCF_SUCCESS + ;; + + 'pre-demote') + # Is the notification for our set + notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1` + my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` + if [ $notify_resource != ${my_resource} ]; then + ocf_log debug "Notification is not for us" + return $OCF_SUCCESS + fi + + demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` + if [ $demote_host = ${HOSTNAME} ]; then + ocf_log info "post-demote notification for $demote_host" + set_read_only on + if [ $? -ne 0 ]; then + ocf_log err "Failed to set read-only"; + return $OCF_ERR_GENERIC; + fi + + # Must kill all existing user threads because they are still Read/write + # in order for the slaves to complete the read of binlogs + local tmpfile + tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX` + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ + -e "SHOW PROCESSLIST" > $tmpfile + + for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile` + do + mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ + -e "KILL ${thread}" + done + rm -f $tmpfile + else + ocf_log info "Ignoring post-demote notification execpt for my own demotion." 
+ fi + return $OCF_SUCCESS + ;; + 'post-demote') + # Is the notification for our set + notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1` + my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` + if [ $notify_resource != ${my_resource} ]; then + ocf_log debug "Notification is not for us" + return $OCF_SUCCESS + fi + + demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` + if [ $demote_host = ${HOSTNAME} ]; then + ocf_log info "Ignoring post-demote notification for my own demotion." + return $OCF_SUCCESS + fi + ocf_log info "post-demote notification for $demote_host." + # The former master has just been gracefully demoted. + unset_master + ;; + *) + return $OCF_SUCCESS + ;; + esac +} + +# +# mysql_run: Run a mysql command, log its output and return the proper error code. +# Usage: mysql_run [-Q] [-info|-warn|-err] [-O] [-sw] +# -Q: don't log the output of the command if it succeeds +# -info|-warn|-err: log the output of the command at given +# severity if it fails (defaults to err) +# -O: echo the output of the command +# -sw: Suppress 5.6 client warning when password is used on the command line +# Adapted from ocf_run. +# +mysql_run() { + local rc + local output outputfile + local verbose=1 + local returnoutput + local loglevel=err + local suppress_56_password_warning + local var + + for var in 1 2 3 4 + do + case "$1" in + "-Q") + verbose="" + shift 1;; + "-info"|"-warn"|"-err") + loglevel=`echo $1 | sed -e s/-//g` + shift 1;; + "-O") + returnoutput=1 + shift 1;; + "-sw") + suppress_56_password_warning=1 + shift 1;; + + *) + ;; + esac + done + + outputfile=`mktemp ${HA_RSCTMP}/mysql_run.${OCF_RESOURCE_INSTANCE}.XXXXXX` + error=`"$@" 2>&1 1>$outputfile` + rc=$? + if [ "$suppress_56_password_warning" -eq 1 ]; then + error=`echo "$error" | egrep -v '^Warning: Using a password on the command line'` + fi + output=`cat $outputfile` + rm -f $outputfile + + if [ $rc -eq 0 ]; then + if [ "$verbose" -a ! 
-z "$output" ]; then + ocf_log info "$output" + fi + + if [ "$returnoutput" -a ! -z "$output" ]; then + echo "$output" + fi + + MYSQL_LAST_ERR=$OCF_SUCCESS + return $OCF_SUCCESS + else + if [ ! -z "$error" ]; then + ocf_log $loglevel "$error" + regex='^ERROR ([[:digit:]]{4}).*' + if [[ $error =~ $regex ]]; then + mysql_code=${BASH_REMATCH[1]} + if [ -n "$mysql_code" ]; then + MYSQL_LAST_ERR=$mysql_code + return $rc + fi + fi + else + ocf_log $loglevel "command failed: $*" + fi + # No output to parse so return the standard exit code. + MYSQL_LAST_ERR=$rc + return $rc + fi +} + +####################################################################### + + +########################################################################## +# If DEBUG_LOG is set, make this resource agent easy to debug: set up the +# debug log and direct all output to it. Otherwise, redirect to /dev/null. +# The log directory must be a directory owned by root, with permissions 0700, +# and the log must be writable and not a symlink. +########################################################################## +DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" +if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then + DEBUG_LOG_DIR="${DEBUG_LOG%/*}" + if [ -d "${DEBUG_LOG_DIR}" ]; then + exec 9>>"$DEBUG_LOG" + exec 2>&9 + date >&9 + echo "$*" >&9 + env | grep OCF_ | sort >&9 + set -x + else + exec 9>/dev/null + fi +fi + +case "$1" in + meta-data) meta_data + exit $OCF_SUCCESS;; + usage|help) usage + exit $OCF_SUCCESS;; +esac + +if [ "$#" -lt "1" ]; then + usage + exit $OCF_SUCCESS +fi + +mysql_validate +rc=$? +LSB_STATUS_STOPPED=3 +if [ $rc -ne 0 ]; then + case "$1" in + stop) exit $OCF_SUCCESS;; + monitor) exit $OCF_NOT_RUNNING;; + status) exit $LSB_STATUS_STOPPED;; + *) exit $rc;; + esac +fi + +# We check if there is a location constraint against this node +# where $OCF_RESOURCE_INSTANCE should not be running here i.e. 
+# -INFINITY score, if so we ignore monitor call for this node +contrnt=$(cibadmin -t 2 --query --obj_type constraints\ + |awk "/rsc=\"$OCF_RESOURCE_INSTANCE\"/,/<\/rsc_location/"\ + |awk '/score="-INFINITY"/,/<\/rule/'\ + |egrep "expression attribute=\"#uname\".*operation=\"eq\".*value=\"$HOSTNAME\"" 2> /dev/null) + +if [ "$?" -eq "0" ]; then + exit $OCF_SUCCESS +fi + +#Global info missing from OCF_RESKEY +resources=`$CRM_RES --list` + +# now we need the master-slave clone set name, need to walk around limitations +# of older pacemaker +if [[ "$OCF_RESKEY_crm_feature_set" > "3.0.1" ]]; then + glb_master_resource=`echo "$resources" | grep $INSTANCE_ATTR_NAME | awk '{print $3}' | head -n 1` +else + # older versions of Pacemaker don't write the primitive name in the resources list + for msr in `echo "$resources" | grep 'Master/Slave' | awk '{print $3}'`; do + isThere=`$CRM_RES -q -r $msr | grep primitive | grep -c $INSTANCE_ATTR_NAME` + if [ "$isThere" -gt "0" ]; then + glb_master_resource=$msr + fi + done +fi +is_master_side +glb_master_side=$? 
+if [ "${#OCF_RESKEY_geo_remote_IP}" -gt "0" -a $glb_master_side -ne 0 ]; then + # geo_remote_IP is defined, let's query the remote side + # the variable content will be like: pacemaker-1-1|binlog.000156|107 1 + glb_remote_info=`$SSH $OCF_RESKEY_geo_remote_IP -l root "$CRM_ATTR_REPL_INFO --query -q | tr '\n' ' ';$CRM_RES --list | grep -A2 $glb_master_resource | egrep -c 'Master[^\/]'"` + glb_master_exists=`echo $glb_remote_info | awk '{ print $NF }'` + if [[ -z "$glb_master_exists" ]]; then + glb_master_exists=0 + fi +else + glb_master_exists=`echo "$resources" | grep -A2 $glb_master_resource | egrep -c 'Master[^\/]'` +fi + +if [ "$glb_master_exists" -eq "1" ]; then + if [ "${#glb_remote_info}" -gt "0" ]; then + glb_cib_master=`echo $glb_remote_info | awk '{ print $1 }' | cut -d'|' -f1` + else + glb_local_info=`$CRM_ATTR_REPL_INFO --query -q` + glb_cib_master=`echo $glb_local_info | cut -d'|' -f1` + fi +fi + +# What kind of method was invoked? +case "$1" in + start) mysql_start;; + stop) mysql_stop;; + status) mysql_status err;; + monitor) mysql_monitor;; + promote) mysql_promote;; + demote) mysql_demote;; + notify) mysql_notify;; + validate-all) exit $OCF_SUCCESS;; + + *) usage + exit $OCF_ERR_UNIMPLEMENTED;; +esac From 1ab6102145ce18c58bf57f8016b9edce2e25f82d Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 2 Mar 2015 18:09:39 -0300 Subject: [PATCH 02/22] add charm_dir() --- hooks/percona_utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index 70f951c..e8990e8 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -9,6 +9,7 @@ from charmhelpers.core.host import ( lsb_release ) from charmhelpers.core.hookenv import ( + charm_dir, unit_get, relation_ids, related_units, @@ -234,7 +235,7 @@ def unit_sorted(units): def install_mysql_ocf(): dest_file = '/usr/lib/ocf/resource.d/percona/mysql' - src_file = 'ofc/percona/mysql' + src_file = os.path.join(charm_dir(), 
'ofc/percona/mysql') if not os.path.isdir(os.path.dirname(dest_file)): os.makedirs(os.path.dirname(dest_file)) From 8473bd1f93fcc39537a03b5af16d9be9415b8f9a Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 2 Mar 2015 18:17:13 -0300 Subject: [PATCH 03/22] Fix typo --- hooks/percona_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index e8990e8..d58ba9e 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -235,7 +235,7 @@ def unit_sorted(units): def install_mysql_ocf(): dest_file = '/usr/lib/ocf/resource.d/percona/mysql' - src_file = os.path.join(charm_dir(), 'ofc/percona/mysql') + src_file = os.path.join(charm_dir(), 'ocf/percona/mysql') if not os.path.isdir(os.path.dirname(dest_file)): os.makedirs(os.path.dirname(dest_file)) From 42f20eb8cc053b274fe6681fe26fbeac69635e56 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Wed, 4 Mar 2015 11:42:47 -0300 Subject: [PATCH 04/22] Create mysql resource to monitor the daemon This new pcmkr resource will take care of keep running mysqld and if it can't then it will migrate the vip to a node where it is. 
--- hooks/percona_hooks.py | 27 ++++++++++++++++----------- templates/my.cnf | 1 + 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index d269a7c..5efa1a3 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -73,14 +73,11 @@ from charmhelpers.contrib.network.ip import ( hooks = Hooks() LEADER_RES = 'grp_percona_cluster' -RES_MYSQL_PARAMS = ('params config="/etc/my.cnf" ' - 'pid="/var/lib/mysql/mysqld.pid" ' +RES_MYSQL_PARAMS = ('params config="/etc/mysql/my.cnf" ' + 'pid="/var/run/mysqld/mysqld.pid" ' 'socket="/var/run/mysqld/mysqld.sock" ' - 'replication_user="sstuser" ' - 'replication_passwd="%(sstpsswd)s" ' - 'max_slave_lag="60" ' - 'evict_outdated_slaves="false" ' - 'binary="/usr/libexec/mysqld" ' + 'max_slave_lag="60" ' # default is 3600 + 'binary="/usr/sbin/mysqld" ' 'op monitor interval="5s" role="Master" ' 'OCF_CHECK_LEVEL="1" ' 'op monitor interval="2s" role="Slave" ' @@ -405,20 +402,28 @@ def ha_relation_joined(): (vip, vip_cidr, vip_iface) resources = {'res_mysql_vip': res_mysql_vip, - 'res_mysql': 'ocf:percona:mysql'} + 'res_mysqld': 'ocf:percona:mysql'} db_helper = get_db_helper() - sstpsswd = db_helper.get_mysql_password(username='sstuser') + cfg_passwd = config('sst-password') + sstpsswd = db_helper.get_mysql_password(username='sstuser', + password=cfg_passwd) resource_params = {'res_mysql_vip': vip_params, - 'res_mysql': RES_MYSQL_PARAMS % {'sstpsswd': sstpsswd}} + 'res_mysqld': RES_MYSQL_PARAMS % {'sstpsswd': sstpsswd}} groups = {'grp_percona_cluster': 'res_mysql_vip'} + clones = {'cl_mysqld': 'res_mysqld meta interleave=true'} + + colocations = {'vip_mysqld': 'inf: res_mysqld res_mysql_vip role=Master'} + for rel_id in relation_ids('ha'): relation_set(relation_id=rel_id, corosync_bindiface=corosync_bindiface, corosync_mcastport=corosync_mcastport, resources=resources, resource_params=resource_params, - groups=groups) + groups=groups, + clones=clones, + 
colocations=colocations) @hooks.hook('ha-relation-changed') diff --git a/templates/my.cnf b/templates/my.cnf index 19f3f86..28891f5 100644 --- a/templates/my.cnf +++ b/templates/my.cnf @@ -11,6 +11,7 @@ wsrep_provider_options = {{ wsrep_provider_options }} datadir=/var/lib/mysql user=mysql +pid_file = /var/run/mysqld/mysqld.pid # Path to Galera library wsrep_provider=/usr/lib/libgalera_smm.so From 5979fd154f4d70e380add0eab73083be351f486c Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Fri, 6 Mar 2015 12:33:52 -0300 Subject: [PATCH 05/22] pcmkr: Add mysql_monitor agent with pxc support --- ocf/percona/mysql_monitor | 632 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100755 ocf/percona/mysql_monitor diff --git a/ocf/percona/mysql_monitor b/ocf/percona/mysql_monitor new file mode 100755 index 0000000..ba6f841 --- /dev/null +++ b/ocf/percona/mysql_monitor @@ -0,0 +1,632 @@ +#!/bin/bash +# +# +# MySQL_Monitor agent, set writeable and readable attributes based on the +# state of the local MySQL, running and read_only or not. The agent basis is +# the original "Dummy" agent written by Lars Marowsky-Brée and part of the +# Pacemaker distribution. Many functions are from mysql_prm. +# +# +# Copyright (c) 2013, Percona inc., Yves Trudeau, Michael Coburn +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# This program is distributed in the hope that it would be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# +# Further, this software is distributed without any warranty that it is +# free of the rightful claim of any third person regarding infringement +# or the like. Any license provided herein, whether implied or +# otherwise, applies only to this software file. 
Patent licenses, if +# any, provided herein do not apply to combinations of this program with +# other software, or any other product whatsoever. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. +# +# Version: 20131119163921 +# +# See usage() function below for more details... +# +# OCF instance parameters: +# +# OCF_RESKEY_state +# OCF_RESKEY_user +# OCF_RESKEY_password +# OCF_RESKEY_client_binary +# OCF_RESKEY_pid +# OCF_RESKEY_socket +# OCF_RESKEY_reader_attribute +# OCF_RESKEY_reader_failcount +# OCF_RESKEY_writer_attribute +# OCF_RESKEY_max_slave_lag +# OCF_RESKEY_cluster_type +# +####################################################################### +# Initialization: + +: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} +. ${OCF_FUNCTIONS_DIR}/ocf-shellfuncs + +####################################################################### + +HOSTOS=`uname` +if [ "X${HOSTOS}" = "XOpenBSD" ];then +OCF_RESKEY_client_binary_default="/usr/local/bin/mysql" +OCF_RESKEY_pid_default="/var/mysql/mysqld.pid" +OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock" +else +OCF_RESKEY_client_binary_default="/usr/bin/mysql" +OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid" +OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock" +fi +OCF_RESKEY_reader_attribute_default="readable" +OCF_RESKEY_writer_attribute_default="writable" +OCF_RESKEY_reader_failcount_default="1" +OCF_RESKEY_user_default="root" +OCF_RESKEY_password_default="" +OCF_RESKEY_max_slave_lag_default="3600" +OCF_RESKEY_cluster_type_default="replication" + +: ${OCF_RESKEY_state=${HA_RSCTMP}/mysql-monitor-${OCF_RESOURCE_INSTANCE}.state} +: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}} +: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} +: ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}} +: 
${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}} +: ${OCF_RESKEY_reader_failcount=${OCF_RESKEY_reader_failcount_default}} +: ${OCF_RESKEY_writer_attribute=${OCF_RESKEY_writer_attribute_default}} +: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} +: ${OCF_RESKEY_password=${OCF_RESKEY_password_default}} +: ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}} +: ${OCF_RESKEY_cluster_type=${OCF_RESKEY_cluster_type_default}} + +MYSQL="$OCF_RESKEY_client_binary -A -S $OCF_RESKEY_socket --connect_timeout=10 --user=$OCF_RESKEY_user --password=$OCF_RESKEY_password " +HOSTNAME=`uname -n` +CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME " + +meta_data() { + cat < + + +1.0 + + +This agent monitors the local MySQL instance and set the writable and readable +attributes according to what it finds. It checks if MySQL is running and if +it is read-only or not. + +Agent monitoring mysql + + + + +Location to store the resource state in. + +State file + + + + + +MySQL user to connect to the local MySQL instance to check the slave status and +if the read_only variable is set. It requires the replication client priviledge. + +MySQL user + + + + + +Password of the mysql user to connect to the local MySQL instance + +MySQL password + + + + + +MySQL Client Binary path. + +MySQL client binary path + + + + + +Unix socket to use in order to connect to MySQL on the host + +MySQL socket + + + + + +MySQL pid file, used to verify MySQL is running. + +MySQL pid file + + + + + +The reader attribute in the cib that can be used by location rules to allow or not +reader VIPs on a host. + +Reader attribute + + + + + +The reader attribute in the cib that can be used by location rules to allow or not +reader VIPs on a host. + +Writer attribute + + + + + +The maximum number of seconds a replication slave is allowed to lag +behind its master in order to have a reader VIP on it. 
+ +Maximum time (seconds) a MySQL slave is allowed +to lag behind a master + + + + + +Type of cluster, three possible values: pxc, replication, read-only. "pxc" is +for Percona XtraDB cluster, it uses the clustercheck script and set the +reader_attribute and writer_attribute according to the return code. +"replication" checks the read-only state and the slave status, only writable +node(s) will get the writer_attribute (and the reader_attribute) and on the +read-only nodes, replication status will be checked and the reader_attribute set +according to the state. "read-only" will just check if the read-only variable, +if read/write, it will get both the writer_attribute and reader_attribute set, if +read-only it will get only the reader_attribute. + +Type of cluster + + + + + + + + + + + + + + + + +END +} + +####################################################################### +# Non API functions + +# Extract fields from slave status +parse_slave_info() { + # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 + sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 +} + +# Read the slave status and +get_slave_info() { + + local mysql_options tmpfile + + if [ "$master_log_file" -a "$master_host" ]; then + # variables are already defined, get_slave_info has been run before + return $OCF_SUCCESS + else + tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` + + mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ + -e 'SHOW SLAVE STATUS\G' > $tmpfile + + if [ -s $tmpfile ]; then + master_host=`parse_slave_info Master_Host $tmpfile` + slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` + slave_io=`parse_slave_info Slave_IO_Running $tmpfile` + slave_io_state=`parse_slave_info Slave_IO_State $tmpfile` + last_errno=`parse_slave_info Last_Errno $tmpfile` + secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` + ocf_log debug "MySQL instance has a non empty slave status" + else + # Instance produced an empty "SHOW SLAVE STATUS" output -- + # 
instance is not a slave + + ocf_log err "check_slave invoked on an instance that is not a replication slave." + rm -f $tmpfile + return $OCF_ERR_GENERIC + fi + rm -f $tmpfile + return $OCF_SUCCESS + fi +} + +get_read_only() { + # Check if read-only is set + local read_only_state + + read_only_state=`mysql_run -Q -sw -O $MYSQL -N $MYSQL_OPTIONS_REPL \ + -e "SHOW VARIABLES like 'read_only'" | awk '{print $2}'` + + if [ "$read_only_state" = "ON" ]; then + return 0 + else + return 1 + fi +} + +# get the attribute controlling the readers VIP +get_reader_attr() { + local attr_value + local rc + + attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` + rc=$? + if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi + +} + +# Set the attribute controlling the readers VIP +set_reader_attr() { + local curr_attr_value + + curr_attr_value=$(get_reader_attr) + + if [ "$1" -eq "0" ]; then + if [ "$curr_attr_value" -gt "0" ]; then + curr_attr_value=$((${curr_attr_value}-1)) + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $curr_attr_value + else + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v 0 + fi + else + if [ "$curr_attr_value" -ne "$OCF_RESKEY_reader_failcount" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $OCF_RESKEY_reader_failcount + fi + fi + +} + +# get the attribute controlling the writer VIP +get_writer_attr() { + local attr_value + local rc + + attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_writer_attribute} --query -q` + rc=$? 
+ if [ "$rc" -eq "0" ]; then + echo $attr_value + else + echo -1 + fi + +} + +# Set the attribute controlling the writer VIP +set_writer_attr() { + local curr_attr_value + + curr_attr_value=$(get_writer_attr) + + if [ "$1" -ne "$curr_attr_value" ]; then + if [ "$1" -eq "0" ]; then + $CRM_ATTR -l reboot --name ${OCF_RESKEY_writer_attribute} -v 0 + else + $CRM_ATTR -l reboot --name ${OCF_RESKEY_writer_attribute} -v 1 + fi + fi +} + +# +# mysql_run: Run a mysql command, log its output and return the proper error code. +# Usage: mysql_run [-Q] [-info|-warn|-err] [-O] [-sw] +# -Q: don't log the output of the command if it succeeds +# -info|-warn|-err: log the output of the command at given +# severity if it fails (defaults to err) +# -O: echo the output of the command +# -sw: Suppress 5.6 client warning when password is used on the command line +# Adapted from ocf_run. +# +mysql_run() { + local rc + local output outputfile + local verbose=1 + local returnoutput + local loglevel=err + local suppress_56_password_warning + local var + + for var in 1 2 3 4 + do + case "$1" in + "-Q") + verbose="" + shift 1;; + "-info"|"-warn"|"-err") + loglevel=`echo $1 | sed -e s/-//g` + shift 1;; + "-O") + returnoutput=1 + shift 1;; + "-sw") + suppress_56_password_warning=1 + shift 1;; + + *) + ;; + esac + done + + outputfile=`mktemp ${HA_RSCTMP}/mysql_run.${OCF_RESOURCE_INSTANCE}.XXXXXX` + error=`"$@" 2>&1 1>$outputfile` + rc=$? + if [ "$suppress_56_password_warning" -eq 1 ]; then + error=`echo "$error" | egrep -v '^Warning: Using a password on the command line'` + fi + output=`cat $outputfile` + rm -f $outputfile + + if [ $rc -eq 0 ]; then + if [ "$verbose" -a ! -z "$output" ]; then + ocf_log info "$output" + fi + + if [ "$returnoutput" -a ! -z "$output" ]; then + echo "$output" + fi + + MYSQL_LAST_ERR=$OCF_SUCCESS + return $OCF_SUCCESS + else + if [ ! 
-z "$error" ]; then + ocf_log $loglevel "$error" + regex='^ERROR ([[:digit:]]{4}).*' + if [[ $error =~ $regex ]]; then + mysql_code=${BASH_REMATCH[1]} + if [ -n "$mysql_code" ]; then + MYSQL_LAST_ERR=$mysql_code + return $rc + fi + fi + else + ocf_log $loglevel "command failed: $*" + fi + # No output to parse so return the standard exit code. + MYSQL_LAST_ERR=$rc + return $rc + fi +} + + + + +####################################################################### +# API functions + +mysql_monitor_usage() { + cat </dev/null 2>&1 + fi + + if [ $? -eq 0 ]; then + + case ${OCF_RESKEY_cluster_type} in + 'replication'|'REPLICATION') + if get_read_only; then + # a slave? + + set_writer_attr 0 + + get_slave_info + rc=$? + + if [ $rc -eq 0 ]; then + # show slave status is not empty + # Is there a master_log_file defined? (master_log_file is deleted + # by reset slave + if [ "$master_log_file" ]; then + # is read_only but no slave config... + + set_reader_attr 0 + + else + # has a slave config + + if [ "$slave_sql" = 'Yes' -a "$slave_io" = 'Yes' ]; then + # $secs_behind can be NULL so must be tested only + # if replication is OK + if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then + set_reader_attr 0 + else + set_reader_attr 1 + fi + else + set_reader_attr 0 + fi + fi + else + # "SHOW SLAVE STATUS" returns an empty set if instance is not a + # replication slave + + set_reader_attr 0 + + fi + else + # host is RW + set_reader_attr 1 + set_writer_attr 1 + fi + ;; + + 'pxc'|'PXC') + pxcstat=`/usr/bin/clustercheck $OCF_RESKEY_user $OCF_RESKEY_password ` + if [ $? 
-eq 0 ]; then + set_reader_attr 1 + set_writer_attr 1 + else + set_reader_attr 0 + set_writer_attr 0 + fi + + ;; + + 'read-only'|'READ-ONLY') + if get_read_only; then + set_reader_attr 1 + set_writer_attr 0 + else + set_reader_attr 1 + set_writer_attr 1 + fi + ;; + + esac + fi + else + ocf_log $1 "MySQL is not running" + set_reader_attr 0 + set_writer_attr 0 + fi +} + +mysql_monitor_monitor() { + # Monitor _MUST!_ differentiate correctly between running + # (SUCCESS), failed (ERROR) or _cleanly_ stopped (NOT RUNNING). + # That is THREE states, not just yes/no. + + if [ -f ${OCF_RESKEY_state} ]; then + return $OCF_SUCCESS + fi + if false ; then + return $OCF_ERR_GENERIC + fi + return $OCF_NOT_RUNNING +} + +mysql_monitor_validate() { + + # Is the state directory writable? + state_dir=`dirname "$OCF_RESKEY_state"` + touch "$state_dir/$$" + if [ $? != 0 ]; then + return $OCF_ERR_ARGS + fi + rm "$state_dir/$$" + + return $OCF_SUCCESS +} + +########################################################################## +# If DEBUG_LOG is set, make this resource agent easy to debug: set up the +# debug log and direct all output to it. Otherwise, redirect to /dev/null. +# The log directory must be a directory owned by root, with permissions 0700, +# and the log must be writable and not a symlink. +########################################################################## +DEBUG_LOG="/tmp/mysql_monitor.ocf.ra.debug/log" +if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then + DEBUG_LOG_DIR="${DEBUG_LOG%/*}" + if [ -d "${DEBUG_LOG_DIR}" ]; then + exec 9>>"$DEBUG_LOG" + exec 2>&9 + date >&9 + echo "$*" >&9 + env | grep OCF_ | sort >&9 + set -x + else + exec 9>/dev/null + fi +fi + + +case $__OCF_ACTION in +meta-data) meta_data + exit $OCF_SUCCESS + ;; +start) mysql_monitor_start;; +stop) mysql_monitor_stop;; +monitor) mysql_monitor + mysql_monitor_monitor;; +migrate_to) ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} to ${OCF_RESKEY_CRM_meta_migrate_target}." 
+ mysql_monitor_stop + ;; +migrate_from) ocf_log info "Migrating ${OCF_RESOURCE_INSTANCE} from ${OCF_RESKEY_CRM_meta_migrate_source}." + mysql_monitor_start + ;; +reload) ocf_log info "Reloading ${OCF_RESOURCE_INSTANCE} ..." + ;; +validate-all) mysql_monitor_validate;; +usage|help) mysql_monitor_usage + exit $OCF_SUCCESS + ;; +*) mysql_monitor_usage + exit $OCF_ERR_UNIMPLEMENTED + ;; +esac +rc=$? +ocf_log debug "${OCF_RESOURCE_INSTANCE} $__OCF_ACTION : $rc" +exit $rc + From 2917a37761051242440b2e1484fc10dbe392305f Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Fri, 6 Mar 2015 12:35:01 -0300 Subject: [PATCH 06/22] configure mysql_monitor agent Defining a location rule to make sure the vip is always running in a node that has a writable percona --- .bzrignore | 3 + Makefile | 4 ++ hooks/percona_hooks.py | 33 +++++----- hooks/percona_utils.py | 17 +++-- tests/00-setup.sh | 20 ++++++ tests/10-deploy_test.py | 29 +++++++++ tests/20-broken-mysqld.py | 38 ++++++++++++ tests/basic_deployment.py | 126 ++++++++++++++++++++++++++++++++++++++ 8 files changed, 248 insertions(+), 22 deletions(-) create mode 100755 tests/00-setup.sh create mode 100755 tests/10-deploy_test.py create mode 100755 tests/20-broken-mysqld.py create mode 100644 tests/basic_deployment.py diff --git a/.bzrignore b/.bzrignore index 4ff08cd..0c2be0d 100644 --- a/.bzrignore +++ b/.bzrignore @@ -2,3 +2,6 @@ bin .coverage .pydevproject .project +*.pyc +*.pyo +__pycache__ diff --git a/Makefile b/Makefile index 858c03f..6cec23b 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,10 @@ lint: unit_test: @$(PYTHON) /usr/bin/nosetests --nologcapture unit_tests +functional_test: + @echo Starting amulet tests... 
+ @juju test -v -p AMULET_HTTP_PROXY --timeout 900 + bin/charm_helpers_sync.py: @mkdir -p bin @bzr cat lp:charm-helpers/tools/charm_helpers_sync/charm_helpers_sync.py \ diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index 5efa1a3..f236154 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -73,17 +73,13 @@ from charmhelpers.contrib.network.ip import ( hooks = Hooks() LEADER_RES = 'grp_percona_cluster' -RES_MYSQL_PARAMS = ('params config="/etc/mysql/my.cnf" ' - 'pid="/var/run/mysqld/mysqld.pid" ' - 'socket="/var/run/mysqld/mysqld.sock" ' - 'max_slave_lag="60" ' # default is 3600 - 'binary="/usr/sbin/mysqld" ' - 'op monitor interval="5s" role="Master" ' - 'OCF_CHECK_LEVEL="1" ' - 'op monitor interval="2s" role="Slave" ' - 'OCF_CHECK_LEVEL="1" ' - 'op start interval="0" timeout="60s" ' - 'op stop interval="0" timeout="60s" ') +RES_MONITOR_PARAMS = ('params user="sstuser" password="%(sstpass)s" ' + 'pid="/var/run/mysqld/mysqld.pid" ' + 'socket="/var/run/mysqld/mysqld.sock" ' + 'max_slave_lag="5" ' + 'cluster_type="pxc" ' + 'op monitor interval="1s" timeout="30s" ' + 'OCF_CHECK_LEVEL="1"') @hooks.hook('install') @@ -402,18 +398,22 @@ def ha_relation_joined(): (vip, vip_cidr, vip_iface) resources = {'res_mysql_vip': res_mysql_vip, - 'res_mysqld': 'ocf:percona:mysql'} + 'res_mysql_monitor': 'ocf:percona:mysql_monitor'} db_helper = get_db_helper() cfg_passwd = config('sst-password') sstpsswd = db_helper.get_mysql_password(username='sstuser', password=cfg_passwd) resource_params = {'res_mysql_vip': vip_params, - 'res_mysqld': RES_MYSQL_PARAMS % {'sstpsswd': sstpsswd}} + 'res_mysql_monitor': + RES_MONITOR_PARAMS % {'sstpass': sstpsswd}} groups = {'grp_percona_cluster': 'res_mysql_vip'} - clones = {'cl_mysqld': 'res_mysqld meta interleave=true'} + clones = {'cl_mysql_monitor': 'res_mysql_monitor meta interleave=true'} - colocations = {'vip_mysqld': 'inf: res_mysqld res_mysql_vip role=Master'} + colocations = {'vip_mysqld': 'inf: 
grp_percona_cluster cl_mysql_monitor'} + + locations = {'loc_percona_cluster': + 'grp_percona_cluster rule inf: writable eq 1'} for rel_id in relation_ids('ha'): relation_set(relation_id=rel_id, @@ -423,7 +423,8 @@ def ha_relation_joined(): resource_params=resource_params, groups=groups, clones=clones, - colocations=colocations) + colocations=colocations, + locations=locations) @hooks.hook('ha-relation-changed') diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index d58ba9e..c259ab4 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -234,10 +234,15 @@ def unit_sorted(units): def install_mysql_ocf(): - dest_file = '/usr/lib/ocf/resource.d/percona/mysql' - src_file = os.path.join(charm_dir(), 'ocf/percona/mysql') + dest_dir = '/usr/lib/ocf/resource.d/percona/' + for fname in ['ocf/percona/mysql', 'ocf/percona/mysql_monitor']: + src_file = os.path.join(charm_dir(), fname) + if not os.path.isdir(dest_dir): + os.makedirs(dest_dir) - if not os.path.isdir(os.path.dirname(dest_file)): - os.makedirs(os.path.dirname(dest_file)) - if not os.path.exists(dest_file): - shutil.copy(src_file, dest_file) + dest_file = os.path.join(dest_dir, os.path.basename(src_file)) + if not os.path.exists(dest_file): + log('Installing %s' % dest_file, level='INFO') + shutil.copy(src_file, dest_file) + else: + log("'%s' already exists, skipping" % dest_file, level='INFO') diff --git a/tests/00-setup.sh b/tests/00-setup.sh new file mode 100755 index 0000000..4d9b849 --- /dev/null +++ b/tests/00-setup.sh @@ -0,0 +1,20 @@ +#!/bin/bash -ex +# The script installs amulet and other tools needed for the amulet tests. + +# Get the status of the amulet package, this returns 0 of package is installed. +dpkg -s amulet +if [ $? -ne 0 ]; then + # Install the Amulet testing harness. 
+ sudo add-apt-repository -y ppa:juju/stable + sudo apt-get update + sudo apt-get install -y amulet juju-core charm-tools +fi + + +PACKAGES="python3 python3-yaml" +for pkg in $PACKAGES; do + dpkg -s python3 + if [ $? -ne 0 ]; then + sudo apt-get install -y -q $pkg + fi +done diff --git a/tests/10-deploy_test.py b/tests/10-deploy_test.py new file mode 100755 index 0000000..312e9c1 --- /dev/null +++ b/tests/10-deploy_test.py @@ -0,0 +1,29 @@ +#!/usr/bin/python3 +# test percona-cluster (3 nodes) + +import basic_deployment +import time + + +class ThreeNode(basic_deployment.BasicDeployment): + def __init__(self): + super(ThreeNode, self).__init__(units=3) + + def run(self): + super(ThreeNode, self).run() + # we are going to kill the master + old_master = self.master_unit + self.master_unit.run('sudo poweroff') + + time.sleep(10) # give some time to pacemaker to react + new_master = self.find_master() + assert new_master is not None, "master unit not found" + assert (new_master.info['public-address'] != + old_master.info['public-address']) + + assert self.is_port_open(address=self.vip), 'cannot connect to vip' + + +if __name__ == "__main__": + t = ThreeNode() + t.run() diff --git a/tests/20-broken-mysqld.py b/tests/20-broken-mysqld.py new file mode 100755 index 0000000..520126f --- /dev/null +++ b/tests/20-broken-mysqld.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# test percona-cluster (3 nodes) + +import basic_deployment +import time + + +class ThreeNode(basic_deployment.BasicDeployment): + def __init__(self): + super(ThreeNode, self).__init__(units=3) + + def run(self): + super(ThreeNode, self).run() + # we are going to kill the master + old_master = self.master_unit + print('stopping mysql in %s' % str(self.master_unit.info)) + self.master_unit.run('sudo service mysql stop') + + print('looking for the new master') + i = 0 + changed = False + while i < 10 and not changed: + i += 1 + time.sleep(5) # give some time to pacemaker to react + new_master = self.find_master() + + 
if (new_master and new_master.info['unit_name'] != + old_master.info['unit_name']): + changed = True + + assert changed, "The master didn't change" + + assert self.is_port_open(address=self.vip), 'cannot connect to vip' + + +if __name__ == "__main__": + t = ThreeNode() + t.run() diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py new file mode 100644 index 0000000..02823b1 --- /dev/null +++ b/tests/basic_deployment.py @@ -0,0 +1,126 @@ +import amulet +import os +import telnetlib +import unittest +import yaml + + +class BasicDeployment(unittest.TestCase): + def __init__(self, vip=None, units=1): + self.units = units + self.master_unit = None + self.vip = None + if vip: + self.vip = vip + elif 'VIP' in os.environ: + self.vip = os.environ.get('VIP') + elif os.path.isfile('local.yaml'): + with open('local.yaml', 'rb') as f: + self.cfg = yaml.safe_load(f.read()) + + self.vip = self.cfg.get('vip') + else: + amulet.raise_status(amulet.SKIP, + ("please set ENV variable VIP " + "to run this test")) + + def run(self): + # The number of seconds to wait for the environment to setup. + seconds = 1200 + + self.d = amulet.Deployment(series="trusty") + self.d.add('percona-cluster', units=self.units) + self.d.add('hacluster') + self.d.relate('percona-cluster:ha', 'hacluster:ha') + + cfg_percona = {'sst-password': 'ubuntu', + 'root-password': 't00r', + 'dataset-size': '128M', + 'vip': self.vip} + + cfg_ha = {'debug': True, + 'corosync_mcastaddr': '226.94.1.4', + 'corosync_key': ('xZP7GDWV0e8Qs0GxWThXirNNYlScgi3sRTdZk/IXKD' + 'qkNFcwdCWfRQnqrHU/6mb6sz6OIoZzX2MtfMQIDcXu' + 'PqQyvKuv7YbRyGHmQwAWDUA4ed759VWAO39kHkfWp9' + 'y5RRk/wcHakTcWYMwm70upDGJEP00YT3xem3NQy27A' + 'C1w=')} + + self.d.configure('percona-cluster', cfg_percona) + self.d.configure('hacluster', cfg_ha) + + try: + self.d.setup(timeout=seconds) + self.d.sentry.wait(seconds) + except amulet.helpers.TimeoutError: + message = 'The environment did not setup in %d seconds.' 
% seconds + amulet.raise_status(amulet.SKIP, msg=message) + except: + raise + + self.master_unit = self.find_master() + assert self.master_unit is not None, 'percona-cluster vip not found' + + output, code = self.master_unit.run('sudo crm_verify --live-check') + assert code == 0, "'crm_verify --live-check' failed" + + resources = ['res_mysql_vip'] + resources += ['res_mysql_monitor:%d' % i for i in range(self.units)] + + assert sorted(self.get_pcmkr_resources()) == sorted(resources) + + for i in range(self.units): + uid = 'percona-cluster/%d' % i + unit = self.d.sentry.unit[uid] + assert self.is_mysqld_running(unit), 'mysql not running: %s' % uid + + def find_master(self): + for unit_id, unit in self.d.sentry.unit.items(): + if not unit_id.startswith('percona-cluster/'): + continue + + # is the vip running here? + output, code = unit.run('sudo ip a | grep %s' % self.vip) + print(unit_id) + print(output) + if code == 0: + print('vip(%s) running in %s' % (self.vip, unit_id)) + return unit + + def get_pcmkr_resources(self, unit=None): + if unit: + u = unit + else: + u = self.master_unit + + output, code = u.run('sudo crm_resource -l') + + assert code == 0, 'could not get "crm resource list"' + + return output.split('\n') + + def is_mysqld_running(self, unit=None): + if unit: + u = unit + else: + u = self.master_unit + + output, code = u.run('pidof mysqld') + + if code != 0: + return False + + return self.is_port_open(u, '3306') + + def is_port_open(self, unit=None, port='3306', address=None): + if unit: + addr = unit.info['public-address'] + elif address: + addr = address + else: + raise Exception('Please provide a unit or address') + try: + telnetlib.Telnet(addr, port) + return True + except TimeoutError: # noqa this exception only available in py3 + return False From c6b41bf12cec9aedf873419039d1a96d3d19ac6f Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Fri, 6 Mar 2015 17:09:10 -0300 Subject: [PATCH 07/22] Check if local.yaml exists and print a friendly msg --- 
tests/00-setup.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/00-setup.sh b/tests/00-setup.sh index 4d9b849..64e36b7 100755 --- a/tests/00-setup.sh +++ b/tests/00-setup.sh @@ -18,3 +18,12 @@ for pkg in $PACKAGES; do sudo apt-get install -y -q $pkg fi done + + +if [ ! -f "$(dirname $0)/../local.yaml" ]; then + echo "To run these amulet tests a vip is needed, create a file called \ +local.yaml in the charm dir, this file must contain a 'vip', if you're \ +using the local provider with lxc you could use a free IP from the range \ +10.0.3.0/24" + exit 1 +fi From 0f9c7a5d06d30bbba8efe52ea7d2d336002c90ed Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 9 Mar 2015 10:25:03 -0300 Subject: [PATCH 08/22] Fix error message when local.yaml file isn't set --- tests/basic_deployment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index 02823b1..2ae5812 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -21,8 +21,8 @@ class BasicDeployment(unittest.TestCase): self.vip = self.cfg.get('vip') else: amulet.raise_status(amulet.SKIP, - ("please set ENV variable VIP " - "to run this test")) + ("please set the vip in local.yaml " + "to run this test suite")) def run(self): # The number of seconds to wait for the environment to setup. 
@@ -30,7 +30,7 @@ class BasicDeployment(unittest.TestCase): self.d = amulet.Deployment(series="trusty") self.d.add('percona-cluster', units=self.units) - self.d.add('hacluster') + self.d.add('hacluster', charm='local:trusty/hacluster') self.d.relate('percona-cluster:ha', 'hacluster:ha') cfg_percona = {'sst-password': 'ubuntu', From a3c76f5acb4c04d879ec40a2c547275757189d79 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 9 Mar 2015 10:28:51 -0300 Subject: [PATCH 09/22] Reduce verbosity while installing dependencies --- tests/00-setup.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/00-setup.sh b/tests/00-setup.sh index 64e36b7..497b294 100755 --- a/tests/00-setup.sh +++ b/tests/00-setup.sh @@ -1,4 +1,4 @@ -#!/bin/bash -ex +#!/bin/bash -x # The script installs amulet and other tools needed for the amulet tests. # Get the status of the amulet package, this returns 0 of package is installed. @@ -7,7 +7,7 @@ if [ $? -ne 0 ]; then # Install the Amulet testing harness. 
sudo add-apt-repository -y ppa:juju/stable sudo apt-get update - sudo apt-get install -y amulet juju-core charm-tools + sudo apt-get install -y -q amulet juju-core charm-tools fi From eccba33a94ed0d88a757cbf377e15902156dfb23 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 9 Mar 2015 10:58:07 -0300 Subject: [PATCH 10/22] Pull hacluster from the charm store --- tests/basic_deployment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index 2ae5812..fbb9e86 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -30,7 +30,7 @@ class BasicDeployment(unittest.TestCase): self.d = amulet.Deployment(series="trusty") self.d.add('percona-cluster', units=self.units) - self.d.add('hacluster', charm='local:trusty/hacluster') + self.d.add('hacluster') self.d.relate('percona-cluster:ha', 'hacluster:ha') cfg_percona = {'sst-password': 'ubuntu', From 738ac3bebf0dd5c8fd6242ffdfef42de073c41f6 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Mon, 9 Mar 2015 11:02:03 -0300 Subject: [PATCH 11/22] Remove ocf/percona/mysql This agent is not used --- hooks/percona_utils.py | 2 +- ocf/percona/mysql | 2193 ---------------------------------------- 2 files changed, 1 insertion(+), 2194 deletions(-) delete mode 100755 ocf/percona/mysql diff --git a/hooks/percona_utils.py b/hooks/percona_utils.py index c259ab4..8bf9fc7 100644 --- a/hooks/percona_utils.py +++ b/hooks/percona_utils.py @@ -235,7 +235,7 @@ def unit_sorted(units): def install_mysql_ocf(): dest_dir = '/usr/lib/ocf/resource.d/percona/' - for fname in ['ocf/percona/mysql', 'ocf/percona/mysql_monitor']: + for fname in ['ocf/percona/mysql_monitor']: src_file = os.path.join(charm_dir(), fname) if not os.path.isdir(dest_dir): os.makedirs(dest_dir) diff --git a/ocf/percona/mysql b/ocf/percona/mysql deleted file mode 100755 index 9ef84de..0000000 --- a/ocf/percona/mysql +++ /dev/null @@ -1,2193 +0,0 @@ -#!/bin/bash -# -# -# MySQL -# -#
Description: Manages a MySQL database as Linux-HA resource -# -# Authors: Alan Robertson: DB2 Script -# Jakub Janczak: rewrite as MySQL -# Andrew Beekhof: cleanup and import -# Sebastian Reitenbach: add OpenBSD defaults, more cleanup -# Narayan Newton: add Gentoo/Debian defaults -# Marian Marinov, Florian Haas: add replication capability -# Yves Trudeau, Baron Schwartz: add VIP support and improve replication -# Jervin Real, Kenny Gryp: Booth Compatibility Improvements -# -# Support: linux-ha@lists.linux-ha.org -# License: GNU General Public License (GPL) -# -# (c) 2002-2005 International Business Machines, Inc. -# 2005-2010 Linux-HA contributors -# -# An example usage in /etc/ha.d/haresources: -# node1 10.0.0.170 mysql -# -# Version: 20141112131457 -# -# See usage() function below for more details... -# -# OCF instance parameters: -# OCF_RESKEY_binary -# OCF_RESKEY_client_binary -# OCF_RESKEY_config -# OCF_RESKEY_datadir -# OCF_RESKEY_user -# OCF_RESKEY_group -# OCF_RESKEY_test_table -# OCF_RESKEY_test_user -# OCF_RESKEY_test_passwd -# OCF_RESKEY_enable_creation -# OCF_RESKEY_additional_parameters -# OCF_RESKEY_log -# OCF_RESKEY_pid -# OCF_RESKEY_socket -# OCF_RESKEY_replication_user -# OCF_RESKEY_replication_passwd -# OCF_RESKEY_replication_port -# OCF_RESKEY_replication_options -# OCF_RESKEY_max_slave_lag -# OCF_RESKEY_evict_outdated_slaves -# OCF_RESKEY_reader_attribute -# OCF_RESKEY_reader_failcount -# OCF_RESKEY_backup_lockfile -# OCF_RESKEY_geo_remote_IP -# OCF_RESKEY_booth_master_ticket -# OCF_RESKEY_post_promote_script -# OCF_RESKEY_prm_binlog_parser_path -# OCF_RESKEY_try_restart_crashed_master - -####################################################################### -# Initialization: - -: ${OCF_FUNCTIONS_DIR=${OCF_ROOT}/lib/heartbeat} -. 
${OCF_FUNCTIONS_DIR}/ocf-shellfuncs - -####################################################################### - -# Fill in some defaults if no values are specified -HOSTOS=`uname` -if [ "X${HOSTOS}" = "XOpenBSD" ];then - OCF_RESKEY_binary_default="/usr/local/bin/mysqld_safe" - OCF_RESKEY_config_default="/etc/my.cnf" - OCF_RESKEY_datadir_default="/var/mysql" - OCF_RESKEY_user_default="_mysql" - OCF_RESKEY_group_default="_mysql" - OCF_RESKEY_log_default="/var/log/mysqld.log" - OCF_RESKEY_pid_default="/var/mysql/mysqld.pid" - OCF_RESKEY_socket_default="/var/run/mysql/mysql.sock" -else - OCF_RESKEY_binary_default="/usr/bin/safe_mysqld" - OCF_RESKEY_config_default="/etc/my.cnf" - OCF_RESKEY_datadir_default="/var/lib/mysql" - OCF_RESKEY_user_default="mysql" - OCF_RESKEY_group_default="mysql" - OCF_RESKEY_log_default="/var/log/mysqld.log" - OCF_RESKEY_pid_default="/var/run/mysql/mysqld.pid" - OCF_RESKEY_socket_default="/var/lib/mysql/mysql.sock" -fi -OCF_RESKEY_client_binary_default="mysql" -OCF_RESKEY_test_user_default="root" -OCF_RESKEY_test_table_default="mysql.user" -OCF_RESKEY_test_passwd_default="" -OCF_RESKEY_enable_creation_default=0 -OCF_RESKEY_additional_parameters_default="" -OCF_RESKEY_replication_port_default="3306" -OCF_RESKEY_max_slave_lag_default="3600" -OCF_RESKEY_evict_outdated_slaves_default="false" -OCF_RESKEY_reader_attribute_default="readable" -OCF_RESKEY_reader_failcount_default="1" -OCF_RESKEY_backup_lockfile_default="/var/lock/innobackupex" -OCF_RESKEY_booth_master_ticket_default="ticketMaster" -OCF_RESKEY_async_stop_default=0 -OCF_RESKEY_try_restart_crashed_master_default=1 - -: ${OCF_RESKEY_binary=${OCF_RESKEY_binary_default}} -MYSQL_SBINDIR=`dirname ${OCF_RESKEY_binary}` - -: ${OCF_RESKEY_client_binary=${OCF_RESKEY_client_binary_default}} - -: ${OCF_RESKEY_config=${OCF_RESKEY_config_default}} -: ${OCF_RESKEY_datadir=${OCF_RESKEY_datadir_default}} - -: ${OCF_RESKEY_user=${OCF_RESKEY_user_default}} -: 
${OCF_RESKEY_group=${OCF_RESKEY_group_default}} - -: ${OCF_RESKEY_log=${OCF_RESKEY_log_default}} -: ${OCF_RESKEY_pid=${OCF_RESKEY_pid_default}} -: ${OCF_RESKEY_socket=${OCF_RESKEY_socket_default}} - -: ${OCF_RESKEY_test_user=${OCF_RESKEY_test_user_default}} -: ${OCF_RESKEY_test_table=${OCF_RESKEY_test_table_default}} -: ${OCF_RESKEY_test_passwd=${OCF_RESKEY_test_passwd_default}} - -: ${OCF_RESKEY_enable_creation=${OCF_RESKEY_enable_creation_default}} -: ${OCF_RESKEY_additional_parameters=${OCF_RESKEY_additional_parameters_default}} - -: ${OCF_RESKEY_replication_user=${OCF_RESKEY_replication_user_default}} -: ${OCF_RESKEY_replication_passwd=${OCF_RESKEY_replication_passwd_default}} -: ${OCF_RESKEY_replication_port=${OCF_RESKEY_replication_port_default}} -: ${OCF_RESKEY_replication_options=${OCF_RESKEY_replication_options_default}} - -: ${OCF_RESKEY_max_slave_lag=${OCF_RESKEY_max_slave_lag_default}} -: ${OCF_RESKEY_evict_outdated_slaves=${OCF_RESKEY_evict_outdated_slaves_default}} - -: ${OCF_RESKEY_reader_attribute=${OCF_RESKEY_reader_attribute_default}} -: ${OCF_RESKEY_reader_failcount=${OCF_RESKEY_reader_failcount_default}} - -: ${OCF_RESKEY_backup_lockfile=${OCF_RESKEY_backup_lockfile_default}} - -: ${OCF_RESKEY_geo_remote_IP}="" -: ${OCF_RESKEY_booth_master_ticket}=${OCF_RESKEY_booth_master_ticket_default} -: ${OCF_RESKEY_post_promote_script}="" -: ${OCF_RESKEY_prm_binlog_parser_path}="`which prm_binlog_parser 2> /dev/null`" - -: ${OCF_RESKEY_async_stop=${OCF_RESKEY_async_stop_default}} -: ${OCF_RESKEY_try_restart_crashed_master=${OCF_RESKEY_try_restart_crashed_master_default}} - -####################################################################### -# Convenience variables - -MYSQL=$OCF_RESKEY_client_binary -MYSQL_BINDIR=`dirname ${OCF_RESKEY_client_binary}` - -MYSQL_OPTIONS_LOCAL="-A -S $OCF_RESKEY_socket --connect_timeout=10" -MYSQL_OPTIONS_REPL="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_replication_user --password=$OCF_RESKEY_replication_passwd" 
-MYSQL_OPTIONS_TEST="$MYSQL_OPTIONS_LOCAL --user=$OCF_RESKEY_test_user --password=$OCF_RESKEY_test_passwd" -MYSQL_LAST_ERR=0 -MYSQL_TOO_MANY_CONN_ERR=1040 - -CRM_MASTER="${HA_SBIN_DIR}/crm_master -l reboot " -HOSTNAME=`uname -n` -CRM_ATTR="${HA_SBIN_DIR}/crm_attribute -N $HOSTNAME -q" -INSTANCE_ATTR_NAME=`echo ${OCF_RESOURCE_INSTANCE}| awk -F : '{print $1}'` -CRM_ATTR_REPL_INFO="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_INFO -s mysql_replication -q " -CRM_ATTR_REPL_STATUS="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_REPL_STATUS -s mysql_replication -q " -CRM_ATTR_LAST_TRX="${HA_SBIN_DIR}/crm_attribute --type crm_config --name ${INSTANCE_ATTR_NAME}_NEW_MASTER_LAST_TRX -s mysql_replication -q" -CRM_ATTR_MASTER_CRASHED_TS="$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_master_crashed" -CRM_ATTR_NODE_LAST_TRX_MD5="$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_trx_md5" -CRM_RES="${HA_SBIN_DIR}/crm_resource" -CRM_TICKET="${HA_SBIN_DIR}/crm_ticket" -SSH="/usr/bin/ssh " -MAX_BINLOG_SIZE_CACHE="${HA_RSCTMP}/max_binlog_size_cache" -ASYNC_STOP_WITNESS_FILE="${HA_RSCTMP}/stop_${INSTANCE_ATTR_NAME}" -OCF_STOPPING=100 #custom error code for async_stop - -####################################################################### - -usage() { - cat < - - -1.0 - - -Resource script for MySQL. -May manage a standalone MySQL database, a clone set with externally -managed replication, or a complete master/slave replication setup. - -While managing replication, the default behavior is to use uname -n -values in the change master to command. Other IPs can be specified -manually by adding a node attribute \${INSTANCE_ATTR_NAME}_mysql_master_IP -giving the IP to use for replication. For example, if the mysql primitive -you are using is p_mysql, the attribute to set will be -p_mysql_mysql_master_IP. 
- -Manages a MySQL database instance - - - - -Location of the MySQL server binary - -MySQL server binary - - - - - -Location of the MySQL client binary - -MySQL client binary - - - - - -Configuration file - -MySQL config - - - - - -Directory containing databases - -MySQL datadir - - - - - -User running MySQL daemon - -MySQL user - - - - - -Group running MySQL daemon (for logfile and directory permissions) - -MySQL group - - - - - -The logfile to be used for mysqld. - -MySQL log file - - - - - -The pidfile to be used for mysqld. - -MySQL pid file - - - - - -The socket to be used for mysqld. - -MySQL socket - - - - - -Table to be tested in monitor statement (in database.table notation) - -MySQL test table - - - - - -MySQL test user, must have select privilege on test_table - -MySQL test user - - - - - -MySQL test user password - -MySQL test user password - - - - - -If the MySQL database does not exist, it will be created - -Create the database if it does not exist - - - - - -Additional parameters which are passed to the mysqld on startup. -(e.g. --skip-external-locking or --skip-grant-tables) - -Additional parameters to pass to mysqld - - - - - -MySQL replication user. This user is used for starting and stopping -MySQL replication, for setting and resetting the master host, and for -setting and unsetting read-only mode. Because of that, this user must -have SUPER, REPLICATION SLAVE, REPLICATION CLIENT, and PROCESS -privileges on all nodes within the cluster. Mandatory if you define -a master-slave resource. - -MySQL replication user - - - - - -MySQL replication password. Used for replication client and slave. -Mandatory if you define a master-slave resource. - -MySQL replication user password - - - - - -The port on which the Master MySQL instance is listening. - -MySQL replication port - - - - - -Extra options to pass to CHANGE MASTER, be sure to pass a preceeding comma. 
Handy for SSL, for example: -replication_options=", MASTER_SSL=1, MASTER_SSL_CA='/path/to/ca.crt'" - -MySQL replication options - - - - - -The maximum number of seconds a replication slave is allowed to lag -behind its master. Do not set this to zero. What the cluster manager -does in case a slave exceeds this maximum lag is determined by the -evict_outdated_slaves parameter. - -Maximum time (seconds) a MySQL slave is allowed -to lag behind a master - - - - - -If set to true, any slave which is more than max_slave_lag seconds -behind the master has its MySQL instance shut down. If this parameter -is set to false in a primitive or clone resource, it is simply -ignored. If set to false in a master/slave resource, then exceeding -the maximum slave lag will merely push down the master preference so -the lagging slave is never promoted to the new master. - -Determines whether to shut down badly lagging -slaves - - - - - -An attribute that the RA can manage to specify whether a node -can be read from. This node attribute will be 1 if it's fine to -read from the node, and 0 otherwise (for example, when a slave -has lagged too far behind the master). - -A typical example for the use of this attribute would be to tie -a set of IP addresses to MySQL slaves that can be read from. - -This parameter is only meaningful in master/slave set configurations. - -Sets the node attribute that determines -whether a node is usable for clients to read from. - - - - -The number of times a monitor operation can find the slave -to be unsuitable for reader VIP before failing. Useful if -there are short intermittent issues like clock adjustments in VMs. - -Allowed failcount for reader - - - - - -The path to a file that will be exclusively locked by any backup -process. The lockfile serves to provide a reliable way of determining -whether to restart the slave process or not. If a thirdparty process -locks this file, the agent will fail to lock the file and will not -start the slave. 
When this agent is able to lock the file, it is -assumed that backups are finished and the slave thread should be -running and will start it. - -A typical cron command example would be like: -flock -xn /var/lock/innobackupex innobackupex --safe-slave-backup /tmp/mysqlbackup - -This example will use innobackupex's ability to stop the slave when necessary -to ensure backup consistency. During this time the agent will not start the -slave. Once the backup is complete, the lock will automatically expire and -the agent can start the slave if it isn't already. - -Path to backup lockfile - - - - - -In case multiple Geo redundant sites are connected with the booth protocol -this is the IP to use to connect to the remote cluster to query replication info. -Normally this would be the writer VIP on the remote cluster. Also, ssh is used -for communication so make sure keys are exchanged and that ssh options are set -in a way that connection doesn't take many seconds. If empty, the booth type -behavior is not triggered. - -IP of the remote cluster - - - - - -In case multiple Geo redundant sites are connected with the booth protocol -this is the name of the ticket used to identify the master side. - -Booth ticket name - - - - - -Allows to run custom code following a promotion. An application of this is to -prevent fail-back of the master role after an initial failover. - -Post promote script - - - - - -Path to the prm_binlog_parser tool that is used to publish the last trx of a new -master after a hard crash of the previous master. The tool can be downloaded from -https://github.com/percona/percona-pacemaker-agents/tree/master/tools/ybinlogp - -Path to the prm_binlog_parser tool - - - - - -If set to true, PRM will not wait for MySQL to stop after sending the -SIGTERM signal. This can be useful to speed up failover when a server has a -large number of dirty pages and takes a long time to shutdown, or worse, receives -a SIGKILL after the stop timeout. 
The main drawback is that if PRM wants to restart -MySQL before it completed its shutdown, the operation will error out. - -Asynchronous stop of MySQL - - - - - -If set to true, PRM will try to restart a failed master in place instead of promoting -another node. This can help recover untransmitted binary logs. However, if you have -a large database that takes a long time to recovery, this may not be a good option -for you. - -Try restarting a crashed master - - - - - - - - - - - - - - - - - - - -END - -} - -# Convenience functions - -set_read_only() { - # Sets or unsets read-only mode. Accepts one boolean as its - # optional argument. If invoked without any arguments, defaults to - # enabling read only mode. Should only be set in master/slave - # setups. - # Returns $OCF_SUCCESS if the operation succeeds, or - # $OCF_ERR_GENERIC if it fails. - local ro_val - if ocf_is_true $1; then - ro_val="on" - else - ro_val="off" - fi - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "SET GLOBAL read_only=${ro_val}" -} - -get_read_only() { - # Check if read-only is set - local read_only_state - - read_only_state=`mysql_run -Q -sw -O $MYSQL -N $MYSQL_OPTIONS_REPL \ - -e "SHOW VARIABLES like 'read_only'" | awk '{print $2}'` - - if [ "$read_only_state" = "ON" ]; then - return 0 - else - return 1 - fi -} - -is_slave() { - # Determine whether the machine is currently running as a MySQL - # slave, as determined per SHOW SLAVE STATUS. Returns 1 if SHOW - # SLAVE STATUS creates an empty result set, 0 otherwise. - local rc - local tmpfile - - # Check whether this machine should be slave - if ! ocf_is_ms || ! get_read_only; then - return 1 - fi - - get_slave_info - rc=$? - - if [ $rc -eq 0 ]; then - # show slave status is not empty - # Is there a master_log_file defined? 
(master_log_file is deleted - # by reset slave - if [ "$master_log_file" ]; then - return 0 - else - return 1 - fi - else - # "SHOW SLAVE STATUS" returns an empty set if instance is not a - # replication slave - return 1 - fi - -} - -parse_slave_info() { - # Extracts field $1 from result of "SHOW SLAVE STATUS\G" from file $2 - sed -ne "s/^.* $1: \(.*\)$/\1/p" < $2 -} - -# get the current max_binlog_size. Since this value rarely change, we cache -# it for 1h -get_max_binlog_size() { - if [ -e $MAX_BINLOG_SIZE_CACHE ]; then - cat $MAX_BINLOG_SIZE_CACHE - if [ `date +%s` -gt "$((`stat -c %Z $MAX_BINLOG_SIZE_CACHE`+3600))" ]; then - rm $MAX_BINLOG_SIZE_CACHE - fi - else - mysql_run -Q -sw -O $MYSQL -N $MYSQL_OPTIONS_REPL \ - -e "Show global variables like 'max_binlog_size';" | \ - awk '{ print $2 }' > $MAX_BINLOG_SIZE_CACHE - cat $MAX_BINLOG_SIZE_CACHE - fi -} - -get_slave_info() { - - local mysql_options tmpfile - - if [ "$master_log_file" -a "$master_host" ]; then - # variables are already defined, get_slave_info has been run before - return $OCF_SUCCESS - else - tmpfile=`mktemp ${HA_RSCTMP}/check_slave.${OCF_RESOURCE_INSTANCE}.XXXXXX` - - mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ - -e 'SHOW SLAVE STATUS\G' > $tmpfile - - if [ -s $tmpfile ]; then - master_host=`parse_slave_info Master_Host $tmpfile` - master_user=`parse_slave_info Master_User $tmpfile` - master_port=`parse_slave_info Master_Port $tmpfile` - master_log_file=`parse_slave_info Master_Log_File $tmpfile` - relay_log_file=`parse_slave_info Relay_Log_File $tmpfile` - master_log_pos=`parse_slave_info Read_Master_Log_Pos $tmpfile` - slave_sql=`parse_slave_info Slave_SQL_Running $tmpfile` - slave_io=`parse_slave_info Slave_IO_Running $tmpfile` - slave_io_state=`parse_slave_info Slave_IO_State $tmpfile` - last_errno=`parse_slave_info Last_Errno $tmpfile` - secs_behind=`parse_slave_info Seconds_Behind_Master $tmpfile` - ocf_log debug "MySQL instance has a non empty slave status" - else - # Instance 
produced an empty "SHOW SLAVE STATUS" output -- - # instance is not a slave - - ocf_log err "check_slave invoked on an instance that is not a replication slave." - rm -f $tmpfile - return $OCF_ERR_GENERIC - fi - rm -f $tmpfile - return $OCF_SUCCESS - fi -} - -check_slave() { - # Checks slave status - local rc new_master - - get_slave_info - rc=$? - - if [ $rc -eq 0 ]; then - # Did we receive an error other than max_connections? - if [ $last_errno -ne 0 -a $last_errno -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then - # Whoa. Replication ran into an error. This slave has - # diverged from its master. Make sure this resource - # doesn't restart in place. - ocf_log err "MySQL instance configured for replication, but replication has failed." - - # Just pull the reader VIP away, killing MySQL here would be pretty evil - # on a loaded server - set_reader_attr 0 - - #Since replication is broken, not suitable to be a master - $CRM_MASTER -v -INF - - exit $OCF_SUCCESS - - fi - - # If we got max_connections, let's only remove the vip - if [ $last_errno -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then - set_reader_attr 0 - exit $OCF_SUCCESS - fi - - if [ "$slave_io" != 'Yes' ]; then - # Not necessarily a bad thing. The master may have - # temporarily shut down, and the slave may just be - # reconnecting. A warning can't hurt, though. - ocf_log warn "MySQL Slave IO threads currently not running." 
- - # Sanity check, are we at least on the right master - if [ "$master_host" != "$glb_cib_master" ]; then - # Not pointing to the right master - - # Is this a recent master failover on the remote side - if [ "${#glb_remote_info}" -gt "0" -a "$slave_sql" = 'Yes' ]; then - # looks like, the sql thread is still running, no need - # to remove the vip, doing nothing - : - else - - set_reader_attr 0 - fi - - # try setting up the slave with the new master - set_master - exit $OCF_SUCCESS - - elif [ "$slave_sql" == 'Yes' ]; then - # If the slq thread is running, it is an issue with the io thread - # let's try to restart it - - if [ "$slave_io_state" != "" ]; then - # The io thread is running but is not connected, let's restart it. - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "STOP SLAVE IO_THREAD" - fi - - # At this point, the io_thread should be stopped. - # let's try to start it again. - - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "START SLAVE IO_THREAD" - - # We give some time to connect - sleep 2 - - get_slave_info - rc=$? - if [ $rc -eq 0 -a "$slave_io" == 'Yes' ]; then - ocf_log info "MySQL Slave IO thread started succesfully." - else - ocf_log warn "We could not start the MySQL Slave IO thread." - fi - fi - - - fi - - if [ "$slave_sql" != 'Yes' ]; then - # We don't have a replication SQL thread running. Not a - # good thing. Try to recoved by restarting the SQL thread - # and remove reader vip. Prevent MySQL restart. - ocf_log err "MySQL Slave SQL threads currently not running." - - # Remove reader vip - set_reader_attr 0 - - # If sql is not running, can't be a master - $CRM_MASTER -v -INF - - # Check that the flock tool exists first - if type flock &>/dev/null; then - ( - flock -xn 8 - if [ $? -eq 0 ]; then - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "START SLAVE" - else - ocf_log info "Unable to lock $OCF_RESKEY_backup_lockfile. Not starting slave." 
- fi - ) 8>$OCF_RESKEY_backup_lockfile - else - # try to restart slave - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "START SLAVE" - fi - - # Return success to prevent a restart - exit $OCF_SUCCESS - fi - - if ocf_is_true $OCF_RESKEY_evict_outdated_slaves; then - # We're supposed to bail out if we lag too far - # behind. Let's check our lag. - if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then - ocf_log err "MySQL Slave is $secs_behind seconds behind master (allowed maximum: $OCF_RESKEY_max_slave_lag)." - - # Remove reader vip - set_reader_attr 0 - exit $OCF_ERR_INSTALLED - fi - elif ocf_is_ms; then - # Even if we're not set to evict lagging slaves, we can - # still use the seconds behind master value to set our - # master preference. - local master_pref - master_pref=$((${OCF_RESKEY_max_slave_lag}-${secs_behind})) - if [ $master_pref -lt 0 ]; then - # Sanitize a below-zero preference to just zero - master_pref=0 - fi - - # Is the datadir almost full - if check_datadir_state; then - $CRM_MASTER -v $master_pref - else - # full so not good for a master - $CRM_MASTER -v -2147483640 - fi - fi - - # is the slave ok to have a VIP on it - test $secs_behind -eq 0 2>/dev/null - if [ $? 
-eq 2 ]; then - set_reader_attr 0 - else - if [ $secs_behind -gt $OCF_RESKEY_max_slave_lag ]; then - set_reader_attr 0 - else - set_reader_attr 1 - - #Edge case verification, check if on the right master - set_master nologging - fi - fi - - ocf_log debug "MySQL instance running as a replication slave" - else - # Instance produced an empty "SHOW SLAVE STATUS" output -- - # instance is not a slave - # TODO: Needs to handle when get_slave_info will return too many connections error - - if [ $MYSQL_LAST_ERR -eq "$MYSQL_TOO_MANY_CONN_ERR" ]; then - # Remove the vip - set_reader_attr 0 - return $OCF_SUCCESS - fi - - # An empty status could happen when a master is demote in a - # geo DR setup, let's check - if [ $MYSQL_LAST_ERR -eq 0 -a $glb_master_exists -eq 1 ]; then - # This is not the master side, let's try to setup the slave - # No need to unset the master since slave status is empty - set_reader_attr 0 - set_master - return $OCF_SUCCESS - fi - - ocf_log err "check_slave invoked on an instance that is not a replication slave." 
- exit $OCF_ERR_GENERIC - fi -} - -set_master() { - local new_master master_log_file master_log_pos new_master_info - local master_params new_master_log_file new_master_log_pos - - if [ "$glb_master_exists" ]; then - if [ "${#glb_remote_info}" -gt "0" ]; then - # geo_remote_IP is defined, let's do the booth part - - if [ $glb_master_side -ne 0 ]; then - # this is _not_ the side with the token - new_master_info=`echo $glb_remote_info | awk '{ print $1 }'` - new_master=`echo $new_master_info | cut -d'|' -f1` - new_master_log_file=`echo $new_master_info | cut -d'|' -f2` - new_master_log_pos=`echo $new_master_info | cut -d'|' -f3` - fi - fi - - if [ "${#new_master_info}" -eq "0" ]; then - new_master=`echo $glb_local_info | cut -d'|' -f1` - new_master_log_file=`echo $glb_local_info | cut -d'|' -f2` - new_master_log_pos=`echo $glb_local_info | cut -d'|' -f3` - fi - - # Keep replication position - get_slave_info - - if [ "$master_log_file" -a "$new_master" = "$master_host" ]; then - # master_params=", MASTER_LOG_FILE='$master_log_file', \ - # MASTER_LOG_POS=$master_log_pos" - if [ "$1" = "nologging" ]; then - : - else - ocf_log info "Kept master pos for $master_host : $master_log_file:$master_log_pos" - fi - return - else - if [ -n "$new_master_log_file" -a -n "$new_master_log_pos" ]; then - master_params=", MASTER_LOG_FILE='$new_master_log_file', \ - MASTER_LOG_POS=$new_master_log_pos" - ocf_log info "Restored master pos for $new_master : $new_master_log_file:$new_master_log_pos" - fi - fi - - # Informs the MySQL server of the master to replicate - # from. Accepts one mandatory argument which must contain the host - # name of the new master host. The master must either be unchanged - # from the laste master the slave replicated from, or freshly - # reset with RESET MASTER. 
- - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "STOP SLAVE;CHANGE MASTER TO MASTER_HOST='$new_master', \ - MASTER_PORT=$OCF_RESKEY_replication_port, \ - MASTER_USER='$OCF_RESKEY_replication_user', \ - MASTER_PASSWORD='$OCF_RESKEY_replication_passwd' \ - $OCF_RESKEY_replication_options $master_params;START SLAVE;" - fi -} - -unset_master(){ - # Instructs the MySQL server to stop replicating from a master - # host. - - # If we're currently not configured to be replicating from any - # host, then there's nothing to do. But we do log a warning as - # no-one but the CRM should be touching the MySQL master/slave - # configuration. - - is_slave - rc=$? - if [ $rc -ne 0 ]; then - ocf_log warn "Attempted to unset the replication master on an instance that is not configured as a replication slave" - return $OCF_SUCCESS - fi - - local tmpfile - tmpfile=`mktemp ${HA_RSCTMP}/unset_master.${OCF_RESOURCE_INSTANCE}.XXXXXX` - - # At this point, the master is read only so there should not be much binlogs to transfer - # Let's wait for the last bits - while true; do - get_slave_info - rc=$? - - # Is the slave_io thread running? - if [ "$slave_io" != 'Yes' ]; then - ocf_log info "Slave IO thread not running, master likely dead or stopped" - break; - fi - - mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ - -e 'SHOW PROCESSLIST\G' > $tmpfile - - if grep -i 'Master has sent all binlog to slave' $tmpfile >/dev/null; then - ocf_log info "MySQL slave has finished reading master binary log" - break - fi - if grep -i 'Waiting for master to send event' $tmpfile >/dev/null; then - ocf_log info "MySQL slave has finished reading master binary log" - break - fi - if grep -i 'Reconnecting after a failed master event read' $tmpfile >/dev/null; then - ocf_log info "Master is down, no more binary logs to come" - break - fi - if grep -i 'Connecting to master' $tmpfile >/dev/null; then - ocf_log info "Master is down, no more binary logs to come" - break - fi - if ! 
grep 'system user' $tmpfile >/dev/null; then - ocf_log info "Slave is not running - not waiting to finish" - break - fi - - sleep 1 - done - - # Now, stop the slave I/O thread and wait for relay log - # processing to complete - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "STOP SLAVE IO_THREAD" - if [ $? -gt 0 ]; then - ocf_log err "Error stopping slave IO thread" - rm -f $tmpfile - exit $OCF_ERR_GENERIC - fi - - while true; do - mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ - -e 'SHOW PROCESSLIST\G' > $tmpfile - - # Of course, slave messages changed over MySQL versions... - if grep -i 'Has read all relay log' $tmpfile >/dev/null; then - ocf_log info "MySQL slave has finished processing relay log" - break - fi - if ! grep -q 'system user' $tmpfile; then - ocf_log info "Slave not runnig - not waiting to finish" - break - fi - ocf_log info "Waiting for MySQL slave to finish processing relay log" - sleep 1 - done - rm -f $tmpfile - - # Now, stop all slave activity and unset the master host - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "STOP SLAVE" - if [ $? -gt 0 ]; then - ocf_log err "Error stopping rest slave threads" - exit $OCF_ERR_GENERIC - fi - - # a last get_slave_info to save the status in variables may be needed - # by pre-promote notification - unset master_host # need to unset for get_slave_info to run - get_slave_info - - if [ "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then - - # First, where are the relay logs? That will be easier when the using_multi_config - # branch will be merged. If the path is not defined, the output will be "." - relaylog_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^relay-log ' | awk '{ print $2 }'` - relaylog_path=`dirname $relaylog_path` - - if [ "$relaylog_path" == "." 
]; then - relaylog_path=$OCF_RESKEY_datadir - fi - - last_relaylog_file="$relay_log_file" - - #ok now we need to find the md5 of the last trx - last_trx_md5=`$OCF_RESKEY_prm_binlog_parser_path ${relaylog_path}/${last_relaylog_file} | tail -n 1 | cut -d',' -f2` - - if [ ! -z "$last_trx_md5" ]; then - $CRM_ATTR_NODE_LAST_TRX_MD5 -v $last_trx_md5 - fi - fi - - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "RESET SLAVE /*!50516 ALL */;" - if [ $? -gt 0 ]; then - ocf_log err "Failed to reset slave" - exit $OCF_ERR_GENERIC - fi -} - -# Start replication as slave -start_slave() { - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "START SLAVE" -} - -# Set the attribute controlling the readers VIP -set_reader_attr() { - local curr_attr_value - - curr_attr_value=$(get_reader_attr) - - if [ "$1" -eq "0" ]; then - if [ "$curr_attr_value" -gt "0" ]; then - curr_attr_value=$((${curr_attr_value}-1)) - $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $curr_attr_value - else - $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v 0 - fi - else - if [ "$curr_attr_value" -ne "$OCF_RESKEY_reader_failcount" ]; then - $CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} -v $OCF_RESKEY_reader_failcount - fi - fi - -} - -is_master_side() { - #Returns true (0) if this cluster has a grant for the booth ticket OCF_RESKEY_booth_master_ticket - local ticket crmTicketRet - - if [ "${#OCF_RESKEY_geo_remote_IP}" -gt "0" ]; then - #Try the new format - crmTicketRet=`file $CRM_TICKET | grep -c 'Bourne-Again shell script'` - if [ "$crmTicketRet" -eq "1" ]; then - # got an error, we assume the old format - ticket=`$CRM_TICKET -t $OCF_RESKEY_booth_master_ticket -Q | grep -c 'true'` - else - ticket=`$CRM_TICKET --info | grep $OCF_RESKEY_booth_master_ticket | awk '{ print $2 }' | grep -c granted` - fi - - if [ "$ticket" -eq "1" ]; then - return 0 - else - return 1 - fi - else - return 0 - fi -} - -# get the attribute controlling the readers VIP -get_reader_attr() { - 
local attr_value - local rc - - attr_value=`$CRM_ATTR -l reboot --name ${OCF_RESKEY_reader_attribute} --query -q` - rc=$? - if [ "$rc" -eq "0" ]; then - echo $attr_value - else - echo -1 - fi - -} - -# Stores data for MASTER STATUS from MySQL -update_data_master_status() { - - master_status_file="${HA_RSCTMP}/master_status.${OCF_RESOURCE_INSTANCE}" - - mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL -e "SHOW MASTER STATUS\G" > $master_status_file -} - - -# Returns the specified value from the stored copy of SHOW MASTER STATUS. -# should be call after update_data_master_status for tmpfile -# Arguments: -# $1 The value to get. -get_master_status() { - awk -v var="$1" '$1 == var ":" {print substr($0, index($0, ":") + 2)}' "$master_status_file" -} - -# Determines what IP address is attached to the current host. The output of the -# crm_attribute command looks like this: -# scope=nodes name=IP value=10.2.2.161 -# If the ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP node attribute is not defined, fallback is to uname -n -# The ${INSTANCE_ATTR_NAME}_MYSQL_MASTER_IP is the IP address that will be used for the -# change master to command. -get_local_ip() { - local IP - IP=`$CRM_ATTR -l forever -n ${INSTANCE_ATTR_NAME}_mysql_master_IP -q -G` - if [ ! $? -eq 0 ]; then - uname -n - else - echo $IP - fi -} - -# Determine if the datadir is full or almost full, the threshold is 97% -check_datadir_state() { - # Get the free space of the binlogdir - FREE_SPC_PCT=`/bin/df $OCF_RESKEY_datadir | /bin/grep -v Filesystem \ - | /bin/sed -e 's/ \+/ /g' | /usr/bin/cut -d' ' -f 5 \ - | /usr/bin/tr -d '%'` - - if [ "$FREE_SPC_PCT" -ge "97" ]; then - ocf_log warn "Partition $OCF_RESKEY_datadir usage is at " \ - "or more than 97, " \ - "unsuitable for master..." 
- return 1 - fi - - return 0 -} - -####################################################################### - -# Functions invoked by resource manager actions - -mysql_validate() { - check_binary $OCF_RESKEY_binary - check_binary $OCF_RESKEY_client_binary - - if [ ! -f $OCF_RESKEY_config ]; then - ocf_log err "Config $OCF_RESKEY_config doesn't exist"; - return $OCF_ERR_INSTALLED; - fi - - if [ ! -d $OCF_RESKEY_datadir ]; then - ocf_log err "Datadir $OCF_RESKEY_datadir doesn't exist"; - return $OCF_ERR_INSTALLED; - fi - - getent passwd $OCF_RESKEY_user >/dev/null 2>&1 - if [ ! $? -eq 0 ]; then - ocf_log err "User $OCF_RESKEY_user doesn't exit"; - return $OCF_ERR_INSTALLED; - fi - - getent group $OCF_RESKEY_group >/dev/null 2>&1 - if [ ! $? -eq 0 ]; then - ocf_log err "Group $OCF_RESKEY_group doesn't exist"; - return $OCF_ERR_INSTALLED; - fi - - true -} - -# Return the status of mysqld -# $1 the loglevel to use (mandatory) -# $2 Override async_stop if 1, default to 0 -mysql_status() { - local last_restart_ts - local kill_exit_code - local witness_pid - local override_async_stop - - override_async_stop=0 - - if [ -n $2 ]; then - override_async_stop=$2 - fi - - if [ ! -e $OCF_RESKEY_pid ]; then - ocf_log $1 "MySQL is not running" - - if [ -e $ASYNC_STOP_WITNESS_FILE ]; then - # MySQL is stopped and there's a witness file, cleanup - rm -f $ASYNC_STOP_WITNESS_FILE - fi - return $OCF_NOT_RUNNING; - fi - - pid=`cat $OCF_RESKEY_pid`; - if [ -d /proc -a -d /proc/1 ]; then - [ "u$pid" != "u" -a -d /proc/$pid ] - else - kill -s 0 $pid >/dev/null 2>&1 - fi - - kill_exit_code=$? 
- - if [ "$OCF_RESKEY_async_stop" -eq "1" -a \ - -e $ASYNC_STOP_WITNESS_FILE ]; then - - # Async stop seems to be in progress - witness_pid=`cat $ASYNC_STOP_WITNESS_FILE | grep pid | cut -d':' -f2` - - if [ $kill_exit_code -eq 0 -a $witness_pid -eq $pid ]; then - - # Should we lie or tell the truth - if [ "$override_async_stop" -eq "0" ]; then - # we lie - # still running but because of async, we report stopped - return $OCF_NOT_RUNNING - else - # we tell the truth - return $OCF_STOPPING #custom error code - fi - else - # That shouldn't happen execpt if SIGKILL, cleanup - rm -f $ASYNC_STOP_WITNESS_FILE - fi - fi - - if [ $kill_exit_code -eq 0 ]; then - return $OCF_SUCCESS; - else - ocf_log $1 "MySQL not running: removing old PID file" - rm -f $OCF_RESKEY_pid - - # This is abnormal, is this host the master defined in the cib? - # Also confirm it succeed in starting with the socket file - if [ "$glb_master_exists" -eq "1" -a "$glb_cib_master" = $(get_local_ip) \ - -a -e "$OCF_RESKEY_socket" ]; then - - #This is a crashed master - if [ "$OCF_RESKEY_try_restart_crashed_master" -eq "1" ]; then - # This is the master, let's give it a change to restart - # that will allow the slaves a better chance to sync but we - # need to avoid letting it restart forever. Has it tried to - # restart within the last hour - last_crash_ts=`$CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash --query -q` - - if [[ ! -z $last_crash_ts ]]; then - if [ `date +%s` -lt "$((${last_crash_ts}+3600))" ]; then - # too soon, multiple crash, let's error out - return $OCF_NOT_RUNNING; - fi - fi - - $CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash -v `date +%s` - - mysql_start_low - rc=$? 
- - if [ "$rc" -eq "0" ]; then - set_read_only OFF - fi - - return $rc - else - $CRM_ATTR -l reboot --name ${INSTANCE_ATTR_NAME}_last_crash -v `date +%s` - # OCF_ERR_ARGS is a hard error, won't wait for restart - return $OCF_ERR_ARGS - fi - fi - return $OCF_NOT_RUNNING - fi -} - -mysql_monitor() { - local rc - local status_loglevel="err" - local master_resource - local master_status_attr - local new_master_status_attr - - : ${OCF_RESKEY_CRM_meta_interval=0} - - # Set loglevel to info during probe - if ocf_is_probe; then - status_loglevel="info" - fi - - mysql_status $status_loglevel - - rc=$? - - # TODO: check max connections error - - # If status returned an error, return that immediately - if [ $rc -ne $OCF_SUCCESS ]; then - return $rc - fi - - - if [ $OCF_CHECK_LEVEL -gt 0 -a -n "$OCF_RESKEY_test_table" ]; then - # Check if this instance is configured as a slave, and if so - # check slave status - - # Are we currently having a master? - if [ "$glb_master_exists" -ne "0" ]; then - is_slave - rc=$? - if [ $rc -eq 0 -o "$OCF_RESKEY_CRM_meta_role" = "Slave" ]; then - check_slave - else - update_data_master_status - master_status_attr=`$CRM_ATTR_REPL_STATUS --query -q` - new_master_status_attr="$(get_master_status File)|$(get_master_status Position)|$(get_max_binlog_size)" - rm -f $master_status_file - if [ "$master_status_attr" != "$new_master_status_attr" ]; then - # Doing in bg, no need to wait and that can hang if a node is lost at the same time - $CRM_ATTR_REPL_STATUS -v "$new_master_status_attr" & - fi - - # Is this following a recent master crash? - master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` - - if [ ! -z $master_crashed_ts ]; then - if [ `date +%s` -gt "$((${master_crashed_ts}+3600))" ]; then - #Let's cleanup the cib - $CRM_ATTR_MASTER_CRASHED_TS -D - $CRM_ATTR_LAST_TRX -D - fi - fi - fi - else - is_slave - rc=$? - # Need to cover for crashed master... no unset_master... or always set - # last trx md5... 
- if [ $rc -eq 0 -o "$OCF_RESKEY_CRM_meta_role" = "Slave" ]; then - unset_master - set_reader_attr 0 - fi - fi - - - # Check for test table - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_TEST \ - -e "SELECT COUNT(*) FROM $OCF_RESKEY_test_table" - - - if [ $MYSQL_LAST_ERR -ne "$MYSQL_TOO_MANY_CONN_ERR" ]; then - if [ $MYSQL_LAST_ERR -ne 0 ]; then - ocf_log err "Failed to select from $OCF_RESKEY_test_table"; - return $OCF_ERR_GENERIC; - fi - else - ocf_log info "Master hit max_connections" - fi - fi - - if ocf_is_ms && ! get_read_only; then - ocf_log debug "MySQL monitor succeeded (master)"; - if [ "$OCF_RESKEY_CRM_meta_interval" -eq "0" ]; then - # this is a probe and this server is a master so need to set master_score - $CRM_MASTER -v 2147483647 - fi - if ! check_datadir_state; then - $CRM_MASTER -v -2147483640 - fi - return $OCF_RUNNING_MASTER - else - ocf_log debug "MySQL monitor succeeded"; - return $OCF_SUCCESS - fi -} - -# Start MySQL in the master-slave context -mysql_start() { - local current_status - - if ocf_is_ms; then - # Initialize the ReaderVIP attribute, monitor will enable it - set_reader_attr 0 - - # set master_score to 0 in case mysql crashes on startup - $CRM_MASTER -v 0 - fi - - mysql_status info 1 # Adding 2nd param here to get the true state in case - # async_stop is used - current_status=$? - if [ "$current_status" = "$OCF_SUCCESS" ]; then - ocf_log info "MySQL already running" - return $OCF_SUCCESS - fi - - # Is MySQL still stopping, OCF_STOPPING is a custom error code - if [ "$current_status" = "$OCF_STOPPING" ]; then - ocf_log err "MySQL asked to start while still stopping" - # TODO, wait for stop (or timeout). For now just a sleep - sleep 5 - return $OCF_ERR_GENERIC - fi - - mysql_start_low - rc=$? - - if [ $rc != $OCF_SUCCESS ]; then - ocf_log err "Wasn't able to start MySQL, stopping 'start'." - return $rc - fi - - if ocf_is_ms; then - # We're configured as a stateful resource. We must start as - # slave by default. 
At this point we don't know if the CRM has - # already promoted a master. So, we simply start in read only - # mode. Should already be from command line. - set_read_only on - - # Now, let's see whether there is a master. We might be a new - # node that is just joining the cluster, and the CRM may have - # promoted a master before. - - if [ "$glb_master_exists" -ne 0 -a "$glb_cib_master" != $(get_local_ip) ]; then - ocf_log info "Changing MySQL configuration to replicate from $master_host." - set_master - start_slave - if [ $? -ne 0 ]; then - ocf_log err "Failed to start slave" - return $OCF_ERR_GENERIC - fi - else - ocf_log info "No MySQL master present - clearing replication state" - unset_master - fi - - # We also need to set a master preference, otherwise Pacemaker - # won't ever promote us in the absence of any explicit - # preference set by the administrator. We choose a low - # greater-than-zero preference. - $CRM_MASTER -v 1 - - fi - - # Initial monitor action - if [ -n "$OCF_RESKEY_test_table" -a -n "$OCF_RESKEY_test_user" \ - -a -n "$OCF_RESKEY_test_passwd" ]; then - OCF_CHECK_LEVEL=10 - fi - mysql_monitor - rc=$? - if [ $rc != $OCF_SUCCESS -a $rc != $OCF_RUNNING_MASTER ]; then - ocf_log err "Failed initial monitor action" - return $rc - fi - - ocf_log info "MySQL started" - return $OCF_SUCCESS -} - -# low level MySQL start -mysql_start_low() { - touch $OCF_RESKEY_log - chown $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_log - chmod 0640 $OCF_RESKEY_log - [ -x /sbin/restorecon ] && /sbin/restorecon $OCF_RESKEY_log - - if ocf_is_true "$OCF_RESKEY_enable_creation" && [ ! -d $OCF_RESKEY_datadir/mysql ] ; then - ocf_log info "Initializing MySQL database: " - $MYSQL_SBINDIR/mysql_install_db --datadir=$OCF_RESKEY_datadir - rc=$? - if [ $rc -ne 0 ] ; then - ocf_log err "Initialization failed: $rc"; - exit $OCF_ERR_GENERIC - fi - chown -R $OCF_RESKEY_user:$OCF_RESKEY_group $OCF_RESKEY_datadir - fi - - pid_dir=`dirname $OCF_RESKEY_pid` - if [ ! 
-d $pid_dir ] ; then - ocf_log info "Creating PID dir: $pid_dir" - mkdir -p $pid_dir - chown $OCF_RESKEY_user:$OCF_RESKEY_group $pid_dir - fi - - socket_dir=`dirname $OCF_RESKEY_socket` - if [ ! -d $socket_dir ] ; then - ocf_log info "Creating socket dir: $socket_dir" - mkdir -p $socket_dir - chown $OCF_RESKEY_user:$OCF_RESKEY_group $socket_dir - fi - - # Regardless of whether we just created the directory or it - # already existed, check whether it is writable by the configured - # user - for dir in $pid_dir $socket_dir; do - # needed to wrap around su a bit, sssd causing issue - if [ `su - $OCF_RESKEY_user -s /bin/bash -c "if test -w $dir; then echo yes; else echo no; fi" 2> /dev/null` != "yes" ]; then - ocf_log err "Directory $dir is not writable by $OCF_RESKEY_user" - exit $OCF_ERR_PERM; - fi - done - - # Uncomment to perform permission clensing - # - not convinced this should be enabled by default - # - #chmod 0755 $OCF_RESKEY_datadir - #chown -R $OCF_RESKEY_user $OCF_RESKEY_datadir - #chgrp -R $OCF_RESKEY_group $OCF_RESKEY_datadir - mysql_extra_params= - if ocf_is_ms; then - mysql_extra_params="$mysql_extra_params --skip-slave-start --read-only" - fi - - ${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config \ - --pid-file=$OCF_RESKEY_pid \ - --socket=$OCF_RESKEY_socket \ - --datadir=$OCF_RESKEY_datadir \ - --user=$OCF_RESKEY_user $OCF_RESKEY_additional_parameters \ - $mysql_extra_params >/dev/null 2>&1 & - rc=$? - - # we also get the process id from $! because the PID file is only - # created by mysql as soon as mysql is fully up and running - # for example, when recovery is busy, the pid file does not exist yet - # this part already creates the PID file as the mysql user - # so that other PRM checks know - # When recovery happens, the PID file does not exist yet. - process_pid=$! 
- # mysql_status expects that if the pid is there and it's running - # that mysql is completely active - #su $OCF_RESKEY_user -c "echo '$process_pid' > $OCF_RESKEY_pid" - echo "$process_pid" > ${OCF_RESKEY_pid}.starting - - if [ $rc != 0 ]; then - ocf_log err "MySQL start command failed: $rc" - return $rc - fi - - # Spin waiting for the server to come up. - # Let the CRM/LRM time us out if required. - start_wait=1 - while [ $start_wait = 1 ]; do - mysql_status info - rc=$? - if [ $rc = $OCF_SUCCESS ]; then - start_wait=0 - - elif [ $rc != $OCF_NOT_RUNNING ]; then - ocf_log info "MySQL start failed: $rc" - return $rc - fi - - # if mysql died in the meantime, we shall not wait - # until the timeout is reached. - kill -s 0 $process_pid > /dev/null - mysqld_pid_status=$? - - if [ "$mysqld_pid_status" -ne "0" ]; then - ocf_log err "MySQL daemon died during start, giving up." - return $OCF_ERR_GENERIC - fi - - sleep 2 - done - - return $OCF_SUCCESS -} - -mysql_stop() { - - if ocf_is_ms; then - # clear preference for becoming master - $CRM_MASTER -D - - # Remove VIP capability - set_reader_attr 0 - fi - - # we rely only on ${OCF_RESKEY_pid}.starting - # as this certainly contains the file we need with the PID - if [ ! -f ${OCF_RESKEY_pid}.starting ]; then - ocf_log info "MySQL is not running" - return $OCF_SUCCESS - fi - - pid=`cat ${OCF_RESKEY_pid}.starting 2> /dev/null` - /bin/kill $pid > /dev/null - rc=$? 
- if [ $rc != 0 ]; then - ocf_log err "MySQL couldn't be stopped" - return $OCF_ERR_GENERIC - fi - - if [ "$OCF_RESKEY_async_stop" -eq "1" ]; then - #Ok, MySQL is stopping and the async_stop option is set, just put the - #pid and a timestamp in the witness file and return - - echo "pid:$pid" > $ASYNC_STOP_WITNESS_FILE - echo "ts:`date +%s`" >> $ASYNC_STOP_WITNESS_FILE - - #Don't know yet why the ts, just seems useful for debugging for now - ocf_log info "MySQL async stopped"; - return $OCF_SUCCESS - fi - - # stop waiting - shutdown_timeout=15 - if [ -n "$OCF_RESKEY_CRM_meta_timeout" ]; then - shutdown_timeout=$((($OCF_RESKEY_CRM_meta_timeout/1000)-5)) - fi - - #Normally, pacemaker handles timeout but here we want to be able to do - #a SIGKILL (-9) before the timeout occurs. - count=0 - while [ $count -lt $shutdown_timeout ] - do - kill -s 0 $pid - rc=$? - if [ $rc -ne 0 ]; then - break - fi - count=`expr $count + 1` - sleep 1 - ocf_log debug "MySQL still hasn't stopped yet. Waiting..." - done - - kill -s 0 $pid - if [ $? -eq 0 ]; then - ocf_log info "MySQL failed to stop after ${shutdown_timeout}s using SIGTERM. Trying SIGKILL..." - /bin/kill -KILL $pid > /dev/null - fi - - rm ${OCF_RESKEY_pid}.starting - ocf_log info "MySQL stopped"; - rm -f /var/lock/subsys/mysqld - rm -f $OCF_RESKEY_socket - return $OCF_SUCCESS -} - -mysql_promote() { - local master_info - local master_crashed_ts - local log_bin_path - local tmpfiletrx - local nb_trx - local last_binlog_number - - if ( ! mysql_status err ); then - return $OCF_NOT_RUNNING - fi - - unset_master - - # Set Master Info in CIB, cluster level attribute - update_data_master_status - master_info="$(get_local_ip)|$(get_master_status File)|$(get_master_status Position)" - ${CRM_ATTR_REPL_INFO} -v "$master_info" - rm -f $master_status_file - - master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` - - if [ ! 
-z "$master_crashed_ts" -a "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then - if [ `date +%s` -lt "$((${master_crashed_ts}+3600))" ]; then - # Master crashed less than 1h ago, let's publish the last trx - - # First, where are the binlogs? That will be easier when the using_multi_config - # branch will be merged. If the path is not defined, the output will be "." - log_bin_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^log-bin ' | awk '{ print $2 }'` - log_bin_path=`dirname $log_bin_path` - - if [ "$log_bin_path" == "." ]; then - log_bin_path=$OCF_RESKEY_datadir - fi - - # Let's find the last binlog file - update_data_master_status - last_binlog_file="$(get_master_status File)" - - tmpfiletrx=`mktemp ${HA_RSCTMP}/trx.${OCF_RESOURCE_INSTANCE}.XXXXXX` - - ( echo -n "${last_binlog_file}@";$OCF_RESKEY_prm_binlog_parser_path ${log_bin_path}/${last_binlog_file} | tail -n 3000 ) > $tmpfiletrx - - # Do we have 3000 trx? - nb_trx=`cat $tmpfiletrx | wc -l` - if [ "$nb_trx" -lt "3000" ]; then - # we have less than 3000, let's try the previous file - nb_trx=$((3000-$nb_trx)) #remaining - - last_binlog_number=`echo $last_binlog_file | cut -d'.' -f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` - last_binlog_number=$(($last_binlog_number-1)) - - # re-adding the 0 padding - while [ "${#last_binlog_number}" -lt "6" ]; do - last_binlog_number="0${last_binlog_number}" - done - - last_binlog_file="`echo $last_binlog_file | cut -d'.' 
-f1`.${last_binlog_number}" - if [ -e "${log_bin_path}/$last_binlog_file" ]; then - ( echo -n "@${last_binlog_file}@";$OCF_RESKEY_prm_binlog_parser_path ${log_bin_path}/${last_binlog_file} | tail -n $nb_trx ) >> $tmpfiletrx - fi - fi - - #now we load all that to the cib so that it reaches the other nodes - $CRM_ATTR_LAST_TRX -v "`cat $tmpfiletrx | tr '\n' '|'`" - rm -f $tmpfiletrx - - fi - fi - - set_read_only off || return $OCF_ERR_GENERIC - - # Existing master gets a higher-than-default master preference, so - # the cluster manager does not shuffle the master role around - # unnecessarily - $CRM_MASTER -v 2147483647 - - # A master can accept reads - set_reader_attr 1 - - if [ "${#OCF_RESKEY_post_promote_script}" -gt "0" -a \ - -x "${OCF_RESKEY_post_promote_script}" -a \ - ! -L "${OCF_RESKEY_post_promote_script}" ]; then - ${OCF_RESKEY_post_promote_script} - fi - - return $OCF_SUCCESS -} - -mysql_demote() { - if ! mysql_status err; then - $CRM_MASTER -v 0 - exit $OCF_SUCCESS - else - # Return master preference to default, so the cluster manager gets - # a chance to select a new master - $CRM_MASTER -v 1 - exit $OCF_SUCCESS - fi -} - -mysql_notify() { - local master_crashed_ts last_reported_master_file_number master_status_attr - local last_reported_master_file last_reported_master_pos master_max_binlog_size - local master_score notify_resource my_resource master_crashed_ts - local relaylog_path last_relaylog_file last_trx_md5 binlog_file binlog_pos - local strip_last_reported_master_file_number strip_master_log_file_number - - # If not configured as a Stateful resource, we make no sense of - # notifications. - if ! ocf_is_ms; then - ocf_log info "This agent makes no use of notifications unless running in master/slave mode." - return $OCF_SUCCESS - fi - - local type_op - type_op="${OCF_RESKEY_CRM_meta_notify_type}-${OCF_RESKEY_CRM_meta_notify_operation}" - - ocf_log debug "Received $type_op notification." 
- - case "$type_op" in - 'pre-promote') - - # Master-score is normally calculated from slave-lag but it is better to use - # binlog offset position to pick the most up to date slave - master_status_attr=`$CRM_ATTR_REPL_STATUS --query -q` - - if [ $? -eq 0 ]; then - # There's a master status entry although we don't know if it is - # a valid one - last_reported_master_file=`echo $master_status_attr | cut -d'|' -f1` - last_reported_master_pos=`echo $master_status_attr | cut -d'|' -f2` - master_max_binlog_size=`echo $master_status_attr | cut -d'|' -f3` - - get_slave_info - if [ $? -eq "$OCF_SUCCESS" ]; then - # We'll be here only if the master crashed. In the event of - # a graceful demote, a post-demote notification event would have occurred. - # The post-demote include an unset-master that - # resets the slave after the completion of the IO and SQL - # threads. The post-demote doesn't run if the master host - # crashed. - - # Let's establish the master score based on the following - # rule. - # score = ((file number diff)*master_max_binlog_size - # + fileposdiff/10 + constante - # - # All events are at least 10 bytes so dividing by 10 doesn't reduce - # the resolution and increases the span. - # - # Since the master publishes its status only once per few - # seconds, the fileposdiff is likely positive. - # We'll cap all values to int signed range et target - # 1B as the value if a slave is fully in sync with the master - - #First unset_master to allow the application of the relay-log if any - #The unset_master will return the last slave status variables. - unset_master - - last_reported_master_file_number=`echo $last_reported_master_file | cut -d'.' -f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` - master_log_file_number=`echo $master_log_file | cut -d'.' 
-f2 | sed -ne "s/^0*\([1-9][0-9]*\)$/\1/p"` - - master_score=$(((100000000+\ - ($master_log_file_number-$last_reported_master_file_number)*\ - $master_max_binlog_size+$master_log_pos-$last_reported_master_pos)/10)) - - # now, the caps, the upper cap is unlikely - if [ $master_score -gt 2147483647 ]; then - master_score=2147483647 - fi - - # the lower cap could happened if a slave lags behind by - # more then 30GB of binlog. In that case... do we really care - # if we floor the value - - if [ $master_score -lt -2147483647 ]; then - master_score=-2147483647 - fi - - $CRM_MASTER -v $master_score - - # Next, we need a reminder that the master crashed and when, that will be - # used to publish the last trx in the promote event if we are picket - # as the new master. - - $CRM_ATTR_MASTER_CRASHED_TS -v `date +%s` - - fi - fi - ;; - - 'post-promote') - # The master has completed its promotion. Now is a good - # time to check whether our replication slave is working - # correctly. - - # Is the notification for our set - notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_promote_resource|cut -d: -f1` - my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` - if [ $notify_resource != ${my_resource} ]; then - ocf_log debug "Notification is not for us" - return $OCF_SUCCESS - fi - - master_host=`echo $OCF_RESKEY_CRM_meta_notify_promote_uname|tr -d " "` - if [ "$master_host" = ${HOSTNAME} ]; then - ocf_log info "This will be the new master, ignoring post-promote notification." - else - - # Is this following a recent master crash? - master_crashed_ts=`$CRM_ATTR_MASTER_CRASHED_TS --query` - - if [ ! -z "$master_crashed_ts" -a "${#OCF_RESKEY_prm_binlog_parser_path}" -gt "0" ]; then - if [ `date +%s` -lt "$((${master_crashed_ts}+3600))" ]; then - # Master crashed less than 1h ago, let's see what our last trx was. 
- # Since the master crashed, we didn't have the post-demote notification - # so the slave may still configured if no monitor ops has run - - # if a monitor ops occurred, it should be saved in the cib - last_trx_md5=`$CRM_ATTR_NODE_LAST_TRX_MD5 --query` - - if [ -z "$last_trx_md5" ]; then - # no last_trx_md5 set in the cib, let's try to find it - - # First, where are the relay logs? That will be easier when the using_multi_config - # branch will be merged. If the path is not defined, the output will be "." - relaylog_path=`${OCF_RESKEY_binary} --defaults-file=$OCF_RESKEY_config --verbose --help --user=$OCF_RESKEY_user | grep -e '^relay-log ' | awk '{ print $2 }'` - relaylog_path=`dirname $relaylog_path` - - if [ "$log_bin_path" == "." ]; then - relaylog_path=$OCF_RESKEY_datadir - fi - - # Let's find the last binlog file - get_slave_info - last_relaylog_file="$relay_log_file" - - #ok now we need to find the md5 of the last trx - - last_trx_md5=`$OCF_RESKEY_prm_binlog_parser_path ${relaylog_path}/${last_relaylog_file} | tail -n 1 | cut -d',' -f2` - fi - - if [ ! -z "$last_trx_md5" ]; then - # now, let's try to find this md5 in the NEW_MASTER_LAST_TRX attribute - # There maybe up to 2 binlog files in the attribute - - #1st file - binlog_file=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f1` - binlog_pos=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f2 | tr '|' '\n' | grep -A1 $last_trx_md5 | tail -n 1 | cut -d',' -f1` - - #found? - if [ -z "$binlog_pos" ]; then - #no, let's try if there's a 2nd file - binlog_file=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f3` - - if [ -z "$binlog_file" ]; then - binlog_pos=`$CRM_ATTR_LAST_TRX --query | cut -d'@' -f4 | tr '|' '\n' | grep -A1 $last_trx_md5 | tail -n 1 | cut -d',' -f1` - fi - fi - - # TODO: we could be at the end of the 2nd file so we should point to the first entry of the first - # file. Edge case, will deal with it later. - - # have we found something? - if [ ! -z "$binlog_file" -a ! 
-z "$binlog_pos" ]; then - # Let's overwrite the glb_local_info variable - glb_local_info="`echo $glb_local_info | cut -d'|' -f1`|$binlog_file|$binlog_pos" - fi - fi - fi - fi - - ocf_log info "Resetting replication" - unset_master #Should be unset already execpt if master crashed - if [ $? -ne 0 ]; then - return $OCF_ERR_GENERIC - fi - - ocf_log info "Changing MySQL configuration to replicate from $master_host" - set_master - if [ $? -ne 0 ]; then - return $OCF_ERR_GENERIC - fi - - start_slave - if [ $? -ne 0 ]; then - ocf_log err "Failed to start slave" - return $OCF_ERR_GENERIC - fi - fi - return $OCF_SUCCESS - ;; - - 'pre-demote') - # Is the notification for our set - notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1` - my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` - if [ $notify_resource != ${my_resource} ]; then - ocf_log debug "Notification is not for us" - return $OCF_SUCCESS - fi - - demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` - if [ $demote_host = ${HOSTNAME} ]; then - ocf_log info "post-demote notification for $demote_host" - set_read_only on - if [ $? -ne 0 ]; then - ocf_log err "Failed to set read-only"; - return $OCF_ERR_GENERIC; - fi - - # Must kill all existing user threads because they are still Read/write - # in order for the slaves to complete the read of binlogs - local tmpfile - tmpfile=`mktemp ${HA_RSCTMP}/threads.${OCF_RESOURCE_INSTANCE}.XXXXXX` - mysql_run -Q -sw -O $MYSQL $MYSQL_OPTIONS_REPL \ - -e "SHOW PROCESSLIST" > $tmpfile - - for thread in `awk '$0 !~ /Binlog Dump|system user|event_scheduler|SHOW PROCESSLIST/ && $0 ~ /^[0-9]/ {print $1}' $tmpfile` - do - mysql_run -Q -sw $MYSQL $MYSQL_OPTIONS_REPL \ - -e "KILL ${thread}" - done - rm -f $tmpfile - else - ocf_log info "Ignoring post-demote notification execpt for my own demotion." 
- fi - return $OCF_SUCCESS - ;; - 'post-demote') - # Is the notification for our set - notify_resource=`echo $OCF_RESKEY_CRM_meta_notify_demote_resource|cut -d: -f1` - my_resource=`echo $OCF_RESOURCE_INSTANCE|cut -d: -f1` - if [ $notify_resource != ${my_resource} ]; then - ocf_log debug "Notification is not for us" - return $OCF_SUCCESS - fi - - demote_host=`echo $OCF_RESKEY_CRM_meta_notify_demote_uname|tr -d " "` - if [ $demote_host = ${HOSTNAME} ]; then - ocf_log info "Ignoring post-demote notification for my own demotion." - return $OCF_SUCCESS - fi - ocf_log info "post-demote notification for $demote_host." - # The former master has just been gracefully demoted. - unset_master - ;; - *) - return $OCF_SUCCESS - ;; - esac -} - -# -# mysql_run: Run a mysql command, log its output and return the proper error code. -# Usage: mysql_run [-Q] [-info|-warn|-err] [-O] [-sw] -# -Q: don't log the output of the command if it succeeds -# -info|-warn|-err: log the output of the command at given -# severity if it fails (defaults to err) -# -O: echo the output of the command -# -sw: Suppress 5.6 client warning when password is used on the command line -# Adapted from ocf_run. -# -mysql_run() { - local rc - local output outputfile - local verbose=1 - local returnoutput - local loglevel=err - local suppress_56_password_warning - local var - - for var in 1 2 3 4 - do - case "$1" in - "-Q") - verbose="" - shift 1;; - "-info"|"-warn"|"-err") - loglevel=`echo $1 | sed -e s/-//g` - shift 1;; - "-O") - returnoutput=1 - shift 1;; - "-sw") - suppress_56_password_warning=1 - shift 1;; - - *) - ;; - esac - done - - outputfile=`mktemp ${HA_RSCTMP}/mysql_run.${OCF_RESOURCE_INSTANCE}.XXXXXX` - error=`"$@" 2>&1 1>$outputfile` - rc=$? - if [ "$suppress_56_password_warning" -eq 1 ]; then - error=`echo "$error" | egrep -v '^Warning: Using a password on the command line'` - fi - output=`cat $outputfile` - rm -f $outputfile - - if [ $rc -eq 0 ]; then - if [ "$verbose" -a ! 
-z "$output" ]; then - ocf_log info "$output" - fi - - if [ "$returnoutput" -a ! -z "$output" ]; then - echo "$output" - fi - - MYSQL_LAST_ERR=$OCF_SUCCESS - return $OCF_SUCCESS - else - if [ ! -z "$error" ]; then - ocf_log $loglevel "$error" - regex='^ERROR ([[:digit:]]{4}).*' - if [[ $error =~ $regex ]]; then - mysql_code=${BASH_REMATCH[1]} - if [ -n "$mysql_code" ]; then - MYSQL_LAST_ERR=$mysql_code - return $rc - fi - fi - else - ocf_log $loglevel "command failed: $*" - fi - # No output to parse so return the standard exit code. - MYSQL_LAST_ERR=$rc - return $rc - fi -} - -####################################################################### - - -########################################################################## -# If DEBUG_LOG is set, make this resource agent easy to debug: set up the -# debug log and direct all output to it. Otherwise, redirect to /dev/null. -# The log directory must be a directory owned by root, with permissions 0700, -# and the log must be writable and not a symlink. -########################################################################## -DEBUG_LOG="/tmp/mysql.ocf.ra.debug/log" -if [ "${DEBUG_LOG}" -a -w "${DEBUG_LOG}" -a ! -L "${DEBUG_LOG}" ]; then - DEBUG_LOG_DIR="${DEBUG_LOG%/*}" - if [ -d "${DEBUG_LOG_DIR}" ]; then - exec 9>>"$DEBUG_LOG" - exec 2>&9 - date >&9 - echo "$*" >&9 - env | grep OCF_ | sort >&9 - set -x - else - exec 9>/dev/null - fi -fi - -case "$1" in - meta-data) meta_data - exit $OCF_SUCCESS;; - usage|help) usage - exit $OCF_SUCCESS;; -esac - -if [ "$#" -lt "1" ]; then - usage - exit $OCF_SUCCESS -fi - -mysql_validate -rc=$? -LSB_STATUS_STOPPED=3 -if [ $rc -ne 0 ]; then - case "$1" in - stop) exit $OCF_SUCCESS;; - monitor) exit $OCF_NOT_RUNNING;; - status) exit $LSB_STATUS_STOPPED;; - *) exit $rc;; - esac -fi - -# We check if there is a location constraint against this node -# where $OCF_RESOURCE_INSTANCE should not be running here i.e. 
-# -INFINITY score, if so we ignore monitor call for this node -contrnt=$(cibadmin -t 2 --query --obj_type constraints\ - |awk "/rsc=\"$OCF_RESOURCE_INSTANCE\"/,/<\/rsc_location/"\ - |awk '/score="-INFINITY"/,/<\/rule/'\ - |egrep "expression attribute=\"#uname\".*operation=\"eq\".*value=\"$HOSTNAME\"" 2> /dev/null) - -if [ "$?" -eq "0" ]; then - exit $OCF_SUCCESS -fi - -#Global info missing from OCF_RESKEY -resources=`$CRM_RES --list` - -# now we need the master-slave clone set name, need to walk around limitations -# of older pacemaker -if [[ "$OCF_RESKEY_crm_feature_set" > "3.0.1" ]]; then - glb_master_resource=`echo "$resources" | grep $INSTANCE_ATTR_NAME | awk '{print $3}' | head -n 1` -else - # older versions of Pacemaker don't write the primitive name in the resources list - for msr in `echo "$resources" | grep 'Master/Slave' | awk '{print $3}'`; do - isThere=`$CRM_RES -q -r $msr | grep primitive | grep -c $INSTANCE_ATTR_NAME` - if [ "$isThere" -gt "0" ]; then - glb_master_resource=$msr - fi - done -fi -is_master_side -glb_master_side=$? 
-if [ "${#OCF_RESKEY_geo_remote_IP}" -gt "0" -a $glb_master_side -ne 0 ]; then - # geo_remote_IP is defined, let's query the remote side - # the variable content will be like: pacemaker-1-1|binlog.000156|107 1 - glb_remote_info=`$SSH $OCF_RESKEY_geo_remote_IP -l root "$CRM_ATTR_REPL_INFO --query -q | tr '\n' ' ';$CRM_RES --list | grep -A2 $glb_master_resource | egrep -c 'Master[^\/]'"` - glb_master_exists=`echo $glb_remote_info | awk '{ print $NF }'` - if [[ -z "$glb_master_exists" ]]; then - glb_master_exists=0 - fi -else - glb_master_exists=`echo "$resources" | grep -A2 $glb_master_resource | egrep -c 'Master[^\/]'` -fi - -if [ "$glb_master_exists" -eq "1" ]; then - if [ "${#glb_remote_info}" -gt "0" ]; then - glb_cib_master=`echo $glb_remote_info | awk '{ print $1 }' | cut -d'|' -f1` - else - glb_local_info=`$CRM_ATTR_REPL_INFO --query -q` - glb_cib_master=`echo $glb_local_info | cut -d'|' -f1` - fi -fi - -# What kind of method was invoked? -case "$1" in - start) mysql_start;; - stop) mysql_stop;; - status) mysql_status err;; - monitor) mysql_monitor;; - promote) mysql_promote;; - demote) mysql_demote;; - notify) mysql_notify;; - validate-all) exit $OCF_SUCCESS;; - - *) usage - exit $OCF_ERR_UNIMPLEMENTED;; -esac From af06d3ff42c33478f7643350d3416eed38fe6867 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Tue, 17 Mar 2015 11:37:27 -0300 Subject: [PATCH 12/22] Call ha_relation_joined() when upgrading the charm --- hooks/percona_hooks.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index f236154..bf90fec 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -163,6 +163,10 @@ def config_changed(): for unit in related_units(r_id): shared_db_changed(r_id, unit) + if relation_ids('ha'): + # make sure all the HA resources are (re)created + ha_relation_joined() + @hooks.hook('cluster-relation-joined') def cluster_joined(relation_id=None): From f64d918ea04267075975dffa8a3b3de0131f3282 Mon Sep 17 
00:00:00 2001 From: Felipe Reyes Date: Tue, 17 Mar 2015 11:37:44 -0300 Subject: [PATCH 13/22] Add unit tests for ha-relation-joined hook --- setup.cfg | 6 ++ unit_tests/test_percona_hooks.py | 65 +++++++++++++++++ unit_tests/test_utils.py | 121 +++++++++++++++++++++++++++++++ 3 files changed, 192 insertions(+) create mode 100644 setup.cfg create mode 100644 unit_tests/test_percona_hooks.py create mode 100644 unit_tests/test_utils.py diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..3f7bd91 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[nosetests] +verbosity=2 +with-coverage=1 +cover-erase=1 +cover-package=hooks + diff --git a/unit_tests/test_percona_hooks.py b/unit_tests/test_percona_hooks.py new file mode 100644 index 0000000..65d3059 --- /dev/null +++ b/unit_tests/test_percona_hooks.py @@ -0,0 +1,65 @@ +import mock +import sys +from test_utils import CharmTestCase + +sys.modules['MySQLdb'] = mock.Mock() +import percona_hooks as hooks + +TO_PATCH = ['log', 'config', + 'get_db_helper', + 'relation_ids', + 'relation_set'] + + +class TestHaRelation(CharmTestCase): + def setUp(self): + CharmTestCase.setUp(self, hooks, TO_PATCH) + + @mock.patch('sys.exit') + def test_relation_not_configured(self, exit_): + self.config.return_value = None + + class MyError(Exception): + pass + + def f(x): + raise MyError(x) + exit_.side_effect = f + self.assertRaises(MyError, hooks.ha_relation_joined) + + def test_resources(self): + self.relation_ids.return_value = ['ha:1'] + password = 'ubuntu' + helper = mock.Mock() + attrs = {'get_mysql_password.return_value': password} + helper.configure_mock(**attrs) + self.get_db_helper.return_value = helper + self.test_config.set('vip', '10.0.3.3') + self.test_config.set('sst-password', password) + def f(k): + return self.test_config.get(k) + + self.config.side_effect = f + hooks.ha_relation_joined() + + resources = {'res_mysql_vip': 'ocf:heartbeat:IPaddr2', + 'res_mysql_monitor': 'ocf:percona:mysql_monitor'} + 
resource_params = {'res_mysql_vip': ('params ip="10.0.3.3" ' + 'cidr_netmask="24" ' + 'nic="eth0"'), + 'res_mysql_monitor': + hooks.RES_MONITOR_PARAMS % {'sstpass': 'ubuntu'}} + groups = {'grp_percona_cluster': 'res_mysql_vip'} + + clones = {'cl_mysql_monitor': 'res_mysql_monitor meta interleave=true'} + + colocations = {'vip_mysqld': 'inf: grp_percona_cluster cl_mysql_monitor'} + + locations = {'loc_percona_cluster': + 'grp_percona_cluster rule inf: writable eq 1'} + + self.relation_set.assert_called_with( + relation_id='ha:1', corosync_bindiface=f('ha-bindiface'), + corosync_mcastport=f('ha-mcastport'), resources=resources, + resource_params=resource_params, groups=groups, + clones=clones, colocations=colocations, locations=locations) diff --git a/unit_tests/test_utils.py b/unit_tests/test_utils.py new file mode 100644 index 0000000..a59f897 --- /dev/null +++ b/unit_tests/test_utils.py @@ -0,0 +1,121 @@ +import logging +import unittest +import os +import yaml + +from contextlib import contextmanager +from mock import patch, MagicMock + + +def load_config(): + ''' + Walk backwards from __file__ looking for config.yaml, load and return the + 'options' section + ''' + config = None + f = __file__ + while config is None: + d = os.path.dirname(f) + if os.path.isfile(os.path.join(d, 'config.yaml')): + config = os.path.join(d, 'config.yaml') + break + f = d + + if not config: + logging.error('Could not find config.yaml in any parent directory ' + 'of %s. ' % file) + raise Exception + + return yaml.safe_load(open(config).read())['options'] + + +def get_default_config(): + ''' + Load default charm config from config.yaml and return it as a dict. + If no default is set in config.yaml, its value is None.
+ ''' + default_config = {} + config = load_config() + for k, v in config.iteritems(): + if 'default' in v: + default_config[k] = v['default'] + else: + default_config[k] = None + return default_config + + +class CharmTestCase(unittest.TestCase): + + def setUp(self, obj, patches): + super(CharmTestCase, self).setUp() + self.patches = patches + self.obj = obj + self.test_config = TestConfig() + self.test_relation = TestRelation() + self.patch_all() + + def patch(self, method): + _m = patch.object(self.obj, method) + mock = _m.start() + self.addCleanup(_m.stop) + return mock + + def patch_all(self): + for method in self.patches: + setattr(self, method, self.patch(method)) + + +class TestConfig(object): + + def __init__(self): + self.config = get_default_config() + + def get(self, attr=None): + if not attr: + return self.get_all() + try: + return self.config[attr] + except KeyError: + return None + + def get_all(self): + return self.config + + def set(self, attr, value): + if attr not in self.config: + raise KeyError + self.config[attr] = value + + +class TestRelation(object): + + def __init__(self, relation_data={}): + self.relation_data = relation_data + + def set(self, relation_data): + self.relation_data = relation_data + + def get(self, attr=None, unit=None, rid=None): + if attr is None: + return self.relation_data + elif attr in self.relation_data: + return self.relation_data[attr] + return None + + +@contextmanager +def patch_open(): + '''Patch open() to allow mocking both open() itself and the file that is + yielded. 
+ + Yields the mock for "open" and "file", respectively.''' + mock_open = MagicMock(spec=open) + mock_file = MagicMock(spec=file) + + @contextmanager + def stub_open(*args, **kwargs): + mock_open(*args, **kwargs) + yield mock_file + + with patch('__builtin__.open', stub_open): + yield mock_open, mock_file From ac78dca0b14cf4d4992d2f8094d90a7389bdc6c8 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Tue, 17 Mar 2015 14:31:12 -0300 Subject: [PATCH 14/22] Install mysql_monitor agent during upgrade-charm --- hooks/percona_hooks.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index bf90fec..0f4f4a2 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -164,6 +164,8 @@ def config_changed(): shared_db_changed(r_id, unit) if relation_ids('ha'): + # (re)install pcmkr agent + install_mysql_ocf() # make sure all the HA resources are (re)created ha_relation_joined() From ae4ca3d37b9803bd282db1536f4d86996009080f Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Tue, 17 Mar 2015 14:43:42 -0300 Subject: [PATCH 15/22] Moved mysql_monitor installation to config-changed hook --- hooks/percona_hooks.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/hooks/percona_hooks.py b/hooks/percona_hooks.py index 0f4f4a2..0773de2 100755 --- a/hooks/percona_hooks.py +++ b/hooks/percona_hooks.py @@ -163,9 +163,10 @@ def config_changed(): for unit in related_units(r_id): shared_db_changed(r_id, unit) + # (re)install pcmkr agent + install_mysql_ocf() + if relation_ids('ha'): - # (re)install pcmkr agent - install_mysql_ocf() # make sure all the HA resources are (re)created ha_relation_joined() @@ -181,8 +182,6 @@ def cluster_joined(relation_id=None): relation_set(relation_id=relation_id, relation_settings=relation_settings) - install_mysql_ocf() - @hooks.hook('cluster-relation-departed') @hooks.hook('cluster-relation-changed') From 58837684f04cbf38c65e853b81d99a7ea41781e9 Mon Sep 17 00:00:00 2001 From:
Felipe Reyes Date: Tue, 17 Mar 2015 14:44:21 -0300 Subject: [PATCH 16/22] Add mysql_monitor agent to copyright definition --- copyright | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/copyright b/copyright index 1632584..98834d7 100644 --- a/copyright +++ b/copyright @@ -15,3 +15,25 @@ License: GPL-3 . You should have received a copy of the GNU General Public License along with this program. If not, see . + +Files: ocf/percona/mysql_monitor +Copyright: Copyright (c) 2013, Percona inc., Yves Trudeau, Michael Coburn +License: GPL-2 + This program is free software; you can redistribute it and/or modify + it under the terms of version 2 of the GNU General Public License as + published by the Free Software Foundation. + + This program is distributed in the hope that it would be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + + Further, this software is distributed without any warranty that it is + free of the rightful claim of any third person regarding infringement + or the like. Any license provided herein, whether implied or + otherwise, applies only to this software file. Patent licenses, if + any, provided herein do not apply to combinations of this program with + other software, or any other product whatsoever. + + You should have received a copy of the GNU General Public License + along with this program; if not, write the Free Software Foundation, + Inc., 59 Temple Place - Suite 330, Boston MA 02111-1307, USA. 
From bf187288973f02eb7c34167b1cb67f6c1557c631 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Tue, 7 Apr 2015 12:51:43 -0300 Subject: [PATCH 17/22] mysql_monitor: Apply patch available in upstream PR #52 https://github.com/percona/percona-pacemaker-agents/pull/53 --- ocf/percona/mysql_monitor | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ocf/percona/mysql_monitor b/ocf/percona/mysql_monitor index ba6f841..e60499b 100755 --- a/ocf/percona/mysql_monitor +++ b/ocf/percona/mysql_monitor @@ -545,6 +545,10 @@ mysql_monitor() { ;; esac + else + ocf_log $1 "MySQL is not running, but there is a pidfile" + set_reader_attr 0 + set_writer_attr 0 fi else ocf_log $1 "MySQL is not running" From 648591899121a7ace4f73d06a81d46306782c019 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Wed, 15 Apr 2015 14:11:46 +0200 Subject: [PATCH 18/22] Rename target to 'test' and use AMULET_OS_VIP to handoff the vip --- Makefile | 6 ++++-- tests/00-setup.sh | 14 +++++++------- tests/basic_deployment.py | 14 +++++++++----- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 6cec23b..0665acd 100644 --- a/Makefile +++ b/Makefile @@ -9,9 +9,11 @@ lint: unit_test: @$(PYTHON) /usr/bin/nosetests --nologcapture unit_tests -functional_test: +test: @echo Starting amulet tests... - @juju test -v -p AMULET_HTTP_PROXY --timeout 900 + #NOTE(beisner): can remove -v after bug 1320357 is fixed + # https://bugs.launchpad.net/amulet/+bug/1320357 + @juju test -v -p AMULET_HTTP_PROXY,AMULET_OS_VIP --timeout 900 bin/charm_helpers_sync.py: @mkdir -p bin diff --git a/tests/00-setup.sh b/tests/00-setup.sh index 497b294..ed1c292 100755 --- a/tests/00-setup.sh +++ b/tests/00-setup.sh @@ -20,10 +20,10 @@ for pkg in $PACKAGES; do done -if [ ! 
-f "$(dirname $0)/../local.yaml" ]; then - echo "To run these amulet tests a vip is needed, create a file called \ -local.yaml in the charm dir, this file must contain a 'vip', if you're \ -using the local provider with lxc you could use a free IP from the range \ -10.0.3.0/24" - exit 1 -fi +#if [ ! -f "$(dirname $0)/../local.yaml" ]; then +# echo "To run these amulet tests a vip is needed, create a file called \ +#local.yaml in the charm dir, this file must contain a 'vip', if you're \ +#using the local provider with lxc you could use a free IP from the range \ +#10.0.3.0/24" +# exit 1 +#fi diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index fbb9e86..936d3ac 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -12,8 +12,8 @@ class BasicDeployment(unittest.TestCase): self.vip = None if vip: self.vip = vip - elif 'VIP' in os.environ: - self.vip = os.environ.get('VIP') + elif 'AMULET_OS_VIP' in os.environ: + self.vip = os.environ.get('AMULET_OS_VIP') elif os.path.isfile('local.yaml'): with open('local.yaml', 'rb') as f: self.cfg = yaml.safe_load(f.read()) @@ -21,8 +21,8 @@ class BasicDeployment(unittest.TestCase): self.vip = self.cfg.get('vip') else: amulet.raise_status(amulet.SKIP, - ("please set the vip in local.yaml " - "to run this test suite")) + ("please set the vip in local.yaml or env var " + "AMULET_OS_VIP to run this test suite")) def run(self): # The number of seconds to wait for the environment to setup. @@ -30,7 +30,11 @@ class BasicDeployment(unittest.TestCase): self.d = amulet.Deployment(series="trusty") self.d.add('percona-cluster', units=self.units) - self.d.add('hacluster') + + # NOTE(freyes): we use hacluster/next, because stable doesn't support + # location rules definition. 
+ self.d.add('hacluster', + charm='lp:~openstack-charmers/charms/trusty/hacluster/next') self.d.relate('percona-cluster:ha', 'hacluster:ha') cfg_percona = {'sst-password': 'ubuntu', From 33f46e4a619bd3527691291b0fe515980b0b0d79 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Wed, 15 Apr 2015 16:23:37 +0200 Subject: [PATCH 19/22] Add tests/charmhelpers/ --- Makefile | 1 + charm-helpers-tests.yaml | 5 + tests/charmhelpers/__init__.py | 38 +++ tests/charmhelpers/contrib/__init__.py | 15 + tests/charmhelpers/contrib/amulet/__init__.py | 15 + .../charmhelpers/contrib/amulet/deployment.py | 93 ++++++ tests/charmhelpers/contrib/amulet/utils.py | 316 ++++++++++++++++++ .../contrib/openstack/__init__.py | 15 + .../contrib/openstack/amulet/__init__.py | 15 + .../contrib/openstack/amulet/deployment.py | 134 ++++++++ .../contrib/openstack/amulet/utils.py | 294 ++++++++++++++++ 11 files changed, 941 insertions(+) create mode 100644 charm-helpers-tests.yaml create mode 100644 tests/charmhelpers/__init__.py create mode 100644 tests/charmhelpers/contrib/__init__.py create mode 100644 tests/charmhelpers/contrib/amulet/__init__.py create mode 100644 tests/charmhelpers/contrib/amulet/deployment.py create mode 100644 tests/charmhelpers/contrib/amulet/utils.py create mode 100644 tests/charmhelpers/contrib/openstack/__init__.py create mode 100644 tests/charmhelpers/contrib/openstack/amulet/__init__.py create mode 100644 tests/charmhelpers/contrib/openstack/amulet/deployment.py create mode 100644 tests/charmhelpers/contrib/openstack/amulet/utils.py diff --git a/Makefile b/Makefile index 0665acd..1cfb40b 100644 --- a/Makefile +++ b/Makefile @@ -22,6 +22,7 @@ bin/charm_helpers_sync.py: sync: bin/charm_helpers_sync.py @$(PYTHON) bin/charm_helpers_sync.py -c charm-helpers.yaml + @$(PYTHON) bin/charm_helpers_sync.py -c charm-helpers-tests.yaml publish: lint bzr push lp:charms/trusty/percona-cluster diff --git a/charm-helpers-tests.yaml b/charm-helpers-tests.yaml new file mode 100644 index 
0000000..48b12f6 --- /dev/null +++ b/charm-helpers-tests.yaml @@ -0,0 +1,5 @@ +branch: lp:charm-helpers +destination: tests/charmhelpers +include: + - contrib.amulet + - contrib.openstack.amulet diff --git a/tests/charmhelpers/__init__.py b/tests/charmhelpers/__init__.py new file mode 100644 index 0000000..f72e7f8 --- /dev/null +++ b/tests/charmhelpers/__init__.py @@ -0,0 +1,38 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . + +# Bootstrap charm-helpers, installing its dependencies if necessary using +# only standard libraries. +import subprocess +import sys + +try: + import six # flake8: noqa +except ImportError: + if sys.version_info.major == 2: + subprocess.check_call(['apt-get', 'install', '-y', 'python-six']) + else: + subprocess.check_call(['apt-get', 'install', '-y', 'python3-six']) + import six # flake8: noqa + +try: + import yaml # flake8: noqa +except ImportError: + if sys.version_info.major == 2: + subprocess.check_call(['apt-get', 'install', '-y', 'python-yaml']) + else: + subprocess.check_call(['apt-get', 'install', '-y', 'python3-yaml']) + import yaml # flake8: noqa diff --git a/tests/charmhelpers/contrib/__init__.py b/tests/charmhelpers/contrib/__init__.py new file mode 100644 index 0000000..d1400a0 --- /dev/null +++ b/tests/charmhelpers/contrib/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2014-2015 Canonical Limited. 
+# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . diff --git a/tests/charmhelpers/contrib/amulet/__init__.py b/tests/charmhelpers/contrib/amulet/__init__.py new file mode 100644 index 0000000..d1400a0 --- /dev/null +++ b/tests/charmhelpers/contrib/amulet/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . diff --git a/tests/charmhelpers/contrib/amulet/deployment.py b/tests/charmhelpers/contrib/amulet/deployment.py new file mode 100644 index 0000000..367d6b4 --- /dev/null +++ b/tests/charmhelpers/contrib/amulet/deployment.py @@ -0,0 +1,93 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. 
+# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . + +import amulet +import os +import six + + +class AmuletDeployment(object): + """Amulet deployment. + + This class provides generic Amulet deployment and test runner + methods. + """ + + def __init__(self, series=None): + """Initialize the deployment environment.""" + self.series = None + + if series: + self.series = series + self.d = amulet.Deployment(series=self.series) + else: + self.d = amulet.Deployment() + + def _add_services(self, this_service, other_services): + """Add services. + + Add services to the deployment where this_service is the local charm + that we're testing and other_services are the other services that + are being used in the local amulet tests. 
+ """ + if this_service['name'] != os.path.basename(os.getcwd()): + s = this_service['name'] + msg = "The charm's root directory name needs to be {}".format(s) + amulet.raise_status(amulet.FAIL, msg=msg) + + if 'units' not in this_service: + this_service['units'] = 1 + + self.d.add(this_service['name'], units=this_service['units']) + + for svc in other_services: + if 'location' in svc: + branch_location = svc['location'] + elif self.series: + branch_location = 'cs:{}/{}'.format(self.series, svc['name']), + else: + branch_location = None + + if 'units' not in svc: + svc['units'] = 1 + + self.d.add(svc['name'], charm=branch_location, units=svc['units']) + + def _add_relations(self, relations): + """Add all of the relations for the services.""" + for k, v in six.iteritems(relations): + self.d.relate(k, v) + + def _configure_services(self, configs): + """Configure all of the services.""" + for service, config in six.iteritems(configs): + self.d.configure(service, config) + + def _deploy(self): + """Deploy environment and wait for all hooks to finish executing.""" + try: + self.d.setup(timeout=900) + self.d.sentry.wait(timeout=900) + except amulet.helpers.TimeoutError: + amulet.raise_status(amulet.FAIL, msg="Deployment timed out") + except Exception: + raise + + def run_tests(self): + """Run all of the methods that are prefixed with 'test_'.""" + for test in dir(self): + if test.startswith('test_'): + getattr(self, test)() diff --git a/tests/charmhelpers/contrib/amulet/utils.py b/tests/charmhelpers/contrib/amulet/utils.py new file mode 100644 index 0000000..5088b1d --- /dev/null +++ b/tests/charmhelpers/contrib/amulet/utils.py @@ -0,0 +1,316 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. 
+# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . + +import ConfigParser +import io +import logging +import re +import sys +import time + +import six + + +class AmuletUtils(object): + """Amulet utilities. + + This class provides common utility functions that are used by Amulet + tests. + """ + + def __init__(self, log_level=logging.ERROR): + self.log = self.get_logger(level=log_level) + + def get_logger(self, name="amulet-logger", level=logging.DEBUG): + """Get a logger object that will log to stdout.""" + log = logging + logger = log.getLogger(name) + fmt = log.Formatter("%(asctime)s %(funcName)s " + "%(levelname)s: %(message)s") + + handler = log.StreamHandler(stream=sys.stdout) + handler.setLevel(level) + handler.setFormatter(fmt) + + logger.addHandler(handler) + logger.setLevel(level) + + return logger + + def valid_ip(self, ip): + if re.match(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$", ip): + return True + else: + return False + + def valid_url(self, url): + p = re.compile( + r'^(?:http|ftp)s?://' + r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|' # noqa + r'localhost|' + r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' + r'(?::\d+)?' + r'(?:/?|[/?]\S+)$', + re.IGNORECASE) + if p.match(url): + return True + else: + return False + + def validate_services(self, commands): + """Validate services. + + Verify the specified services are running on the corresponding + service units. 
+ """ + for k, v in six.iteritems(commands): + for cmd in v: + output, code = k.run(cmd) + if code != 0: + return "command `{}` returned {}".format(cmd, str(code)) + return None + + def _get_config(self, unit, filename): + """Get a ConfigParser object for parsing a unit's config file.""" + file_contents = unit.file_contents(filename) + config = ConfigParser.ConfigParser() + config.readfp(io.StringIO(file_contents)) + return config + + def validate_config_data(self, sentry_unit, config_file, section, + expected): + """Validate config file data. + + Verify that the specified section of the config file contains + the expected option key:value pairs. + """ + config = self._get_config(sentry_unit, config_file) + + if section != 'DEFAULT' and not config.has_section(section): + return "section [{}] does not exist".format(section) + + for k in expected.keys(): + if not config.has_option(section, k): + return "section [{}] is missing option {}".format(section, k) + if config.get(section, k) != expected[k]: + return "section [{}] {}:{} != expected {}:{}".format( + section, k, config.get(section, k), k, expected[k]) + return None + + def _validate_dict_data(self, expected, actual): + """Validate dictionary data. + + Compare expected dictionary data vs actual dictionary data. + The values in the 'expected' dictionary can be strings, bools, ints, + longs, or can be a function that evaluate a variable and returns a + bool. 
+ """ + self.log.debug('actual: {}'.format(repr(actual))) + self.log.debug('expected: {}'.format(repr(expected))) + + for k, v in six.iteritems(expected): + if k in actual: + if (isinstance(v, six.string_types) or + isinstance(v, bool) or + isinstance(v, six.integer_types)): + if v != actual[k]: + return "{}:{}".format(k, actual[k]) + elif not v(actual[k]): + return "{}:{}".format(k, actual[k]) + else: + return "key '{}' does not exist".format(k) + return None + + def validate_relation_data(self, sentry_unit, relation, expected): + """Validate actual relation data based on expected relation data.""" + actual = sentry_unit.relation(relation[0], relation[1]) + return self._validate_dict_data(expected, actual) + + def _validate_list_data(self, expected, actual): + """Compare expected list vs actual list data.""" + for e in expected: + if e not in actual: + return "expected item {} not found in actual list".format(e) + return None + + def not_null(self, string): + if string is not None: + return True + else: + return False + + def _get_file_mtime(self, sentry_unit, filename): + """Get last modification time of file.""" + return sentry_unit.file_stat(filename)['mtime'] + + def _get_dir_mtime(self, sentry_unit, directory): + """Get last modification time of directory.""" + return sentry_unit.directory_stat(directory)['mtime'] + + def _get_proc_start_time(self, sentry_unit, service, pgrep_full=False): + """Get process' start time. + + Determine start time of the process based on the last modification + time of the /proc/pid directory. If pgrep_full is True, the process + name is matched against the full command line. 
+ """ + if pgrep_full: + cmd = 'pgrep -o -f {}'.format(service) + else: + cmd = 'pgrep -o {}'.format(service) + cmd = cmd + ' | grep -v pgrep || exit 0' + cmd_out = sentry_unit.run(cmd) + self.log.debug('CMDout: ' + str(cmd_out)) + if cmd_out[0]: + self.log.debug('Pid for %s %s' % (service, str(cmd_out[0]))) + proc_dir = '/proc/{}'.format(cmd_out[0].strip()) + return self._get_dir_mtime(sentry_unit, proc_dir) + + def service_restarted(self, sentry_unit, service, filename, + pgrep_full=False, sleep_time=20): + """Check if service was restarted. + + Compare a service's start time vs a file's last modification time + (such as a config file for that service) to determine if the service + has been restarted. + """ + time.sleep(sleep_time) + if (self._get_proc_start_time(sentry_unit, service, pgrep_full) >= + self._get_file_mtime(sentry_unit, filename)): + return True + else: + return False + + def service_restarted_since(self, sentry_unit, mtime, service, + pgrep_full=False, sleep_time=20, + retry_count=2): + """Check if service was been started after a given time. + + Args: + sentry_unit (sentry): The sentry unit to check for the service on + mtime (float): The epoch time to check against + service (string): service name to look for in process table + pgrep_full (boolean): Use full command line search mode with pgrep + sleep_time (int): Seconds to sleep before looking for process + retry_count (int): If service is not found, how many times to retry + + Returns: + bool: True if service found and its start time it newer than mtime, + False if service is older than mtime or if service was + not found. 
+ """ + self.log.debug('Checking %s restarted since %s' % (service, mtime)) + time.sleep(sleep_time) + proc_start_time = self._get_proc_start_time(sentry_unit, service, + pgrep_full) + while retry_count > 0 and not proc_start_time: + self.log.debug('No pid file found for service %s, will retry %i ' + 'more times' % (service, retry_count)) + time.sleep(30) + proc_start_time = self._get_proc_start_time(sentry_unit, service, + pgrep_full) + retry_count = retry_count - 1 + + if not proc_start_time: + self.log.warn('No proc start time found, assuming service did ' + 'not start') + return False + if proc_start_time >= mtime: + self.log.debug('proc start time is newer than provided mtime' + '(%s >= %s)' % (proc_start_time, mtime)) + return True + else: + self.log.warn('proc start time (%s) is older than provided mtime ' + '(%s), service did not restart' % (proc_start_time, + mtime)) + return False + + def config_updated_since(self, sentry_unit, filename, mtime, + sleep_time=20): + """Check if file was modified after a given time. 
+ + Args: + sentry_unit (sentry): The sentry unit to check the file mtime on + filename (string): The file to check mtime of + mtime (float): The epoch time to check against + sleep_time (int): Seconds to sleep before looking for process + + Returns: + bool: True if file was modified more recently than mtime, False if + file was modified before mtime, + """ + self.log.debug('Checking %s updated since %s' % (filename, mtime)) + time.sleep(sleep_time) + file_mtime = self._get_file_mtime(sentry_unit, filename) + if file_mtime >= mtime: + self.log.debug('File mtime is newer than provided mtime ' + '(%s >= %s)' % (file_mtime, mtime)) + return True + else: + self.log.warn('File mtime %s is older than provided mtime %s' + % (file_mtime, mtime)) + return False + + def validate_service_config_changed(self, sentry_unit, mtime, service, + filename, pgrep_full=False, + sleep_time=20, retry_count=2): + """Check service and file were updated after mtime + + Args: + sentry_unit (sentry): The sentry unit to check for the service on + mtime (float): The epoch time to check against + service (string): service name to look for in process table + filename (string): The file to check mtime of + pgrep_full (boolean): Use full command line search mode with pgrep + sleep_time (int): Seconds to sleep before looking for process + retry_count (int): If service is not found, how many times to retry + + Typical Usage: + u = OpenStackAmuletUtils(ERROR) + ... + mtime = u.get_sentry_time(self.cinder_sentry) + self.d.configure('cinder', {'verbose': 'True', 'debug': 'True'}) + if not u.validate_service_config_changed(self.cinder_sentry, + mtime, + 'cinder-api', + '/etc/cinder/cinder.conf') + amulet.raise_status(amulet.FAIL, msg='update failed') + Returns: + bool: True if both service and file where updated/restarted after + mtime, False if service is older than mtime or if service was + not found or if filename was modified before mtime. 
+ """ + self.log.debug('Checking %s restarted since %s' % (service, mtime)) + time.sleep(sleep_time) + service_restart = self.service_restarted_since(sentry_unit, mtime, + service, + pgrep_full=pgrep_full, + sleep_time=0, + retry_count=retry_count) + config_update = self.config_updated_since(sentry_unit, filename, mtime, + sleep_time=0) + return service_restart and config_update + + def get_sentry_time(self, sentry_unit): + """Return current epoch time on a sentry""" + cmd = "date +'%s'" + return float(sentry_unit.run(cmd)[0]) + + def relation_error(self, name, data): + return 'unexpected relation data in {} - {}'.format(name, data) + + def endpoint_error(self, name, data): + return 'unexpected endpoint data in {} - {}'.format(name, data) diff --git a/tests/charmhelpers/contrib/openstack/__init__.py b/tests/charmhelpers/contrib/openstack/__init__.py new file mode 100644 index 0000000..d1400a0 --- /dev/null +++ b/tests/charmhelpers/contrib/openstack/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see . diff --git a/tests/charmhelpers/contrib/openstack/amulet/__init__.py b/tests/charmhelpers/contrib/openstack/amulet/__init__.py new file mode 100644 index 0000000..d1400a0 --- /dev/null +++ b/tests/charmhelpers/contrib/openstack/amulet/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. 
+# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see <http://www.gnu.org/licenses/>. diff --git a/tests/charmhelpers/contrib/openstack/amulet/deployment.py b/tests/charmhelpers/contrib/openstack/amulet/deployment.py new file mode 100644 index 0000000..fef9638 --- /dev/null +++ b/tests/charmhelpers/contrib/openstack/amulet/deployment.py @@ -0,0 +1,134 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. +# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see <http://www.gnu.org/licenses/>. + +import six +from collections import OrderedDict +from charmhelpers.contrib.amulet.deployment import ( + AmuletDeployment +) + + +class OpenStackAmuletDeployment(AmuletDeployment): + """OpenStack amulet deployment. + + This class inherits from AmuletDeployment and has additional support + that is specifically for use by OpenStack charms. 
+ """ + + def __init__(self, series=None, openstack=None, source=None, stable=True): + """Initialize the deployment environment.""" + super(OpenStackAmuletDeployment, self).__init__(series) + self.openstack = openstack + self.source = source + self.stable = stable + # Note(coreycb): this needs to be changed when new next branches come + # out. + self.current_next = "trusty" + + def _determine_branch_locations(self, other_services): + """Determine the branch locations for the other services. + + Determine if the local branch being tested is derived from its + stable or next (dev) branch, and based on this, use the corresonding + stable or next branches for the other_services.""" + base_charms = ['mysql', 'mongodb'] + + if self.stable: + for svc in other_services: + temp = 'lp:charms/{}' + svc['location'] = temp.format(svc['name']) + else: + for svc in other_services: + if svc['name'] in base_charms: + temp = 'lp:charms/{}' + svc['location'] = temp.format(svc['name']) + else: + temp = 'lp:~openstack-charmers/charms/{}/{}/next' + svc['location'] = temp.format(self.current_next, + svc['name']) + return other_services + + def _add_services(self, this_service, other_services): + """Add services to the deployment and set openstack-origin/source.""" + other_services = self._determine_branch_locations(other_services) + + super(OpenStackAmuletDeployment, self)._add_services(this_service, + other_services) + + services = other_services + services.append(this_service) + use_source = ['mysql', 'mongodb', 'rabbitmq-server', 'ceph', + 'ceph-osd', 'ceph-radosgw'] + # Openstack subordinate charms do not expose an origin option as that + # is controlled by the principle + ignore = ['neutron-openvswitch'] + + if self.openstack: + for svc in services: + if svc['name'] not in use_source + ignore: + config = {'openstack-origin': self.openstack} + self.d.configure(svc['name'], config) + + if self.source: + for svc in services: + if svc['name'] in use_source and svc['name'] not in ignore: 
+ config = {'source': self.source} + self.d.configure(svc['name'], config) + + def _configure_services(self, configs): + """Configure all of the services.""" + for service, config in six.iteritems(configs): + self.d.configure(service, config) + + def _get_openstack_release(self): + """Get openstack release. + + Return an integer representing the enum value of the openstack + release. + """ + (self.precise_essex, self.precise_folsom, self.precise_grizzly, + self.precise_havana, self.precise_icehouse, + self.trusty_icehouse, self.trusty_juno, self.trusty_kilo) = range(8) + releases = { + ('precise', None): self.precise_essex, + ('precise', 'cloud:precise-folsom'): self.precise_folsom, + ('precise', 'cloud:precise-grizzly'): self.precise_grizzly, + ('precise', 'cloud:precise-havana'): self.precise_havana, + ('precise', 'cloud:precise-icehouse'): self.precise_icehouse, + ('trusty', None): self.trusty_icehouse, + ('trusty', 'cloud:trusty-juno'): self.trusty_juno, + ('trusty', 'cloud:trusty-kilo'): self.trusty_kilo} + return releases[(self.series, self.openstack)] + + def _get_openstack_release_string(self): + """Get openstack release string. + + Return a string representing the openstack release. + """ + releases = OrderedDict([ + ('precise', 'essex'), + ('quantal', 'folsom'), + ('raring', 'grizzly'), + ('saucy', 'havana'), + ('trusty', 'icehouse'), + ('utopic', 'juno'), + ('vivid', 'kilo'), + ]) + if self.openstack: + os_origin = self.openstack.split(':')[1] + return os_origin.split('%s-' % self.series)[1].split('/')[0] + else: + return releases[self.series] diff --git a/tests/charmhelpers/contrib/openstack/amulet/utils.py b/tests/charmhelpers/contrib/openstack/amulet/utils.py new file mode 100644 index 0000000..9c3d918 --- /dev/null +++ b/tests/charmhelpers/contrib/openstack/amulet/utils.py @@ -0,0 +1,294 @@ +# Copyright 2014-2015 Canonical Limited. +# +# This file is part of charm-helpers. 
+# +# charm-helpers is free software: you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License version 3 as +# published by the Free Software Foundation. +# +# charm-helpers is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with charm-helpers. If not, see <http://www.gnu.org/licenses/>. + +import logging +import os +import time +import urllib + +import glanceclient.v1.client as glance_client +import keystoneclient.v2_0 as keystone_client +import novaclient.v1_1.client as nova_client + +import six + +from charmhelpers.contrib.amulet.utils import ( + AmuletUtils +) + +DEBUG = logging.DEBUG +ERROR = logging.ERROR + + +class OpenStackAmuletUtils(AmuletUtils): + """OpenStack amulet utilities. + + This class inherits from AmuletUtils and has additional support + that is specifically for use by OpenStack charms. + """ + + def __init__(self, log_level=ERROR): + """Initialize the deployment environment.""" + super(OpenStackAmuletUtils, self).__init__(log_level) + + def validate_endpoint_data(self, endpoints, admin_port, internal_port, + public_port, expected): + """Validate endpoint data. + + Validate actual endpoint data vs expected endpoint data. The ports + are used to find the matching endpoint. 
+ """ + found = False + for ep in endpoints: + self.log.debug('endpoint: {}'.format(repr(ep))) + if (admin_port in ep.adminurl and + internal_port in ep.internalurl and + public_port in ep.publicurl): + found = True + actual = {'id': ep.id, + 'region': ep.region, + 'adminurl': ep.adminurl, + 'internalurl': ep.internalurl, + 'publicurl': ep.publicurl, + 'service_id': ep.service_id} + ret = self._validate_dict_data(expected, actual) + if ret: + return 'unexpected endpoint data - {}'.format(ret) + + if not found: + return 'endpoint not found' + + def validate_svc_catalog_endpoint_data(self, expected, actual): + """Validate service catalog endpoint data. + + Validate a list of actual service catalog endpoints vs a list of + expected service catalog endpoints. + """ + self.log.debug('actual: {}'.format(repr(actual))) + for k, v in six.iteritems(expected): + if k in actual: + ret = self._validate_dict_data(expected[k][0], actual[k][0]) + if ret: + return self.endpoint_error(k, ret) + else: + return "endpoint {} does not exist".format(k) + return ret + + def validate_tenant_data(self, expected, actual): + """Validate tenant data. + + Validate a list of actual tenant data vs list of expected tenant + data. + """ + self.log.debug('actual: {}'.format(repr(actual))) + for e in expected: + found = False + for act in actual: + a = {'enabled': act.enabled, 'description': act.description, + 'name': act.name, 'id': act.id} + if e['name'] == a['name']: + found = True + ret = self._validate_dict_data(e, a) + if ret: + return "unexpected tenant data - {}".format(ret) + if not found: + return "tenant {} does not exist".format(e['name']) + return ret + + def validate_role_data(self, expected, actual): + """Validate role data. + + Validate a list of actual role data vs a list of expected role + data. 
+ """ + self.log.debug('actual: {}'.format(repr(actual))) + for e in expected: + found = False + for act in actual: + a = {'name': act.name, 'id': act.id} + if e['name'] == a['name']: + found = True + ret = self._validate_dict_data(e, a) + if ret: + return "unexpected role data - {}".format(ret) + if not found: + return "role {} does not exist".format(e['name']) + return ret + + def validate_user_data(self, expected, actual): + """Validate user data. + + Validate a list of actual user data vs a list of expected user + data. + """ + self.log.debug('actual: {}'.format(repr(actual))) + for e in expected: + found = False + for act in actual: + a = {'enabled': act.enabled, 'name': act.name, + 'email': act.email, 'tenantId': act.tenantId, + 'id': act.id} + if e['name'] == a['name']: + found = True + ret = self._validate_dict_data(e, a) + if ret: + return "unexpected user data - {}".format(ret) + if not found: + return "user {} does not exist".format(e['name']) + return ret + + def validate_flavor_data(self, expected, actual): + """Validate flavor data. + + Validate a list of actual flavors vs a list of expected flavors. 
+ """ + self.log.debug('actual: {}'.format(repr(actual))) + act = [a.name for a in actual] + return self._validate_list_data(expected, act) + + def tenant_exists(self, keystone, tenant): + """Return True if tenant exists.""" + return tenant in [t.name for t in keystone.tenants.list()] + + def authenticate_keystone_admin(self, keystone_sentry, user, password, + tenant): + """Authenticates admin user with the keystone admin endpoint.""" + unit = keystone_sentry + service_ip = unit.relation('shared-db', + 'mysql:shared-db')['private-address'] + ep = "http://{}:35357/v2.0".format(service_ip.strip().decode('utf-8')) + return keystone_client.Client(username=user, password=password, + tenant_name=tenant, auth_url=ep) + + def authenticate_keystone_user(self, keystone, user, password, tenant): + """Authenticates a regular user with the keystone public endpoint.""" + ep = keystone.service_catalog.url_for(service_type='identity', + endpoint_type='publicURL') + return keystone_client.Client(username=user, password=password, + tenant_name=tenant, auth_url=ep) + + def authenticate_glance_admin(self, keystone): + """Authenticates admin user with glance.""" + ep = keystone.service_catalog.url_for(service_type='image', + endpoint_type='adminURL') + return glance_client.Client(ep, token=keystone.auth_token) + + def authenticate_nova_user(self, keystone, user, password, tenant): + """Authenticates a regular user with nova-api.""" + ep = keystone.service_catalog.url_for(service_type='identity', + endpoint_type='publicURL') + return nova_client.Client(username=user, api_key=password, + project_id=tenant, auth_url=ep) + + def create_cirros_image(self, glance, image_name): + """Download the latest cirros image and upload it to glance.""" + http_proxy = os.getenv('AMULET_HTTP_PROXY') + self.log.debug('AMULET_HTTP_PROXY: {}'.format(http_proxy)) + if http_proxy: + proxies = {'http': http_proxy} + opener = urllib.FancyURLopener(proxies) + else: + opener = urllib.FancyURLopener() + + f = 
opener.open("http://download.cirros-cloud.net/version/released") + version = f.read().strip() + cirros_img = "cirros-{}-x86_64-disk.img".format(version) + local_path = os.path.join('tests', cirros_img) + + if not os.path.exists(local_path): + cirros_url = "http://{}/{}/{}".format("download.cirros-cloud.net", + version, cirros_img) + opener.retrieve(cirros_url, local_path) + f.close() + + with open(local_path) as f: + image = glance.images.create(name=image_name, is_public=True, + disk_format='qcow2', + container_format='bare', data=f) + count = 1 + status = image.status + while status != 'active' and count < 10: + time.sleep(3) + image = glance.images.get(image.id) + status = image.status + self.log.debug('image status: {}'.format(status)) + count += 1 + + if status != 'active': + self.log.error('image creation timed out') + return None + + return image + + def delete_image(self, glance, image): + """Delete the specified image.""" + num_before = len(list(glance.images.list())) + glance.images.delete(image) + + count = 1 + num_after = len(list(glance.images.list())) + while num_after != (num_before - 1) and count < 10: + time.sleep(3) + num_after = len(list(glance.images.list())) + self.log.debug('number of images: {}'.format(num_after)) + count += 1 + + if num_after != (num_before - 1): + self.log.error('image deletion timed out') + return False + + return True + + def create_instance(self, nova, image_name, instance_name, flavor): + """Create the specified instance.""" + image = nova.images.find(name=image_name) + flavor = nova.flavors.find(name=flavor) + instance = nova.servers.create(name=instance_name, image=image, + flavor=flavor) + + count = 1 + status = instance.status + while status != 'ACTIVE' and count < 60: + time.sleep(3) + instance = nova.servers.get(instance.id) + status = instance.status + self.log.debug('instance status: {}'.format(status)) + count += 1 + + if status != 'ACTIVE': + self.log.error('instance creation timed out') + return None + + 
return instance + + def delete_instance(self, nova, instance): + """Delete the specified instance.""" + num_before = len(list(nova.servers.list())) + nova.servers.delete(instance) + + count = 1 + num_after = len(list(nova.servers.list())) + while num_after != (num_before - 1) and count < 10: + time.sleep(3) + num_after = len(list(nova.servers.list())) + self.log.debug('number of instances: {}'.format(num_after)) + count += 1 + + if num_after != (num_before - 1): + self.log.error('instance deletion timed out') + return False + + return True From 231c5fb59fca1ef38b567bb2983bc42faf012966 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Wed, 15 Apr 2015 16:24:59 +0200 Subject: [PATCH 20/22] Add amulet test that runs 'killall -9 mysqld' in the master node --- tests/30-kill-9-mysqld.py | 38 ++++++++++++++++++++++++++++++++++++++ tests/basic_deployment.py | 3 ++- 2 files changed, 40 insertions(+), 1 deletion(-) create mode 100755 tests/30-kill-9-mysqld.py diff --git a/tests/30-kill-9-mysqld.py b/tests/30-kill-9-mysqld.py new file mode 100755 index 0000000..7ba58e9 --- /dev/null +++ b/tests/30-kill-9-mysqld.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# test percona-cluster (3 nodes) + +import basic_deployment +import time + + +class ThreeNode(basic_deployment.BasicDeployment): + def __init__(self): + super(ThreeNode, self).__init__(units=3) + + def run(self): + super(ThreeNode, self).run() + # we are going to kill the master + old_master = self.master_unit + print('kill-9 mysqld in %s' % str(self.master_unit.info)) + self.master_unit.run('sudo killall -9 mysqld') + + print('looking for the new master') + i = 0 + changed = False + while i < 10 and not changed: + i += 1 + time.sleep(5) # give some time to pacemaker to react + new_master = self.find_master() + + if (new_master and new_master.info['unit_name'] != + old_master.info['unit_name']): + changed = True + + assert changed, "The master didn't change" + + assert self.is_port_open(address=self.vip), 'cannot connect to vip' 
+ + +if __name__ == "__main__": + t = ThreeNode() + t.run() diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index 936d3ac..c02ea4d 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -3,6 +3,7 @@ import os import telnetlib import unittest import yaml +from charmhelpers.contrib.amulet.deployment import AmuletDeployment class BasicDeployment(unittest.TestCase): @@ -39,7 +40,7 @@ class BasicDeployment(unittest.TestCase): cfg_percona = {'sst-password': 'ubuntu', 'root-password': 't00r', - 'dataset-size': '128M', + 'dataset-size': '512M', 'vip': self.vip} cfg_ha = {'debug': True, From 1c3a719fd3f64509507211eefbeca83211da40f0 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Fri, 17 Apr 2015 12:04:59 +0200 Subject: [PATCH 21/22] Resync charm helpers tests/ --- tests/charmhelpers/contrib/openstack/amulet/deployment.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/charmhelpers/contrib/openstack/amulet/deployment.py b/tests/charmhelpers/contrib/openstack/amulet/deployment.py index fef9638..11d49a7 100644 --- a/tests/charmhelpers/contrib/openstack/amulet/deployment.py +++ b/tests/charmhelpers/contrib/openstack/amulet/deployment.py @@ -101,7 +101,8 @@ class OpenStackAmuletDeployment(AmuletDeployment): """ (self.precise_essex, self.precise_folsom, self.precise_grizzly, self.precise_havana, self.precise_icehouse, - self.trusty_icehouse, self.trusty_juno, self.trusty_kilo) = range(8) + self.trusty_icehouse, self.trusty_juno, self.trusty_kilo, + self.utopic_juno, self.vivid_kilo) = range(10) releases = { ('precise', None): self.precise_essex, ('precise', 'cloud:precise-folsom'): self.precise_folsom, @@ -110,7 +111,9 @@ class OpenStackAmuletDeployment(AmuletDeployment): ('precise', 'cloud:precise-icehouse'): self.precise_icehouse, ('trusty', None): self.trusty_icehouse, ('trusty', 'cloud:trusty-juno'): self.trusty_juno, - ('trusty', 'cloud:trusty-kilo'): self.trusty_kilo} + ('trusty', 'cloud:trusty-kilo'): 
self.trusty_kilo, + ('utopic', None): self.utopic_juno, + ('vivid', None): self.vivid_kilo} return releases[(self.series, self.openstack)] def _get_openstack_release_string(self): From 2dc9fffcb5717afc66c0f35c71a74fb20148f019 Mon Sep 17 00:00:00 2001 From: Felipe Reyes Date: Fri, 17 Apr 2015 12:05:16 +0200 Subject: [PATCH 22/22] Pull hacluster from next using openstack charm-helpers base class --- tests/basic_deployment.py | 70 +++++++++++++++++++++++++-------------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/tests/basic_deployment.py b/tests/basic_deployment.py index c02ea4d..b97a892 100644 --- a/tests/basic_deployment.py +++ b/tests/basic_deployment.py @@ -1,13 +1,19 @@ import amulet import os +import time import telnetlib import unittest import yaml -from charmhelpers.contrib.amulet.deployment import AmuletDeployment +from charmhelpers.contrib.openstack.amulet.deployment import ( + OpenStackAmuletDeployment +) -class BasicDeployment(unittest.TestCase): - def __init__(self, vip=None, units=1): +class BasicDeployment(OpenStackAmuletDeployment): + def __init__(self, vip=None, units=1, series="trusty", openstack=None, + source=None, stable=False): + super(BasicDeployment, self).__init__(series, openstack, source, + stable) self.units = units self.master_unit = None self.vip = None @@ -25,19 +31,26 @@ class BasicDeployment(unittest.TestCase): ("please set the vip in local.yaml or env var " "AMULET_OS_VIP to run this test suite")) - def run(self): - # The number of seconds to wait for the environment to setup. - seconds = 1200 + def _add_services(self): + """Add services - self.d = amulet.Deployment(series="trusty") - self.d.add('percona-cluster', units=self.units) + Add the services that we're testing, where percona-cluster is local, + and the rest of the service are from lp branches that are + compatible with the local charm (e.g. stable or next). 
+ """ + this_service = {'name': 'percona-cluster', + 'units': self.units} + other_services = [{'name': 'hacluster'}] + super(BasicDeployment, self)._add_services(this_service, + other_services) - # NOTE(freyes): we use hacluster/next, because stable doesn't support - # location rules definition. - self.d.add('hacluster', - charm='lp:~openstack-charmers/charms/trusty/hacluster/next') - self.d.relate('percona-cluster:ha', 'hacluster:ha') + def _add_relations(self): + """Add all of the relations for the services.""" + relations = {'percona-cluster:ha': 'hacluster:ha'} + super(BasicDeployment, self)._add_relations(relations) + def _configure_services(self): + """Configure all of the services.""" cfg_percona = {'sst-password': 'ubuntu', 'root-password': 't00r', 'dataset-size': '512M', @@ -51,19 +64,25 @@ class BasicDeployment(unittest.TestCase): 'y5RRk/wcHakTcWYMwm70upDGJEP00YT3xem3NQy27A' 'C1w=')} - self.d.configure('percona-cluster', cfg_percona) - self.d.configure('hacluster', cfg_ha) + configs = {'percona-cluster': cfg_percona, + 'hacluster': cfg_ha} + super(BasicDeployment, self)._configure_services(configs) - try: - self.d.setup(timeout=seconds) - self.d.sentry.wait(seconds) - except amulet.helpers.TimeoutError: - message = 'The environment did not setup in %d seconds.' % seconds - amulet.raise_status(amulet.SKIP, msg=message) - except: - raise + def run(self): + # The number of seconds to wait for the environment to setup. + seconds = 1200 + + self._add_services() + self._add_relations() + self._configure_services() + self._deploy() + + i = 0 + while i < 30 and not self.master_unit: + self.master_unit = self.find_master() + i += 1 + time.sleep(10) - self.master_unit = self.find_master() assert self.master_unit is not None, 'percona-cluster vip not found' output, code = self.master_unit.run('sudo crm_verify --live-check') @@ -85,7 +104,8 @@ class BasicDeployment(unittest.TestCase): continue # is the vip running here? 
- output, code = unit.run('sudo ip a | grep %s' % self.vip) + output, code = unit.run('sudo ip a | grep "inet %s/"' % self.vip) + print('---') print(unit_id) print(output) if code == 0: