Fix infinite crm loop when pacemaker is down

On corosync restart, corosync may take longer than a minute to come
up, and the systemd start script times out too soon. Pacemaker, which
depends on corosync, is then started immediately and fails because
corosync is still in the process of starting.

Subsequently the charm runs 'crm node list' to validate pacemaker;
because that command never succeeds, this becomes an infinite loop.

This change adds longer, configurable timeout values for the corosync
and pacemaker systemd units and improves error handling and status
reporting for the end user.

Change-Id: I7c3d018a03fddfb1f6bfd91fd7aeed4b13879e45
Partial-Bug: #1654403
David Ames 2017-01-11 16:00:39 -08:00
parent 3f2f528cb1
commit fda5176bd5
9 changed files with 317 additions and 12 deletions
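The heart of the change is replacing the unbounded polling loop with a bounded retry around 'crm node list' that raises a new ServicesNotUp exception once the retry budget is exhausted. Below is a simplified, Python 3 sketch of that pattern for illustration only; the charm itself uses Python 2's commands.getstatusoutput, as the pcmk.py hunk further down shows.

import socket
import subprocess
import time


class ServicesNotUp(Exception):
    """Raised when corosync/pacemaker do not come up within the retry budget."""


def wait_for_pcmk(retries=12, sleep=10):
    """Poll 'crm node list' until this host appears in the output, or give up."""
    hostname = socket.gethostname()
    output = ''
    for _ in range(retries):
        # The charm calls commands.getstatusoutput("crm node list"); this is a
        # Python 3 equivalent used only for this sketch.
        output = subprocess.run(['crm', 'node', 'list'],
                                capture_output=True, text=True).stdout
        if hostname in output:
            return True
        time.sleep(sleep)
    raise ServicesNotUp("Pacemaker or Corosync are still down after "
                        "{} retries. Last output: {}".format(retries, output))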


@ -29,6 +29,16 @@ To enable HA clustering support (for mysql for example):
The principal charm must have explicit support for the hacluster interface
in order for clustering to occur - otherwise nothing actually gets configured.
# Settings
It is best practice to set cluster_count to the number of expected units in the
cluster. The charm will build the cluster without this setting; however, race
conditions may occur in which one node is not yet aware of the total number of
relations to other hacluster units, leading to failure of the corosync and
pacemaker services to complete startup.
Setting cluster_count helps guarantee that the hacluster charm waits until all
expected peer relations are available before building the corosync cluster.
# HA/Clustering
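The race described in the new README text matters because the charm's status assessment (see the assess_status_helper hunk in utils.py below) only treats the unit as clustered once every hanode relation shows the expected number of peers. A minimal sketch of that gate, with relation_ids and related_units standing in for the charmhelpers hook tools:

def expected_peers_present(relation_ids, related_units, node_count):
    """Return True once every hanode relation shows cluster_count peers.

    node_count is the configured cluster_count; the unit itself counts as
    one, hence the '+ 1'.
    """
    for relid in relation_ids('hanode'):
        if len(related_units(relid)) + 1 < node_count:
            return False
    return True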


@ -53,6 +53,18 @@ options:
.
This configuration element is mandatory and the service will fail on
install if it is not provided. The value must be base64 encoded.
service_start_timeout:
type: int
default: 180
description: |
Systemd override value for the corosync and pacemaker service start timeout, in seconds.
Set the value to -1 to turn off the timeout for the services.
service_stop_timeout:
type: int
default: 60
description: |
Systemd override value for the corosync and pacemaker service stop timeout, in seconds.
Set the value to -1 to turn off the timeout for the services.
stonith_enabled:
type: string
default: 'False'
@ -89,7 +101,8 @@ options:
If less than 3 is specified, the cluster will be configured to
ignore any quorum problems; with 3 or more units, quorum will be
enforced and services will be stopped in the event of a loss
of quorum.
of quorum. It is best practice to set this value to the expected
number of units to avoid potential race conditions.
monitor_host:
type: string
default:
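The two new timeout options are consumed by emit_systemd_overrides_file further down in utils.py, which renders a systemd drop-in for both corosync and pacemaker. A minimal sketch of the value handling, using a hypothetical helper name (_systemd_timeout) that is not part of the change itself:

def _systemd_timeout(value):
    """Map a charm config value to a systemd Timeout{Start,Stop}Sec value.

    A negative value (-1) disables the timeout by passing systemd's
    'infinity'; anything else is used as a timeout in seconds.
    """
    value = int(value)
    return 'infinity' if value < 0 else value


# e.g. with the defaults of 180s start / 60s stop:
context = {'service_start_timeout': _systemd_timeout(180),
           'service_stop_timeout': _systemd_timeout(60)}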


@ -72,6 +72,7 @@ from utils import (
set_unit_status,
ocf_file_exists,
kill_legacy_ocf_daemon_process,
try_pcmk_wait,
)
from charmhelpers.contrib.charmsupport import nrpe
@ -152,7 +153,7 @@ def config_changed():
status_set('maintenance', "Setting up corosync")
if configure_corosync():
pcmk.wait_for_pcmk()
try_pcmk_wait()
configure_cluster_global()
configure_monitor_host()
configure_stonith()
@ -257,7 +258,7 @@ def ha_relation_changed():
# NOTE: this should be removed in 15.04 cycle as corosync
# configuration should be set directly on subordinate
configure_corosync()
pcmk.wait_for_pcmk()
try_pcmk_wait()
configure_cluster_global()
configure_monitor_host()
configure_stonith()


@ -15,6 +15,7 @@
import commands
import subprocess
import socket
import time
from charmhelpers.core.hookenv import (
log,
@ -22,12 +23,23 @@ from charmhelpers.core.hookenv import (
)
def wait_for_pcmk():
class ServicesNotUp(Exception):
pass
def wait_for_pcmk(retries=12, sleep=10):
crm_up = None
hostname = socket.gethostname()
while not crm_up:
for i in range(retries):
if crm_up:
return True
output = commands.getstatusoutput("crm node list")[1]
crm_up = hostname in output
time.sleep(sleep)
if not crm_up:
raise ServicesNotUp("Pacemaker or Corosync are still down after "
"waiting for {} retries. Last output: {}"
"".format(retries, output))
def commit(cmd):


@ -31,6 +31,7 @@ from charmhelpers.core.hookenv import (
local_unit,
log,
DEBUG,
ERROR,
INFO,
WARNING,
relation_get,
@ -54,11 +55,11 @@ from charmhelpers.core.host import (
rsync,
service_start,
service_stop,
service_restart,
service_running,
write_file,
file_hash,
lsb_release
lsb_release,
init_is_systemd,
)
from charmhelpers.fetch import (
apt_install,
@ -104,6 +105,9 @@ COROSYNC_CONF_FILES = [
]
SUPPORTED_TRANSPORTS = ['udp', 'udpu', 'multicast', 'unicast']
SYSTEMD_OVERRIDES_DIR = '/etc/systemd/system/{}.service.d'
SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf'
class MAASConfigIncomplete(Exception):
pass
@ -287,6 +291,41 @@ def get_corosync_conf():
return None
def emit_systemd_overrides_file():
"""Generate the systemd overrides file
With Start and Stop timeout values
Note: (David Ames) Bug#1654403 Work around
May be removed if bug is resolved
If the timeout value is set to -1, pass 'infinity'
"""
if not init_is_systemd():
return
stop_timeout = int(config('service_stop_timeout'))
if stop_timeout < 0:
stop_timeout = 'infinity'
start_timeout = int(config('service_start_timeout'))
if start_timeout < 0:
start_timeout = 'infinity'
systemd_overrides_context = {'service_stop_timeout': stop_timeout,
'service_start_timeout': start_timeout,
}
for service in ['corosync', 'pacemaker']:
overrides_dir = SYSTEMD_OVERRIDES_DIR.format(service)
overrides_file = SYSTEMD_OVERRIDES_FILE.format(overrides_dir)
if not os.path.isdir(overrides_dir):
os.mkdir(overrides_dir)
write_file(path=overrides_file,
content=render_template('systemd-overrides.conf',
systemd_overrides_context))
# Update systemd with the new information
subprocess.check_call(['systemctl', 'daemon-reload'])
def emit_corosync_conf():
corosync_conf_context = get_corosync_conf()
if corosync_conf_context:
@ -534,7 +573,7 @@ def restart_corosync_on_change():
if return_data:
for path in COROSYNC_CONF_FILES:
if checksums[path] != file_hash(path):
restart_corosync()
validated_restart_corosync()
break
return return_data
@ -542,20 +581,88 @@ def restart_corosync_on_change():
return wrap
def try_pcmk_wait():
"""Try pcmk.wait_for_pcmk()
Log results and set status message
"""
try:
pcmk.wait_for_pcmk()
log("Pacemaker is ready", DEBUG)
except pcmk.ServicesNotUp:
msg = ("Pacemaker is down. Please manually start it.")
log(msg, ERROR)
status_set('blocked', msg)
raise pcmk.ServicesNotUp(msg)
@restart_corosync_on_change()
def configure_corosync():
log('Configuring and (maybe) restarting corosync', level=DEBUG)
# David Ames Bug#1654403 Work around
# May be removed if bug is resolved
emit_systemd_overrides_file()
return emit_base_conf() and emit_corosync_conf()
def services_running():
"""Determine if both Corosync and Pacemaker are running
Both from the operating system perspective and with a functional test
@returns boolean
"""
pacemaker_status = service_running("pacemaker")
corosync_status = service_running("corosync")
log("Pacemaker status: {}, Corosync status: {}"
"".format(pacemaker_status, corosync_status),
level=DEBUG)
if not (pacemaker_status and corosync_status):
# OS perspective
return False
else:
# Functional test of pacemaker
return pcmk.wait_for_pcmk()
def validated_restart_corosync(retries=10):
"""Restart and validate Corosync and Pacemaker are in fact up and running.
@param retries: number of attempts to restart the services before giving up
@raises pcmk.ServicesNotUp if after retries services are still not up
"""
for restart in range(retries):
try:
if restart_corosync():
log("Corosync and Pacemaker are validated as up and running.",
INFO)
return
else:
log("Corosync or Pacemaker not validated as up yet, retrying",
WARNING)
except pcmk.ServicesNotUp:
log("Pacemaker failed to start, retrying", WARNING)
continue
msg = ("Corosync and/or Pacemaker failed to restart after {} retries"
"".format(retries))
log(msg, ERROR)
status_set('blocked', msg)
raise pcmk.ServicesNotUp(msg)
def restart_corosync():
if service_running("pacemaker"):
log("Stopping pacemaker", DEBUG)
service_stop("pacemaker")
if not is_unit_paused_set():
service_restart("corosync")
log("Stopping corosync", DEBUG)
service_stop("corosync")
log("Starting corosync", DEBUG)
service_start("corosync")
log("Starting pacemaker", DEBUG)
service_start("pacemaker")
return services_running()
def validate_dns_ha():
"""Validate the DNS HA
@ -715,6 +822,11 @@ def assess_status_helper():
node_count = int(config('cluster_count'))
status = 'active'
message = 'Unit is ready and clustered'
try:
try_pcmk_wait()
except pcmk.ServicesNotUp:
message = 'Pacemaker is down'
status = 'blocked'
for relid in relation_ids('hanode'):
if len(related_units(relid)) + 1 < node_count:
status = 'blocked'


@ -0,0 +1,3 @@
[Service]
TimeoutStartSec={{service_start_timeout}}
TimeoutStopSec={{service_stop_timeout}}


@ -33,6 +33,9 @@ from charmhelpers.contrib.openstack.amulet.utils import (
u = OpenStackAmuletUtils(DEBUG)
seconds_to_wait = 600
# Set number of primary units and cluster-count for hacluster
NUM_UNITS = 3
class HAClusterBasicDeployment(OpenStackAmuletDeployment):
@ -64,7 +67,7 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
this_service = {'name': 'hacluster'}
other_services = [
{'name': 'percona-cluster', 'constraints': {'mem': '3072M'}},
{'name': 'keystone', 'units': 3},
{'name': 'keystone', 'units': NUM_UNITS},
]
super(HAClusterBasicDeployment, self)._add_services(this_service,
other_services)
@ -93,7 +96,8 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
'sst-password': 'ChangeMe123',
}
hacluster_config = {
'debug': 'true'
'debug': 'true',
'cluster_count': NUM_UNITS,
}
configs = {


@ -21,6 +21,7 @@ import tempfile
import unittest
import utils
import pcmk
def write_file(path, content, *args, **kwargs):
@ -31,7 +32,7 @@ def write_file(path, content, *args, **kwargs):
@mock.patch.object(utils, 'log', lambda *args, **kwargs: None)
@mock.patch.object(utils, 'write_file', write_file)
class UtilsTestCase(unittest.TestCase):
class UtilsTestCaseWriteTmp(unittest.TestCase):
def setUp(self):
self.tmpdir = tempfile.mkdtemp()
@ -88,6 +89,9 @@ class UtilsTestCase(unittest.TestCase):
def test_debug_off(self):
self.check_debug(False)
class UtilsTestCase(unittest.TestCase):
@mock.patch.object(utils, 'config')
def test_get_transport(self, mock_config):
mock_config.return_value = 'udp'
@ -218,3 +222,135 @@ class UtilsTestCase(unittest.TestCase):
check_output_mock.return_value = ps_output
utils.kill_legacy_ocf_daemon_process('res_ceilometer_agent_central')
call_mock.assert_called_once_with(['sudo', 'kill', '-9', '11109'])
@mock.patch.object(pcmk, 'wait_for_pcmk')
def test_try_pcmk_wait(self, mock_wait_for_pcmk):
# Returns OK
mock_wait_for_pcmk.side_effect = None
self.assertEquals(None, utils.try_pcmk_wait())
# Raises Exception
mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
with self.assertRaises(pcmk.ServicesNotUp):
utils.try_pcmk_wait()
@mock.patch.object(pcmk, 'wait_for_pcmk')
@mock.patch.object(utils, 'service_running')
def test_services_running(self, mock_service_running,
mock_wait_for_pcmk):
# OS not running
mock_service_running.return_value = False
self.assertFalse(utils.services_running())
# Functional not running
mock_service_running.return_value = True
mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
with self.assertRaises(pcmk.ServicesNotUp):
utils.services_running()
# All running
mock_service_running.return_value = True
mock_wait_for_pcmk.side_effect = None
mock_wait_for_pcmk.return_value = True
self.assertTrue(utils.services_running())
@mock.patch.object(pcmk, 'wait_for_pcmk')
@mock.patch.object(utils, 'restart_corosync')
def test_validated_restart_corosync(self, mock_restart_corosync,
mock_wait_for_pcmk):
# Services are down
mock_restart_corosync.mock_calls = []
mock_restart_corosync.return_value = False
with self.assertRaises(pcmk.ServicesNotUp):
utils.validated_restart_corosync(retries=3)
self.assertEqual(3, len(mock_restart_corosync.mock_calls))
# Services are up
mock_restart_corosync.mock_calls = []
mock_restart_corosync.return_value = True
utils.validated_restart_corosync(retries=10)
self.assertEqual(1, len(mock_restart_corosync.mock_calls))
@mock.patch.object(utils, 'is_unit_paused_set')
@mock.patch.object(utils, 'services_running')
@mock.patch.object(utils, 'service_start')
@mock.patch.object(utils, 'service_stop')
@mock.patch.object(utils, 'service_running')
def test_restart_corosync(self, mock_service_running,
mock_service_stop, mock_service_start,
mock_services_running, mock_is_unit_paused_set):
# PM up, services down
mock_service_running.return_value = True
mock_is_unit_paused_set.return_value = False
mock_services_running.return_value = False
self.assertFalse(utils.restart_corosync())
mock_service_stop.assert_has_calls([mock.call('pacemaker'),
mock.call('corosync')])
mock_service_start.assert_has_calls([mock.call('corosync'),
mock.call('pacemaker')])
# PM already down, services down
mock_service_running.return_value = False
mock_is_unit_paused_set.return_value = False
mock_services_running.return_value = False
self.assertFalse(utils.restart_corosync())
mock_service_stop.assert_has_calls([mock.call('corosync')])
mock_service_start.assert_has_calls([mock.call('corosync'),
mock.call('pacemaker')])
# PM already down, services up
mock_service_running.return_value = True
mock_is_unit_paused_set.return_value = False
mock_services_running.return_value = True
self.assertTrue(utils.restart_corosync())
mock_service_stop.assert_has_calls([mock.call('pacemaker'),
mock.call('corosync')])
mock_service_start.assert_has_calls([mock.call('corosync'),
mock.call('pacemaker')])
@mock.patch.object(subprocess, 'check_call')
@mock.patch.object(utils.os, 'mkdir')
@mock.patch.object(utils.os.path, 'exists')
@mock.patch.object(utils, 'render_template')
@mock.patch.object(utils, 'write_file')
@mock.patch.object(utils, 'is_unit_paused_set')
@mock.patch.object(utils, 'config')
def test_emit_systemd_overrides_file(self, mock_config,
mock_is_unit_paused_set,
mock_write_file, mock_render_template,
mock_path_exists,
mock_mkdir, mock_check_call):
# Normal values
cfg = {'service_stop_timeout': 30,
'service_start_timeout': 90}
mock_config.side_effect = lambda key: cfg.get(key)
mock_is_unit_paused_set.return_value = True
mock_path_exists.return_value = True
utils.emit_systemd_overrides_file()
self.assertEquals(2, len(mock_write_file.mock_calls))
mock_render_template.assert_has_calls(
[mock.call('systemd-overrides.conf', cfg),
mock.call('systemd-overrides.conf', cfg)])
mock_check_call.assert_has_calls([mock.call(['systemctl',
'daemon-reload'])])
mock_write_file.mock_calls = []
mock_render_template.mock_calls = []
mock_check_call.mock_calls = []
# Disable timeout
cfg = {'service_stop_timeout': -1,
'service_start_timeout': -1}
expected_cfg = {'service_stop_timeout': 'infinity',
'service_start_timeout': 'infinity'}
mock_config.side_effect = lambda key: cfg.get(key)
mock_is_unit_paused_set.return_value = True
mock_path_exists.return_value = True
utils.emit_systemd_overrides_file()
self.assertEquals(2, len(mock_write_file.mock_calls))
mock_render_template.assert_has_calls(
[mock.call('systemd-overrides.conf', expected_cfg),
mock.call('systemd-overrides.conf', expected_cfg)])
mock_check_call.assert_has_calls([mock.call(['systemctl',
'daemon-reload'])])


@ -34,3 +34,17 @@ class TestPcmk(unittest.TestCase):
def test_crm_res_running_undefined(self, getstatusoutput):
getstatusoutput.return_value = (1, "foobar")
self.assertFalse(pcmk.crm_res_running('res_nova_consoleauth'))
@mock.patch('socket.gethostname')
@mock.patch('commands.getstatusoutput')
def test_wait_for_pcmk(self, getstatusoutput, gethostname):
# Pacemaker is down
gethostname.return_value = 'hanode-1'
getstatusoutput.return_value = (1, 'Not the hostname')
with self.assertRaises(pcmk.ServicesNotUp):
pcmk.wait_for_pcmk(retries=2, sleep=0)
# Pacemaker is up
gethostname.return_value = 'hanode-1'
getstatusoutput.return_value = (0, 'Hostname: hanode-1')
self.assertTrue(pcmk.wait_for_pcmk(retries=2, sleep=0))