Fix pacemaker down crm infinite loop
On corosync restart, corosync may take longer than a minute to come up. The systemd start script times out too soon. Then pacemaker which is dependent on corosync is immediatly started and fails as corosync is still in the process of starting. Subsequently the charm would run crm node list to validate pacemaker. This would become an infinite loop. This change adds longer timeout values for systemd scripts and adds better error handling and communication to the end user. Change-Id: I7c3d018a03fddfb1f6bfd91fd7aeed4b13879e45 Partial-Bug: #1654403
This commit is contained in:
parent
3f2f528cb1
commit
fda5176bd5
10
README.md
10
README.md
@ -29,6 +29,16 @@ To enable HA clustering support (for mysql for example):
|
||||
The principle charm must have explicit support for the hacluster interface
|
||||
in order for clustering to occur - otherwise nothing actually get configured.
|
||||
|
||||
# Settings
|
||||
|
||||
It is best practice to set cluster_count to the number of expected units in the
|
||||
cluster. The charm will build the cluster without this setting, however, race
|
||||
conditions may occur in which one node is not yet aware of the total number of
|
||||
relations to other hacluster units, leading to failure of the corosync and
|
||||
pacemaker services to complete startup.
|
||||
|
||||
Setting cluster_count helps guarantee the hacluster charm waits until all
|
||||
expected peer relations are available before building the corosync cluster.
|
||||
|
||||
# HA/Clustering
|
||||
|
||||
|
15
config.yaml
15
config.yaml
@ -53,6 +53,18 @@ options:
|
||||
.
|
||||
This configuration element is mandatory and the service will fail on
|
||||
install if it is not provided. The value must be base64 encoded.
|
||||
service_start_timeout:
|
||||
type: int
|
||||
default: 180
|
||||
description: |
|
||||
Systemd override value for corosync and pacemaker service start timeout in seconds.
|
||||
Set value to -1 turn off timeout for the services.
|
||||
service_stop_timeout:
|
||||
type: int
|
||||
default: 60
|
||||
description: |
|
||||
Systemd override value for corosync and pacemaker service stop timeout in seconds.
|
||||
Set value to -1 turn off timeout for the services.
|
||||
stonith_enabled:
|
||||
type: string
|
||||
default: 'False'
|
||||
@ -89,7 +101,8 @@ options:
|
||||
If less that 3 is specified, the cluster will be configured to
|
||||
ignore any quorum problems; with 3 or more units, quorum will be
|
||||
enforced and services will be stopped in the event of a loss
|
||||
of quorum.
|
||||
of quorum. It is best practice to set this value to the expected
|
||||
number of units to avoid potential race conditions.
|
||||
monitor_host:
|
||||
type: string
|
||||
default:
|
||||
|
@ -72,6 +72,7 @@ from utils import (
|
||||
set_unit_status,
|
||||
ocf_file_exists,
|
||||
kill_legacy_ocf_daemon_process,
|
||||
try_pcmk_wait,
|
||||
)
|
||||
|
||||
from charmhelpers.contrib.charmsupport import nrpe
|
||||
@ -152,7 +153,7 @@ def config_changed():
|
||||
|
||||
status_set('maintenance', "Setting up corosync")
|
||||
if configure_corosync():
|
||||
pcmk.wait_for_pcmk()
|
||||
try_pcmk_wait()
|
||||
configure_cluster_global()
|
||||
configure_monitor_host()
|
||||
configure_stonith()
|
||||
@ -257,7 +258,7 @@ def ha_relation_changed():
|
||||
# NOTE: this should be removed in 15.04 cycle as corosync
|
||||
# configuration should be set directly on subordinate
|
||||
configure_corosync()
|
||||
pcmk.wait_for_pcmk()
|
||||
try_pcmk_wait()
|
||||
configure_cluster_global()
|
||||
configure_monitor_host()
|
||||
configure_stonith()
|
||||
|
@ -15,6 +15,7 @@
|
||||
import commands
|
||||
import subprocess
|
||||
import socket
|
||||
import time
|
||||
|
||||
from charmhelpers.core.hookenv import (
|
||||
log,
|
||||
@ -22,12 +23,23 @@ from charmhelpers.core.hookenv import (
|
||||
)
|
||||
|
||||
|
||||
def wait_for_pcmk():
|
||||
class ServicesNotUp(Exception):
|
||||
pass
|
||||
|
||||
|
||||
def wait_for_pcmk(retries=12, sleep=10):
|
||||
crm_up = None
|
||||
hostname = socket.gethostname()
|
||||
while not crm_up:
|
||||
for i in range(retries):
|
||||
if crm_up:
|
||||
return True
|
||||
output = commands.getstatusoutput("crm node list")[1]
|
||||
crm_up = hostname in output
|
||||
time.sleep(sleep)
|
||||
if not crm_up:
|
||||
raise ServicesNotUp("Pacemaker or Corosync are still down after "
|
||||
"waiting for {} retries. Last output: {}"
|
||||
"".format(retries, output))
|
||||
|
||||
|
||||
def commit(cmd):
|
||||
|
120
hooks/utils.py
120
hooks/utils.py
@ -31,6 +31,7 @@ from charmhelpers.core.hookenv import (
|
||||
local_unit,
|
||||
log,
|
||||
DEBUG,
|
||||
ERROR,
|
||||
INFO,
|
||||
WARNING,
|
||||
relation_get,
|
||||
@ -54,11 +55,11 @@ from charmhelpers.core.host import (
|
||||
rsync,
|
||||
service_start,
|
||||
service_stop,
|
||||
service_restart,
|
||||
service_running,
|
||||
write_file,
|
||||
file_hash,
|
||||
lsb_release
|
||||
lsb_release,
|
||||
init_is_systemd,
|
||||
)
|
||||
from charmhelpers.fetch import (
|
||||
apt_install,
|
||||
@ -104,6 +105,9 @@ COROSYNC_CONF_FILES = [
|
||||
]
|
||||
SUPPORTED_TRANSPORTS = ['udp', 'udpu', 'multicast', 'unicast']
|
||||
|
||||
SYSTEMD_OVERRIDES_DIR = '/etc/systemd/system/{}.service.d'
|
||||
SYSTEMD_OVERRIDES_FILE = '{}/overrides.conf'
|
||||
|
||||
|
||||
class MAASConfigIncomplete(Exception):
|
||||
pass
|
||||
@ -287,6 +291,41 @@ def get_corosync_conf():
|
||||
return None
|
||||
|
||||
|
||||
def emit_systemd_overrides_file():
|
||||
"""Generate the systemd overrides file
|
||||
With Start and Stop timeout values
|
||||
Note: (David Ames) Bug#1654403 Work around
|
||||
May be removed if bug is resolved
|
||||
If timeout value is set to -1 pass infinity
|
||||
"""
|
||||
if not init_is_systemd():
|
||||
return
|
||||
|
||||
stop_timeout = int(config('service_stop_timeout'))
|
||||
if stop_timeout < 0:
|
||||
stop_timeout = 'infinity'
|
||||
start_timeout = int(config('service_start_timeout'))
|
||||
if start_timeout < 0:
|
||||
start_timeout = 'infinity'
|
||||
|
||||
systemd_overrides_context = {'service_stop_timeout': stop_timeout,
|
||||
'service_start_timeout': start_timeout,
|
||||
}
|
||||
|
||||
for service in ['corosync', 'pacemaker']:
|
||||
overrides_dir = SYSTEMD_OVERRIDES_DIR.format(service)
|
||||
overrides_file = SYSTEMD_OVERRIDES_FILE.format(overrides_dir)
|
||||
if not os.path.isdir(overrides_dir):
|
||||
os.mkdir(overrides_dir)
|
||||
|
||||
write_file(path=overrides_file,
|
||||
content=render_template('systemd-overrides.conf',
|
||||
systemd_overrides_context))
|
||||
|
||||
# Update systemd with the new information
|
||||
subprocess.check_call(['systemctl', 'daemon-reload'])
|
||||
|
||||
|
||||
def emit_corosync_conf():
|
||||
corosync_conf_context = get_corosync_conf()
|
||||
if corosync_conf_context:
|
||||
@ -534,7 +573,7 @@ def restart_corosync_on_change():
|
||||
if return_data:
|
||||
for path in COROSYNC_CONF_FILES:
|
||||
if checksums[path] != file_hash(path):
|
||||
restart_corosync()
|
||||
validated_restart_corosync()
|
||||
break
|
||||
|
||||
return return_data
|
||||
@ -542,20 +581,88 @@ def restart_corosync_on_change():
|
||||
return wrap
|
||||
|
||||
|
||||
def try_pcmk_wait():
|
||||
"""Try pcmk.wait_for_pcmk()
|
||||
Log results and set status message
|
||||
"""
|
||||
try:
|
||||
pcmk.wait_for_pcmk()
|
||||
log("Pacemaker is ready", DEBUG)
|
||||
except pcmk.ServicesNotUp:
|
||||
msg = ("Pacemaker is down. Please manually start it.")
|
||||
log(msg, ERROR)
|
||||
status_set('blocked', msg)
|
||||
raise pcmk.ServicesNotUp(msg)
|
||||
|
||||
|
||||
@restart_corosync_on_change()
|
||||
def configure_corosync():
|
||||
log('Configuring and (maybe) restarting corosync', level=DEBUG)
|
||||
# David Ames Bug#1654403 Work around
|
||||
# May be removed if bug is resolved
|
||||
emit_systemd_overrides_file()
|
||||
return emit_base_conf() and emit_corosync_conf()
|
||||
|
||||
|
||||
def services_running():
|
||||
"""Determine if both Corosync and Pacemaker are running
|
||||
Both from the operating system perspective and with a functional test
|
||||
@returns boolean
|
||||
"""
|
||||
pacemaker_status = service_running("pacemaker")
|
||||
corosync_status = service_running("corosync")
|
||||
log("Pacemaker status: {}, Corosync status: {}"
|
||||
"".format(pacemaker_status, corosync_status),
|
||||
level=DEBUG)
|
||||
if not (pacemaker_status and corosync_status):
|
||||
# OS perspective
|
||||
return False
|
||||
else:
|
||||
# Functional test of pacemaker
|
||||
return pcmk.wait_for_pcmk()
|
||||
|
||||
|
||||
def validated_restart_corosync(retries=10):
|
||||
"""Restart and validate Corosync and Pacemaker are in fact up and running.
|
||||
|
||||
@param retries: number of attempts to restart the services before giving up
|
||||
@raises pcmk.ServicesNotUp if after retries services are still not up
|
||||
"""
|
||||
for restart in range(retries):
|
||||
try:
|
||||
if restart_corosync():
|
||||
log("Corosync and Pacemaker are validated as up and running.",
|
||||
INFO)
|
||||
return
|
||||
else:
|
||||
log("Corosync or Pacemaker not validated as up yet, retrying",
|
||||
WARNING)
|
||||
except pcmk.ServicesNotUp:
|
||||
log("Pacemaker failed to start, retrying", WARNING)
|
||||
continue
|
||||
|
||||
msg = ("Corosync and/or Pacemaker failed to restart after {} retries"
|
||||
"".format(retries))
|
||||
log(msg, ERROR)
|
||||
status_set('blocked', msg)
|
||||
raise pcmk.ServicesNotUp(msg)
|
||||
|
||||
|
||||
def restart_corosync():
|
||||
if service_running("pacemaker"):
|
||||
log("Stopping pacemaker", DEBUG)
|
||||
service_stop("pacemaker")
|
||||
|
||||
if not is_unit_paused_set():
|
||||
service_restart("corosync")
|
||||
log("Stopping corosync", DEBUG)
|
||||
service_stop("corosync")
|
||||
log("Starting corosync", DEBUG)
|
||||
service_start("corosync")
|
||||
log("Starting pacemaker", DEBUG)
|
||||
service_start("pacemaker")
|
||||
|
||||
return services_running()
|
||||
|
||||
|
||||
def validate_dns_ha():
|
||||
"""Validate the DNS HA
|
||||
@ -715,6 +822,11 @@ def assess_status_helper():
|
||||
node_count = int(config('cluster_count'))
|
||||
status = 'active'
|
||||
message = 'Unit is ready and clustered'
|
||||
try:
|
||||
try_pcmk_wait()
|
||||
except pcmk.ServicesNotUp:
|
||||
message = 'Pacemaker is down'
|
||||
status = 'blocked'
|
||||
for relid in relation_ids('hanode'):
|
||||
if len(related_units(relid)) + 1 < node_count:
|
||||
status = 'blocked'
|
||||
|
3
templates/systemd-overrides.conf
Normal file
3
templates/systemd-overrides.conf
Normal file
@ -0,0 +1,3 @@
|
||||
[Service]
|
||||
TimeoutStartSec={{service_start_timeout}}
|
||||
TimeoutStopSec={{service_stop_timeout}}
|
@ -33,6 +33,9 @@ from charmhelpers.contrib.openstack.amulet.utils import (
|
||||
u = OpenStackAmuletUtils(DEBUG)
|
||||
seconds_to_wait = 600
|
||||
|
||||
# Set number of primary units and cluster-count for hacluster
|
||||
NUM_UNITS = 3
|
||||
|
||||
|
||||
class HAClusterBasicDeployment(OpenStackAmuletDeployment):
|
||||
|
||||
@ -64,7 +67,7 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
|
||||
this_service = {'name': 'hacluster'}
|
||||
other_services = [
|
||||
{'name': 'percona-cluster', 'constraints': {'mem': '3072M'}},
|
||||
{'name': 'keystone', 'units': 3},
|
||||
{'name': 'keystone', 'units': NUM_UNITS},
|
||||
]
|
||||
super(HAClusterBasicDeployment, self)._add_services(this_service,
|
||||
other_services)
|
||||
@ -93,7 +96,8 @@ class HAClusterBasicDeployment(OpenStackAmuletDeployment):
|
||||
'sst-password': 'ChangeMe123',
|
||||
}
|
||||
hacluster_config = {
|
||||
'debug': 'true'
|
||||
'debug': 'true',
|
||||
'cluster_count': NUM_UNITS,
|
||||
}
|
||||
|
||||
configs = {
|
||||
|
@ -21,6 +21,7 @@ import tempfile
|
||||
import unittest
|
||||
|
||||
import utils
|
||||
import pcmk
|
||||
|
||||
|
||||
def write_file(path, content, *args, **kwargs):
|
||||
@ -31,7 +32,7 @@ def write_file(path, content, *args, **kwargs):
|
||||
|
||||
@mock.patch.object(utils, 'log', lambda *args, **kwargs: None)
|
||||
@mock.patch.object(utils, 'write_file', write_file)
|
||||
class UtilsTestCase(unittest.TestCase):
|
||||
class UtilsTestCaseWriteTmp(unittest.TestCase):
|
||||
|
||||
def setUp(self):
|
||||
self.tmpdir = tempfile.mkdtemp()
|
||||
@ -88,6 +89,9 @@ class UtilsTestCase(unittest.TestCase):
|
||||
def test_debug_off(self):
|
||||
self.check_debug(False)
|
||||
|
||||
|
||||
class UtilsTestCase(unittest.TestCase):
|
||||
|
||||
@mock.patch.object(utils, 'config')
|
||||
def test_get_transport(self, mock_config):
|
||||
mock_config.return_value = 'udp'
|
||||
@ -218,3 +222,135 @@ class UtilsTestCase(unittest.TestCase):
|
||||
check_output_mock.return_value = ps_output
|
||||
utils.kill_legacy_ocf_daemon_process('res_ceilometer_agent_central')
|
||||
call_mock.assert_called_once_with(['sudo', 'kill', '-9', '11109'])
|
||||
|
||||
@mock.patch.object(pcmk, 'wait_for_pcmk')
|
||||
def test_try_pcmk_wait(self, mock_wait_for_pcmk):
|
||||
# Returns OK
|
||||
mock_wait_for_pcmk.side_effect = None
|
||||
self.assertEquals(None, utils.try_pcmk_wait())
|
||||
|
||||
# Raises Exception
|
||||
mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
|
||||
with self.assertRaises(pcmk.ServicesNotUp):
|
||||
utils.try_pcmk_wait()
|
||||
|
||||
@mock.patch.object(pcmk, 'wait_for_pcmk')
|
||||
@mock.patch.object(utils, 'service_running')
|
||||
def test_services_running(self, mock_service_running,
|
||||
mock_wait_for_pcmk):
|
||||
# OS not running
|
||||
mock_service_running.return_value = False
|
||||
self.assertFalse(utils.services_running())
|
||||
|
||||
# Functional not running
|
||||
mock_service_running.return_value = True
|
||||
mock_wait_for_pcmk.side_effect = pcmk.ServicesNotUp
|
||||
with self.assertRaises(pcmk.ServicesNotUp):
|
||||
utils.services_running()
|
||||
|
||||
# All running
|
||||
mock_service_running.return_value = True
|
||||
mock_wait_for_pcmk.side_effect = None
|
||||
mock_wait_for_pcmk.return_value = True
|
||||
self.assertTrue(utils.services_running())
|
||||
|
||||
@mock.patch.object(pcmk, 'wait_for_pcmk')
|
||||
@mock.patch.object(utils, 'restart_corosync')
|
||||
def test_validated_restart_corosync(self, mock_restart_corosync,
|
||||
mock_wait_for_pcmk):
|
||||
# Services are down
|
||||
mock_restart_corosync.mock_calls = []
|
||||
mock_restart_corosync.return_value = False
|
||||
with self.assertRaises(pcmk.ServicesNotUp):
|
||||
utils.validated_restart_corosync(retries=3)
|
||||
self.assertEqual(3, len(mock_restart_corosync.mock_calls))
|
||||
|
||||
# Services are up
|
||||
mock_restart_corosync.mock_calls = []
|
||||
mock_restart_corosync.return_value = True
|
||||
utils.validated_restart_corosync(retries=10)
|
||||
self.assertEqual(1, len(mock_restart_corosync.mock_calls))
|
||||
|
||||
@mock.patch.object(utils, 'is_unit_paused_set')
|
||||
@mock.patch.object(utils, 'services_running')
|
||||
@mock.patch.object(utils, 'service_start')
|
||||
@mock.patch.object(utils, 'service_stop')
|
||||
@mock.patch.object(utils, 'service_running')
|
||||
def test_restart_corosync(self, mock_service_running,
|
||||
mock_service_stop, mock_service_start,
|
||||
mock_services_running, mock_is_unit_paused_set):
|
||||
# PM up, services down
|
||||
mock_service_running.return_value = True
|
||||
mock_is_unit_paused_set.return_value = False
|
||||
mock_services_running.return_value = False
|
||||
self.assertFalse(utils.restart_corosync())
|
||||
mock_service_stop.assert_has_calls([mock.call('pacemaker'),
|
||||
mock.call('corosync')])
|
||||
mock_service_start.assert_has_calls([mock.call('corosync'),
|
||||
mock.call('pacemaker')])
|
||||
|
||||
# PM already down, services down
|
||||
mock_service_running.return_value = False
|
||||
mock_is_unit_paused_set.return_value = False
|
||||
mock_services_running.return_value = False
|
||||
self.assertFalse(utils.restart_corosync())
|
||||
mock_service_stop.assert_has_calls([mock.call('corosync')])
|
||||
mock_service_start.assert_has_calls([mock.call('corosync'),
|
||||
mock.call('pacemaker')])
|
||||
|
||||
# PM already down, services up
|
||||
mock_service_running.return_value = True
|
||||
mock_is_unit_paused_set.return_value = False
|
||||
mock_services_running.return_value = True
|
||||
self.assertTrue(utils.restart_corosync())
|
||||
mock_service_stop.assert_has_calls([mock.call('pacemaker'),
|
||||
mock.call('corosync')])
|
||||
mock_service_start.assert_has_calls([mock.call('corosync'),
|
||||
mock.call('pacemaker')])
|
||||
|
||||
@mock.patch.object(subprocess, 'check_call')
|
||||
@mock.patch.object(utils.os, 'mkdir')
|
||||
@mock.patch.object(utils.os.path, 'exists')
|
||||
@mock.patch.object(utils, 'render_template')
|
||||
@mock.patch.object(utils, 'write_file')
|
||||
@mock.patch.object(utils, 'is_unit_paused_set')
|
||||
@mock.patch.object(utils, 'config')
|
||||
def test_emit_systemd_overrides_file(self, mock_config,
|
||||
mock_is_unit_paused_set,
|
||||
mock_write_file, mock_render_template,
|
||||
mock_path_exists,
|
||||
mock_mkdir, mock_check_call):
|
||||
|
||||
# Normal values
|
||||
cfg = {'service_stop_timeout': 30,
|
||||
'service_start_timeout': 90}
|
||||
mock_config.side_effect = lambda key: cfg.get(key)
|
||||
|
||||
mock_is_unit_paused_set.return_value = True
|
||||
mock_path_exists.return_value = True
|
||||
utils.emit_systemd_overrides_file()
|
||||
self.assertEquals(2, len(mock_write_file.mock_calls))
|
||||
mock_render_template.assert_has_calls(
|
||||
[mock.call('systemd-overrides.conf', cfg),
|
||||
mock.call('systemd-overrides.conf', cfg)])
|
||||
mock_check_call.assert_has_calls([mock.call(['systemctl',
|
||||
'daemon-reload'])])
|
||||
mock_write_file.mock_calls = []
|
||||
mock_render_template.mock_calls = []
|
||||
mock_check_call.mock_calls = []
|
||||
|
||||
# Disable timeout
|
||||
cfg = {'service_stop_timeout': -1,
|
||||
'service_start_timeout': -1}
|
||||
expected_cfg = {'service_stop_timeout': 'infinity',
|
||||
'service_start_timeout': 'infinity'}
|
||||
mock_config.side_effect = lambda key: cfg.get(key)
|
||||
mock_is_unit_paused_set.return_value = True
|
||||
mock_path_exists.return_value = True
|
||||
utils.emit_systemd_overrides_file()
|
||||
self.assertEquals(2, len(mock_write_file.mock_calls))
|
||||
mock_render_template.assert_has_calls(
|
||||
[mock.call('systemd-overrides.conf', expected_cfg),
|
||||
mock.call('systemd-overrides.conf', expected_cfg)])
|
||||
mock_check_call.assert_has_calls([mock.call(['systemctl',
|
||||
'daemon-reload'])])
|
||||
|
@ -34,3 +34,17 @@ class TestPcmk(unittest.TestCase):
|
||||
def test_crm_res_running_undefined(self, getstatusoutput):
|
||||
getstatusoutput.return_value = (1, "foobar")
|
||||
self.assertFalse(pcmk.crm_res_running('res_nova_consoleauth'))
|
||||
|
||||
@mock.patch('socket.gethostname')
|
||||
@mock.patch('commands.getstatusoutput')
|
||||
def test_wait_for_pcmk(self, getstatusoutput, gethostname):
|
||||
# Pacemaker is down
|
||||
gethostname.return_value = 'hanode-1'
|
||||
getstatusoutput.return_value = (1, 'Not the hostname')
|
||||
with self.assertRaises(pcmk.ServicesNotUp):
|
||||
pcmk.wait_for_pcmk(retries=2, sleep=0)
|
||||
|
||||
# Pacemaker is up
|
||||
gethostname.return_value = 'hanode-1'
|
||||
getstatusoutput.return_value = (0, 'Hosname: hanode-1')
|
||||
self.assertTrue(pcmk.wait_for_pcmk(retries=2, sleep=0))
|
||||
|
Loading…
x
Reference in New Issue
Block a user