Enable custom failure-timeout configuration

As explained here[0], setting failure-timeout means that the CIB will 'forget'
that a resource agent action failed, by resetting failcount to 0, once both of
the following conditions are met:
- $failure-timeout seconds have elapsed since the last failure
- an event wakes up the policy engine (i.e. at the global resource
  recheck in an idle cluster)

By default failure-timeout is set to 0, which disables the feature; this
change exposes it as a charm config option so that it can be tuned.

[0] https://clusterlabs.org/pacemaker/doc/en-US/Pacemaker/1.1/html-single/Pacemaker_Explained/#_failure_response

Change-Id: Ia958a8c5472547c7cf0cb4ecd7e70cb226074b88
Closes-Bug: #1802310
This commit is contained in:
Andrea Ieri 2018-12-12 15:32:59 +01:00 committed by Ryan A Farrell
parent 9483383555
commit e28f8a9adc
5 changed files with 34 additions and 8 deletions

View File

@ -143,6 +143,15 @@ options:
Specifies the corosync.conf network mtu. If unset, the default
corosync.conf value is used (currently 1500). See 'man corosync.conf' for
detailed information on this config option.
failure_timeout:
type: int
default: 0
description: |
Sets the pacemaker default resource meta-attribute value for
failure-timeout. This value represents the duration in seconds to wait
before resetting failcount to 0. In practice, this is measured as the
time elapsed since the most recent failure. Setting this to 0 disables
the feature.
# Monitoring config
nagios_context:
type: string

View File

@ -186,7 +186,8 @@ def config_changed():
status_set('maintenance', "Setting up corosync")
if configure_corosync():
try_pcmk_wait()
configure_cluster_global()
failure_timeout = config('failure_timeout')
configure_cluster_global(failure_timeout)
configure_monitor_host()
configure_stonith()
@ -329,7 +330,8 @@ def ha_relation_changed():
# configuration should be set directly on subordinate
configure_corosync()
try_pcmk_wait()
configure_cluster_global()
failure_timeout = config('failure_timeout')
configure_cluster_global(failure_timeout)
configure_monitor_host()
configure_stonith()

View File

@ -585,8 +585,13 @@ def configure_monitor_host():
pcmk.commit('crm -w -F configure delete ping')
def configure_cluster_global():
"""Configure global cluster options"""
def configure_cluster_global(failure_timeout):
"""Configure global cluster options
:param failure_timeout: Duration in seconds (measured from the most recent
failure) to wait before resetting failcount to 0.
:type failure_timeout: int
"""
log('Applying global cluster configuration', level=DEBUG)
# NOTE(lathiat) quorum in a two-node scenario is handled by
# corosync two_node=1. In this case quorum is required for
@ -594,10 +599,11 @@ def configure_cluster_global():
# contact with the full cluster.
log('Configuring no-quorum-policy to stop', level=DEBUG)
cmd = "crm configure property no-quorum-policy=stop"
pcmk.commit(cmd)
cmd = ('crm configure rsc_defaults $id="rsc-options" '
'resource-stickiness="100"')
'resource-stickiness="100" '
'failure-timeout={}'.format(failure_timeout))
pcmk.commit(cmd)
log('Configuring cluster-recheck-interval to 60 seconds', level=DEBUG)

View File

@ -90,7 +90,8 @@ class TestCorosyncConf(unittest.TestCase):
'prefer-ipv6': False,
'corosync_transport': 'udpu',
'corosync_mcastaddr': 'corosync_mcastaddr',
'cluster_count': 3}
'cluster_count': 3,
'failure_timeout': 180}
config.side_effect = lambda key: cfg.get(key)
@ -118,7 +119,7 @@ class TestCorosyncConf(unittest.TestCase):
relation_set.assert_any_call(relation_id='hanode:1', ready=True)
configure_stonith.assert_called_with()
configure_monitor_host.assert_called_with()
configure_cluster_global.assert_called_with()
configure_cluster_global.assert_called_with(180)
configure_corosync.assert_called_with()
set_cluster_symmetry.assert_called_with()
configure_pacemaker_remote_resources.assert_called_with()

View File

@ -1022,3 +1022,11 @@ class UtilsTestCase(unittest.TestCase):
clones=clones,
groups=groups)
self.assertFalse(commit.called)
@mock.patch('pcmk.commit')
def test_configure_global_cluster(self, mock_commit):
    """Check the rsc_defaults command carries the requested failure-timeout."""
    expected_cmd = (
        'crm configure rsc_defaults '
        '$id="rsc-options" '
        'resource-stickiness="100" '
        'failure-timeout=240'
    )

    utils.configure_cluster_global(240)

    # configure_cluster_global commits more than one command, so match
    # the rsc_defaults call among all recorded calls rather than the last.
    mock_commit.assert_any_call(expected_cmd)