Fix spurious nagios alerts for radosgw service.

Currently, when the charm tears down the default radosgw daemon in
order to make way for per host daemons, it does not remove the nrpe
check for the daemon. This PR fixes the issue.

It also closes a gap where alerts for the per host daemons are not
setup until a hook that happens to call update_nrpe_checks as a
side-effect is run.

Change-Id: I7621b9671b010a77bb3e94bdd1e80f45274c73e5
Closes-Bug: #1825843
This commit is contained in:
Pete Vander Giessen 2019-04-24 09:39:26 -04:00
parent dce33bfb6e
commit ff90c0f058
2 changed files with 53 additions and 1 deletions

View File

@ -255,6 +255,12 @@ def mon_relation(rid=None, unit=None):
if systemd_based_radosgw():
service_stop('radosgw')
service('disable', 'radosgw')
# Update the nrpe config. If we wait for the below
# to be called elsewhere, there exists a period
# where nagios will report the radosgw service as
# down, and also not be monitoring the per
# host services.
update_nrpe_config(checks_to_remove=['radosgw'])
service('enable', service_name())
# NOTE(jamespage):
@ -362,13 +368,28 @@ def ha_relation_changed():
@hooks.hook('nrpe-external-master-relation-joined',
'nrpe-external-master-relation-changed')
def update_nrpe_config():
def update_nrpe_config(checks_to_remove=None):
"""
Update the checks for the nagios plugin.
:param checks_to_remove: list of short names of nrpe checks to
remove. For example, pass ['radosgw'] to remove the check for
the default systemd radosgw service, to make way for per host
services.
:type checks_to_remove: list
"""
# python-dbus is used by check_upstart_job
apt_install('python-dbus')
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe.copy_nrpe_checks()
if checks_to_remove is not None:
log("Removing the following nrpe checks: {}".format(checks_to_remove),
level=DEBUG)
for svc in checks_to_remove:
nrpe_setup.remove_check(shortname=svc)
nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
nrpe.add_haproxy_checks(nrpe_setup, current_unit)
nrpe_setup.write()

View File

@ -671,3 +671,34 @@ class SlaveMultisiteTests(CephRadosMultisiteTests):
self.is_leader.return_value = False
ceph_hooks.slave_relation_changed('slave:1', 'rgw/0')
self.relation_get.assert_not_called()
@patch.object(ceph_hooks, 'apt_install')
@patch.object(ceph_hooks, 'services')
@patch.object(ceph_hooks, 'nrpe')
def test_update_nrpe_config(self, nrpe, services, apt_install):
# Setup Mocks
nrpe.get_nagios_hostname.return_value = 'foo'
nrpe.get_nagios_unit_name.return_value = 'bar'
nrpe_setup = MagicMock()
nrpe.NRPE.return_value = nrpe_setup
services.return_value = ['baz', 'qux']
# Call the routine
ceph_hooks.update_nrpe_config()
# Verify calls
apt_install.assert_called()
nrpe.get_nagios_hostname.assert_called()
nrpe.get_nagios_unit_name.assert_called()
nrpe.copy_nrpe_checks.assert_called()
nrpe.remove_check.assert_not_called()
nrpe.add_init_service_checks.assert_called_with(nrpe_setup,
['baz', 'qux'], 'bar')
nrpe.add_haproxy_checks.assert_called_with(nrpe_setup, 'bar')
nrpe_setup.write.assert_called()
# Verify that remove_check is called appropriately if we pass
# checks_to_remove
ceph_hooks.update_nrpe_config(checks_to_remove=['quux', 'quuux'])
nrpe_setup.remove_check.assert_has_calls([call(shortname='quux'),
call(shortname='quuux')])