From c74b40cb049fa2e9f7b229cf2127f1e0fc28fbe3 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 29 Oct 2014 22:30:36 -0500 Subject: [PATCH 01/12] [bradm] initial nrpe checks --- config.yaml | 10 + files/nrpe-external-master/check_upstart_job | 72 ++++++ .../contrib/charmsupport/__init__.py | 0 .../charmhelpers/contrib/charmsupport/nrpe.py | 218 ++++++++++++++++++ .../contrib/charmsupport/volumes.py | 156 +++++++++++++ hooks/nrpe-external-master-relation-changed | 1 + hooks/nrpe-external-master-relation-joined | 1 + hooks/quantum_hooks.py | 27 +++ metadata.yaml | 3 + 9 files changed, 488 insertions(+) create mode 100755 files/nrpe-external-master/check_upstart_job create mode 100644 hooks/charmhelpers/contrib/charmsupport/__init__.py create mode 100644 hooks/charmhelpers/contrib/charmsupport/nrpe.py create mode 100644 hooks/charmhelpers/contrib/charmsupport/volumes.py create mode 120000 hooks/nrpe-external-master-relation-changed create mode 120000 hooks/nrpe-external-master-relation-joined diff --git a/config.yaml b/config.yaml index acc64c46..15e2c050 100644 --- a/config.yaml +++ b/config.yaml @@ -91,6 +91,16 @@ options: default: nova type: string description: Database name + nagios_context: + default: "juju" + type: string + description: | + Used by the nrpe-external-master subordinate charm. + A string that will be prepended to instance name to set the host name + in nagios. So for instance the hostname would be something like: + juju-myservice-0 + If you're running multiple environments with the same services in them + this allows you to differentiate between them. # Network configuration options # by default all access is over 'private-address' os-data-network: diff --git a/files/nrpe-external-master/check_upstart_job b/files/nrpe-external-master/check_upstart_job new file mode 100755 index 00000000..94efb95e --- /dev/null +++ b/files/nrpe-external-master/check_upstart_job @@ -0,0 +1,72 @@ +#!/usr/bin/python + +# +# Copyright 2012, 2013 Canonical Ltd. +# +# Author: Paul Collins +# +# Based on http://www.eurion.net/python-snippets/snippet/Upstart%20service%20status.html +# + +import sys + +import dbus + + +class Upstart(object): + def __init__(self): + self._bus = dbus.SystemBus() + self._upstart = self._bus.get_object('com.ubuntu.Upstart', + '/com/ubuntu/Upstart') + def get_job(self, job_name): + path = self._upstart.GetJobByName(job_name, + dbus_interface='com.ubuntu.Upstart0_6') + return self._bus.get_object('com.ubuntu.Upstart', path) + + def get_properties(self, job): + path = job.GetInstance([], dbus_interface='com.ubuntu.Upstart0_6.Job') + instance = self._bus.get_object('com.ubuntu.Upstart', path) + return instance.GetAll('com.ubuntu.Upstart0_6.Instance', + dbus_interface=dbus.PROPERTIES_IFACE) + + def get_job_instances(self, job_name): + job = self.get_job(job_name) + paths = job.GetAllInstances([], dbus_interface='com.ubuntu.Upstart0_6.Job') + return [self._bus.get_object('com.ubuntu.Upstart', path) for path in paths] + + def get_job_instance_properties(self, job): + return job.GetAll('com.ubuntu.Upstart0_6.Instance', + dbus_interface=dbus.PROPERTIES_IFACE) + +try: + upstart = Upstart() + try: + job = upstart.get_job(sys.argv[1]) + props = upstart.get_properties(job) + + if props['state'] == 'running': + print 'OK: %s is running' % sys.argv[1] + sys.exit(0) + else: + print 'CRITICAL: %s is not running' % sys.argv[1] + sys.exit(2) + + except dbus.DBusException as e: + instances = upstart.get_job_instances(sys.argv[1]) + propses = [upstart.get_job_instance_properties(instance) for instance in instances] + states = dict([(props['name'], props['state']) for props in propses]) + if len(states) != states.values().count('running'): + not_running = [] + for name in states.keys(): + if states[name] != 'running': + not_running.append(name) + print 'CRITICAL: %d instances of %s not running: %s' % \ + (len(not_running), sys.argv[1], not_running.join(', ')) + sys.exit(2) + else: + print 'OK: %d instances of %s running' % (len(states), sys.argv[1]) + +except dbus.DBusException as e: + print 'CRITICAL: failed to get properties of \'%s\' from upstart' % sys.argv[1] + sys.exit(2) + diff --git a/hooks/charmhelpers/contrib/charmsupport/__init__.py b/hooks/charmhelpers/contrib/charmsupport/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hooks/charmhelpers/contrib/charmsupport/nrpe.py b/hooks/charmhelpers/contrib/charmsupport/nrpe.py new file mode 100644 index 00000000..f3bfe3f3 --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/nrpe.py @@ -0,0 +1,218 @@ +"""Compatibility with the nrpe-external-master charm""" +# Copyright 2012 Canonical Ltd. +# +# Authors: +# Matthew Wedgwood + +import subprocess +import pwd +import grp +import os +import re +import shlex +import yaml + +from charmhelpers.core.hookenv import ( + config, + local_unit, + log, + relation_ids, + relation_set, +) + +from charmhelpers.core.host import service + +# This module adds compatibility with the nrpe-external-master and plain nrpe +# subordinate charms. To use it in your charm: +# +# 1. Update metadata.yaml +# +# provides: +# (...) +# nrpe-external-master: +# interface: nrpe-external-master +# scope: container +# +# and/or +# +# provides: +# (...) +# local-monitors: +# interface: local-monitors +# scope: container + +# +# 2. Add the following to config.yaml +# +# nagios_context: +# default: "juju" +# type: string +# description: | +# Used by the nrpe subordinate charms. +# A string that will be prepended to instance name to set the host name +# in nagios. So for instance the hostname would be something like: +# juju-myservice-0 +# If you're running multiple environments with the same services in them +# this allows you to differentiate between them. +# +# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master +# +# 4. Update your hooks.py with something like this: +# +# from charmsupport.nrpe import NRPE +# (...) +# def update_nrpe_config(): +# nrpe_compat = NRPE() +# nrpe_compat.add_check( +# shortname = "myservice", +# description = "Check MyService", +# check_cmd = "check_http -w 2 -c 10 http://localhost" +# ) +# nrpe_compat.add_check( +# "myservice_other", +# "Check for widget failures", +# check_cmd = "/srv/myapp/scripts/widget_check" +# ) +# nrpe_compat.write() +# +# def config_changed(): +# (...) +# update_nrpe_config() +# +# def nrpe_external_master_relation_changed(): +# update_nrpe_config() +# +# def local_monitors_relation_changed(): +# update_nrpe_config() +# +# 5. ln -s hooks.py nrpe-external-master-relation-changed +# ln -s hooks.py local-monitors-relation-changed + + +class CheckException(Exception): + pass + + +class Check(object): + shortname_re = '[A-Za-z0-9-_]+$' + service_template = (""" +#--------------------------------------------------- +# This file is Juju managed +#--------------------------------------------------- +define service {{ + use active-service + host_name {nagios_hostname} + service_description {nagios_hostname}[{shortname}] """ + """{description} + check_command check_nrpe!{command} + servicegroups {nagios_servicegroup} +}} +""") + + def __init__(self, shortname, description, check_cmd): + super(Check, self).__init__() + # XXX: could be better to calculate this from the service name + if not re.match(self.shortname_re, shortname): + raise CheckException("shortname must match {}".format( + Check.shortname_re)) + self.shortname = shortname + self.command = "check_{}".format(shortname) + # Note: a set of invalid characters is defined by the + # Nagios server config + # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()= + self.description = description + self.check_cmd = self._locate_cmd(check_cmd) + + def _locate_cmd(self, check_cmd): + search_path = ( + '/', + os.path.join(os.environ['CHARM_DIR'], + 'files/nrpe-external-master'), + '/usr/lib/nagios/plugins', + ) + parts = shlex.split(check_cmd) + for path in search_path: + if os.path.exists(os.path.join(path, parts[0])): + command = os.path.join(path, parts[0]) + if len(parts) > 1: + command += " " + " ".join(parts[1:]) + return command + log('Check command not found: {}'.format(parts[0])) + return '' + + def write(self, nagios_context, hostname): + nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format( + self.command) + with open(nrpe_check_file, 'w') as nrpe_check_config: + nrpe_check_config.write("# check {}\n".format(self.shortname)) + nrpe_check_config.write("command[{}]={}\n".format( + self.command, self.check_cmd)) + + if not os.path.exists(NRPE.nagios_exportdir): + log('Not writing service config as {} is not accessible'.format( + NRPE.nagios_exportdir)) + else: + self.write_service_config(nagios_context, hostname) + + def write_service_config(self, nagios_context, hostname): + for f in os.listdir(NRPE.nagios_exportdir): + if re.search('.*{}.cfg'.format(self.command), f): + os.remove(os.path.join(NRPE.nagios_exportdir, f)) + + templ_vars = { + 'nagios_hostname': hostname, + 'nagios_servicegroup': nagios_context, + 'description': self.description, + 'shortname': self.shortname, + 'command': self.command, + } + nrpe_service_text = Check.service_template.format(**templ_vars) + nrpe_service_file = '{}/service__{}_{}.cfg'.format( + NRPE.nagios_exportdir, hostname, self.command) + with open(nrpe_service_file, 'w') as nrpe_service_config: + nrpe_service_config.write(str(nrpe_service_text)) + + def run(self): + subprocess.call(self.check_cmd) + + +class NRPE(object): + nagios_logdir = '/var/log/nagios' + nagios_exportdir = '/var/lib/nagios/export' + nrpe_confdir = '/etc/nagios/nrpe.d' + + def __init__(self): + super(NRPE, self).__init__() + self.config = config() + self.nagios_context = self.config['nagios_context'] + self.unit_name = local_unit().replace('/', '-') + self.hostname = "{}-{}".format(self.nagios_context, self.unit_name) + self.checks = [] + + def add_check(self, *args, **kwargs): + self.checks.append(Check(*args, **kwargs)) + + def write(self): + try: + nagios_uid = pwd.getpwnam('nagios').pw_uid + nagios_gid = grp.getgrnam('nagios').gr_gid + except: + log("Nagios user not set up, nrpe checks not updated") + return + + if not os.path.exists(NRPE.nagios_logdir): + os.mkdir(NRPE.nagios_logdir) + os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid) + + nrpe_monitors = {} + monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}} + for nrpecheck in self.checks: + nrpecheck.write(self.nagios_context, self.hostname) + nrpe_monitors[nrpecheck.shortname] = { + "command": nrpecheck.command, + } + + service('restart', 'nagios-nrpe-server') + + for rid in relation_ids("local-monitors"): + relation_set(relation_id=rid, monitors=yaml.dump(monitors)) diff --git a/hooks/charmhelpers/contrib/charmsupport/volumes.py b/hooks/charmhelpers/contrib/charmsupport/volumes.py new file mode 100644 index 00000000..0f905dff --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/volumes.py @@ -0,0 +1,156 @@ +''' +Functions for managing volumes in juju units. One volume is supported per unit. +Subordinates may have their own storage, provided it is on its own partition. + +Configuration stanzas: + volume-ephemeral: + type: boolean + default: true + description: > + If false, a volume is mounted as sepecified in "volume-map" + If true, ephemeral storage will be used, meaning that log data + will only exist as long as the machine. YOU HAVE BEEN WARNED. + volume-map: + type: string + default: {} + description: > + YAML map of units to device names, e.g: + "{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }" + Service units will raise a configure-error if volume-ephemeral + is 'true' and no volume-map value is set. Use 'juju set' to set a + value and 'juju resolved' to complete configuration. + +Usage: + from charmsupport.volumes import configure_volume, VolumeConfigurationError + from charmsupport.hookenv import log, ERROR + def post_mount_hook(): + stop_service('myservice') + def post_mount_hook(): + start_service('myservice') + + if __name__ == '__main__': + try: + configure_volume(before_change=pre_mount_hook, + after_change=post_mount_hook) + except VolumeConfigurationError: + log('Storage could not be configured', ERROR) +''' + +# XXX: Known limitations +# - fstab is neither consulted nor updated + +import os +from charmhelpers.core import hookenv +from charmhelpers.core import host +import yaml + + +MOUNT_BASE = '/srv/juju/volumes' + + +class VolumeConfigurationError(Exception): + '''Volume configuration data is missing or invalid''' + pass + + +def get_config(): + '''Gather and sanity-check volume configuration data''' + volume_config = {} + config = hookenv.config() + + errors = False + + if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'): + volume_config['ephemeral'] = True + else: + volume_config['ephemeral'] = False + + try: + volume_map = yaml.safe_load(config.get('volume-map', '{}')) + except yaml.YAMLError as e: + hookenv.log("Error parsing YAML volume-map: {}".format(e), + hookenv.ERROR) + errors = True + if volume_map is None: + # probably an empty string + volume_map = {} + elif not isinstance(volume_map, dict): + hookenv.log("Volume-map should be a dictionary, not {}".format( + type(volume_map))) + errors = True + + volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME']) + if volume_config['device'] and volume_config['ephemeral']: + # asked for ephemeral storage but also defined a volume ID + hookenv.log('A volume is defined for this unit, but ephemeral ' + 'storage was requested', hookenv.ERROR) + errors = True + elif not volume_config['device'] and not volume_config['ephemeral']: + # asked for permanent storage but did not define volume ID + hookenv.log('Ephemeral storage was requested, but there is no volume ' + 'defined for this unit.', hookenv.ERROR) + errors = True + + unit_mount_name = hookenv.local_unit().replace('/', '-') + volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name) + + if errors: + return None + return volume_config + + +def mount_volume(config): + if os.path.exists(config['mountpoint']): + if not os.path.isdir(config['mountpoint']): + hookenv.log('Not a directory: {}'.format(config['mountpoint'])) + raise VolumeConfigurationError() + else: + host.mkdir(config['mountpoint']) + if os.path.ismount(config['mountpoint']): + unmount_volume(config) + if not host.mount(config['device'], config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def unmount_volume(config): + if os.path.ismount(config['mountpoint']): + if not host.umount(config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def managed_mounts(): + '''List of all mounted managed volumes''' + return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts()) + + +def configure_volume(before_change=lambda: None, after_change=lambda: None): + '''Set up storage (or don't) according to the charm's volume configuration. + Returns the mount point or "ephemeral". before_change and after_change + are optional functions to be called if the volume configuration changes. + ''' + + config = get_config() + if not config: + hookenv.log('Failed to read volume configuration', hookenv.CRITICAL) + raise VolumeConfigurationError() + + if config['ephemeral']: + if os.path.ismount(config['mountpoint']): + before_change() + unmount_volume(config) + after_change() + return 'ephemeral' + else: + # persistent storage + if os.path.ismount(config['mountpoint']): + mounts = dict(managed_mounts()) + if mounts.get(config['mountpoint']) != config['device']: + before_change() + unmount_volume(config) + mount_volume(config) + after_change() + else: + before_change() + mount_volume(config) + after_change() + return config['mountpoint'] diff --git a/hooks/nrpe-external-master-relation-changed b/hooks/nrpe-external-master-relation-changed new file mode 120000 index 00000000..9a2da58e --- /dev/null +++ b/hooks/nrpe-external-master-relation-changed @@ -0,0 +1 @@ +quantum_hooks.py \ No newline at end of file diff --git a/hooks/nrpe-external-master-relation-joined b/hooks/nrpe-external-master-relation-joined new file mode 120000 index 00000000..9a2da58e --- /dev/null +++ b/hooks/nrpe-external-master-relation-joined @@ -0,0 +1 @@ +quantum_hooks.py \ No newline at end of file diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index 62443d5b..e30916e9 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -33,6 +33,8 @@ from charmhelpers.contrib.openstack.utils import ( ) from charmhelpers.payload.execd import execd_preinstall +from charmhelpers.contrib.charmsupport.nrpe import NRPE + import sys from quantum_utils import ( register_configs, @@ -76,6 +78,7 @@ def config_changed(): global CONFIGS if openstack_upgrade_available(get_common_package()): CONFIGS = do_openstack_upgrade() + update_nrpe_config() # Re-run joined hooks as config might have changed for r_id in relation_ids('shared-db'): db_joined(relation_id=r_id) @@ -196,6 +199,30 @@ def cluster_departed(): def stop(): stop_services() + +@hooks.hook('nrpe-external-master-relation-joined', 'nrpe-external-master-relation-changed') +def update_nrpe_config(): + SERVICES = [ + 'neutron-dhcp-agent', + 'neutron-lbaas-agent', + 'neutron-metadata-agent', + 'neutron-metering-agent', + 'neutron-ovs-cleanup', + 'neutron-plugin-openvswitch-agent', + 'neutron-vpn-agent', + ] + nrpe = NRPE() + apt_install('python-dbus') + + for service in SERVICES: + nrpe.add_check( + shortname=service, + description='%s process' % service, + check_cmd = 'check_upstart_job %s' % service, + ) + + nrpe.write() + if __name__ == '__main__': try: hooks.execute(sys.argv) diff --git a/metadata.yaml b/metadata.yaml index f24dde9f..d7ca866d 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -16,6 +16,9 @@ description: | categories: - openstack provides: + nrpe-external-master: + interface: nrpe-external-master + scope: container quantum-network-service: interface: quantum requires: From 22a0032cd4f0dd88ffe2ab816b41cc434b152e9f Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Thu, 30 Oct 2014 16:06:15 +1000 Subject: [PATCH 02/12] [bradm] Added charmsupport to charmhelpers --- charm-helpers-hooks.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/charm-helpers-hooks.yaml b/charm-helpers-hooks.yaml index 1a98c81c..7d57228c 100644 --- a/charm-helpers-hooks.yaml +++ b/charm-helpers-hooks.yaml @@ -8,3 +8,4 @@ include: - contrib.network - contrib.storage.linux - payload.execd + - contrib.charmsupport From e5052ec32dcc707863108993413812e81a3fd9c4 Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Fri, 31 Oct 2014 14:56:18 +1000 Subject: [PATCH 03/12] [bradm] Added support to get nagios hostname from nrpe relation --- hooks/charmhelpers/contrib/charmsupport/nrpe.py | 8 ++++++-- hooks/quantum_hooks.py | 9 ++++++++- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/hooks/charmhelpers/contrib/charmsupport/nrpe.py b/hooks/charmhelpers/contrib/charmsupport/nrpe.py index f3bfe3f3..51b62d39 100644 --- a/hooks/charmhelpers/contrib/charmsupport/nrpe.py +++ b/hooks/charmhelpers/contrib/charmsupport/nrpe.py @@ -129,6 +129,7 @@ define service {{ os.path.join(os.environ['CHARM_DIR'], 'files/nrpe-external-master'), '/usr/lib/nagios/plugins', + '/usr/local/lib/nagios/plugins', ) parts = shlex.split(check_cmd) for path in search_path: @@ -181,12 +182,15 @@ class NRPE(object): nagios_exportdir = '/var/lib/nagios/export' nrpe_confdir = '/etc/nagios/nrpe.d' - def __init__(self): + def __init__(self, hostname=None): super(NRPE, self).__init__() self.config = config() self.nagios_context = self.config['nagios_context'] self.unit_name = local_unit().replace('/', '-') - self.hostname = "{}-{}".format(self.nagios_context, self.unit_name) + if hostname: + self.hostname = hostname + else: + self.hostname = "{}-{}".format(self.nagios_context, self.unit_name) self.checks = [] def add_check(self, *args, **kwargs): diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index e30916e9..9a022c1e 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -9,6 +9,7 @@ from charmhelpers.core.hookenv import ( relation_get, relation_set, relation_ids, + relations_of_type, unit_get, Hooks, UnregisteredHookError ) @@ -211,7 +212,13 @@ def update_nrpe_config(): 'neutron-plugin-openvswitch-agent', 'neutron-vpn-agent', ] - nrpe = NRPE() + # Find out if nrpe set nagios_hostname + hostname = None + for rel in relations_of_type('nrpe-external-master'): + if 'nagios_hostname' in rel: + hostname = rel['nagios_hostname'] + break + nrpe = NRPE(hostname=hostname) apt_install('python-dbus') for service in SERVICES: From 0f38e9577c70925b981e25dee81c230904221d9c Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Tue, 4 Nov 2014 17:18:44 +1000 Subject: [PATCH 04/12] [bradm] Tweaked check to include host context and unit name --- hooks/quantum_hooks.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index 9a022c1e..eb0729a5 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -10,6 +10,7 @@ from charmhelpers.core.hookenv import ( relation_set, relation_ids, relations_of_type, + local_unit, unit_get, Hooks, UnregisteredHookError ) @@ -217,14 +218,17 @@ def update_nrpe_config(): for rel in relations_of_type('nrpe-external-master'): if 'nagios_hostname' in rel: hostname = rel['nagios_hostname'] + host_context = rel['nagios_host_context'] break nrpe = NRPE(hostname=hostname) apt_install('python-dbus') - + + current_unit = "%s:%s" % (host_context, local_unit()) + for service in SERVICES: nrpe.add_check( shortname=service, - description='%s process' % service, + description='process check {%s}' % current_unit, check_cmd = 'check_upstart_job %s' % service, ) From 987bff0208e1983579a0b832834ebd703e0deb9d Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Thu, 6 Nov 2014 17:33:57 +1000 Subject: [PATCH 05/12] [bradm] Check if host_context is defined before using it --- hooks/quantum_hooks.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index eb0729a5..ed99d83d 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -215,6 +215,7 @@ def update_nrpe_config(): ] # Find out if nrpe set nagios_hostname hostname = None + host_context = None for rel in relations_of_type('nrpe-external-master'): if 'nagios_hostname' in rel: hostname = rel['nagios_hostname'] @@ -223,7 +224,10 @@ def update_nrpe_config(): nrpe = NRPE(hostname=hostname) apt_install('python-dbus') - current_unit = "%s:%s" % (host_context, local_unit()) + if host_context: + current_unit = "%s:%s" % (host_context, local_unit()) + else: + current_unit = local_unit() for service in SERVICES: nrpe.add_check( From 0a54eab617c434e21bd45d71afaddc14766fa0d1 Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Mon, 17 Nov 2014 13:56:28 +1000 Subject: [PATCH 06/12] [bradm] Added sysvinit daemon monitoring, use services() instead of hard coded daemon list, pep8 fixes --- .../nrpe-external-master/check_exit_status.pl | 189 ++++++++++++++++++ .../nrpe-external-master/check_status_file.py | 60 ++++++ files/nrpe-external-master/nagios_plugin.py | 78 ++++++++ hooks/quantum_hooks.py | 49 +++-- hooks/quantum_utils.py | 8 + 5 files changed, 368 insertions(+), 16 deletions(-) create mode 100755 files/nrpe-external-master/check_exit_status.pl create mode 100755 files/nrpe-external-master/check_status_file.py create mode 100755 files/nrpe-external-master/nagios_plugin.py diff --git a/files/nrpe-external-master/check_exit_status.pl b/files/nrpe-external-master/check_exit_status.pl new file mode 100755 index 00000000..49df22d8 --- /dev/null +++ b/files/nrpe-external-master/check_exit_status.pl @@ -0,0 +1,189 @@ +#!/usr/bin/perl +################################################################################ +# # +# Copyright (C) 2011 Chad Columbus # +# # +# This program is free software; you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation; either version 2 of the License, or # +# (at your option) any later version. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with this program; if not, write to the Free Software # +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # +# # +################################################################################ + +use strict; +use Getopt::Std; +$| = 1; + +my %opts; +getopts('heronp:s:', \%opts); + +my $VERSION = "Version 1.0"; +my $AUTHOR = '(c) 2011 Chad Columbus '; + +# Default values: +my $script_to_check; +my $pattern = 'is running'; +my $cmd; +my $message; +my $error; + +# Exit codes +my $STATE_OK = 0; +my $STATE_WARNING = 1; +my $STATE_CRITICAL = 2; +my $STATE_UNKNOWN = 3; + +# Parse command line options +if ($opts{'h'} || scalar(%opts) == 0) { + &print_help(); + exit($STATE_OK); +} + +# Make sure scipt is provided: +if ($opts{'s'} eq '') { + # Script to run not provided + print "\nYou must provide a script to run. Example: -s /etc/init.d/httpd\n"; + exit($STATE_UNKNOWN); +} else { + $script_to_check = $opts{'s'}; +} + +# Make sure only a-z, 0-9, /, _, and - are used in the script. +if ($script_to_check =~ /[^a-z0-9\_\-\/\.]/) { + # Script contains illegal characters exit. + print "\nScript to check can only contain Letters, Numbers, Periods, Underscores, Hyphens, and/or Slashes\n"; + exit($STATE_UNKNOWN); +} + +# See if script is executable +if (! -x "$script_to_check") { + print "\nIt appears you can't execute $script_to_check, $!\n"; + exit($STATE_UNKNOWN); +} + +# If a pattern is provided use it: +if ($opts{'p'} ne '') { + $pattern = $opts{'p'}; +} + +# If -r run command via sudo as root: +if ($opts{'r'}) { + $cmd = "sudo -n $script_to_check status" . ' 2>&1'; +} else { + $cmd = "$script_to_check status" . ' 2>&1'; +} + +my $cmd_result = `$cmd`; +chomp($cmd_result); +if ($cmd_result =~ /sudo/i) { + # This means it could not run the sudo command + $message = "$script_to_check CRITICAL - Could not run: 'sudo -n $script_to_check status'. Result is $cmd_result"; + $error = $STATE_UNKNOWN; +} else { + # Check exitstatus instead of output: + if ($opts{'e'} == 1) { + if ($? != 0) { + # error + $message = "$script_to_check CRITICAL - Exit code: $?\."; + if ($opts{'o'} == 0) { + $message .= " $cmd_result"; + } + $error = $STATE_CRITICAL; + } else { + # success + $message = "$script_to_check OK - Exit code: $?\."; + if ($opts{'o'} == 0) { + $message .= " $cmd_result"; + } + $error = $STATE_OK; + } + } else { + my $not_check = 1; + if ($opts{'n'} == 1) { + $not_check = 0; + } + if (($cmd_result =~ /$pattern/i) == $not_check) { + $message = "$script_to_check OK"; + if ($opts{'o'} == 0) { + $message .= " - $cmd_result"; + } + $error = $STATE_OK; + } else { + $message = "$script_to_check CRITICAL"; + if ($opts{'o'} == 0) { + $message .= " - $cmd_result"; + } + $error = $STATE_CRITICAL; + } + } +} + +if ($message eq '') { + print "Error: program failed in an unknown way\n"; + exit($STATE_UNKNOWN); +} + +if ($error) { + print "$message\n"; + exit($error); +} else { + # If we get here we are OK + print "$message\n"; + exit($STATE_OK); +} + +#################################### +# Start Subs: +#################################### +sub print_help() { + print << "EOF"; +Check the output or exit status of a script. +$VERSION +$AUTHOR + +Options: +-h + Print detailed help screen + +-s + 'FULL PATH TO SCRIPT' (required) + This is the script to run, the script is designed to run scripts in the + /etc/init.d dir (but can run any script) and will call the script with + a 'status' argument. So if you use another script make sure it will + work with /path/script status, example: /etc/init.d/httpd status + +-e + This is the "exitstaus" flag, it means check the exit status + code instead of looking for a pattern in the output of the script. + +-p 'REGEX' + This is a pattern to look for in the output of the script to confirm it + is running, default is 'is running', but not all init.d scripts output + (iptables), so you can specify an arbitrary pattern. + All patterns are case insensitive. + +-n + This is the "NOT" flag, it means not the -p pattern, so if you want to + make sure the output of the script does NOT contain -p 'REGEX' + +-r + This is the "ROOT" flag, it means run as root via sudo. You will need a + line in your /etc/sudoers file like: + nagios ALL=(root) NOPASSWD: /etc/init.d/* status + +-o + This is the "SUPPRESS OUTPUT" flag. Some programs have a long output + (like iptables), this flag suppresses that output so it is not printed + as a part of the nagios message. +EOF +} + diff --git a/files/nrpe-external-master/check_status_file.py b/files/nrpe-external-master/check_status_file.py new file mode 100755 index 00000000..ba828087 --- /dev/null +++ b/files/nrpe-external-master/check_status_file.py @@ -0,0 +1,60 @@ +#!/usr/bin/python + +# m +# mmmm m m mmmm mmmm mmm mm#mm +# #" "# # # #" "# #" "# #" # # +# # # # # # # # # #"""" # +# ##m#" "mm"# ##m#" ##m#" "#mm" "mm +# # # # +# " " " +# This file is managed by puppet. Do not make local changes. + +# +# Copyright 2014 Canonical Ltd. +# +# Author: Jacek Nykis +# + +import re +import nagios_plugin + + +def parse_args(): + import argparse + + parser = argparse.ArgumentParser( + description='Read file and return nagios status based on its content', + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument('-f', '--status-file', required=True, + help='Status file path') + parser.add_argument('-c', '--critical-text', default='CRITICAL', + help='String indicating critical status') + parser.add_argument('-w', '--warning-text', default='WARNING', + help='String indicating warning status') + parser.add_argument('-o', '--ok-text', default='OK', + help='String indicating OK status') + parser.add_argument('-u', '--unknown-text', default='UNKNOWN', + help='String indicating unknown status') + return parser.parse_args() + + +def check_status(args): + nagios_plugin.check_file_freshness(args.status_file, 43200) + + with open(args.status_file, "r") as f: + content = [l.strip() for l in f.readlines()] + + for line in content: + if re.search(args.critical_text, line): + raise nagios_plugin.CriticalError(line) + elif re.search(args.warning_text, line): + raise nagios_plugin.WarnError(line) + elif re.search(args.unknown_text, line): + raise nagios_plugin.UnknownError(line) + else: + print line + + +if __name__ == '__main__': + args = parse_args() + nagios_plugin.try_check(check_status, args) diff --git a/files/nrpe-external-master/nagios_plugin.py b/files/nrpe-external-master/nagios_plugin.py new file mode 100755 index 00000000..f0f8e7b5 --- /dev/null +++ b/files/nrpe-external-master/nagios_plugin.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python +# m +# mmmm m m mmmm mmmm mmm mm#mm +# #" "# # # #" "# #" "# #" # # +# # # # # # # # # #"""" # +# ##m#" "mm"# ##m#" ##m#" "#mm" "mm +# # # # +# " " " +# This file is managed by puppet. Do not make local changes. + +# Copyright (C) 2005, 2006, 2007, 2012 James Troup + +import os +import stat +import time +import traceback +import sys + + +################################################################################ + +class CriticalError(Exception): + """This indicates a critical error.""" + pass + + +class WarnError(Exception): + """This indicates a warning condition.""" + pass + + +class UnknownError(Exception): + """This indicates a unknown error was encountered.""" + pass + + +def try_check(function, *args, **kwargs): + """Perform a check with error/warn/unknown handling.""" + try: + function(*args, **kwargs) + except UnknownError, msg: + print msg + sys.exit(3) + except CriticalError, msg: + print msg + sys.exit(2) + except WarnError, msg: + print msg + sys.exit(1) + except: + print "%s raised unknown exception '%s'" % (function, sys.exc_info()[0]) + print '=' * 60 + traceback.print_exc(file=sys.stdout) + print '=' * 60 + sys.exit(3) + + +################################################################################ + +def check_file_freshness(filename, newer_than=600): + """Check a file exists, is readable and is newer than seconds (where defaults to 600).""" + # First check the file exists and is readable + if not os.path.exists(filename): + raise CriticalError("%s: does not exist." % (filename)) + if os.access(filename, os.R_OK) == 0: + raise CriticalError("%s: is not readable." % (filename)) + + # Then ensure the file is up-to-date enough + mtime = os.stat(filename)[stat.ST_MTIME] + last_modified = time.time() - mtime + if last_modified > newer_than: + raise CriticalError("%s: was last modified on %s and is too old (> %s seconds)." + % (filename, time.ctime(mtime), newer_than)) + if last_modified < 0: + raise CriticalError("%s: was last modified on %s which is in the future." + % (filename, time.ctime(mtime))) + +################################################################################ diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index ed99d83d..97c8e418 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -1,6 +1,7 @@ #!/usr/bin/python from base64 import b64decode +import os from charmhelpers.core.hookenv import ( log, ERROR, WARNING, @@ -41,6 +42,7 @@ import sys from quantum_utils import ( register_configs, restart_map, + services, do_openstack_upgrade, get_packages, get_early_packages, @@ -202,17 +204,9 @@ def stop(): stop_services() -@hooks.hook('nrpe-external-master-relation-joined', 'nrpe-external-master-relation-changed') +@hooks.hook('nrpe-external-master-relation-joined', + 'nrpe-external-master-relation-changed') def update_nrpe_config(): - SERVICES = [ - 'neutron-dhcp-agent', - 'neutron-lbaas-agent', - 'neutron-metadata-agent', - 'neutron-metering-agent', - 'neutron-ovs-cleanup', - 'neutron-plugin-openvswitch-agent', - 'neutron-vpn-agent', - ] # Find out if nrpe set nagios_hostname hostname = None host_context = None @@ -229,15 +223,38 @@ def update_nrpe_config(): else: current_unit = local_unit() - for service in SERVICES: - nrpe.add_check( - shortname=service, - description='process check {%s}' % current_unit, - check_cmd = 'check_upstart_job %s' % service, - ) + services_to_monitor = services() + for service in services_to_monitor: + upstart_init = '/etc/init/%s.conf' % service + sysv_init = '/etc/init.d/%s' % service + + if os.path.exists(upstart_init): + nrpe.add_check( + shortname=service, + description='process check {%s}' % current_unit, + check_cmd='check_upstart_job %s' % service, + ) + elif os.path.exists(sysv_init): + cronpath = '/etc/cron.d/nagios-service-check-%s' % service + checkpath = os.path.join(os.environ['CHARM_DIR'], + 'files/nrpe-external-master', + 'check_exit_status.pl'), + cron_template = '*/5 * * * * root %s -s \ +/etc/init.d/%s status > /var/lib/nagios/service-check-%s.txt\n' \ + % (checkpath[0], service, service) + f = open(cronpath, 'w') + f.write(cron_template) + f.close() + nrpe.add_check( + shortname=service, + description='process check {%s}' % current_unit, + check_cmd='check_status_file.py -f \ +/var/lib/nagios/service-check-%s.txt' % service, + ) nrpe.write() + if __name__ == '__main__': try: hooks.execute(sys.argv) diff --git a/hooks/quantum_utils.py b/hooks/quantum_utils.py index 7bc3a439..bdf6c271 100644 --- a/hooks/quantum_utils.py +++ b/hooks/quantum_utils.py @@ -402,6 +402,14 @@ def restart_map(): return _map +def services(): + ''' Returns a list of services associate with this charm ''' + _services = [] + for v in restart_map().values(): + _services = _services + v + return list(set(_services)) + + INT_BRIDGE = "br-int" EXT_BRIDGE = "br-ex" From a9300400ac32d2454c5cdd078057dec2f2a73b1f Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Mon, 17 Nov 2014 15:07:54 +1000 Subject: [PATCH 07/12] [bradm] Removed puppet header from nagios_plugin module --- files/nrpe-external-master/nagios_plugin.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/files/nrpe-external-master/nagios_plugin.py b/files/nrpe-external-master/nagios_plugin.py index f0f8e7b5..fc0d7b7b 100755 --- a/files/nrpe-external-master/nagios_plugin.py +++ b/files/nrpe-external-master/nagios_plugin.py @@ -1,13 +1,4 @@ #!/usr/bin/env python -# m -# mmmm m m mmmm mmmm mmm mm#mm -# #" "# # # #" "# #" "# #" # # -# # # # # # # # # #"""" # -# ##m#" "mm"# ##m#" ##m#" "#mm" "mm -# # # # -# " " " -# This file is managed by puppet. Do not make local changes. - # Copyright (C) 2005, 2006, 2007, 2012 James Troup import os From b096c1261093dcec59ed519cc03eb6801a6cc17f Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Tue, 18 Nov 2014 11:26:17 +1000 Subject: [PATCH 08/12] [bradm] Removed nagios check files that were moved to nrpe-external-master charm --- .../nrpe-external-master/check_exit_status.pl | 189 ------------------ .../nrpe-external-master/check_status_file.py | 60 ------ files/nrpe-external-master/check_upstart_job | 72 ------- files/nrpe-external-master/nagios_plugin.py | 69 ------- hooks/quantum_hooks.py | 6 +- 5 files changed, 3 insertions(+), 393 deletions(-) delete mode 100755 files/nrpe-external-master/check_exit_status.pl delete mode 100755 files/nrpe-external-master/check_status_file.py delete mode 100755 files/nrpe-external-master/check_upstart_job delete mode 100755 files/nrpe-external-master/nagios_plugin.py diff --git a/files/nrpe-external-master/check_exit_status.pl b/files/nrpe-external-master/check_exit_status.pl deleted file mode 100755 index 49df22d8..00000000 --- a/files/nrpe-external-master/check_exit_status.pl +++ /dev/null @@ -1,189 +0,0 @@ -#!/usr/bin/perl -################################################################################ -# # -# Copyright (C) 2011 Chad Columbus # -# # -# This program is free software; you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation; either version 2 of the License, or # -# (at your option) any later version. # -# # -# This program is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details. # -# # -# You should have received a copy of the GNU General Public License # -# along with this program; if not, write to the Free Software # -# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA # -# # -################################################################################ - -use strict; -use Getopt::Std; -$| = 1; - -my %opts; -getopts('heronp:s:', \%opts); - -my $VERSION = "Version 1.0"; -my $AUTHOR = '(c) 2011 Chad Columbus '; - -# Default values: -my $script_to_check; -my $pattern = 'is running'; -my $cmd; -my $message; -my $error; - -# Exit codes -my $STATE_OK = 0; -my $STATE_WARNING = 1; -my $STATE_CRITICAL = 2; -my $STATE_UNKNOWN = 3; - -# Parse command line options -if ($opts{'h'} || scalar(%opts) == 0) { - &print_help(); - exit($STATE_OK); -} - -# Make sure scipt is provided: -if ($opts{'s'} eq '') { - # Script to run not provided - print "\nYou must provide a script to run. Example: -s /etc/init.d/httpd\n"; - exit($STATE_UNKNOWN); -} else { - $script_to_check = $opts{'s'}; -} - -# Make sure only a-z, 0-9, /, _, and - are used in the script. -if ($script_to_check =~ /[^a-z0-9\_\-\/\.]/) { - # Script contains illegal characters exit. - print "\nScript to check can only contain Letters, Numbers, Periods, Underscores, Hyphens, and/or Slashes\n"; - exit($STATE_UNKNOWN); -} - -# See if script is executable -if (! -x "$script_to_check") { - print "\nIt appears you can't execute $script_to_check, $!\n"; - exit($STATE_UNKNOWN); -} - -# If a pattern is provided use it: -if ($opts{'p'} ne '') { - $pattern = $opts{'p'}; -} - -# If -r run command via sudo as root: -if ($opts{'r'}) { - $cmd = "sudo -n $script_to_check status" . ' 2>&1'; -} else { - $cmd = "$script_to_check status" . ' 2>&1'; -} - -my $cmd_result = `$cmd`; -chomp($cmd_result); -if ($cmd_result =~ /sudo/i) { - # This means it could not run the sudo command - $message = "$script_to_check CRITICAL - Could not run: 'sudo -n $script_to_check status'. Result is $cmd_result"; - $error = $STATE_UNKNOWN; -} else { - # Check exitstatus instead of output: - if ($opts{'e'} == 1) { - if ($? != 0) { - # error - $message = "$script_to_check CRITICAL - Exit code: $?\."; - if ($opts{'o'} == 0) { - $message .= " $cmd_result"; - } - $error = $STATE_CRITICAL; - } else { - # success - $message = "$script_to_check OK - Exit code: $?\."; - if ($opts{'o'} == 0) { - $message .= " $cmd_result"; - } - $error = $STATE_OK; - } - } else { - my $not_check = 1; - if ($opts{'n'} == 1) { - $not_check = 0; - } - if (($cmd_result =~ /$pattern/i) == $not_check) { - $message = "$script_to_check OK"; - if ($opts{'o'} == 0) { - $message .= " - $cmd_result"; - } - $error = $STATE_OK; - } else { - $message = "$script_to_check CRITICAL"; - if ($opts{'o'} == 0) { - $message .= " - $cmd_result"; - } - $error = $STATE_CRITICAL; - } - } -} - -if ($message eq '') { - print "Error: program failed in an unknown way\n"; - exit($STATE_UNKNOWN); -} - -if ($error) { - print "$message\n"; - exit($error); -} else { - # If we get here we are OK - print "$message\n"; - exit($STATE_OK); -} - -#################################### -# Start Subs: -#################################### -sub print_help() { - print << "EOF"; -Check the output or exit status of a script. -$VERSION -$AUTHOR - -Options: --h - Print detailed help screen - --s - 'FULL PATH TO SCRIPT' (required) - This is the script to run, the script is designed to run scripts in the - /etc/init.d dir (but can run any script) and will call the script with - a 'status' argument. So if you use another script make sure it will - work with /path/script status, example: /etc/init.d/httpd status - --e - This is the "exitstaus" flag, it means check the exit status - code instead of looking for a pattern in the output of the script. - --p 'REGEX' - This is a pattern to look for in the output of the script to confirm it - is running, default is 'is running', but not all init.d scripts output - (iptables), so you can specify an arbitrary pattern. - All patterns are case insensitive. - --n - This is the "NOT" flag, it means not the -p pattern, so if you want to - make sure the output of the script does NOT contain -p 'REGEX' - --r - This is the "ROOT" flag, it means run as root via sudo. You will need a - line in your /etc/sudoers file like: - nagios ALL=(root) NOPASSWD: /etc/init.d/* status - --o - This is the "SUPPRESS OUTPUT" flag. Some programs have a long output - (like iptables), this flag suppresses that output so it is not printed - as a part of the nagios message. -EOF -} - diff --git a/files/nrpe-external-master/check_status_file.py b/files/nrpe-external-master/check_status_file.py deleted file mode 100755 index ba828087..00000000 --- a/files/nrpe-external-master/check_status_file.py +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/python - -# m -# mmmm m m mmmm mmmm mmm mm#mm -# #" "# # # #" "# #" "# #" # # -# # # # # # # # # #"""" # -# ##m#" "mm"# ##m#" ##m#" "#mm" "mm -# # # # -# " " " -# This file is managed by puppet. Do not make local changes. - -# -# Copyright 2014 Canonical Ltd. -# -# Author: Jacek Nykis -# - -import re -import nagios_plugin - - -def parse_args(): - import argparse - - parser = argparse.ArgumentParser( - description='Read file and return nagios status based on its content', - formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('-f', '--status-file', required=True, - help='Status file path') - parser.add_argument('-c', '--critical-text', default='CRITICAL', - help='String indicating critical status') - parser.add_argument('-w', '--warning-text', default='WARNING', - help='String indicating warning status') - parser.add_argument('-o', '--ok-text', default='OK', - help='String indicating OK status') - parser.add_argument('-u', '--unknown-text', default='UNKNOWN', - help='String indicating unknown status') - return parser.parse_args() - - -def check_status(args): - nagios_plugin.check_file_freshness(args.status_file, 43200) - - with open(args.status_file, "r") as f: - content = [l.strip() for l in f.readlines()] - - for line in content: - if re.search(args.critical_text, line): - raise nagios_plugin.CriticalError(line) - elif re.search(args.warning_text, line): - raise nagios_plugin.WarnError(line) - elif re.search(args.unknown_text, line): - raise nagios_plugin.UnknownError(line) - else: - print line - - -if __name__ == '__main__': - args = parse_args() - nagios_plugin.try_check(check_status, args) diff --git a/files/nrpe-external-master/check_upstart_job b/files/nrpe-external-master/check_upstart_job deleted file mode 100755 index 94efb95e..00000000 --- a/files/nrpe-external-master/check_upstart_job +++ /dev/null @@ -1,72 +0,0 @@ -#!/usr/bin/python - -# -# Copyright 2012, 2013 Canonical Ltd. -# -# Author: Paul Collins -# -# Based on http://www.eurion.net/python-snippets/snippet/Upstart%20service%20status.html -# - -import sys - -import dbus - - -class Upstart(object): - def __init__(self): - self._bus = dbus.SystemBus() - self._upstart = self._bus.get_object('com.ubuntu.Upstart', - '/com/ubuntu/Upstart') - def get_job(self, job_name): - path = self._upstart.GetJobByName(job_name, - dbus_interface='com.ubuntu.Upstart0_6') - return self._bus.get_object('com.ubuntu.Upstart', path) - - def get_properties(self, job): - path = job.GetInstance([], dbus_interface='com.ubuntu.Upstart0_6.Job') - instance = self._bus.get_object('com.ubuntu.Upstart', path) - return instance.GetAll('com.ubuntu.Upstart0_6.Instance', - dbus_interface=dbus.PROPERTIES_IFACE) - - def get_job_instances(self, job_name): - job = self.get_job(job_name) - paths = job.GetAllInstances([], dbus_interface='com.ubuntu.Upstart0_6.Job') - return [self._bus.get_object('com.ubuntu.Upstart', path) for path in paths] - - def get_job_instance_properties(self, job): - return job.GetAll('com.ubuntu.Upstart0_6.Instance', - dbus_interface=dbus.PROPERTIES_IFACE) - -try: - upstart = Upstart() - try: - job = upstart.get_job(sys.argv[1]) - props = upstart.get_properties(job) - - if props['state'] == 'running': - print 'OK: %s is running' % sys.argv[1] - sys.exit(0) - else: - print 'CRITICAL: %s is not running' % sys.argv[1] - sys.exit(2) - - except dbus.DBusException as e: - instances = upstart.get_job_instances(sys.argv[1]) - propses = [upstart.get_job_instance_properties(instance) for instance in instances] - states = dict([(props['name'], props['state']) for props in propses]) - if len(states) != states.values().count('running'): - not_running = [] - for name in states.keys(): - if states[name] != 'running': - not_running.append(name) - print 'CRITICAL: %d instances of %s not running: %s' % \ - (len(not_running), sys.argv[1], not_running.join(', ')) - sys.exit(2) - else: - print 'OK: %d instances of %s running' % (len(states), sys.argv[1]) - -except dbus.DBusException as e: - print 'CRITICAL: failed to get properties of \'%s\' from upstart' % sys.argv[1] - sys.exit(2) - diff --git a/files/nrpe-external-master/nagios_plugin.py b/files/nrpe-external-master/nagios_plugin.py deleted file mode 100755 index fc0d7b7b..00000000 --- a/files/nrpe-external-master/nagios_plugin.py +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env python -# Copyright (C) 2005, 2006, 2007, 2012 James Troup - -import os -import stat -import time -import traceback -import sys - - -################################################################################ - -class CriticalError(Exception): - """This indicates a critical error.""" - pass - - -class WarnError(Exception): - """This indicates a warning condition.""" - pass - - -class UnknownError(Exception): - """This indicates a unknown error was encountered.""" - pass - - -def try_check(function, *args, **kwargs): - """Perform a check with error/warn/unknown handling.""" - try: - function(*args, **kwargs) - except UnknownError, msg: - print msg - sys.exit(3) - except CriticalError, msg: - print msg - sys.exit(2) - except WarnError, msg: - print msg - sys.exit(1) - except: - print "%s raised unknown exception '%s'" % (function, sys.exc_info()[0]) - print '=' * 60 - traceback.print_exc(file=sys.stdout) - print '=' * 60 - sys.exit(3) - - -################################################################################ - -def check_file_freshness(filename, newer_than=600): - """Check a file exists, is readable and is newer than seconds (where defaults to 600).""" - # First check the file exists and is readable - if not os.path.exists(filename): - raise CriticalError("%s: does not exist." % (filename)) - if os.access(filename, os.R_OK) == 0: - raise CriticalError("%s: is not readable." % (filename)) - - # Then ensure the file is up-to-date enough - mtime = os.stat(filename)[stat.ST_MTIME] - last_modified = time.time() - mtime - if last_modified > newer_than: - raise CriticalError("%s: was last modified on %s and is too old (> %s seconds)." - % (filename, time.ctime(mtime), newer_than)) - if last_modified < 0: - raise CriticalError("%s: was last modified on %s which is in the future." - % (filename, time.ctime(mtime))) - -################################################################################ diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index 97c8e418..422aae48 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -239,9 +239,9 @@ def update_nrpe_config(): checkpath = os.path.join(os.environ['CHARM_DIR'], 'files/nrpe-external-master', 'check_exit_status.pl'), - cron_template = '*/5 * * * * root %s -s \ -/etc/init.d/%s status > /var/lib/nagios/service-check-%s.txt\n' \ - % (checkpath[0], service, service) + cron_template = '*/5 * * * * root \ +/usr/local/lib/nagios/plugins/check_exit_status.pl -s /etc/init.d/%s \ +status > /var/lib/nagios/service-check-%s.txt\n' % (service, service) f = open(cronpath, 'w') f.write(cron_template) f.close() From b07d95c28d90b8168fd02a81c3e2d2ed9fc5ba35 Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Thu, 20 Nov 2014 12:16:45 +1000 Subject: [PATCH 09/12] [bradm] Add network namespace checks --- hooks/quantum_hooks.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index 422aae48..ebfda445 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -236,9 +236,6 @@ def update_nrpe_config(): ) elif os.path.exists(sysv_init): cronpath = '/etc/cron.d/nagios-service-check-%s' % service - checkpath = os.path.join(os.environ['CHARM_DIR'], - 'files/nrpe-external-master', - 'check_exit_status.pl'), cron_template = '*/5 * * * * root \ /usr/local/lib/nagios/plugins/check_exit_status.pl -s /etc/init.d/%s \ status > /var/lib/nagios/service-check-%s.txt\n' % (service, service) @@ -252,6 +249,18 @@ status > /var/lib/nagios/service-check-%s.txt\n' % (service, service) /var/lib/nagios/service-check-%s.txt' % service, ) + cronpath = '/etc/cron.d/nagios-netns-check' + cron_template = '*/5 * * * * root \ +/usr/local/lib/nagios/plugins/check_netns.sh \ +> /var/lib/nagios/netns-check.txt\n' + f = open(cronpath, 'w') + f.write(cron_template) + f.close() + nrpe.add_check( + shortname="netns", + description='Network Namespace check {%s}' % current_unit, + check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt' + ) nrpe.write() From 08455802632101abb104861f29b6909c179725c0 Mon Sep 17 00:00:00 2001 From: Liam Young Date: Fri, 9 Jan 2015 15:44:00 +0000 Subject: [PATCH 10/12] Fix unit tests --- unit_tests/test_quantum_hooks.py | 1 + 1 file changed, 1 insertion(+) diff --git a/unit_tests/test_quantum_hooks.py b/unit_tests/test_quantum_hooks.py index ee39c1bd..7b7b07a7 100644 --- a/unit_tests/test_quantum_hooks.py +++ b/unit_tests/test_quantum_hooks.py @@ -42,6 +42,7 @@ TO_PATCH = [ 'b64decode', 'is_relation_made', 'create_sysctl', + 'update_nrpe_config', ] From 5ee9df1cd50e04b31e3172e5f1250e63a9fd13de Mon Sep 17 00:00:00 2001 From: Liam Young Date: Mon, 12 Jan 2015 12:04:01 +0000 Subject: [PATCH 11/12] Use rnpe functions from charmhelpers --- .../charmhelpers/contrib/charmsupport/nrpe.py | 102 ++++++++++++++++-- .../contrib/charmsupport/volumes.py | 7 +- .../charmhelpers/contrib/hahelpers/cluster.py | 38 ++++--- hooks/charmhelpers/contrib/network/ufw.py | 11 ++ .../charmhelpers/contrib/openstack/context.py | 1 + .../charmhelpers/contrib/openstack/neutron.py | 10 +- hooks/charmhelpers/contrib/openstack/utils.py | 6 ++ .../contrib/storage/linux/ceph.py | 43 ++++++++ hooks/charmhelpers/core/host.py | 11 +- hooks/charmhelpers/fetch/__init__.py | 9 +- hooks/quantum_hooks.py | 61 +++-------- 11 files changed, 221 insertions(+), 78 deletions(-) diff --git a/hooks/charmhelpers/contrib/charmsupport/nrpe.py b/hooks/charmhelpers/contrib/charmsupport/nrpe.py index 51b62d39..f3a936d0 100644 --- a/hooks/charmhelpers/contrib/charmsupport/nrpe.py +++ b/hooks/charmhelpers/contrib/charmsupport/nrpe.py @@ -18,6 +18,7 @@ from charmhelpers.core.hookenv import ( log, relation_ids, relation_set, + relations_of_type, ) from charmhelpers.core.host import service @@ -54,6 +55,12 @@ from charmhelpers.core.host import service # juju-myservice-0 # If you're running multiple environments with the same services in them # this allows you to differentiate between them. +# nagios_servicegroups: +# default: "" +# type: string +# description: | +# A comma-separated list of nagios servicegroups. +# If left empty, the nagios_context will be used as the servicegroup # # 3. Add custom checks (Nagios plugins) to files/nrpe-external-master # @@ -125,9 +132,6 @@ define service {{ def _locate_cmd(self, check_cmd): search_path = ( - '/', - os.path.join(os.environ['CHARM_DIR'], - 'files/nrpe-external-master'), '/usr/lib/nagios/plugins', '/usr/local/lib/nagios/plugins', ) @@ -141,7 +145,7 @@ define service {{ log('Check command not found: {}'.format(parts[0])) return '' - def write(self, nagios_context, hostname): + def write(self, nagios_context, hostname, nagios_servicegroups=None): nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format( self.command) with open(nrpe_check_file, 'w') as nrpe_check_config: @@ -153,16 +157,21 @@ define service {{ log('Not writing service config as {} is not accessible'.format( NRPE.nagios_exportdir)) else: - self.write_service_config(nagios_context, hostname) + self.write_service_config(nagios_context, hostname, + nagios_servicegroups) - def write_service_config(self, nagios_context, hostname): + def write_service_config(self, nagios_context, hostname, + nagios_servicegroups=None): for f in os.listdir(NRPE.nagios_exportdir): if re.search('.*{}.cfg'.format(self.command), f): os.remove(os.path.join(NRPE.nagios_exportdir, f)) + if not nagios_servicegroups: + nagios_servicegroups = nagios_context + templ_vars = { 'nagios_hostname': hostname, - 'nagios_servicegroup': nagios_context, + 'nagios_servicegroup': nagios_servicegroups, 'description': self.description, 'shortname': self.shortname, 'command': self.command, @@ -186,6 +195,10 @@ class NRPE(object): super(NRPE, self).__init__() self.config = config() self.nagios_context = self.config['nagios_context'] + if 'nagios_servicegroups' in self.config: + self.nagios_servicegroups = self.config['nagios_servicegroups'] + else: + self.nagios_servicegroups = 'juju' self.unit_name = local_unit().replace('/', '-') if hostname: self.hostname = hostname @@ -211,7 +224,8 @@ class NRPE(object): nrpe_monitors = {} monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}} for nrpecheck in self.checks: - nrpecheck.write(self.nagios_context, self.hostname) + nrpecheck.write(self.nagios_context, self.hostname, + self.nagios_servicegroups) nrpe_monitors[nrpecheck.shortname] = { "command": nrpecheck.command, } @@ -220,3 +234,75 @@ class NRPE(object): for rid in relation_ids("local-monitors"): relation_set(relation_id=rid, monitors=yaml.dump(monitors)) + + +def get_nagios_hostcontext(relation_name='nrpe-external-master'): + """ + Query relation with nrpe subordinate, return the nagios_host_context + + :param str relation_name: Name of relation nrpe sub joined to + """ + for rel in relations_of_type(relation_name): + if 'nagios_hostname' in rel: + return rel['nagios_host_context'] + + +def get_nagios_hostname(relation_name='nrpe-external-master'): + """ + Query relation with nrpe subordinate, return the nagios_hostname + + :param str relation_name: Name of relation nrpe sub joined to + """ + for rel in relations_of_type(relation_name): + if 'nagios_hostname' in rel: + return rel['nagios_hostname'] + + +def get_nagios_unit_name(relation_name='nrpe-external-master'): + """ + Return the nagios unit name prepended with host_context if needed + + :param str relation_name: Name of relation nrpe sub joined to + """ + host_context = get_nagios_hostcontext(relation_name) + if host_context: + unit = "%s:%s" % (host_context, local_unit()) + else: + unit = local_unit() + return unit + + +def add_init_service_checks(nrpe, services, unit_name): + """ + Add checks for each service in list + + :param NRPE nrpe: NRPE object to add check to + :param list services: List of services to check + :param str unit_name: Unit name to use in check description + """ + for svc in services: + upstart_init = '/etc/init/%s.conf' % svc + sysv_init = '/etc/init.d/%s' % svc + if os.path.exists(upstart_init): + nrpe.add_check( + shortname=svc, + description='process check {%s}' % unit_name, + check_cmd='check_upstart_job %s' % svc + ) + elif os.path.exists(sysv_init): + cronpath = '/etc/cron.d/nagios-service-check-%s' % svc + cron_file = ('*/5 * * * * root ' + '/usr/local/lib/nagios/plugins/check_exit_status.pl ' + '-s /etc/init.d/%s status > ' + '/var/lib/nagios/service-check-%s.txt\n' % (svc, + svc) + ) + f = open(cronpath, 'w') + f.write(cron_file) + f.close() + nrpe.add_check( + shortname=svc, + description='process check {%s}' % unit_name, + check_cmd='check_status_file.py -f ' + '/var/lib/nagios/service-check-%s.txt' % svc, + ) diff --git a/hooks/charmhelpers/contrib/charmsupport/volumes.py b/hooks/charmhelpers/contrib/charmsupport/volumes.py index 0f905dff..d61aa47f 100644 --- a/hooks/charmhelpers/contrib/charmsupport/volumes.py +++ b/hooks/charmhelpers/contrib/charmsupport/volumes.py @@ -2,7 +2,8 @@ Functions for managing volumes in juju units. One volume is supported per unit. Subordinates may have their own storage, provided it is on its own partition. -Configuration stanzas: +Configuration stanzas:: + volume-ephemeral: type: boolean default: true @@ -20,7 +21,8 @@ Configuration stanzas: is 'true' and no volume-map value is set. Use 'juju set' to set a value and 'juju resolved' to complete configuration. -Usage: +Usage:: + from charmsupport.volumes import configure_volume, VolumeConfigurationError from charmsupport.hookenv import log, ERROR def post_mount_hook(): @@ -34,6 +36,7 @@ Usage: after_change=post_mount_hook) except VolumeConfigurationError: log('Storage could not be configured', ERROR) + ''' # XXX: Known limitations diff --git a/hooks/charmhelpers/contrib/hahelpers/cluster.py b/hooks/charmhelpers/contrib/hahelpers/cluster.py index 52ce4b7c..912b2fe3 100644 --- a/hooks/charmhelpers/contrib/hahelpers/cluster.py +++ b/hooks/charmhelpers/contrib/hahelpers/cluster.py @@ -13,6 +13,7 @@ clustering-related helpers. import subprocess import os + from socket import gethostname as get_unit_hostname import six @@ -28,12 +29,19 @@ from charmhelpers.core.hookenv import ( WARNING, unit_get, ) +from charmhelpers.core.decorators import ( + retry_on_exception, +) class HAIncompleteConfig(Exception): pass +class CRMResourceNotFound(Exception): + pass + + def is_elected_leader(resource): """ Returns True if the charm executing this is the elected cluster leader. @@ -68,24 +76,30 @@ def is_clustered(): return False -def is_crm_leader(resource): +@retry_on_exception(5, base_delay=2, exc_type=CRMResourceNotFound) +def is_crm_leader(resource, retry=False): """ Returns True if the charm calling this is the elected corosync leader, as returned by calling the external "crm" command. + + We allow this operation to be retried to avoid the possibility of getting a + false negative. See LP #1396246 for more info. """ - cmd = [ - "crm", "resource", - "show", resource - ] + cmd = ['crm', 'resource', 'show', resource] try: - status = subprocess.check_output(cmd).decode('UTF-8') + status = subprocess.check_output(cmd, stderr=subprocess.STDOUT) + if not isinstance(status, six.text_type): + status = six.text_type(status, "utf-8") except subprocess.CalledProcessError: - return False - else: - if get_unit_hostname() in status: - return True - else: - return False + status = None + + if status and get_unit_hostname() in status: + return True + + if status and "resource %s is NOT running" % (resource) in status: + raise CRMResourceNotFound("CRM resource %s not found" % (resource)) + + return False def is_leader(resource): diff --git a/hooks/charmhelpers/contrib/network/ufw.py b/hooks/charmhelpers/contrib/network/ufw.py index 1c56b93e..b51689f2 100644 --- a/hooks/charmhelpers/contrib/network/ufw.py +++ b/hooks/charmhelpers/contrib/network/ufw.py @@ -54,6 +54,17 @@ def enable(): if is_enabled(): return True + if not os.path.isdir('/proc/sys/net/ipv6'): + # disable IPv6 support in ufw + hookenv.log("This machine doesn't have IPv6 enabled", level="INFO") + exit_code = subprocess.call(['sed', '-i', 's/IPV6=yes/IPV6=no/g', + '/etc/default/ufw']) + if exit_code == 0: + hookenv.log('IPv6 support in ufw disabled', level='INFO') + else: + hookenv.log("Couldn't disable IPv6 support in ufw", level="ERROR") + raise Exception("Couldn't disable IPv6 support in ufw") + output = subprocess.check_output(['ufw', 'enable'], env={'LANG': 'en_US', 'PATH': os.environ['PATH']}) diff --git a/hooks/charmhelpers/contrib/openstack/context.py b/hooks/charmhelpers/contrib/openstack/context.py index eb108910..180bfad2 100644 --- a/hooks/charmhelpers/contrib/openstack/context.py +++ b/hooks/charmhelpers/contrib/openstack/context.py @@ -491,6 +491,7 @@ class HAProxyContext(OSContextGenerator): ctxt['haproxy_client_timeout'] = config('haproxy-client-timeout') if config('prefer-ipv6'): + ctxt['ipv6'] = True ctxt['local_host'] = 'ip6-localhost' ctxt['haproxy_host'] = '::' ctxt['stat_port'] = ':::8888' diff --git a/hooks/charmhelpers/contrib/openstack/neutron.py b/hooks/charmhelpers/contrib/openstack/neutron.py index 1446f637..095cc24b 100644 --- a/hooks/charmhelpers/contrib/openstack/neutron.py +++ b/hooks/charmhelpers/contrib/openstack/neutron.py @@ -152,9 +152,15 @@ def neutron_plugins(): database=config('neutron-database'), relation_prefix='neutron', ssl_dir=NEUTRON_CONF_DIR)], - 'services': ['calico-compute', 'bird', 'neutron-dhcp-agent'], + 'services': ['calico-felix', + 'bird', + 'neutron-dhcp-agent', + 'nova-api-metadata'], 'packages': [[headers_package()] + determine_dkms_package(), - ['calico-compute', 'bird', 'neutron-dhcp-agent']], + ['calico-compute', + 'bird', + 'neutron-dhcp-agent', + 'nova-api-metadata']], 'server_packages': ['neutron-server', 'calico-control'], 'server_services': ['neutron-server'] } diff --git a/hooks/charmhelpers/contrib/openstack/utils.py b/hooks/charmhelpers/contrib/openstack/utils.py index 44179679..ddd40ce5 100644 --- a/hooks/charmhelpers/contrib/openstack/utils.py +++ b/hooks/charmhelpers/contrib/openstack/utils.py @@ -53,6 +53,7 @@ UBUNTU_OPENSTACK_RELEASE = OrderedDict([ ('saucy', 'havana'), ('trusty', 'icehouse'), ('utopic', 'juno'), + ('vivid', 'kilo'), ]) @@ -64,6 +65,7 @@ OPENSTACK_CODENAMES = OrderedDict([ ('2013.2', 'havana'), ('2014.1', 'icehouse'), ('2014.2', 'juno'), + ('2015.1', 'kilo'), ]) # The ugly duckling @@ -84,6 +86,7 @@ SWIFT_CODENAMES = OrderedDict([ ('2.0.0', 'juno'), ('2.1.0', 'juno'), ('2.2.0', 'juno'), + ('2.2.1', 'kilo'), ]) DEFAULT_LOOPBACK_SIZE = '5G' @@ -289,6 +292,9 @@ def configure_installation_source(rel): 'juno': 'trusty-updates/juno', 'juno/updates': 'trusty-updates/juno', 'juno/proposed': 'trusty-proposed/juno', + 'kilo': 'trusty-updates/kilo', + 'kilo/updates': 'trusty-updates/kilo', + 'kilo/proposed': 'trusty-proposed/kilo', } try: diff --git a/hooks/charmhelpers/contrib/storage/linux/ceph.py b/hooks/charmhelpers/contrib/storage/linux/ceph.py index d47dc228..1479f4f3 100644 --- a/hooks/charmhelpers/contrib/storage/linux/ceph.py +++ b/hooks/charmhelpers/contrib/storage/linux/ceph.py @@ -372,3 +372,46 @@ def ceph_version(): return None else: return None + + +class CephBrokerRq(object): + """Ceph broker request. + + Multiple operations can be added to a request and sent to the Ceph broker + to be executed. + + Request is json-encoded for sending over the wire. + + The API is versioned and defaults to version 1. + """ + def __init__(self, api_version=1): + self.api_version = api_version + self.ops = [] + + def add_op_create_pool(self, name, replica_count=3): + self.ops.append({'op': 'create-pool', 'name': name, + 'replicas': replica_count}) + + @property + def request(self): + return json.dumps({'api-version': self.api_version, 'ops': self.ops}) + + +class CephBrokerRsp(object): + """Ceph broker response. + + Response is json-decoded and contents provided as methods/properties. + + The API is versioned and defaults to version 1. + """ + def __init__(self, encoded_rsp): + self.api_version = None + self.rsp = json.loads(encoded_rsp) + + @property + def exit_code(self): + return self.rsp.get('exit-code') + + @property + def exit_msg(self): + return self.rsp.get('stderr') diff --git a/hooks/charmhelpers/core/host.py b/hooks/charmhelpers/core/host.py index c6f1680a..5221120c 100644 --- a/hooks/charmhelpers/core/host.py +++ b/hooks/charmhelpers/core/host.py @@ -162,13 +162,16 @@ def mkdir(path, owner='root', group='root', perms=0o555, force=False): uid = pwd.getpwnam(owner).pw_uid gid = grp.getgrnam(group).gr_gid realpath = os.path.abspath(path) - if os.path.exists(realpath): - if force and not os.path.isdir(realpath): + path_exists = os.path.exists(realpath) + if path_exists and force: + if not os.path.isdir(realpath): log("Removing non-directory file {} prior to mkdir()".format(path)) os.unlink(realpath) - else: + os.makedirs(realpath, perms) + os.chown(realpath, uid, gid) + elif not path_exists: os.makedirs(realpath, perms) - os.chown(realpath, uid, gid) + os.chown(realpath, uid, gid) def write_file(path, content, owner='root', group='root', perms=0o444): diff --git a/hooks/charmhelpers/fetch/__init__.py b/hooks/charmhelpers/fetch/__init__.py index 0a126fc3..aceadea4 100644 --- a/hooks/charmhelpers/fetch/__init__.py +++ b/hooks/charmhelpers/fetch/__init__.py @@ -64,9 +64,16 @@ CLOUD_ARCHIVE_POCKETS = { 'trusty-juno/updates': 'trusty-updates/juno', 'trusty-updates/juno': 'trusty-updates/juno', 'juno/proposed': 'trusty-proposed/juno', - 'juno/proposed': 'trusty-proposed/juno', 'trusty-juno/proposed': 'trusty-proposed/juno', 'trusty-proposed/juno': 'trusty-proposed/juno', + # Kilo + 'kilo': 'trusty-updates/kilo', + 'trusty-kilo': 'trusty-updates/kilo', + 'trusty-kilo/updates': 'trusty-updates/kilo', + 'trusty-updates/kilo': 'trusty-updates/kilo', + 'kilo/proposed': 'trusty-proposed/kilo', + 'trusty-kilo/proposed': 'trusty-proposed/kilo', + 'trusty-proposed/kilo': 'trusty-proposed/kilo', } # The order of this list is very important. Handlers should be listed in from diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py index ea4ec9dc..022ed747 100755 --- a/hooks/quantum_hooks.py +++ b/hooks/quantum_hooks.py @@ -1,7 +1,6 @@ #!/usr/bin/python from base64 import b64decode -import os from charmhelpers.core.hookenv import ( log, ERROR, WARNING, @@ -10,8 +9,6 @@ from charmhelpers.core.hookenv import ( relation_get, relation_set, relation_ids, - relations_of_type, - local_unit, unit_get, Hooks, UnregisteredHookError ) @@ -38,7 +35,7 @@ from charmhelpers.contrib.openstack.utils import ( from charmhelpers.payload.execd import execd_preinstall from charmhelpers.core.sysctl import create as create_sysctl -from charmhelpers.contrib.charmsupport.nrpe import NRPE +from charmhelpers.contrib.charmsupport import nrpe import sys from quantum_utils import ( @@ -224,61 +221,27 @@ def stop(): @hooks.hook('nrpe-external-master-relation-joined', 'nrpe-external-master-relation-changed') def update_nrpe_config(): - # Find out if nrpe set nagios_hostname - hostname = None - host_context = None - for rel in relations_of_type('nrpe-external-master'): - if 'nagios_hostname' in rel: - hostname = rel['nagios_hostname'] - host_context = rel['nagios_host_context'] - break - nrpe = NRPE(hostname=hostname) + # python-dbus is used by check_upstart_job apt_install('python-dbus') - - if host_context: - current_unit = "%s:%s" % (host_context, local_unit()) - else: - current_unit = local_unit() - - services_to_monitor = services() - for service in services_to_monitor: - upstart_init = '/etc/init/%s.conf' % service - sysv_init = '/etc/init.d/%s' % service - - if os.path.exists(upstart_init): - nrpe.add_check( - shortname=service, - description='process check {%s}' % current_unit, - check_cmd='check_upstart_job %s' % service, - ) - elif os.path.exists(sysv_init): - cronpath = '/etc/cron.d/nagios-service-check-%s' % service - cron_template = '*/5 * * * * root \ -/usr/local/lib/nagios/plugins/check_exit_status.pl -s /etc/init.d/%s \ -status > /var/lib/nagios/service-check-%s.txt\n' % (service, service) - f = open(cronpath, 'w') - f.write(cron_template) - f.close() - nrpe.add_check( - shortname=service, - description='process check {%s}' % current_unit, - check_cmd='check_status_file.py -f \ -/var/lib/nagios/service-check-%s.txt' % service, - ) + hostname = nrpe.get_nagios_hostname() + current_unit = nrpe.get_nagios_unit_name() + nrpe_setup = nrpe.NRPE(hostname=hostname) + nrpe.add_init_service_checks(nrpe_setup, services(), current_unit) cronpath = '/etc/cron.d/nagios-netns-check' - cron_template = '*/5 * * * * root \ -/usr/local/lib/nagios/plugins/check_netns.sh \ -> /var/lib/nagios/netns-check.txt\n' + cron_template = ('*/5 * * * * root ' + '/usr/local/lib/nagios/plugins/check_netns.sh ' + '> /var/lib/nagios/netns-check.txt\n' + ) f = open(cronpath, 'w') f.write(cron_template) f.close() - nrpe.add_check( + nrpe_setup.add_check( shortname="netns", description='Network Namespace check {%s}' % current_unit, check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt' ) - nrpe.write() + nrpe_setup.write() if __name__ == '__main__': From b88f5de1b6f52e57e4dc18749f9942a60bcf5243 Mon Sep 17 00:00:00 2001 From: Liam Young Date: Mon, 12 Jan 2015 12:04:50 +0000 Subject: [PATCH 12/12] Add decorators.py after charmhelpers sync --- hooks/charmhelpers/core/decorators.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) create mode 100644 hooks/charmhelpers/core/decorators.py diff --git a/hooks/charmhelpers/core/decorators.py b/hooks/charmhelpers/core/decorators.py new file mode 100644 index 00000000..029a4ef4 --- /dev/null +++ b/hooks/charmhelpers/core/decorators.py @@ -0,0 +1,41 @@ +# +# Copyright 2014 Canonical Ltd. +# +# Authors: +# Edward Hope-Morley +# + +import time + +from charmhelpers.core.hookenv import ( + log, + INFO, +) + + +def retry_on_exception(num_retries, base_delay=0, exc_type=Exception): + """If the decorated function raises exception exc_type, allow num_retries + retry attempts before raise the exception. + """ + def _retry_on_exception_inner_1(f): + def _retry_on_exception_inner_2(*args, **kwargs): + retries = num_retries + multiplier = 1 + while True: + try: + return f(*args, **kwargs) + except exc_type: + if not retries: + raise + + delay = base_delay * multiplier + multiplier += 1 + log("Retrying '%s' %d more times (delay=%s)" % + (f.__name__, retries, delay), level=INFO) + retries -= 1 + if delay: + time.sleep(delay) + + return _retry_on_exception_inner_2 + + return _retry_on_exception_inner_1