From ee991c755421d83bc06f833e20094eda7ac585cc Mon Sep 17 00:00:00 2001 From: Brad Marshall Date: Thu, 30 Oct 2014 16:57:10 +1000 Subject: [PATCH] [bradm] Initial nrpe checks --- charm-helpers-hooks.yaml | 1 + config.yaml | 11 + files/nagios/check_ceph_status.py | 44 ++++ files/nagios/collect_ceph_status.sh | 18 ++ .../contrib/charmsupport/__init__.py | 0 .../charmhelpers/contrib/charmsupport/nrpe.py | 219 ++++++++++++++++++ .../contrib/charmsupport/volumes.py | 156 +++++++++++++ hooks/hooks.py | 46 +++- hooks/nrpe-external-master-relation-changed | 1 + hooks/nrpe-external-master-relation-joined | 1 + metadata.yaml | 4 + 11 files changed, 500 insertions(+), 1 deletion(-) create mode 100755 files/nagios/check_ceph_status.py create mode 100755 files/nagios/collect_ceph_status.sh create mode 100644 hooks/charmhelpers/contrib/charmsupport/__init__.py create mode 100644 hooks/charmhelpers/contrib/charmsupport/nrpe.py create mode 100644 hooks/charmhelpers/contrib/charmsupport/volumes.py create mode 120000 hooks/nrpe-external-master-relation-changed create mode 120000 hooks/nrpe-external-master-relation-joined diff --git a/charm-helpers-hooks.yaml b/charm-helpers-hooks.yaml index afb9e42..f697867 100644 --- a/charm-helpers-hooks.yaml +++ b/charm-helpers-hooks.yaml @@ -8,3 +8,4 @@ include: - payload.execd - contrib.openstack.alternatives - contrib.network.ip + - contrib.charmsupport diff --git a/config.yaml b/config.yaml index 95e9485..1581052 100644 --- a/config.yaml +++ b/config.yaml @@ -155,3 +155,14 @@ options: order for this charm to function correctly, the privacy extension must be disabled and a non-temporary address must be configured/available on your network interface. + nagios_context: + default: "juju" + type: string + description: | + Used by the nrpe-external-master subordinate charm. + A string that will be prepended to instance name to set the host name + in nagios. 
So for instance the hostname would be something like: + juju-myservice-0 + If you're running multiple environments with the same services in them + this allows you to differentiate between them. + diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py new file mode 100755 index 0000000..cb8d1a1 --- /dev/null +++ b/files/nagios/check_ceph_status.py @@ -0,0 +1,44 @@ +#!/usr/bin/env python + +# Copyright (C) 2014 Canonical +# All Rights Reserved +# Author: Jacek Nykis + +import re +import argparse +import subprocess +import nagios_plugin + + +def check_ceph_status(args): + if args.status_file: + nagios_plugin.check_file_freshness(args.status_file, 3600) + with open(args.status_file, "r") as f: + lines = f.readlines() + status_data = dict(l.strip().split(' ', 1) for l in lines if len(l) > 1) + else: + lines = subprocess.check_output(["ceph", "status"]).split('\n') + status_data = dict(l.strip().split(' ', 1) for l in lines if len(l) > 1) + + if ('health' not in status_data + or 'monmap' not in status_data + or 'osdmap'not in status_data): + raise nagios_plugin.UnknownError('UNKNOWN: status data is incomplete') + + if status_data['health'] != 'HEALTH_OK': + msg = 'CRITICAL: ceph health status: "{}"'.format(status_data['health']) + raise nagios_plugin.CriticalError(msg) + osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap']) + if int(osds.group(1)) > int(osds.group(2)): # not all OSDs are "up" + msg = 'CRITICAL: Some OSDs are not up.
Total: {}, up: {}'.format( + osds.group(1), osds.group(2)) + raise nagios_plugin.CriticalError(msg) + print "All OK" + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Check ceph status') + parser.add_argument('-f', '--file', dest='status_file', + default=False, help='Optional file with "ceph status" output') + args = parser.parse_args() + nagios_plugin.try_check(check_ceph_status, args) diff --git a/files/nagios/collect_ceph_status.sh b/files/nagios/collect_ceph_status.sh new file mode 100755 index 0000000..dbdd3ac --- /dev/null +++ b/files/nagios/collect_ceph_status.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# Copyright (C) 2014 Canonical +# All Rights Reserved +# Author: Jacek Nykis + +LOCK=/var/lock/ceph-status.lock +lockfile-create -r2 --lock-name $LOCK > /dev/null 2>&1 +if [ $? -ne 0 ]; then + exit 1 +fi +trap "rm -f $LOCK > /dev/null 2>&1" exit + +DATA_DIR="/var/lib/nagios" +if [ ! -d $DATA_DIR ]; then + mkdir -p $DATA_DIR +fi + +ceph status >${DATA_DIR}/cat-ceph-status.txt diff --git a/hooks/charmhelpers/contrib/charmsupport/__init__.py b/hooks/charmhelpers/contrib/charmsupport/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/hooks/charmhelpers/contrib/charmsupport/nrpe.py b/hooks/charmhelpers/contrib/charmsupport/nrpe.py new file mode 100644 index 0000000..1815dad --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/nrpe.py @@ -0,0 +1,219 @@ +"""Compatibility with the nrpe-external-master charm""" +# Copyright 2012 Canonical Ltd. +# +# Authors: +# Matthew Wedgwood + +import subprocess +import pwd +import grp +import os +import re +import shlex +import yaml + +from charmhelpers.core.hookenv import ( + config, + local_unit, + log, + relation_ids, + relation_set, +) + +from charmhelpers.core.host import service + +# This module adds compatibility with the nrpe-external-master and plain nrpe +# subordinate charms. To use it in your charm: +# +# 1. Update metadata.yaml +# +# provides: +# (...) 
+# nrpe-external-master: +# interface: nrpe-external-master +# scope: container +# +# and/or +# +# provides: +# (...) +# local-monitors: +# interface: local-monitors +# scope: container + +# +# 2. Add the following to config.yaml +# +# nagios_context: +# default: "juju" +# type: string +# description: | +# Used by the nrpe subordinate charms. +# A string that will be prepended to instance name to set the host name +# in nagios. So for instance the hostname would be something like: +# juju-myservice-0 +# If you're running multiple environments with the same services in them +# this allows you to differentiate between them. +# +# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master +# +# 4. Update your hooks.py with something like this: +# +# from charmsupport.nrpe import NRPE +# (...) +# def update_nrpe_config(): +# nrpe_compat = NRPE() +# nrpe_compat.add_check( +# shortname = "myservice", +# description = "Check MyService", +# check_cmd = "check_http -w 2 -c 10 http://localhost" +# ) +# nrpe_compat.add_check( +# "myservice_other", +# "Check for widget failures", +# check_cmd = "/srv/myapp/scripts/widget_check" +# ) +# nrpe_compat.write() +# +# def config_changed(): +# (...) +# update_nrpe_config() +# +# def nrpe_external_master_relation_changed(): +# update_nrpe_config() +# +# def local_monitors_relation_changed(): +# update_nrpe_config() +# +# 5. 
ln -s hooks.py nrpe-external-master-relation-changed +# ln -s hooks.py local-monitors-relation-changed + + +class CheckException(Exception): + pass + + +class Check(object): + shortname_re = '[A-Za-z0-9-_]+$' + service_template = (""" +#--------------------------------------------------- +# This file is Juju managed +#--------------------------------------------------- +define service {{ + use active-service + host_name {nagios_hostname} + service_description {nagios_hostname}[{shortname}] """ + """{description} + check_command check_nrpe!{command} + servicegroups {nagios_servicegroup} +}} +""") + + def __init__(self, shortname, description, check_cmd): + super(Check, self).__init__() + # XXX: could be better to calculate this from the service name + if not re.match(self.shortname_re, shortname): + raise CheckException("shortname must match {}".format( + Check.shortname_re)) + self.shortname = shortname + self.command = "check_{}".format(shortname) + # Note: a set of invalid characters is defined by the + # Nagios server config + # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()= + self.description = description + self.check_cmd = self._locate_cmd(check_cmd) + + def _locate_cmd(self, check_cmd): + search_path = ( + '/usr/lib/nagios/plugins', + '/usr/local/lib/nagios/plugins', + ) + parts = shlex.split(check_cmd) + for path in search_path: + if os.path.exists(os.path.join(path, parts[0])): + command = os.path.join(path, parts[0]) + if len(parts) > 1: + command += " " + " ".join(parts[1:]) + return command + log('Check command not found: {}'.format(parts[0])) + return '' + + def write(self, nagios_context, hostname): + nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format( + self.command) + with open(nrpe_check_file, 'w') as nrpe_check_config: + nrpe_check_config.write("# check {}\n".format(self.shortname)) + nrpe_check_config.write("command[{}]={}\n".format( + self.command, self.check_cmd)) + + if not os.path.exists(NRPE.nagios_exportdir): + log('Not 
writing service config as {} is not accessible'.format( + NRPE.nagios_exportdir)) + else: + self.write_service_config(nagios_context, hostname) + + def write_service_config(self, nagios_context, hostname): + for f in os.listdir(NRPE.nagios_exportdir): + if re.search('.*{}.cfg'.format(self.command), f): + os.remove(os.path.join(NRPE.nagios_exportdir, f)) + + templ_vars = { + 'nagios_hostname': hostname, + 'nagios_servicegroup': nagios_context, + 'description': self.description, + 'shortname': self.shortname, + 'command': self.command, + } + nrpe_service_text = Check.service_template.format(**templ_vars) + nrpe_service_file = '{}/service__{}_{}.cfg'.format( + NRPE.nagios_exportdir, hostname, self.command) + with open(nrpe_service_file, 'w') as nrpe_service_config: + nrpe_service_config.write(str(nrpe_service_text)) + + def run(self): + subprocess.call(shlex.split(self.check_cmd)) + + +class NRPE(object): + nagios_logdir = '/var/log/nagios' + nagios_exportdir = '/var/lib/nagios/export' + nrpe_confdir = '/etc/nagios/nrpe.d' + + def __init__(self, hostname=None): + super(NRPE, self).__init__() + self.config = config() + self.nagios_context = self.config['nagios_context'] + self.unit_name = local_unit().replace('/', '-') + if hostname: + self.hostname = hostname + else: + self.hostname = "{}-{}".format(self.nagios_context, self.unit_name) + self.checks = [] + + def add_check(self, *args, **kwargs): + self.checks.append(Check(*args, **kwargs)) + + def write(self): + try: + nagios_uid = pwd.getpwnam('nagios').pw_uid + nagios_gid = grp.getgrnam('nagios').gr_gid + except KeyError: + log("Nagios user not set up, nrpe checks not updated") + return + + if not os.path.exists(NRPE.nagios_logdir): + os.mkdir(NRPE.nagios_logdir) + os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid) + + nrpe_monitors = {} + monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}} + for nrpecheck in self.checks: + nrpecheck.write(self.nagios_context, self.hostname) + nrpe_monitors[nrpecheck.shortname] = {
"command": nrpecheck.command, + } + + service('restart', 'nagios-nrpe-server') + + for rid in relation_ids("local-monitors"): + relation_set(relation_id=rid, monitors=yaml.dump(monitors)) diff --git a/hooks/charmhelpers/contrib/charmsupport/volumes.py b/hooks/charmhelpers/contrib/charmsupport/volumes.py new file mode 100644 index 0000000..0f905df --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/volumes.py @@ -0,0 +1,156 @@ +''' +Functions for managing volumes in juju units. One volume is supported per unit. +Subordinates may have their own storage, provided it is on its own partition. + +Configuration stanzas: + volume-ephemeral: + type: boolean + default: true + description: > + If false, a volume is mounted as specified in "volume-map" + If true, ephemeral storage will be used, meaning that log data + will only exist as long as the machine. YOU HAVE BEEN WARNED. + volume-map: + type: string + default: {} + description: > + YAML map of units to device names, e.g: + "{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }" + Service units will raise a configure-error if volume-ephemeral + is 'false' and no volume-map value is set. Use 'juju set' to set a + value and 'juju resolved' to complete configuration.
+ +Usage: + from charmsupport.volumes import configure_volume, VolumeConfigurationError + from charmsupport.hookenv import log, ERROR + def pre_mount_hook(): + stop_service('myservice') + def post_mount_hook(): + start_service('myservice') + + if __name__ == '__main__': + try: + configure_volume(before_change=pre_mount_hook, + after_change=post_mount_hook) + except VolumeConfigurationError: + log('Storage could not be configured', ERROR) +''' + +# XXX: Known limitations +# - fstab is neither consulted nor updated + +import os +from charmhelpers.core import hookenv +from charmhelpers.core import host +import yaml + + +MOUNT_BASE = '/srv/juju/volumes' + + +class VolumeConfigurationError(Exception): + '''Volume configuration data is missing or invalid''' + pass + + +def get_config(): + '''Gather and sanity-check volume configuration data''' + volume_config = {} + config = hookenv.config() + + errors = False + + if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'): + volume_config['ephemeral'] = True + else: + volume_config['ephemeral'] = False + + try: + volume_map = yaml.safe_load(config.get('volume-map', '{}')) + except yaml.YAMLError as e: + hookenv.log("Error parsing YAML volume-map: {}".format(e), + hookenv.ERROR) + errors = True + if volume_map is None: + # probably an empty string + volume_map = {} + elif not isinstance(volume_map, dict): + hookenv.log("Volume-map should be a dictionary, not {}".format( + type(volume_map))) + errors = True + + volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME']) + if volume_config['device'] and volume_config['ephemeral']: + # asked for ephemeral storage but also defined a volume ID + hookenv.log('A volume is defined for this unit, but ephemeral ' + 'storage was requested', hookenv.ERROR) + errors = True + elif not volume_config['device'] and not volume_config['ephemeral']: + # asked for permanent storage but did not define volume ID + hookenv.log('Permanent storage was requested, but
there is no volume ' + 'defined for this unit.', hookenv.ERROR) + errors = True + + unit_mount_name = hookenv.local_unit().replace('/', '-') + volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name) + + if errors: + return None + return volume_config + + +def mount_volume(config): + if os.path.exists(config['mountpoint']): + if not os.path.isdir(config['mountpoint']): + hookenv.log('Not a directory: {}'.format(config['mountpoint'])) + raise VolumeConfigurationError() + else: + host.mkdir(config['mountpoint']) + if os.path.ismount(config['mountpoint']): + unmount_volume(config) + if not host.mount(config['device'], config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def unmount_volume(config): + if os.path.ismount(config['mountpoint']): + if not host.umount(config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def managed_mounts(): + '''List of all mounted managed volumes''' + return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts()) + + +def configure_volume(before_change=lambda: None, after_change=lambda: None): + '''Set up storage (or don't) according to the charm's volume configuration. + Returns the mount point or "ephemeral". before_change and after_change + are optional functions to be called if the volume configuration changes. 
+ ''' + + config = get_config() + if not config: + hookenv.log('Failed to read volume configuration', hookenv.CRITICAL) + raise VolumeConfigurationError() + + if config['ephemeral']: + if os.path.ismount(config['mountpoint']): + before_change() + unmount_volume(config) + after_change() + return 'ephemeral' + else: + # persistent storage + if os.path.ismount(config['mountpoint']): + mounts = dict(managed_mounts()) + if mounts.get(config['mountpoint']) != config['device']: + before_change() + unmount_volume(config) + mount_volume(config) + after_change() + else: + before_change() + mount_volume(config) + after_change() + return config['mountpoint'] diff --git a/hooks/hooks.py b/hooks/hooks.py index 8a6c26c..a0befde 100755 --- a/hooks/hooks.py +++ b/hooks/hooks.py @@ -23,13 +23,16 @@ from charmhelpers.core.hookenv import ( relation_set, remote_unit, Hooks, UnregisteredHookError, - service_name + service_name, + relations_of_type ) from charmhelpers.core.host import ( service_restart, umount, mkdir, + write_file, + rsync, cmp_pkgrevno ) from charmhelpers.fetch import ( @@ -51,8 +54,15 @@ from utils import ( assert_charm_supports_ipv6 ) +from charmhelpers.contrib.charmsupport.nrpe import NRPE + hooks = Hooks() +NAGIOS_PLUGINS = '/usr/local/lib/nagios/plugins' +SCRIPTS_DIR = '/usr/local/bin' +STATUS_FILE = '/var/lib/nagios/cat-ceph-status.txt' +STATUS_CRONFILE = '/etc/cron.d/cat-ceph-health' + def install_upstart_scripts(): # Only install upstart configurations for older versions @@ -143,6 +153,9 @@ def config_changed(): reformat_osd(), config('ignore-device-errors')) ceph.start_osds(get_devices()) + if relations_of_type('nrpe-external-master'): + update_nrpe_config() + def get_mon_hosts(): hosts = [] @@ -307,6 +320,37 @@ def start(): ceph.start_osds(get_devices()) +@hooks.hook('nrpe-external-master-relation-joined') +@hooks.hook('nrpe-external-master-relation-changed') +def update_nrpe_config(): + log('Refreshing nagios checks') + if os.path.isdir(NAGIOS_PLUGINS): + 
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nagios', + 'check_ceph_status.py'), + os.path.join(NAGIOS_PLUGINS, 'check_ceph_status.py')) + + script = os.path.join(SCRIPTS_DIR, 'collect_ceph_status.sh') + rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', + 'nagios', 'collect_ceph_status.sh'), + script) + cronjob = "{} root {}\n".format('*/5 * * * *', script) + write_file(STATUS_CRONFILE, cronjob) + + # Find out if nrpe set nagios_hostname + hostname = None + for rel in relations_of_type('nrpe-external-master'): + if 'nagios_hostname' in rel: + hostname = rel['nagios_hostname'] + break + nrpe = NRPE(hostname=hostname) + nrpe.add_check( + shortname="ceph", + description='Check Ceph health', + check_cmd='check_ceph_status.py -f {}'.format(STATUS_FILE) + ) + nrpe.write() + + if __name__ == '__main__': try: hooks.execute(sys.argv) diff --git a/hooks/nrpe-external-master-relation-changed b/hooks/nrpe-external-master-relation-changed new file mode 120000 index 0000000..9416ca6 --- /dev/null +++ b/hooks/nrpe-external-master-relation-changed @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/hooks/nrpe-external-master-relation-joined b/hooks/nrpe-external-master-relation-joined new file mode 120000 index 0000000..9416ca6 --- /dev/null +++ b/hooks/nrpe-external-master-relation-joined @@ -0,0 +1 @@ +hooks.py \ No newline at end of file diff --git a/metadata.yaml b/metadata.yaml index 3d0a7f4..9fab75a 100644 --- a/metadata.yaml +++ b/metadata.yaml @@ -19,3 +19,7 @@ provides: interface: ceph-osd radosgw: interface: ceph-radosgw + nrpe-external-master: + interface: nrpe-external-master + scope: container + gets: [nagios_hostname, nagios_host_context]