[gnuoy,r=james-page] Add support for nrpe

This commit is contained in:
Liam Young 2015-01-12 17:43:48 +00:00
commit 30e0ba5bc9
19 changed files with 666 additions and 19 deletions

View File

@ -9,3 +9,4 @@ include:
- contrib.python.packages
- contrib.storage.linux
- payload.execd
- contrib.charmsupport

View File

@ -104,6 +104,16 @@ options:
default: nova
type: string
description: Database name
nagios_context:
default: "juju"
type: string
description: |
Used by the nrpe-external-master subordinate charm.
A string that will be prepended to instance name to set the host name
in nagios. So for instance the hostname would be something like:
juju-myservice-0
If you're running multiple environments with the same services in them
this allows you to differentiate between them.
# Network configuration options
# by default all access is over 'private-address'
os-data-network:

View File

@ -0,0 +1,308 @@
"""Compatibility with the nrpe-external-master charm"""
# Copyright 2012 Canonical Ltd.
#
# Authors:
# Matthew Wedgwood <matthew.wedgwood@canonical.com>
import subprocess
import pwd
import grp
import os
import re
import shlex
import yaml
from charmhelpers.core.hookenv import (
config,
local_unit,
log,
relation_ids,
relation_set,
relations_of_type,
)
from charmhelpers.core.host import service
# This module adds compatibility with the nrpe-external-master and plain nrpe
# subordinate charms. To use it in your charm:
#
# 1. Update metadata.yaml
#
# provides:
# (...)
# nrpe-external-master:
# interface: nrpe-external-master
# scope: container
#
# and/or
#
# provides:
# (...)
# local-monitors:
# interface: local-monitors
# scope: container
#
# 2. Add the following to config.yaml
#
# nagios_context:
# default: "juju"
# type: string
# description: |
# Used by the nrpe subordinate charms.
# A string that will be prepended to instance name to set the host name
# in nagios. So for instance the hostname would be something like:
# juju-myservice-0
# If you're running multiple environments with the same services in them
# this allows you to differentiate between them.
# nagios_servicegroups:
# default: ""
# type: string
# description: |
# A comma-separated list of nagios servicegroups.
# If left empty, the nagios_context will be used as the servicegroup
#
# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master
#
# 4. Update your hooks.py with something like this:
#
# from charmsupport.nrpe import NRPE
# (...)
# def update_nrpe_config():
# nrpe_compat = NRPE()
# nrpe_compat.add_check(
# shortname = "myservice",
# description = "Check MyService",
# check_cmd = "check_http -w 2 -c 10 http://localhost"
# )
# nrpe_compat.add_check(
# "myservice_other",
# "Check for widget failures",
# check_cmd = "/srv/myapp/scripts/widget_check"
# )
# nrpe_compat.write()
#
# def config_changed():
# (...)
# update_nrpe_config()
#
# def nrpe_external_master_relation_changed():
# update_nrpe_config()
#
# def local_monitors_relation_changed():
# update_nrpe_config()
#
# 5. ln -s hooks.py nrpe-external-master-relation-changed
# ln -s hooks.py local-monitors-relation-changed
class CheckException(Exception):
pass
class Check(object):
shortname_re = '[A-Za-z0-9-_]+$'
service_template = ("""
#---------------------------------------------------
# This file is Juju managed
#---------------------------------------------------
define service {{
use active-service
host_name {nagios_hostname}
service_description {nagios_hostname}[{shortname}] """
"""{description}
check_command check_nrpe!{command}
servicegroups {nagios_servicegroup}
}}
""")
def __init__(self, shortname, description, check_cmd):
super(Check, self).__init__()
# XXX: could be better to calculate this from the service name
if not re.match(self.shortname_re, shortname):
raise CheckException("shortname must match {}".format(
Check.shortname_re))
self.shortname = shortname
self.command = "check_{}".format(shortname)
# Note: a set of invalid characters is defined by the
# Nagios server config
# The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()=
self.description = description
self.check_cmd = self._locate_cmd(check_cmd)
def _locate_cmd(self, check_cmd):
search_path = (
'/usr/lib/nagios/plugins',
'/usr/local/lib/nagios/plugins',
)
parts = shlex.split(check_cmd)
for path in search_path:
if os.path.exists(os.path.join(path, parts[0])):
command = os.path.join(path, parts[0])
if len(parts) > 1:
command += " " + " ".join(parts[1:])
return command
log('Check command not found: {}'.format(parts[0]))
return ''
def write(self, nagios_context, hostname, nagios_servicegroups=None):
nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format(
self.command)
with open(nrpe_check_file, 'w') as nrpe_check_config:
nrpe_check_config.write("# check {}\n".format(self.shortname))
nrpe_check_config.write("command[{}]={}\n".format(
self.command, self.check_cmd))
if not os.path.exists(NRPE.nagios_exportdir):
log('Not writing service config as {} is not accessible'.format(
NRPE.nagios_exportdir))
else:
self.write_service_config(nagios_context, hostname,
nagios_servicegroups)
def write_service_config(self, nagios_context, hostname,
nagios_servicegroups=None):
for f in os.listdir(NRPE.nagios_exportdir):
if re.search('.*{}.cfg'.format(self.command), f):
os.remove(os.path.join(NRPE.nagios_exportdir, f))
if not nagios_servicegroups:
nagios_servicegroups = nagios_context
templ_vars = {
'nagios_hostname': hostname,
'nagios_servicegroup': nagios_servicegroups,
'description': self.description,
'shortname': self.shortname,
'command': self.command,
}
nrpe_service_text = Check.service_template.format(**templ_vars)
nrpe_service_file = '{}/service__{}_{}.cfg'.format(
NRPE.nagios_exportdir, hostname, self.command)
with open(nrpe_service_file, 'w') as nrpe_service_config:
nrpe_service_config.write(str(nrpe_service_text))
def run(self):
subprocess.call(self.check_cmd)
class NRPE(object):
nagios_logdir = '/var/log/nagios'
nagios_exportdir = '/var/lib/nagios/export'
nrpe_confdir = '/etc/nagios/nrpe.d'
def __init__(self, hostname=None):
super(NRPE, self).__init__()
self.config = config()
self.nagios_context = self.config['nagios_context']
if 'nagios_servicegroups' in self.config:
self.nagios_servicegroups = self.config['nagios_servicegroups']
else:
self.nagios_servicegroups = 'juju'
self.unit_name = local_unit().replace('/', '-')
if hostname:
self.hostname = hostname
else:
self.hostname = "{}-{}".format(self.nagios_context, self.unit_name)
self.checks = []
def add_check(self, *args, **kwargs):
self.checks.append(Check(*args, **kwargs))
def write(self):
try:
nagios_uid = pwd.getpwnam('nagios').pw_uid
nagios_gid = grp.getgrnam('nagios').gr_gid
except:
log("Nagios user not set up, nrpe checks not updated")
return
if not os.path.exists(NRPE.nagios_logdir):
os.mkdir(NRPE.nagios_logdir)
os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid)
nrpe_monitors = {}
monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}}
for nrpecheck in self.checks:
nrpecheck.write(self.nagios_context, self.hostname,
self.nagios_servicegroups)
nrpe_monitors[nrpecheck.shortname] = {
"command": nrpecheck.command,
}
service('restart', 'nagios-nrpe-server')
for rid in relation_ids("local-monitors"):
relation_set(relation_id=rid, monitors=yaml.dump(monitors))
def get_nagios_hostcontext(relation_name='nrpe-external-master'):
"""
Query relation with nrpe subordinate, return the nagios_host_context
:param str relation_name: Name of relation nrpe sub joined to
"""
for rel in relations_of_type(relation_name):
if 'nagios_hostname' in rel:
return rel['nagios_host_context']
def get_nagios_hostname(relation_name='nrpe-external-master'):
"""
Query relation with nrpe subordinate, return the nagios_hostname
:param str relation_name: Name of relation nrpe sub joined to
"""
for rel in relations_of_type(relation_name):
if 'nagios_hostname' in rel:
return rel['nagios_hostname']
def get_nagios_unit_name(relation_name='nrpe-external-master'):
"""
Return the nagios unit name prepended with host_context if needed
:param str relation_name: Name of relation nrpe sub joined to
"""
host_context = get_nagios_hostcontext(relation_name)
if host_context:
unit = "%s:%s" % (host_context, local_unit())
else:
unit = local_unit()
return unit
def add_init_service_checks(nrpe, services, unit_name):
"""
Add checks for each service in list
:param NRPE nrpe: NRPE object to add check to
:param list services: List of services to check
:param str unit_name: Unit name to use in check description
"""
for svc in services:
upstart_init = '/etc/init/%s.conf' % svc
sysv_init = '/etc/init.d/%s' % svc
if os.path.exists(upstart_init):
nrpe.add_check(
shortname=svc,
description='process check {%s}' % unit_name,
check_cmd='check_upstart_job %s' % svc
)
elif os.path.exists(sysv_init):
cronpath = '/etc/cron.d/nagios-service-check-%s' % svc
cron_file = ('*/5 * * * * root '
'/usr/local/lib/nagios/plugins/check_exit_status.pl '
'-s /etc/init.d/%s status > '
'/var/lib/nagios/service-check-%s.txt\n' % (svc,
svc)
)
f = open(cronpath, 'w')
f.write(cron_file)
f.close()
nrpe.add_check(
shortname=svc,
description='process check {%s}' % unit_name,
check_cmd='check_status_file.py -f '
'/var/lib/nagios/service-check-%s.txt' % svc,
)

View File

@ -0,0 +1,159 @@
'''
Functions for managing volumes in juju units. One volume is supported per unit.
Subordinates may have their own storage, provided it is on its own partition.
Configuration stanzas::
volume-ephemeral:
type: boolean
default: true
description: >
If false, a volume is mounted as sepecified in "volume-map"
If true, ephemeral storage will be used, meaning that log data
will only exist as long as the machine. YOU HAVE BEEN WARNED.
volume-map:
type: string
default: {}
description: >
YAML map of units to device names, e.g:
"{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }"
Service units will raise a configure-error if volume-ephemeral
is 'true' and no volume-map value is set. Use 'juju set' to set a
value and 'juju resolved' to complete configuration.
Usage::
from charmsupport.volumes import configure_volume, VolumeConfigurationError
from charmsupport.hookenv import log, ERROR
def post_mount_hook():
stop_service('myservice')
def post_mount_hook():
start_service('myservice')
if __name__ == '__main__':
try:
configure_volume(before_change=pre_mount_hook,
after_change=post_mount_hook)
except VolumeConfigurationError:
log('Storage could not be configured', ERROR)
'''
# XXX: Known limitations
# - fstab is neither consulted nor updated
import os
from charmhelpers.core import hookenv
from charmhelpers.core import host
import yaml
MOUNT_BASE = '/srv/juju/volumes'
class VolumeConfigurationError(Exception):
'''Volume configuration data is missing or invalid'''
pass
def get_config():
'''Gather and sanity-check volume configuration data'''
volume_config = {}
config = hookenv.config()
errors = False
if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'):
volume_config['ephemeral'] = True
else:
volume_config['ephemeral'] = False
try:
volume_map = yaml.safe_load(config.get('volume-map', '{}'))
except yaml.YAMLError as e:
hookenv.log("Error parsing YAML volume-map: {}".format(e),
hookenv.ERROR)
errors = True
if volume_map is None:
# probably an empty string
volume_map = {}
elif not isinstance(volume_map, dict):
hookenv.log("Volume-map should be a dictionary, not {}".format(
type(volume_map)))
errors = True
volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME'])
if volume_config['device'] and volume_config['ephemeral']:
# asked for ephemeral storage but also defined a volume ID
hookenv.log('A volume is defined for this unit, but ephemeral '
'storage was requested', hookenv.ERROR)
errors = True
elif not volume_config['device'] and not volume_config['ephemeral']:
# asked for permanent storage but did not define volume ID
hookenv.log('Ephemeral storage was requested, but there is no volume '
'defined for this unit.', hookenv.ERROR)
errors = True
unit_mount_name = hookenv.local_unit().replace('/', '-')
volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name)
if errors:
return None
return volume_config
def mount_volume(config):
if os.path.exists(config['mountpoint']):
if not os.path.isdir(config['mountpoint']):
hookenv.log('Not a directory: {}'.format(config['mountpoint']))
raise VolumeConfigurationError()
else:
host.mkdir(config['mountpoint'])
if os.path.ismount(config['mountpoint']):
unmount_volume(config)
if not host.mount(config['device'], config['mountpoint'], persist=True):
raise VolumeConfigurationError()
def unmount_volume(config):
if os.path.ismount(config['mountpoint']):
if not host.umount(config['mountpoint'], persist=True):
raise VolumeConfigurationError()
def managed_mounts():
'''List of all mounted managed volumes'''
return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts())
def configure_volume(before_change=lambda: None, after_change=lambda: None):
'''Set up storage (or don't) according to the charm's volume configuration.
Returns the mount point or "ephemeral". before_change and after_change
are optional functions to be called if the volume configuration changes.
'''
config = get_config()
if not config:
hookenv.log('Failed to read volume configuration', hookenv.CRITICAL)
raise VolumeConfigurationError()
if config['ephemeral']:
if os.path.ismount(config['mountpoint']):
before_change()
unmount_volume(config)
after_change()
return 'ephemeral'
else:
# persistent storage
if os.path.ismount(config['mountpoint']):
mounts = dict(managed_mounts())
if mounts.get(config['mountpoint']) != config['device']:
before_change()
unmount_volume(config)
mount_volume(config)
after_change()
else:
before_change()
mount_volume(config)
after_change()
return config['mountpoint']

View File

@ -13,6 +13,7 @@ clustering-related helpers.
import subprocess
import os
from socket import gethostname as get_unit_hostname
import six
@ -28,12 +29,19 @@ from charmhelpers.core.hookenv import (
WARNING,
unit_get,
)
from charmhelpers.core.decorators import (
retry_on_exception,
)
class HAIncompleteConfig(Exception):
pass
class CRMResourceNotFound(Exception):
pass
def is_elected_leader(resource):
"""
Returns True if the charm executing this is the elected cluster leader.
@ -68,24 +76,30 @@ def is_clustered():
return False
def is_crm_leader(resource):
@retry_on_exception(5, base_delay=2, exc_type=CRMResourceNotFound)
def is_crm_leader(resource, retry=False):
"""
Returns True if the charm calling this is the elected corosync leader,
as returned by calling the external "crm" command.
We allow this operation to be retried to avoid the possibility of getting a
false negative. See LP #1396246 for more info.
"""
cmd = [
"crm", "resource",
"show", resource
]
cmd = ['crm', 'resource', 'show', resource]
try:
status = subprocess.check_output(cmd).decode('UTF-8')
status = subprocess.check_output(cmd, stderr=subprocess.STDOUT)
if not isinstance(status, six.text_type):
status = six.text_type(status, "utf-8")
except subprocess.CalledProcessError:
return False
else:
if get_unit_hostname() in status:
return True
else:
return False
status = None
if status and get_unit_hostname() in status:
return True
if status and "resource %s is NOT running" % (resource) in status:
raise CRMResourceNotFound("CRM resource %s not found" % (resource))
return False
def is_leader(resource):

View File

@ -54,6 +54,17 @@ def enable():
if is_enabled():
return True
if not os.path.isdir('/proc/sys/net/ipv6'):
# disable IPv6 support in ufw
hookenv.log("This machine doesn't have IPv6 enabled", level="INFO")
exit_code = subprocess.call(['sed', '-i', 's/IPV6=yes/IPV6=no/g',
'/etc/default/ufw'])
if exit_code == 0:
hookenv.log('IPv6 support in ufw disabled', level='INFO')
else:
hookenv.log("Couldn't disable IPv6 support in ufw", level="ERROR")
raise Exception("Couldn't disable IPv6 support in ufw")
output = subprocess.check_output(['ufw', 'enable'],
env={'LANG': 'en_US',
'PATH': os.environ['PATH']})

View File

@ -491,6 +491,7 @@ class HAProxyContext(OSContextGenerator):
ctxt['haproxy_client_timeout'] = config('haproxy-client-timeout')
if config('prefer-ipv6'):
ctxt['ipv6'] = True
ctxt['local_host'] = 'ip6-localhost'
ctxt['haproxy_host'] = '::'
ctxt['stat_port'] = ':::8888'

View File

@ -152,9 +152,15 @@ def neutron_plugins():
database=config('neutron-database'),
relation_prefix='neutron',
ssl_dir=NEUTRON_CONF_DIR)],
'services': ['calico-compute', 'bird', 'neutron-dhcp-agent'],
'services': ['calico-felix',
'bird',
'neutron-dhcp-agent',
'nova-api-metadata'],
'packages': [[headers_package()] + determine_dkms_package(),
['calico-compute', 'bird', 'neutron-dhcp-agent']],
['calico-compute',
'bird',
'neutron-dhcp-agent',
'nova-api-metadata']],
'server_packages': ['neutron-server', 'calico-control'],
'server_services': ['neutron-server']
}

View File

@ -53,6 +53,7 @@ UBUNTU_OPENSTACK_RELEASE = OrderedDict([
('saucy', 'havana'),
('trusty', 'icehouse'),
('utopic', 'juno'),
('vivid', 'kilo'),
])
@ -64,6 +65,7 @@ OPENSTACK_CODENAMES = OrderedDict([
('2013.2', 'havana'),
('2014.1', 'icehouse'),
('2014.2', 'juno'),
('2015.1', 'kilo'),
])
# The ugly duckling
@ -84,6 +86,7 @@ SWIFT_CODENAMES = OrderedDict([
('2.0.0', 'juno'),
('2.1.0', 'juno'),
('2.2.0', 'juno'),
('2.2.1', 'kilo'),
])
DEFAULT_LOOPBACK_SIZE = '5G'
@ -289,6 +292,9 @@ def configure_installation_source(rel):
'juno': 'trusty-updates/juno',
'juno/updates': 'trusty-updates/juno',
'juno/proposed': 'trusty-proposed/juno',
'kilo': 'trusty-updates/kilo',
'kilo/updates': 'trusty-updates/kilo',
'kilo/proposed': 'trusty-proposed/kilo',
}
try:

View File

@ -372,3 +372,46 @@ def ceph_version():
return None
else:
return None
class CephBrokerRq(object):
"""Ceph broker request.
Multiple operations can be added to a request and sent to the Ceph broker
to be executed.
Request is json-encoded for sending over the wire.
The API is versioned and defaults to version 1.
"""
def __init__(self, api_version=1):
self.api_version = api_version
self.ops = []
def add_op_create_pool(self, name, replica_count=3):
self.ops.append({'op': 'create-pool', 'name': name,
'replicas': replica_count})
@property
def request(self):
return json.dumps({'api-version': self.api_version, 'ops': self.ops})
class CephBrokerRsp(object):
"""Ceph broker response.
Response is json-decoded and contents provided as methods/properties.
The API is versioned and defaults to version 1.
"""
def __init__(self, encoded_rsp):
self.api_version = None
self.rsp = json.loads(encoded_rsp)
@property
def exit_code(self):
return self.rsp.get('exit-code')
@property
def exit_msg(self):
return self.rsp.get('stderr')

View File

@ -0,0 +1,41 @@
#
# Copyright 2014 Canonical Ltd.
#
# Authors:
# Edward Hope-Morley <opentastic@gmail.com>
#
import time
from charmhelpers.core.hookenv import (
log,
INFO,
)
def retry_on_exception(num_retries, base_delay=0, exc_type=Exception):
"""If the decorated function raises exception exc_type, allow num_retries
retry attempts before raise the exception.
"""
def _retry_on_exception_inner_1(f):
def _retry_on_exception_inner_2(*args, **kwargs):
retries = num_retries
multiplier = 1
while True:
try:
return f(*args, **kwargs)
except exc_type:
if not retries:
raise
delay = base_delay * multiplier
multiplier += 1
log("Retrying '%s' %d more times (delay=%s)" %
(f.__name__, retries, delay), level=INFO)
retries -= 1
if delay:
time.sleep(delay)
return _retry_on_exception_inner_2
return _retry_on_exception_inner_1

View File

@ -162,13 +162,16 @@ def mkdir(path, owner='root', group='root', perms=0o555, force=False):
uid = pwd.getpwnam(owner).pw_uid
gid = grp.getgrnam(group).gr_gid
realpath = os.path.abspath(path)
if os.path.exists(realpath):
if force and not os.path.isdir(realpath):
path_exists = os.path.exists(realpath)
if path_exists and force:
if not os.path.isdir(realpath):
log("Removing non-directory file {} prior to mkdir()".format(path))
os.unlink(realpath)
else:
os.makedirs(realpath, perms)
os.chown(realpath, uid, gid)
elif not path_exists:
os.makedirs(realpath, perms)
os.chown(realpath, uid, gid)
os.chown(realpath, uid, gid)
def write_file(path, content, owner='root', group='root', perms=0o444):

View File

@ -64,9 +64,16 @@ CLOUD_ARCHIVE_POCKETS = {
'trusty-juno/updates': 'trusty-updates/juno',
'trusty-updates/juno': 'trusty-updates/juno',
'juno/proposed': 'trusty-proposed/juno',
'juno/proposed': 'trusty-proposed/juno',
'trusty-juno/proposed': 'trusty-proposed/juno',
'trusty-proposed/juno': 'trusty-proposed/juno',
# Kilo
'kilo': 'trusty-updates/kilo',
'trusty-kilo': 'trusty-updates/kilo',
'trusty-kilo/updates': 'trusty-updates/kilo',
'trusty-updates/kilo': 'trusty-updates/kilo',
'kilo/proposed': 'trusty-proposed/kilo',
'trusty-kilo/proposed': 'trusty-proposed/kilo',
'trusty-proposed/kilo': 'trusty-proposed/kilo',
}
# The order of this list is very important. Handlers should be listed in from

View File

@ -0,0 +1 @@
quantum_hooks.py

View File

@ -0,0 +1 @@
quantum_hooks.py

View File

@ -35,10 +35,13 @@ from charmhelpers.contrib.openstack.utils import (
from charmhelpers.payload.execd import execd_preinstall
from charmhelpers.core.sysctl import create as create_sysctl
from charmhelpers.contrib.charmsupport import nrpe
import sys
from quantum_utils import (
register_configs,
restart_map,
services,
do_openstack_upgrade,
get_packages,
get_early_packages,
@ -79,6 +82,7 @@ def config_changed():
global CONFIGS
if openstack_upgrade_available(get_common_package()):
CONFIGS = do_openstack_upgrade()
update_nrpe_config()
sysctl_dict = config('sysctl')
if sysctl_dict:
@ -213,6 +217,33 @@ def cluster_departed():
def stop():
stop_services()
@hooks.hook('nrpe-external-master-relation-joined',
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# python-dbus is used by check_upstart_job
apt_install('python-dbus')
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
cronpath = '/etc/cron.d/nagios-netns-check'
cron_template = ('*/5 * * * * root '
'/usr/local/lib/nagios/plugins/check_netns.sh '
'> /var/lib/nagios/netns-check.txt\n'
)
f = open(cronpath, 'w')
f.write(cron_template)
f.close()
nrpe_setup.add_check(
shortname="netns",
description='Network Namespace check {%s}' % current_unit,
check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt'
)
nrpe_setup.write()
if __name__ == '__main__':
try:
hooks.execute(sys.argv)

View File

@ -16,6 +16,9 @@ description: |
categories:
- openstack
provides:
nrpe-external-master:
interface: nrpe-external-master
scope: container
quantum-network-service:
interface: quantum
requires:

View File

@ -42,6 +42,7 @@ TO_PATCH = [
'b64decode',
'is_relation_made',
'create_sysctl',
'update_nrpe_config',
]