diff --git a/charm-helpers-hooks.yaml b/charm-helpers-hooks.yaml index f3734f7d..aa2e13bb 100644 --- a/charm-helpers-hooks.yaml +++ b/charm-helpers-hooks.yaml @@ -9,3 +9,4 @@ include: - contrib.python.packages - contrib.storage.linux - payload.execd + - contrib.charmsupport diff --git a/config.yaml b/config.yaml index c0bfda3f..51816eb1 100644 --- a/config.yaml +++ b/config.yaml @@ -104,6 +104,16 @@ options: default: nova type: string description: Database name + nagios_context: + default: "juju" + type: string + description: | + Used by the nrpe-external-master subordinate charm. + A string that will be prepended to instance name to set the host name + in nagios. So for instance the hostname would be something like: + juju-myservice-0 + If you're running multiple environments with the same services in them + this allows you to differentiate between them. # Network configuration options # by default all access is over 'private-address' os-data-network: diff --git a/files/NeutronAgentMon b/files/NeutronAgentMon deleted file mode 100755 index 048955f1..00000000 --- a/files/NeutronAgentMon +++ /dev/null @@ -1,155 +0,0 @@ -#!/bin/sh -# -# -# NeutronAgentMon OCF RA. -# Starts crm_mon in background which logs cluster status as -# html to the specified file. -# -# Copyright 2014 Canonical Ltd. -# -# Authors: Hui Xiang -# Edward Hope-Morley -# -# OCF instance parameters: -# OCF_RESKEY_file - -####################################################################### -# Initialization: -: ${OCF_FUNCTIONS=${OCF_ROOT}/resource.d/heartbeat/.ocf-shellfuncs} -. ${OCF_FUNCTIONS} -: ${__OCF_ACTION=$1} - -####################################################################### - -meta_data() { - cat < - - -1.0 - - -This is a NeutronAgentMon Resource Agent. -It monitors the 'neutron-ha-monitor daemon' status. - -Monitor '/usr/local/bin/neutron-ha-monitor.py' in the background. - - - - - -The file we want to run as a daemon. - -The file we want to run as a daemon. - - - - - - - - - - - - - -END -} - -####################################################################### - -NeutronAgentMon_usage() { - cat <> /dev/null 2>&1 & echo $! - sleep 5 - else - ocf_log warn "[NeutronAgentMon_start] Monitor daemon already running." - fi - NeutronAgentMon_exit $? -} - -NeutronAgentMon_stop() { - pid=`sudo ps -aux | grep neutron-ha-m\[o\]nitor.py | awk -F' ' '{print $2}'` - if [ ! -z $pid ]; then - sudo kill -s 9 $pid - ocf_log info "[NeutronAgentMon_stop] Pid $pid is killed." - else - ocf_log warn "[NeutronAgentMon_stop] Monitor daemon already stopped." - fi - NeutronAgentMon_exit 0 -} - -NeutronAgentMon_monitor() { - pid=`sudo ps -aux | grep neutron-ha-m\[o\]nitor.py | awk -F' ' '{print $2}'` - if [ ! -z $pid ]; then - ocf_log info "[NeutronAgentMon_monitor] success." - exit $OCF_SUCCESS - fi - exit $OCF_NOT_RUNNING -} - -NeutronAgentMon_validate() { -# Existence of the user - if [ -f $OCF_RESKEY_file ]; then - echo "Validate OK" - return $OCF_SUCCESS - else - ocf_log err "The file $OCF_RESKEY_file does not exist!" 
- exit $OCF_ERR_ARGS - fi -} - -if [ $# -ne 1 ]; then - NeutronAgentMon_usage - exit $OCF_ERR_ARGS -fi - -: ${OCF_RESKEY_update:="15000"} -: ${OCF_RESKEY_pidfile:="/tmp/NeutronAgentMon_${OCF_RESOURCE_INSTANCE}.pid"} -: ${OCF_RESKEY_htmlfile:="/tmp/NeutronAgentMon_${OCF_RESOURCE_INSTANCE}.html"} - -OCF_RESKEY_update=`expr $OCF_RESKEY_update / 1000` - -case $__OCF_ACTION in -meta-data) meta_data - exit $OCF_SUCCESS - ;; -start) NeutronAgentMon_start - ;; -stop) NeutronAgentMon_stop - ;; -monitor) NeutronAgentMon_monitor - ;; -validate-all) NeutronAgentMon_validate - ;; -usage|help) NeutronAgentMon_usage - exit $OCF_SUCCESS - ;; -*) NeutronAgentMon_usage - exit $OCF_ERR_UNIMPLEMENTED - ;; -esac - -exit $? diff --git a/files/neutron-ha-monitor.conf b/files/neutron-ha-monitor.conf deleted file mode 100644 index d62ad10a..00000000 --- a/files/neutron-ha-monitor.conf +++ /dev/null @@ -1,4 +0,0 @@ -[DEFAULT] -verbose=True -#debug=True -check_interval=8 diff --git a/files/neutron-ha-monitor.py b/files/neutron-ha-monitor.py deleted file mode 100644 index cd3d04f3..00000000 --- a/files/neutron-ha-monitor.py +++ /dev/null @@ -1,430 +0,0 @@ -# Copyright 2014 Canonical Ltd. -# -# Authors: Hui Xiang -# Joshua Zhang -# Edward Hope-Morley -# - -""" -Helpers for monitoring Neutron agents, reschedule failed agents, -cleaned resources on failed nodes. -""" - -import os -import re -import sys -import signal -import socket -import subprocess -import time - -from oslo.config import cfg -from neutron.agent.linux import ovs_lib -from neutron.agent.linux import ip_lib -from neutron.common import exceptions -from neutron.openstack.common import log as logging - -LOG = logging.getLogger(__name__) - - -class Daemon(object): - """A generic daemon class. - - Usage: subclass the Daemon class and override the run() method - """ - def __init__(self, stdin='/dev/null', stdout='/dev/null', - stderr='/dev/null', procname='python'): - self.stdin = stdin - self.stdout = stdout - self.stderr = stderr - self.procname = procname - - def _fork(self): - try: - pid = os.fork() - if pid > 0: - sys.exit(0) - except OSError: - LOG.exception('Fork failed') - sys.exit(1) - - def daemonize(self): - """Daemonize process by doing Stevens double fork.""" - # fork first time - self._fork() - - # decouple from parent environment - os.chdir("/") - os.setsid() - os.umask(0) - # fork second time - self._fork() - - # redirect standard file descriptors - sys.stdout.flush() - sys.stderr.flush() - stdin = open(self.stdin, 'r') - stdout = open(self.stdout, 'a+') - stderr = open(self.stderr, 'a+', 0) - os.dup2(stdin.fileno(), sys.stdin.fileno()) - os.dup2(stdout.fileno(), sys.stdout.fileno()) - os.dup2(stderr.fileno(), sys.stderr.fileno()) - - signal.signal(signal.SIGTERM, self.handle_sigterm) - - def handle_sigterm(self, signum, frame): - sys.exit(0) - - def start(self): - """Start the daemon.""" - self.daemonize() - self.run() - - def run(self): - """Override this method when subclassing Daemon. - - start() will call this method after the process has daemonized. 
- """ - pass - - -class MonitorNeutronAgentsDaemon(Daemon): - def __init__(self): - super(MonitorNeutronAgentsDaemon, self).__init__() - logging.setup('Neuron-HA-Monitor') - LOG.info('Monitor Neutron Agent Loop Init') - self.hostname = None - self.env = {} - - def get_env(self): - envrc_f = '/etc/legacy_ha_envrc' - envrc_f_m = False - if os.path.isfile(envrc_f): - ctime = time.ctime(os.stat(envrc_f).st_ctime) - mtime = time.ctime(os.stat(envrc_f).st_mtime) - if ctime != mtime: - envrc_f_m = True - - if not self.env or envrc_f_m: - with open(envrc_f, 'r') as f: - for line in f: - data = line.strip().split('=') - if data and data[0] and data[1]: - self.env[data[0]] = data[1] - else: - raise Exception("OpenStack env data uncomplete.") - return self.env - - def get_hostname(self): - if not self.hostname: - self.hostname = socket.gethostname() - return self.hostname - - def get_root_helper(self): - return 'sudo' - - def list_monitor_res(self): - # List crm resource 'cl_monitor' running node - nodes = [] - cmd = ['crm', 'resource', 'show', 'cl_monitor'] - output = subprocess.check_output(cmd) - pattern = re.compile('resource cl_monitor is running on: (.*) ') - nodes = pattern.findall(output) - return nodes - - def get_crm_res_lead_node(self): - nodes = self.list_monitor_res() - if nodes: - return nodes[0].strip() - else: - LOG.error('Failed to get crm resource.') - return None - - def unplug_device(self, device): - try: - device.link.delete() - except RuntimeError: - root_helper = self.get_root_helper() - # Maybe the device is OVS port, so try to delete - bridge_name = ovs_lib.get_bridge_for_iface(root_helper, - device.name) - if bridge_name: - bridge = ovs_lib.OVSBridge(bridge_name, root_helper) - bridge.delete_port(device.name) - else: - LOG.debug('Unable to find bridge for device: %s', device.name) - - def get_pattern(self, key, text): - if not key or not text: - LOG.debug('Invalid key(%s) or text(%s)' % (key, text)) - return None - - pattern = re.compile('%s' % key) - result = pattern.findall(text) - return result - - def _cleanup(self, key1, key2): - namespaces = [] - if key1: - for k in key1.iterkeys(): - namespaces.append(key2 + '-' + k) - else: - try: - cmd = ['sudo', 'ip', 'netns'] - ns = subprocess.check_output(cmd) - namespaces = self.get_pattern('(%s.*)' % key2, ns) - except RuntimeError as e: - LOG.error('Failed to list namespace, (%s)' % e) - - if namespaces: - LOG.info('Namespaces: %s is going to be deleted.' % namespaces) - self.destroy_namespaces(namespaces) - - def cleanup_dhcp(self, networks): - self._cleanup(networks, 'qdhcp') - - def cleanup_router(self, routers): - self._cleanup(routers, 'qrouter') - - def destroy_namespaces(self, namespaces): - try: - root_helper = self.get_root_helper() - for namespace in namespaces: - ip = ip_lib.IPWrapper(root_helper, namespace) - if ip.netns.exists(namespace): - for device in ip.get_devices(exclude_loopback=True): - self.unplug_device(device) - - ip.garbage_collect_namespace() - except Exception: - LOG.exception('Error unable to destroy namespace: %s', namespace) - - def is_same_host(self, host): - return str(host).strip() == self.get_hostname() - - def validate_reschedule(self): - crm_no_1_node = self.get_crm_res_lead_node() - if not crm_no_1_node: - LOG.error('No crm first node could be found.') - return False - - if not self.is_same_host(crm_no_1_node): - LOG.warn('Only the first crm node %s could reschedule. 
' - % crm_no_1_node) - return False - return True - - def l3_agents_reschedule(self, l3_agents, routers, quantum): - if not self.validate_reschedule(): - return - - index = 0 - for router_id in routers: - agent = index % len(l3_agents) - LOG.info('Moving router %s from %s to %s' % - (router_id, routers[router_id], l3_agents[agent])) - try: - quantum.remove_router_from_l3_agent(l3_agent=routers[router_id], - router_id=router_id) - except exceptions.NeutronException as e: - LOG.error('Remove router raised exception: %s' % e) - try: - quantum.add_router_to_l3_agent(l3_agent=l3_agents[agent], - body={'router_id': router_id}) - except exceptions.NeutronException as e: - LOG.error('Add router raised exception: %s' % e) - index += 1 - - def dhcp_agents_reschedule(self, dhcp_agents, networks, quantum): - if not self.validate_reschedule(): - return - - index = 0 - for network_id in networks: - agent = index % len(dhcp_agents) - LOG.info('Moving network %s from %s to %s' % (network_id, - networks[network_id], dhcp_agents[agent])) - try: - quantum.remove_network_from_dhcp_agent( - dhcp_agent=networks[network_id], network_id=network_id) - except exceptions.NeutronException as e: - LOG.error('Remove network raised exception: %s' % e) - try: - quantum.add_network_to_dhcp_agent( - dhcp_agent=dhcp_agents[agent], - body={'network_id': network_id}) - except exceptions.NeutronException as e: - LOG.error('Add network raised exception: %s' % e) - index += 1 - - def get_quantum_client(self): - env = self.get_env() - if not env: - LOG.info('Unable to re-assign resources at this time') - return None - - try: - from quantumclient.v2_0 import client - except ImportError: - # Try to import neutronclient instead for havana+ - from neutronclient.v2_0 import client - - auth_url = '%(auth_protocol)s://%(keystone_host)s:%(auth_port)s/v2.0' \ - % env - quantum = client.Client(username=env['service_username'], - password=env['service_password'], - tenant_name=env['service_tenant'], - auth_url=auth_url, - region_name=env['region']) - return quantum - - def reassign_agent_resources(self, quantum=None): - """Use agent scheduler API to detect down agents and re-schedule""" - if not quantum: - LOG.error('Failed to get quantum client.') - return - - try: - DHCP_AGENT = "DHCP Agent" - L3_AGENT = "L3 Agent" - agents = quantum.list_agents(agent_type=DHCP_AGENT) - except exceptions.NeutronException as e: - LOG.error('Failed to get quantum agents, %s' % e) - return - - dhcp_agents = [] - l3_agents = [] - networks = {} - for agent in agents['agents']: - hosted_networks = quantum.list_networks_on_dhcp_agent( - agent['id'])['networks'] - if not agent['alive']: - LOG.info('DHCP Agent %s down' % agent['id']) - for network in hosted_networks: - networks[network['id']] = agent['id'] - if self.is_same_host(agent['host']): - self.cleanup_dhcp(networks) - else: - dhcp_agents.append(agent['id']) - LOG.info('Active dhcp agents: %s' % agent['id']) - if not hosted_networks and self.is_same_host(agent['host']): - self.cleanup_dhcp(None) - - agents = quantum.list_agents(agent_type=L3_AGENT) - routers = {} - for agent in agents['agents']: - hosted_routers = quantum.list_routers_on_l3_agent( - agent['id'])['routers'] - if not agent['alive']: - LOG.info('L3 Agent %s down' % agent['id']) - for router in hosted_routers: - routers[router['id']] = agent['id'] - if self.is_same_host(agent['host']): - self.cleanup_router(routers) - else: - l3_agents.append(agent['id']) - LOG.info('Active l3 agents: %s' % agent['id']) - if not hosted_routers and 
self.is_same_host(agent['host']): - self.cleanup_router(None) - - if not networks and not routers: - LOG.info('No networks and routers hosted on failed agents.') - return - - if len(dhcp_agents) == 0 and len(l3_agents) == 0: - LOG.error('Unable to relocate resources, there are %s dhcp_agents ' - 'and %s l3_agents in this cluster' % (len(dhcp_agents), - len(l3_agents))) - return - - if len(l3_agents) > 0: - self.l3_agents_reschedule(l3_agents, routers, quantum) - # new l3 node will not create a tunnel if don't restart ovs process - - if len(dhcp_agents) > 0: - self.dhcp_agents_reschedule(dhcp_agents, networks, quantum) - - - def check_ovs_tunnel(self, quantum=None): - if not quantum: - LOG.error('Failed to get quantum client.') - return - - try: - OVS_AGENT = 'Open vSwitch agent' - agents = quantum.list_agents(agent_type=OVS_AGENT) - except exceptions.NeutronException as e: - LOG.error('No ovs agent found on localhost, error:%s.' % e) - return - - for agent in agents['agents']: - if self.is_same_host(agent['host']): - conf = agent['configurations'] - if 'gre' in conf['tunnel_types'] and conf['l2_population'] \ - and conf['devices']: - LOG.warning('local ovs agent:%s' % agent) - ovs_output = subprocess.check_output(['ovs-vsctl', - 'list-ports', 'br-tun']) - ports = ovs_output.strip().split('\n') - look_up_gre_port = False - for port in ports: - if port.startswith('gre-'): - look_up_gre_port = True - break - if not look_up_gre_port: - try: - LOG.error('Found namespace, but no ovs tunnel is created,' - 'restart ovs agent.') - cmd = ['sudo', 'service', 'neutron-plugin-openvswitch-agent', - 'restart'] - subprocess.call(cmd) - except subprocess.CalledProcessError: - LOG.error('Failed to restart neutron-plugin-openvswitch-agent.') - - def check_local_agents(self): - services = ['openvswitch-switch', 'neutron-dhcp-agent', - 'neutron-metadata-agent', 'neutron-vpn-agent'] - for s in services: - status = ['sudo', 'service', s, 'status'] - restart = ['sudo', 'service', s, 'restart'] - start = ['sudo', 'service', s, 'start'] - stop = 'neutron-vpn-agent stop/waiting' - try: - output = subprocess.check_output(status) - if output.strip() == stop: - subprocess.check_output(start) - if s == 'neutron-metadata-agent': - subprocess.check_output(['sudo', 'service', - 'neutron-vpn-agent', - 'restart']) - except subprocess.CalledProcessError: - LOG.error('Restart service: %s' % s) - subprocess.check_output(restart) - if s == 'neutron-metadata-agent': - subprocess.check_output(['sudo', 'service', - 'neutron-vpn-agent', - 'restart']) - - def run(self): - while True: - LOG.info('Monitor Neutron HA Agent Loop Start') - quantum = self.get_quantum_client() - self.reassign_agent_resources(quantum=quantum) - self.check_ovs_tunnel(quantum=quantum) - self.check_local_agents() - LOG.info('sleep %s' % cfg.CONF.check_interval) - time.sleep(float(cfg.CONF.check_interval)) - - -if __name__ == '__main__': - opts = [ - cfg.StrOpt('check_interval', - default=8, - help='Check Neutron Agents interval.'), - ] - - cfg.CONF.register_cli_opts(opts) - cfg.CONF(project='monitor_neutron_agents', default_config_files=[]) - logging.setup('Neuron-HA-Monitor') - monitor_daemon = MonitorNeutronAgentsDaemon() - monitor_daemon.start() diff --git a/hooks/charmhelpers/contrib/charmsupport/__init__.py b/hooks/charmhelpers/contrib/charmsupport/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/hooks/charmhelpers/contrib/charmsupport/nrpe.py b/hooks/charmhelpers/contrib/charmsupport/nrpe.py new file mode 100644 index 
00000000..f3a936d0 --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/nrpe.py @@ -0,0 +1,308 @@ +"""Compatibility with the nrpe-external-master charm""" +# Copyright 2012 Canonical Ltd. +# +# Authors: +# Matthew Wedgwood + +import subprocess +import pwd +import grp +import os +import re +import shlex +import yaml + +from charmhelpers.core.hookenv import ( + config, + local_unit, + log, + relation_ids, + relation_set, + relations_of_type, +) + +from charmhelpers.core.host import service + +# This module adds compatibility with the nrpe-external-master and plain nrpe +# subordinate charms. To use it in your charm: +# +# 1. Update metadata.yaml +# +# provides: +# (...) +# nrpe-external-master: +# interface: nrpe-external-master +# scope: container +# +# and/or +# +# provides: +# (...) +# local-monitors: +# interface: local-monitors +# scope: container + +# +# 2. Add the following to config.yaml +# +# nagios_context: +# default: "juju" +# type: string +# description: | +# Used by the nrpe subordinate charms. +# A string that will be prepended to instance name to set the host name +# in nagios. So for instance the hostname would be something like: +# juju-myservice-0 +# If you're running multiple environments with the same services in them +# this allows you to differentiate between them. +# nagios_servicegroups: +# default: "" +# type: string +# description: | +# A comma-separated list of nagios servicegroups. +# If left empty, the nagios_context will be used as the servicegroup +# +# 3. Add custom checks (Nagios plugins) to files/nrpe-external-master +# +# 4. Update your hooks.py with something like this: +# +# from charmsupport.nrpe import NRPE +# (...) +# def update_nrpe_config(): +# nrpe_compat = NRPE() +# nrpe_compat.add_check( +# shortname = "myservice", +# description = "Check MyService", +# check_cmd = "check_http -w 2 -c 10 http://localhost" +# ) +# nrpe_compat.add_check( +# "myservice_other", +# "Check for widget failures", +# check_cmd = "/srv/myapp/scripts/widget_check" +# ) +# nrpe_compat.write() +# +# def config_changed(): +# (...) +# update_nrpe_config() +# +# def nrpe_external_master_relation_changed(): +# update_nrpe_config() +# +# def local_monitors_relation_changed(): +# update_nrpe_config() +# +# 5. 
ln -s hooks.py nrpe-external-master-relation-changed +# ln -s hooks.py local-monitors-relation-changed + + +class CheckException(Exception): + pass + + +class Check(object): + shortname_re = '[A-Za-z0-9-_]+$' + service_template = (""" +#--------------------------------------------------- +# This file is Juju managed +#--------------------------------------------------- +define service {{ + use active-service + host_name {nagios_hostname} + service_description {nagios_hostname}[{shortname}] """ + """{description} + check_command check_nrpe!{command} + servicegroups {nagios_servicegroup} +}} +""") + + def __init__(self, shortname, description, check_cmd): + super(Check, self).__init__() + # XXX: could be better to calculate this from the service name + if not re.match(self.shortname_re, shortname): + raise CheckException("shortname must match {}".format( + Check.shortname_re)) + self.shortname = shortname + self.command = "check_{}".format(shortname) + # Note: a set of invalid characters is defined by the + # Nagios server config + # The default is: illegal_object_name_chars=`~!$%^&*"|'<>?,()= + self.description = description + self.check_cmd = self._locate_cmd(check_cmd) + + def _locate_cmd(self, check_cmd): + search_path = ( + '/usr/lib/nagios/plugins', + '/usr/local/lib/nagios/plugins', + ) + parts = shlex.split(check_cmd) + for path in search_path: + if os.path.exists(os.path.join(path, parts[0])): + command = os.path.join(path, parts[0]) + if len(parts) > 1: + command += " " + " ".join(parts[1:]) + return command + log('Check command not found: {}'.format(parts[0])) + return '' + + def write(self, nagios_context, hostname, nagios_servicegroups=None): + nrpe_check_file = '/etc/nagios/nrpe.d/{}.cfg'.format( + self.command) + with open(nrpe_check_file, 'w') as nrpe_check_config: + nrpe_check_config.write("# check {}\n".format(self.shortname)) + nrpe_check_config.write("command[{}]={}\n".format( + self.command, self.check_cmd)) + + if not os.path.exists(NRPE.nagios_exportdir): + log('Not writing service config as {} is not accessible'.format( + NRPE.nagios_exportdir)) + else: + self.write_service_config(nagios_context, hostname, + nagios_servicegroups) + + def write_service_config(self, nagios_context, hostname, + nagios_servicegroups=None): + for f in os.listdir(NRPE.nagios_exportdir): + if re.search('.*{}.cfg'.format(self.command), f): + os.remove(os.path.join(NRPE.nagios_exportdir, f)) + + if not nagios_servicegroups: + nagios_servicegroups = nagios_context + + templ_vars = { + 'nagios_hostname': hostname, + 'nagios_servicegroup': nagios_servicegroups, + 'description': self.description, + 'shortname': self.shortname, + 'command': self.command, + } + nrpe_service_text = Check.service_template.format(**templ_vars) + nrpe_service_file = '{}/service__{}_{}.cfg'.format( + NRPE.nagios_exportdir, hostname, self.command) + with open(nrpe_service_file, 'w') as nrpe_service_config: + nrpe_service_config.write(str(nrpe_service_text)) + + def run(self): + subprocess.call(self.check_cmd) + + +class NRPE(object): + nagios_logdir = '/var/log/nagios' + nagios_exportdir = '/var/lib/nagios/export' + nrpe_confdir = '/etc/nagios/nrpe.d' + + def __init__(self, hostname=None): + super(NRPE, self).__init__() + self.config = config() + self.nagios_context = self.config['nagios_context'] + if 'nagios_servicegroups' in self.config: + self.nagios_servicegroups = self.config['nagios_servicegroups'] + else: + self.nagios_servicegroups = 'juju' + self.unit_name = local_unit().replace('/', '-') + if hostname: + 
self.hostname = hostname + else: + self.hostname = "{}-{}".format(self.nagios_context, self.unit_name) + self.checks = [] + + def add_check(self, *args, **kwargs): + self.checks.append(Check(*args, **kwargs)) + + def write(self): + try: + nagios_uid = pwd.getpwnam('nagios').pw_uid + nagios_gid = grp.getgrnam('nagios').gr_gid + except: + log("Nagios user not set up, nrpe checks not updated") + return + + if not os.path.exists(NRPE.nagios_logdir): + os.mkdir(NRPE.nagios_logdir) + os.chown(NRPE.nagios_logdir, nagios_uid, nagios_gid) + + nrpe_monitors = {} + monitors = {"monitors": {"remote": {"nrpe": nrpe_monitors}}} + for nrpecheck in self.checks: + nrpecheck.write(self.nagios_context, self.hostname, + self.nagios_servicegroups) + nrpe_monitors[nrpecheck.shortname] = { + "command": nrpecheck.command, + } + + service('restart', 'nagios-nrpe-server') + + for rid in relation_ids("local-monitors"): + relation_set(relation_id=rid, monitors=yaml.dump(monitors)) + + +def get_nagios_hostcontext(relation_name='nrpe-external-master'): + """ + Query relation with nrpe subordinate, return the nagios_host_context + + :param str relation_name: Name of relation nrpe sub joined to + """ + for rel in relations_of_type(relation_name): + if 'nagios_hostname' in rel: + return rel['nagios_host_context'] + + +def get_nagios_hostname(relation_name='nrpe-external-master'): + """ + Query relation with nrpe subordinate, return the nagios_hostname + + :param str relation_name: Name of relation nrpe sub joined to + """ + for rel in relations_of_type(relation_name): + if 'nagios_hostname' in rel: + return rel['nagios_hostname'] + + +def get_nagios_unit_name(relation_name='nrpe-external-master'): + """ + Return the nagios unit name prepended with host_context if needed + + :param str relation_name: Name of relation nrpe sub joined to + """ + host_context = get_nagios_hostcontext(relation_name) + if host_context: + unit = "%s:%s" % (host_context, local_unit()) + else: + unit = local_unit() + return unit + + +def add_init_service_checks(nrpe, services, unit_name): + """ + Add checks for each service in list + + :param NRPE nrpe: NRPE object to add check to + :param list services: List of services to check + :param str unit_name: Unit name to use in check description + """ + for svc in services: + upstart_init = '/etc/init/%s.conf' % svc + sysv_init = '/etc/init.d/%s' % svc + if os.path.exists(upstart_init): + nrpe.add_check( + shortname=svc, + description='process check {%s}' % unit_name, + check_cmd='check_upstart_job %s' % svc + ) + elif os.path.exists(sysv_init): + cronpath = '/etc/cron.d/nagios-service-check-%s' % svc + cron_file = ('*/5 * * * * root ' + '/usr/local/lib/nagios/plugins/check_exit_status.pl ' + '-s /etc/init.d/%s status > ' + '/var/lib/nagios/service-check-%s.txt\n' % (svc, + svc) + ) + f = open(cronpath, 'w') + f.write(cron_file) + f.close() + nrpe.add_check( + shortname=svc, + description='process check {%s}' % unit_name, + check_cmd='check_status_file.py -f ' + '/var/lib/nagios/service-check-%s.txt' % svc, + ) diff --git a/hooks/charmhelpers/contrib/charmsupport/volumes.py b/hooks/charmhelpers/contrib/charmsupport/volumes.py new file mode 100644 index 00000000..d61aa47f --- /dev/null +++ b/hooks/charmhelpers/contrib/charmsupport/volumes.py @@ -0,0 +1,159 @@ +''' +Functions for managing volumes in juju units. One volume is supported per unit. +Subordinates may have their own storage, provided it is on its own partition. 
+ +Configuration stanzas:: + + volume-ephemeral: + type: boolean + default: true + description: > + If false, a volume is mounted as sepecified in "volume-map" + If true, ephemeral storage will be used, meaning that log data + will only exist as long as the machine. YOU HAVE BEEN WARNED. + volume-map: + type: string + default: {} + description: > + YAML map of units to device names, e.g: + "{ rsyslog/0: /dev/vdb, rsyslog/1: /dev/vdb }" + Service units will raise a configure-error if volume-ephemeral + is 'true' and no volume-map value is set. Use 'juju set' to set a + value and 'juju resolved' to complete configuration. + +Usage:: + + from charmsupport.volumes import configure_volume, VolumeConfigurationError + from charmsupport.hookenv import log, ERROR + def post_mount_hook(): + stop_service('myservice') + def post_mount_hook(): + start_service('myservice') + + if __name__ == '__main__': + try: + configure_volume(before_change=pre_mount_hook, + after_change=post_mount_hook) + except VolumeConfigurationError: + log('Storage could not be configured', ERROR) + +''' + +# XXX: Known limitations +# - fstab is neither consulted nor updated + +import os +from charmhelpers.core import hookenv +from charmhelpers.core import host +import yaml + + +MOUNT_BASE = '/srv/juju/volumes' + + +class VolumeConfigurationError(Exception): + '''Volume configuration data is missing or invalid''' + pass + + +def get_config(): + '''Gather and sanity-check volume configuration data''' + volume_config = {} + config = hookenv.config() + + errors = False + + if config.get('volume-ephemeral') in (True, 'True', 'true', 'Yes', 'yes'): + volume_config['ephemeral'] = True + else: + volume_config['ephemeral'] = False + + try: + volume_map = yaml.safe_load(config.get('volume-map', '{}')) + except yaml.YAMLError as e: + hookenv.log("Error parsing YAML volume-map: {}".format(e), + hookenv.ERROR) + errors = True + if volume_map is None: + # probably an empty string + volume_map = {} + elif not isinstance(volume_map, dict): + hookenv.log("Volume-map should be a dictionary, not {}".format( + type(volume_map))) + errors = True + + volume_config['device'] = volume_map.get(os.environ['JUJU_UNIT_NAME']) + if volume_config['device'] and volume_config['ephemeral']: + # asked for ephemeral storage but also defined a volume ID + hookenv.log('A volume is defined for this unit, but ephemeral ' + 'storage was requested', hookenv.ERROR) + errors = True + elif not volume_config['device'] and not volume_config['ephemeral']: + # asked for permanent storage but did not define volume ID + hookenv.log('Ephemeral storage was requested, but there is no volume ' + 'defined for this unit.', hookenv.ERROR) + errors = True + + unit_mount_name = hookenv.local_unit().replace('/', '-') + volume_config['mountpoint'] = os.path.join(MOUNT_BASE, unit_mount_name) + + if errors: + return None + return volume_config + + +def mount_volume(config): + if os.path.exists(config['mountpoint']): + if not os.path.isdir(config['mountpoint']): + hookenv.log('Not a directory: {}'.format(config['mountpoint'])) + raise VolumeConfigurationError() + else: + host.mkdir(config['mountpoint']) + if os.path.ismount(config['mountpoint']): + unmount_volume(config) + if not host.mount(config['device'], config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def unmount_volume(config): + if os.path.ismount(config['mountpoint']): + if not host.umount(config['mountpoint'], persist=True): + raise VolumeConfigurationError() + + +def managed_mounts(): + '''List of all 
mounted managed volumes'''
+    return filter(lambda mount: mount[0].startswith(MOUNT_BASE), host.mounts())
+
+
+def configure_volume(before_change=lambda: None, after_change=lambda: None):
+    '''Set up storage (or don't) according to the charm's volume configuration.
+       Returns the mount point or "ephemeral". before_change and after_change
+       are optional functions to be called if the volume configuration changes.
+    '''
+
+    config = get_config()
+    if not config:
+        hookenv.log('Failed to read volume configuration', hookenv.CRITICAL)
+        raise VolumeConfigurationError()
+
+    if config['ephemeral']:
+        if os.path.ismount(config['mountpoint']):
+            before_change()
+            unmount_volume(config)
+            after_change()
+        return 'ephemeral'
+    else:
+        # persistent storage
+        if os.path.ismount(config['mountpoint']):
+            mounts = dict(managed_mounts())
+            if mounts.get(config['mountpoint']) != config['device']:
+                before_change()
+                unmount_volume(config)
+                mount_volume(config)
+                after_change()
+        else:
+            before_change()
+            mount_volume(config)
+            after_change()
+        return config['mountpoint']
diff --git a/hooks/charmhelpers/contrib/openstack/utils.py b/hooks/charmhelpers/contrib/openstack/utils.py
index 44179679..ddd40ce5 100644
--- a/hooks/charmhelpers/contrib/openstack/utils.py
+++ b/hooks/charmhelpers/contrib/openstack/utils.py
@@ -53,6 +53,7 @@ UBUNTU_OPENSTACK_RELEASE = OrderedDict([
     ('saucy', 'havana'),
     ('trusty', 'icehouse'),
     ('utopic', 'juno'),
+    ('vivid', 'kilo'),
 ])
 
 
@@ -64,6 +65,7 @@ OPENSTACK_CODENAMES = OrderedDict([
     ('2013.2', 'havana'),
     ('2014.1', 'icehouse'),
     ('2014.2', 'juno'),
+    ('2015.1', 'kilo'),
 ])
 
 # The ugly duckling
@@ -84,6 +86,7 @@ SWIFT_CODENAMES = OrderedDict([
     ('2.0.0', 'juno'),
     ('2.1.0', 'juno'),
     ('2.2.0', 'juno'),
+    ('2.2.1', 'kilo'),
 ])
 
 DEFAULT_LOOPBACK_SIZE = '5G'
@@ -289,6 +292,9 @@ def configure_installation_source(rel):
         'juno': 'trusty-updates/juno',
         'juno/updates': 'trusty-updates/juno',
         'juno/proposed': 'trusty-proposed/juno',
+        'kilo': 'trusty-updates/kilo',
+        'kilo/updates': 'trusty-updates/kilo',
+        'kilo/proposed': 'trusty-proposed/kilo',
     }
 
     try:
diff --git a/hooks/charmhelpers/fetch/__init__.py b/hooks/charmhelpers/fetch/__init__.py
index 0a126fc3..aceadea4 100644
--- a/hooks/charmhelpers/fetch/__init__.py
+++ b/hooks/charmhelpers/fetch/__init__.py
@@ -64,9 +64,16 @@ CLOUD_ARCHIVE_POCKETS = {
     'trusty-juno/updates': 'trusty-updates/juno',
     'trusty-updates/juno': 'trusty-updates/juno',
     'juno/proposed': 'trusty-proposed/juno',
-    'juno/proposed': 'trusty-proposed/juno',
     'trusty-juno/proposed': 'trusty-proposed/juno',
     'trusty-proposed/juno': 'trusty-proposed/juno',
+    # Kilo
+    'kilo': 'trusty-updates/kilo',
+    'trusty-kilo': 'trusty-updates/kilo',
+    'trusty-kilo/updates': 'trusty-updates/kilo',
+    'trusty-updates/kilo': 'trusty-updates/kilo',
+    'kilo/proposed': 'trusty-proposed/kilo',
+    'trusty-kilo/proposed': 'trusty-proposed/kilo',
+    'trusty-proposed/kilo': 'trusty-proposed/kilo',
 }
 
 # The order of this list is very important. Handlers should be listed in from
diff --git a/hooks/nrpe-external-master-relation-changed b/hooks/nrpe-external-master-relation-changed
new file mode 120000
index 00000000..9a2da58e
--- /dev/null
+++ b/hooks/nrpe-external-master-relation-changed
@@ -0,0 +1 @@
+quantum_hooks.py
\ No newline at end of file
diff --git a/hooks/nrpe-external-master-relation-joined b/hooks/nrpe-external-master-relation-joined
new file mode 120000
index 00000000..9a2da58e
--- /dev/null
+++ b/hooks/nrpe-external-master-relation-joined
@@ -0,0 +1 @@
+quantum_hooks.py
\ No newline at end of file
diff --git a/hooks/quantum_hooks.py b/hooks/quantum_hooks.py
index 752510dc..8d32636b 100755
--- a/hooks/quantum_hooks.py
+++ b/hooks/quantum_hooks.py
@@ -36,10 +36,13 @@ from charmhelpers.contrib.openstack.utils import (
 from charmhelpers.payload.execd import execd_preinstall
 from charmhelpers.core.sysctl import create as create_sysctl
 
+from charmhelpers.contrib.charmsupport import nrpe
+
 import sys
 from quantum_utils import (
     register_configs,
     restart_map,
+    services,
     do_openstack_upgrade,
     get_packages,
     get_early_packages,
@@ -92,6 +95,7 @@ def config_changed():
     global CONFIGS
     if openstack_upgrade_available(get_common_package()):
         CONFIGS = do_openstack_upgrade()
+    update_nrpe_config()
 
     sysctl_dict = config('sysctl')
     if sysctl_dict:
@@ -234,6 +238,32 @@ def stop():
     stop_services()
 
 
+@hooks.hook('nrpe-external-master-relation-joined',
+            'nrpe-external-master-relation-changed')
+def update_nrpe_config():
+    # python-dbus is used by check_upstart_job
+    apt_install('python-dbus')
+    hostname = nrpe.get_nagios_hostname()
+    current_unit = nrpe.get_nagios_unit_name()
+    nrpe_setup = nrpe.NRPE(hostname=hostname)
+    nrpe.add_init_service_checks(nrpe_setup, services(), current_unit)
+
+    cronpath = '/etc/cron.d/nagios-netns-check'
+    cron_template = ('*/5 * * * * root '
+                     '/usr/local/lib/nagios/plugins/check_netns.sh '
+                     '> /var/lib/nagios/netns-check.txt\n'
+                     )
+    f = open(cronpath, 'w')
+    f.write(cron_template)
+    f.close()
+    nrpe_setup.add_check(
+        shortname="netns",
+        description='Network Namespace check {%s}' % current_unit,
+        check_cmd='check_status_file.py -f /var/lib/nagios/netns-check.txt'
+    )
+    nrpe_setup.write()
+
+
 @hooks.hook('ha-relation-joined')
 @hooks.hook('ha-relation-changed')
 def ha_relation_joined():
diff --git a/metadata.yaml b/metadata.yaml
index e7828c29..559effeb 100644
--- a/metadata.yaml
+++ b/metadata.yaml
@@ -16,6 +16,9 @@ description: |
 categories:
   - openstack
 provides:
+  nrpe-external-master:
+    interface: nrpe-external-master
+    scope: container
   quantum-network-service:
     interface: quantum
 requires:
diff --git a/unit_tests/test_quantum_hooks.py b/unit_tests/test_quantum_hooks.py
index 39d67c6f..b814d2c7 100644
--- a/unit_tests/test_quantum_hooks.py
+++ b/unit_tests/test_quantum_hooks.py
@@ -42,6 +42,7 @@ TO_PATCH = [
     'b64decode',
     'is_relation_made',
     'create_sysctl',
+    'update_nrpe_config',
     'update_legacy_ha_files',
     'add_hostname_to_hosts'
 ]
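Review sketch (not part of the patch): a minimal, self-contained Python snippet that fills in the Check.service_template from the new nrpe.py with the values the update_nrpe_config() hook passes for its "netns" check, to make the exported Nagios service definition easy to picture. The hostname, unit name and servicegroup values below are made-up examples (a real unit takes the hostname from the nrpe-external-master relation and the servicegroup from nagios_context), and the column spacing of the real template may differ.

# Illustrative only -- approximates the template in
# hooks/charmhelpers/contrib/charmsupport/nrpe.py; all values are examples.
service_template = (
    "define service {{\n"
    "    use                 active-service\n"
    "    host_name           {nagios_hostname}\n"
    "    service_description {nagios_hostname}[{shortname}] {description}\n"
    "    check_command       check_nrpe!{command}\n"
    "    servicegroups       {nagios_servicegroup}\n"
    "}}\n"
)

print(service_template.format(
    nagios_hostname='juju-quantum-gateway-0',    # example relation value
    shortname='netns',
    description='Network Namespace check {quantum-gateway/0}',
    command='check_netns',
    nagios_servicegroup='juju',                  # default nagios_context
))

The matching command definition would land in /etc/nagios/nrpe.d/check_netns.cfg via Check.write() above.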