diff --git a/files/nagios/check_ceph_osd_services.py b/files/nagios/check_ceph_osd_services.py new file mode 100755 index 00000000..669160d4 --- /dev/null +++ b/files/nagios/check_ceph_osd_services.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2018 Canonical +# All Rights Reserved +# Author: Alex Kavanagh + +import os +import sys +import tempfile + +CRON_CHECK_TMPFILE = 'ceph-osd-checks' + +STATE_OK = 0 +STATE_WARNING = 1 +STATE_CRITICAL = 2 +STATE_UNKNOWN = 3 + + +def run_main(): + """Process the CRON_CHECK_TMP_FILE and see if any line is not OK. + + If a line is not OK, the main returns STATE_CRITICAL. + If there are no lines, or the file doesn't exist, it returns STATE_UNKNOWN + Otherwise it returns STATE_OK. + + :returns: nagios state 0,2 or 3 + """ + _tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE) + + if not os.path.isfile(_tmp_file): + print("File '{}' doesn't exist".format(_tmp_file)) + return STATE_UNKNOWN + + try: + with open(_tmp_file, 'rt') as f: + lines = f.readlines() + except Exception as e: + print("Something went wrong reading the file: {}".format(str(e))) + return STATE_UNKNOWN + + # now remove the file in case the next check fails. + try: + os.remove(_tmp_file) + except Exception: + pass + + if not lines: + print("checked status file is empty: {}".format(_tmp_file)) + return STATE_UNKNOWN + + # finally, check that the file contains all ok lines. Unfortunately, it's + # not consistent across releases, but what is consistent is that the check + # command in the collect phase does fail, and so the start of the line is + # 'Failed' + state = STATE_OK + for l in lines: + print(l, end='') + if l.startswith('Failed'): + state = STATE_CRITICAL + + return state + + +if __name__ == '__main__': + sys.exit(run_main()) diff --git a/files/nagios/check_ceph_status.py b/files/nagios/check_ceph_status.py index cc21591a..358fafd9 100755 --- a/files/nagios/check_ceph_status.py +++ b/files/nagios/check_ceph_status.py @@ -40,7 +40,8 @@ def check_ceph_status(args): msg += '"' raise nagios_plugin.CriticalError(msg) - osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap']) + osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", + status_data['osdmap']) if osds.group(1) > osds.group(2): # not all OSDs are "up" msg = 'CRITICAL: Some OSDs are not up. Total: {}, up: {}'.format( osds.group(1), osds.group(2)) @@ -50,7 +51,10 @@ def check_ceph_status(args): if __name__ == '__main__': parser = argparse.ArgumentParser(description='Check ceph status') - parser.add_argument('-f', '--file', dest='status_file', - default=False, help='Optional file with "ceph status" output') + parser.add_argument('-f', + '--file', + dest='status_file', + default=False, + help='Optional file with "ceph status" output') args = parser.parse_args() nagios_plugin.try_check(check_ceph_status, args) diff --git a/files/nagios/collect_ceph_osd_services.py b/files/nagios/collect_ceph_osd_services.py new file mode 100755 index 00000000..7133a75e --- /dev/null +++ b/files/nagios/collect_ceph_osd_services.py @@ -0,0 +1,90 @@ +#!/usr/bin/env python3 + +# Copyright (C) 2018 Canonical +# All Rights Reserved +# Author: Alex Kavanagh + +import os +import subprocess +import tempfile + +# fasteners only exists in Bionic, so this will fail on xenial and trusty +try: + import fasteners +except ImportError: + fasteners = None + +SYSTEMD_SYSTEM = '/run/systemd/system' +LOCKFILE = '/var/lock/check-osds.lock' +CRON_CHECK_TMPFILE = 'ceph-osd-checks' + + +def init_is_systemd(): + """Return True if the host system uses systemd, False otherwise.""" + if lsb_release()['DISTRIB_CODENAME'] == 'trusty': + return False + return os.path.isdir(SYSTEMD_SYSTEM) + + +def lsb_release(): + """Return /etc/lsb-release in a dict""" + d = {} + with open('/etc/lsb-release', 'r') as lsb: + for l in lsb: + k, v = l.split('=') + d[k.strip()] = v.strip() + return d + + +def get_osd_units(): + """Returns a list of strings, one for each unit that is live""" + cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami' + try: + output = (subprocess + .check_output([cmd], shell=True).decode('utf-8') + .split('\n')) + return [u for u in output if u] + except subprocess.CalledProcessError: + return [] + + +def do_status(): + if init_is_systemd(): + cmd = "/usr/local/lib/nagios/plugins/check_systemd.py ceph-osd@{}" + else: + cmd = "/sbin/status ceph-osd id={}" + + lines = [] + + for unit in get_osd_units(): + try: + output = (subprocess + .check_output(cmd.format(unit).split(), + stderr=subprocess.STDOUT) + .decode('utf-8')) + except subprocess.CalledProcessError as e: + output = ("Failed: check command raised: {}" + .format(e.output.decode('utf-8'))) + lines.append(output) + + _tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE) + with open(_tmp_file, 'wt') as f: + f.writelines(lines) + + +def run_main(): + # on bionic we can interprocess lock; we don't do it for older platforms + if fasteners is not None: + lock = fasteners.InterProcessLock(LOCKFILE) + + if lock.acquire(blocking=False): + try: + do_status() + finally: + lock.release() + else: + do_status() + + +if __name__ == '__main__': + run_main() diff --git a/hooks/ceph_hooks.py b/hooks/ceph_hooks.py index c69d693b..7828613f 100755 --- a/hooks/ceph_hooks.py +++ b/hooks/ceph_hooks.py @@ -15,14 +15,14 @@ # limitations under the License. import base64 -import json import glob +import json +import netifaces import os import shutil -import sys import socket import subprocess -import netifaces +import sys sys.path.append('lib') import ceph.utils as ceph @@ -37,6 +37,7 @@ from charmhelpers.core.hookenv import ( related_units, relation_get, relation_set, + relations_of_type, Hooks, UnregisteredHookError, service_name, @@ -47,16 +48,17 @@ from charmhelpers.core.hookenv import ( application_version_set, ) from charmhelpers.core.host import ( - umount, - mkdir, + add_to_updatedb_prunepath, cmp_pkgrevno, + is_container, + lsb_release, + mkdir, + restart_on_change, service_reload, service_restart, - add_to_updatedb_prunepath, - restart_on_change, + umount, write_file, - is_container, - init_is_systemd, + CompareHostReleases, ) from charmhelpers.fetch import ( add_source, @@ -110,6 +112,9 @@ import charmhelpers.contrib.openstack.vaultlocker as vaultlocker hooks = Hooks() STORAGE_MOUNT_PATH = '/var/lib/ceph' +# cron.d related files +CRON_CEPH_CHECK_FILE = '/etc/cron.d/check-osd-services' + def check_for_upgrade(): if not os.path.exists(ceph._upgrade_keyring): @@ -601,6 +606,7 @@ def upgrade_charm(): fatal=True) install_udev_rules() remap_resolved_targets() + maybe_refresh_nrpe_files() def remap_resolved_targets(): @@ -632,28 +638,62 @@ def remap_resolved_targets(): 'nrpe-external-master-relation-changed') def update_nrpe_config(): # python-dbus is used by check_upstart_job - apt_install('python3-dbus') + # fasteners is used by apt_install collect_ceph_osd_services.py + pkgs = ['python3-dbus'] + if CompareHostReleases(lsb_release()['DISTRIB_CODENAME']) >= 'bionic': + pkgs.append('python3-fasteners') + apt_install(pkgs) + + # copy the check and collect files over to the plugins directory + charm_dir = os.environ.get('CHARM_DIR', '') + nagios_plugins = '/usr/local/lib/nagios/plugins' + # Grab nagios user/group ID's from original source + _dir = os.stat(nagios_plugins) + uid = _dir.st_uid + gid = _dir.st_gid + for name in ('collect_ceph_osd_services.py', 'check_ceph_osd_services.py'): + target = os.path.join(nagios_plugins, name) + shutil.copy(os.path.join(charm_dir, 'files', 'nagios', name), target) + os.chown(target, uid, gid) + hostname = nrpe.get_nagios_hostname() current_unit = nrpe.get_nagios_unit_name() - # create systemd or upstart check - cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami |' - if init_is_systemd(): - cmd += 'xargs -I_@ /usr/local/lib/nagios/plugins/check_systemd.py' - cmd += ' ceph-osd@_@' - else: - cmd += 'xargs -I@ status ceph-osd id=@' - cmd += ' && exit 0 || exit 2' + # BUG#1810749 - the nagios user can't access /var/lib/ceph/.. and that's a + # GOOD THING, as it keeps ceph secure from Nagios. However, to check + # whether ceph is okay, the check_systemd.py or 'status ceph-osd' still + # needs to be called with the contents of ../osd/ceph-*/whoami files. To + # get around this conundrum, instead a cron.d job that runs as root will + # perform the checks every minute, and write to a tempory file the results, + # and the nrpe check will grep this file and error out (return 2) if the + # first 3 characters of a line are not 'OK:'. + + cmd = ('MAILTO=""\n' + '* * * * * root ' + '/usr/local/lib/nagios/plugins/collect_ceph_osd_services.py' + ' 2>&1 | logger -t check-osd\n') + with open(CRON_CEPH_CHECK_FILE, "wt") as f: + f.write(cmd) + + nrpe_cmd = '/usr/local/lib/nagios/plugins/check_ceph_osd_services.py' nrpe_setup = nrpe.NRPE(hostname=hostname) nrpe_setup.add_check( shortname='ceph-osd', description='process check {%s}' % current_unit, - check_cmd=cmd + check_cmd=nrpe_cmd ) nrpe_setup.write() +def maybe_refresh_nrpe_files(): + """if the nrpe-external-master relation exists then refresh the nrpe + configuration -- this is called during a charm upgrade + """ + if relations_of_type('nrpe-external-master'): + update_nrpe_config() + + @hooks.hook('secrets-storage-relation-joined') def secrets_storage_joined(relation_id=None): relation_set(relation_id=relation_id, diff --git a/tests/bundles/bionic-rocky.yaml b/tests/bundles/bionic-rocky.yaml index ae37a596..36ea73ba 100644 --- a/tests/bundles/bionic-rocky.yaml +++ b/tests/bundles/bionic-rocky.yaml @@ -57,7 +57,7 @@ applications: charm: cs:~openstack-charmers-next/cinder-ceph nova-cloud-controller: expose: True - charm: cs::~openstack-charmers-next/nova-cloud-controller + charm: cs:~openstack-charmers-next/nova-cloud-controller num_units: 1 options: openstack-origin: cloud:bionic-rocky/proposed diff --git a/tox.ini b/tox.ini index 6b24dddf..e9553173 100644 --- a/tox.ini +++ b/tox.ini @@ -38,7 +38,7 @@ deps = -r{toxinidir}/requirements.txt basepython = python3 deps = -r{toxinidir}/requirements.txt -r{toxinidir}/test-requirements.txt -commands = flake8 {posargs} hooks unit_tests tests actions lib +commands = flake8 {posargs} hooks unit_tests tests actions lib files charm-proof [testenv:venv]