Fix nrpe ceph-osd status respecting permissions

The referenced bug (below) was caused because the nrpe check needed to
access the ceph owned directories, and as the nagios user, nrpe can't.
This change splits the check into a 'collect' phase that runs as root
via a cronjob each minute and writes a file to the tmp directory, and a
nrpe check phase that then reads that file and reports back to nagios.

The 'check' part deletes the 'collect' file, so that fresh information
is available for each nrpe check.  The cron task runs every minute (as
is lightweight), so the nrpe checks should not be sheduled more
frequently than 1 minute.

Change-Id: I4f4594a479eed47cc66643d0c6acece491ae854d
Closes-Bug: #1810749
This commit is contained in:
Alex Kavanagh 2019-01-10 18:01:23 +00:00
parent a3fd4b3127
commit faefe90ce6
6 changed files with 223 additions and 24 deletions

View File

@ -0,0 +1,65 @@
#!/usr/bin/env python3
# Copyright (C) 2018 Canonical
# All Rights Reserved
# Author: Alex Kavanagh <alex.kavanagh@canonical.com>
import os
import sys
import tempfile
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3
def run_main():
"""Process the CRON_CHECK_TMP_FILE and see if any line is not OK.
If a line is not OK, the main returns STATE_CRITICAL.
If there are no lines, or the file doesn't exist, it returns STATE_UNKNOWN
Otherwise it returns STATE_OK.
:returns: nagios state 0,2 or 3
"""
_tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE)
if not os.path.isfile(_tmp_file):
print("File '{}' doesn't exist".format(_tmp_file))
return STATE_UNKNOWN
try:
with open(_tmp_file, 'rt') as f:
lines = f.readlines()
except Exception as e:
print("Something went wrong reading the file: {}".format(str(e)))
return STATE_UNKNOWN
# now remove the file in case the next check fails.
try:
os.remove(_tmp_file)
except Exception:
pass
if not lines:
print("checked status file is empty: {}".format(_tmp_file))
return STATE_UNKNOWN
# finally, check that the file contains all ok lines. Unfortunately, it's
# not consistent across releases, but what is consistent is that the check
# command in the collect phase does fail, and so the start of the line is
# 'Failed'
state = STATE_OK
for l in lines:
print(l, end='')
if l.startswith('Failed'):
state = STATE_CRITICAL
return state
if __name__ == '__main__':
sys.exit(run_main())

View File

@ -40,7 +40,8 @@ def check_ceph_status(args):
msg += '"'
raise nagios_plugin.CriticalError(msg)
osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in", status_data['osdmap'])
osds = re.search("^.*: (\d+) osds: (\d+) up, (\d+) in",
status_data['osdmap'])
if osds.group(1) > osds.group(2): # not all OSDs are "up"
msg = 'CRITICAL: Some OSDs are not up. Total: {}, up: {}'.format(
osds.group(1), osds.group(2))
@ -50,7 +51,10 @@ def check_ceph_status(args):
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Check ceph status')
parser.add_argument('-f', '--file', dest='status_file',
default=False, help='Optional file with "ceph status" output')
parser.add_argument('-f',
'--file',
dest='status_file',
default=False,
help='Optional file with "ceph status" output')
args = parser.parse_args()
nagios_plugin.try_check(check_ceph_status, args)

View File

@ -0,0 +1,90 @@
#!/usr/bin/env python3
# Copyright (C) 2018 Canonical
# All Rights Reserved
# Author: Alex Kavanagh <alex.kavanagh@canonical.com>
import os
import subprocess
import tempfile
# fasteners only exists in Bionic, so this will fail on xenial and trusty
try:
import fasteners
except ImportError:
fasteners = None
SYSTEMD_SYSTEM = '/run/systemd/system'
LOCKFILE = '/var/lock/check-osds.lock'
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
def init_is_systemd():
"""Return True if the host system uses systemd, False otherwise."""
if lsb_release()['DISTRIB_CODENAME'] == 'trusty':
return False
return os.path.isdir(SYSTEMD_SYSTEM)
def lsb_release():
"""Return /etc/lsb-release in a dict"""
d = {}
with open('/etc/lsb-release', 'r') as lsb:
for l in lsb:
k, v = l.split('=')
d[k.strip()] = v.strip()
return d
def get_osd_units():
"""Returns a list of strings, one for each unit that is live"""
cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami'
try:
output = (subprocess
.check_output([cmd], shell=True).decode('utf-8')
.split('\n'))
return [u for u in output if u]
except subprocess.CalledProcessError:
return []
def do_status():
if init_is_systemd():
cmd = "/usr/local/lib/nagios/plugins/check_systemd.py ceph-osd@{}"
else:
cmd = "/sbin/status ceph-osd id={}"
lines = []
for unit in get_osd_units():
try:
output = (subprocess
.check_output(cmd.format(unit).split(),
stderr=subprocess.STDOUT)
.decode('utf-8'))
except subprocess.CalledProcessError as e:
output = ("Failed: check command raised: {}"
.format(e.output.decode('utf-8')))
lines.append(output)
_tmp_file = os.path.join(tempfile.gettempdir(), CRON_CHECK_TMPFILE)
with open(_tmp_file, 'wt') as f:
f.writelines(lines)
def run_main():
# on bionic we can interprocess lock; we don't do it for older platforms
if fasteners is not None:
lock = fasteners.InterProcessLock(LOCKFILE)
if lock.acquire(blocking=False):
try:
do_status()
finally:
lock.release()
else:
do_status()
if __name__ == '__main__':
run_main()

View File

@ -15,14 +15,14 @@
# limitations under the License.
import base64
import json
import glob
import json
import netifaces
import os
import shutil
import sys
import socket
import subprocess
import netifaces
import sys
sys.path.append('lib')
import ceph.utils as ceph
@ -37,6 +37,7 @@ from charmhelpers.core.hookenv import (
related_units,
relation_get,
relation_set,
relations_of_type,
Hooks,
UnregisteredHookError,
service_name,
@ -47,16 +48,17 @@ from charmhelpers.core.hookenv import (
application_version_set,
)
from charmhelpers.core.host import (
umount,
mkdir,
add_to_updatedb_prunepath,
cmp_pkgrevno,
is_container,
lsb_release,
mkdir,
restart_on_change,
service_reload,
service_restart,
add_to_updatedb_prunepath,
restart_on_change,
umount,
write_file,
is_container,
init_is_systemd,
CompareHostReleases,
)
from charmhelpers.fetch import (
add_source,
@ -110,6 +112,9 @@ import charmhelpers.contrib.openstack.vaultlocker as vaultlocker
hooks = Hooks()
STORAGE_MOUNT_PATH = '/var/lib/ceph'
# cron.d related files
CRON_CEPH_CHECK_FILE = '/etc/cron.d/check-osd-services'
def check_for_upgrade():
if not os.path.exists(ceph._upgrade_keyring):
@ -601,6 +606,7 @@ def upgrade_charm():
fatal=True)
install_udev_rules()
remap_resolved_targets()
maybe_refresh_nrpe_files()
def remap_resolved_targets():
@ -632,28 +638,62 @@ def remap_resolved_targets():
'nrpe-external-master-relation-changed')
def update_nrpe_config():
# python-dbus is used by check_upstart_job
apt_install('python3-dbus')
# fasteners is used by apt_install collect_ceph_osd_services.py
pkgs = ['python3-dbus']
if CompareHostReleases(lsb_release()['DISTRIB_CODENAME']) >= 'bionic':
pkgs.append('python3-fasteners')
apt_install(pkgs)
# copy the check and collect files over to the plugins directory
charm_dir = os.environ.get('CHARM_DIR', '')
nagios_plugins = '/usr/local/lib/nagios/plugins'
# Grab nagios user/group ID's from original source
_dir = os.stat(nagios_plugins)
uid = _dir.st_uid
gid = _dir.st_gid
for name in ('collect_ceph_osd_services.py', 'check_ceph_osd_services.py'):
target = os.path.join(nagios_plugins, name)
shutil.copy(os.path.join(charm_dir, 'files', 'nagios', name), target)
os.chown(target, uid, gid)
hostname = nrpe.get_nagios_hostname()
current_unit = nrpe.get_nagios_unit_name()
# create systemd or upstart check
cmd = '/bin/cat /var/lib/ceph/osd/ceph-*/whoami |'
if init_is_systemd():
cmd += 'xargs -I_@ /usr/local/lib/nagios/plugins/check_systemd.py'
cmd += ' ceph-osd@_@'
else:
cmd += 'xargs -I@ status ceph-osd id=@'
cmd += ' && exit 0 || exit 2'
# BUG#1810749 - the nagios user can't access /var/lib/ceph/.. and that's a
# GOOD THING, as it keeps ceph secure from Nagios. However, to check
# whether ceph is okay, the check_systemd.py or 'status ceph-osd' still
# needs to be called with the contents of ../osd/ceph-*/whoami files. To
# get around this conundrum, instead a cron.d job that runs as root will
# perform the checks every minute, and write to a tempory file the results,
# and the nrpe check will grep this file and error out (return 2) if the
# first 3 characters of a line are not 'OK:'.
cmd = ('MAILTO=""\n'
'* * * * * root '
'/usr/local/lib/nagios/plugins/collect_ceph_osd_services.py'
' 2>&1 | logger -t check-osd\n')
with open(CRON_CEPH_CHECK_FILE, "wt") as f:
f.write(cmd)
nrpe_cmd = '/usr/local/lib/nagios/plugins/check_ceph_osd_services.py'
nrpe_setup = nrpe.NRPE(hostname=hostname)
nrpe_setup.add_check(
shortname='ceph-osd',
description='process check {%s}' % current_unit,
check_cmd=cmd
check_cmd=nrpe_cmd
)
nrpe_setup.write()
def maybe_refresh_nrpe_files():
"""if the nrpe-external-master relation exists then refresh the nrpe
configuration -- this is called during a charm upgrade
"""
if relations_of_type('nrpe-external-master'):
update_nrpe_config()
@hooks.hook('secrets-storage-relation-joined')
def secrets_storage_joined(relation_id=None):
relation_set(relation_id=relation_id,

View File

@ -57,7 +57,7 @@ applications:
charm: cs:~openstack-charmers-next/cinder-ceph
nova-cloud-controller:
expose: True
charm: cs::~openstack-charmers-next/nova-cloud-controller
charm: cs:~openstack-charmers-next/nova-cloud-controller
num_units: 1
options:
openstack-origin: cloud:bionic-rocky/proposed

View File

@ -38,7 +38,7 @@ deps = -r{toxinidir}/requirements.txt
basepython = python3
deps = -r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt
commands = flake8 {posargs} hooks unit_tests tests actions lib
commands = flake8 {posargs} hooks unit_tests tests actions lib files
charm-proof
[testenv:venv]