diff --git a/files/nagios/check_ceph_osd_services.py b/files/nagios/check_ceph_osd_services.py index 7f53b2d7..28669c28 100755 --- a/files/nagios/check_ceph_osd_services.py +++ b/files/nagios/check_ceph_osd_services.py @@ -6,9 +6,13 @@ import os import sys +import time CRON_CHECK_TMPFILE = 'ceph-osd-checks' NAGIOS_HOME = '/var/lib/nagios' +CRON_CHECK_PREFIX = 'Ceph OSD cron check:' +# The cron job runs every minute, so allow for 2 min +CRON_TIMEOUT = 120 STATE_OK = 0 STATE_WARNING = 1 @@ -38,16 +42,28 @@ def run_main(): print("Something went wrong reading the file: {}".format(str(e))) return STATE_UNKNOWN - # now remove the file in case the next check fails. - try: - os.remove(_tmp_file) - except Exception: - pass - if not lines: print("checked status file is empty: {}".format(_tmp_file)) return STATE_UNKNOWN + # check the first line to see if it has the expected format and whether the output is stale + firstline = lines.pop(0) + if not firstline.startswith(CRON_CHECK_PREFIX): + print("First line not of the expected format: {}".format(firstline)) + return STATE_UNKNOWN + + try: + rest_of_line = firstline[len(CRON_CHECK_PREFIX):] + cron_time = int(rest_of_line) + except Exception as exc: + print("Unable to parse time from first line: {}".format(exc)) + return STATE_UNKNOWN + + if cron_time + CRON_TIMEOUT < time.time(): + how_old = int(time.time() - cron_time) + print("Cron output is stale ({} sec old)".format(how_old)) + return STATE_UNKNOWN + # finally, check that the file contains all ok lines. 
Unfortunately, it's # not consistent across releases, but what is consistent is that the check # command in the collect phase does fail, and so the start of the line is diff --git a/files/nagios/collect_ceph_osd_services.py b/files/nagios/collect_ceph_osd_services.py index 633148a2..fae8e774 100755 --- a/files/nagios/collect_ceph_osd_services.py +++ b/files/nagios/collect_ceph_osd_services.py @@ -6,6 +6,7 @@ import os import subprocess +import time from pwd import getpwnam # fasteners only exists in Bionic, so this will fail on xenial and trusty @@ -18,6 +19,7 @@ SYSTEMD_SYSTEM = '/run/systemd/system' LOCKFILE = '/var/lock/check-osds.lock' CRON_CHECK_TMPFILE = 'ceph-osd-checks' NAGIOS_HOME = '/var/lib/nagios' +CRON_CHECK_PREFIX = 'Ceph OSD cron check:' def init_is_systemd(): @@ -57,6 +59,9 @@ def do_status(): lines = [] + # First line contains the time so the checker can identify stale data + lines.append("{} {}\n".format(CRON_CHECK_PREFIX, int(time.time()))) + for unit in get_osd_units(): try: output = (subprocess