From 8955727738a9aff96c47e4b13d7713eacd91c3ce Mon Sep 17 00:00:00 2001 From: Danny Cocks Date: Mon, 23 Oct 2023 16:06:31 +1100 Subject: [PATCH] Switch the deletion of the cron output with a check for a time The removal of the cron job output messes up the behaviour of the NRPE check if it is called more than once in a minute. The intention of removing the cron job output is to prevent stale data from a false STATE_OK. I've rewritten it so the cron job puts the time into the file so stale output can be detected. I went with explicit output in the file as I've known of issues with mtime comparisons in the past. Closes-Bug: #2019251 Change-Id: I91d0430eecb58b6e1a1d1ef1d75e38d88ebfd30d --- files/nagios/check_ceph_osd_services.py | 28 ++++++++++++++++++----- files/nagios/collect_ceph_osd_services.py | 5 ++++ 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/files/nagios/check_ceph_osd_services.py b/files/nagios/check_ceph_osd_services.py index 7f53b2d7..28669c28 100755 --- a/files/nagios/check_ceph_osd_services.py +++ b/files/nagios/check_ceph_osd_services.py @@ -6,9 +6,13 @@ import os import sys +import time CRON_CHECK_TMPFILE = 'ceph-osd-checks' NAGIOS_HOME = '/var/lib/nagios' +CRON_CHECK_PREFIX = 'Ceph OSD cron check:' +# The cron job runs every minute, so allow for 2 min +CRON_TIMEOUT = 120 STATE_OK = 0 STATE_WARNING = 1 @@ -38,16 +42,28 @@ def run_main(): print("Something went wrong reading the file: {}".format(str(e))) return STATE_UNKNOWN - # now remove the file in case the next check fails. 
- try: - os.remove(_tmp_file) - except Exception: - pass - if not lines: print("checked status file is empty: {}".format(_tmp_file)) return STATE_UNKNOWN + # check the first line to see if is the expected time the output is stale + firstline = lines.pop(0) + if not firstline.startswith(CRON_CHECK_PREFIX): + print("First line not of the expected format: {}".format(firstline)) + return STATE_UNKNOWN + + try: + rest_of_line = firstline[len(CRON_CHECK_PREFIX):] + cron_time = int(rest_of_line) + except Exception as exc: + print("Unable to parse time from first line: {}".format(exc)) + return STATE_UNKNOWN + + if cron_time + CRON_TIMEOUT < time.time(): + how_old = int(time.time() - cron_time) + print("Cron output is stale ({} sec old)".format(how_old)) + return STATE_UNKNOWN + # finally, check that the file contains all ok lines. Unfortunately, it's # not consistent across releases, but what is consistent is that the check # command in the collect phase does fail, and so the start of the line is diff --git a/files/nagios/collect_ceph_osd_services.py b/files/nagios/collect_ceph_osd_services.py index 633148a2..fae8e774 100755 --- a/files/nagios/collect_ceph_osd_services.py +++ b/files/nagios/collect_ceph_osd_services.py @@ -6,6 +6,7 @@ import os import subprocess +import time from pwd import getpwnam # fasteners only exists in Bionic, so this will fail on xenial and trusty @@ -18,6 +19,7 @@ SYSTEMD_SYSTEM = '/run/systemd/system' LOCKFILE = '/var/lock/check-osds.lock' CRON_CHECK_TMPFILE = 'ceph-osd-checks' NAGIOS_HOME = '/var/lib/nagios' +CRON_CHECK_PREFIX = 'Ceph OSD cron check:' def init_is_systemd(): @@ -57,6 +59,9 @@ def do_status(): lines = [] + # First line contains the time so the checker can identify stale data + lines.append("{} {}\n".format(CRON_CHECK_PREFIX, int(time.time()))) + for unit in get_osd_units(): try: output = (subprocess