Replace the deletion of the cron output with a timestamp check

The removal of the cron job output messes up the behaviour of the NRPE check if
it is called more than once in a minute. The intention of removing the cron job
output is to prevent stale data from producing a false STATE_OK. I've rewritten
it so the cron job puts the time into the file so stale output can be detected.

I went with an explicit timestamp in the file, rather than comparing the file's
mtime, as I've known of issues with mtime comparisons in the past.

Closes-Bug: #2019251
Change-Id: I91d0430eecb58b6e1a1d1ef1d75e38d88ebfd30d
This commit is contained in:
Danny Cocks 2023-10-23 16:06:31 +11:00
parent 1bac66ee50
commit 8955727738
2 changed files with 27 additions and 6 deletions

View File

@ -6,9 +6,13 @@
import os
import sys
import time
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
NAGIOS_HOME = '/var/lib/nagios'
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
# The cron job runs every minute, so allow for 2 min
CRON_TIMEOUT = 120
STATE_OK = 0
STATE_WARNING = 1
@ -38,16 +42,28 @@ def run_main():
print("Something went wrong reading the file: {}".format(str(e)))
return STATE_UNKNOWN
# now remove the file in case the next check fails.
try:
os.remove(_tmp_file)
except Exception:
pass
if not lines:
print("checked status file is empty: {}".format(_tmp_file))
return STATE_UNKNOWN
# check the first line to see if it has the expected format and whether the output is stale
firstline = lines.pop(0)
if not firstline.startswith(CRON_CHECK_PREFIX):
print("First line not of the expected format: {}".format(firstline))
return STATE_UNKNOWN
try:
rest_of_line = firstline[len(CRON_CHECK_PREFIX):]
cron_time = int(rest_of_line)
except Exception as exc:
print("Unable to parse time from first line: {}".format(exc))
return STATE_UNKNOWN
if cron_time + CRON_TIMEOUT < time.time():
how_old = int(time.time() - cron_time)
print("Cron output is stale ({} sec old)".format(how_old))
return STATE_UNKNOWN
# finally, check that the file contains all ok lines. Unfortunately, it's
# not consistent across releases, but what is consistent is that the check
# command in the collect phase does fail, and so the start of the line is

View File

@ -6,6 +6,7 @@
import os
import subprocess
import time
from pwd import getpwnam
# fasteners only exists in Bionic, so this will fail on xenial and trusty
@ -18,6 +19,7 @@ SYSTEMD_SYSTEM = '/run/systemd/system'
LOCKFILE = '/var/lock/check-osds.lock'
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
NAGIOS_HOME = '/var/lib/nagios'
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
def init_is_systemd():
@ -57,6 +59,9 @@ def do_status():
lines = []
# First line contains the time so the checker can identify stale data
lines.append("{} {}\n".format(CRON_CHECK_PREFIX, int(time.time())))
for unit in get_osd_units():
try:
output = (subprocess