Switch the deletion of the cron output with a check for a time
The removal of the cron job output messes up the behaviour of the NRPE check if it is called more than once in a minute. The intention of removing the cron job output was to prevent stale data from producing a false STATE_OK. I've rewritten it so the cron job puts the time into the file so stale output can be detected. I went with explicit output in the file as I've known of issues with mtime comparisons in the past. Closes-Bug: #2019251 Change-Id: I91d0430eecb58b6e1a1d1ef1d75e38d88ebfd30d
This commit is contained in:
parent
1bac66ee50
commit
8955727738
|
@ -6,9 +6,13 @@
|
|||
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
||||
NAGIOS_HOME = '/var/lib/nagios'
|
||||
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
|
||||
# The cron job runs every minute, so allow for 2 min
|
||||
CRON_TIMEOUT = 120
|
||||
|
||||
STATE_OK = 0
|
||||
STATE_WARNING = 1
|
||||
|
@ -38,16 +42,28 @@ def run_main():
|
|||
print("Something went wrong reading the file: {}".format(str(e)))
|
||||
return STATE_UNKNOWN
|
||||
|
||||
# now remove the file in case the next check fails.
|
||||
try:
|
||||
os.remove(_tmp_file)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if not lines:
|
||||
print("checked status file is empty: {}".format(_tmp_file))
|
||||
return STATE_UNKNOWN
|
||||
|
||||
# check the first line, which holds the time the cron job ran, to see if the output is stale
|
||||
firstline = lines.pop(0)
|
||||
if not firstline.startswith(CRON_CHECK_PREFIX):
|
||||
print("First line not of the expected format: {}".format(firstline))
|
||||
return STATE_UNKNOWN
|
||||
|
||||
try:
|
||||
rest_of_line = firstline[len(CRON_CHECK_PREFIX):]
|
||||
cron_time = int(rest_of_line)
|
||||
except Exception as exc:
|
||||
print("Unable to parse time from first line: {}".format(exc))
|
||||
return STATE_UNKNOWN
|
||||
|
||||
if cron_time + CRON_TIMEOUT < time.time():
|
||||
how_old = int(time.time() - cron_time)
|
||||
print("Cron output is stale ({} sec old)".format(how_old))
|
||||
return STATE_UNKNOWN
|
||||
|
||||
# finally, check that the file contains all ok lines. Unfortunately, it's
|
||||
# not consistent across releases, but what is consistent is that the check
|
||||
# command in the collect phase does fail, and so the start of the line is
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
|
||||
import os
|
||||
import subprocess
|
||||
import time
|
||||
from pwd import getpwnam
|
||||
|
||||
# fasteners only exists in Bionic, so this will fail on xenial and trusty
|
||||
|
@ -18,6 +19,7 @@ SYSTEMD_SYSTEM = '/run/systemd/system'
|
|||
LOCKFILE = '/var/lock/check-osds.lock'
|
||||
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
||||
NAGIOS_HOME = '/var/lib/nagios'
|
||||
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
|
||||
|
||||
|
||||
def init_is_systemd():
|
||||
|
@ -57,6 +59,9 @@ def do_status():
|
|||
|
||||
lines = []
|
||||
|
||||
# First line contains the time so the checker can identify stale data
|
||||
lines.append("{} {}\n".format(CRON_CHECK_PREFIX, int(time.time())))
|
||||
|
||||
for unit in get_osd_units():
|
||||
try:
|
||||
output = (subprocess
|
||||
|
|
Loading…
Reference in New Issue