Replace the deletion of the cron output with a timestamp check
The removal of the cron job output messes up the behaviour of the NRPE check if it is called more than once in a minute. The intention of removing the cron job output was to prevent stale data from producing a false STATE_OK. I've rewritten it so that the cron job writes the time into the file, which allows stale output to be detected. I went with explicit output in the file as I've known of issues with mtime comparisons in the past. Closes-Bug: #2019251 Change-Id: I91d0430eecb58b6e1a1d1ef1d75e38d88ebfd30d
This commit is contained in:
parent
1bac66ee50
commit
8955727738
|
@ -6,9 +6,13 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
||||||
NAGIOS_HOME = '/var/lib/nagios'
|
NAGIOS_HOME = '/var/lib/nagios'
|
||||||
|
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
|
||||||
|
# The cron job runs every minute, so allow for 2 min
|
||||||
|
CRON_TIMEOUT = 120
|
||||||
|
|
||||||
STATE_OK = 0
|
STATE_OK = 0
|
||||||
STATE_WARNING = 1
|
STATE_WARNING = 1
|
||||||
|
@ -38,16 +42,28 @@ def run_main():
|
||||||
print("Something went wrong reading the file: {}".format(str(e)))
|
print("Something went wrong reading the file: {}".format(str(e)))
|
||||||
return STATE_UNKNOWN
|
return STATE_UNKNOWN
|
||||||
|
|
||||||
# now remove the file in case the next check fails.
|
|
||||||
try:
|
|
||||||
os.remove(_tmp_file)
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
if not lines:
|
if not lines:
|
||||||
print("checked status file is empty: {}".format(_tmp_file))
|
print("checked status file is empty: {}".format(_tmp_file))
|
||||||
return STATE_UNKNOWN
|
return STATE_UNKNOWN
|
||||||
|
|
||||||
|
# check that the first line has the expected prefix and time, and whether the output is stale
|
||||||
|
firstline = lines.pop(0)
|
||||||
|
if not firstline.startswith(CRON_CHECK_PREFIX):
|
||||||
|
print("First line not of the expected format: {}".format(firstline))
|
||||||
|
return STATE_UNKNOWN
|
||||||
|
|
||||||
|
try:
|
||||||
|
rest_of_line = firstline[len(CRON_CHECK_PREFIX):]
|
||||||
|
cron_time = int(rest_of_line)
|
||||||
|
except Exception as exc:
|
||||||
|
print("Unable to parse time from first line: {}".format(exc))
|
||||||
|
return STATE_UNKNOWN
|
||||||
|
|
||||||
|
if cron_time + CRON_TIMEOUT < time.time():
|
||||||
|
how_old = int(time.time() - cron_time)
|
||||||
|
print("Cron output is stale ({} sec old)".format(how_old))
|
||||||
|
return STATE_UNKNOWN
|
||||||
|
|
||||||
# finally, check that the file contains all ok lines. Unfortunately, it's
|
# finally, check that the file contains all ok lines. Unfortunately, it's
|
||||||
# not consistent across releases, but what is consistent is that the check
|
# not consistent across releases, but what is consistent is that the check
|
||||||
# command in the collect phase does fail, and so the start of the line is
|
# command in the collect phase does fail, and so the start of the line is
|
||||||
|
|
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import time
|
||||||
from pwd import getpwnam
|
from pwd import getpwnam
|
||||||
|
|
||||||
# fasteners only exists in Bionic, so this will fail on xenial and trusty
|
# fasteners only exists in Bionic, so this will fail on xenial and trusty
|
||||||
|
@ -18,6 +19,7 @@ SYSTEMD_SYSTEM = '/run/systemd/system'
|
||||||
LOCKFILE = '/var/lock/check-osds.lock'
|
LOCKFILE = '/var/lock/check-osds.lock'
|
||||||
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
CRON_CHECK_TMPFILE = 'ceph-osd-checks'
|
||||||
NAGIOS_HOME = '/var/lib/nagios'
|
NAGIOS_HOME = '/var/lib/nagios'
|
||||||
|
CRON_CHECK_PREFIX = 'Ceph OSD cron check:'
|
||||||
|
|
||||||
|
|
||||||
def init_is_systemd():
|
def init_is_systemd():
|
||||||
|
@ -57,6 +59,9 @@ def do_status():
|
||||||
|
|
||||||
lines = []
|
lines = []
|
||||||
|
|
||||||
|
# First line contains the time so the checker can identify stale data
|
||||||
|
lines.append("{} {}\n".format(CRON_CHECK_PREFIX, int(time.time())))
|
||||||
|
|
||||||
for unit in get_osd_units():
|
for unit in get_osd_units():
|
||||||
try:
|
try:
|
||||||
output = (subprocess
|
output = (subprocess
|
||||||
|
|
Loading…
Reference in New Issue