Add monitoring for object-replicator logs disappearing

As noted in lp#1691570, there is an issue with storage I/O during
coro-thread cleanup that affects the swift-object-replicator's ability
to complete replication successfully.  This is most easily witnessed by
the absence of the "replicated" percentage-complete messages that the
swift-object-replicator daemon should write to syslog every 5 minutes.
This patch monitors for and alerts on the condition of the "replicated"
line missing from syslog within the past 15 minutes.

Change-Id: Ieb15da3f3f67fa9bcad03151e36c70faae4c36c9
Closes-Bug: 1691570
This commit is contained in:
Drew Freiberger 2019-12-23 13:18:56 -06:00
parent 971df56d61
commit 74daa465d6
8 changed files with 241 additions and 2 deletions

View File

@ -130,6 +130,20 @@ options:
default: "-m -r 60 180 10 20"
type: string
description: String appended to nagios check
nagios-replication-check-params:
default: "replicated 15 2 1"
type: string
description: |
Space delimited parameters for check_swift_replicator_logs.sh.
search_pattern
interval_in_minutes
minimum_hits_before_warning
minimum_hits_before_critical
Default of "replicated 15 2 1" leads to warning alert when there
have not been at least 2 lines matching "replicated" in the last 15
minutes, and critical if there have been no matching lines in the
last 15 minutes.
Set to blank string "" to disable the check.
nagios_context:
default: "juju"
type: string

View File

@ -0,0 +1,8 @@
#!/bin/bash
# NRPE wrapper around check_timed_logs.pl that alerts when too FEW lines
# matching a pattern were written to /var/log/syslog within a time window.
#
# Positional arguments (all optional):
#   $1  search pattern                         (default: replicated)
#   $2  interval to look back, in minutes      (default: 15)
#   $3  minimum hits before WARNING is raised  (default: 2)
#   $4  minimum hits before CRITICAL is raised (default: 1)
pattern=${1:-replicated}
interval=${2:-15}
warn_min=${3:-2}
crit_min=${4:-1}
# Every expansion is quoted so that a value containing whitespace or glob
# characters reaches the plugin as exactly one argument instead of being
# word-split or expanded by the shell before the root-owned command runs.
# stderr is folded into stdout so plugin errors are visible in the check output.
exec sudo -u root /usr/local/lib/nagios/plugins/check_timed_logs.pl -pattern "$pattern" -logfile /var/log/syslog -interval "$interval" -w "$warn_min" -c "$crit_min" -reverse 2>&1

View File

@ -0,0 +1,195 @@
#!/usr/bin/perl
##############################################################################
#
# NAME: check_timed_logs.pl
#
# AUTHOR: Gerd Radecke
#
# COMMENT: Script searches a text file for the appearance of a given RegEx within a given time period.
# Using additional parameters you can adjust: Time string format,
# time string position, number of pattern matches required to be "successful".
#
# Return Values for NRPE:
# OK - There are only 0 instances of $pattern in the last $interval minutes (0)
# CRITICAL - There are $hits instances of \"$pattern\" in the last $interval minutes (2)
# WARNING - There are $hits instances of \"$pattern\" in the last $interval minutes (1)
# UNKNOWN - There were no files matching the passed filename (3)
#
# REQUIRES: perl-Time-Piece perl-File-ReadBackwards
# ON RHEL-based systems you can run: yum install perl-Time-Piece perl-File-ReadBackwards
#
# CHANGELOG:
# 1.0 2013-02-19 - initial version
# 1.0.1 2013-02-27 - fixed false variable reference
# 1.0.2 2013-10-07 - integrated threshold comparison fix by Christoph Tavan - thanks ;)
# 1.0.3 2019-12-23 - Added --reverse flag to check for presence of lines within last $interval - drewn3ss
# 1.0.4 2019-12-30 - Updated time_pattern default to match ubuntu syslog timepattern
#
##############################################################################
use File::ReadBackwards; # EPEL RPM: perl-File-ReadBackwards.noarch
use Getopt::Long;
use Time::Piece; # RHEL package: perl-Time-Piece
use File::Find;
# ---------------------------------------------------------------------------
# Setup: defaults, command-line parsing, usage output, time-window calculation
# and kick-off of the directory scan. The variables assigned here
# ($pattern, $logfile, $interval, $now, $oldestDate, $hits, ...) are shared
# with the process() callback and the result-reporting code further down.
# ---------------------------------------------------------------------------

# Put LC_ALL=C into the environment (presumably to keep timestamp handling
# independent of the caller's locale).
$ENV{"LC_ALL"} = "C";

# Defaults; each can be overridden on the command line.
my $default_time_pattern = '%b %e %H:%M:%S';   # syslog style, e.g. "Dec 31 17:20:40"
$time_pattern  = $default_time_pattern;
$warning       = 1;
$critical      = 1;
$reverse       = 0;
$time_position = 0;

$result = GetOptions (
    "pattern=s"      => \$pattern,        # string e.g. "CRITICAL"
    "logfile=s"      => \$logfile,        # string e.g. "/var/log/messages"
    "interval=i"     => \$interval,       # int e.g. 30 for half an hour
    "timepattern=s"  => \$time_pattern,   # string e.g. '%Y-%m-%d %H:%M:%S'
    "timeposition=i" => \$time_position,  # int, each line is split on whitespace; index of the first time field
    "warning|w=i"    => \$warning,        # int e.g. 3
    "critical|c=i"   => \$critical,       # int e.g. 5
    "debug|d|vv"     => \$debug,          # flag/boolean
    "verbose|v"      => \$verbose,        # flag/boolean
    "reverse|r"      => \$reverse,        # flag/boolean - report on absence of pattern rather than presence
                                          # (the "?" alias is reserved for help; registering it twice is a duplicate spec)
    "help|h|?"       => \$usage           # flag/boolean - is help called?
);

# The three mandatory arguments; the interval is compared numerically.
my $have_pattern  = defined($pattern)  && $pattern ne "";
my $have_logfile  = defined($logfile)  && $logfile ne "";
my $have_interval = defined($interval) && $interval > 0;

if ($usage || !$have_pattern || !$have_logfile || !$have_interval) {
    print "\nUsage: $0
\t -pattern <regex-pattern>
\t -logfile <path to log file>
\t -interval <minutes>
\t -reverse # report on absence of enough entries in the timeframe
\t [-timepattern <POSIX time pattern>]
\t [-warning|w <number_of_required_hits>] [-critical|c <number_of_required_hits>]
\t [-timeposition <time_string_index_on_line>] \n\n";
    print "To allow for rotating logfiles, any file that matches the passed filename and was changed within the passed interval is checked. e.g. If you pass /var/log/applog, this could match /var/log/applog.0, /var/log/applog.old and so on. However, it does not handle compressed (e.g. gzip/bzip) files. \n\n";
    # Report the real built-in default rather than a stale hard-coded one.
    print "Default time pattern is: $default_time_pattern => Dec 31 17:20:40\n";
    print "Example Time patterns (from a RHEL system):
BSD/Syslog: %b %d %H:%M:%S => Dec 31 17:20:40
Apache Logs: %d/%b/%Y:%H:%M:%S (with -timeposition 3) => 31/Dec/2012:17:20:40
Websphere Logs: %d-%b-%Y %I:%M:%S %p => 31-Dec-2012 05:20:40 PM
Nagios logs: %s => 1361260238 (seconds since 01-01-1970) \n";
    print "For a posix time format documentation check out: http://linux.die.net/man/3/strftime \n\n";
    print "Default warning/critical threshold of pattern matches to find is: 1 -> unless you change this, you will only get OK or CRITICAL, but never WARNING\n\n";
    print "Default time position is 0 \n";
    print "\t Time Position: each line is split into an array of strings on the space character, this provides the index for the first time string.\n";
    print "\t Note: If the line starts with the time, that means we start at index 0.\n\n";
    print "The values for interval and warning/critical need to be larger than zero \n";
    # Exit UNKNOWN (3): a plain "exit" returns 0, which Nagios would read as
    # OK for a check that is in fact misconfigured.
    exit 3;
}

# Time window: only log lines newer than $oldestDate are counted.
my $now = localtime;                  # Time::Piece object (Time::Piece overrides localtime)
$oldestDate = $now - $interval*60;
if ($debug) { print "Now: $now and tzoffset: ". ($now)->tzoffset ."\n"; }
if ($debug) { print "Oldest date: $oldestDate and tzoffset: ". ($oldestDate)->tzoffset ."\n"; }

$hits = 0;            # number of matches for the regex within the log files
$validFileNames = 0;  # number of files that match the given filename

# Number of spaces in the time pattern; a timestamp therefore spans
# $dateFieldsCount + 1 whitespace-separated fields of a log line.
my $dateFieldsCount = ($time_pattern =~ tr/ //);

if ($debug) {
    $verbose = 1; # if we debug, we want to have all information
    print "Interval: $interval equals " . ($interval/1440) . " Fraction of days.\n";
}

# Directory part of the log file path (everything up to the last "/").
# Taken from an explicit capture so a path without any "/" falls back to the
# current directory instead of picking up an undefined/stale match variable.
if ($logfile =~ m{^(.*/)}) {
    $DIR = $1;
}
else {
    $DIR = './';
}

# Walk $DIR and let process() inspect every entry. no_chdir keeps the working
# directory unchanged so that $File::Find::name stays openable even when the
# log file path is relative.
find({ wanted => \&process, no_chdir => 1 }, $DIR);
# File::Find callback, called once for every entry found below $DIR.
#
# A file is considered when its full path contains the requested log file
# name (so rotated siblings such as "<logfile>.1" are included) and it is a
# text file. Files modified within the last $interval minutes are read from
# the end backwards; every line newer than $oldestDate that matches $pattern
# increments $hits. Reading of a file stops at the first line that is older
# than $oldestDate.
#
# Reads the file-level variables $logfile, $interval, $pattern,
# $time_pattern, $time_position, $dateFieldsCount, $now, $oldestDate,
# $debug and $verbose; updates $validFileNames and $hits.
sub process {
    # \Q...\E: the file name is matched literally, so "." or "+" in a path
    # are not interpreted as regex metacharacters.
    return unless $File::Find::name =~ m/\Q$logfile\E/ && (-T $_);
    $validFileNames += 1;

    # -M is the script start time minus the file modification time, in days
    # (e.g. 24 hours ago -> 1, 6 hours ago -> 0.25).
    if ($debug) { print "Found: $File::Find::name has age " . (-M $_) ." (in Fraction of days) \n"; }
    return unless (-M $_) < ($interval/1440);   # only files changed within the interval

    my $LOGS = File::ReadBackwards->new($File::Find::name) or
        die "Can't read file: $File::Find::name\n";

    LINE:
    while (defined(my $line = $LOGS->readline)) {
        # Re-assemble the timestamp from the whitespace-separated fields it
        # spans; fields missing on short lines are simply left out.
        my @fields = split ' ', $line;
        my @stamp  = grep { defined } @fields[$time_position .. $time_position + $dateFieldsCount];
        my $dateString = join ' ', @stamp;
        $dateString =~ tr/<>[]//d;            # remove brackets
        $dateString =~ s/^\s+|\s+$//g;        # remove leading and trailing whitespace

        # strptime croaks on input it cannot parse; a single malformed or
        # continuation line must not abort the whole check, so skip it.
        my $dt = eval { Time::Piece->strptime($dateString, $time_pattern) };
        if (!defined $dt) {
            if ($debug) { print "Skipping line with unparseable timestamp \"$dateString\"\n"; }
            next LINE;
        }

        # Time::Piece treats the parsed time as UTC; shift by the local offset.
        my $dt_tzadjusted = ($dt - $now->tzoffset);

        # Formats without a year (e.g. "Dec 31 15:50:57") parse to 1970.
        # Move such dates to the current year, or to last year when that
        # would place them in the future (log lines from before New Year).
        if ($dt->year == 1970) {
            $dt = $dt->add_years($now->year - 1970);
            $dt_tzadjusted = ($dt - $now->tzoffset);
            if ($dt_tzadjusted > $now) {
                $dt = $dt->add_years(-1);
                $dt_tzadjusted = ($dt - $now->tzoffset);
            }
        }

        # Lines are read newest first: the first line outside the window
        # ends the scan of this file.
        last LINE unless $dt_tzadjusted > $oldestDate;

        if ($line =~ m/$pattern/) {
            if ($debug)   { print $dt . " => "; }
            if ($verbose) { print $line; }
            $hits++;
        }
    }

    # Close the reader object itself; close(LOGS) would address an unrelated,
    # never-opened bareword handle and leave this file open.
    $LOGS->close;
    return;
}## the find sub process ends here
# ---------------------------------------------------------------------------
# Result reporting: print one Nagios status line and exit with the matching
# code (0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN).
# ---------------------------------------------------------------------------

# No candidate file at all means the hit count is meaningless. This is checked
# first because in reverse mode zero hits is always below a positive critical
# threshold, which would otherwise report a missing log file as CRITICAL and
# never reach the UNKNOWN state.
if ($validFileNames == 0) {
    print "UNKNOWN - There were no files matching the passed filename: \"$logfile\"\n";
    exit 3;
}

if (!$reverse) {
    # Normal mode: too MANY matches is the problem.
    if ($hits >= ($critical + 0)) {
        print "CRITICAL - There are $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 2;
    }
    if ($hits >= ($warning + 0)) {
        print "WARNING - There are $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 1;
    }
    print "OK - There are only $hits instances of \"$pattern\" in the last $interval minutes - Warning threshold is $warning\n";
    exit 0;
}
else {
    # Reverse mode: too FEW matches is the problem.
    if ($hits < ($critical + 0)) {
        print "CRITICAL - There are only $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 2;
    }
    if ($hits < ($warning + 0)) {
        print "WARNING - There are only $hits instances of \"$pattern\" in the last $interval minutes\n";
        exit 1;
    }
    print "OK - There are at least $hits instances of \"$pattern\" in the last $interval minutes - Warning threshold is $warning\n";
    exit 0;
}

View File

@ -1 +1,2 @@
nagios ALL=(swift) NOPASSWD:/usr/bin/swift-init status *
nagios ALL=NOPASSWD:/usr/local/lib/nagios/plugins/check_timed_logs.pl *

View File

@ -385,6 +385,12 @@ def update_nrpe_config():
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
'check_swift_storage.py'),
os.path.join(NAGIOS_PLUGINS, 'check_swift_storage.py'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
'check_timed_logs.pl'),
os.path.join(NAGIOS_PLUGINS, 'check_timed_logs.pl'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
'check_swift_replicator_logs.sh'),
os.path.join(NAGIOS_PLUGINS, 'check_swift_replicator_logs.sh'))
rsync(os.path.join(os.getenv('CHARM_DIR'), 'files', 'nrpe-external-master',
'check_swift_service'),
os.path.join(NAGIOS_PLUGINS, 'check_swift_service'))
@ -405,6 +411,16 @@ def update_nrpe_config():
check_cmd='check_swift_storage.py {}'.format(
config('nagios-check-params'))
)
if config('nagios-replication-check-params'):
nrpe_setup.add_check(
shortname='swift_replicator_health',
description='Check swift object replicator log reporting',
check_cmd='check_swift_replicator_logs.sh {}'.format(
config('nagios-replication-check-params'))
)
else:
nrpe_setup.remove_check(shortname='swift_replicator_health')
nrpe.add_init_service_checks(nrpe_setup, SWIFT_SVCS, current_unit)
nrpe_setup.write()

View File

@ -112,6 +112,8 @@ PACKAGES = [
'python-psutil',
'ufw',
'xfsprogs',
'libfile-readbackwards-perl',
'libtime-piece-perl',
]
PY3_PACKAGES = [

View File

@ -176,7 +176,8 @@ class SwiftStorageRelationsTests(CharmTestCase):
self.apt_install.assert_called_with(
['gdisk', 'lvm2', 'swift', 'swift-account',
'swift-container', 'swift-object', 'python-jinja2',
'python-psutil', 'ufw', 'xfsprogs'],
'python-psutil', 'ufw', 'xfsprogs',
'libfile-readbackwards-perl', 'libtime-piece-perl'],
fatal=True)
self.assertTrue(self.update_nrpe_config.called)
self.assertTrue(mock_ensure_devs_tracked.called)

View File

@ -567,7 +567,8 @@ class SwiftStorageUtilsTests(CharmTestCase):
options=dpkg_opts,
packages=['gdisk', 'lvm2', 'swift', 'swift-account',
'swift-container', 'swift-object', 'python-jinja2',
'python-psutil', 'ufw', 'xfsprogs'],
'python-psutil', 'ufw', 'xfsprogs',
'libfile-readbackwards-perl', 'libtime-piece-perl'],
fatal=True
)
self.assertTrue(mock_remove_old_packages.called)
@ -600,6 +601,7 @@ class SwiftStorageUtilsTests(CharmTestCase):
options=dpkg_opts,
packages=['gdisk', 'lvm2', 'swift', 'swift-account',
'swift-container', 'swift-object', 'ufw', 'xfsprogs',
'libfile-readbackwards-perl', 'libtime-piece-perl',
'python3-jinja2', 'python3-psutil', 'python3-six',
'python3-swift'],
fatal=True