Add new features to swift-drive-audit

This patch adds two new features to swift-drive-audit. The first
is an option in the drive-audit.conf file (unmount_failed_device,
default True) that allows the operator to prevent drives from ever
being unmounted automatically, regardless of the number of errors
present. This could be of benefit in very small systems consisting
of only one or two drives, where the operator would prefer to
manually unmount/fix the particular drive(s) and minimise any
potential downtime.

The second is another option in drive-audit.conf (recon_cache_path,
default /var/cache/swift) that allows the operator to select a recon
directory. A drive.recon file is written to this directory, keeping
an up-to-date record of the swift drives and the number of errors
associated with each of them. An example of the output would be as
follows:

{"/srv/node/disk2": "0", "/srv/node/disk3": "25", "/srv/node/disk0": "0",
"/srv/node/disk1": "0", "/srv/node/disk10": "0", "/srv/node/disk7": "0",
"/srv/node/disk4": "137", "/srv/node/disk5": "0", "/srv/node/disk8": "0",
"/srv/node/disk9": "0", "/srv/node/disk6": "0", "/srv/node/disk11": "60"}

This would allow the operator to monitor the errors on the swift
drives without having to spend time searching through logs. Also, if
this is accepted, it should be possible to add an option to
swift-recon that would keep track of this at a system level.

Change-Id: Ib5dacf8622b7363e070c274c7c30c8ead448a055
This commit is contained in:
Lorcan 2014-09-18 17:23:54 +01:00
parent a81b2d2c74
commit cb20763893
2 changed files with 21 additions and 8 deletions

View File

@ -22,7 +22,8 @@ import subprocess
import sys import sys
from ConfigParser import ConfigParser from ConfigParser import ConfigParser
from swift.common.utils import backward, get_logger from swift.common.utils import backward, get_logger, dump_recon_cache, \
config_true_value
def get_devices(device_dir, logger): def get_devices(device_dir, logger):
@ -146,6 +147,7 @@ if __name__ == '__main__':
device_dir = conf.get('device_dir', '/srv/node') device_dir = conf.get('device_dir', '/srv/node')
minutes = int(conf.get('minutes', 60)) minutes = int(conf.get('minutes', 60))
error_limit = int(conf.get('error_limit', 1)) error_limit = int(conf.get('error_limit', 1))
recon_cache_path = conf.get('recon_cache_path', "/var/cache/swift")
log_file_pattern = conf.get('log_file_pattern', log_file_pattern = conf.get('log_file_pattern',
'/var/log/kern.*[!.][!g][!z]') '/var/log/kern.*[!.][!g][!z]')
error_re = [] error_re = []
@ -169,6 +171,9 @@ if __name__ == '__main__':
logger.debug("Devices found: %s" % str(devices)) logger.debug("Devices found: %s" % str(devices))
if not devices: if not devices:
logger.error("Error: No devices found!") logger.error("Error: No devices found!")
recon_errors = {}
for device in devices:
recon_errors[device['mount_point']] = 0
errors = get_errors(error_re, log_file_pattern, minutes) errors = get_errors(error_re, log_file_pattern, minutes)
logger.debug("Errors found: %s" % str(errors)) logger.debug("Errors found: %s" % str(errors))
unmounts = 0 unmounts = 0
@ -179,12 +184,18 @@ if __name__ == '__main__':
if device: if device:
mount_point = device[0]['mount_point'] mount_point = device[0]['mount_point']
if mount_point.startswith(device_dir): if mount_point.startswith(device_dir):
logger.info("Unmounting %s with %d errors" % if config_true_value(conf.get('unmount_failed_device',
(mount_point, count)) True)):
subprocess.call(['umount', '-fl', mount_point]) logger.info("Unmounting %s with %d errors" %
logger.info("Commenting out %s from /etc/fstab" % (mount_point, count))
(mount_point)) subprocess.call(['umount', '-fl', mount_point])
comment_fstab(mount_point) logger.info("Commenting out %s from /etc/fstab" %
unmounts += 1 (mount_point))
comment_fstab(mount_point)
unmounts += 1
recon_errors[mount_point] = count
recon_file = recon_cache_path + "/drive.recon"
dump_recon_cache(recon_errors, recon_file, logger)
if unmounts == 0: if unmounts == 0:
logger.info("No drives were unmounted") logger.info("No drives were unmounted")

View File

@ -8,6 +8,8 @@
# log_max_line_length = 0 # log_max_line_length = 0
# minutes = 60 # minutes = 60
# error_limit = 1 # error_limit = 1
# recon_cache_path = /var/cache/swift
# unmount_failed_device = True
# #
# Location of the log file with globbing # Location of the log file with globbing
# pattern to check against device errors. # pattern to check against device errors.