Add new features to swift-drive-audit
This patch adds two new features to swift-drive-audit. The first is an option in the drive-audit.conf file (unmount_failed_device, default True) that allows the operator to prevent drives from ever being unmounted automatically, regardless of the number of errors present. This could be of benefit in very small systems consisting of only one or two drives, where the operator would prefer to unmount and fix the affected drive(s) manually and so minimise any potential downtime. The second is another option in drive-audit.conf (recon_cache_path, default /var/cache/swift) that allows the operator to select a recon directory. A drive.recon file is then written to this directory, keeping an up-to-date record of the swift drives and the number of errors associated with each of them. An example of the output would be as follows: {"/srv/node/disk2": "0", "/srv/node/disk3": "25", "/srv/node/disk0": "0", "/srv/node/disk1": "0", "/srv/node/disk10": "0", "/srv/node/disk7": "0", "/srv/node/disk4": "137", "/srv/node/disk5": "0", "/srv/node/disk8": "0", "/srv/node/disk9": "0", "/srv/node/disk6": "0", "/srv/node/disk11": "60"} This would allow the operator to monitor the errors on the swift drives without having to spend time searching through logs. Also, if this is accepted, it should be possible to add an option to swift-recon that would keep track of this at a system level. Change-Id: Ib5dacf8622b7363e070c274c7c30c8ead448a055
This commit is contained in:
parent
a81b2d2c74
commit
cb20763893
@ -22,7 +22,8 @@ import subprocess
|
|||||||
import sys
|
import sys
|
||||||
from ConfigParser import ConfigParser
|
from ConfigParser import ConfigParser
|
||||||
|
|
||||||
from swift.common.utils import backward, get_logger
|
from swift.common.utils import backward, get_logger, dump_recon_cache, \
|
||||||
|
config_true_value
|
||||||
|
|
||||||
|
|
||||||
def get_devices(device_dir, logger):
|
def get_devices(device_dir, logger):
|
||||||
@ -146,6 +147,7 @@ if __name__ == '__main__':
|
|||||||
device_dir = conf.get('device_dir', '/srv/node')
|
device_dir = conf.get('device_dir', '/srv/node')
|
||||||
minutes = int(conf.get('minutes', 60))
|
minutes = int(conf.get('minutes', 60))
|
||||||
error_limit = int(conf.get('error_limit', 1))
|
error_limit = int(conf.get('error_limit', 1))
|
||||||
|
recon_cache_path = conf.get('recon_cache_path', "/var/cache/swift")
|
||||||
log_file_pattern = conf.get('log_file_pattern',
|
log_file_pattern = conf.get('log_file_pattern',
|
||||||
'/var/log/kern.*[!.][!g][!z]')
|
'/var/log/kern.*[!.][!g][!z]')
|
||||||
error_re = []
|
error_re = []
|
||||||
@ -169,6 +171,9 @@ if __name__ == '__main__':
|
|||||||
logger.debug("Devices found: %s" % str(devices))
|
logger.debug("Devices found: %s" % str(devices))
|
||||||
if not devices:
|
if not devices:
|
||||||
logger.error("Error: No devices found!")
|
logger.error("Error: No devices found!")
|
||||||
|
recon_errors = {}
|
||||||
|
for device in devices:
|
||||||
|
recon_errors[device['mount_point']] = 0
|
||||||
errors = get_errors(error_re, log_file_pattern, minutes)
|
errors = get_errors(error_re, log_file_pattern, minutes)
|
||||||
logger.debug("Errors found: %s" % str(errors))
|
logger.debug("Errors found: %s" % str(errors))
|
||||||
unmounts = 0
|
unmounts = 0
|
||||||
@ -179,12 +184,18 @@ if __name__ == '__main__':
|
|||||||
if device:
|
if device:
|
||||||
mount_point = device[0]['mount_point']
|
mount_point = device[0]['mount_point']
|
||||||
if mount_point.startswith(device_dir):
|
if mount_point.startswith(device_dir):
|
||||||
logger.info("Unmounting %s with %d errors" %
|
if config_true_value(conf.get('unmount_failed_device',
|
||||||
(mount_point, count))
|
True)):
|
||||||
subprocess.call(['umount', '-fl', mount_point])
|
logger.info("Unmounting %s with %d errors" %
|
||||||
logger.info("Commenting out %s from /etc/fstab" %
|
(mount_point, count))
|
||||||
(mount_point))
|
subprocess.call(['umount', '-fl', mount_point])
|
||||||
comment_fstab(mount_point)
|
logger.info("Commenting out %s from /etc/fstab" %
|
||||||
unmounts += 1
|
(mount_point))
|
||||||
|
comment_fstab(mount_point)
|
||||||
|
unmounts += 1
|
||||||
|
recon_errors[mount_point] = count
|
||||||
|
recon_file = recon_cache_path + "/drive.recon"
|
||||||
|
dump_recon_cache(recon_errors, recon_file, logger)
|
||||||
|
|
||||||
if unmounts == 0:
|
if unmounts == 0:
|
||||||
logger.info("No drives were unmounted")
|
logger.info("No drives were unmounted")
|
||||||
|
@ -8,6 +8,8 @@
|
|||||||
# log_max_line_length = 0
|
# log_max_line_length = 0
|
||||||
# minutes = 60
|
# minutes = 60
|
||||||
# error_limit = 1
|
# error_limit = 1
|
||||||
|
# recon_cache_path = /var/cache/swift
|
||||||
|
# unmount_failed_device = True
|
||||||
#
|
#
|
||||||
# Location of the log file with globbing
|
# Location of the log file with globbing
|
||||||
# pattern to check against device errors.
|
# pattern to check against device errors.
|
||||||
|
Loading…
Reference in New Issue
Block a user