From cb20763893bd77bcaba614ffee9af27fb42babb4 Mon Sep 17 00:00:00 2001 From: Lorcan Date: Thu, 18 Sep 2014 17:23:54 +0100 Subject: [PATCH] Add new features to swift-drive-audit This patch adds two new features to swift-drive-audit. The first is an option in the drive-audit.conf file (unmount_failed_device) that allows the operator to prevent drives from ever being unmounted automatically, regardless of the number of errors present. This could be of benefit in very small systems consisting of only one or two drives where the operator would like to manually unmount/fix the particular drive(s) and minimise any potential downtime. The second is another option in drive-audit.conf (recon_cache_path) that allows the operator to select a recon directory. This directory will then have a drive.recon file which will keep an up-to-date record of the swift drives and the number of errors associated with each. An example of the output would be as follows: {"/srv/node/disk2": "0", "/srv/node/disk3": "25", "/srv/node/disk0": "0", "/srv/node/disk1": "0", "/srv/node/disk10": "0", "/srv/node/disk7": "0", "/srv/node/disk4": "137", "/srv/node/disk5": "0", "/srv/node/disk8": "0", "/srv/node/disk9": "0", "/srv/node/disk6": "0", "/srv/node/disk11": "60"} This would allow the operator to monitor the errors on the swift drives without having to spend time searching through logs. Also, if this is accepted, it should be possible to add an option to swift-recon that would keep track of this at a system level. 
Change-Id: Ib5dacf8622b7363e070c274c7c30c8ead448a055 --- bin/swift-drive-audit | 27 +++++++++++++++++++-------- etc/drive-audit.conf-sample | 2 ++ 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/bin/swift-drive-audit b/bin/swift-drive-audit index 3d9227ba06..64d1bc5280 100755 --- a/bin/swift-drive-audit +++ b/bin/swift-drive-audit @@ -22,7 +22,8 @@ import subprocess import sys from ConfigParser import ConfigParser -from swift.common.utils import backward, get_logger +from swift.common.utils import backward, get_logger, dump_recon_cache, \ + config_true_value def get_devices(device_dir, logger): @@ -146,6 +147,7 @@ if __name__ == '__main__': device_dir = conf.get('device_dir', '/srv/node') minutes = int(conf.get('minutes', 60)) error_limit = int(conf.get('error_limit', 1)) + recon_cache_path = conf.get('recon_cache_path', "/var/cache/swift") log_file_pattern = conf.get('log_file_pattern', '/var/log/kern.*[!.][!g][!z]') error_re = [] @@ -169,6 +171,9 @@ if __name__ == '__main__': logger.debug("Devices found: %s" % str(devices)) if not devices: logger.error("Error: No devices found!") + recon_errors = {} + for device in devices: + recon_errors[device['mount_point']] = 0 errors = get_errors(error_re, log_file_pattern, minutes) logger.debug("Errors found: %s" % str(errors)) unmounts = 0 @@ -179,12 +184,18 @@ if __name__ == '__main__': if device: mount_point = device[0]['mount_point'] if mount_point.startswith(device_dir): - logger.info("Unmounting %s with %d errors" % - (mount_point, count)) - subprocess.call(['umount', '-fl', mount_point]) - logger.info("Commenting out %s from /etc/fstab" % - (mount_point)) - comment_fstab(mount_point) - unmounts += 1 + if config_true_value(conf.get('unmount_failed_device', + True)): + logger.info("Unmounting %s with %d errors" % + (mount_point, count)) + subprocess.call(['umount', '-fl', mount_point]) + logger.info("Commenting out %s from /etc/fstab" % + (mount_point)) + comment_fstab(mount_point) + unmounts += 1 + 
recon_errors[mount_point] = count + recon_file = recon_cache_path + "/drive.recon" + dump_recon_cache(recon_errors, recon_file, logger) + if unmounts == 0: logger.info("No drives were unmounted") diff --git a/etc/drive-audit.conf-sample b/etc/drive-audit.conf-sample index 50613ae7e2..bc3cba8fbc 100644 --- a/etc/drive-audit.conf-sample +++ b/etc/drive-audit.conf-sample @@ -8,6 +8,8 @@ # log_max_line_length = 0 # minutes = 60 # error_limit = 1 +# recon_cache_path = /var/cache/swift +# unmount_failed_device = True # # Location of the log file with globbing # pattern to check against device errors.