From 65dba1a7aa3882308f0442fd53f9bea06550d7a8 Mon Sep 17 00:00:00 2001 From: gholt Date: Sat, 3 Dec 2011 07:42:08 +0000 Subject: [PATCH] Added swift-orphans and swift-oldies. Change-Id: I95210098556a22d7bd05f245ae387ee13041fa61 --- bin/swift-oldies | 66 +++++++++++++++++++++++ bin/swift-orphans | 105 +++++++++++++++++++++++++++++++++++++ doc/source/admin_guide.rst | 22 ++++++++ setup.py | 3 +- 4 files changed, 195 insertions(+), 1 deletion(-) create mode 100755 bin/swift-oldies create mode 100755 bin/swift-orphans diff --git a/bin/swift-oldies b/bin/swift-oldies new file mode 100755 index 0000000000..9128f8997b --- /dev/null +++ b/bin/swift-oldies @@ -0,0 +1,66 @@ +#!/usr/bin/env python + +import optparse +import os +import signal +import subprocess +import sys + + +if __name__ == '__main__': + parser = optparse.OptionParser(usage='''%prog [options] + +Lists old Swift processes. + '''.strip()) + parser.add_option('-a', '--age', dest='hours', type='int', default=720, + help='look for processes at least HOURS old; default: 720 (30 days)') + (options, args) = parser.parse_args() + + listing = [] + for line in subprocess.Popen( + ['ps', '-eo', 'etime,pid,args', '--no-headers'], + stdout=subprocess.PIPE).communicate()[0].split('\n'): + if not line: + continue + hours = 0 + try: + etime, pid, args = line.split(None, 2) + except ValueError: + sys.exit('Could not process ps line %r' % line) + if (not args.startswith('/usr/bin/python /usr/bin/swift-') and + not args.startswith('/usr/bin/python /usr/local/bin/swift-')): + continue + args = args.split('-', 1)[1] + etime = etime.split('-') + if len(etime) == 2: + hours = int(etime[0]) * 24 + etime = etime[1] + elif len(etime) == 1: + etime = etime[0] + else: + sys.exit('Could not process etime value from %r' % line) + etime = etime.split(':') + if len(etime) == 3: + hours += int(etime[0]) + elif len(etime) != 2: + sys.exit('Could not process etime value from %r' % line) + if hours >= options.hours: + 
listing.append((str(hours), pid, args)) + + if not listing: + exit() + + hours_len = len('Hours') + pid_len = len('PID') + args_len = len('Command') + for hours, pid, args in listing: + hours_len = max(hours_len, len(hours)) + pid_len = max(pid_len, len(pid)) + args_len = max(args_len, len(args)) + args_len = min(args_len, 78 - hours_len - pid_len) + + print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ + ('Hours', 'PID', 'Command') + for hours, pid, args in listing: + print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ + (hours, pid, args[:args_len]) diff --git a/bin/swift-orphans b/bin/swift-orphans new file mode 100755 index 0000000000..dce5d9a8ff --- /dev/null +++ b/bin/swift-orphans @@ -0,0 +1,105 @@ +#!/usr/bin/env python + +import optparse +import os +import signal +import subprocess +import sys + + +if __name__ == '__main__': + parser = optparse.OptionParser(usage='''%prog [options] + +Lists and optionally kills orphaned Swift processes. This is done by scanning +/var/run/swift for .pid files and listing any processes that look like Swift +processes but aren't associated with the pids in those .pid files. Any Swift +processes running with the 'once' parameter are ignored, as those are usually +for full-speed audit scans and such. 
+ +Example (sends SIGTERM to all orphaned Swift processes older than two hours): +%prog -a 2 -k TERM + '''.strip()) + parser.add_option('-a', '--age', dest='hours', type='int', default=24, + help='look for processes at least HOURS old; default: 24') + parser.add_option('-k', '--kill', dest='signal', + help='send SIGNAL to matched processes; default: just list process ' + 'information') + parser.add_option('-w', '--wide', dest='wide', default=False, + action='store_true', help="don't clip the listing at 80 characters") + (options, args) = parser.parse_args() + + pids = [] + for root, directories, files in os.walk('/var/run/swift'): + for name in files: + if name.endswith('.pid'): + pids.append(open(os.path.join(root, name)).read().strip()) + pids.extend(subprocess.Popen( + ['ps', '--ppid', pids[-1], '-o', 'pid', '--no-headers'], + stdout=subprocess.PIPE).communicate()[0].split()) + + listing = [] + for line in subprocess.Popen( + ['ps', '-eo', 'etime,pid,args', '--no-headers'], + stdout=subprocess.PIPE).communicate()[0].split('\n'): + if not line: + continue + hours = 0 + try: + etime, pid, args = line.split(None, 2) + except ValueError: + sys.exit('Could not process ps line %r' % line) + if pid in pids: + continue + if (not args.startswith('/usr/bin/python /usr/bin/swift-') and + not args.startswith('/usr/bin/python /usr/local/bin/swift-')) or \ + 'swift-orphans' in args or \ + 'once' in args.split(): + continue + args = args.split('-', 1)[1] + etime = etime.split('-') + if len(etime) == 2: + hours = int(etime[0]) * 24 + etime = etime[1] + elif len(etime) == 1: + etime = etime[0] + else: + sys.exit('Could not process etime value from %r' % line) + etime = etime.split(':') + if len(etime) == 3: + hours += int(etime[0]) + elif len(etime) != 2: + sys.exit('Could not process etime value from %r' % line) + if hours >= options.hours: + listing.append((str(hours), pid, args)) + + if not listing: + exit() + + hours_len = len('Hours') + pid_len = len('PID') + args_len = 
len('Command') + for hours, pid, args in listing: + hours_len = max(hours_len, len(hours)) + pid_len = max(pid_len, len(pid)) + args_len = max(args_len, len(args)) + args_len = min(args_len, 78 - hours_len - pid_len) + + print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ + ('Hours', 'PID', 'Command') + for hours, pid, args in listing: + print ('%%%ds %%%ds %%s' % (hours_len, pid_len)) % \ + (hours, pid, args[:args_len]) + + if options.signal: + try: + signum = int(options.signal) + except ValueError: + signum = getattr(signal, options.signal.upper(), + getattr(signal, 'SIG' + options.signal.upper(), None)) + if not signum: + sys.exit('Could not translate %r to a signal number.' % + options.signal) + print 'Sending processes %s (%d) signal...' % (options.signal, signum), + for hours, pid, args in listing: + os.kill(int(pid), signum) + print 'Done.' diff --git a/doc/source/admin_guide.rst b/doc/source/admin_guide.rst index c8d903dfaf..dce188d786 100644 --- a/doc/source/admin_guide.rst +++ b/doc/source/admin_guide.rst @@ -373,3 +373,25 @@ run this command as follows: `swift-object-auditor /path/to/object-server/config/file.conf once -z 1000` "-z" means to only check for zero-byte files at 1000 files per second. +------------- +Swift Orphans +------------- + +Swift Orphans are processes left over after a reload of a Swift server. + +For example, when upgrading a proxy server you would probably finish with a `swift-init proxy-server reload` or `/etc/init.d/swift-proxy reload`. This kills the parent proxy server process and leaves the child processes running to finish processing whatever requests they might be handling at the time. It then starts up a new parent proxy server process and its children to handle new incoming requests. This allows zero-downtime upgrades with no impact to existing requests. + +The orphaned child processes may take a while to exit, depending on the length of the requests they were handling. 
However, sometimes an old process can be hung up due to some bug or hardware issue. In these cases, these orphaned processes will hang around forever. `swift-orphans` can be used to find and kill these orphans. + +`swift-orphans` with no arguments will just list the orphans it finds that were started more than 24 hours ago. You shouldn't really check for orphans until 24 hours after you perform a reload, as some requests can take a long time to process. `swift-orphans -k TERM` will send the SIGTERM signal to the orphan processes, or you can `kill -TERM` the pids yourself if you prefer. + +You can run `swift-orphans --help` for more options. + + +------------ +Swift Oldies +------------ + +Swift Oldies are processes that have just been around for a long time. There's nothing necessarily wrong with this, but it might indicate a hung process if you regularly upgrade and reload/restart services. You might have so many servers that you don't notice when a reload/restart fails; `swift-oldies` can help with this. + +For example, if you upgraded and reloaded/restarted everything 2 days ago, and you've already cleaned up any orphans with `swift-orphans`, you can run `swift-oldies -a 48` to find any Swift processes still around that were started more than 2 days ago and then investigate them accordingly. diff --git a/setup.py b/setup.py index e651196a7d..32f95601dd 100644 --- a/setup.py +++ b/setup.py @@ -57,7 +57,8 @@ setup( 'bin/swift-stats-report', 'bin/swift-dispersion-populate', 'bin/swift-dispersion-report', 'bin/swift-bench', - 'bin/swift-recon', 'bin/swift-recon-cron', + 'bin/swift-recon', 'bin/swift-recon-cron', 'bin/swift-orphans', + 'bin/swift-oldies' ], entry_points={ 'paste.app_factory': [