Added swift-orphans and swift-oldies.

Change-Id: I95210098556a22d7bd05f245ae387ee13041fa61
This commit is contained in:
gholt 2011-12-03 07:42:08 +00:00 committed by gholt
parent dd796d094a
commit 65dba1a7aa
4 changed files with 195 additions and 1 deletions

66
bin/swift-oldies Executable file
View File

@ -0,0 +1,66 @@
#!/usr/bin/env python
import optparse
import os
import signal
import subprocess
import sys
# Command-line prefixes that identify a Swift process in `ps` output.
SWIFT_COMMAND_PREFIXES = (
    '/usr/bin/python /usr/bin/swift-',
    '/usr/bin/python /usr/local/bin/swift-',
)


def etime_to_hours(etime):
    """Convert a ps ``etime`` field to whole elapsed hours.

    The value is expected to look like ``[[dd-]hh:]mm:ss``. Only the day
    and hour parts contribute to the result; minutes and seconds are dropped.

    :param etime: elapsed-time column of one ps output line
    :returns: elapsed time in hours, rounded down
    :raises ValueError: if the value does not have the expected layout or
                        its day/hour part is not an integer
    """
    day_split = etime.split('-')
    if len(day_split) == 2:
        hours = int(day_split[0]) * 24
    elif len(day_split) == 1:
        hours = 0
    else:
        raise ValueError('unexpected etime value %r' % etime)
    clock = day_split[-1].split(':')
    if len(clock) == 3:
        hours += int(clock[0])
    elif len(clock) != 2:
        raise ValueError('unexpected etime value %r' % etime)
    return hours


def format_listing(listing):
    """Render the process listing as right-aligned, 80-column text lines.

    :param listing: list of (hours, pid, command) tuples of strings
    :returns: list of lines; the first one is the column header
    """
    hours_len = len('Hours')
    pid_len = len('PID')
    args_len = len('Command')
    for hours, pid, command in listing:
        hours_len = max(hours_len, len(hours))
        pid_len = max(pid_len, len(pid))
        args_len = max(args_len, len(command))
    # Two separating spaces plus the three columns must fit in 80 characters.
    args_len = min(args_len, 78 - hours_len - pid_len)
    template = '%%%ds %%%ds %%s' % (hours_len, pid_len)
    lines = [template % ('Hours', 'PID', 'Command')]
    for hours, pid, command in listing:
        lines.append(template % (hours, pid, command[:args_len]))
    return lines


def main():
    """List Swift processes that have been running for at least --age hours."""
    parser = optparse.OptionParser(usage='''%prog [options]
Lists old Swift processes.
'''.strip())
    parser.add_option('-a', '--age', dest='hours', type='int', default=720,
        help='look for processes at least HOURS old; default: 720 (30 days)')
    (options, args) = parser.parse_args()

    # universal_newlines makes ps output text (not bytes) on every Python
    # version, so the line splitting below always operates on str.
    ps_output = subprocess.Popen(
        ['ps', '-eo', 'etime,pid,args', '--no-headers'],
        stdout=subprocess.PIPE,
        universal_newlines=True).communicate()[0]

    listing = []
    for line in ps_output.split('\n'):
        if not line:
            continue
        try:
            etime, pid, command = line.split(None, 2)
        except ValueError:
            sys.exit('Could not process ps line %r' % line)
        if not command.startswith(SWIFT_COMMAND_PREFIXES):
            continue
        try:
            hours = etime_to_hours(etime)
        except ValueError:
            sys.exit('Could not process etime value from %r' % line)
        if hours >= options.hours:
            # Keep only what follows "swift-", e.g. "object-server /etc/...".
            listing.append((str(hours), pid, command.split('-', 1)[1]))

    if not listing:
        sys.exit()
    for row in format_listing(listing):
        print(row)


if __name__ == '__main__':
    main()

105
bin/swift-orphans Executable file
View File

@ -0,0 +1,105 @@
#!/usr/bin/env python
import optparse
import os
import signal
import subprocess
import sys
# Command-line prefixes that identify a Swift process in `ps` output.
SWIFT_COMMAND_PREFIXES = (
    '/usr/bin/python /usr/bin/swift-',
    '/usr/bin/python /usr/local/bin/swift-',
)


def etime_to_hours(etime):
    """Convert a ps ``etime`` field to whole elapsed hours.

    The value is expected to look like ``[[dd-]hh:]mm:ss``. Only the day
    and hour parts contribute to the result; minutes and seconds are dropped.

    :param etime: elapsed-time column of one ps output line
    :returns: elapsed time in hours, rounded down
    :raises ValueError: if the value does not have the expected layout or
                        its day/hour part is not an integer
    """
    day_split = etime.split('-')
    if len(day_split) == 2:
        hours = int(day_split[0]) * 24
    elif len(day_split) == 1:
        hours = 0
    else:
        raise ValueError('unexpected etime value %r' % etime)
    clock = day_split[-1].split(':')
    if len(clock) == 3:
        hours += int(clock[0])
    elif len(clock) != 2:
        raise ValueError('unexpected etime value %r' % etime)
    return hours


def collect_known_pids(run_dir='/var/run/swift'):
    """Return the pids recorded in .pid files plus their direct children.

    :param run_dir: directory tree scanned for ``*.pid`` files
    :returns: set of pid strings that belong to tracked Swift servers
    """
    known = set()
    for root, directories, files in os.walk(run_dir):
        for name in files:
            if not name.endswith('.pid'):
                continue
            with open(os.path.join(root, name)) as pid_file:
                pid = pid_file.read().strip()
            if not pid:
                # An empty pid file names no process; asking ps for the
                # children of '' would only produce an error message.
                continue
            known.add(pid)
            known.update(subprocess.Popen(
                ['ps', '--ppid', pid, '-o', 'pid', '--no-headers'],
                stdout=subprocess.PIPE,
                universal_newlines=True).communicate()[0].split())
    return known


def resolve_signal(name):
    """Translate a signal given as a number or a name to a signal number.

    Names are matched case-insensitively against the ``signal`` module, with
    or without the ``SIG`` prefix.

    :param name: e.g. ``'15'``, ``'TERM'`` or ``'sigterm'``
    :returns: the signal number, or None if the name is unknown
    """
    try:
        return int(name)
    except ValueError:
        pass
    upper = name.upper()
    return getattr(signal, upper, getattr(signal, 'SIG' + upper, None))


def format_listing(listing, wide=False):
    """Render the process listing as right-aligned text lines.

    :param listing: list of (hours, pid, command) tuples of strings
    :param wide: when true, commands are not clipped to fit 80 columns
    :returns: list of lines; the first one is the column header
    """
    hours_len = len('Hours')
    pid_len = len('PID')
    args_len = len('Command')
    for hours, pid, command in listing:
        hours_len = max(hours_len, len(hours))
        pid_len = max(pid_len, len(pid))
        args_len = max(args_len, len(command))
    if not wide:
        # Two separating spaces plus the three columns must fit in 80
        # characters.
        args_len = min(args_len, 78 - hours_len - pid_len)
    template = '%%%ds %%%ds %%s' % (hours_len, pid_len)
    lines = [template % ('Hours', 'PID', 'Command')]
    for hours, pid, command in listing:
        lines.append(template % (hours, pid, command[:args_len]))
    return lines


def main():
    """List, and optionally signal, orphaned Swift processes."""
    # Imported here because the script's import header does not provide it.
    import errno

    parser = optparse.OptionParser(usage='''%prog [options]
Lists and optionally kills orphaned Swift processes. This is done by scanning
/var/run/swift for .pid files and listing any processes that look like Swift
processes but aren't associated with the pids in those .pid files. Any Swift
processes running with the 'once' parameter are ignored, as those are usually
for full-speed audit scans and such.
Example (sends SIGTERM to all orphaned Swift processes older than two hours):
%prog -a 2 -k TERM
'''.strip())
    parser.add_option('-a', '--age', dest='hours', type='int', default=24,
        help='look for processes at least HOURS old; default: 24')
    parser.add_option('-k', '--kill', dest='signal',
        help='send SIGNAL to matched processes; default: just list process '
             'information')
    parser.add_option('-w', '--wide', dest='wide', default=False,
        action='store_true', help="don't clip the listing at 80 characters")
    (options, args) = parser.parse_args()

    known_pids = collect_known_pids('/var/run/swift')

    # universal_newlines makes ps output text (not bytes) on every Python
    # version, so the line splitting below always operates on str.
    ps_output = subprocess.Popen(
        ['ps', '-eo', 'etime,pid,args', '--no-headers'],
        stdout=subprocess.PIPE,
        universal_newlines=True).communicate()[0]

    listing = []
    for line in ps_output.split('\n'):
        if not line:
            continue
        try:
            etime, pid, command = line.split(None, 2)
        except ValueError:
            sys.exit('Could not process ps line %r' % line)
        if pid in known_pids:
            continue
        if not command.startswith(SWIFT_COMMAND_PREFIXES):
            continue
        # Never report this tool itself, nor one-shot ("once") runs.
        if 'swift-orphans' in command or 'once' in command.split():
            continue
        try:
            hours = etime_to_hours(etime)
        except ValueError:
            sys.exit('Could not process etime value from %r' % line)
        if hours >= options.hours:
            # Keep only what follows "swift-", e.g. "object-server /etc/...".
            listing.append((str(hours), pid, command.split('-', 1)[1]))

    if not listing:
        sys.exit()
    for row in format_listing(listing, wide=options.wide):
        print(row)

    if options.signal:
        signum = resolve_signal(options.signal)
        # Compare against None: 0 is a legitimate signal number.
        if signum is None:
            sys.exit('Could not translate %r to a signal number.' %
                     options.signal)
        sys.stdout.write('Sending processes %s (%d) signal... ' %
                         (options.signal, signum))
        for hours, pid, command in listing:
            try:
                os.kill(int(pid), signum)
            except OSError as err:
                # The process may have exited since ps ran; that must not
                # stop the remaining processes from being signalled.
                if err.errno != errno.ESRCH:
                    raise
        print('Done.')


if __name__ == '__main__':
    main()

View File

@ -373,3 +373,25 @@ run this command as follows:
`swift-object-auditor /path/to/object-server/config/file.conf once -z 1000`
"-z" means to only check for zero-byte files at 1000 files per second.
-------------
Swift Orphans
-------------
Swift Orphans are processes left over after a reload of a Swift server.
For example, when upgrading a proxy server you would probably finish with a `swift-init proxy-server reload` or `/etc/init.d/swift-proxy reload`. This kills the parent proxy server process and leaves the child processes running to finish processing whatever requests they might be handling at the time. It then starts up a new parent proxy server process and its children to handle new incoming requests. This allows zero-downtime upgrades with no impact to existing requests.
The orphaned child processes may take a while to exit, depending on the length of the requests they were handling. However, sometimes an old process can be hung up due to some bug or hardware issue. In these cases, these orphaned processes will hang around forever. `swift-orphans` can be used to find and kill these orphans.
`swift-orphans` with no arguments will just list the orphans it finds that were started more than 24 hours ago. You shouldn't really check for orphans until 24 hours after you perform a reload, as some requests can take a long time to process. `swift-orphans -k TERM` will send the SIGTERM signal to the orphan processes, or you can `kill -TERM` the pids yourself if you prefer.
You can run `swift-orphans --help` for more options.
------------
Swift Oldies
------------
Swift Oldies are processes that have just been around for a long time. There's nothing necessarily wrong with this, but it might indicate a hung process if you regularly upgrade and reload/restart services. You might have so many servers that you don't notice when a reload/restart fails; `swift-oldies` can help with this.
For example, if you upgraded and reloaded/restarted everything 2 days ago, and you've already cleaned up any orphans with `swift-orphans`, you can run `swift-oldies -a 48` to find any Swift processes still around that were started more than 2 days ago and then investigate them accordingly.

View File

@ -57,7 +57,8 @@ setup(
'bin/swift-stats-report',
'bin/swift-dispersion-populate', 'bin/swift-dispersion-report',
'bin/swift-bench',
'bin/swift-recon', 'bin/swift-recon-cron',
'bin/swift-recon', 'bin/swift-recon-cron', 'bin/swift-orphans',
'bin/swift-oldies'
],
entry_points={
'paste.app_factory': [