glance/bin/glance-control
Eoghan Glynn 804396204e Respawn glance services on unexpected death.
Fixes bug 923894

Add new '--respawn' option to cause glance services launched via
glance-control to be monitored for unexpected death and resuscitated
as necessary.

This option will cause glance-control itself to remain running.

Deliberately stopped services are not respawned, neither are rapidly
bouncing services (where process death occurred within one second
of the last launch).

Change-Id: I1a9a99cce9b6ad43274836e39ebe4f29c19455af
2012-02-08 09:41:11 +00:00

353 lines
11 KiB
Python
Executable File

#!/usr/bin/env python
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# Copyright (c) 2011 OpenStack, LLC.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Helper script for starting/stopping/reloading Glance server programs.
Thanks for some of the code, Swifties ;)
"""
from __future__ import with_statement
import errno
import fcntl
import gettext
import os
import resource
import signal
import subprocess
import sys
import time
# If ../glance/__init__.py exists, add ../ to Python search path, so that
# it will override what happens to be installed in /usr/(local/)lib/python...
possible_topdir = os.path.normpath(os.path.join(os.path.abspath(sys.argv[0]),
os.pardir,
os.pardir))
if os.path.exists(os.path.join(possible_topdir, 'glance', '__init__.py')):
sys.path.insert(0, possible_topdir)
gettext.install('glance', unicode=1)
from glance import version
from glance.common import cfg
from glance.common import config
ALL_COMMANDS = ['start', 'stop', 'shutdown', 'restart',
'reload', 'force-reload']
ALL_SERVERS = ['glance-api', 'glance-registry', 'glance-scrubber']
GRACEFUL_SHUTDOWN_SERVERS = ['glance-api', 'glance-registry',
'glance-scrubber']
MAX_DESCRIPTORS = 32768
MAX_MEMORY = (1024 * 1024 * 1024) * 2 # 2 GB
USAGE = """%prog [options] <SERVER> <COMMAND> [CONFPATH]
Where <SERVER> is one of:
all, api, registry, scrubber
And command is one of:
start, stop, shutdown, restart, reload, force-reload
And CONFPATH is the optional configuration file to use."""
def gated_by(predicate):
def wrap(f):
def wrapped_f(*args):
if predicate:
return f(*args)
else:
return None
return wrapped_f
return wrap
def pid_files(server, conf):
pid_files = []
if conf.pid_file:
if os.path.exists(os.path.abspath(conf.pid_file)):
pid_files = [os.path.abspath(conf.pid_file)]
else:
if os.path.exists('/var/run/glance/%s.pid' % server):
pid_files = ['/var/run/glance/%s.pid' % server]
for pid_file in pid_files:
pid = int(open(pid_file).read().strip())
yield pid_file, pid
def do_start(verb, server, conf, args):
if verb != 'Respawn':
for pid_file, pid in pid_files(server, conf):
if os.path.exists('/proc/%s' % pid):
print "%s appears to already be running: %s" % \
(server, pid_file)
return
else:
print "Removing stale pid file %s" % pid_file
os.unlink(pid_file)
try:
resource.setrlimit(resource.RLIMIT_NOFILE,
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
resource.setrlimit(resource.RLIMIT_DATA,
(MAX_MEMORY, MAX_MEMORY))
except ValueError:
action = 'increase file descriptor limit'
print 'Unable to %s. Running as non-root?' % action
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
def write_pid_file(pid_file, pid):
dir, file = os.path.split(pid_file)
if not os.path.exists(dir):
try:
os.makedirs(dir)
except OSError, err:
if err.errno == errno.EACCES:
sys.exit('Unable to create %s. Running as non-root?'
% dir)
fp = open(pid_file, 'w')
fp.write('%d\n' % pid)
fp.close()
def redirect_to_null(fds):
with open(os.devnull, 'r+b') as nullfile:
for desc in fds: # close fds
try:
os.dup2(nullfile.fileno(), desc)
except OSError:
pass
def redirect_to_syslog(fds, server):
log_cmd = '/usr/bin/logger -t "%s[%d]"' % (server, os.getpid())
process = subprocess.Popen(log_cmd,
shell=True,
stdin=subprocess.PIPE)
for desc in fds: # pipe to logger command
try:
os.dup2(process.stdin.fileno(), desc)
except OSError:
pass
def redirect_stdio(conf, server):
input = [sys.stdin.fileno()]
output = [sys.stdout.fileno(), sys.stderr.fileno()]
redirect_to_null(input)
if conf.capture_output:
redirect_to_syslog(output, server)
else:
redirect_to_null(output)
@gated_by(conf.capture_output)
def close_stdio_on_exec():
fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()]
for desc in fds: # set close on exec flag
fcntl.fcntl(desc, fcntl.F_SETFD, fcntl.FD_CLOEXEC)
def launch(pid_file, conf_file=None):
args = [server]
print '%sing %s' % (verb, server),
if conf_file:
args += ['--config-file', conf_file]
print 'with %s' % conf_file,
print
close_stdio_on_exec()
pid = os.fork()
if pid == 0:
os.setsid()
redirect_stdio(conf, server)
try:
os.execlp('%s' % server, *args)
except OSError, e:
msg = 'unable to launch %s. Got error: %s' % (server, e)
sys.exit(msg)
sys.exit(0)
else:
write_pid_file(pid_file, pid)
await_child(pid)
return pid
@gated_by(conf.await_child)
def await_child(pid):
bail_time = time.time() + conf.await_child
while time.time() < bail_time:
reported_pid, status = os.waitpid(pid, os.WNOHANG)
if reported_pid == pid:
global exitcode
exitcode = os.WEXITSTATUS(status)
break
time.sleep(0.05)
pid_file = get_pid_file(server, conf)
conf_file = None
if args and os.path.exists(args[0]):
conf_file = os.path.abspath(os.path.expanduser(args[0]))
return launch(pid_file, conf_file)
def get_pid_file(pid, conf):
return os.path.abspath(conf.pid_file) if conf.pid_file else \
'/var/run/glance/%s.pid' % server
def do_stop(server, conf, args, graceful=False):
if graceful and server in GRACEFUL_SHUTDOWN_SERVERS:
sig = signal.SIGHUP
else:
sig = signal.SIGTERM
did_anything = False
pfiles = pid_files(server, conf)
for pid_file, pid in pfiles:
did_anything = True
try:
os.unlink(pid_file)
except OSError:
pass
try:
print 'Stopping %s pid: %s signal: %s' % (server, pid, sig)
os.kill(pid, sig)
except OSError:
print "Process %d not running" % pid
for pid_file, pid in pfiles:
for _junk in xrange(150): # 15 seconds
if not os.path.exists('/proc/%s' % pid):
break
time.sleep(0.1)
else:
print 'Waited 15 seconds for pid %s (%s) to die; giving up' % \
(pid, pid_file)
if not did_anything:
print 'No %s running' % server
if __name__ == '__main__':
exitcode = 0
conf = config.GlanceConfigOpts(usage=USAGE)
opts = [
cfg.StrOpt('pid-file',
metavar='PATH',
help='File to use as pid file. Default: '
'/var/run/glance/$server.pid'),
cfg.IntOpt('await-child',
metavar='DELAY',
default=0,
help='Period to wait for service death '
'in order to report exit code '
'(default is to not wait at all)'),
cfg.BoolOpt('capture-output',
default=False,
help='Capture stdout/err in syslog '
'instead of discarding'),
cfg.BoolOpt('respawn',
default=False,
help='Restart service on unexpected death'),
]
conf.register_cli_opts(opts)
args = conf()
@gated_by(conf.await_child)
@gated_by(conf.respawn)
def mutually_exclusive():
sys.stderr.write('--await-child and --respawn are mutually exclusive')
sys.exit(1)
mutually_exclusive()
if len(args) < 2:
conf.print_usage()
sys.exit(1)
server = args.pop(0).lower()
if server == 'all':
servers = ALL_SERVERS
else:
if not server.startswith('glance-'):
server = 'glance-%s' % server
if server not in ALL_SERVERS:
server_list = ", ".join([s.replace('glance-', '')
for s in ALL_SERVERS])
msg = ("Unknown server '%(server)s' specified. Please specify "
"all, or one of the servers: %(server_list)s" % locals())
sys.exit(msg)
servers = [server]
command = args.pop(0).lower()
if command not in ALL_COMMANDS:
command_list = ", ".join(ALL_COMMANDS)
msg = ("Unknown command %(command)s specified. Please specify a "
"command in this list: %(command_list)s" % locals())
sys.exit(msg)
@gated_by(conf.respawn)
def anticipate_respawn(children):
while children:
pid, status = os.wait()
if pid in children:
(server, conf, args) = children.pop(pid)
pid_file = get_pid_file(server, conf)
running = os.path.exists(pid_file)
one_second_ago = time.time() - 1
bouncing = (running and
os.path.getmtime(pid_file) >= one_second_ago)
if running and not bouncing:
args = (server, conf, args)
new_pid = do_start('Respawn', *args)
children[new_pid] = args
else:
rsn = 'bouncing' if bouncing else 'deliberately stopped'
print 'Supressed respawn as %s was %s.' % (server, rsn)
if command == 'start':
children = {}
for server in servers:
args = (server, conf, args)
pid = do_start('Start', *args)
children[pid] = args
anticipate_respawn(children)
if command == 'stop':
for server in servers:
do_stop(server, conf, args)
if command == 'shutdown':
for server in servers:
do_stop(server, conf, args, graceful=True)
if command == 'restart':
for server in servers:
do_stop(server, conf, args)
for server in servers:
do_start('Restart', server, conf, args)
if command == 'reload' or command == 'force-reload':
for server in servers:
do_stop(server, conf, args, graceful=True)
do_start(server, conf, args)
sys.exit(exitcode)