Respawn glance services on unexpected death.

Fixes bug 923894

Add new '--respawn' option to cause glance services launched via
glance-control to be monitored for unexpected death and resuscitated
as necessary.

This option will cause glance-control itself to remain running.

Deliberately stopped services are not respawned, neither are rapidly
bouncing services (where process death occurred within one second
of the last launch).

Change-Id: I1a9a99cce9b6ad43274836e39ebe4f29c19455af
This commit is contained in:
Eoghan Glynn 2012-01-30 11:13:28 +00:00
parent eea25b4149
commit 804396204e
6 changed files with 303 additions and 69 deletions

View File

@ -67,6 +67,17 @@ And command is one of:
And CONFPATH is the optional configuration file to use.""" And CONFPATH is the optional configuration file to use."""
def gated_by(predicate):
def wrap(f):
def wrapped_f(*args):
if predicate:
return f(*args)
else:
return None
return wrapped_f
return wrap
def pid_files(server, conf): def pid_files(server, conf):
pid_files = [] pid_files = []
if conf.pid_file: if conf.pid_file:
@ -80,25 +91,26 @@ def pid_files(server, conf):
yield pid_file, pid yield pid_file, pid
def do_start(server, conf, args): def do_start(verb, server, conf, args):
server_type = '-'.join(server.split('-')[:-1]) if verb != 'Respawn':
for pid_file, pid in pid_files(server, conf):
if os.path.exists('/proc/%s' % pid):
print "%s appears to already be running: %s" % \
(server, pid_file)
return
else:
print "Removing stale pid file %s" % pid_file
os.unlink(pid_file)
for pid_file, pid in pid_files(server, conf): try:
if os.path.exists('/proc/%s' % pid): resource.setrlimit(resource.RLIMIT_NOFILE,
print "%s appears to already be running: %s" % (server, pid_file) (MAX_DESCRIPTORS, MAX_DESCRIPTORS))
return resource.setrlimit(resource.RLIMIT_DATA,
else: (MAX_MEMORY, MAX_MEMORY))
print "Removing stale pid file %s" % pid_file except ValueError:
os.unlink(pid_file) action = 'increase file descriptor limit'
print 'Unable to %s. Running as non-root?' % action
try: os.environ['PYTHON_EGG_CACHE'] = '/tmp'
resource.setrlimit(resource.RLIMIT_NOFILE,
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
resource.setrlimit(resource.RLIMIT_DATA,
(MAX_MEMORY, MAX_MEMORY))
except ValueError:
print "Unable to increase file descriptor limit. Running as non-root?"
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
def write_pid_file(pid_file, pid): def write_pid_file(pid_file, pid):
dir, file = os.path.split(pid_file) dir, file = os.path.split(pid_file)
@ -113,18 +125,6 @@ def do_start(server, conf, args):
fp.write('%d\n' % pid) fp.write('%d\n' % pid)
fp.close() fp.close()
def await_child(pid):
if conf.await_child:
bail_time = time.time() + conf.await_child
while time.time() < bail_time:
reported_pid, status = os.waitpid(pid, os.WNOHANG)
if reported_pid == pid:
global exitcode
# the exit code is encoded in 2nd least significant byte
exitcode = status >> 8
break
time.sleep(0.05)
def redirect_to_null(fds): def redirect_to_null(fds):
with open(os.devnull, 'r+b') as nullfile: with open(os.devnull, 'r+b') as nullfile:
for desc in fds: # close fds for desc in fds: # close fds
@ -154,6 +154,7 @@ def do_start(server, conf, args):
else: else:
redirect_to_null(output) redirect_to_null(output)
@gated_by(conf.capture_output)
def close_stdio_on_exec(): def close_stdio_on_exec():
fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()] fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()]
for desc in fds: # set close on exec flag for desc in fds: # set close on exec flag
@ -161,7 +162,7 @@ def do_start(server, conf, args):
def launch(pid_file, conf_file=None): def launch(pid_file, conf_file=None):
args = [server] args = [server]
print 'Starting %s' % server, print '%sing %s' % (verb, server),
if conf_file: if conf_file:
args += ['--config-file', conf_file] args += ['--config-file', conf_file]
print 'with %s' % conf_file, print 'with %s' % conf_file,
@ -176,23 +177,37 @@ def do_start(server, conf, args):
try: try:
os.execlp('%s' % server, *args) os.execlp('%s' % server, *args)
except OSError, e: except OSError, e:
sys.exit('unable to launch %s. Got error: %s' msg = 'unable to launch %s. Got error: %s' % (server, e)
% (server, "%s" % e)) sys.exit(msg)
sys.exit(0) sys.exit(0)
else: else:
write_pid_file(pid_file, pid) write_pid_file(pid_file, pid)
await_child(pid) await_child(pid)
return pid
if not conf.pid_file: @gated_by(conf.await_child)
pid_file = '/var/run/glance/%s.pid' % server def await_child(pid):
else: bail_time = time.time() + conf.await_child
pid_file = os.path.abspath(conf.pid_file) while time.time() < bail_time:
reported_pid, status = os.waitpid(pid, os.WNOHANG)
if reported_pid == pid:
global exitcode
exitcode = os.WEXITSTATUS(status)
break
time.sleep(0.05)
pid_file = get_pid_file(server, conf)
conf_file = None conf_file = None
if args and os.path.exists(args[0]): if args and os.path.exists(args[0]):
conf_file = os.path.abspath(os.path.expanduser(args[0])) conf_file = os.path.abspath(os.path.expanduser(args[0]))
launch(pid_file, conf_file) return launch(pid_file, conf_file)
def get_pid_file(pid, conf):
return os.path.abspath(conf.pid_file) if conf.pid_file else \
'/var/run/glance/%s.pid' % server
def do_stop(server, conf, args, graceful=False): def do_stop(server, conf, args, graceful=False):
@ -205,15 +220,15 @@ def do_stop(server, conf, args, graceful=False):
pfiles = pid_files(server, conf) pfiles = pid_files(server, conf)
for pid_file, pid in pfiles: for pid_file, pid in pfiles:
did_anything = True did_anything = True
try:
os.unlink(pid_file)
except OSError:
pass
try: try:
print 'Stopping %s pid: %s signal: %s' % (server, pid, sig) print 'Stopping %s pid: %s signal: %s' % (server, pid, sig)
os.kill(pid, sig) os.kill(pid, sig)
except OSError: except OSError:
print "Process %d not running" % pid print "Process %d not running" % pid
try:
os.unlink(pid_file)
except OSError:
pass
for pid_file, pid in pfiles: for pid_file, pid in pfiles:
for _junk in xrange(150): # 15 seconds for _junk in xrange(150): # 15 seconds
if not os.path.exists('/proc/%s' % pid): if not os.path.exists('/proc/%s' % pid):
@ -246,11 +261,22 @@ if __name__ == '__main__':
default=False, default=False,
help='Capture stdout/err in syslog ' help='Capture stdout/err in syslog '
'instead of discarding'), 'instead of discarding'),
cfg.BoolOpt('respawn',
default=False,
help='Restart service on unexpected death'),
] ]
conf.register_cli_opts(opts) conf.register_cli_opts(opts)
args = conf() args = conf()
@gated_by(conf.await_child)
@gated_by(conf.respawn)
def mutually_exclusive():
sys.stderr.write('--await-child and --respawn are mutually exclusive')
sys.exit(1)
mutually_exclusive()
if len(args) < 2: if len(args) < 2:
conf.print_usage() conf.print_usage()
sys.exit(1) sys.exit(1)
@ -276,9 +302,33 @@ if __name__ == '__main__':
"command in this list: %(command_list)s" % locals()) "command in this list: %(command_list)s" % locals())
sys.exit(msg) sys.exit(msg)
@gated_by(conf.respawn)
def anticipate_respawn(children):
while children:
pid, status = os.wait()
if pid in children:
(server, conf, args) = children.pop(pid)
pid_file = get_pid_file(server, conf)
running = os.path.exists(pid_file)
one_second_ago = time.time() - 1
bouncing = (running and
os.path.getmtime(pid_file) >= one_second_ago)
if running and not bouncing:
args = (server, conf, args)
new_pid = do_start('Respawn', *args)
children[new_pid] = args
else:
rsn = 'bouncing' if bouncing else 'deliberately stopped'
print 'Supressed respawn as %s was %s.' % (server, rsn)
if command == 'start': if command == 'start':
children = {}
for server in servers: for server in servers:
do_start(server, conf, args) args = (server, conf, args)
pid = do_start('Start', *args)
children[pid] = args
anticipate_respawn(children)
if command == 'stop': if command == 'stop':
for server in servers: for server in servers:
@ -292,7 +342,7 @@ if __name__ == '__main__':
for server in servers: for server in servers:
do_stop(server, conf, args) do_stop(server, conf, args)
for server in servers: for server in servers:
do_start(server, conf, args) do_start('Restart', server, conf, args)
if command == 'reload' or command == 'force-reload': if command == 'reload' or command == 'force-reload':
for server in servers: for server in servers:

View File

@ -177,6 +177,18 @@ Glance server programs, and you can specify (as the example above shows)
a configuration file when starting the server. a configuration file when starting the server.
In order for your launched glance service to be monitored for unexpected death
and respawned if necessary, use the following option:
$ sudo glance-control [service] start --respawn ...
Note that this will cause ``glance-control`` itself to remain running. Also note
that deliberately stopped services are not respawned, neither are rapidly bouncing
services (where process death occurred within one second of the last launch).
By default, output from glance services is discarded when launched with ``glance-control``. By default, output from glance services is discarded when launched with ``glance-control``.
In order to capture such output via syslog, use the following option: In order to capture such output via syslog, use the following option:

View File

@ -129,7 +129,7 @@ class Server(object):
return self.conf_file_name return self.conf_file_name
def start(self, expected_exitcode=0, **kwargs): def start(self, expect_exit=True, expected_exitcode=0, **kwargs):
""" """
Starts the server. Starts the server.
@ -147,6 +147,7 @@ class Server(object):
return execute(cmd, return execute(cmd,
no_venv=self.no_venv, no_venv=self.no_venv,
exec_env=self.exec_env, exec_env=self.exec_env,
expect_exit=expect_exit,
expected_exitcode=expected_exitcode) expected_exitcode=expected_exitcode)
def stop(self): def stop(self):
@ -156,7 +157,8 @@ class Server(object):
cmd = ("%(server_control)s %(server_name)s stop " cmd = ("%(server_control)s %(server_name)s stop "
"%(conf_file_name)s --pid-file=%(pid_file)s" "%(conf_file_name)s --pid-file=%(pid_file)s"
% self.__dict__) % self.__dict__)
return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env) return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env,
expect_exit=True)
class ApiServer(Server): class ApiServer(Server):
@ -449,6 +451,7 @@ class FunctionalTest(unittest.TestCase):
def start_server(self, def start_server(self,
server, server,
expect_launch, expect_launch,
expect_exit=True,
expected_exitcode=0, expected_exitcode=0,
**kwargs): **kwargs):
""" """
@ -460,18 +463,22 @@ class FunctionalTest(unittest.TestCase):
:param server: the server to launch :param server: the server to launch
:param expect_launch: true iff the server is expected to :param expect_launch: true iff the server is expected to
successfully start successfully start
:param expect_exit: true iff the launched server is expected
to exit in a timely fashion
:param expected_exitcode: expected exitcode from the launcher :param expected_exitcode: expected exitcode from the launcher
""" """
self.cleanup() self.cleanup()
# Start up the requested server # Start up the requested server
exitcode, out, err = server.start(expected_exitcode=expected_exitcode, exitcode, out, err = server.start(expect_exit=expect_exit,
expected_exitcode=expected_exitcode,
**kwargs) **kwargs)
if expect_exit:
self.assertEqual(expected_exitcode, exitcode,
"Failed to spin up the requested server. "
"Got: %s" % err)
self.assertEqual(expected_exitcode, exitcode, self.assertTrue(re.search("Starting glance-[a-z]+ with", out))
"Failed to spin up the requested server. "
"Got: %s" % err)
self.assertTrue(re.search("Starting glance-[a-z]+ with", out))
self.wait_for_servers([server.bind_port], expect_launch) self.wait_for_servers([server.bind_port], expect_launch)
@ -551,6 +558,19 @@ class FunctionalTest(unittest.TestCase):
time.sleep(0.05) time.sleep(0.05)
self.assertFalse(expect_launch, "Unexpected server launch status") self.assertFalse(expect_launch, "Unexpected server launch status")
def stop_server(self, server, name):
"""
Called to stop a single server in a normal fashion using the
glance-control stop method to gracefully shut the server down.
:param server: the server to stop
"""
# Spin down the requested server
exitcode, out, err = server.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the %s server. Got: %s" %
(err, name))
def stop_servers(self): def stop_servers(self):
""" """
Called to stop the started servers in a normal fashion. Note Called to stop the started servers in a normal fashion. Note
@ -562,20 +582,10 @@ class FunctionalTest(unittest.TestCase):
""" """
# Spin down the API and default registry server # Spin down the API and default registry server
exitcode, out, err = self.api_server.stop() self.stop_server(self.api_server, 'API server')
self.assertEqual(0, exitcode, self.stop_server(self.registry_server, 'Registry server')
"Failed to spin down the API server. " self.stop_server(self.scrubber_daemon, 'Scrubber daemon')
"Got: %s" % err)
exitcode, out, err = self.registry_server.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the Registry server. "
"Got: %s" % err)
exitcode, out, err = self.scrubber_daemon.stop()
self.assertEqual(0, exitcode,
"Failed to spin down the Scrubber daemon. "
"Got: %s" % err)
# If all went well, then just remove the test directory. # If all went well, then just remove the test directory.
# We only want to check the logs and stuff if something # We only want to check the logs and stuff if something
# went wrong... # went wrong...

View File

@ -21,7 +21,6 @@ import datetime
import hashlib import hashlib
import httplib2 import httplib2
import json import json
import os
import tempfile import tempfile
from glance.common import utils from glance.common import utils
@ -1293,6 +1292,8 @@ class TestApi(functional.FunctionalTest):
""" """
self.cleanup() self.cleanup()
self.api_server.default_store = 'shouldnotexist' self.api_server.default_store = 'shouldnotexist'
# ensure failure exit code is available to assert on
self.api_server.server_control_options += ' --await-child=1' self.api_server.server_control_options += ' --await-child=1'
# ensure that the API server fails to launch # ensure that the API server fails to launch

View File

@ -0,0 +1,153 @@
# vim: tabstop=4 shiftwidth=4 softtabstop=4
# Copyright 2012 Red Hat, Inc
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
# a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations
# under the License.
"""Functional test case for the glance-control --respawn option """
import httplib2
import os
import time
import socket
import signal
import sys
import time
from glance.tests import functional
from glance.tests.utils import execute, skip_if_disabled
class TestRespawn(functional.FunctionalTest):
"""Functional test for glance-control --respawn """
def get_versions(self):
path = "http://%s:%d" % ("0.0.0.0", self.api_port)
http = httplib2.Http()
response, content = http.request(path, 'GET')
self.assertEqual(response.status, 300)
def get_pid(self):
return int(open(self.api_server.pid_file).read().strip())
def kill_server(self):
pid = self.get_pid()
os.killpg(pid, signal.SIGKILL)
return pid
def wait_for(self, predicate):
count = 0
while count < 50:
if predicate():
break
else:
time.sleep(0.1)
count += 1
self.assertTrue(predicate())
def connection_unavailable(self, type):
try:
self.get_versions()
self.fail('%s server should not be respawned' % type)
except socket.error:
exc_value = sys.exc_info()[1]
self.assertTrue('Connection refused' in exc_value or
'ECONNREFUSED' in exc_value)
@skip_if_disabled
def test_respawn(self):
"""
We test that the '--respawn' option causes the API server
to be respawned after death but not after a deliberate stop
"""
self.cleanup()
self.api_server.server_control_options += ' --respawn'
# start API server, allowing glance-control to continue running
self.start_server(self.api_server,
expect_launch=True,
expect_exit=False,
**self.__dict__.copy())
# ensure the service pid has been cached
pid_cached = lambda: os.path.exists(self.api_server.pid_file)
self.wait_for(pid_cached)
# ensure glance-control has had a chance to waitpid on child
time.sleep(1)
# verify server health with version negotiation
self.get_versions()
# server is killed ungracefully
old_pid = self.kill_server()
# ... but should be respawned
# wait for pid to cycle
pid_changed = lambda: old_pid != self.get_pid()
self.wait_for(pid_changed)
# ensure API service port is re-activated
self.wait_for_servers([self.api_server.bind_port])
# verify server health with version negotiation
self.get_versions()
# deliberately stop server, it should not be respawned
proc_file = '/proc/%d' % self.get_pid()
self.stop_server(self.api_server, 'API server')
# ensure last server process has gone away
process_died = lambda: not os.path.exists(proc_file)
self.wait_for(process_died)
# deliberately stopped server should not be respawned
self.wait_for_servers([self.api_server.bind_port], False)
# ensure the server has not been respawned
self.connection_unavailable('deliberately stopped')
@skip_if_disabled
def test_bouncing(self):
"""
We test that the '--respawn' option doesn't cause bouncing
API server to be respawned
"""
self.cleanup()
self.api_server.server_control_options += ' --respawn'
self.api_server.default_store = 'shouldnotexist'
# start API server, allowing glance-control to continue running
self.start_server(self.api_server,
expect_launch=False,
expect_exit=False,
**self.__dict__.copy())
# ensure the service pid has been cached
pid_cached = lambda: os.path.exists(self.api_server.pid_file)
self.wait_for(pid_cached)
# ensure glance-control has had a chance to waitpid on child
time.sleep(1)
# bouncing server should not be respawned
self.wait_for_servers([self.api_server.bind_port], False)
# ensure server process has gone away
process_died = lambda: not os.path.exists('/proc/%d' % self.get_pid())
self.wait_for(process_died)
# ensure the server has not been respawned
self.connection_unavailable('bouncing')

View File

@ -172,6 +172,7 @@ def execute(cmd,
raise_error=True, raise_error=True,
no_venv=False, no_venv=False,
exec_env=None, exec_env=None,
expect_exit=True,
expected_exitcode=0): expected_exitcode=0):
""" """
Executes a command in a subprocess. Returns a tuple Executes a command in a subprocess. Returns a tuple
@ -187,6 +188,7 @@ def execute(cmd,
variables; values may be callables, which will variables; values may be callables, which will
be passed the current value of the named be passed the current value of the named
environment variable environment variable
:param expect_exit: Optional flag true iff timely exit is expected
:param expected_exitcode: expected exitcode from the launcher :param expected_exitcode: expected exitcode from the launcher
""" """
@ -221,10 +223,16 @@ def execute(cmd,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
env=env) env=env)
result = process.communicate() if expect_exit:
(out, err) = result result = process.communicate()
exitcode = process.returncode (out, err) = result
if process.returncode != expected_exitcode and raise_error: exitcode = process.returncode
else:
out = ''
err = ''
exitcode = 0
if exitcode != expected_exitcode and raise_error:
msg = "Command %(cmd)s did not succeed. Returned an exit "\ msg = "Command %(cmd)s did not succeed. Returned an exit "\
"code of %(exitcode)d."\ "code of %(exitcode)d."\
"\n\nSTDOUT: %(out)s"\ "\n\nSTDOUT: %(out)s"\