Respawn glance services on unexpected death.
Fixes bug 923894 Add new '--respawn' option to cause glance services launched via glance-control to be monitored for unexpected death and resuscitated as necessary. This option will cause glance-control itself to remain running. Deliberately stopped services are not respawned, neither are rapidly bouncing services (where process death occurred within one second of the last launch). Change-Id: I1a9a99cce9b6ad43274836e39ebe4f29c19455af
This commit is contained in:
parent
eea25b4149
commit
804396204e
@ -67,6 +67,17 @@ And command is one of:
|
||||
And CONFPATH is the optional configuration file to use."""
|
||||
|
||||
|
||||
def gated_by(predicate):
|
||||
def wrap(f):
|
||||
def wrapped_f(*args):
|
||||
if predicate:
|
||||
return f(*args)
|
||||
else:
|
||||
return None
|
||||
return wrapped_f
|
||||
return wrap
|
||||
|
||||
|
||||
def pid_files(server, conf):
|
||||
pid_files = []
|
||||
if conf.pid_file:
|
||||
@ -80,25 +91,26 @@ def pid_files(server, conf):
|
||||
yield pid_file, pid
|
||||
|
||||
|
||||
def do_start(server, conf, args):
|
||||
server_type = '-'.join(server.split('-')[:-1])
|
||||
def do_start(verb, server, conf, args):
|
||||
if verb != 'Respawn':
|
||||
for pid_file, pid in pid_files(server, conf):
|
||||
if os.path.exists('/proc/%s' % pid):
|
||||
print "%s appears to already be running: %s" % \
|
||||
(server, pid_file)
|
||||
return
|
||||
else:
|
||||
print "Removing stale pid file %s" % pid_file
|
||||
os.unlink(pid_file)
|
||||
|
||||
for pid_file, pid in pid_files(server, conf):
|
||||
if os.path.exists('/proc/%s' % pid):
|
||||
print "%s appears to already be running: %s" % (server, pid_file)
|
||||
return
|
||||
else:
|
||||
print "Removing stale pid file %s" % pid_file
|
||||
os.unlink(pid_file)
|
||||
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE,
|
||||
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
|
||||
resource.setrlimit(resource.RLIMIT_DATA,
|
||||
(MAX_MEMORY, MAX_MEMORY))
|
||||
except ValueError:
|
||||
print "Unable to increase file descriptor limit. Running as non-root?"
|
||||
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
|
||||
try:
|
||||
resource.setrlimit(resource.RLIMIT_NOFILE,
|
||||
(MAX_DESCRIPTORS, MAX_DESCRIPTORS))
|
||||
resource.setrlimit(resource.RLIMIT_DATA,
|
||||
(MAX_MEMORY, MAX_MEMORY))
|
||||
except ValueError:
|
||||
action = 'increase file descriptor limit'
|
||||
print 'Unable to %s. Running as non-root?' % action
|
||||
os.environ['PYTHON_EGG_CACHE'] = '/tmp'
|
||||
|
||||
def write_pid_file(pid_file, pid):
|
||||
dir, file = os.path.split(pid_file)
|
||||
@ -113,18 +125,6 @@ def do_start(server, conf, args):
|
||||
fp.write('%d\n' % pid)
|
||||
fp.close()
|
||||
|
||||
def await_child(pid):
|
||||
if conf.await_child:
|
||||
bail_time = time.time() + conf.await_child
|
||||
while time.time() < bail_time:
|
||||
reported_pid, status = os.waitpid(pid, os.WNOHANG)
|
||||
if reported_pid == pid:
|
||||
global exitcode
|
||||
# the exit code is encoded in 2nd least significant byte
|
||||
exitcode = status >> 8
|
||||
break
|
||||
time.sleep(0.05)
|
||||
|
||||
def redirect_to_null(fds):
|
||||
with open(os.devnull, 'r+b') as nullfile:
|
||||
for desc in fds: # close fds
|
||||
@ -154,6 +154,7 @@ def do_start(server, conf, args):
|
||||
else:
|
||||
redirect_to_null(output)
|
||||
|
||||
@gated_by(conf.capture_output)
|
||||
def close_stdio_on_exec():
|
||||
fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()]
|
||||
for desc in fds: # set close on exec flag
|
||||
@ -161,7 +162,7 @@ def do_start(server, conf, args):
|
||||
|
||||
def launch(pid_file, conf_file=None):
|
||||
args = [server]
|
||||
print 'Starting %s' % server,
|
||||
print '%sing %s' % (verb, server),
|
||||
if conf_file:
|
||||
args += ['--config-file', conf_file]
|
||||
print 'with %s' % conf_file,
|
||||
@ -176,23 +177,37 @@ def do_start(server, conf, args):
|
||||
try:
|
||||
os.execlp('%s' % server, *args)
|
||||
except OSError, e:
|
||||
sys.exit('unable to launch %s. Got error: %s'
|
||||
% (server, "%s" % e))
|
||||
msg = 'unable to launch %s. Got error: %s' % (server, e)
|
||||
sys.exit(msg)
|
||||
sys.exit(0)
|
||||
else:
|
||||
write_pid_file(pid_file, pid)
|
||||
await_child(pid)
|
||||
return pid
|
||||
|
||||
if not conf.pid_file:
|
||||
pid_file = '/var/run/glance/%s.pid' % server
|
||||
else:
|
||||
pid_file = os.path.abspath(conf.pid_file)
|
||||
@gated_by(conf.await_child)
|
||||
def await_child(pid):
|
||||
bail_time = time.time() + conf.await_child
|
||||
while time.time() < bail_time:
|
||||
reported_pid, status = os.waitpid(pid, os.WNOHANG)
|
||||
if reported_pid == pid:
|
||||
global exitcode
|
||||
exitcode = os.WEXITSTATUS(status)
|
||||
break
|
||||
time.sleep(0.05)
|
||||
|
||||
pid_file = get_pid_file(server, conf)
|
||||
|
||||
conf_file = None
|
||||
if args and os.path.exists(args[0]):
|
||||
conf_file = os.path.abspath(os.path.expanduser(args[0]))
|
||||
|
||||
launch(pid_file, conf_file)
|
||||
return launch(pid_file, conf_file)
|
||||
|
||||
|
||||
def get_pid_file(pid, conf):
|
||||
return os.path.abspath(conf.pid_file) if conf.pid_file else \
|
||||
'/var/run/glance/%s.pid' % server
|
||||
|
||||
|
||||
def do_stop(server, conf, args, graceful=False):
|
||||
@ -205,15 +220,15 @@ def do_stop(server, conf, args, graceful=False):
|
||||
pfiles = pid_files(server, conf)
|
||||
for pid_file, pid in pfiles:
|
||||
did_anything = True
|
||||
try:
|
||||
os.unlink(pid_file)
|
||||
except OSError:
|
||||
pass
|
||||
try:
|
||||
print 'Stopping %s pid: %s signal: %s' % (server, pid, sig)
|
||||
os.kill(pid, sig)
|
||||
except OSError:
|
||||
print "Process %d not running" % pid
|
||||
try:
|
||||
os.unlink(pid_file)
|
||||
except OSError:
|
||||
pass
|
||||
for pid_file, pid in pfiles:
|
||||
for _junk in xrange(150): # 15 seconds
|
||||
if not os.path.exists('/proc/%s' % pid):
|
||||
@ -246,11 +261,22 @@ if __name__ == '__main__':
|
||||
default=False,
|
||||
help='Capture stdout/err in syslog '
|
||||
'instead of discarding'),
|
||||
cfg.BoolOpt('respawn',
|
||||
default=False,
|
||||
help='Restart service on unexpected death'),
|
||||
]
|
||||
conf.register_cli_opts(opts)
|
||||
|
||||
args = conf()
|
||||
|
||||
@gated_by(conf.await_child)
|
||||
@gated_by(conf.respawn)
|
||||
def mutually_exclusive():
|
||||
sys.stderr.write('--await-child and --respawn are mutually exclusive')
|
||||
sys.exit(1)
|
||||
|
||||
mutually_exclusive()
|
||||
|
||||
if len(args) < 2:
|
||||
conf.print_usage()
|
||||
sys.exit(1)
|
||||
@ -276,9 +302,33 @@ if __name__ == '__main__':
|
||||
"command in this list: %(command_list)s" % locals())
|
||||
sys.exit(msg)
|
||||
|
||||
@gated_by(conf.respawn)
|
||||
def anticipate_respawn(children):
|
||||
while children:
|
||||
pid, status = os.wait()
|
||||
if pid in children:
|
||||
(server, conf, args) = children.pop(pid)
|
||||
pid_file = get_pid_file(server, conf)
|
||||
running = os.path.exists(pid_file)
|
||||
one_second_ago = time.time() - 1
|
||||
bouncing = (running and
|
||||
os.path.getmtime(pid_file) >= one_second_ago)
|
||||
if running and not bouncing:
|
||||
args = (server, conf, args)
|
||||
new_pid = do_start('Respawn', *args)
|
||||
children[new_pid] = args
|
||||
else:
|
||||
rsn = 'bouncing' if bouncing else 'deliberately stopped'
|
||||
print 'Supressed respawn as %s was %s.' % (server, rsn)
|
||||
|
||||
if command == 'start':
|
||||
children = {}
|
||||
for server in servers:
|
||||
do_start(server, conf, args)
|
||||
args = (server, conf, args)
|
||||
pid = do_start('Start', *args)
|
||||
children[pid] = args
|
||||
|
||||
anticipate_respawn(children)
|
||||
|
||||
if command == 'stop':
|
||||
for server in servers:
|
||||
@ -292,7 +342,7 @@ if __name__ == '__main__':
|
||||
for server in servers:
|
||||
do_stop(server, conf, args)
|
||||
for server in servers:
|
||||
do_start(server, conf, args)
|
||||
do_start('Restart', server, conf, args)
|
||||
|
||||
if command == 'reload' or command == 'force-reload':
|
||||
for server in servers:
|
||||
|
@ -177,6 +177,18 @@ Glance server programs, and you can specify (as the example above shows)
|
||||
a configuration file when starting the server.
|
||||
|
||||
|
||||
In order for your launched glance service to be monitored for unexpected death
|
||||
and respawned if necessary, use the following option:
|
||||
|
||||
|
||||
$ sudo glance-control [service] start --respawn ...
|
||||
|
||||
|
||||
Note that this will cause ``glance-control`` itself to remain running. Also note
|
||||
that deliberately stopped services are not respawned, neither are rapidly bouncing
|
||||
services (where process death occurred within one second of the last launch).
|
||||
|
||||
|
||||
By default, output from glance services is discarded when launched with ``glance-control``.
|
||||
In order to capture such output via syslog, use the following option:
|
||||
|
||||
|
@ -129,7 +129,7 @@ class Server(object):
|
||||
|
||||
return self.conf_file_name
|
||||
|
||||
def start(self, expected_exitcode=0, **kwargs):
|
||||
def start(self, expect_exit=True, expected_exitcode=0, **kwargs):
|
||||
"""
|
||||
Starts the server.
|
||||
|
||||
@ -147,6 +147,7 @@ class Server(object):
|
||||
return execute(cmd,
|
||||
no_venv=self.no_venv,
|
||||
exec_env=self.exec_env,
|
||||
expect_exit=expect_exit,
|
||||
expected_exitcode=expected_exitcode)
|
||||
|
||||
def stop(self):
|
||||
@ -156,7 +157,8 @@ class Server(object):
|
||||
cmd = ("%(server_control)s %(server_name)s stop "
|
||||
"%(conf_file_name)s --pid-file=%(pid_file)s"
|
||||
% self.__dict__)
|
||||
return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env)
|
||||
return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env,
|
||||
expect_exit=True)
|
||||
|
||||
|
||||
class ApiServer(Server):
|
||||
@ -449,6 +451,7 @@ class FunctionalTest(unittest.TestCase):
|
||||
def start_server(self,
|
||||
server,
|
||||
expect_launch,
|
||||
expect_exit=True,
|
||||
expected_exitcode=0,
|
||||
**kwargs):
|
||||
"""
|
||||
@ -460,18 +463,22 @@ class FunctionalTest(unittest.TestCase):
|
||||
:param server: the server to launch
|
||||
:param expect_launch: true iff the server is expected to
|
||||
successfully start
|
||||
:param expect_exit: true iff the launched server is expected
|
||||
to exit in a timely fashion
|
||||
:param expected_exitcode: expected exitcode from the launcher
|
||||
"""
|
||||
self.cleanup()
|
||||
|
||||
# Start up the requested server
|
||||
exitcode, out, err = server.start(expected_exitcode=expected_exitcode,
|
||||
exitcode, out, err = server.start(expect_exit=expect_exit,
|
||||
expected_exitcode=expected_exitcode,
|
||||
**kwargs)
|
||||
if expect_exit:
|
||||
self.assertEqual(expected_exitcode, exitcode,
|
||||
"Failed to spin up the requested server. "
|
||||
"Got: %s" % err)
|
||||
|
||||
self.assertEqual(expected_exitcode, exitcode,
|
||||
"Failed to spin up the requested server. "
|
||||
"Got: %s" % err)
|
||||
self.assertTrue(re.search("Starting glance-[a-z]+ with", out))
|
||||
self.assertTrue(re.search("Starting glance-[a-z]+ with", out))
|
||||
|
||||
self.wait_for_servers([server.bind_port], expect_launch)
|
||||
|
||||
@ -551,6 +558,19 @@ class FunctionalTest(unittest.TestCase):
|
||||
time.sleep(0.05)
|
||||
self.assertFalse(expect_launch, "Unexpected server launch status")
|
||||
|
||||
def stop_server(self, server, name):
|
||||
"""
|
||||
Called to stop a single server in a normal fashion using the
|
||||
glance-control stop method to gracefully shut the server down.
|
||||
|
||||
:param server: the server to stop
|
||||
"""
|
||||
# Spin down the requested server
|
||||
exitcode, out, err = server.stop()
|
||||
self.assertEqual(0, exitcode,
|
||||
"Failed to spin down the %s server. Got: %s" %
|
||||
(err, name))
|
||||
|
||||
def stop_servers(self):
|
||||
"""
|
||||
Called to stop the started servers in a normal fashion. Note
|
||||
@ -562,20 +582,10 @@ class FunctionalTest(unittest.TestCase):
|
||||
"""
|
||||
|
||||
# Spin down the API and default registry server
|
||||
exitcode, out, err = self.api_server.stop()
|
||||
self.assertEqual(0, exitcode,
|
||||
"Failed to spin down the API server. "
|
||||
"Got: %s" % err)
|
||||
self.stop_server(self.api_server, 'API server')
|
||||
self.stop_server(self.registry_server, 'Registry server')
|
||||
self.stop_server(self.scrubber_daemon, 'Scrubber daemon')
|
||||
|
||||
exitcode, out, err = self.registry_server.stop()
|
||||
self.assertEqual(0, exitcode,
|
||||
"Failed to spin down the Registry server. "
|
||||
"Got: %s" % err)
|
||||
|
||||
exitcode, out, err = self.scrubber_daemon.stop()
|
||||
self.assertEqual(0, exitcode,
|
||||
"Failed to spin down the Scrubber daemon. "
|
||||
"Got: %s" % err)
|
||||
# If all went well, then just remove the test directory.
|
||||
# We only want to check the logs and stuff if something
|
||||
# went wrong...
|
||||
|
@ -21,7 +21,6 @@ import datetime
|
||||
import hashlib
|
||||
import httplib2
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
from glance.common import utils
|
||||
@ -1293,6 +1292,8 @@ class TestApi(functional.FunctionalTest):
|
||||
"""
|
||||
self.cleanup()
|
||||
self.api_server.default_store = 'shouldnotexist'
|
||||
|
||||
# ensure failure exit code is available to assert on
|
||||
self.api_server.server_control_options += ' --await-child=1'
|
||||
|
||||
# ensure that the API server fails to launch
|
||||
|
153
glance/tests/functional/test_respawn.py
Normal file
153
glance/tests/functional/test_respawn.py
Normal file
@ -0,0 +1,153 @@
|
||||
# vim: tabstop=4 shiftwidth=4 softtabstop=4
|
||||
|
||||
# Copyright 2012 Red Hat, Inc
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License"); you may
|
||||
# not use this file except in compliance with the License. You may obtain
|
||||
# a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
|
||||
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
|
||||
# License for the specific language governing permissions and limitations
|
||||
# under the License.
|
||||
|
||||
"""Functional test case for the glance-control --respawn option """
|
||||
|
||||
import httplib2
|
||||
import os
|
||||
import time
|
||||
import socket
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
|
||||
from glance.tests import functional
|
||||
from glance.tests.utils import execute, skip_if_disabled
|
||||
|
||||
|
||||
class TestRespawn(functional.FunctionalTest):
|
||||
|
||||
"""Functional test for glance-control --respawn """
|
||||
|
||||
def get_versions(self):
|
||||
path = "http://%s:%d" % ("0.0.0.0", self.api_port)
|
||||
http = httplib2.Http()
|
||||
response, content = http.request(path, 'GET')
|
||||
self.assertEqual(response.status, 300)
|
||||
|
||||
def get_pid(self):
|
||||
return int(open(self.api_server.pid_file).read().strip())
|
||||
|
||||
def kill_server(self):
|
||||
pid = self.get_pid()
|
||||
os.killpg(pid, signal.SIGKILL)
|
||||
return pid
|
||||
|
||||
def wait_for(self, predicate):
|
||||
count = 0
|
||||
while count < 50:
|
||||
if predicate():
|
||||
break
|
||||
else:
|
||||
time.sleep(0.1)
|
||||
count += 1
|
||||
self.assertTrue(predicate())
|
||||
|
||||
def connection_unavailable(self, type):
|
||||
try:
|
||||
self.get_versions()
|
||||
self.fail('%s server should not be respawned' % type)
|
||||
except socket.error:
|
||||
exc_value = sys.exc_info()[1]
|
||||
self.assertTrue('Connection refused' in exc_value or
|
||||
'ECONNREFUSED' in exc_value)
|
||||
|
||||
@skip_if_disabled
|
||||
def test_respawn(self):
|
||||
"""
|
||||
We test that the '--respawn' option causes the API server
|
||||
to be respawned after death but not after a deliberate stop
|
||||
"""
|
||||
self.cleanup()
|
||||
self.api_server.server_control_options += ' --respawn'
|
||||
|
||||
# start API server, allowing glance-control to continue running
|
||||
self.start_server(self.api_server,
|
||||
expect_launch=True,
|
||||
expect_exit=False,
|
||||
**self.__dict__.copy())
|
||||
|
||||
# ensure the service pid has been cached
|
||||
pid_cached = lambda: os.path.exists(self.api_server.pid_file)
|
||||
self.wait_for(pid_cached)
|
||||
|
||||
# ensure glance-control has had a chance to waitpid on child
|
||||
time.sleep(1)
|
||||
|
||||
# verify server health with version negotiation
|
||||
self.get_versions()
|
||||
|
||||
# server is killed ungracefully
|
||||
old_pid = self.kill_server()
|
||||
|
||||
# ... but should be respawned
|
||||
|
||||
# wait for pid to cycle
|
||||
pid_changed = lambda: old_pid != self.get_pid()
|
||||
self.wait_for(pid_changed)
|
||||
|
||||
# ensure API service port is re-activated
|
||||
self.wait_for_servers([self.api_server.bind_port])
|
||||
|
||||
# verify server health with version negotiation
|
||||
self.get_versions()
|
||||
|
||||
# deliberately stop server, it should not be respawned
|
||||
proc_file = '/proc/%d' % self.get_pid()
|
||||
self.stop_server(self.api_server, 'API server')
|
||||
|
||||
# ensure last server process has gone away
|
||||
process_died = lambda: not os.path.exists(proc_file)
|
||||
self.wait_for(process_died)
|
||||
|
||||
# deliberately stopped server should not be respawned
|
||||
self.wait_for_servers([self.api_server.bind_port], False)
|
||||
|
||||
# ensure the server has not been respawned
|
||||
self.connection_unavailable('deliberately stopped')
|
||||
|
||||
@skip_if_disabled
|
||||
def test_bouncing(self):
|
||||
"""
|
||||
We test that the '--respawn' option doesn't cause bouncing
|
||||
API server to be respawned
|
||||
"""
|
||||
self.cleanup()
|
||||
self.api_server.server_control_options += ' --respawn'
|
||||
self.api_server.default_store = 'shouldnotexist'
|
||||
|
||||
# start API server, allowing glance-control to continue running
|
||||
self.start_server(self.api_server,
|
||||
expect_launch=False,
|
||||
expect_exit=False,
|
||||
**self.__dict__.copy())
|
||||
|
||||
# ensure the service pid has been cached
|
||||
pid_cached = lambda: os.path.exists(self.api_server.pid_file)
|
||||
self.wait_for(pid_cached)
|
||||
|
||||
# ensure glance-control has had a chance to waitpid on child
|
||||
time.sleep(1)
|
||||
|
||||
# bouncing server should not be respawned
|
||||
self.wait_for_servers([self.api_server.bind_port], False)
|
||||
|
||||
# ensure server process has gone away
|
||||
process_died = lambda: not os.path.exists('/proc/%d' % self.get_pid())
|
||||
self.wait_for(process_died)
|
||||
|
||||
# ensure the server has not been respawned
|
||||
self.connection_unavailable('bouncing')
|
@ -172,6 +172,7 @@ def execute(cmd,
|
||||
raise_error=True,
|
||||
no_venv=False,
|
||||
exec_env=None,
|
||||
expect_exit=True,
|
||||
expected_exitcode=0):
|
||||
"""
|
||||
Executes a command in a subprocess. Returns a tuple
|
||||
@ -187,6 +188,7 @@ def execute(cmd,
|
||||
variables; values may be callables, which will
|
||||
be passed the current value of the named
|
||||
environment variable
|
||||
:param expect_exit: Optional flag true iff timely exit is expected
|
||||
:param expected_exitcode: expected exitcode from the launcher
|
||||
"""
|
||||
|
||||
@ -221,10 +223,16 @@ def execute(cmd,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
env=env)
|
||||
result = process.communicate()
|
||||
(out, err) = result
|
||||
exitcode = process.returncode
|
||||
if process.returncode != expected_exitcode and raise_error:
|
||||
if expect_exit:
|
||||
result = process.communicate()
|
||||
(out, err) = result
|
||||
exitcode = process.returncode
|
||||
else:
|
||||
out = ''
|
||||
err = ''
|
||||
exitcode = 0
|
||||
|
||||
if exitcode != expected_exitcode and raise_error:
|
||||
msg = "Command %(cmd)s did not succeed. Returned an exit "\
|
||||
"code of %(exitcode)d."\
|
||||
"\n\nSTDOUT: %(out)s"\
|
||||
|
Loading…
Reference in New Issue
Block a user