From 804396204e23ebb6c29c396396027bc3b09b0eab Mon Sep 17 00:00:00 2001 From: Eoghan Glynn Date: Mon, 30 Jan 2012 11:13:28 +0000 Subject: [PATCH] Respawn glance services on unexpected death. Fixes bug 923894 Add new '--respawn' option to cause glance services launched via glance-control to be monitored for unexpected death and resuscitated as necessary. This option will cause glance-control itself to remain running. Deliberately stopped services are not respawned, neither are rapidly bouncing services (where process death occurred within one second of the last launch). Change-Id: I1a9a99cce9b6ad43274836e39ebe4f29c19455af --- bin/glance-control | 138 ++++++++++++++------- doc/source/controllingservers.rst | 12 ++ glance/tests/functional/__init__.py | 50 ++++---- glance/tests/functional/test_api.py | 3 +- glance/tests/functional/test_respawn.py | 153 ++++++++++++++++++++++++ glance/tests/utils.py | 16 ++- 6 files changed, 303 insertions(+), 69 deletions(-) create mode 100644 glance/tests/functional/test_respawn.py diff --git a/bin/glance-control b/bin/glance-control index 658c22592d..5b583048d2 100755 --- a/bin/glance-control +++ b/bin/glance-control @@ -67,6 +67,17 @@ And command is one of: And CONFPATH is the optional configuration file to use.""" +def gated_by(predicate): + def wrap(f): + def wrapped_f(*args): + if predicate: + return f(*args) + else: + return None + return wrapped_f + return wrap + + def pid_files(server, conf): pid_files = [] if conf.pid_file: @@ -80,25 +91,26 @@ def pid_files(server, conf): yield pid_file, pid -def do_start(server, conf, args): - server_type = '-'.join(server.split('-')[:-1]) +def do_start(verb, server, conf, args): + if verb != 'Respawn': + for pid_file, pid in pid_files(server, conf): + if os.path.exists('/proc/%s' % pid): + print "%s appears to already be running: %s" % \ + (server, pid_file) + return + else: + print "Removing stale pid file %s" % pid_file + os.unlink(pid_file) - for pid_file, pid in pid_files(server, conf): - if os.path.exists('/proc/%s' % pid): - print "%s appears to already be running: %s" % (server, pid_file) - return - else: - print "Removing stale pid file %s" % pid_file - os.unlink(pid_file) - - try: - resource.setrlimit(resource.RLIMIT_NOFILE, - (MAX_DESCRIPTORS, MAX_DESCRIPTORS)) - resource.setrlimit(resource.RLIMIT_DATA, - (MAX_MEMORY, MAX_MEMORY)) - except ValueError: - print "Unable to increase file descriptor limit. Running as non-root?" - os.environ['PYTHON_EGG_CACHE'] = '/tmp' + try: + resource.setrlimit(resource.RLIMIT_NOFILE, + (MAX_DESCRIPTORS, MAX_DESCRIPTORS)) + resource.setrlimit(resource.RLIMIT_DATA, + (MAX_MEMORY, MAX_MEMORY)) + except ValueError: + action = 'increase file descriptor limit' + print 'Unable to %s. Running as non-root?' % action + os.environ['PYTHON_EGG_CACHE'] = '/tmp' def write_pid_file(pid_file, pid): dir, file = os.path.split(pid_file) @@ -113,18 +125,6 @@ def do_start(server, conf, args): fp.write('%d\n' % pid) fp.close() - def await_child(pid): - if conf.await_child: - bail_time = time.time() + conf.await_child - while time.time() < bail_time: - reported_pid, status = os.waitpid(pid, os.WNOHANG) - if reported_pid == pid: - global exitcode - # the exit code is encoded in 2nd least significant byte - exitcode = status >> 8 - break - time.sleep(0.05) - def redirect_to_null(fds): with open(os.devnull, 'r+b') as nullfile: for desc in fds: # close fds @@ -154,6 +154,7 @@ def do_start(server, conf, args): else: redirect_to_null(output) + @gated_by(conf.capture_output) def close_stdio_on_exec(): fds = [sys.stdin.fileno(), sys.stdout.fileno(), sys.stderr.fileno()] for desc in fds: # set close on exec flag @@ -161,7 +162,7 @@ def do_start(server, conf, args): def launch(pid_file, conf_file=None): args = [server] - print 'Starting %s' % server, + print '%sing %s' % (verb, server), if conf_file: args += ['--config-file', conf_file] print 'with %s' % conf_file, @@ -176,23 +177,37 @@ def do_start(server, conf, args): try: os.execlp('%s' % server, *args) except OSError, e: - sys.exit('unable to launch %s. Got error: %s' - % (server, "%s" % e)) + msg = 'unable to launch %s. Got error: %s' % (server, e) + sys.exit(msg) sys.exit(0) else: write_pid_file(pid_file, pid) await_child(pid) + return pid - if not conf.pid_file: - pid_file = '/var/run/glance/%s.pid' % server - else: - pid_file = os.path.abspath(conf.pid_file) + @gated_by(conf.await_child) + def await_child(pid): + bail_time = time.time() + conf.await_child + while time.time() < bail_time: + reported_pid, status = os.waitpid(pid, os.WNOHANG) + if reported_pid == pid: + global exitcode + exitcode = os.WEXITSTATUS(status) + break + time.sleep(0.05) + + pid_file = get_pid_file(server, conf) conf_file = None if args and os.path.exists(args[0]): conf_file = os.path.abspath(os.path.expanduser(args[0])) - launch(pid_file, conf_file) + return launch(pid_file, conf_file) + + +def get_pid_file(pid, conf): + return os.path.abspath(conf.pid_file) if conf.pid_file else \ + '/var/run/glance/%s.pid' % server def do_stop(server, conf, args, graceful=False): @@ -205,15 +220,15 @@ def do_stop(server, conf, args, graceful=False): pfiles = pid_files(server, conf) for pid_file, pid in pfiles: did_anything = True + try: + os.unlink(pid_file) + except OSError: + pass try: print 'Stopping %s pid: %s signal: %s' % (server, pid, sig) os.kill(pid, sig) except OSError: print "Process %d not running" % pid - try: - os.unlink(pid_file) - except OSError: - pass for pid_file, pid in pfiles: for _junk in xrange(150): # 15 seconds if not os.path.exists('/proc/%s' % pid): @@ -246,11 +261,22 @@ if __name__ == '__main__': default=False, help='Capture stdout/err in syslog ' 'instead of discarding'), + cfg.BoolOpt('respawn', + default=False, + help='Restart service on unexpected death'), ] conf.register_cli_opts(opts) args = conf() + @gated_by(conf.await_child) + @gated_by(conf.respawn) + def mutually_exclusive(): + sys.stderr.write('--await-child and --respawn are mutually exclusive') + sys.exit(1) + + mutually_exclusive() + if len(args) < 2: conf.print_usage() sys.exit(1) @@ -276,9 +302,33 @@ if __name__ == '__main__': "command in this list: %(command_list)s" % locals()) sys.exit(msg) + @gated_by(conf.respawn) + def anticipate_respawn(children): + while children: + pid, status = os.wait() + if pid in children: + (server, conf, args) = children.pop(pid) + pid_file = get_pid_file(server, conf) + running = os.path.exists(pid_file) + one_second_ago = time.time() - 1 + bouncing = (running and + os.path.getmtime(pid_file) >= one_second_ago) + if running and not bouncing: + args = (server, conf, args) + new_pid = do_start('Respawn', *args) + children[new_pid] = args + else: + rsn = 'bouncing' if bouncing else 'deliberately stopped' + print 'Supressed respawn as %s was %s.' % (server, rsn) + if command == 'start': + children = {} for server in servers: - do_start(server, conf, args) + args = (server, conf, args) + pid = do_start('Start', *args) + children[pid] = args + + anticipate_respawn(children) if command == 'stop': for server in servers: @@ -292,7 +342,7 @@ if __name__ == '__main__': for server in servers: do_stop(server, conf, args) for server in servers: - do_start(server, conf, args) + do_start('Restart', server, conf, args) if command == 'reload' or command == 'force-reload': for server in servers: diff --git a/doc/source/controllingservers.rst b/doc/source/controllingservers.rst index 2679b633a3..870b0a3717 100644 --- a/doc/source/controllingservers.rst +++ b/doc/source/controllingservers.rst @@ -177,6 +177,18 @@ Glance server programs, and you can specify (as the example above shows) a configuration file when starting the server. +In order for your launched glance service to be monitored for unexpected death +and respawned if necessary, use the following option: + + + $ sudo glance-control [service] start --respawn ... + + +Note that this will cause ``glance-control`` itself to remain running. Also note +that deliberately stopped services are not respawned, neither are rapidly bouncing +services (where process death occurred within one second of the last launch). + + By default, output from glance services is discarded when launched with ``glance-control``. In order to capture such output via syslog, use the following option: diff --git a/glance/tests/functional/__init__.py b/glance/tests/functional/__init__.py index 42647ee129..2c6fbb4eaf 100644 --- a/glance/tests/functional/__init__.py +++ b/glance/tests/functional/__init__.py @@ -129,7 +129,7 @@ class Server(object): return self.conf_file_name - def start(self, expected_exitcode=0, **kwargs): + def start(self, expect_exit=True, expected_exitcode=0, **kwargs): """ Starts the server. @@ -147,6 +147,7 @@ class Server(object): return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env, + expect_exit=expect_exit, expected_exitcode=expected_exitcode) def stop(self): @@ -156,7 +157,8 @@ class Server(object): cmd = ("%(server_control)s %(server_name)s stop " "%(conf_file_name)s --pid-file=%(pid_file)s" % self.__dict__) - return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env) + return execute(cmd, no_venv=self.no_venv, exec_env=self.exec_env, + expect_exit=True) class ApiServer(Server): @@ -449,6 +451,7 @@ class FunctionalTest(unittest.TestCase): def start_server(self, server, expect_launch, + expect_exit=True, expected_exitcode=0, **kwargs): """ @@ -460,18 +463,22 @@ class FunctionalTest(unittest.TestCase): :param server: the server to launch :param expect_launch: true iff the server is expected to successfully start + :param expect_exit: true iff the launched server is expected + to exit in a timely fashion :param expected_exitcode: expected exitcode from the launcher """ self.cleanup() # Start up the requested server - exitcode, out, err = server.start(expected_exitcode=expected_exitcode, + exitcode, out, err = server.start(expect_exit=expect_exit, + expected_exitcode=expected_exitcode, **kwargs) + if expect_exit: + self.assertEqual(expected_exitcode, exitcode, + "Failed to spin up the requested server. " + "Got: %s" % err) - self.assertEqual(expected_exitcode, exitcode, - "Failed to spin up the requested server. " - "Got: %s" % err) - self.assertTrue(re.search("Starting glance-[a-z]+ with", out)) + self.assertTrue(re.search("Starting glance-[a-z]+ with", out)) self.wait_for_servers([server.bind_port], expect_launch) @@ -551,6 +558,19 @@ class FunctionalTest(unittest.TestCase): time.sleep(0.05) self.assertFalse(expect_launch, "Unexpected server launch status") + def stop_server(self, server, name): + """ + Called to stop a single server in a normal fashion using the + glance-control stop method to gracefully shut the server down. + + :param server: the server to stop + """ + # Spin down the requested server + exitcode, out, err = server.stop() + self.assertEqual(0, exitcode, + "Failed to spin down the %s server. Got: %s" % + (err, name)) + def stop_servers(self): """ Called to stop the started servers in a normal fashion. Note @@ -562,20 +582,10 @@ class FunctionalTest(unittest.TestCase): """ # Spin down the API and default registry server - exitcode, out, err = self.api_server.stop() - self.assertEqual(0, exitcode, - "Failed to spin down the API server. " - "Got: %s" % err) + self.stop_server(self.api_server, 'API server') + self.stop_server(self.registry_server, 'Registry server') + self.stop_server(self.scrubber_daemon, 'Scrubber daemon') - exitcode, out, err = self.registry_server.stop() - self.assertEqual(0, exitcode, - "Failed to spin down the Registry server. " - "Got: %s" % err) - - exitcode, out, err = self.scrubber_daemon.stop() - self.assertEqual(0, exitcode, - "Failed to spin down the Scrubber daemon. " - "Got: %s" % err) # If all went well, then just remove the test directory. # We only want to check the logs and stuff if something # went wrong... diff --git a/glance/tests/functional/test_api.py b/glance/tests/functional/test_api.py index 0e63bde7ba..39f663c49a 100644 --- a/glance/tests/functional/test_api.py +++ b/glance/tests/functional/test_api.py @@ -21,7 +21,6 @@ import datetime import hashlib import httplib2 import json -import os import tempfile from glance.common import utils @@ -1293,6 +1292,8 @@ class TestApi(functional.FunctionalTest): """ self.cleanup() self.api_server.default_store = 'shouldnotexist' + + # ensure failure exit code is available to assert on self.api_server.server_control_options += ' --await-child=1' # ensure that the API server fails to launch diff --git a/glance/tests/functional/test_respawn.py b/glance/tests/functional/test_respawn.py new file mode 100644 index 0000000000..fd4dc3a2c8 --- /dev/null +++ b/glance/tests/functional/test_respawn.py @@ -0,0 +1,153 @@ +# vim: tabstop=4 shiftwidth=4 softtabstop=4 + +# Copyright 2012 Red Hat, Inc +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may +# not use this file except in compliance with the License. You may obtain +# a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations +# under the License. + +"""Functional test case for the glance-control --respawn option """ + +import httplib2 +import os +import time +import socket +import signal +import sys +import time + +from glance.tests import functional +from glance.tests.utils import execute, skip_if_disabled + + +class TestRespawn(functional.FunctionalTest): + + """Functional test for glance-control --respawn """ + + def get_versions(self): + path = "http://%s:%d" % ("0.0.0.0", self.api_port) + http = httplib2.Http() + response, content = http.request(path, 'GET') + self.assertEqual(response.status, 300) + + def get_pid(self): + return int(open(self.api_server.pid_file).read().strip()) + + def kill_server(self): + pid = self.get_pid() + os.killpg(pid, signal.SIGKILL) + return pid + + def wait_for(self, predicate): + count = 0 + while count < 50: + if predicate(): + break + else: + time.sleep(0.1) + count += 1 + self.assertTrue(predicate()) + + def connection_unavailable(self, type): + try: + self.get_versions() + self.fail('%s server should not be respawned' % type) + except socket.error: + exc_value = sys.exc_info()[1] + self.assertTrue('Connection refused' in exc_value or + 'ECONNREFUSED' in exc_value) + + @skip_if_disabled + def test_respawn(self): + """ + We test that the '--respawn' option causes the API server + to be respawned after death but not after a deliberate stop + """ + self.cleanup() + self.api_server.server_control_options += ' --respawn' + + # start API server, allowing glance-control to continue running + self.start_server(self.api_server, + expect_launch=True, + expect_exit=False, + **self.__dict__.copy()) + + # ensure the service pid has been cached + pid_cached = lambda: os.path.exists(self.api_server.pid_file) + self.wait_for(pid_cached) + + # ensure glance-control has had a chance to waitpid on child + time.sleep(1) + + # verify server health with version negotiation + self.get_versions() + + # server is killed ungracefully + old_pid = self.kill_server() + + # ... but should be respawned + + # wait for pid to cycle + pid_changed = lambda: old_pid != self.get_pid() + self.wait_for(pid_changed) + + # ensure API service port is re-activated + self.wait_for_servers([self.api_server.bind_port]) + + # verify server health with version negotiation + self.get_versions() + + # deliberately stop server, it should not be respawned + proc_file = '/proc/%d' % self.get_pid() + self.stop_server(self.api_server, 'API server') + + # ensure last server process has gone away + process_died = lambda: not os.path.exists(proc_file) + self.wait_for(process_died) + + # deliberately stopped server should not be respawned + self.wait_for_servers([self.api_server.bind_port], False) + + # ensure the server has not been respawned + self.connection_unavailable('deliberately stopped') + + @skip_if_disabled + def test_bouncing(self): + """ + We test that the '--respawn' option doesn't cause bouncing + API server to be respawned + """ + self.cleanup() + self.api_server.server_control_options += ' --respawn' + self.api_server.default_store = 'shouldnotexist' + + # start API server, allowing glance-control to continue running + self.start_server(self.api_server, + expect_launch=False, + expect_exit=False, + **self.__dict__.copy()) + + # ensure the service pid has been cached + pid_cached = lambda: os.path.exists(self.api_server.pid_file) + self.wait_for(pid_cached) + + # ensure glance-control has had a chance to waitpid on child + time.sleep(1) + + # bouncing server should not be respawned + self.wait_for_servers([self.api_server.bind_port], False) + + # ensure server process has gone away + process_died = lambda: not os.path.exists('/proc/%d' % self.get_pid()) + self.wait_for(process_died) + + # ensure the server has not been respawned + self.connection_unavailable('bouncing') diff --git a/glance/tests/utils.py b/glance/tests/utils.py index e421611711..3b0960ef5e 100644 --- a/glance/tests/utils.py +++ b/glance/tests/utils.py @@ -172,6 +172,7 @@ def execute(cmd, raise_error=True, no_venv=False, exec_env=None, + expect_exit=True, expected_exitcode=0): """ Executes a command in a subprocess. Returns a tuple @@ -187,6 +188,7 @@ def execute(cmd, variables; values may be callables, which will be passed the current value of the named environment variable + :param expect_exit: Optional flag true iff timely exit is expected :param expected_exitcode: expected exitcode from the launcher """ @@ -221,10 +223,16 @@ def execute(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, env=env) - result = process.communicate() - (out, err) = result - exitcode = process.returncode - if process.returncode != expected_exitcode and raise_error: + if expect_exit: + result = process.communicate() + (out, err) = result + exitcode = process.returncode + else: + out = '' + err = '' + exitcode = 0 + + if exitcode != expected_exitcode and raise_error: msg = "Command %(cmd)s did not succeed. Returned an exit "\ "code of %(exitcode)d."\ "\n\nSTDOUT: %(out)s"\