From 8b3f327df03d2ca530d094342710ff3f75ae3934 Mon Sep 17 00:00:00 2001 From: Rick Harris Date: Thu, 22 Mar 2012 23:50:36 +0000 Subject: [PATCH] Add periodic_fuzzy_delay option. Fixes bug 962665 This random delay is intended to reduce the stampeding behavior associated with periodic tasks when compute workers are restarted in unison across a cluster. Change-Id: Ie3771d94af29049061c129b8ea562ee447a61771 --- nova/service.py | 35 +++++++++++++++++++++------ nova/tests/integrated/test_servers.py | 11 +++------ nova/tests/xenapi/stubs.py | 2 +- nova/utils.py | 7 +++--- nova/virt/baremetal/proxy.py | 6 ++--- nova/virt/libvirt/connection.py | 14 +++++------ nova/virt/vmwareapi_conn.py | 2 +- 7 files changed, 48 insertions(+), 29 deletions(-) diff --git a/nova/service.py b/nova/service.py index c77419432805..a351406fe940 100644 --- a/nova/service.py +++ b/nova/service.py @@ -21,6 +21,7 @@ import inspect import os +import random import signal import eventlet @@ -47,6 +48,11 @@ service_opts = [ cfg.IntOpt('periodic_interval', default=60, help='seconds between running periodic tasks'), + cfg.IntOpt('periodic_fuzzy_delay', + default=60, + help='range of seconds to randomly delay when starting the' + ' periodic task scheduler to reduce stampeding.' + ' (Disable by setting to 0)'), cfg.StrOpt('ec2_listen', default="0.0.0.0", help='IP address for EC2 API to listen'), @@ -151,7 +157,8 @@ class Service(object): it state to the database services table.""" def __init__(self, host, binary, topic, manager, report_interval=None, - periodic_interval=None, *args, **kwargs): + periodic_interval=None, periodic_fuzzy_delay=None, + *args, **kwargs): self.host = host self.binary = binary self.topic = topic @@ -160,6 +167,7 @@ class Service(object): self.manager = manager_class(host=self.host, *args, **kwargs) self.report_interval = report_interval self.periodic_interval = periodic_interval + self.periodic_fuzzy_delay = periodic_fuzzy_delay super(Service, self).__init__(*args, **kwargs) self.saved_args, self.saved_kwargs = args, kwargs self.timers = [] @@ -200,12 +208,19 @@ class Service(object): if self.report_interval: pulse = utils.LoopingCall(self.report_state) - pulse.start(interval=self.report_interval, now=False) + pulse.start(interval=self.report_interval, + initial_delay=self.report_interval) self.timers.append(pulse) if self.periodic_interval: + if self.periodic_fuzzy_delay: + initial_delay = random.randint(0, self.periodic_fuzzy_delay) + else: + initial_delay = None + periodic = utils.LoopingCall(self.periodic_tasks) - periodic.start(interval=self.periodic_interval, now=False) + periodic.start(interval=self.periodic_interval, + initial_delay=initial_delay) self.timers.append(periodic) def _create_service_ref(self, context): @@ -224,7 +239,8 @@ class Service(object): @classmethod def create(cls, host=None, binary=None, topic=None, manager=None, - report_interval=None, periodic_interval=None): + report_interval=None, periodic_interval=None, + periodic_fuzzy_delay=None): """Instantiates class and passes back application object. :param host: defaults to FLAGS.host @@ -233,6 +249,7 @@ class Service(object): :param manager: defaults to FLAGS._manager :param report_interval: defaults to FLAGS.report_interval :param periodic_interval: defaults to FLAGS.periodic_interval + :param periodic_fuzzy_delay: defaults to FLAGS.periodic_fuzzy_delay """ if not host: @@ -243,12 +260,16 @@ class Service(object): topic = binary.rpartition('nova-')[2] if not manager: manager = FLAGS.get('%s_manager' % topic, None) - if not report_interval: + if report_interval is None: report_interval = FLAGS.report_interval - if not periodic_interval: + if periodic_interval is None: periodic_interval = FLAGS.periodic_interval + if periodic_fuzzy_delay is None: + periodic_fuzzy_delay = FLAGS.periodic_fuzzy_delay service_obj = cls(host, binary, topic, manager, - report_interval, periodic_interval) + report_interval=report_interval, + periodic_interval=periodic_interval, + periodic_fuzzy_delay=periodic_fuzzy_delay) return service_obj diff --git a/nova/tests/integrated/test_servers.py b/nova/tests/integrated/test_servers.py index fe5d1e5671ad..46dc99d97ca7 100644 --- a/nova/tests/integrated/test_servers.py +++ b/nova/tests/integrated/test_servers.py @@ -38,14 +38,10 @@ class ServersTest(integrated_helpers._IntegratedTestBase): return server - def _restart_compute_service(self, periodic_interval=None): + def _restart_compute_service(self, *args, **kwargs): """restart compute service. NOTE: fake driver forgets all instances.""" self.compute.kill() - if periodic_interval: - self.compute = self.start_service( - 'compute', periodic_interval=periodic_interval) - else: - self.compute = self.start_service('compute') + self.compute = self.start_service('compute', *args, **kwargs) def test_get_servers(self): """Simple check that listing servers works.""" @@ -144,7 +140,8 @@ class ServersTest(integrated_helpers._IntegratedTestBase): self.flags(stub_network=True, reclaim_instance_interval=1) # enforce periodic tasks run in short time to avoid wait for 60s. - self._restart_compute_service(periodic_interval=0.3) + self._restart_compute_service( + periodic_interval=0.3, periodic_fuzzy_delay=0) # Create server server = self._build_minimal_create_server_request() diff --git a/nova/tests/xenapi/stubs.py b/nova/tests/xenapi/stubs.py index e6bf9dd4bffc..acfa8fb61d5d 100644 --- a/nova/tests/xenapi/stubs.py +++ b/nova/tests/xenapi/stubs.py @@ -166,7 +166,7 @@ def stubout_create_vm(stubs): def stubout_loopingcall_start(stubs): - def fake_start(self, interval, now=True): + def fake_start(self, interval): self.f(*self.args, **self.kw) stubs.Set(utils.LoopingCall, 'start', fake_start) diff --git a/nova/utils.py b/nova/utils.py index 819929a81d55..29df28ceaf9b 100644 --- a/nova/utils.py +++ b/nova/utils.py @@ -687,13 +687,14 @@ class LoopingCall(object): self.f = f self._running = False - def start(self, interval, now=True): + def start(self, interval, initial_delay=None): self._running = True done = event.Event() def _inner(): - if not now: - greenthread.sleep(interval) + if initial_delay: + greenthread.sleep(initial_delay) + try: while self._running: self.f(*self.args, **self.kw) diff --git a/nova/virt/baremetal/proxy.py b/nova/virt/baremetal/proxy.py index 50c72dd872ce..5f2e53bdb957 100644 --- a/nova/virt/baremetal/proxy.py +++ b/nova/virt/baremetal/proxy.py @@ -192,7 +192,7 @@ class ProxyConnection(driver.ComputeDriver): LOG.exception(_('_wait_for_reboot failed')) timer.stop() timer.f = _wait_for_reboot - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) @exception.wrap_exception def rescue(self, context, instance, network_info): @@ -225,7 +225,7 @@ class ProxyConnection(driver.ComputeDriver): LOG.exception(_('_wait_for_rescue failed')) timer.stop() timer.f = _wait_for_reboot - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) @exception.wrap_exception def unrescue(self, instance, network_info): @@ -278,7 +278,7 @@ class ProxyConnection(driver.ComputeDriver): timer.stop() timer.f = _wait_for_boot - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) def get_console_output(self, instance): console_log = os.path.join(FLAGS.instances_path, instance['name'], diff --git a/nova/virt/libvirt/connection.py b/nova/virt/libvirt/connection.py index e1537558eb5b..cd1a360d1ea5 100644 --- a/nova/virt/libvirt/connection.py +++ b/nova/virt/libvirt/connection.py @@ -429,7 +429,7 @@ class LibvirtConnection(driver.ComputeDriver): raise utils.LoopingCallDone timer = utils.LoopingCall(_wait_for_destroy) - timer.start(interval=0.5, now=True) + timer.start(interval=0.5) try: self.firewall_driver.unfilter_instance(instance, @@ -726,7 +726,7 @@ class LibvirtConnection(driver.ComputeDriver): instance=instance) dom.create() timer = utils.LoopingCall(self._wait_for_running, instance) - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) greenthread.sleep(1) return False @@ -768,7 +768,7 @@ class LibvirtConnection(driver.ComputeDriver): raise utils.LoopingCallDone timer = utils.LoopingCall(_wait_for_reboot) - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) @exception.wrap_exception() def pause(self, instance): @@ -912,7 +912,7 @@ class LibvirtConnection(driver.ComputeDriver): raise utils.LoopingCallDone timer = utils.LoopingCall(_wait_for_boot) - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) def _flush_libvirt_console(self, pty): out, err = utils.execute('dd', @@ -2069,7 +2069,7 @@ class LibvirtConnection(driver.ComputeDriver): post_method(ctxt, instance_ref, dest, block_migration) timer.f = wait_for_live_migration - timer.start(interval=0.5, now=True) + timer.start(interval=0.5) def pre_live_migration(self, block_device_info): """Preparation live migration. @@ -2415,7 +2415,7 @@ class LibvirtConnection(driver.ComputeDriver): self.firewall_driver.apply_instance_filter(instance, network_info) timer = utils.LoopingCall(self._wait_for_running, instance) - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) @exception.wrap_exception() def finish_revert_migration(self, instance, network_info): @@ -2437,7 +2437,7 @@ class LibvirtConnection(driver.ComputeDriver): self.firewall_driver.apply_instance_filter(instance, network_info) timer = utils.LoopingCall(self._wait_for_running, instance) - return timer.start(interval=0.5, now=True) + return timer.start(interval=0.5) def confirm_migration(self, migration, instance, network_info): """Confirms a resize, destroying the source VM""" diff --git a/nova/virt/vmwareapi_conn.py b/nova/virt/vmwareapi_conn.py index 88b0b66f7151..444fc785a42c 100644 --- a/nova/virt/vmwareapi_conn.py +++ b/nova/virt/vmwareapi_conn.py @@ -374,7 +374,7 @@ class VMWareAPISession(object): done = event.Event() loop = utils.LoopingCall(self._poll_task, instance_uuid, task_ref, done) - loop.start(FLAGS.vmwareapi_task_poll_interval, now=True) + loop.start(FLAGS.vmwareapi_task_poll_interval) ret_val = done.wait() loop.stop() return ret_val