Add periodic_fuzzy_delay option.
Fixes bug 962665 This random delay is intended to reduce the stampeding behavior associated with periodic tasks when compute workers are restarted in unison across a cluster. Change-Id: Ie3771d94af29049061c129b8ea562ee447a61771
This commit is contained in:
parent
0b10c3f1a0
commit
8b3f327df0
@ -21,6 +21,7 @@
|
|||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
import os
|
import os
|
||||||
|
import random
|
||||||
import signal
|
import signal
|
||||||
|
|
||||||
import eventlet
|
import eventlet
|
||||||
@ -47,6 +48,11 @@ service_opts = [
|
|||||||
cfg.IntOpt('periodic_interval',
|
cfg.IntOpt('periodic_interval',
|
||||||
default=60,
|
default=60,
|
||||||
help='seconds between running periodic tasks'),
|
help='seconds between running periodic tasks'),
|
||||||
|
cfg.IntOpt('periodic_fuzzy_delay',
|
||||||
|
default=60,
|
||||||
|
help='range of seconds to randomly delay when starting the'
|
||||||
|
' periodic task scheduler to reduce stampeding.'
|
||||||
|
' (Disable by setting to 0)'),
|
||||||
cfg.StrOpt('ec2_listen',
|
cfg.StrOpt('ec2_listen',
|
||||||
default="0.0.0.0",
|
default="0.0.0.0",
|
||||||
help='IP address for EC2 API to listen'),
|
help='IP address for EC2 API to listen'),
|
||||||
@ -151,7 +157,8 @@ class Service(object):
|
|||||||
it state to the database services table."""
|
it state to the database services table."""
|
||||||
|
|
||||||
def __init__(self, host, binary, topic, manager, report_interval=None,
|
def __init__(self, host, binary, topic, manager, report_interval=None,
|
||||||
periodic_interval=None, *args, **kwargs):
|
periodic_interval=None, periodic_fuzzy_delay=None,
|
||||||
|
*args, **kwargs):
|
||||||
self.host = host
|
self.host = host
|
||||||
self.binary = binary
|
self.binary = binary
|
||||||
self.topic = topic
|
self.topic = topic
|
||||||
@ -160,6 +167,7 @@ class Service(object):
|
|||||||
self.manager = manager_class(host=self.host, *args, **kwargs)
|
self.manager = manager_class(host=self.host, *args, **kwargs)
|
||||||
self.report_interval = report_interval
|
self.report_interval = report_interval
|
||||||
self.periodic_interval = periodic_interval
|
self.periodic_interval = periodic_interval
|
||||||
|
self.periodic_fuzzy_delay = periodic_fuzzy_delay
|
||||||
super(Service, self).__init__(*args, **kwargs)
|
super(Service, self).__init__(*args, **kwargs)
|
||||||
self.saved_args, self.saved_kwargs = args, kwargs
|
self.saved_args, self.saved_kwargs = args, kwargs
|
||||||
self.timers = []
|
self.timers = []
|
||||||
@ -200,12 +208,19 @@ class Service(object):
|
|||||||
|
|
||||||
if self.report_interval:
|
if self.report_interval:
|
||||||
pulse = utils.LoopingCall(self.report_state)
|
pulse = utils.LoopingCall(self.report_state)
|
||||||
pulse.start(interval=self.report_interval, now=False)
|
pulse.start(interval=self.report_interval,
|
||||||
|
initial_delay=self.report_interval)
|
||||||
self.timers.append(pulse)
|
self.timers.append(pulse)
|
||||||
|
|
||||||
if self.periodic_interval:
|
if self.periodic_interval:
|
||||||
|
if self.periodic_fuzzy_delay:
|
||||||
|
initial_delay = random.randint(0, self.periodic_fuzzy_delay)
|
||||||
|
else:
|
||||||
|
initial_delay = None
|
||||||
|
|
||||||
periodic = utils.LoopingCall(self.periodic_tasks)
|
periodic = utils.LoopingCall(self.periodic_tasks)
|
||||||
periodic.start(interval=self.periodic_interval, now=False)
|
periodic.start(interval=self.periodic_interval,
|
||||||
|
initial_delay=initial_delay)
|
||||||
self.timers.append(periodic)
|
self.timers.append(periodic)
|
||||||
|
|
||||||
def _create_service_ref(self, context):
|
def _create_service_ref(self, context):
|
||||||
@ -224,7 +239,8 @@ class Service(object):
|
|||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def create(cls, host=None, binary=None, topic=None, manager=None,
|
def create(cls, host=None, binary=None, topic=None, manager=None,
|
||||||
report_interval=None, periodic_interval=None):
|
report_interval=None, periodic_interval=None,
|
||||||
|
periodic_fuzzy_delay=None):
|
||||||
"""Instantiates class and passes back application object.
|
"""Instantiates class and passes back application object.
|
||||||
|
|
||||||
:param host: defaults to FLAGS.host
|
:param host: defaults to FLAGS.host
|
||||||
@ -233,6 +249,7 @@ class Service(object):
|
|||||||
:param manager: defaults to FLAGS.<topic>_manager
|
:param manager: defaults to FLAGS.<topic>_manager
|
||||||
:param report_interval: defaults to FLAGS.report_interval
|
:param report_interval: defaults to FLAGS.report_interval
|
||||||
:param periodic_interval: defaults to FLAGS.periodic_interval
|
:param periodic_interval: defaults to FLAGS.periodic_interval
|
||||||
|
:param periodic_fuzzy_delay: defaults to FLAGS.periodic_fuzzy_delay
|
||||||
|
|
||||||
"""
|
"""
|
||||||
if not host:
|
if not host:
|
||||||
@ -243,12 +260,16 @@ class Service(object):
|
|||||||
topic = binary.rpartition('nova-')[2]
|
topic = binary.rpartition('nova-')[2]
|
||||||
if not manager:
|
if not manager:
|
||||||
manager = FLAGS.get('%s_manager' % topic, None)
|
manager = FLAGS.get('%s_manager' % topic, None)
|
||||||
if not report_interval:
|
if report_interval is None:
|
||||||
report_interval = FLAGS.report_interval
|
report_interval = FLAGS.report_interval
|
||||||
if not periodic_interval:
|
if periodic_interval is None:
|
||||||
periodic_interval = FLAGS.periodic_interval
|
periodic_interval = FLAGS.periodic_interval
|
||||||
|
if periodic_fuzzy_delay is None:
|
||||||
|
periodic_fuzzy_delay = FLAGS.periodic_fuzzy_delay
|
||||||
service_obj = cls(host, binary, topic, manager,
|
service_obj = cls(host, binary, topic, manager,
|
||||||
report_interval, periodic_interval)
|
report_interval=report_interval,
|
||||||
|
periodic_interval=periodic_interval,
|
||||||
|
periodic_fuzzy_delay=periodic_fuzzy_delay)
|
||||||
|
|
||||||
return service_obj
|
return service_obj
|
||||||
|
|
||||||
|
@ -38,14 +38,10 @@ class ServersTest(integrated_helpers._IntegratedTestBase):
|
|||||||
|
|
||||||
return server
|
return server
|
||||||
|
|
||||||
def _restart_compute_service(self, periodic_interval=None):
|
def _restart_compute_service(self, *args, **kwargs):
|
||||||
"""restart compute service. NOTE: fake driver forgets all instances."""
|
"""restart compute service. NOTE: fake driver forgets all instances."""
|
||||||
self.compute.kill()
|
self.compute.kill()
|
||||||
if periodic_interval:
|
self.compute = self.start_service('compute', *args, **kwargs)
|
||||||
self.compute = self.start_service(
|
|
||||||
'compute', periodic_interval=periodic_interval)
|
|
||||||
else:
|
|
||||||
self.compute = self.start_service('compute')
|
|
||||||
|
|
||||||
def test_get_servers(self):
|
def test_get_servers(self):
|
||||||
"""Simple check that listing servers works."""
|
"""Simple check that listing servers works."""
|
||||||
@ -144,7 +140,8 @@ class ServersTest(integrated_helpers._IntegratedTestBase):
|
|||||||
self.flags(stub_network=True, reclaim_instance_interval=1)
|
self.flags(stub_network=True, reclaim_instance_interval=1)
|
||||||
|
|
||||||
# enforce periodic tasks run in short time to avoid wait for 60s.
|
# enforce periodic tasks run in short time to avoid wait for 60s.
|
||||||
self._restart_compute_service(periodic_interval=0.3)
|
self._restart_compute_service(
|
||||||
|
periodic_interval=0.3, periodic_fuzzy_delay=0)
|
||||||
|
|
||||||
# Create server
|
# Create server
|
||||||
server = self._build_minimal_create_server_request()
|
server = self._build_minimal_create_server_request()
|
||||||
|
@ -166,7 +166,7 @@ def stubout_create_vm(stubs):
|
|||||||
|
|
||||||
|
|
||||||
def stubout_loopingcall_start(stubs):
|
def stubout_loopingcall_start(stubs):
|
||||||
def fake_start(self, interval, now=True):
|
def fake_start(self, interval):
|
||||||
self.f(*self.args, **self.kw)
|
self.f(*self.args, **self.kw)
|
||||||
stubs.Set(utils.LoopingCall, 'start', fake_start)
|
stubs.Set(utils.LoopingCall, 'start', fake_start)
|
||||||
|
|
||||||
|
@ -687,13 +687,14 @@ class LoopingCall(object):
|
|||||||
self.f = f
|
self.f = f
|
||||||
self._running = False
|
self._running = False
|
||||||
|
|
||||||
def start(self, interval, now=True):
|
def start(self, interval, initial_delay=None):
|
||||||
self._running = True
|
self._running = True
|
||||||
done = event.Event()
|
done = event.Event()
|
||||||
|
|
||||||
def _inner():
|
def _inner():
|
||||||
if not now:
|
if initial_delay:
|
||||||
greenthread.sleep(interval)
|
greenthread.sleep(initial_delay)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
while self._running:
|
while self._running:
|
||||||
self.f(*self.args, **self.kw)
|
self.f(*self.args, **self.kw)
|
||||||
|
@ -192,7 +192,7 @@ class ProxyConnection(driver.ComputeDriver):
|
|||||||
LOG.exception(_('_wait_for_reboot failed'))
|
LOG.exception(_('_wait_for_reboot failed'))
|
||||||
timer.stop()
|
timer.stop()
|
||||||
timer.f = _wait_for_reboot
|
timer.f = _wait_for_reboot
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
@exception.wrap_exception
|
@exception.wrap_exception
|
||||||
def rescue(self, context, instance, network_info):
|
def rescue(self, context, instance, network_info):
|
||||||
@ -225,7 +225,7 @@ class ProxyConnection(driver.ComputeDriver):
|
|||||||
LOG.exception(_('_wait_for_rescue failed'))
|
LOG.exception(_('_wait_for_rescue failed'))
|
||||||
timer.stop()
|
timer.stop()
|
||||||
timer.f = _wait_for_reboot
|
timer.f = _wait_for_reboot
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
@exception.wrap_exception
|
@exception.wrap_exception
|
||||||
def unrescue(self, instance, network_info):
|
def unrescue(self, instance, network_info):
|
||||||
@ -278,7 +278,7 @@ class ProxyConnection(driver.ComputeDriver):
|
|||||||
timer.stop()
|
timer.stop()
|
||||||
timer.f = _wait_for_boot
|
timer.f = _wait_for_boot
|
||||||
|
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
def get_console_output(self, instance):
|
def get_console_output(self, instance):
|
||||||
console_log = os.path.join(FLAGS.instances_path, instance['name'],
|
console_log = os.path.join(FLAGS.instances_path, instance['name'],
|
||||||
|
@ -429,7 +429,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
raise utils.LoopingCallDone
|
raise utils.LoopingCallDone
|
||||||
|
|
||||||
timer = utils.LoopingCall(_wait_for_destroy)
|
timer = utils.LoopingCall(_wait_for_destroy)
|
||||||
timer.start(interval=0.5, now=True)
|
timer.start(interval=0.5)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
self.firewall_driver.unfilter_instance(instance,
|
self.firewall_driver.unfilter_instance(instance,
|
||||||
@ -726,7 +726,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
instance=instance)
|
instance=instance)
|
||||||
dom.create()
|
dom.create()
|
||||||
timer = utils.LoopingCall(self._wait_for_running, instance)
|
timer = utils.LoopingCall(self._wait_for_running, instance)
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
greenthread.sleep(1)
|
greenthread.sleep(1)
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@ -768,7 +768,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
raise utils.LoopingCallDone
|
raise utils.LoopingCallDone
|
||||||
|
|
||||||
timer = utils.LoopingCall(_wait_for_reboot)
|
timer = utils.LoopingCall(_wait_for_reboot)
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
@exception.wrap_exception()
|
@exception.wrap_exception()
|
||||||
def pause(self, instance):
|
def pause(self, instance):
|
||||||
@ -912,7 +912,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
raise utils.LoopingCallDone
|
raise utils.LoopingCallDone
|
||||||
|
|
||||||
timer = utils.LoopingCall(_wait_for_boot)
|
timer = utils.LoopingCall(_wait_for_boot)
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
def _flush_libvirt_console(self, pty):
|
def _flush_libvirt_console(self, pty):
|
||||||
out, err = utils.execute('dd',
|
out, err = utils.execute('dd',
|
||||||
@ -2069,7 +2069,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
post_method(ctxt, instance_ref, dest, block_migration)
|
post_method(ctxt, instance_ref, dest, block_migration)
|
||||||
|
|
||||||
timer.f = wait_for_live_migration
|
timer.f = wait_for_live_migration
|
||||||
timer.start(interval=0.5, now=True)
|
timer.start(interval=0.5)
|
||||||
|
|
||||||
def pre_live_migration(self, block_device_info):
|
def pre_live_migration(self, block_device_info):
|
||||||
"""Preparation live migration.
|
"""Preparation live migration.
|
||||||
@ -2415,7 +2415,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
self.firewall_driver.apply_instance_filter(instance, network_info)
|
self.firewall_driver.apply_instance_filter(instance, network_info)
|
||||||
|
|
||||||
timer = utils.LoopingCall(self._wait_for_running, instance)
|
timer = utils.LoopingCall(self._wait_for_running, instance)
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
@exception.wrap_exception()
|
@exception.wrap_exception()
|
||||||
def finish_revert_migration(self, instance, network_info):
|
def finish_revert_migration(self, instance, network_info):
|
||||||
@ -2437,7 +2437,7 @@ class LibvirtConnection(driver.ComputeDriver):
|
|||||||
self.firewall_driver.apply_instance_filter(instance, network_info)
|
self.firewall_driver.apply_instance_filter(instance, network_info)
|
||||||
|
|
||||||
timer = utils.LoopingCall(self._wait_for_running, instance)
|
timer = utils.LoopingCall(self._wait_for_running, instance)
|
||||||
return timer.start(interval=0.5, now=True)
|
return timer.start(interval=0.5)
|
||||||
|
|
||||||
def confirm_migration(self, migration, instance, network_info):
|
def confirm_migration(self, migration, instance, network_info):
|
||||||
"""Confirms a resize, destroying the source VM"""
|
"""Confirms a resize, destroying the source VM"""
|
||||||
|
@ -374,7 +374,7 @@ class VMWareAPISession(object):
|
|||||||
done = event.Event()
|
done = event.Event()
|
||||||
loop = utils.LoopingCall(self._poll_task, instance_uuid, task_ref,
|
loop = utils.LoopingCall(self._poll_task, instance_uuid, task_ref,
|
||||||
done)
|
done)
|
||||||
loop.start(FLAGS.vmwareapi_task_poll_interval, now=True)
|
loop.start(FLAGS.vmwareapi_task_poll_interval)
|
||||||
ret_val = done.wait()
|
ret_val = done.wait()
|
||||||
loop.stop()
|
loop.stop()
|
||||||
return ret_val
|
return ret_val
|
||||||
|
Loading…
Reference in New Issue
Block a user