Merge "libvirt: add live migration timeout action"
This commit is contained in:
commit
35ee7edd94
@ -218,9 +218,25 @@ What to do when the migration times out
|
||||
During the migration process, the instance may write to a memory page after
|
||||
that page has been copied to the destination. When that happens, the same page
|
||||
has to be copied again. The instance may write to memory pages faster than they
|
||||
can be copied, so that the migration cannot complete. The Compute service will
|
||||
cancel it when the ``live_migration_completion_timeout``, a configuration
|
||||
parameter, is reached.
|
||||
can be copied, so that the migration cannot complete. There are two optional
|
||||
actions, controlled by
|
||||
:oslo.config:option:`libvirt.live_migration_timeout_action`, which can be
|
||||
taken against a VM after
|
||||
:oslo.config:option:`libvirt.live_migration_completion_timeout` is reached:
|
||||
|
||||
1. ``abort`` (default): The live migration operation will be cancelled after
|
||||
the completion timeout is reached. This is similar to using the API
|
||||
``DELETE /servers/{server_id}/migrations/{migration_id}``.
|
||||
|
||||
2. ``force_complete``: The compute service will either pause the VM or trigger
|
||||
post-copy depending on whether post-copy is enabled and available
|
||||
(:oslo.config:option:`libvirt.live_migration_permit_post_copy` is set to
|
||||
``True``). This is similar to using the API
|
||||
``POST /servers/{server_id}/migrations/{migration_id}/action (force_complete)``.
|
||||
|
||||
You can also read the
|
||||
:oslo.config:option:`libvirt.live_migration_timeout_action`
|
||||
configuration option help for more details.
|
||||
|
||||
The following remarks assume the KVM/Libvirt hypervisor.
|
||||
|
||||
|
@ -375,6 +375,7 @@ transferred, with lower bound of a minimum of 2 GiB per device.
|
||||
"""),
|
||||
cfg.IntOpt('live_migration_completion_timeout',
|
||||
default=800,
|
||||
min=0,
|
||||
mutable=True,
|
||||
help="""
|
||||
Time to wait, in seconds, for migration to successfully complete transferring
|
||||
@ -407,6 +408,23 @@ Set to 0 to disable timeouts.
|
||||
This is deprecated, and now disabled by default because we have found serious
|
||||
bugs in this feature that caused false live-migration timeout failures. This
|
||||
feature will be removed or replaced in a future release.
|
||||
"""),
|
||||
cfg.StrOpt('live_migration_timeout_action',
|
||||
default='abort',
|
||||
choices=('abort', 'force_complete'),
|
||||
mutable=True,
|
||||
help="""
|
||||
This option will be used to determine what action will be taken against a
|
||||
VM after ``live_migration_completion_timeout`` expires. By default, the live
|
||||
migrate operation will be aborted after completion timeout. If it is set to
|
||||
``force_complete``, the compute service will either pause the VM or trigger
|
||||
post-copy depending on if post copy is enabled and available
|
||||
(``live_migration_permit_post_copy`` is set to True).
|
||||
|
||||
Related options:
|
||||
|
||||
* live_migration_completion_timeout
|
||||
* live_migration_permit_post_copy
|
||||
"""),
|
||||
cfg.BoolOpt('live_migration_permit_post_copy',
|
||||
default=False,
|
||||
@ -418,7 +436,12 @@ needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0.
|
||||
|
||||
When permitted, post-copy mode will be automatically activated if a
|
||||
live-migration memory copy iteration does not make percentage increase of at
|
||||
least 10% over the last iteration.
|
||||
least 10% over the last iteration, or will be automatically activated if
|
||||
we reach the timeout defined by ``live_migration_completion_timeout`` and
|
||||
``live_migration_timeout_action`` is set to 'force_complete'. Note if you
|
||||
change to no timeout or choose to use 'abort',
|
||||
i.e. ``live_migration_completion_timeout = 0``, then there will be no
|
||||
automatic switch to post-copy.
|
||||
|
||||
The live-migration force complete API also uses post-copy when permitted. If
|
||||
post-copy mode is not available, force complete falls back to pausing the VM
|
||||
@ -430,7 +453,8 @@ details, please see the Administration guide.
|
||||
|
||||
Related options:
|
||||
|
||||
* live_migration_permit_auto_converge
|
||||
* live_migration_permit_auto_converge
|
||||
* live_migration_timeout_action
|
||||
"""),
|
||||
cfg.BoolOpt('live_migration_permit_auto_converge',
|
||||
default=False,
|
||||
|
@ -11457,6 +11457,38 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
||||
self.EXPECT_SUCCESS,
|
||||
expected_switch=True)
|
||||
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
"_is_post_copy_enabled")
|
||||
def test_live_migration_monitor_force_complete_postcopy(self,
|
||||
mock_postcopy_enabled):
|
||||
self.flags(live_migration_completion_timeout=40,
|
||||
live_migration_timeout_action='force_complete',
|
||||
group='libvirt')
|
||||
mock_postcopy_enabled.return_value = True
|
||||
|
||||
# Each one of these fake times is used for time.time()
|
||||
# when a new domain_info_records entry is consumed.
|
||||
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
|
||||
|
||||
domain_info_records = [
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
"thread-finish",
|
||||
"domain-stop",
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
|
||||
]
|
||||
|
||||
self._test_live_migration_monitoring(domain_info_records, fake_times,
|
||||
self.EXPECT_SUCCESS,
|
||||
expected_switch=True)
|
||||
|
||||
@mock.patch.object(host.Host, "get_connection")
|
||||
@mock.patch.object(utils, "spawn")
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor")
|
||||
|
@ -976,51 +976,33 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
|
||||
|
||||
def test_live_migration_abort_stuck(self):
|
||||
# Progress time exceeds progress timeout
|
||||
self.assertTrue(migration.should_abort(self.instance,
|
||||
5000,
|
||||
1000, 2000,
|
||||
4500, 9000,
|
||||
"running"))
|
||||
self.assertTrue(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 1000, 2000, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_no_prog_timeout(self):
|
||||
# Progress timeout is disabled
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
1000, 0,
|
||||
4500, 9000,
|
||||
"running"))
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 1000, 0, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_not_stuck(self):
|
||||
# Progress time is less than progress timeout
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 9000,
|
||||
"running"))
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_too_long(self):
|
||||
# Elapsed time is over completion timeout
|
||||
self.assertTrue(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 2000,
|
||||
"running"))
|
||||
self.assertTrue(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 2000, "running"))
|
||||
|
||||
def test_live_migration_abort_no_comp_timeout(self):
|
||||
# Completion timeout is disabled
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 0,
|
||||
"running"))
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 0, "running"))
|
||||
|
||||
def test_live_migration_abort_still_working(self):
|
||||
# Elapsed time is less than completion timeout
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 9000,
|
||||
"running"))
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_postcopy_switch(self):
|
||||
# Migration progress is not fast enough
|
||||
|
@ -7379,18 +7379,28 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
progress_timeout = CONF.libvirt.live_migration_progress_timeout
|
||||
completion_timeout = int(
|
||||
CONF.libvirt.live_migration_completion_timeout * data_gb)
|
||||
if libvirt_migrate.should_abort(instance, now, progress_time,
|
||||
progress_timeout, elapsed,
|
||||
completion_timeout,
|
||||
migration.status):
|
||||
try:
|
||||
guest.abort_job()
|
||||
except libvirt.libvirtError as e:
|
||||
LOG.warning("Failed to abort migration %s",
|
||||
encodeutils.exception_to_unicode(e),
|
||||
instance=instance)
|
||||
self._clear_empty_migration(instance)
|
||||
raise
|
||||
# NOTE(yikun): Check the completion timeout to determine
|
||||
# whether to trigger the timeout action; there are two choices:
|
||||
# ``abort`` (default) or ``force_complete``. If the action is
|
||||
# set to ``force_complete``, the post-copy will be triggered
|
||||
# if available else the VM will be suspended, otherwise the
|
||||
# live migrate operation will be aborted.
|
||||
if libvirt_migrate.should_trigger_timeout_action(
|
||||
instance, now, progress_time, progress_timeout,
|
||||
elapsed, completion_timeout, migration.status):
|
||||
timeout_act = CONF.libvirt.live_migration_timeout_action
|
||||
if timeout_act == 'force_complete':
|
||||
self.live_migration_force_complete(instance)
|
||||
else:
|
||||
# timeout action is 'abort'
|
||||
try:
|
||||
guest.abort_job()
|
||||
except libvirt.libvirtError as e:
|
||||
LOG.warning("Failed to abort migration %s",
|
||||
encodeutils.exception_to_unicode(e),
|
||||
instance=instance)
|
||||
self._clear_empty_migration(instance)
|
||||
raise
|
||||
|
||||
if (is_post_copy_enabled and
|
||||
libvirt_migrate.should_switch_to_postcopy(
|
||||
|
@ -376,11 +376,11 @@ def find_job_type(guest, instance):
|
||||
return libvirt.VIR_DOMAIN_JOB_FAILED
|
||||
|
||||
|
||||
def should_abort(instance, now,
|
||||
progress_time, progress_timeout,
|
||||
elapsed, completion_timeout,
|
||||
migration_status):
|
||||
"""Determine if the migration should be aborted
|
||||
def should_trigger_timeout_action(instance, now,
|
||||
progress_time, progress_timeout,
|
||||
elapsed, completion_timeout,
|
||||
migration_status):
|
||||
"""Determine if the migration timeout action should be triggered
|
||||
|
||||
:param instance: a nova.objects.Instance
|
||||
:param now: current time in secs since epoch
|
||||
@ -391,12 +391,18 @@ def should_abort(instance, now,
|
||||
:param migration_status: current status of the migration
|
||||
|
||||
Check the progress and completion timeouts to determine if either
|
||||
of them have been hit, and should thus cause migration to be aborted
|
||||
of them have been hit, and should thus cause migration timeout action to
|
||||
be triggered.
|
||||
|
||||
Avoid migration to be aborted if it is running in post-copy mode
|
||||
Avoid aborting the migration or triggering post-copy again if it is
|
||||
running in post-copy mode
|
||||
|
||||
:returns: True if migration should be aborted, False otherwise
|
||||
:returns: True if the migration completion timeout action should be
|
||||
performed, False otherwise
|
||||
"""
|
||||
if not completion_timeout:
|
||||
return False
|
||||
|
||||
if migration_status == 'running (post-copy)':
|
||||
return False
|
||||
|
||||
@ -406,8 +412,7 @@ def should_abort(instance, now,
|
||||
(now - progress_time), instance=instance)
|
||||
return True
|
||||
|
||||
if (completion_timeout != 0 and
|
||||
elapsed > completion_timeout):
|
||||
if elapsed > completion_timeout:
|
||||
LOG.warning("Live migration not completed after %d sec",
|
||||
completion_timeout, instance=instance)
|
||||
return True
|
||||
|
@ -0,0 +1,15 @@
|
||||
---
|
||||
features:
|
||||
- |
|
||||
A new configuration option ``[libvirt]/live_migration_timeout_action``
|
||||
is added. This new option will have choices ``abort`` (default)
|
||||
or ``force_complete``. This option will determine what actions will be
|
||||
taken against a VM after ``live_migration_completion_timeout`` expires.
|
||||
Currently nova just aborts the live migrate operation after completion
|
||||
timeout expires. By default, we keep the same behavior of aborting after
|
||||
completion timeout. ``force_complete`` will either pause the VM or trigger
|
||||
post-copy depending on whether post-copy is enabled and available.
|
||||
|
||||
The ``[libvirt]/live_migration_completion_timeout`` option now has a
|
||||
minimum value of 0 and will raise a ValueError if the configured
|
||||
value is less than that minimum.
|
Loading…
x
Reference in New Issue
Block a user