Merge "libvirt: remove live_migration_progress_timeout config"
This commit is contained in:
commit
4022171a61
@ -254,19 +254,6 @@ out:
|
||||
WARNING nova.virt.libvirt.migration [req-...] [instance: ...]
|
||||
live migration not completed after 1800 sec
|
||||
|
||||
The Compute service also cancels migrations when the memory copy seems to make
|
||||
no progress. This feature is `disabled by default`_, but it can be enabled
|
||||
using the configuration parameter
|
||||
:oslo.config:option:`libvirt.live_migration_progress_timeout`. Should
|
||||
this be the case, you may find the following message in the log:
|
||||
|
||||
.. code-block:: console
|
||||
|
||||
WARNING nova.virt.libvirt.migration [req-...] [instance: ...]
|
||||
live migration stuck for 150 sec
|
||||
|
||||
.. _disabled by default: https://review.openstack.org/#/c/431635/
|
||||
|
||||
Addressing migration timeouts
|
||||
-----------------------------
|
||||
|
||||
|
@ -390,24 +390,6 @@ Related options:
|
||||
* live_migration_downtime
|
||||
* live_migration_downtime_steps
|
||||
* live_migration_downtime_delay
|
||||
"""),
|
||||
cfg.IntOpt('live_migration_progress_timeout',
|
||||
default=0,
|
||||
deprecated_for_removal=True,
|
||||
deprecated_reason="""
|
||||
Serious bugs found in this feature, see
|
||||
https://bugs.launchpad.net/nova/+bug/1644248 for details.
|
||||
""",
|
||||
mutable=True,
|
||||
help="""
|
||||
Time to wait, in seconds, for migration to make forward progress in
|
||||
transferring data before aborting the operation.
|
||||
|
||||
Set to 0 to disable timeouts.
|
||||
|
||||
This is deprecated, and now disabled by default because we have found serious
|
||||
bugs in this feature that caused false live-migration timeout failures. This
|
||||
feature will be removed or replaced in a future release.
|
||||
"""),
|
||||
cfg.StrOpt('live_migration_timeout_action',
|
||||
default='abort',
|
||||
@ -434,9 +416,7 @@ mode, i.e., switch the active VM to the one on the destination node before the
|
||||
migration is complete, therefore ensuring an upper bound on the memory that
|
||||
needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0.
|
||||
|
||||
When permitted, post-copy mode will be automatically activated if a
|
||||
live-migration memory copy iteration does not make percentage increase of at
|
||||
least 10% over the last iteration, or will be automatically activated if
|
||||
When permitted, post-copy mode will be automatically activated if
|
||||
we reach the timeout defined by ``live_migration_completion_timeout`` and
|
||||
``live_migration_timeout_action`` is set to 'force_complete'. Note if you
|
||||
change to no timeout or choose to use 'abort',
|
||||
|
@ -11285,7 +11285,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
||||
def test_live_migration_monitor_downtime(self, mock_downtime_steps,
|
||||
mock_set_downtime):
|
||||
self.flags(live_migration_completion_timeout=1000000,
|
||||
live_migration_progress_timeout=1000000,
|
||||
group='libvirt')
|
||||
# We've setup 4 fake downtime steps - first value is the
|
||||
# time delay, second is the downtime value
|
||||
@ -11332,7 +11331,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
||||
|
||||
def test_live_migration_monitor_completion(self):
|
||||
self.flags(live_migration_completion_timeout=100,
|
||||
live_migration_progress_timeout=1000000,
|
||||
group='libvirt')
|
||||
# Each one of these fake times is used for time.time()
|
||||
# when a new domain_info_records entry is consumed.
|
||||
@ -11362,101 +11360,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
||||
fake_times, self.EXPECT_ABORT,
|
||||
expected_mig_status='cancelled')
|
||||
|
||||
def test_live_migration_monitor_progress(self):
|
||||
self.flags(live_migration_completion_timeout=1000000,
|
||||
live_migration_progress_timeout=150,
|
||||
group='libvirt')
|
||||
# Each one of these fake times is used for time.time()
|
||||
# when a new domain_info_records entry is consumed.
|
||||
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
|
||||
|
||||
# A normal sequence where see all the normal job states
|
||||
domain_info_records = [
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
"thread-finish",
|
||||
"domain-stop",
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_CANCELLED),
|
||||
]
|
||||
|
||||
self._test_live_migration_monitoring(domain_info_records,
|
||||
fake_times, self.EXPECT_ABORT,
|
||||
expected_mig_status='cancelled')
|
||||
|
||||
def test_live_migration_monitor_progress_zero_data_remaining(self):
|
||||
self.flags(live_migration_completion_timeout=1000000,
|
||||
live_migration_progress_timeout=150,
|
||||
group='libvirt')
|
||||
# Each one of these fake times is used for time.time()
|
||||
# when a new domain_info_records entry is consumed.
|
||||
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
|
||||
|
||||
# A normal sequence where see all the normal job states
|
||||
domain_info_records = [
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=70),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=50),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=30),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=10),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0),
|
||||
"thread-finish",
|
||||
"domain-stop",
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_FAILED),
|
||||
]
|
||||
|
||||
self._test_live_migration_monitoring(domain_info_records,
|
||||
fake_times, self.EXPECT_FAILURE)
|
||||
|
||||
@mock.patch('nova.virt.libvirt.migration.should_switch_to_postcopy')
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
"_is_post_copy_enabled")
|
||||
def test_live_migration_monitor_postcopy_switch(self,
|
||||
mock_postcopy_enabled, mock_should_switch):
|
||||
# A normal sequence where migration is switched to postcopy mode
|
||||
mock_postcopy_enabled.return_value = True
|
||||
switch_values = [False, False, True]
|
||||
mock_should_switch.return_value = switch_values
|
||||
domain_info_records = [
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
||||
"thread-finish",
|
||||
"domain-stop",
|
||||
libvirt_guest.JobInfo(
|
||||
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
|
||||
]
|
||||
|
||||
self._test_live_migration_monitoring(domain_info_records, [],
|
||||
self.EXPECT_SUCCESS,
|
||||
expected_switch=True)
|
||||
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||
"_is_post_copy_enabled")
|
||||
def test_live_migration_monitor_force_complete_postcopy(self,
|
||||
|
@ -974,35 +974,20 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
|
||||
self.assertEqual(migration.find_job_type(self.guest, self.instance),
|
||||
fakelibvirt.VIR_DOMAIN_JOB_FAILED)
|
||||
|
||||
def test_live_migration_abort_stuck(self):
|
||||
# Progress time exceeds progress timeout
|
||||
self.assertTrue(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 1000, 2000, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_no_prog_timeout(self):
|
||||
# Progress timeout is disabled
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 1000, 0, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_not_stuck(self):
|
||||
# Progress time is less than progress timeout
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_abort_too_long(self):
|
||||
# Elapsed time is over completion timeout
|
||||
self.assertTrue(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 2000, "running"))
|
||||
self.instance, 4500, 2000, "running"))
|
||||
|
||||
def test_live_migration_abort_no_comp_timeout(self):
|
||||
# Completion timeout is disabled
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 0, "running"))
|
||||
self.instance, 4500, 0, "running"))
|
||||
|
||||
def test_live_migration_abort_still_working(self):
|
||||
# Elapsed time is less than completion timeout
|
||||
self.assertFalse(migration.should_trigger_timeout_action(
|
||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
||||
self.instance, 4500, 9000, "running"))
|
||||
|
||||
def test_live_migration_postcopy_switch(self):
|
||||
# Migration progress is not fast enough
|
||||
|
@ -7333,9 +7333,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
|
||||
n = 0
|
||||
start = time.time()
|
||||
progress_time = start
|
||||
progress_watermark = None
|
||||
previous_data_remaining = -1
|
||||
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
|
||||
while True:
|
||||
info = guest.get_job_info()
|
||||
@ -7370,13 +7367,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
now = time.time()
|
||||
elapsed = now - start
|
||||
|
||||
if ((progress_watermark is None) or
|
||||
(progress_watermark == 0) or
|
||||
(progress_watermark > info.data_remaining)):
|
||||
progress_watermark = info.data_remaining
|
||||
progress_time = now
|
||||
|
||||
progress_timeout = CONF.libvirt.live_migration_progress_timeout
|
||||
completion_timeout = int(
|
||||
CONF.libvirt.live_migration_completion_timeout * data_gb)
|
||||
# NOTE(yikun): Check the completion timeout to determine
|
||||
@ -7386,8 +7376,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
# if available else the VM will be suspended, otherwise the
|
||||
# live migrate operation will be aborted.
|
||||
if libvirt_migrate.should_trigger_timeout_action(
|
||||
instance, now, progress_time, progress_timeout,
|
||||
elapsed, completion_timeout, migration.status):
|
||||
instance, elapsed, completion_timeout,
|
||||
migration.status):
|
||||
timeout_act = CONF.libvirt.live_migration_timeout_action
|
||||
if timeout_act == 'force_complete':
|
||||
self.live_migration_force_complete(instance)
|
||||
@ -7402,15 +7392,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
self._clear_empty_migration(instance)
|
||||
raise
|
||||
|
||||
if (is_post_copy_enabled and
|
||||
libvirt_migrate.should_switch_to_postcopy(
|
||||
info.memory_iteration, info.data_remaining,
|
||||
previous_data_remaining, migration.status)):
|
||||
libvirt_migrate.trigger_postcopy_switch(guest,
|
||||
instance,
|
||||
migration)
|
||||
previous_data_remaining = info.data_remaining
|
||||
|
||||
curdowntime = libvirt_migrate.update_downtime(
|
||||
guest, instance, curdowntime,
|
||||
downtime_steps, elapsed)
|
||||
@ -7453,13 +7434,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
"processed_memory": info.memory_processed,
|
||||
"remaining_memory": info.memory_remaining,
|
||||
"total_memory": info.memory_total}, instance=instance)
|
||||
if info.data_remaining > progress_watermark:
|
||||
lg("Data remaining %(remaining)d bytes, "
|
||||
"low watermark %(watermark)d bytes "
|
||||
"%(last)d seconds ago",
|
||||
{"remaining": info.data_remaining,
|
||||
"watermark": progress_watermark,
|
||||
"last": (now - progress_time)}, instance=instance)
|
||||
|
||||
n = n + 1
|
||||
elif info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED:
|
||||
|
@ -376,23 +376,17 @@ def find_job_type(guest, instance):
|
||||
return libvirt.VIR_DOMAIN_JOB_FAILED
|
||||
|
||||
|
||||
def should_trigger_timeout_action(instance, now,
|
||||
progress_time, progress_timeout,
|
||||
elapsed, completion_timeout,
|
||||
def should_trigger_timeout_action(instance, elapsed, completion_timeout,
|
||||
migration_status):
|
||||
"""Determine if the migration timeout action should be triggered
|
||||
|
||||
:param instance: a nova.objects.Instance
|
||||
:param now: current time in secs since epoch
|
||||
:param progress_time: when progress was last made in secs since epoch
|
||||
:param progress_timeout: time in secs to allow for progress
|
||||
:param elapsed: total elapsed time of migration in secs
|
||||
:param completion_timeout: time in secs to allow for completion
|
||||
:param migration_status: current status of the migration
|
||||
|
||||
Check the progress and completion timeouts to determine if either
|
||||
of them have been hit, and should thus cause migration timeout action to
|
||||
be triggered.
|
||||
Check the completion timeout to determine if it has been hit,
|
||||
and should thus cause migration timeout action to be triggered.
|
||||
|
||||
Avoid migration to be aborted or triggered post-copy again if it is
|
||||
running in post-copy mode
|
||||
@ -406,12 +400,6 @@ def should_trigger_timeout_action(instance, now,
|
||||
if migration_status == 'running (post-copy)':
|
||||
return False
|
||||
|
||||
if (progress_timeout != 0 and
|
||||
(now - progress_time) > progress_timeout):
|
||||
LOG.warning("Live migration stuck for %d sec",
|
||||
(now - progress_time), instance=instance)
|
||||
return True
|
||||
|
||||
if elapsed > completion_timeout:
|
||||
LOG.warning("Live migration not completed after %d sec",
|
||||
completion_timeout, instance=instance)
|
||||
|
@ -13,3 +13,17 @@ features:
|
||||
The ``[libvirt]/live_migration_completion_timeout`` is restricted by
|
||||
minimum 0 and will now raise a ValueError if the configuration option
|
||||
value is less than minimum value.
|
||||
|
||||
Note if you configure Nova to have no timeout, post copy will never be
|
||||
automatically triggered. None of this affects triggering post copy via
|
||||
the force live-migration API, that continues to work in the same way.
|
||||
upgrade:
|
||||
- |
|
||||
Config option ``[libvirt]/live_migration_progress_timeout`` was deprecated
|
||||
in Ocata, and has now been removed.
|
||||
|
||||
Current logic in libvirt driver to auto trigger post-copy based on
|
||||
progress information is removed as it has `proved impossible`__ to detect
|
||||
when live-migration appears to be making little progress.
|
||||
|
||||
.. __: https://bugs.launchpad.net/nova/+bug/1644248
|
||||
|
Loading…
Reference in New Issue
Block a user