libvirt: remove live_migration_progress_timeout config
Config option ``libvirt.live_migration_progress_timeout`` was deprecated in Ocata, and can now be removed. This patch remove live_migration_progress_timeout and also remove the migration progress timeout related logic. Change-Id: Ife89a705892ad96de6d5f8e68b6e4b99063a7512 blueprint: live-migration-force-after-timeout
This commit is contained in:
parent
99a075cc94
commit
4c3698c0b6
doc/source/admin
nova
conf
tests/unit/virt/libvirt
virt/libvirt
releasenotes/notes
@ -254,19 +254,6 @@ out:
|
|||||||
WARNING nova.virt.libvirt.migration [req-...] [instance: ...]
|
WARNING nova.virt.libvirt.migration [req-...] [instance: ...]
|
||||||
live migration not completed after 1800 sec
|
live migration not completed after 1800 sec
|
||||||
|
|
||||||
The Compute service also cancels migrations when the memory copy seems to make
|
|
||||||
no progress. This feature is `disabled by default`_, but it can be enabled
|
|
||||||
using the configuration parameter
|
|
||||||
:oslo.config:option:`libvirt.live_migration_progress_timeout`. Should
|
|
||||||
this be the case, you may find the following message in the log:
|
|
||||||
|
|
||||||
.. code-block:: console
|
|
||||||
|
|
||||||
WARNING nova.virt.libvirt.migration [req-...] [instance: ...]
|
|
||||||
live migration stuck for 150 sec
|
|
||||||
|
|
||||||
.. _disabled by default: https://review.openstack.org/#/c/431635/
|
|
||||||
|
|
||||||
Addressing migration timeouts
|
Addressing migration timeouts
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
|
@ -390,24 +390,6 @@ Related options:
|
|||||||
* live_migration_downtime
|
* live_migration_downtime
|
||||||
* live_migration_downtime_steps
|
* live_migration_downtime_steps
|
||||||
* live_migration_downtime_delay
|
* live_migration_downtime_delay
|
||||||
"""),
|
|
||||||
cfg.IntOpt('live_migration_progress_timeout',
|
|
||||||
default=0,
|
|
||||||
deprecated_for_removal=True,
|
|
||||||
deprecated_reason="""
|
|
||||||
Serious bugs found in this feature, see
|
|
||||||
https://bugs.launchpad.net/nova/+bug/1644248 for details.
|
|
||||||
""",
|
|
||||||
mutable=True,
|
|
||||||
help="""
|
|
||||||
Time to wait, in seconds, for migration to make forward progress in
|
|
||||||
transferring data before aborting the operation.
|
|
||||||
|
|
||||||
Set to 0 to disable timeouts.
|
|
||||||
|
|
||||||
This is deprecated, and now disabled by default because we have found serious
|
|
||||||
bugs in this feature that caused false live-migration timeout failures. This
|
|
||||||
feature will be removed or replaced in a future release.
|
|
||||||
"""),
|
"""),
|
||||||
cfg.StrOpt('live_migration_timeout_action',
|
cfg.StrOpt('live_migration_timeout_action',
|
||||||
default='abort',
|
default='abort',
|
||||||
@ -434,9 +416,7 @@ mode, i.e., switch the active VM to the one on the destination node before the
|
|||||||
migration is complete, therefore ensuring an upper bound on the memory that
|
migration is complete, therefore ensuring an upper bound on the memory that
|
||||||
needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0.
|
needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0.
|
||||||
|
|
||||||
When permitted, post-copy mode will be automatically activated if a
|
When permitted, post-copy mode will be automatically activated if
|
||||||
live-migration memory copy iteration does not make percentage increase of at
|
|
||||||
least 10% over the last iteration, or will be automatically activated if
|
|
||||||
we reach the timeout defined by ``live_migration_completion_timeout`` and
|
we reach the timeout defined by ``live_migration_completion_timeout`` and
|
||||||
``live_migration_timeout_action`` is set to 'force_complete'. Note if you
|
``live_migration_timeout_action`` is set to 'force_complete'. Note if you
|
||||||
change to no timeout or choose to use 'abort',
|
change to no timeout or choose to use 'abort',
|
||||||
|
@ -11284,7 +11284,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
|||||||
def test_live_migration_monitor_downtime(self, mock_downtime_steps,
|
def test_live_migration_monitor_downtime(self, mock_downtime_steps,
|
||||||
mock_set_downtime):
|
mock_set_downtime):
|
||||||
self.flags(live_migration_completion_timeout=1000000,
|
self.flags(live_migration_completion_timeout=1000000,
|
||||||
live_migration_progress_timeout=1000000,
|
|
||||||
group='libvirt')
|
group='libvirt')
|
||||||
# We've setup 4 fake downtime steps - first value is the
|
# We've setup 4 fake downtime steps - first value is the
|
||||||
# time delay, second is the downtime value
|
# time delay, second is the downtime value
|
||||||
@ -11331,7 +11330,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
|||||||
|
|
||||||
def test_live_migration_monitor_completion(self):
|
def test_live_migration_monitor_completion(self):
|
||||||
self.flags(live_migration_completion_timeout=100,
|
self.flags(live_migration_completion_timeout=100,
|
||||||
live_migration_progress_timeout=1000000,
|
|
||||||
group='libvirt')
|
group='libvirt')
|
||||||
# Each one of these fake times is used for time.time()
|
# Each one of these fake times is used for time.time()
|
||||||
# when a new domain_info_records entry is consumed.
|
# when a new domain_info_records entry is consumed.
|
||||||
@ -11361,101 +11359,6 @@ class LibvirtConnTestCase(test.NoDBTestCase,
|
|||||||
fake_times, self.EXPECT_ABORT,
|
fake_times, self.EXPECT_ABORT,
|
||||||
expected_mig_status='cancelled')
|
expected_mig_status='cancelled')
|
||||||
|
|
||||||
def test_live_migration_monitor_progress(self):
|
|
||||||
self.flags(live_migration_completion_timeout=1000000,
|
|
||||||
live_migration_progress_timeout=150,
|
|
||||||
group='libvirt')
|
|
||||||
# Each one of these fake times is used for time.time()
|
|
||||||
# when a new domain_info_records entry is consumed.
|
|
||||||
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
|
|
||||||
|
|
||||||
# A normal sequence where see all the normal job states
|
|
||||||
domain_info_records = [
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
"thread-finish",
|
|
||||||
"domain-stop",
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_CANCELLED),
|
|
||||||
]
|
|
||||||
|
|
||||||
self._test_live_migration_monitoring(domain_info_records,
|
|
||||||
fake_times, self.EXPECT_ABORT,
|
|
||||||
expected_mig_status='cancelled')
|
|
||||||
|
|
||||||
def test_live_migration_monitor_progress_zero_data_remaining(self):
|
|
||||||
self.flags(live_migration_completion_timeout=1000000,
|
|
||||||
live_migration_progress_timeout=150,
|
|
||||||
group='libvirt')
|
|
||||||
# Each one of these fake times is used for time.time()
|
|
||||||
# when a new domain_info_records entry is consumed.
|
|
||||||
fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320]
|
|
||||||
|
|
||||||
# A normal sequence where see all the normal job states
|
|
||||||
domain_info_records = [
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=70),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=50),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=30),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=10),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0),
|
|
||||||
"thread-finish",
|
|
||||||
"domain-stop",
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_FAILED),
|
|
||||||
]
|
|
||||||
|
|
||||||
self._test_live_migration_monitoring(domain_info_records,
|
|
||||||
fake_times, self.EXPECT_FAILURE)
|
|
||||||
|
|
||||||
@mock.patch('nova.virt.libvirt.migration.should_switch_to_postcopy')
|
|
||||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
|
||||||
"_is_post_copy_enabled")
|
|
||||||
def test_live_migration_monitor_postcopy_switch(self,
|
|
||||||
mock_postcopy_enabled, mock_should_switch):
|
|
||||||
# A normal sequence where migration is switched to postcopy mode
|
|
||||||
mock_postcopy_enabled.return_value = True
|
|
||||||
switch_values = [False, False, True]
|
|
||||||
mock_should_switch.return_value = switch_values
|
|
||||||
domain_info_records = [
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
|
|
||||||
"thread-finish",
|
|
||||||
"domain-stop",
|
|
||||||
libvirt_guest.JobInfo(
|
|
||||||
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
|
|
||||||
]
|
|
||||||
|
|
||||||
self._test_live_migration_monitoring(domain_info_records, [],
|
|
||||||
self.EXPECT_SUCCESS,
|
|
||||||
expected_switch=True)
|
|
||||||
|
|
||||||
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
@mock.patch.object(libvirt_driver.LibvirtDriver,
|
||||||
"_is_post_copy_enabled")
|
"_is_post_copy_enabled")
|
||||||
def test_live_migration_monitor_force_complete_postcopy(self,
|
def test_live_migration_monitor_force_complete_postcopy(self,
|
||||||
|
@ -974,35 +974,20 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
|
|||||||
self.assertEqual(migration.find_job_type(self.guest, self.instance),
|
self.assertEqual(migration.find_job_type(self.guest, self.instance),
|
||||||
fakelibvirt.VIR_DOMAIN_JOB_FAILED)
|
fakelibvirt.VIR_DOMAIN_JOB_FAILED)
|
||||||
|
|
||||||
def test_live_migration_abort_stuck(self):
|
|
||||||
# Progress time exceeds progress timeout
|
|
||||||
self.assertTrue(migration.should_trigger_timeout_action(
|
|
||||||
self.instance, 5000, 1000, 2000, 4500, 9000, "running"))
|
|
||||||
|
|
||||||
def test_live_migration_abort_no_prog_timeout(self):
|
|
||||||
# Progress timeout is disabled
|
|
||||||
self.assertFalse(migration.should_trigger_timeout_action(
|
|
||||||
self.instance, 5000, 1000, 0, 4500, 9000, "running"))
|
|
||||||
|
|
||||||
def test_live_migration_abort_not_stuck(self):
|
|
||||||
# Progress time is less than progress timeout
|
|
||||||
self.assertFalse(migration.should_trigger_timeout_action(
|
|
||||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
|
||||||
|
|
||||||
def test_live_migration_abort_too_long(self):
|
def test_live_migration_abort_too_long(self):
|
||||||
# Elapsed time is over completion timeout
|
# Elapsed time is over completion timeout
|
||||||
self.assertTrue(migration.should_trigger_timeout_action(
|
self.assertTrue(migration.should_trigger_timeout_action(
|
||||||
self.instance, 5000, 4500, 2000, 4500, 2000, "running"))
|
self.instance, 4500, 2000, "running"))
|
||||||
|
|
||||||
def test_live_migration_abort_no_comp_timeout(self):
|
def test_live_migration_abort_no_comp_timeout(self):
|
||||||
# Completion timeout is disabled
|
# Completion timeout is disabled
|
||||||
self.assertFalse(migration.should_trigger_timeout_action(
|
self.assertFalse(migration.should_trigger_timeout_action(
|
||||||
self.instance, 5000, 4500, 2000, 4500, 0, "running"))
|
self.instance, 4500, 0, "running"))
|
||||||
|
|
||||||
def test_live_migration_abort_still_working(self):
|
def test_live_migration_abort_still_working(self):
|
||||||
# Elapsed time is less than completion timeout
|
# Elapsed time is less than completion timeout
|
||||||
self.assertFalse(migration.should_trigger_timeout_action(
|
self.assertFalse(migration.should_trigger_timeout_action(
|
||||||
self.instance, 5000, 4500, 2000, 4500, 9000, "running"))
|
self.instance, 4500, 9000, "running"))
|
||||||
|
|
||||||
def test_live_migration_postcopy_switch(self):
|
def test_live_migration_postcopy_switch(self):
|
||||||
# Migration progress is not fast enough
|
# Migration progress is not fast enough
|
||||||
|
@ -7338,9 +7338,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
|
|
||||||
n = 0
|
n = 0
|
||||||
start = time.time()
|
start = time.time()
|
||||||
progress_time = start
|
|
||||||
progress_watermark = None
|
|
||||||
previous_data_remaining = -1
|
|
||||||
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
|
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
|
||||||
while True:
|
while True:
|
||||||
info = guest.get_job_info()
|
info = guest.get_job_info()
|
||||||
@ -7375,13 +7372,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
now = time.time()
|
now = time.time()
|
||||||
elapsed = now - start
|
elapsed = now - start
|
||||||
|
|
||||||
if ((progress_watermark is None) or
|
|
||||||
(progress_watermark == 0) or
|
|
||||||
(progress_watermark > info.data_remaining)):
|
|
||||||
progress_watermark = info.data_remaining
|
|
||||||
progress_time = now
|
|
||||||
|
|
||||||
progress_timeout = CONF.libvirt.live_migration_progress_timeout
|
|
||||||
completion_timeout = int(
|
completion_timeout = int(
|
||||||
CONF.libvirt.live_migration_completion_timeout * data_gb)
|
CONF.libvirt.live_migration_completion_timeout * data_gb)
|
||||||
# NOTE(yikun): Check the completion timeout to determine
|
# NOTE(yikun): Check the completion timeout to determine
|
||||||
@ -7391,8 +7381,8 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
# if available else the VM will be suspended, otherwise the
|
# if available else the VM will be suspended, otherwise the
|
||||||
# live migrate operation will be aborted.
|
# live migrate operation will be aborted.
|
||||||
if libvirt_migrate.should_trigger_timeout_action(
|
if libvirt_migrate.should_trigger_timeout_action(
|
||||||
instance, now, progress_time, progress_timeout,
|
instance, elapsed, completion_timeout,
|
||||||
elapsed, completion_timeout, migration.status):
|
migration.status):
|
||||||
timeout_act = CONF.libvirt.live_migration_timeout_action
|
timeout_act = CONF.libvirt.live_migration_timeout_action
|
||||||
if timeout_act == 'force_complete':
|
if timeout_act == 'force_complete':
|
||||||
self.live_migration_force_complete(instance)
|
self.live_migration_force_complete(instance)
|
||||||
@ -7407,15 +7397,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
self._clear_empty_migration(instance)
|
self._clear_empty_migration(instance)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
if (is_post_copy_enabled and
|
|
||||||
libvirt_migrate.should_switch_to_postcopy(
|
|
||||||
info.memory_iteration, info.data_remaining,
|
|
||||||
previous_data_remaining, migration.status)):
|
|
||||||
libvirt_migrate.trigger_postcopy_switch(guest,
|
|
||||||
instance,
|
|
||||||
migration)
|
|
||||||
previous_data_remaining = info.data_remaining
|
|
||||||
|
|
||||||
curdowntime = libvirt_migrate.update_downtime(
|
curdowntime = libvirt_migrate.update_downtime(
|
||||||
guest, instance, curdowntime,
|
guest, instance, curdowntime,
|
||||||
downtime_steps, elapsed)
|
downtime_steps, elapsed)
|
||||||
@ -7458,13 +7439,6 @@ class LibvirtDriver(driver.ComputeDriver):
|
|||||||
"processed_memory": info.memory_processed,
|
"processed_memory": info.memory_processed,
|
||||||
"remaining_memory": info.memory_remaining,
|
"remaining_memory": info.memory_remaining,
|
||||||
"total_memory": info.memory_total}, instance=instance)
|
"total_memory": info.memory_total}, instance=instance)
|
||||||
if info.data_remaining > progress_watermark:
|
|
||||||
lg("Data remaining %(remaining)d bytes, "
|
|
||||||
"low watermark %(watermark)d bytes "
|
|
||||||
"%(last)d seconds ago",
|
|
||||||
{"remaining": info.data_remaining,
|
|
||||||
"watermark": progress_watermark,
|
|
||||||
"last": (now - progress_time)}, instance=instance)
|
|
||||||
|
|
||||||
n = n + 1
|
n = n + 1
|
||||||
elif info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED:
|
elif info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED:
|
||||||
|
@ -376,23 +376,17 @@ def find_job_type(guest, instance):
|
|||||||
return libvirt.VIR_DOMAIN_JOB_FAILED
|
return libvirt.VIR_DOMAIN_JOB_FAILED
|
||||||
|
|
||||||
|
|
||||||
def should_trigger_timeout_action(instance, now,
|
def should_trigger_timeout_action(instance, elapsed, completion_timeout,
|
||||||
progress_time, progress_timeout,
|
|
||||||
elapsed, completion_timeout,
|
|
||||||
migration_status):
|
migration_status):
|
||||||
"""Determine if the migration timeout action should be triggered
|
"""Determine if the migration timeout action should be triggered
|
||||||
|
|
||||||
:param instance: a nova.objects.Instance
|
:param instance: a nova.objects.Instance
|
||||||
:param now: current time in secs since epoch
|
|
||||||
:param progress_time: when progress was last made in secs since epoch
|
|
||||||
:param progress_timeout: time in secs to allow for progress
|
|
||||||
:param elapsed: total elapsed time of migration in secs
|
:param elapsed: total elapsed time of migration in secs
|
||||||
:param completion_timeout: time in secs to allow for completion
|
:param completion_timeout: time in secs to allow for completion
|
||||||
:param migration_status: current status of the migration
|
:param migration_status: current status of the migration
|
||||||
|
|
||||||
Check the progress and completion timeouts to determine if either
|
Check the completion timeout to determine if it has been hit,
|
||||||
of them have been hit, and should thus cause migration timeout action to
|
and should thus cause migration timeout action to be triggered.
|
||||||
be triggered.
|
|
||||||
|
|
||||||
Avoid migration to be aborted or triggered post-copy again if it is
|
Avoid migration to be aborted or triggered post-copy again if it is
|
||||||
running in post-copy mode
|
running in post-copy mode
|
||||||
@ -406,12 +400,6 @@ def should_trigger_timeout_action(instance, now,
|
|||||||
if migration_status == 'running (post-copy)':
|
if migration_status == 'running (post-copy)':
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if (progress_timeout != 0 and
|
|
||||||
(now - progress_time) > progress_timeout):
|
|
||||||
LOG.warning("Live migration stuck for %d sec",
|
|
||||||
(now - progress_time), instance=instance)
|
|
||||||
return True
|
|
||||||
|
|
||||||
if elapsed > completion_timeout:
|
if elapsed > completion_timeout:
|
||||||
LOG.warning("Live migration not completed after %d sec",
|
LOG.warning("Live migration not completed after %d sec",
|
||||||
completion_timeout, instance=instance)
|
completion_timeout, instance=instance)
|
||||||
|
@ -13,3 +13,17 @@ features:
|
|||||||
The ``[libvirt]/live_migration_completion_timeout`` is restricted by
|
The ``[libvirt]/live_migration_completion_timeout`` is restricted by
|
||||||
minimum 0 and will now raise a ValueError if the configuration option
|
minimum 0 and will now raise a ValueError if the configuration option
|
||||||
value is less than minimum value.
|
value is less than minimum value.
|
||||||
|
|
||||||
|
Note if you configure Nova to have no timeout, post copy will never be
|
||||||
|
automatically triggered. None of this affects triggering post copy via
|
||||||
|
the force live-migration API, that continues to work in the same way.
|
||||||
|
upgrade:
|
||||||
|
- |
|
||||||
|
Config option ``[libvirt]/live_migration_progress_timeout`` was deprecated
|
||||||
|
in Ocata, and has now been removed.
|
||||||
|
|
||||||
|
Current logic in libvirt driver to auto trigger post-copy based on
|
||||||
|
progress information is removed as it has `proved impossible`__ to detect
|
||||||
|
when live-migration appears to be making little progress.
|
||||||
|
|
||||||
|
.. __: https://bugs.launchpad.net/nova/+bug/1644248
|
||||||
|
Loading…
x
Reference in New Issue
Block a user