diff --git a/doc/source/admin/live-migration-usage.rst b/doc/source/admin/live-migration-usage.rst index 4ad048d057a5..9f8f7b5762aa 100644 --- a/doc/source/admin/live-migration-usage.rst +++ b/doc/source/admin/live-migration-usage.rst @@ -254,19 +254,6 @@ out: WARNING nova.virt.libvirt.migration [req-...] [instance: ...] live migration not completed after 1800 sec -The Compute service also cancels migrations when the memory copy seems to make -no progress. This feature is `disabled by default`_, but it can be enabled -using the configuration parameter -:oslo.config:option:`libvirt.live_migration_progress_timeout`. Should -this be the case, you may find the following message in the log: - -.. code-block:: console - - WARNING nova.virt.libvirt.migration [req-...] [instance: ...] - live migration stuck for 150 sec - -.. _disabled by default: https://review.openstack.org/#/c/431635/ - Addressing migration timeouts ----------------------------- diff --git a/nova/conf/libvirt.py b/nova/conf/libvirt.py index 70f7c0b076f2..00732fb66dbe 100644 --- a/nova/conf/libvirt.py +++ b/nova/conf/libvirt.py @@ -390,24 +390,6 @@ Related options: * live_migration_downtime * live_migration_downtime_steps * live_migration_downtime_delay -"""), - cfg.IntOpt('live_migration_progress_timeout', - default=0, - deprecated_for_removal=True, - deprecated_reason=""" -Serious bugs found in this feature, see -https://bugs.launchpad.net/nova/+bug/1644248 for details. -""", - mutable=True, - help=""" -Time to wait, in seconds, for migration to make forward progress in -transferring data before aborting the operation. - -Set to 0 to disable timeouts. - -This is deprecated, and now disabled by default because we have found serious -bugs in this feature that caused false live-migration timeout failures. This -feature will be removed or replaced in a future release. """), cfg.StrOpt('live_migration_timeout_action', default='abort', @@ -434,9 +416,7 @@ mode, i.e., switch the active VM to the one on the destination node before the migration is complete, therefore ensuring an upper bound on the memory that needs to be transferred. Post-copy requires libvirt>=1.3.3 and QEMU>=2.5.0. -When permitted, post-copy mode will be automatically activated if a -live-migration memory copy iteration does not make percentage increase of at -least 10% over the last iteration, or will be automatically activated if +When permitted, post-copy mode will be automatically activated if we reach the timeout defined by ``live_migration_completion_timeout`` and ``live_migration_timeout_action`` is set to 'force_complete'. Note if you change to no timeout or choose to use 'abort', diff --git a/nova/tests/unit/virt/libvirt/test_driver.py b/nova/tests/unit/virt/libvirt/test_driver.py index e872bd9da0da..44f970b27508 100644 --- a/nova/tests/unit/virt/libvirt/test_driver.py +++ b/nova/tests/unit/virt/libvirt/test_driver.py @@ -11285,7 +11285,6 @@ class LibvirtConnTestCase(test.NoDBTestCase, def test_live_migration_monitor_downtime(self, mock_downtime_steps, mock_set_downtime): self.flags(live_migration_completion_timeout=1000000, - live_migration_progress_timeout=1000000, group='libvirt') # We've setup 4 fake downtime steps - first value is the # time delay, second is the downtime value @@ -11332,7 +11331,6 @@ class LibvirtConnTestCase(test.NoDBTestCase, def test_live_migration_monitor_completion(self): self.flags(live_migration_completion_timeout=100, - live_migration_progress_timeout=1000000, group='libvirt') # Each one of these fake times is used for time.time() # when a new domain_info_records entry is consumed. @@ -11362,101 +11360,6 @@ class LibvirtConnTestCase(test.NoDBTestCase, fake_times, self.EXPECT_ABORT, expected_mig_status='cancelled') - def test_live_migration_monitor_progress(self): - self.flags(live_migration_completion_timeout=1000000, - live_migration_progress_timeout=150, - group='libvirt') - # Each one of these fake times is used for time.time() - # when a new domain_info_records entry is consumed. - fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320] - - # A normal sequence where see all the normal job states - domain_info_records = [ - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_NONE), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - "thread-finish", - "domain-stop", - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_CANCELLED), - ] - - self._test_live_migration_monitoring(domain_info_records, - fake_times, self.EXPECT_ABORT, - expected_mig_status='cancelled') - - def test_live_migration_monitor_progress_zero_data_remaining(self): - self.flags(live_migration_completion_timeout=1000000, - live_migration_progress_timeout=150, - group='libvirt') - # Each one of these fake times is used for time.time() - # when a new domain_info_records entry is consumed. - fake_times = [0, 40, 80, 120, 160, 200, 240, 280, 320] - - # A normal sequence where see all the normal job states - domain_info_records = [ - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_NONE), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=90), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=70), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=50), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=30), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=10), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED, data_remaining=0), - "thread-finish", - "domain-stop", - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_FAILED), - ] - - self._test_live_migration_monitoring(domain_info_records, - fake_times, self.EXPECT_FAILURE) - - @mock.patch('nova.virt.libvirt.migration.should_switch_to_postcopy') - @mock.patch.object(libvirt_driver.LibvirtDriver, - "_is_post_copy_enabled") - def test_live_migration_monitor_postcopy_switch(self, - mock_postcopy_enabled, mock_should_switch): - # A normal sequence where migration is switched to postcopy mode - mock_postcopy_enabled.return_value = True - switch_values = [False, False, True] - mock_should_switch.return_value = switch_values - domain_info_records = [ - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_NONE), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED), - "thread-finish", - "domain-stop", - libvirt_guest.JobInfo( - type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED), - ] - - self._test_live_migration_monitoring(domain_info_records, [], - self.EXPECT_SUCCESS, - expected_switch=True) - @mock.patch.object(libvirt_driver.LibvirtDriver, "_is_post_copy_enabled") def test_live_migration_monitor_force_complete_postcopy(self, diff --git a/nova/tests/unit/virt/libvirt/test_migration.py b/nova/tests/unit/virt/libvirt/test_migration.py index 4c2fedf548f7..a0334239bd11 100644 --- a/nova/tests/unit/virt/libvirt/test_migration.py +++ b/nova/tests/unit/virt/libvirt/test_migration.py @@ -974,35 +974,20 @@ class MigrationMonitorTestCase(test.NoDBTestCase): self.assertEqual(migration.find_job_type(self.guest, self.instance), fakelibvirt.VIR_DOMAIN_JOB_FAILED) - def test_live_migration_abort_stuck(self): - # Progress time exceeds progress timeout - self.assertTrue(migration.should_trigger_timeout_action( - self.instance, 5000, 1000, 2000, 4500, 9000, "running")) - - def test_live_migration_abort_no_prog_timeout(self): - # Progress timeout is disabled - self.assertFalse(migration.should_trigger_timeout_action( - self.instance, 5000, 1000, 0, 4500, 9000, "running")) - - def test_live_migration_abort_not_stuck(self): - # Progress time is less than progress timeout - self.assertFalse(migration.should_trigger_timeout_action( - self.instance, 5000, 4500, 2000, 4500, 9000, "running")) - def test_live_migration_abort_too_long(self): # Elapsed time is over completion timeout self.assertTrue(migration.should_trigger_timeout_action( - self.instance, 5000, 4500, 2000, 4500, 2000, "running")) + self.instance, 4500, 2000, "running")) def test_live_migration_abort_no_comp_timeout(self): # Completion timeout is disabled self.assertFalse(migration.should_trigger_timeout_action( - self.instance, 5000, 4500, 2000, 4500, 0, "running")) + self.instance, 4500, 0, "running")) def test_live_migration_abort_still_working(self): # Elapsed time is less than completion timeout self.assertFalse(migration.should_trigger_timeout_action( - self.instance, 5000, 4500, 2000, 4500, 9000, "running")) + self.instance, 4500, 9000, "running")) def test_live_migration_postcopy_switch(self): # Migration progress is not fast enough diff --git a/nova/virt/libvirt/driver.py b/nova/virt/libvirt/driver.py index 57997c203a97..8a4fde46e475 100644 --- a/nova/virt/libvirt/driver.py +++ b/nova/virt/libvirt/driver.py @@ -7333,9 +7333,6 @@ class LibvirtDriver(driver.ComputeDriver): n = 0 start = time.time() - progress_time = start - progress_watermark = None - previous_data_remaining = -1 is_post_copy_enabled = self._is_post_copy_enabled(migration_flags) while True: info = guest.get_job_info() @@ -7370,13 +7367,6 @@ class LibvirtDriver(driver.ComputeDriver): now = time.time() elapsed = now - start - if ((progress_watermark is None) or - (progress_watermark == 0) or - (progress_watermark > info.data_remaining)): - progress_watermark = info.data_remaining - progress_time = now - - progress_timeout = CONF.libvirt.live_migration_progress_timeout completion_timeout = int( CONF.libvirt.live_migration_completion_timeout * data_gb) # NOTE(yikun): Check the completion timeout to determine @@ -7386,8 +7376,8 @@ class LibvirtDriver(driver.ComputeDriver): # if available else the VM will be suspended, otherwise the # live migrate operation will be aborted. if libvirt_migrate.should_trigger_timeout_action( - instance, now, progress_time, progress_timeout, - elapsed, completion_timeout, migration.status): + instance, elapsed, completion_timeout, + migration.status): timeout_act = CONF.libvirt.live_migration_timeout_action if timeout_act == 'force_complete': self.live_migration_force_complete(instance) @@ -7402,15 +7392,6 @@ class LibvirtDriver(driver.ComputeDriver): self._clear_empty_migration(instance) raise - if (is_post_copy_enabled and - libvirt_migrate.should_switch_to_postcopy( - info.memory_iteration, info.data_remaining, - previous_data_remaining, migration.status)): - libvirt_migrate.trigger_postcopy_switch(guest, - instance, - migration) - previous_data_remaining = info.data_remaining - curdowntime = libvirt_migrate.update_downtime( guest, instance, curdowntime, downtime_steps, elapsed) @@ -7453,13 +7434,6 @@ class LibvirtDriver(driver.ComputeDriver): "processed_memory": info.memory_processed, "remaining_memory": info.memory_remaining, "total_memory": info.memory_total}, instance=instance) - if info.data_remaining > progress_watermark: - lg("Data remaining %(remaining)d bytes, " - "low watermark %(watermark)d bytes " - "%(last)d seconds ago", - {"remaining": info.data_remaining, - "watermark": progress_watermark, - "last": (now - progress_time)}, instance=instance) n = n + 1 elif info.type == libvirt.VIR_DOMAIN_JOB_COMPLETED: diff --git a/nova/virt/libvirt/migration.py b/nova/virt/libvirt/migration.py index daed5d3791f3..3ad803abbcd3 100644 --- a/nova/virt/libvirt/migration.py +++ b/nova/virt/libvirt/migration.py @@ -376,23 +376,17 @@ def find_job_type(guest, instance): return libvirt.VIR_DOMAIN_JOB_FAILED -def should_trigger_timeout_action(instance, now, - progress_time, progress_timeout, - elapsed, completion_timeout, +def should_trigger_timeout_action(instance, elapsed, completion_timeout, migration_status): """Determine if the migration timeout action should be triggered :param instance: a nova.objects.Instance - :param now: current time in secs since epoch - :param progress_time: when progress was last made in secs since epoch - :param progress_timeout: time in secs to allow for progress :param elapsed: total elapsed time of migration in secs :param completion_timeout: time in secs to allow for completion :param migration_status: current status of the migration - Check the progress and completion timeouts to determine if either - of them have been hit, and should thus cause migration timeout action to - be triggered. + Check the completion timeout to determine if it has been hit, + and should thus cause migration timeout action to be triggered. Avoid migration to be aborted or triggered post-copy again if it is running in post-copy mode @@ -406,12 +400,6 @@ def should_trigger_timeout_action(instance, now, if migration_status == 'running (post-copy)': return False - if (progress_timeout != 0 and - (now - progress_time) > progress_timeout): - LOG.warning("Live migration stuck for %d sec", - (now - progress_time), instance=instance) - return True - if elapsed > completion_timeout: LOG.warning("Live migration not completed after %d sec", completion_timeout, instance=instance) diff --git a/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml b/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml index 3e46ce026981..0e481793aaa3 100644 --- a/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml +++ b/releasenotes/notes/live-migration-force-after-timeout-54f2a4b631d295bb.yaml @@ -13,3 +13,17 @@ features: The ``[libvirt]/live_migration_completion_timeout`` is restricted by minimum 0 and will now raise a ValueError if the configuration option value is less than minimum value. + + Note if you configure Nova to have no timeout, post copy will never be + automatically triggered. None of this affects triggering post copy via + the force live-migration API, that continues to work in the same way. +upgrade: + - | + Config option ``[libvirt]/live_migration_progress_timeout`` was deprecated + in Ocata, and has now been removed. + + Current logic in libvirt driver to auto trigger post-copy based on + progress information is removed as it has `proved impossible`__ to detect + when live-migration appears to be making little progress. + + .. __: https://bugs.launchpad.net/nova/+bug/1644248