Add automatic switching to postcopy mode when migration is not progressing
Since QEMU 2.5 and Libvirt 1.3.3 it is possible to use postcopy live migration to ensure an upper limit on the amount of memory to be transferred during the live migration process. This option makes it possible to live migrate instances with memory-intensive workloads by letting the destination host drive the migration process, i.e., after the switch to postcopy mode, the destination VM is the active one and therefore any dirty pages are generated directly at the destination host. This patch ensures that the migration is switched to postcopy mode if the migration does not progress by at least 10% per iteration because memory is being dirtied faster than pages are transferred to the destination. Partially-implements: blueprint auto-live-migration-completion Change-Id: If267fb7066e8323303bcc0f7e8f36513ef4d66b3 Signed-off-by: Luis Tomas <luis5tb@gmail.com> Co-Authored-By: Pawel Koniszewski <pawel.koniszewski@intel.com>
This commit is contained in:
parent
2de3879afa
commit
1651850f6f
@ -8243,7 +8243,8 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
expected_mig_status=None,
|
||||
scheduled_action=None,
|
||||
scheduled_action_executed=False,
|
||||
block_migration=False):
|
||||
block_migration=False,
|
||||
expected_switch=False):
|
||||
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
|
||||
instance = objects.Instance(**self.test_instance)
|
||||
drvr.active_migrations[instance.uuid] = deque()
|
||||
@ -8288,7 +8289,9 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
|
||||
if current_mig_status:
|
||||
migrate_data.migration.status = current_mig_status
|
||||
migrate_data.migration.save()
|
||||
else:
|
||||
migrate_data.migration.status = "unset"
|
||||
migrate_data.migration.save()
|
||||
|
||||
fake_post_method = mock.MagicMock()
|
||||
fake_recover_method = mock.MagicMock()
|
||||
@ -8317,6 +8320,8 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
'Recover method called when success expected')
|
||||
self.assertFalse(mock_abort.called,
|
||||
'abortJob not called when success expected')
|
||||
if expected_switch:
|
||||
self.assertTrue(mock_postcopy_switch.called)
|
||||
fake_post_method.assert_called_once_with(
|
||||
self.context, instance, dest, False, migrate_data)
|
||||
else:
|
||||
@ -8943,6 +8948,34 @@ class LibvirtConnTestCase(test.NoDBTestCase):
|
||||
(900, 400),
|
||||
], list(steps))
|
||||
|
||||
@mock.patch('nova.virt.libvirt.migration.should_switch_to_postcopy')
@mock.patch.object(libvirt_driver.LibvirtDriver,
                   "_is_post_copy_enabled")
def test_live_migration_monitor_postcopy_switch(self,
        mock_postcopy_enabled, mock_should_switch):
    """Check that the monitor switches a slow migration to post-copy.

    Post-copy is reported as enabled and the mocked
    ``should_switch_to_postcopy`` answers False, False, True on
    successive calls, so the switch is expected to be requested while
    the migration still completes successfully.
    """
    # A normal sequence where migration is switched to postcopy mode
    mock_postcopy_enabled.return_value = True
    # side_effect hands out one value per call; return_value would hand
    # back the whole (always truthy) list on every call and request the
    # switch on the very first poll.
    # NOTE(review): presumably called once per JOB_UNBOUNDED record below;
    # a fourth call would raise StopIteration - confirm against the
    # monitor loop.
    switch_values = [False, False, True]
    mock_should_switch.side_effect = switch_values
    domain_info_records = [
        libvirt_guest.JobInfo(
            type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
        libvirt_guest.JobInfo(
            type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
        libvirt_guest.JobInfo(
            type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
        libvirt_guest.JobInfo(
            type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
        "thread-finish",
        "domain-stop",
        libvirt_guest.JobInfo(
            type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
    ]

    self._test_live_migration_monitoring(domain_info_records, [],
                                         self.EXPECT_SUCCESS,
                                         expected_switch=True)
|
||||
|
||||
@mock.patch.object(host.Host, "get_connection")
|
||||
@mock.patch.object(utils, "spawn")
|
||||
@mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor")
|
||||
|
@ -282,42 +282,68 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
|
||||
self.assertTrue(migration.should_abort(self.instance,
|
||||
5000,
|
||||
1000, 2000,
|
||||
4500, 9000))
|
||||
4500, 9000,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_abort_no_prog_timeout(self):
|
||||
# Progress timeout is disabled
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
1000, 0,
|
||||
4500, 9000))
|
||||
4500, 9000,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_abort_not_stuck(self):
|
||||
# Progress time is less than progress timeout
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 9000))
|
||||
4500, 9000,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_abort_too_long(self):
|
||||
# Elapsed time is over completion timeout
|
||||
self.assertTrue(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 2000))
|
||||
4500, 2000,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_abort_no_comp_timeout(self):
|
||||
# Completion timeout is disabled
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 0))
|
||||
4500, 0,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_abort_still_working(self):
|
||||
# Elapsed time is less than completion timeout
|
||||
self.assertFalse(migration.should_abort(self.instance,
|
||||
5000,
|
||||
4500, 2000,
|
||||
4500, 9000))
|
||||
4500, 9000,
|
||||
"running"))
|
||||
|
||||
def test_live_migration_postcopy_switch(self):
    """A second-iteration drop from 105 to 100 must request the switch."""
    # Migration progress is not fast enough
    iteration, remaining, previously_remaining = 2, 100, 105
    should_switch = migration.should_switch_to_postcopy(
        iteration, remaining, previously_remaining, "running")
    self.assertTrue(should_switch)
|
||||
|
||||
def test_live_migration_postcopy_switch_already_switched(self):
    """No second switch is requested once the status is post-copy."""
    # Migration already running in postcopy mode
    iteration, remaining, previously_remaining = 2, 100, 105
    should_switch = migration.should_switch_to_postcopy(
        iteration, remaining, previously_remaining,
        "running (post-copy)")
    self.assertFalse(should_switch)
|
||||
|
||||
def test_live_migration_postcopy_switch_too_soon(self):
    """Slow progress during memory iteration 1 must not trigger a switch."""
    # First memory iteration not completed yet
    iteration, remaining, previously_remaining = 1, 100, 105
    should_switch = migration.should_switch_to_postcopy(
        iteration, remaining, previously_remaining, "running")
    self.assertFalse(should_switch)
|
||||
|
||||
def test_live_migration_postcopy_switch_fast_progress(self):
    """A drop from 155 to 100 remaining is fast enough to stay pre-copy."""
    # Migration progress is good
    iteration, remaining, previously_remaining = 2, 100, 155
    should_switch = migration.should_switch_to_postcopy(
        iteration, remaining, previously_remaining, "running")
    self.assertFalse(should_switch)
|
||||
|
||||
@mock.patch.object(libvirt_guest.Guest,
|
||||
"migrate_configure_max_downtime")
|
||||
|
@ -6095,6 +6095,7 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
start = time.time()
|
||||
progress_time = start
|
||||
progress_watermark = None
|
||||
previous_data_remaining = -1
|
||||
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
|
||||
while True:
|
||||
info = guest.get_job_info()
|
||||
@ -6137,15 +6138,25 @@ class LibvirtDriver(driver.ComputeDriver):
|
||||
|
||||
if libvirt_migrate.should_abort(instance, now, progress_time,
|
||||
progress_timeout, elapsed,
|
||||
completion_timeout):
|
||||
completion_timeout,
|
||||
migration.status):
|
||||
try:
|
||||
guest.abort_job()
|
||||
except libvirt.libvirtError as e:
|
||||
LOG.warning(_LW("Failed to abort migration %s"),
|
||||
e, instance=instance)
|
||||
e, instance=instance)
|
||||
self._clear_empty_migration(instance)
|
||||
raise
|
||||
|
||||
if (is_post_copy_enabled and
|
||||
libvirt_migrate.should_switch_to_postcopy(
|
||||
info.memory_iteration, info.data_remaining,
|
||||
previous_data_remaining, migration.status)):
|
||||
libvirt_migrate.trigger_postcopy_switch(guest,
|
||||
instance,
|
||||
migration)
|
||||
previous_data_remaining = info.data_remaining
|
||||
|
||||
curdowntime = libvirt_migrate.update_downtime(
|
||||
guest, instance, curdowntime,
|
||||
downtime_steps, elapsed)
|
||||
|
@ -731,6 +731,7 @@ class JobInfo(object):
|
||||
self.memory_total = kwargs.get("memory_total", 0)
|
||||
self.memory_processed = kwargs.get("memory_processed", 0)
|
||||
self.memory_remaining = kwargs.get("memory_remaining", 0)
|
||||
self.memory_iteration = kwargs.get("memory_iteration", 0)
|
||||
self.memory_constant = kwargs.get("memory_constant", 0)
|
||||
self.memory_normal = kwargs.get("memory_normal", 0)
|
||||
self.memory_normal_bytes = kwargs.get("memory_normal_bytes", 0)
|
||||
|
@ -170,7 +170,8 @@ def find_job_type(guest, instance):
|
||||
|
||||
def should_abort(instance, now,
|
||||
progress_time, progress_timeout,
|
||||
elapsed, completion_timeout):
|
||||
elapsed, completion_timeout,
|
||||
migration_status):
|
||||
"""Determine if the migration should be aborted
|
||||
|
||||
:param instance: a nova.objects.Instance
|
||||
@ -179,12 +180,18 @@ def should_abort(instance, now,
|
||||
:param progress_timeout: time in secs to allow for progress
|
||||
:param elapsed: total elapsed time of migration in secs
|
||||
:param completion_timeout: time in secs to allow for completion
|
||||
:param migration_status: current status of the migration
|
||||
|
||||
Check the progress and completion timeouts to determine if either
|
||||
of them have been hit, and should thus cause migration to be aborted
|
||||
|
||||
Avoid migration to be aborted if it is running in post-copy mode
|
||||
|
||||
:returns: True if migration should be aborted, False otherwise
|
||||
"""
|
||||
if migration_status == 'running (post-copy)':
|
||||
return False
|
||||
|
||||
if (progress_timeout != 0 and
|
||||
(now - progress_time) > progress_timeout):
|
||||
LOG.warning(_LW("Live migration stuck for %d sec"),
|
||||
@ -201,6 +208,39 @@ def should_abort(instance, now,
|
||||
return False
|
||||
|
||||
|
||||
def should_switch_to_postcopy(memory_iteration, current_data_remaining,
                              previous_data_remaining, migration_status):
    """Determine if the migration should be switched to postcopy mode

    :param memory_iteration: Number of memory iterations during the migration
    :param current_data_remaining: amount of memory to be transferred
    :param previous_data_remaining: previous memory to be transferred
    :param migration_status: current status of the migration

    Check the progress after the first memory iteration to determine if the
    migration should be switched to post-copy mode. Progress is the share
    of previous_data_remaining that is no longer remaining; when that share
    is strictly below 10% the switch is requested.

    Avoid post-copy switch if already running in post-copy mode, or if
    there is no positive previous_data_remaining to compare against.

    :returns: True if migration should be switched to postcopy mode,
              False otherwise
    """
    if (migration_status == 'running (post-copy)' or
            previous_data_remaining <= 0):
        return False

    # Progress is only judged after the first memory page copying pass.
    if memory_iteration <= 1:
        return False

    # Compare "transferred / previous < 10 / 100" by cross-multiplying
    # (previous_data_remaining is known to be positive here). This keeps
    # the check exact: no rounding that would turn 9.5%-9.99% into 10%,
    # and no dependence on integer versus true division.
    transferred = previous_data_remaining - current_data_remaining
    return transferred * 100 < 10 * previous_data_remaining
|
||||
|
||||
|
||||
def update_downtime(guest, instance,
|
||||
olddowntime,
|
||||
downtime_steps, elapsed):
|
||||
|
Loading…
Reference in New Issue
Block a user