Add automatic switching to postcopy mode when migration is not progressing

Since QEMU 2.5 and Libvirt 1.3.3 it is possible to use postcopy
live migration to ensure an upper limit in the amount of memory
to be transferred duing the live migration process. This option
enables to live migrate instances with memory-intensive workloads
by driving the migration process by the destination host, i.e.,
after the switch to postcopy mode, the destination VM is the active
one and therefore any dirty pages are directly generated at the
destination hosts. This patch ensures that the migration is switched
to postcopy mode if the migration process is not progressing more
than 10% each iteration due to memory dirtying ratio being faster
than page transferring to the destination.

Partially-implements: blueprint auto-live-migration-completion

Change-Id: If267fb7066e8323303bcc0f7e8f36513ef4d66b3
Signed-off-by: Luis Tomas <luis5tb@gmail.com>
Co-Authored-By: Pawel Koniszewski <pawel.koniszewski@intel.com>
This commit is contained in:
Luis Tomas 2016-06-10 23:00:45 +02:00 committed by Pawel Koniszewski
parent 2de3879afa
commit 1651850f6f
5 changed files with 122 additions and 11 deletions

View File

@ -8243,7 +8243,8 @@ class LibvirtConnTestCase(test.NoDBTestCase):
expected_mig_status=None,
scheduled_action=None,
scheduled_action_executed=False,
block_migration=False):
block_migration=False,
expected_switch=False):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = objects.Instance(**self.test_instance)
drvr.active_migrations[instance.uuid] = deque()
@ -8288,7 +8289,9 @@ class LibvirtConnTestCase(test.NoDBTestCase):
if current_mig_status:
migrate_data.migration.status = current_mig_status
migrate_data.migration.save()
else:
migrate_data.migration.status = "unset"
migrate_data.migration.save()
fake_post_method = mock.MagicMock()
fake_recover_method = mock.MagicMock()
@ -8317,6 +8320,8 @@ class LibvirtConnTestCase(test.NoDBTestCase):
'Recover method called when success expected')
self.assertFalse(mock_abort.called,
'abortJob not called when success expected')
if expected_switch:
self.assertTrue(mock_postcopy_switch.called)
fake_post_method.assert_called_once_with(
self.context, instance, dest, False, migrate_data)
else:
@ -8943,6 +8948,34 @@ class LibvirtConnTestCase(test.NoDBTestCase):
(900, 400),
], list(steps))
@mock.patch('nova.virt.libvirt.migration.should_switch_to_postcopy')
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_monitor_postcopy_switch(self,
mock_postcopy_enabled, mock_should_switch):
# A normal sequence where migration is swtiched to postcopy mode
mock_postcopy_enabled.return_value = True
switch_values = [False, False, True]
mock_should_switch.return_value = switch_values
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
expected_switch=True)
@mock.patch.object(host.Host, "get_connection")
@mock.patch.object(utils, "spawn")
@mock.patch.object(libvirt_driver.LibvirtDriver, "_live_migration_monitor")

View File

@ -282,42 +282,68 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
self.assertTrue(migration.should_abort(self.instance,
5000,
1000, 2000,
4500, 9000))
4500, 9000,
"running"))
def test_live_migration_abort_no_prog_timeout(self):
# Progress timeout is disabled
self.assertFalse(migration.should_abort(self.instance,
5000,
1000, 0,
4500, 9000))
4500, 9000,
"running"))
def test_live_migration_abort_not_stuck(self):
# Progress time is less than progress timeout
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 9000))
4500, 9000,
"running"))
def test_live_migration_abort_too_long(self):
# Elapsed time is over completion timeout
self.assertTrue(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 2000))
4500, 2000,
"running"))
def test_live_migration_abort_no_comp_timeout(self):
# Completion timeout is disabled
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 0))
4500, 0,
"running"))
def test_live_migration_abort_still_working(self):
# Elapsed time is less than completion timeout
self.assertFalse(migration.should_abort(self.instance,
5000,
4500, 2000,
4500, 9000))
4500, 9000,
"running"))
def test_live_migration_postcopy_switch(self):
# Migration progress is not fast enough
self.assertTrue(migration.should_switch_to_postcopy(
2, 100, 105, "running"))
def test_live_migration_postcopy_switch_already_switched(self):
# Migration already running in postcopy mode
self.assertFalse(migration.should_switch_to_postcopy(
2, 100, 105, "running (post-copy)"))
def test_live_migration_postcopy_switch_too_soon(self):
# First memory iteration not completed yet
self.assertFalse(migration.should_switch_to_postcopy(
1, 100, 105, "running"))
def test_live_migration_postcopy_switch_fast_progress(self):
# Migration progress is good
self.assertFalse(migration.should_switch_to_postcopy(
2, 100, 155, "running"))
@mock.patch.object(libvirt_guest.Guest,
"migrate_configure_max_downtime")

View File

@ -6095,6 +6095,7 @@ class LibvirtDriver(driver.ComputeDriver):
start = time.time()
progress_time = start
progress_watermark = None
previous_data_remaining = -1
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
while True:
info = guest.get_job_info()
@ -6137,15 +6138,25 @@ class LibvirtDriver(driver.ComputeDriver):
if libvirt_migrate.should_abort(instance, now, progress_time,
progress_timeout, elapsed,
completion_timeout):
completion_timeout,
migration.status):
try:
guest.abort_job()
except libvirt.libvirtError as e:
LOG.warning(_LW("Failed to abort migration %s"),
e, instance=instance)
e, instance=instance)
self._clear_empty_migration(instance)
raise
if (is_post_copy_enabled and
libvirt_migrate.should_switch_to_postcopy(
info.memory_iteration, info.data_remaining,
previous_data_remaining, migration.status)):
libvirt_migrate.trigger_postcopy_switch(guest,
instance,
migration)
previous_data_remaining = info.data_remaining
curdowntime = libvirt_migrate.update_downtime(
guest, instance, curdowntime,
downtime_steps, elapsed)

View File

@ -731,6 +731,7 @@ class JobInfo(object):
self.memory_total = kwargs.get("memory_total", 0)
self.memory_processed = kwargs.get("memory_processed", 0)
self.memory_remaining = kwargs.get("memory_remaining", 0)
self.memory_iteration = kwargs.get("memory_iteration", 0)
self.memory_constant = kwargs.get("memory_constant", 0)
self.memory_normal = kwargs.get("memory_normal", 0)
self.memory_normal_bytes = kwargs.get("memory_normal_bytes", 0)

View File

@ -170,7 +170,8 @@ def find_job_type(guest, instance):
def should_abort(instance, now,
progress_time, progress_timeout,
elapsed, completion_timeout):
elapsed, completion_timeout,
migration_status):
"""Determine if the migration should be aborted
:param instance: a nova.objects.Instance
@ -179,12 +180,18 @@ def should_abort(instance, now,
:param progress_timeout: time in secs to allow for progress
:param elapsed: total elapsed time of migration in secs
:param completion_timeout: time in secs to allow for completion
:param migration_status: current status of the migration
Check the progress and completion timeouts to determine if either
of them have been hit, and should thus cause migration to be aborted
Avoid migration to be aborted if it is running in post-copy mode
:returns: True if migration should be aborted, False otherwise
"""
if migration_status == 'running (post-copy)':
return False
if (progress_timeout != 0 and
(now - progress_time) > progress_timeout):
LOG.warning(_LW("Live migration stuck for %d sec"),
@ -201,6 +208,39 @@ def should_abort(instance, now,
return False
def should_switch_to_postcopy(memory_iteration, current_data_remaining,
previous_data_remaining, migration_status):
"""Determine if the migration should be switched to postcopy mode
:param memory_iteration: Number of memory iterations during the migration
:param current_data_remaining: amount of memory to be transferred
:param previous_data_remaining: previous memory to be transferred
:param migration_status: current status of the migration
Check the progress after the first memory iteration to determine if the
migration should be switched to post-copy mode
Avoid post-copy switch if already running in post-copy mode
:returns: True if migration should be switched to postcopy mode,
False otherwise
"""
if (migration_status == 'running (post-copy)' or
previous_data_remaining <= 0):
return False
if memory_iteration > 1:
progress_percentage = round((previous_data_remaining -
current_data_remaining) *
100 / previous_data_remaining)
# If migration progress is less than 10% per iteration after the
# first memory page copying pass, the migration is switched to
# postcopy mode
if progress_percentage < 10:
return True
return False
def update_downtime(guest, instance,
olddowntime,
downtime_steps, elapsed):