Extend live-migration-force-complete to use postcopy if available

Since QEMU 2.5 and libvirt 1.3.3 it is possible to use post-copy
live migration to put an upper bound on the amount of memory that
has to be transferred during the live migration process. This makes
it possible to live migrate instances with memory-intensive
workloads, because the migration is driven by the destination host:
after the switch to post-copy mode the destination VM is the active
one, so newly dirtied pages are generated directly on the
destination host. This patch makes force-complete switch the
migration to post-copy mode instead of pausing the VM, whenever
post-copy is available and enabled.

Partially-implements: blueprint auto-live-migration-completion

Change-Id: Ic27f10682024ad6fee7d433f47e84aeeaa55120b
Signed-off-by: Luis Tomas <luis5tb@gmail.com>
Co-Authored-By: Pawel Koniszewski <pawel.koniszewski@intel.com>
Luis Tomas 2016-06-13 18:00:31 +02:00 committed by Pawel Koniszewski
parent 1cb9195aa8
commit 2de3879afa
6 changed files with 377 additions and 42 deletions
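
To make the behaviour change easier to follow before reading the diffs
below, here is a minimal, self-contained sketch of the dispatch logic
this patch introduces in the live migration task runner. It is a
simplification of the real run_tasks()/trigger_postcopy_switch() code
shown later; handle_force_complete is an illustrative name (not part of
the patch), and error handling, logging and the task queue plumbing are
omitted:

def handle_force_complete(guest, migration, is_post_copy_enabled):
    # Sketch only: pick the completion strategy for a queued
    # "force-complete" task.
    if migration.status == 'running (post-copy)':
        # The migration has already switched to post-copy; there is
        # nothing left to force.
        return
    if is_post_copy_enabled:
        # Hand control to the destination host: remaining dirty pages
        # are faulted over the network on demand instead of being
        # re-copied from the source, which bounds the data to transfer.
        guest.migrate_start_postcopy()
        migration.status = 'running (post-copy)'
        migration.save()
    else:
        # Fall back to the pre-existing behaviour: pause the guest so
        # the remaining dirty memory can converge.
        guest.pause()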


@@ -8214,6 +8214,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
EXPECT_FAILURE = 2
EXPECT_ABORT = 3
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(time, "time")
@mock.patch.object(time, "sleep",
side_effect=lambda x: eventlet.sleep(0))
@@ -8237,9 +8238,12 @@ class LibvirtConnTestCase(test.NoDBTestCase):
mock_conn,
mock_sleep,
mock_time,
mock_postcopy_switch,
current_mig_status=None,
expected_mig_status=None,
scheduled_action=None,
scheduled_action_executed=False):
scheduled_action_executed=False,
block_migration=False):
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
instance = objects.Instance(**self.test_instance)
drvr.active_migrations[instance.uuid] = deque()
@@ -8258,7 +8262,8 @@ class LibvirtConnTestCase(test.NoDBTestCase):
elif rec == "domain-stop":
dom.destroy()
elif rec == "force_complete":
drvr.active_migrations[instance.uuid].append("pause")
drvr.active_migrations[instance.uuid].append(
"force-complete")
else:
if len(time_records) > 0:
time_records.pop(0)
@@ -8279,7 +8284,11 @@ class LibvirtConnTestCase(test.NoDBTestCase):
dest = mock.sentinel.migrate_dest
migration = objects.Migration(context=self.context, id=1)
migrate_data = objects.LibvirtLiveMigrateData(
migration=migration)
migration=migration, block_migration=block_migration)
if current_mig_status:
migrate_data.migration.status = current_mig_status
migrate_data.migration.save()
fake_post_method = mock.MagicMock()
fake_recover_method = mock.MagicMock()
@@ -8294,9 +8303,13 @@ class LibvirtConnTestCase(test.NoDBTestCase):
if scheduled_action_executed:
if scheduled_action == 'pause':
self.assertTrue(mock_pause.called)
if scheduled_action == 'postcopy_switch':
self.assertTrue(mock_postcopy_switch.called)
else:
if scheduled_action == 'pause':
self.assertFalse(mock_pause.called)
if scheduled_action == 'postcopy_switch':
self.assertFalse(mock_postcopy_switch.called)
mock_mig_save.assert_called_with()
if expect_result == self.EXPECT_SUCCESS:
@@ -8365,6 +8378,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="running",
scheduled_action="pause",
scheduled_action_executed=True)
@@ -8390,6 +8404,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="preparing",
scheduled_action="pause",
scheduled_action_executed=True)
@@ -8415,6 +8430,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="completed",
scheduled_action="pause",
scheduled_action_executed=False)
@@ -8439,6 +8455,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_FAILURE,
current_mig_status="cancelled",
expected_mig_status='cancelled',
scheduled_action="pause",
scheduled_action_executed=False)
@@ -8467,6 +8484,211 @@ class LibvirtConnTestCase(test.NoDBTestCase):
scheduled_action="pause",
scheduled_action_executed=False)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_normal(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled in between VIR_DOMAIN_JOB_UNBOUNDED
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="running",
scheduled_action="postcopy_switch",
scheduled_action_executed=True)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_on_start(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled in case of job type VIR_DOMAIN_JOB_NONE
# and finish_event is not ready yet
mock_postcopy_enabled.return_value = True
domain_info_records = [
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="preparing",
scheduled_action="postcopy_switch",
scheduled_action_executed=True)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_on_finish(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled in case of job type VIR_DOMAIN_JOB_NONE
# and finish_event is ready
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="completed",
scheduled_action="postcopy_switch",
scheduled_action_executed=False)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_on_cancel(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled in case of job type VIR_DOMAIN_JOB_CANCELLED
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_CANCELLED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_FAILURE,
current_mig_status="cancelled",
expected_mig_status='cancelled',
scheduled_action="postcopy_switch",
scheduled_action_executed=False)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_pause_on_postcopy(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a pause
# scheduled after the migration switched to postcopy
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="running (post-copy)",
scheduled_action="pause",
scheduled_action_executed=False)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_on_postcopy(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled after the migration already switched to
# postcopy
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_COMPLETED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_SUCCESS,
current_mig_status="running (post-copy)",
scheduled_action="postcopy_switch",
scheduled_action_executed=False)
@mock.patch.object(libvirt_driver.LibvirtDriver,
"_is_post_copy_enabled")
def test_live_migration_handle_postcopy_on_failure(self,
mock_postcopy_enabled):
# A normal sequence where we see all the normal job states, and a
# postcopy switch scheduled in case of job type VIR_DOMAIN_JOB_FAILED
mock_postcopy_enabled.return_value = True
domain_info_records = [
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_NONE),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_UNBOUNDED),
"thread-finish",
"domain-stop",
"force_complete",
libvirt_guest.JobInfo(
type=fakelibvirt.VIR_DOMAIN_JOB_FAILED),
]
self._test_live_migration_monitoring(domain_info_records, [],
self.EXPECT_FAILURE,
scheduled_action="postcopy_switch",
scheduled_action_executed=False)
def test_live_migration_monitor_success_race(self):
# A normalish sequence but we're too slow to see the
# completed job state
@@ -14134,7 +14356,7 @@ class LibvirtConnTestCase(test.NoDBTestCase):
drvr.active_migrations[instance.uuid] = deque()
drvr.live_migration_force_complete(instance)
self.assertEqual(
1, drvr.active_migrations[instance.uuid].count("pause"))
1, drvr.active_migrations[instance.uuid].count("force-complete"))
@mock.patch.object(host.Host, "get_connection")
@mock.patch.object(fakelibvirt.virDomain, "abortJob")


@@ -429,54 +429,117 @@ class MigrationMonitorTestCase(test.NoDBTestCase):
mock_msave.assert_called_once_with()
mock_isave.assert_called_once_with()
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_pause(self, mock_pause):
tasks = deque()
tasks.append("pause")
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure)
mock_pause.assert_called_once_with()
self.assertEqual(len(on_migration_failure), 1)
self.assertEqual(on_migration_failure.pop(), "unpause")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_empty_tasks(self, mock_pause):
def test_live_migration_run_tasks_empty_tasks(self, mock_pause,
mock_postcopy):
tasks = deque()
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
mig = objects.Migration(id=1, status="running")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure)
active_migrations, on_migration_failure,
mig, False)
self.assertFalse(mock_pause.called)
self.assertFalse(mock_postcopy.called)
self.assertEqual(len(on_migration_failure), 0)
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_no_tasks(self, mock_pause):
def test_live_migration_run_tasks_no_tasks(self, mock_pause,
mock_postcopy):
active_migrations = {}
on_migration_failure = deque()
mig = objects.Migration(id=1, status="running")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure)
active_migrations, on_migration_failure,
mig, False)
self.assertFalse(mock_pause.called)
self.assertFalse(mock_postcopy.called)
self.assertEqual(len(on_migration_failure), 0)
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_no_pause(self, mock_pause):
def test_live_migration_run_tasks_no_force_complete(self, mock_pause,
mock_postcopy):
tasks = deque()
# Test to ensure unknown tasks are ignored
tasks.append("wibble")
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure)
mig = objects.Migration(id=1, status="running")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure,
mig, False)
self.assertFalse(mock_pause.called)
self.assertFalse(mock_postcopy.called)
self.assertEqual(len(on_migration_failure), 0)
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_force_complete(self, mock_pause,
mock_postcopy):
tasks = deque()
tasks.append("force-complete")
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
mig = objects.Migration(id=1, status="running")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure,
mig, False)
mock_pause.assert_called_once_with()
self.assertFalse(mock_postcopy.called)
self.assertEqual(len(on_migration_failure), 1)
self.assertEqual(on_migration_failure.pop(), "unpause")
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_force_complete_postcopy_running(self,
mock_pause, mock_postcopy):
tasks = deque()
tasks.append("force-complete")
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
mig = objects.Migration(id=1, status="running (post-copy)")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure,
mig, True)
self.assertFalse(mock_pause.called)
self.assertFalse(mock_postcopy.called)
self.assertEqual(len(on_migration_failure), 0)
@mock.patch.object(objects.Migration, "save")
@mock.patch.object(libvirt_guest.Guest, "migrate_start_postcopy")
@mock.patch.object(libvirt_guest.Guest, "pause")
def test_live_migration_run_tasks_force_complete_postcopy(self,
mock_pause, mock_postcopy, mock_msave):
tasks = deque()
tasks.append("force-complete")
active_migrations = {self.instance.uuid: tasks}
on_migration_failure = deque()
mig = objects.Migration(id=1, status="running")
migration.run_tasks(self.guest, self.instance,
active_migrations, on_migration_failure,
mig, True)
mock_postcopy.assert_called_once_with()
self.assertFalse(mock_pause.called)
self.assertEqual(len(on_migration_failure), 0)


@@ -6069,6 +6069,11 @@ class LibvirtDriver(driver.ComputeDriver):
return ram_gb + disk_gb
def _get_migration_flags(self, is_block_migration):
if is_block_migration:
return self._block_migration_flags
return self._live_migration_flags
def _live_migration_monitor(self, context, instance, guest,
dest, post_method,
recover_method, block_migration,
@@ -6083,10 +6088,14 @@ class LibvirtDriver(driver.ComputeDriver):
migration = migrate_data.migration
curdowntime = None
migration_flags = self._get_migration_flags(
migrate_data.block_migration)
n = 0
start = time.time()
progress_time = start
progress_watermark = None
is_post_copy_enabled = self._is_post_copy_enabled(migration_flags)
while True:
info = guest.get_job_info()
@@ -6113,7 +6122,9 @@ class LibvirtDriver(driver.ComputeDriver):
# the operation, change max bandwidth
libvirt_migrate.run_tasks(guest, instance,
self.active_migrations,
on_migration_failure)
on_migration_failure,
migration,
is_post_copy_enabled)
now = time.time()
elapsed = now - start
@@ -6290,12 +6301,15 @@ class LibvirtDriver(driver.ComputeDriver):
LOG.debug("Live migration monitoring is all done",
instance=instance)
def _is_post_copy_enabled(self, migration_flags):
if self._is_post_copy_available():
if (migration_flags & libvirt.VIR_MIGRATE_POSTCOPY) != 0:
return True
return False
def live_migration_force_complete(self, instance):
# NOTE(pkoniszewski): currently only pause during live migration is
# supported to force live migration to complete, so just try to pause
# the instance
try:
self.active_migrations[instance.uuid].append('pause')
self.active_migrations[instance.uuid].append('force-complete')
except KeyError:
raise exception.NoActiveMigrationForInstance(
instance_id=instance.uuid)


@@ -539,6 +539,10 @@ class Guest(object):
"""
self._domain.migrateSetMaxDowntime(mstime)
def migrate_start_postcopy(self):
"""Switch running live migration to post-copy mode"""
self._domain.migrateStartPostCopy()
def get_job_info(self):
"""Get job info for the domain


@@ -283,34 +283,62 @@ def save_stats(instance, migration, info, remaining):
instance.save()
def run_tasks(guest, instance, active_migrations, on_migration_failure):
def trigger_postcopy_switch(guest, instance, migration):
try:
guest.migrate_start_postcopy()
except libvirt.libvirtError as e:
LOG.warning(_LW("Failed to switch to post-copy live "
"migration: %s"),
e, instance=instance)
else:
# NOTE(ltomas): Change the migration status to indicate that
# it is in post-copy active mode, i.e., the VM at
# destination is the active one
LOG.info(_LI("Switching to post-copy migration mode"),
instance=instance)
migration.status = 'running (post-copy)'
migration.save()
def run_tasks(guest, instance, active_migrations, on_migration_failure,
migration, is_post_copy_enabled):
"""Run any pending migration tasks
:param guest: a nova.virt.libvirt.guest.Guest
:param instance: a nova.objects.Instance
:param active_migrations: dict of active migrations
:param on_migration_failure: queue of recovery tasks
:param migration: a nova.objects.Migration
:param is_post_copy_enabled: True if post-copy can be used
Run any pending migration tasks queued against the
provided instance object. The active migrations dict
should use instance UUIDs for keys and a queue of
tasks as the values.
Currently the only valid task that can be requested
is "pause". Other tasks will be ignored
Currently the valid tasks that can be requested
are "pause" and "force-complete". Other tasks will
be ignored.
"""
tasks = active_migrations.get(instance.uuid, deque())
while tasks:
task = tasks.popleft()
if task == 'pause':
try:
guest.pause()
on_migration_failure.append("unpause")
except Exception as e:
LOG.warning(_LW("Failed to pause instance during "
"live-migration %s"),
e, instance=instance)
if task == 'force-complete':
if migration.status == 'running (post-copy)':
LOG.warning(_LW("Live-migration %s already switched "
"to post-copy mode."),
instance=instance)
elif is_post_copy_enabled:
trigger_postcopy_switch(guest, instance, migration)
else:
try:
guest.pause()
on_migration_failure.append("unpause")
except Exception as e:
LOG.warning(_LW("Failed to pause instance during "
"live-migration %s"),
e, instance=instance)
else:
LOG.warning(_LW("Unknown migration task '%(task)s'"),
{"task": task}, instance=instance)


@@ -3,4 +3,8 @@ features:
- New configuration option live_migration_permit_post_copy
has been added to start live migrations in a way that allows
nova to switch an on-going live migration to post-copy mode.
Requires libvirt>=1.3.3 and QEMU>=2.5.0.
Requires libvirt>=1.3.3 and QEMU>=2.5.0. If post-copy is
permitted and the version requirements are met, it also changes
the behaviour of 'live_migration_force_complete', so that it
switches the on-going live migration to post-copy mode instead
of pausing the instance during live migration.
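
As an illustration (not part of this change), the behaviour described
in the release note would typically be enabled by the operator through
nova.conf on the compute hosts. The snippet below is a sketch that
assumes the option lives in the [libvirt] section, alongside the other
live_migration_* options:

[libvirt]
# Allow Nova to switch an on-going live migration to post-copy mode.
# With this change, live_migration_force_complete then triggers the
# post-copy switch instead of pausing the instance.
# Requires libvirt >= 1.3.3 and QEMU >= 2.5.0 on both hosts.
live_migration_permit_post_copy = True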