Port binding based on events during live migration

Currently port binding call is made at destination compute in post live
migration phase. This may cause network outage during post-copy as the
virtual CPUs are paused immediately at source and unpaused at
destination by transferring a minimum set of pages.

The following domain life cycle events are emitted in this order during
post-copy:

* VIR_DOMAIN_EVENT_STARTED(destination)
* VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY(source)--migration
  entered post-copy mode
* VIR_DOMAIN_EVENT_RESUMED_POSTCOPY(destination)--guest
  is running on the destination host while some of its
  memory pages still remain on the source host.
* VIR_DOMAIN_EVENT_RESUMED_MIGRATED(destination)
* VIR_DOMAIN_EVENT_STOPPED_MIGRATED(source)--migration
  finished successfully and the destination host holds
  a complete guest state.

In this change, dest host port binding activation is done when the
following events are emitted at source for post-copy and pre-copy:

* VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY
* VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED

This reduces the network outage during live migration, as the network
switchover is done right before the VM resumes at the destination.

Co-Authored-By: Matt Riedemann <mriedem.os@gmail.com>

Change-Id: Ic5cab99944df9e501ba2032eb96911c36304494d
Closes-Bug: #1605016
This commit is contained in:
Sivasathurappan Radhakrishnan 2017-02-16 12:51:45 +00:00 committed by Matt Riedemann
parent e53f46672e
commit 1f48d3d83b
5 changed files with 165 additions and 14 deletions

View File

@ -1051,22 +1051,27 @@ class ComputeManager(manager.Manager):
{'state': event.get_name()},
instance_uuid=event.get_instance_uuid())
context = nova.context.get_admin_context(read_deleted='yes')
# Join on info_cache since that's needed in migrate_instance_start.
instance = objects.Instance.get_by_uuid(context,
event.get_instance_uuid(),
expected_attrs=[])
expected_attrs=['info_cache'])
vm_power_state = None
if event.get_transition() == virtevent.EVENT_LIFECYCLE_STOPPED:
event_transition = event.get_transition()
if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
vm_power_state = power_state.SHUTDOWN
elif event.get_transition() == virtevent.EVENT_LIFECYCLE_STARTED:
elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
vm_power_state = power_state.RUNNING
elif event.get_transition() == virtevent.EVENT_LIFECYCLE_PAUSED:
elif event_transition in (
virtevent.EVENT_LIFECYCLE_PAUSED,
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
vm_power_state = power_state.PAUSED
elif event.get_transition() == virtevent.EVENT_LIFECYCLE_RESUMED:
elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
vm_power_state = power_state.RUNNING
elif event.get_transition() == virtevent.EVENT_LIFECYCLE_SUSPENDED:
elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
vm_power_state = power_state.SUSPENDED
else:
LOG.warning("Unexpected power state %d", event.get_transition())
LOG.warning("Unexpected lifecycle event: %d", event_transition)
# Note(lpetrut): The event may be delayed, thus not reflecting
# the current instance power state. In that case, ignore the event.
@ -1087,6 +1092,36 @@ class ComputeManager(manager.Manager):
instance,
vm_power_state)
# The following checks are for live migration. We want to activate
# the port binding for the destination host before the live migration
# is resumed on the destination host in order to reduce network
# downtime. Otherwise the ports are bound to the destination host
# in post_live_migration_at_destination.
migrate_finish_statuses = {
# This happens on the source node and indicates live migration
# entered post-copy mode.
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
# Suspended for offline migration.
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
}
if (instance.task_state == task_states.MIGRATING and
event_transition in migrate_finish_statuses):
status = migrate_finish_statuses[event_transition]
try:
migration = objects.Migration.get_by_instance_and_status(
context, instance.uuid, status)
LOG.debug('Binding ports to destination host: %s',
migration.dest_compute, instance=instance)
# For neutron, migrate_instance_start will activate the
# destination host port bindings, if there are any created by
# conductor before live migration started.
self.network_api.migrate_instance_start(
context, instance, migration)
except exception.MigrationNotFoundByStatus:
LOG.warning("Unable to find migration record with status "
"'%s' for instance. Port binding will happen in "
"post live migration.", status, instance=instance)
def handle_events(self, event):
if isinstance(event, virtevent.LifecycleEvent):
try:

View File

@ -92,24 +92,49 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
@mock.patch.object(manager.ComputeManager, '_get_power_state')
@mock.patch.object(manager.ComputeManager, '_sync_instance_power_state')
@mock.patch.object(objects.Instance, 'get_by_uuid')
def _test_handle_lifecycle_event(self, mock_get, mock_sync,
mock_get_power_state, transition,
event_pwr_state, current_pwr_state):
@mock.patch.object(objects.Migration, 'get_by_instance_and_status')
@mock.patch.object(nova.network.neutronv2.api.API,
'migrate_instance_start')
def _test_handle_lifecycle_event(self, migrate_instance_start,
mock_get_migration, mock_get,
mock_sync, mock_get_power_state,
transition, event_pwr_state,
current_pwr_state):
event = mock.Mock()
event.get_instance_uuid.return_value = mock.sentinel.uuid
mock_get.return_value = fake_instance.fake_instance_obj(self.context,
task_state=task_states.MIGRATING)
event.get_transition.return_value = transition
mock_get_power_state.return_value = current_pwr_state
self.compute.handle_lifecycle_event(event)
mock_get.assert_called_once_with(
test.MatchType(context.RequestContext),
event.get_instance_uuid.return_value,
expected_attrs=['info_cache'])
mock_get.assert_called_with(mock.ANY, mock.sentinel.uuid,
expected_attrs=[])
if event_pwr_state == current_pwr_state:
mock_sync.assert_called_with(mock.ANY, mock_get.return_value,
event_pwr_state)
else:
self.assertFalse(mock_sync.called)
migrate_finish_statuses = {
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
}
if transition in migrate_finish_statuses:
mock_get_migration.assert_called_with(
test.MatchType(context.RequestContext),
mock_get.return_value.uuid,
migrate_finish_statuses[transition])
migrate_instance_start.assert_called_once_with(
test.MatchType(context.RequestContext),
mock_get.return_value,
mock_get_migration.return_value)
else:
mock_get_migration.assert_not_called()
migrate_instance_start.assert_not_called()
def test_handle_lifecycle_event(self):
event_map = {virtevent.EVENT_LIFECYCLE_STOPPED: power_state.SHUTDOWN,
virtevent.EVENT_LIFECYCLE_STARTED: power_state.RUNNING,
@ -117,6 +142,10 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
virtevent.EVENT_LIFECYCLE_RESUMED: power_state.RUNNING,
virtevent.EVENT_LIFECYCLE_SUSPENDED:
power_state.SUSPENDED,
virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED:
power_state.PAUSED,
virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED:
power_state.PAUSED,
}
for transition, pwr_state in event_map.items():
@ -130,6 +159,35 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase):
event_pwr_state=power_state.SHUTDOWN,
current_pwr_state=power_state.RUNNING)
@mock.patch('nova.objects.Instance.get_by_uuid')
@mock.patch('nova.compute.manager.ComputeManager.'
'_sync_instance_power_state')
@mock.patch('nova.objects.Migration.get_by_instance_and_status',
side_effect=exception.MigrationNotFoundByStatus(
instance_id=uuids.instance, status='running (post-copy)'))
def test_handle_lifecycle_event_postcopy_migration_not_found(
self, mock_get_migration, mock_sync, mock_get_instance):
"""Tests a EVENT_LIFECYCLE_POSTCOPY_STARTED scenario where the
migration record is not found by the expected status.
"""
inst = fake_instance.fake_instance_obj(
self.context, uuid=uuids.instance,
task_state=task_states.MIGRATING)
mock_get_instance.return_value = inst
event = virtevent.LifecycleEvent(
uuids.instance, virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED)
with mock.patch.object(self.compute, '_get_power_state',
return_value=power_state.PAUSED):
with mock.patch.object(self.compute.network_api,
'migrate_instance_finish') as mig_finish:
self.compute.handle_lifecycle_event(event)
# Since we failed to find the migration record, we shouldn't call
# migrate_instance_finish.
mig_finish.assert_not_called()
mock_get_migration.assert_called_once_with(
test.MatchType(context.RequestContext), uuids.instance,
'running (post-copy)')
@mock.patch('nova.compute.utils.notify_about_instance_action')
def test_delete_instance_info_cache_delete_ordering(self, mock_notify):
call_tracker = mock.Mock()

View File

@ -192,6 +192,46 @@ class HostTestCase(test.NoDBTestCase):
self.assertEqual(got_events[0].transition,
event.EVENT_LIFECYCLE_STOPPED)
def test_event_lifecycle_callback_suspended_old_libvirt(self):
"""Tests the suspended lifecycle event with libvirt before post-copy
"""
hostimpl = mock.MagicMock()
conn = mock.MagicMock()
fake_dom_xml = """
<domain type='kvm'>
<uuid>cef19ce0-0ca2-11df-855d-b19fbce37686</uuid>
</domain>
"""
dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True)
VIR_DOMAIN_EVENT_SUSPENDED_PAUSED = 0
host.Host._event_lifecycle_callback(
conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED,
detail=VIR_DOMAIN_EVENT_SUSPENDED_PAUSED, opaque=hostimpl)
expected_event = hostimpl._queue_event.call_args[0][0]
self.assertEqual(event.EVENT_LIFECYCLE_PAUSED,
expected_event.transition)
def test_event_lifecycle_callback_suspended_postcopy(self):
"""Tests the suspended lifecycle event with libvirt with post-copy"""
hostimpl = mock.MagicMock()
conn = mock.MagicMock()
fake_dom_xml = """
<domain type='kvm'>
<uuid>cef19ce0-0ca2-11df-855d-b19fbce37686</uuid>
</domain>
"""
dom = fakelibvirt.Domain(conn, fake_dom_xml, running=True)
VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY = 7
with mock.patch.object(host.libvirt,
'VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY', new=7,
create=True):
host.Host._event_lifecycle_callback(
conn, dom, fakelibvirt.VIR_DOMAIN_EVENT_SUSPENDED,
detail=VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY, opaque=hostimpl)
expected_event = hostimpl._queue_event.call_args[0][0]
self.assertEqual(event.EVENT_LIFECYCLE_POSTCOPY_STARTED,
expected_event.transition)
def test_event_emit_delayed_call_delayed(self):
ev = event.LifecycleEvent(
"cef19ce0-0ca2-11df-855d-b19fbce37686",

View File

@ -29,6 +29,9 @@ EVENT_LIFECYCLE_STOPPED = 1
EVENT_LIFECYCLE_PAUSED = 2
EVENT_LIFECYCLE_RESUMED = 3
EVENT_LIFECYCLE_SUSPENDED = 4
EVENT_LIFECYCLE_POSTCOPY_STARTED = 5
EVENT_LIFECYCLE_MIGRATION_COMPLETED = 6
NAMES = {
EVENT_LIFECYCLE_STARTED: _('Started'),
@ -36,6 +39,8 @@ NAMES = {
EVENT_LIFECYCLE_PAUSED: _('Paused'),
EVENT_LIFECYCLE_RESUMED: _('Resumed'),
EVENT_LIFECYCLE_SUSPENDED: _('Suspended'),
EVENT_LIFECYCLE_POSTCOPY_STARTED: _('Postcopy started'),
EVENT_LIFECYCLE_MIGRATION_COMPLETED: _('Migration completed'),
}

View File

@ -170,7 +170,20 @@ class Host(object):
elif event == libvirt.VIR_DOMAIN_EVENT_STARTED:
transition = virtevent.EVENT_LIFECYCLE_STARTED
elif event == libvirt.VIR_DOMAIN_EVENT_SUSPENDED:
transition = virtevent.EVENT_LIFECYCLE_PAUSED
# NOTE(siva_krishnan): We have to check if
# VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY and
# VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED exist since the current
# minimum version of libvirt (1.2.9) don't have those attributes.
# This check can be removed once MIN_LIBVIRT_VERSION is bumped to
# at least 1.3.3.
if (hasattr(libvirt, 'VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY') and
detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_POSTCOPY):
transition = virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED
elif (hasattr(libvirt, 'VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED') and
detail == libvirt.VIR_DOMAIN_EVENT_SUSPENDED_MIGRATED):
transition = virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED
else:
transition = virtevent.EVENT_LIFECYCLE_PAUSED
elif event == libvirt.VIR_DOMAIN_EVENT_RESUMED:
transition = virtevent.EVENT_LIFECYCLE_RESUMED