post live migration: don't call Neutron needlessly

In bug 1879787, the call to network_api.get_instance_nw_info() in
_post_live_migration() on the source compute manager eventually calls
out to the Neutron REST API. If this fails, the exception is
unhandled, and the migrating instance - which is fully running on the
destination at this point - will never be updated in the database.
This update normally happens later in
post_live_migration_at_destination().

The network_info variable obtained from get_instance_nw_info() is used
for two things: notifications - which aren't critical - and unplugging
the instance's vifs on the source - which is very important!

It turns out that at the time of the get_instance_nw_info() call, the
network info in the instance info cache is still valid for unplugging
the source vifs. The port bindings on the destination are only
activated by the network_api.migrate_instance_start() [1] call that
happens shortly *after* the problematic get_instance_nw_info() call.
In other words, get_instance_nw_info() will always return the source
ports. Because of that, we can replace it with a call to
instance.get_network_info().

[1] d9e04c4ff0/nova/network/neutronv2/api.py (L2493-L2522)

Change-Id: If0fbae33ce2af198188c91638afef939256c2556
Closes-bug: 1879787
(cherry picked from commit 6488a5dfb2)
This commit is contained in:
Artom Lifshitz 2020-08-21 13:06:58 -04:00
parent 598130622b
commit 2c949cb3ee
3 changed files with 34 additions and 33 deletions

View File

@ -8409,7 +8409,13 @@ class ComputeManager(manager.Manager):
# Releasing vlan.
# (not necessary in current implementation?)
network_info = self.network_api.get_instance_nw_info(ctxt, instance)
# NOTE(artom) At this point in time we have not bound the ports to the
# destination host yet (this happens in migrate_instance_start()
# below). Therefore, the "old" source network info that's still in the
# instance info cache is safe to use here, since it'll be used below
# during driver.post_live_migration_at_source() to unplug the VIFs on
# the source.
network_info = instance.get_network_info()
self._notify_about_instance_usage(ctxt, instance,
"live_migration._post.start",

View File

@ -182,7 +182,8 @@ class TestVolAttachmentsDuringLiveMigration(
self.assertEqual(1, len(attachments))
class LiveMigrationNeutronFailure(integrated_helpers._IntegratedTestBase):
class LiveMigrationNeutronInteractionsTest(
integrated_helpers._IntegratedTestBase):
# NOTE(artom) We need the admin API to force the host when booting the test
# server.
ADMIN_API = True
@ -192,38 +193,29 @@ class LiveMigrationNeutronFailure(integrated_helpers._IntegratedTestBase):
self._start_compute('src')
self._start_compute('dest')
def test_live_migrate_get_nw_info_fails(self):
"""Test that if the driver.post_live_migration() call fails (for
example by not being able to connect to Neutron), the exception goes
unhandled and results in the live-migration erroring out. This is bug
1879787.
def test_live_migrate_vifs_from_info_cache(self):
"""Test that bug 1879787 can no longer manifest itself because we get
the network_info from the instance info cache, and not Neutron.
"""
def stub_notify(context, instance, event_suffix,
network_info=None, extra_usage_info=None, fault=None):
vif = network_info[0]
# Make sure we have the correct VIF (the NeutronFixture
# deterministically uses port_2 for networks=auto) and that the
# profile does not contain `migrating_to`, indicating that we did
# not obtain it from the Neutron API.
self.assertEqual(self.neutron.port_2['id'], vif['id'])
self.assertNotIn('migrating_to', vif['profile'])
server = self._create_server(networks='auto',
host=self.computes['src'].host)
orig_plm = self.computes['src'].manager._post_live_migration
def stub_plm(*args, **kwargs):
"""We simulate a failure in driver.post_live_migration() on the
source by stubbing the source compute's _post_live_migration() with
a method that, within a context that mocks
driver.post_live_migration() to raise an exception, calls the
original compute.manager._post_live_migration(). This is needed to
make sure driver.post_live_migration() raises only once, and only
on the source.
"""
with mock.patch.object(self.computes['src'].manager.network_api,
'get_instance_nw_info',
side_effect=ConnectionError):
return orig_plm(*args, **kwargs)
with mock.patch.object(self.computes['src'].manager,
'_post_live_migration',
side_effect=stub_plm):
# FIXME(artom) Until bug 1879787 is fixed, the raised
# ConnectionError will go unhandled, the migration will fail, and
# the instance will still be reported as being on the source, even
# though it's actually running on the destination.
self._live_migrate(server, 'error', server_expected_state='ERROR')
'_notify_about_instance_usage',
side_effect=stub_notify) as mock_notify:
self._live_migrate(server, 'completed')
server = self.api.get_server(server['id'])
self.assertEqual('src', server['OS-EXT-SRV-ATTR:host'])
self.assertEqual('dest', server['OS-EXT-SRV-ATTR:host'])
# We don't care about call arguments here, we just want to be sure
# our stub actually got called.
mock_notify.assert_called()

View File

@ -6562,12 +6562,15 @@ class ComputeTestCase(BaseTestCase,
'clear_events_for_instance'),
mock.patch.object(self.compute, 'update_available_resource'),
mock.patch.object(migration_obj, 'save'),
mock.patch.object(instance, 'get_network_info'),
) as (
post_live_migration,
migrate_instance_start, post_live_migration_at_destination,
post_live_migration_at_source, setup_networks_on_host,
clear_events, update_available_resource, mig_save
clear_events, update_available_resource, mig_save, get_nw_info,
):
nw_info = network_model.NetworkInfo.hydrate([])
get_nw_info.return_value = nw_info
self.compute._post_live_migration(c, instance, dest,
migrate_data=migrate_data,
source_bdms=bdms)
@ -6589,7 +6592,7 @@ class ComputeTestCase(BaseTestCase,
post_live_migration_at_destination.assert_has_calls([
mock.call(c, instance, False, dest)])
post_live_migration_at_source.assert_has_calls(
[mock.call(c, instance, [])])
[mock.call(c, instance, nw_info)])
clear_events.assert_called_once_with(instance)
update_available_resource.assert_has_calls([mock.call(c)])
self.assertEqual('completed', migration_obj.status)