From 2c949cb3eea9cd9282060da12d32771582953aa2 Mon Sep 17 00:00:00 2001 From: Artom Lifshitz Date: Fri, 21 Aug 2020 13:06:58 -0400 Subject: [PATCH] post live migration: don't call Neutron needlessly In bug 1879787, the call to network_api.get_instance_nw_info() in _post_live_migration() on the source compute manager eventually calls out to the Neutron REST API. If this fails, the exception is unhandled, and the migrating instance - which is fully running on the destination at this point - will never be updated in the database. This update normally happens later in post_live_migration_at_destination(). The network_info variable obtained from get_instance_nw_info() is used for two things: notifications - which aren't critical - and unplugging the instance's vifs on the source - which is very important! It turns out that at the time of the get_instance_nw_info() call, the network info in the instance info cache is still valid for unplugging the source vifs. The port bindings on the destination are only activated by the network_api.migrate_instance_start() [1] call that happens shortly *after* the problematic get_instance_nw_info() call. In other words, get_instance_nw_info() will always return the source ports. Because of that, we can replace it with a call to instance.get_network_info(). [1] https://opendev.org/openstack/nova/src/commit/d9e04c4ff0b1a9c3383f1848dc846e93030d83cb/nova/network/neutronv2/api.py#L2493-L2522 Change-Id: If0fbae33ce2af198188c91638afef939256c2556 Closes-bug: 1879787 (cherry picked from commit 6488a5dfb293831a448596e2084f484dd0bfa916) --- nova/compute/manager.py | 8 ++- .../functional/compute/test_live_migration.py | 52 ++++++++----------- nova/tests/unit/compute/test_compute.py | 7 ++- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 814c51152d93..976952505673 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -8409,7 +8409,13 @@ class ComputeManager(manager.Manager): # Releasing vlan. # (not necessary in current implementation?) - network_info = self.network_api.get_instance_nw_info(ctxt, instance) + # NOTE(artom) At this point in time we have not bound the ports to the + # destination host yet (this happens in migrate_instance_start() + # below). Therefore, the "old" source network info that's still in the + # instance info cache is safe to use here, since it'll be used below + # during driver.post_live_migration_at_source() to unplug the VIFs on + # the source. + network_info = instance.get_network_info() self._notify_about_instance_usage(ctxt, instance, "live_migration._post.start", diff --git a/nova/tests/functional/compute/test_live_migration.py b/nova/tests/functional/compute/test_live_migration.py index 19ff4728451f..440e29c82ae2 100644 --- a/nova/tests/functional/compute/test_live_migration.py +++ b/nova/tests/functional/compute/test_live_migration.py @@ -182,7 +182,8 @@ class TestVolAttachmentsDuringLiveMigration( self.assertEqual(1, len(attachments)) -class LiveMigrationNeutronFailure(integrated_helpers._IntegratedTestBase): +class LiveMigrationNeutronInteractionsTest( + integrated_helpers._IntegratedTestBase): # NOTE(artom) We need the admin API to force the host when booting the test # server. ADMIN_API = True @@ -192,38 +193,29 @@ class LiveMigrationNeutronFailure(integrated_helpers._IntegratedTestBase): self._start_compute('src') self._start_compute('dest') - def test_live_migrate_get_nw_info_fails(self): - """Test that if the driver.post_live_migration() call fails (for - example by not being able to connect to Neutron), the exception goes - unhandled and results in the live-migration erroring out. This is bug - 1879787. + def test_live_migrate_vifs_from_info_cache(self): + """Test that bug 1879787 can no longer manifest itself because we get + the network_info from the instance info cache, and not Neutron. """ + def stub_notify(context, instance, event_suffix, + network_info=None, extra_usage_info=None, fault=None): + vif = network_info[0] + # Make sure we have the correct VIF (the NeutronFixture + # deterministically uses port_2 for networks=auto) and that the + # profile does not contain `migrating_to`, indicating that we did + # not obtain it from the Neutron API. + self.assertEqual(self.neutron.port_2['id'], vif['id']) + self.assertNotIn('migrating_to', vif['profile']) + server = self._create_server(networks='auto', host=self.computes['src'].host) - orig_plm = self.computes['src'].manager._post_live_migration - - def stub_plm(*args, **kwargs): - """We simulate a failure in driver.post_live_migration() on the - source by stubbing the source compute's _post_live_migration() with - a method that, within a context that mocks - driver.post_live_migration() to raise an exception, calls the - original compute.manager._post_live_migration(). This is needed to - make sure driver.post_live_migration() raises only once, and only - on the source. - """ - with mock.patch.object(self.computes['src'].manager.network_api, - 'get_instance_nw_info', - side_effect=ConnectionError): - return orig_plm(*args, **kwargs) - with mock.patch.object(self.computes['src'].manager, - '_post_live_migration', - side_effect=stub_plm): - # FIXME(artom) Until bug 1879787 is fixed, the raised - # ConnectionError will go unhandled, the migration will fail, and - # the instance will still be reported as being on the source, even - # though it's actually running on the destination. - self._live_migrate(server, 'error', server_expected_state='ERROR') + '_notify_about_instance_usage', + side_effect=stub_notify) as mock_notify: + self._live_migrate(server, 'completed') server = self.api.get_server(server['id']) - self.assertEqual('src', server['OS-EXT-SRV-ATTR:host']) + self.assertEqual('dest', server['OS-EXT-SRV-ATTR:host']) + # We don't care about call arguments here, we just want to be sure + # our stub actually got called. + mock_notify.assert_called() diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 1db1bff1054f..c1e43ae37d70 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -6562,12 +6562,15 @@ class ComputeTestCase(BaseTestCase, 'clear_events_for_instance'), mock.patch.object(self.compute, 'update_available_resource'), mock.patch.object(migration_obj, 'save'), + mock.patch.object(instance, 'get_network_info'), ) as ( post_live_migration, migrate_instance_start, post_live_migration_at_destination, post_live_migration_at_source, setup_networks_on_host, - clear_events, update_available_resource, mig_save + clear_events, update_available_resource, mig_save, get_nw_info, ): + nw_info = network_model.NetworkInfo.hydrate([]) + get_nw_info.return_value = nw_info self.compute._post_live_migration(c, instance, dest, migrate_data=migrate_data, source_bdms=bdms) @@ -6589,7 +6592,7 @@ class ComputeTestCase(BaseTestCase, post_live_migration_at_destination.assert_has_calls([ mock.call(c, instance, False, dest)]) post_live_migration_at_source.assert_has_calls( - [mock.call(c, instance, [])]) + [mock.call(c, instance, nw_info)]) clear_events.assert_called_once_with(instance) update_available_resource.assert_has_calls([mock.call(c)]) self.assertEqual('completed', migration_obj.status)