From 542635034882e1b6897e1935f09d6feb6e77d1ce Mon Sep 17 00:00:00 2001 From: Jack Ding Date: Wed, 19 Sep 2018 11:54:44 -0400 Subject: [PATCH] Correct instance port binding for rebuilds The following 2 scenarios could result in an instance with incorrect port binding and cause subsequent rebuilds to fail. If an evacuation of an instance fails part way through, after the point where we reassign the port binding to the new host but before we change the instance host, we end up with the ports assigned to the wrong host. This change adds a check to determine if there's any port binding host mismatches and if so trigger setup of instance network. During recovery of failed hosts, neutron could get overwhelmed and lose messages, for example when active controller was powered-off in the middle of instance evacuations. In this case the vif_type was set to 'binding_failed' or 'unbound'. We subsequently hit "Unsupported VIF type" exception during instance hard_reboot or rebuild, leaving the instance unrecoverable. This commit changes _heal_instance_info_cache periodic task to update port binding if evacuation fails due to above errors so that the instance can be recovered later. Closes-Bug: #1659062 Related-Bug: #1784579 Co-Authored-By: Gerry Kopec Co-Authored-By: Jim Gauld Change-Id: I75fd15ac2a29e420c09499f2c41d11259ca811ae Signed-off-by: Jack Ding --- nova/compute/manager.py | 37 +++++- nova/network/model.py | 2 + nova/network/neutronv2/api.py | 9 +- nova/tests/unit/compute/test_compute.py | 139 +++++++++++++++++++++- nova/tests/unit/network/test_neutronv2.py | 77 +++++++++++- 5 files changed, 252 insertions(+), 12 deletions(-) diff --git a/nova/compute/manager.py b/nova/compute/manager.py index f207c18b51ad..4c550cab137e 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -962,11 +962,10 @@ class ComputeManager(manager.Manager): LOG.debug(e, instance=instance) except exception.VirtualInterfacePlugException: # NOTE(mriedem): If we get here, it could be because the vif_type - # in the cache is "binding_failed" or "unbound". The only way to - # fix this is to try and bind the ports again, which would be - # expensive here on host startup. We could add a check to - # _heal_instance_info_cache to handle this, but probably only if - # the instance task_state is None. + # in the cache is "binding_failed" or "unbound". + # The periodic task _heal_instance_info_cache checks for this + # condition. It should fix this by binding the ports again when + # it gets to this instance. LOG.exception('Virtual interface plugging failed for instance. ' 'The port binding:host_id may need to be manually ' 'updated.', instance=instance) @@ -7116,6 +7115,25 @@ class ComputeManager(manager.Manager): action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST, phase=fields.NotificationPhase.END) + def _require_nw_info_update(self, context, instance): + """Detect whether there is a mismatch in binding:host_id, or + binding_failed or unbound binding:vif_type for any of the instances + ports. + """ + if not utils.is_neutron(): + return False + search_opts = {'device_id': instance.uuid, + 'fields': ['binding:host_id', 'binding:vif_type']} + ports = self.network_api.list_ports(context, **search_opts) + for p in ports['ports']: + if p.get('binding:host_id') != self.host: + return True + vif_type = p.get('binding:vif_type') + if (vif_type == network_model.VIF_TYPE_UNBOUND or + vif_type == network_model.VIF_TYPE_BINDING_FAILED): + return True + return False + @periodic_task.periodic_task( spacing=CONF.heal_instance_info_cache_interval) def _heal_instance_info_cache(self, context): @@ -7194,6 +7212,15 @@ class ComputeManager(manager.Manager): if instance: # We have an instance now to refresh try: + # Fix potential mismatch in port binding if evacuation failed + # after reassigning the port binding to the dest host but + # before the instance host is changed. + # Do this only when instance has no pending task. + if instance.task_state is None and \ + self._require_nw_info_update(context, instance): + LOG.info("Updating ports in neutron", instance=instance) + self.network_api.setup_instance_network_on_host( + context, instance, self.host) # Call to network API to get instance info.. this will # force an update to the instance's info_cache self.network_api.get_instance_nw_info( diff --git a/nova/network/model.py b/nova/network/model.py index 791d39535e01..b89b1ac27d81 100644 --- a/nova/network/model.py +++ b/nova/network/model.py @@ -45,6 +45,8 @@ VIF_TYPE_MACVTAP = 'macvtap' VIF_TYPE_AGILIO_OVS = 'agilio_ovs' VIF_TYPE_BINDING_FAILED = 'binding_failed' VIF_TYPE_VIF = 'vif' +VIF_TYPE_UNBOUND = 'unbound' + # Constants for dictionary keys in the 'vif_details' field in the VIF # class diff --git a/nova/network/neutronv2/api.py b/nova/network/neutronv2/api.py index c012baefd42f..71365b8b4808 100644 --- a/nova/network/neutronv2/api.py +++ b/nova/network/neutronv2/api.py @@ -3273,13 +3273,16 @@ class API(base_api.NetworkAPI): pci_mapping = None port_updates = [] ports = data['ports'] + FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND, + network_model.VIF_TYPE_BINDING_FAILED) for p in ports: updates = {} binding_profile = _get_binding_profile(p) - # If the host hasn't changed, like in the case of resizing to the - # same host, there is nothing to do. - if p.get(BINDING_HOST_ID) != host: + # We need to update the port binding if the host has changed or if + # the binding is clearly wrong due to previous lost messages. + vif_type = p.get('binding:vif_type') + if p.get(BINDING_HOST_ID) != host or vif_type in FAILED_VIF_TYPES: # TODO(gibi): To support ports with resource request during # server move operations we need to take care of 'allocation' # key in the binding profile per binding. diff --git a/nova/tests/unit/compute/test_compute.py b/nova/tests/unit/compute/test_compute.py index 5e7ce951660b..35b41be2beba 100644 --- a/nova/tests/unit/compute/test_compute.py +++ b/nova/tests/unit/compute/test_compute.py @@ -7197,9 +7197,81 @@ class ComputeTestCase(BaseTestCase, mock_is_older.assert_called_once_with(now, CONF.running_deleted_instance_timeout) + @mock.patch('nova.network.neutronv2.api.API.list_ports') + @mock.patch.object(nova.utils, 'is_neutron') + def test_require_nw_info_update_not_neutron(self, mock_is_neutron, + mock_list_ports): + ctxt = context.get_admin_context() + instance = self._create_fake_instance_obj() + mock_is_neutron.return_value = False + val = self.compute._require_nw_info_update(ctxt, instance) + self.assertFalse(val) + mock_list_ports.assert_not_called() + + @mock.patch('nova.network.neutronv2.api.API.list_ports') + @mock.patch.object(nova.utils, 'is_neutron') + def test_require_nw_info_update_host_match(self, mock_is_neutron, + mock_list_ports): + ctxt = context.get_admin_context() + instance = self._create_fake_instance_obj() + mock_is_neutron.return_value = True + mock_list_ports.return_value = {'ports': [ + {'binding:host_id': self.compute.host, + 'binding:vif_type': 'foo'} + ]} + search_opts = {'device_id': instance.uuid, + 'fields': ['binding:host_id', 'binding:vif_type']} + + val = self.compute._require_nw_info_update(ctxt, instance) + # return false since hosts match and vif_type is not unbound or + # binding_failed + self.assertFalse(val) + mock_list_ports.assert_called_once_with(ctxt, **search_opts) + + @mock.patch('nova.network.neutronv2.api.API.list_ports') + @mock.patch.object(nova.utils, 'is_neutron') + def test_require_nw_info_update_host_mismatch(self, mock_is_neutron, + mock_list_ports): + ctxt = context.get_admin_context() + instance = self._create_fake_instance_obj() + mock_is_neutron.return_value = True + mock_list_ports.return_value = {'ports': [ + {'binding:host_id': 'foo', 'binding:vif_type': 'foo'} + ]} + search_opts = {'device_id': instance.uuid, + 'fields': ['binding:host_id', 'binding:vif_type']} + val = self.compute._require_nw_info_update(ctxt, instance) + self.assertTrue(val) + mock_list_ports.assert_called_once_with(ctxt, **search_opts) + + @mock.patch('nova.network.neutronv2.api.API.list_ports') + @mock.patch.object(nova.utils, 'is_neutron') + def test_require_nw_info_update_failed_vif_types( + self, mock_is_neutron, mock_list_ports): + ctxt = context.get_admin_context() + instance = self._create_fake_instance_obj() + mock_is_neutron.return_value = True + search_opts = {'device_id': instance.uuid, + 'fields': ['binding:host_id', 'binding:vif_type']} + + FAILED_VIF_TYPES = (network_model.VIF_TYPE_UNBOUND, + network_model.VIF_TYPE_BINDING_FAILED) + + for vif_type in FAILED_VIF_TYPES: + mock_list_ports.reset_mock() + mock_list_ports.return_value = {'ports': [ + {'binding:host_id': self.compute.host, + 'binding:vif_type': vif_type} + ]} + val = self.compute._require_nw_info_update(ctxt, instance) + self.assertTrue(val) + mock_list_ports.assert_called_once_with(ctxt, **search_opts) + def _heal_instance_info_cache(self, _get_instance_nw_info_raise=False, - _get_instance_nw_info_raise_cache=False): + _get_instance_nw_info_raise_cache=False, + _require_nw_info_update=False, + _task_state_not_none=False): # Update on every call for the test self.flags(heal_instance_info_cache_interval=-1) ctxt = context.get_admin_context() @@ -7211,10 +7283,13 @@ class ComputeTestCase(BaseTestCase, instance_map[inst_uuid] = fake_instance.fake_db_instance( uuid=inst_uuid, host=CONF.host, created_at=None) # These won't be in our instance since they're not requested + if _task_state_not_none: + instance_map[inst_uuid]['task_state'] = 'foo' instances.append(instance_map[inst_uuid]) call_info = {'get_all_by_host': 0, 'get_by_uuid': 0, - 'get_nw_info': 0, 'expected_instance': None} + 'get_nw_info': 0, 'expected_instance': None, + 'require_nw_info': 0, 'setup_network': 0} def fake_instance_get_all_by_host(context, host, columns_to_join, use_slave=False): @@ -7232,6 +7307,20 @@ class ComputeTestCase(BaseTestCase, columns_to_join) return instance_map[instance_uuid] + def fake_require_nw_info_update(cls, context, instance): + self.assertEqual(call_info['expected_instance']['uuid'], + instance['uuid']) + if call_info['expected_instance']['task_state'] is None: + call_info['require_nw_info'] += 1 + return _require_nw_info_update + + def fake_setup_instance_network_on_host(cls, context, instance, host): + self.assertEqual(call_info['expected_instance']['uuid'], + instance['uuid']) + if call_info['expected_instance']['task_state'] is None and \ + _require_nw_info_update: + call_info['setup_network'] += 1 + # NOTE(comstud): Override the stub in setUp() def fake_get_instance_nw_info(cls, context, instance, **kwargs): # Note that this exception gets caught in compute/manager @@ -7252,14 +7341,28 @@ class ComputeTestCase(BaseTestCase, fake_instance_get_all_by_host) self.stub_out('nova.db.api.instance_get_by_uuid', fake_instance_get_by_uuid) + + self.stub_out('nova.compute.manager.ComputeManager.' + '_require_nw_info_update', + fake_require_nw_info_update) + if CONF.use_neutron: self.stub_out( 'nova.network.neutronv2.api.API.get_instance_nw_info', fake_get_instance_nw_info) + self.stub_out( + 'nova.network.neutronv2.api.API.' + 'setup_instance_network_on_host', + fake_setup_instance_network_on_host) else: self.stub_out('nova.network.api.API.get_instance_nw_info', fake_get_instance_nw_info) + self.stub_out( + 'nova.network.api.API.setup_instance_network_on_host', + fake_setup_instance_network_on_host) + expected_require_nw_info = 0 + expect_setup_network = 0 # Make an instance appear to be still Building instances[0]['vm_state'] = vm_states.BUILDING # Make an instance appear to be Deleting @@ -7269,12 +7372,26 @@ class ComputeTestCase(BaseTestCase, self.compute._heal_instance_info_cache(ctxt) self.assertEqual(1, call_info['get_all_by_host']) self.assertEqual(0, call_info['get_by_uuid']) + if not _task_state_not_none: + expected_require_nw_info += 1 + self.assertEqual(expected_require_nw_info, + call_info['require_nw_info']) + if _require_nw_info_update and not _task_state_not_none: + expect_setup_network += 1 + self.assertEqual(expect_setup_network, call_info['setup_network']) self.assertEqual(1, call_info['get_nw_info']) call_info['expected_instance'] = instances[3] self.compute._heal_instance_info_cache(ctxt) self.assertEqual(1, call_info['get_all_by_host']) self.assertEqual(1, call_info['get_by_uuid']) + if not _task_state_not_none: + expected_require_nw_info += 1 + self.assertEqual(expected_require_nw_info, + call_info['require_nw_info']) + if _require_nw_info_update and not _task_state_not_none: + expect_setup_network += 1 + self.assertEqual(expect_setup_network, call_info['setup_network']) self.assertEqual(2, call_info['get_nw_info']) # Make an instance switch hosts @@ -7288,6 +7405,13 @@ class ComputeTestCase(BaseTestCase, self.compute._heal_instance_info_cache(ctxt) self.assertEqual(1, call_info['get_all_by_host']) self.assertEqual(4, call_info['get_by_uuid']) + if not _task_state_not_none: + expected_require_nw_info += 1 + self.assertEqual(expected_require_nw_info, + call_info['require_nw_info']) + if _require_nw_info_update and not _task_state_not_none: + expect_setup_network += 1 + self.assertEqual(expect_setup_network, call_info['setup_network']) self.assertEqual(3, call_info['get_nw_info']) # Should be no more left. self.assertEqual(0, len(self.compute._instance_uuids_to_heal)) @@ -7303,6 +7427,9 @@ class ComputeTestCase(BaseTestCase, # Stays the same because we remove invalid entries from the list self.assertEqual(4, call_info['get_by_uuid']) # Stays the same because we didn't find anything to process + self.assertEqual(expected_require_nw_info, + call_info['require_nw_info']) + self.assertEqual(expect_setup_network, call_info['setup_network']) self.assertEqual(3, call_info['get_nw_info']) def test_heal_instance_info_cache(self): @@ -7314,6 +7441,14 @@ class ComputeTestCase(BaseTestCase, def test_heal_instance_info_cache_with_info_cache_exception(self): self._heal_instance_info_cache(_get_instance_nw_info_raise_cache=True) + def test_heal_instance_info_cache_with_port_update(self): + self._heal_instance_info_cache(_require_nw_info_update=True) + + def test_heal_instance_info_cache_with_port_update_instance_not_steady( + self): + self._heal_instance_info_cache(_require_nw_info_update=True, + _task_state_not_none=True) + @mock.patch('nova.objects.InstanceList.get_by_filters') @mock.patch('nova.compute.api.API.unrescue') def test_poll_rescued_instances(self, unrescue, get): diff --git a/nova/tests/unit/network/test_neutronv2.py b/nova/tests/unit/network/test_neutronv2.py index 3a3f10d477b5..d97f6b634060 100644 --- a/nova/tests/unit/network/test_neutronv2.py +++ b/nova/tests/unit/network/test_neutronv2.py @@ -4376,12 +4376,12 @@ class TestNeutronv2WithMock(_TestNeutronv2Common): # a second port profile attribute 'fake_profile' this can be # an sriov port profile attribute or a pci_slot attribute, but for # now we are just using a fake one to show that the code does not - # remove the portbinding_profile it there is one. + # remove the portbinding_profile if there is one. binding_profile = {'fake_profile': 'fake_data', neutronapi.MIGRATING_ATTR: 'my-dest-host'} fake_ports = {'ports': [ {'id': 'fake-port-1', - neutronapi.BINDING_PROFILE: binding_profile, + neutronapi.BINDING_PROFILE: binding_profile, neutronapi.BINDING_HOST_ID: instance.host}]} list_ports_mock = mock.Mock(return_value=fake_ports) get_client_mock.return_value.list_ports = list_ports_mock @@ -4605,6 +4605,79 @@ class TestNeutronv2WithMock(_TestNeutronv2Common): # No ports should be updated if the port's pci binding did not change. update_port_mock.assert_not_called() + @mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock()) + def test_update_port_bindings_for_instance_with_same_host_failed_vif_type( + self, get_client_mock): + instance = fake_instance.fake_instance_obj(self.context) + self.api._has_port_binding_extension = mock.Mock(return_value=True) + list_ports_mock = mock.Mock() + update_port_mock = mock.Mock() + + FAILED_VIF_TYPES = (model.VIF_TYPE_UNBOUND, + model.VIF_TYPE_BINDING_FAILED) + for vif_type in FAILED_VIF_TYPES: + binding_profile = {'fake_profile': 'fake_data', + neutronapi.MIGRATING_ATTR: 'my-dest-host'} + fake_ports = {'ports': [ + {'id': 'fake-port-1', + 'binding:vif_type': 'fake-vif-type', + neutronapi.BINDING_PROFILE: binding_profile, + neutronapi.BINDING_HOST_ID: instance.host}, + {'id': 'fake-port-2', + 'binding:vif_type': vif_type, + neutronapi.BINDING_PROFILE: binding_profile, + neutronapi.BINDING_HOST_ID: instance.host} + ]} + + list_ports_mock.return_value = fake_ports + get_client_mock.return_value.list_ports = list_ports_mock + get_client_mock.return_value.update_port = update_port_mock + update_port_mock.reset_mock() + self.api._update_port_binding_for_instance(self.context, instance, + instance.host) + # Assert that update_port was called on the port with a + # failed vif_type and MIGRATING_ATTR is removed + update_port_mock.assert_called_once_with( + 'fake-port-2', + {'port': {neutronapi.BINDING_HOST_ID: instance.host, + neutronapi.BINDING_PROFILE: { + 'fake_profile': 'fake_data'}, + 'device_owner': 'compute:%s' % + instance.availability_zone + }}) + + @mock.patch.object(neutronapi, 'get_client', return_value=mock.Mock()) + def test_update_port_bindings_for_instance_with_diff_host_unbound_vif_type( + self, get_client_mock): + instance = fake_instance.fake_instance_obj(self.context) + self.api._has_port_binding_extension = mock.Mock(return_value=True) + + binding_profile = {'fake_profile': 'fake_data', + neutronapi.MIGRATING_ATTR: 'my-dest-host'} + fake_ports = {'ports': [ + {'id': 'fake-port-1', + 'binding:vif_type': model.VIF_TYPE_UNBOUND, + neutronapi.BINDING_PROFILE: binding_profile, + neutronapi.BINDING_HOST_ID: instance.host}, + ]} + list_ports_mock = mock.Mock(return_value=fake_ports) + get_client_mock.return_value.list_ports = list_ports_mock + update_port_mock = mock.Mock() + get_client_mock.return_value.update_port = update_port_mock + + self.api._update_port_binding_for_instance(self.context, instance, + 'my-host') + + # Assert that update_port was called on the port with a + # 'unbound' vif_type, host updated and MIGRATING_ATTR is removed + update_port_mock.assert_called_once_with( + 'fake-port-1', {'port': {neutronapi.BINDING_HOST_ID: 'my-host', + neutronapi.BINDING_PROFILE: { + 'fake_profile': 'fake_data'}, + 'device_owner': 'compute:%s' % + instance.availability_zone + }}) + def test_get_pci_mapping_for_migration(self): instance = fake_instance.fake_instance_obj(self.context) instance.migration_context = objects.MigrationContext()