From e43bf900dc8ca66578603bed333c56b215b1876e Mon Sep 17 00:00:00 2001 From: Balazs Gibizer Date: Fri, 15 Jul 2022 13:48:46 +0200 Subject: [PATCH] Gracefully ERROR in _init_instance if vnic_type changed If the vnic_type of a bound port changes from "direct" to "macvtap" and then the compute service is restarted then during _init_instance nova tries to plug the vif of the changed port. However as it now has macvtap vnic_type nova tries to look up the netdev of the parent VF. Still that VF is consumed by the instance so there is no such netdev on the host OS. This error killed the compute service at startup due to unhandled exception. This patch adds the exception handler, logs an ERROR and continue initializing other instances on the host. Also this patch adds a detailed ERROR log when nova detects that the vnic_type changed during _heal_instance_info_cache periodic. Closes-Bug: #1981813 Change-Id: I1719f8eda04e8d15a3b01f0612977164c4e55e85 --- nova/compute/manager.py | 14 ++ nova/network/neutron.py | 34 ++++ .../libvirt/test_pci_sriov_servers.py | 19 ++- nova/tests/unit/compute/test_compute_mgr.py | 30 ++++ nova/tests/unit/network/test_neutron.py | 149 ++++++++++++++++++ ...813-vnic-type-change-9f3e16fae885b57f.yaml | 9 ++ 6 files changed, 250 insertions(+), 5 deletions(-) create mode 100644 releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml diff --git a/nova/compute/manager.py b/nova/compute/manager.py index 021c6f9d440c..667455215c80 100644 --- a/nova/compute/manager.py +++ b/nova/compute/manager.py @@ -1247,6 +1247,20 @@ class ComputeManager(manager.Manager): 'updated.', instance=instance) self._set_instance_obj_error_state(instance) return + except exception.PciDeviceNotFoundById: + # This is bug 1981813 where the bound port vnic_type has changed + # from direct to macvtap. Nova does not support that and it + # already printed an ERROR when the change is detected during + # _heal_instance_info_cache. Now we print an ERROR again and skip + # plugging the vifs but let the service startup continue to init + # the other instances + LOG.exception( + 'Virtual interface plugging failed for instance. Probably the ' + 'vnic_type of the bound port has been changed. Nova does not ' + 'support such change.', + instance=instance + ) + return if instance.task_state == task_states.RESIZE_MIGRATING: # We crashed during resize/migration, so roll back for safety diff --git a/nova/network/neutron.py b/nova/network/neutron.py index 5ca7649303fe..27e7d0645535 100644 --- a/nova/network/neutron.py +++ b/nova/network/neutron.py @@ -3356,6 +3356,25 @@ class API: delegate_create=True, ) + def _log_error_if_vnic_type_changed( + self, port_id, old_vnic_type, new_vnic_type, instance + ): + if old_vnic_type and old_vnic_type != new_vnic_type: + LOG.error( + 'The vnic_type of the bound port %s has ' + 'been changed in neutron from "%s" to ' + '"%s". Changing vnic_type of a bound port ' + 'is not supported by Nova. To avoid ' + 'breaking the connectivity of the instance ' + 'please change the port vnic_type back to ' + '"%s".', + port_id, + old_vnic_type, + new_vnic_type, + old_vnic_type, + instance=instance + ) + def _build_network_info_model(self, context, instance, networks=None, port_ids=None, admin_client=None, preexisting_port_ids=None, @@ -3429,6 +3448,12 @@ class API: preexisting_port_ids) for index, vif in enumerate(nw_info): if vif['id'] == refresh_vif_id: + self._log_error_if_vnic_type_changed( + vif['id'], + vif['vnic_type'], + refreshed_vif['vnic_type'], + instance, + ) # Update the existing entry. nw_info[index] = refreshed_vif LOG.debug('Updated VIF entry in instance network ' @@ -3478,6 +3503,7 @@ class API: networks, port_ids = self._gather_port_ids_and_networks( context, instance, networks, port_ids, client) + old_nw_info = instance.get_network_info() nw_info = network_model.NetworkInfo() for port_id in port_ids: current_neutron_port = current_neutron_port_map.get(port_id) @@ -3485,6 +3511,14 @@ class API: vif = self._build_vif_model( context, client, current_neutron_port, networks, preexisting_port_ids) + for old_vif in old_nw_info: + if old_vif['id'] == port_id: + self._log_error_if_vnic_type_changed( + port_id, + old_vif['vnic_type'], + vif['vnic_type'], + instance, + ) nw_info.append(vif) elif nw_info_refresh: LOG.info('Port %s from network info_cache is no ' diff --git a/nova/tests/functional/libvirt/test_pci_sriov_servers.py b/nova/tests/functional/libvirt/test_pci_sriov_servers.py index 3364aeec42db..b32d165e10d2 100644 --- a/nova/tests/functional/libvirt/test_pci_sriov_servers.py +++ b/nova/tests/functional/libvirt/test_pci_sriov_servers.py @@ -1077,6 +1077,14 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase): self.host_mappings['compute1'].cell_mapping ) as cctxt: compute.manager._heal_instance_info_cache(cctxt) + self.assertIn( + 'The vnic_type of the bound port %s has been changed in ' + 'neutron from "direct" to "macvtap". Changing vnic_type of a ' + 'bound port is not supported by Nova. To avoid breaking the ' + 'connectivity of the instance please change the port ' + 'vnic_type back to "direct".' % port['id'], + self.stdlog.logger.output, + ) def fake_get_ifname_by_pci_address(pci_addr: str, pf_interface=False): # we want to fail the netdev lookup only if the pci_address is @@ -1103,15 +1111,16 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase): self.libvirt.mock_get_ifname_by_pci_address.side_effect = ( fake_get_ifname_by_pci_address ) - # This is bug 1981813 as the compute service fails to start with an - # exception. # Nova cannot prevent the vnic_type change on a bound port. Neutron # should prevent that instead. But the nova-compute should still # be able to start up and only log an ERROR for this instance in # inconsistent state. - self.assertRaises( - exception.PciDeviceNotFoundById, - self.restart_compute_service, 'compute1' + self.restart_compute_service('compute1') + self.assertIn( + 'Virtual interface plugging failed for instance. Probably the ' + 'vnic_type of the bound port has been changed. Nova does not ' + 'support such change.', + self.stdlog.logger.output, ) diff --git a/nova/tests/unit/compute/test_compute_mgr.py b/nova/tests/unit/compute/test_compute_mgr.py index 2aadcd106c19..b8eef08d6009 100644 --- a/nova/tests/unit/compute/test_compute_mgr.py +++ b/nova/tests/unit/compute/test_compute_mgr.py @@ -1350,6 +1350,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase, self.compute._init_instance(self.context, instance) set_error_state.assert_called_once_with(instance) + def test_init_instance_vif_plug_fails_missing_pci(self): + instance = fake_instance.fake_instance_obj( + self.context, + uuid=uuids.instance, + info_cache=None, + power_state=power_state.RUNNING, + vm_state=vm_states.ACTIVE, + task_state=None, + host=self.compute.host, + expected_attrs=['info_cache']) + + with test.nested( + mock.patch.object(context, 'get_admin_context', + return_value=self.context), + mock.patch.object(objects.Instance, 'get_network_info', + return_value=network_model.NetworkInfo()), + mock.patch.object(self.compute.driver, 'plug_vifs', + side_effect=exception.PciDeviceNotFoundById("pci-addr")), + mock.patch("nova.compute.manager.LOG.exception"), + ) as (get_admin_context, get_nw_info, plug_vifs, log_exception): + # as this does not raise, we are sure that the compute service + # continues initializing the rest of the instances + self.compute._init_instance(self.context, instance) + log_exception.assert_called_once_with( + "Virtual interface plugging failed for instance. Probably the " + "vnic_type of the bound port has been changed. Nova does not " + "support such change.", + instance=instance + ) + def _test__validate_pinning_configuration(self, supports_pcpus=True): instance_1 = fake_instance.fake_instance_obj( self.context, uuid=uuids.instance_1) diff --git a/nova/tests/unit/network/test_neutron.py b/nova/tests/unit/network/test_neutron.py index 7767981a96a7..eefa7b974fc2 100644 --- a/nova/tests/unit/network/test_neutron.py +++ b/nova/tests/unit/network/test_neutron.py @@ -3382,6 +3382,155 @@ class TestAPI(TestAPIBase): mocked_client.list_ports.assert_called_once_with( tenant_id=uuids.fake, device_id=uuids.instance) + @mock.patch.object( + neutronapi.API, + '_get_physnet_tunneled_info', + new=mock.Mock(return_value=(None, False))) + @mock.patch.object( + neutronapi.API, + '_get_preexisting_port_ids', + new=mock.Mock(return_value=[])) + @mock.patch.object( + neutronapi.API, + '_get_subnets_from_port', + new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')])) + @mock.patch.object( + neutronapi.API, + '_get_floating_ips_by_fixed_and_port', + new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}])) + @mock.patch.object(neutronapi, 'get_client') + def test_build_network_info_model_full_vnic_type_change( + self, mock_get_client + ): + mocked_client = mock.create_autospec(client.Client) + mock_get_client.return_value = mocked_client + fake_inst = objects.Instance() + fake_inst.project_id = uuids.fake + fake_inst.uuid = uuids.instance + fake_ports = [ + { + "id": "port1", + "network_id": "net-id", + "tenant_id": uuids.fake, + "admin_state_up": True, + "status": "ACTIVE", + "fixed_ips": [{"ip_address": "1.1.1.1"}], + "mac_address": "de:ad:be:ef:00:01", + "binding:vif_type": model.VIF_TYPE_BRIDGE, + "binding:vnic_type": model.VNIC_TYPE_DIRECT, + "binding:vif_details": {}, + }, + ] + mocked_client.list_ports.return_value = {'ports': fake_ports} + fake_inst.info_cache = objects.InstanceInfoCache.new( + self.context, uuids.instance) + fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([]) + + # build the network info first + nw_infos = self.api._build_network_info_model( + self.context, + fake_inst, + force_refresh=True, + ) + + self.assertEqual(1, len(nw_infos)) + fake_inst.info_cache.network_info = nw_infos + + # change the vnic_type of the port and rebuild the network info + fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP + with mock.patch( + "nova.network.neutron.API._log_error_if_vnic_type_changed" + ) as mock_log: + nw_infos = self.api._build_network_info_model( + self.context, + fake_inst, + force_refresh=True, + ) + + mock_log.assert_called_once_with( + fake_ports[0]["id"], "direct", "macvtap", fake_inst) + self.assertEqual(1, len(nw_infos)) + + @mock.patch.object( + neutronapi.API, + '_get_physnet_tunneled_info', + new=mock.Mock(return_value=(None, False))) + @mock.patch.object( + neutronapi.API, + '_get_preexisting_port_ids', + new=mock.Mock(return_value=[])) + @mock.patch.object( + neutronapi.API, + '_get_subnets_from_port', + new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')])) + @mock.patch.object( + neutronapi.API, + '_get_floating_ips_by_fixed_and_port', + new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}])) + @mock.patch.object(neutronapi, 'get_client') + def test_build_network_info_model_single_vnic_type_change( + self, mock_get_client + ): + mocked_client = mock.create_autospec(client.Client) + mock_get_client.return_value = mocked_client + fake_inst = objects.Instance() + fake_inst.project_id = uuids.fake + fake_inst.uuid = uuids.instance + fake_ports = [ + { + "id": "port1", + "network_id": "net-id", + "tenant_id": uuids.fake, + "admin_state_up": True, + "status": "ACTIVE", + "fixed_ips": [{"ip_address": "1.1.1.1"}], + "mac_address": "de:ad:be:ef:00:01", + "binding:vif_type": model.VIF_TYPE_BRIDGE, + "binding:vnic_type": model.VNIC_TYPE_DIRECT, + "binding:vif_details": {}, + }, + ] + fake_nets = [ + { + "id": "net-id", + "name": "foo", + "tenant_id": uuids.fake, + } + ] + mocked_client.list_ports.return_value = {'ports': fake_ports} + fake_inst.info_cache = objects.InstanceInfoCache.new( + self.context, uuids.instance) + fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([]) + + # build the network info first + nw_infos = self.api._build_network_info_model( + self.context, + fake_inst, + fake_nets, + [fake_ports[0]["id"]], + refresh_vif_id=fake_ports[0]["id"], + ) + + self.assertEqual(1, len(nw_infos)) + fake_inst.info_cache.network_info = nw_infos + + # change the vnic_type of the port and rebuild the network info + fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP + with mock.patch( + "nova.network.neutron.API._log_error_if_vnic_type_changed" + ) as mock_log: + nw_infos = self.api._build_network_info_model( + self.context, + fake_inst, + fake_nets, + [fake_ports[0]["id"]], + refresh_vif_id=fake_ports[0]["id"], + ) + + mock_log.assert_called_once_with( + fake_ports[0]["id"], "direct", "macvtap", fake_inst) + self.assertEqual(1, len(nw_infos)) + @mock.patch.object(neutronapi, 'get_client') def test_get_subnets_from_port(self, mock_get_client): mocked_client = mock.create_autospec(client.Client) diff --git a/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml b/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml new file mode 100644 index 000000000000..a5a3b7c8c2c9 --- /dev/null +++ b/releasenotes/notes/bug-1981813-vnic-type-change-9f3e16fae885b57f.yaml @@ -0,0 +1,9 @@ +--- +fixes: + - | + `Bug #1981813 `_: Now nova + detects if the ``vnic_type`` of a bound port has been changed in neutron + and leaves an ERROR message in the compute service log as such change on a + bound port is not supported. Also the restart of the nova-compute service + will not crash any more after such port change. Nova will log an ERROR and + skip the initialization of the instance with such port during the startup.