Gracefully ERROR in _init_instance if vnic_type changed

If the vnic_type of a bound port changes from "direct" to "macvtap" and
then the compute service is restarted then during _init_instance nova
tries to plug the vif of the changed port. However as it now has macvtap
vnic_type nova tries to look up the netdev of the parent VF. Still that
VF is consumed by the instance so there is no such netdev on the host
OS. This error killed the compute service at startup due to unhandled
exception. This patch adds the exception handler, logs an ERROR and
continue initializing other instances on the host.

Also this patch adds a detailed ERROR log when nova detects that the
vnic_type changed during _heal_instance_info_cache periodic.

Closes-Bug: #1981813
Change-Id: I1719f8eda04e8d15a3b01f0612977164c4e55e85
This commit is contained in:
Balazs Gibizer 2022-07-15 13:48:46 +02:00
parent f8c91eb75f
commit e43bf900dc
6 changed files with 250 additions and 5 deletions

View File

@ -1247,6 +1247,20 @@ class ComputeManager(manager.Manager):
'updated.', instance=instance)
self._set_instance_obj_error_state(instance)
return
except exception.PciDeviceNotFoundById:
# This is bug 1981813 where the bound port vnic_type has changed
# from direct to macvtap. Nova does not support that and it
# already printed an ERROR when the change is detected during
# _heal_instance_info_cache. Now we print an ERROR again and skip
# plugging the vifs but let the service startup continue to init
# the other instances
LOG.exception(
'Virtual interface plugging failed for instance. Probably the '
'vnic_type of the bound port has been changed. Nova does not '
'support such change.',
instance=instance
)
return
if instance.task_state == task_states.RESIZE_MIGRATING:
# We crashed during resize/migration, so roll back for safety

View File

@ -3356,6 +3356,25 @@ class API:
delegate_create=True,
)
def _log_error_if_vnic_type_changed(
self, port_id, old_vnic_type, new_vnic_type, instance
):
if old_vnic_type and old_vnic_type != new_vnic_type:
LOG.error(
'The vnic_type of the bound port %s has '
'been changed in neutron from "%s" to '
'"%s". Changing vnic_type of a bound port '
'is not supported by Nova. To avoid '
'breaking the connectivity of the instance '
'please change the port vnic_type back to '
'"%s".',
port_id,
old_vnic_type,
new_vnic_type,
old_vnic_type,
instance=instance
)
def _build_network_info_model(self, context, instance, networks=None,
port_ids=None, admin_client=None,
preexisting_port_ids=None,
@ -3429,6 +3448,12 @@ class API:
preexisting_port_ids)
for index, vif in enumerate(nw_info):
if vif['id'] == refresh_vif_id:
self._log_error_if_vnic_type_changed(
vif['id'],
vif['vnic_type'],
refreshed_vif['vnic_type'],
instance,
)
# Update the existing entry.
nw_info[index] = refreshed_vif
LOG.debug('Updated VIF entry in instance network '
@ -3478,6 +3503,7 @@ class API:
networks, port_ids = self._gather_port_ids_and_networks(
context, instance, networks, port_ids, client)
old_nw_info = instance.get_network_info()
nw_info = network_model.NetworkInfo()
for port_id in port_ids:
current_neutron_port = current_neutron_port_map.get(port_id)
@ -3485,6 +3511,14 @@ class API:
vif = self._build_vif_model(
context, client, current_neutron_port, networks,
preexisting_port_ids)
for old_vif in old_nw_info:
if old_vif['id'] == port_id:
self._log_error_if_vnic_type_changed(
port_id,
old_vif['vnic_type'],
vif['vnic_type'],
instance,
)
nw_info.append(vif)
elif nw_info_refresh:
LOG.info('Port %s from network info_cache is no '

View File

@ -1077,6 +1077,14 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase):
self.host_mappings['compute1'].cell_mapping
) as cctxt:
compute.manager._heal_instance_info_cache(cctxt)
self.assertIn(
'The vnic_type of the bound port %s has been changed in '
'neutron from "direct" to "macvtap". Changing vnic_type of a '
'bound port is not supported by Nova. To avoid breaking the '
'connectivity of the instance please change the port '
'vnic_type back to "direct".' % port['id'],
self.stdlog.logger.output,
)
def fake_get_ifname_by_pci_address(pci_addr: str, pf_interface=False):
# we want to fail the netdev lookup only if the pci_address is
@ -1103,15 +1111,16 @@ class SRIOVServersTest(_PCIServersWithMigrationTestBase):
self.libvirt.mock_get_ifname_by_pci_address.side_effect = (
fake_get_ifname_by_pci_address
)
# This is bug 1981813 as the compute service fails to start with an
# exception.
# Nova cannot prevent the vnic_type change on a bound port. Neutron
# should prevent that instead. But the nova-compute should still
# be able to start up and only log an ERROR for this instance in
# inconsistent state.
self.assertRaises(
exception.PciDeviceNotFoundById,
self.restart_compute_service, 'compute1'
self.restart_compute_service('compute1')
self.assertIn(
'Virtual interface plugging failed for instance. Probably the '
'vnic_type of the bound port has been changed. Nova does not '
'support such change.',
self.stdlog.logger.output,
)

View File

@ -1350,6 +1350,36 @@ class ComputeManagerUnitTestCase(test.NoDBTestCase,
self.compute._init_instance(self.context, instance)
set_error_state.assert_called_once_with(instance)
def test_init_instance_vif_plug_fails_missing_pci(self):
instance = fake_instance.fake_instance_obj(
self.context,
uuid=uuids.instance,
info_cache=None,
power_state=power_state.RUNNING,
vm_state=vm_states.ACTIVE,
task_state=None,
host=self.compute.host,
expected_attrs=['info_cache'])
with test.nested(
mock.patch.object(context, 'get_admin_context',
return_value=self.context),
mock.patch.object(objects.Instance, 'get_network_info',
return_value=network_model.NetworkInfo()),
mock.patch.object(self.compute.driver, 'plug_vifs',
side_effect=exception.PciDeviceNotFoundById("pci-addr")),
mock.patch("nova.compute.manager.LOG.exception"),
) as (get_admin_context, get_nw_info, plug_vifs, log_exception):
# as this does not raise, we are sure that the compute service
# continues initializing the rest of the instances
self.compute._init_instance(self.context, instance)
log_exception.assert_called_once_with(
"Virtual interface plugging failed for instance. Probably the "
"vnic_type of the bound port has been changed. Nova does not "
"support such change.",
instance=instance
)
def _test__validate_pinning_configuration(self, supports_pcpus=True):
instance_1 = fake_instance.fake_instance_obj(
self.context, uuid=uuids.instance_1)

View File

@ -3382,6 +3382,155 @@ class TestAPI(TestAPIBase):
mocked_client.list_ports.assert_called_once_with(
tenant_id=uuids.fake, device_id=uuids.instance)
@mock.patch.object(
neutronapi.API,
'_get_physnet_tunneled_info',
new=mock.Mock(return_value=(None, False)))
@mock.patch.object(
neutronapi.API,
'_get_preexisting_port_ids',
new=mock.Mock(return_value=[]))
@mock.patch.object(
neutronapi.API,
'_get_subnets_from_port',
new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')]))
@mock.patch.object(
neutronapi.API,
'_get_floating_ips_by_fixed_and_port',
new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}]))
@mock.patch.object(neutronapi, 'get_client')
def test_build_network_info_model_full_vnic_type_change(
self, mock_get_client
):
mocked_client = mock.create_autospec(client.Client)
mock_get_client.return_value = mocked_client
fake_inst = objects.Instance()
fake_inst.project_id = uuids.fake
fake_inst.uuid = uuids.instance
fake_ports = [
{
"id": "port1",
"network_id": "net-id",
"tenant_id": uuids.fake,
"admin_state_up": True,
"status": "ACTIVE",
"fixed_ips": [{"ip_address": "1.1.1.1"}],
"mac_address": "de:ad:be:ef:00:01",
"binding:vif_type": model.VIF_TYPE_BRIDGE,
"binding:vnic_type": model.VNIC_TYPE_DIRECT,
"binding:vif_details": {},
},
]
mocked_client.list_ports.return_value = {'ports': fake_ports}
fake_inst.info_cache = objects.InstanceInfoCache.new(
self.context, uuids.instance)
fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([])
# build the network info first
nw_infos = self.api._build_network_info_model(
self.context,
fake_inst,
force_refresh=True,
)
self.assertEqual(1, len(nw_infos))
fake_inst.info_cache.network_info = nw_infos
# change the vnic_type of the port and rebuild the network info
fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP
with mock.patch(
"nova.network.neutron.API._log_error_if_vnic_type_changed"
) as mock_log:
nw_infos = self.api._build_network_info_model(
self.context,
fake_inst,
force_refresh=True,
)
mock_log.assert_called_once_with(
fake_ports[0]["id"], "direct", "macvtap", fake_inst)
self.assertEqual(1, len(nw_infos))
@mock.patch.object(
neutronapi.API,
'_get_physnet_tunneled_info',
new=mock.Mock(return_value=(None, False)))
@mock.patch.object(
neutronapi.API,
'_get_preexisting_port_ids',
new=mock.Mock(return_value=[]))
@mock.patch.object(
neutronapi.API,
'_get_subnets_from_port',
new=mock.Mock(return_value=[model.Subnet(cidr='1.0.0.0/8')]))
@mock.patch.object(
neutronapi.API,
'_get_floating_ips_by_fixed_and_port',
new=mock.Mock(return_value=[{'floating_ip_address': '10.0.0.1'}]))
@mock.patch.object(neutronapi, 'get_client')
def test_build_network_info_model_single_vnic_type_change(
self, mock_get_client
):
mocked_client = mock.create_autospec(client.Client)
mock_get_client.return_value = mocked_client
fake_inst = objects.Instance()
fake_inst.project_id = uuids.fake
fake_inst.uuid = uuids.instance
fake_ports = [
{
"id": "port1",
"network_id": "net-id",
"tenant_id": uuids.fake,
"admin_state_up": True,
"status": "ACTIVE",
"fixed_ips": [{"ip_address": "1.1.1.1"}],
"mac_address": "de:ad:be:ef:00:01",
"binding:vif_type": model.VIF_TYPE_BRIDGE,
"binding:vnic_type": model.VNIC_TYPE_DIRECT,
"binding:vif_details": {},
},
]
fake_nets = [
{
"id": "net-id",
"name": "foo",
"tenant_id": uuids.fake,
}
]
mocked_client.list_ports.return_value = {'ports': fake_ports}
fake_inst.info_cache = objects.InstanceInfoCache.new(
self.context, uuids.instance)
fake_inst.info_cache.network_info = model.NetworkInfo.hydrate([])
# build the network info first
nw_infos = self.api._build_network_info_model(
self.context,
fake_inst,
fake_nets,
[fake_ports[0]["id"]],
refresh_vif_id=fake_ports[0]["id"],
)
self.assertEqual(1, len(nw_infos))
fake_inst.info_cache.network_info = nw_infos
# change the vnic_type of the port and rebuild the network info
fake_ports[0]["binding:vnic_type"] = model.VNIC_TYPE_MACVTAP
with mock.patch(
"nova.network.neutron.API._log_error_if_vnic_type_changed"
) as mock_log:
nw_infos = self.api._build_network_info_model(
self.context,
fake_inst,
fake_nets,
[fake_ports[0]["id"]],
refresh_vif_id=fake_ports[0]["id"],
)
mock_log.assert_called_once_with(
fake_ports[0]["id"], "direct", "macvtap", fake_inst)
self.assertEqual(1, len(nw_infos))
@mock.patch.object(neutronapi, 'get_client')
def test_get_subnets_from_port(self, mock_get_client):
mocked_client = mock.create_autospec(client.Client)

View File

@ -0,0 +1,9 @@
---
fixes:
- |
`Bug #1981813 <https://bugs.launchpad.net/nova/+bug/1981813>`_: Now nova
detects if the ``vnic_type`` of a bound port has been changed in neutron
and leaves an ERROR message in the compute service log as such change on a
bound port is not supported. Also the restart of the nova-compute service
will not crash any more after such port change. Nova will log an ERROR and
skip the initialization of the instance with such port during the startup.