Add a WA flag waiting for vif-plugged event during reboot

The libvirt driver's power on and hard reboot first destroys the domain
and unplugs the vifs, then recreates the domain and replugs the vifs.
However, nova does not wait for the network-vif-plugged event before
unpausing the domain. This can cause the domain to start running and
request an IP via DHCP before the networking backend has finished
plugging the vifs.

So this patch adds a workaround config option to nova to wait for
network-vif-plugged events during hard reboot, the same way nova waits
for these events during new instance spawn.

This logic cannot be enabled unconditionally as not all neutron
networking backends send plug time events to wait for. Also the logic
needs to be vnic_type dependent as ml2/ovs and the in-tree sriov backend
are often deployed together on the same compute. While ml2/ovs sends
plug time events, the sriov backend does not send them reliably. So the
configuration is not just a boolean flag but a list of vnic_types
instead. This way waiting for the plug time event is possible for a vif
handled by ml2/ovs while the instance has other vifs handled by the
sriov backend where no event can be expected.
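
For example (illustrative value, not a recommendation), an operator
running only ml2/ovs on a compute host can opt in via nova.conf:

    [workarounds]
    wait_for_vif_plugged_event_during_hard_reboot = normal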

Change-Id: Ie904d1513b5cf76d6d5f6877545e8eb378dd5499
Closes-Bug: #1946729
Author: Balazs Gibizer
Date:   2021-10-11 14:41:37 +02:00
Parent: fdfdba2658
Commit: 68c970ea99
5 changed files with 148 additions and 3 deletions


@@ -245,6 +245,12 @@
# reduce the number of placement calls in steady state. Added in
# Stein.
resource_provider_association_refresh: 0
workarounds:
# This workaround is an improvement on hard reboot that cannot be
# turned on unconditionally. But we know that ml2/ovs sends plug time
# events so we can enable this in this ovs job for vnic_type normal
wait_for_vif_plugged_event_during_hard_reboot: normal
$NOVA_CONF:
quota:
# Added in Train.


@@ -299,6 +299,65 @@ cases the correct fix is to update the guest image kernel to one that is
patched however in some cases this is not possible. This workaround allows the
emulation of an apic to be disabled per host however it is not recommended to
use outside of a CI or developer cloud.
"""),
cfg.ListOpt('wait_for_vif_plugged_event_during_hard_reboot',
item_type=cfg.types.String(
choices=[
"normal",
"direct",
"macvtap",
"baremetal",
"direct-physical",
"virtio-forwarder",
"smart-nic",
"vdpa",
"accelerator-direct",
"accelerator-direct-physical",
]),
default=[],
help="""
The libvirt virt driver implements power on and hard reboot by tearing down
every vif of the instance being rebooted and then plugging them again. By
default nova does not wait for the network-vif-plugged event from neutron
before it lets the instance run. This can cause the instance to request its IP
via DHCP before the neutron backend has finished setting up the networking for
the vif after the plug.
This flag defines which vifs nova expects network-vif-plugged events from
during hard reboot. The possible values are neutron port vnic types:
* normal
* direct
* macvtap
* baremetal
* direct-physical
* virtio-forwarder
* smart-nic
* vdpa
* accelerator-direct
* accelerator-direct-physical
Adding a ``vnic_type`` to this configuration makes Nova wait for a
network-vif-plugged event for each of the instance's vifs having the specific
``vnic_type`` before unpausing the instance, similarly to how new instance
creation works.
Please note that not all neutron networking backends send plug time events for
every ``vnic_type``; therefore this config is empty by default.
The ml2/ovs and the networking-odl backends are known to send plug time events
for ports with ``normal`` ``vnic_type`` so it is safe to add ``normal`` to this
config if you are using only those backends in the compute host.
The neutron in-tree SRIOV backend does not reliably send the network-vif-plugged
event at plug time for ports with ``direct`` vnic_type and never sends that
event at plug time for ports with ``direct-physical`` vnic_type. For
other ``vnic_type`` and backend pairs, please consult the developers of the
backend.
Related options:
* :oslo.config:option:`DEFAULT.vif_plugging_timeout`
"""),
]
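
For illustration only, a minimal standalone sketch (not nova code; the option
name is reused from the patch, the rest is simplified) of how such an
oslo.config ListOpt with constrained string items can be registered and read:

    from oslo_config import cfg

    opts = [
        cfg.ListOpt(
            'wait_for_vif_plugged_event_during_hard_reboot',
            item_type=cfg.types.String(choices=['normal', 'direct']),
            default=[]),
    ]

    conf = cfg.ConfigOpts()
    conf.register_opts(opts, group='workarounds')
    conf(args=[])  # no config files; rely on defaults and overrides

    # Simulate an operator opting in for two vnic_types; a value outside
    # the declared choices would be rejected by oslo.config.
    conf.set_override(
        'wait_for_vif_plugged_event_during_hard_reboot',
        ['normal', 'direct'], group='workarounds')
    print(conf.workarounds.wait_for_vif_plugged_event_during_hard_reboot)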


@@ -16221,7 +16221,48 @@ class LibvirtConnTestCase(test.NoDBTestCase,
accel_info=accel_info)
mock_create_guest_with_network.assert_called_once_with(self.context,
dummyxml, instance, network_info, block_device_info,
vifs_already_plugged=True)
vifs_already_plugged=True, external_events=[])
@mock.patch('oslo_utils.fileutils.ensure_tree', new=mock.Mock())
@mock.patch('nova.virt.libvirt.LibvirtDriver.get_info')
@mock.patch('nova.virt.libvirt.LibvirtDriver._create_guest_with_network')
@mock.patch('nova.virt.libvirt.LibvirtDriver._get_guest_xml')
@mock.patch('nova.virt.libvirt.LibvirtDriver.destroy', new=mock.Mock())
@mock.patch(
'nova.virt.libvirt.LibvirtDriver._get_all_assigned_mediated_devices',
new=mock.Mock(return_value={}))
def test_hard_reboot_wait_for_plug(
self, mock_get_guest_xml, mock_create_guest_with_network, mock_get_info
):
self.flags(
group="workarounds",
wait_for_vif_plugged_event_during_hard_reboot=["normal"])
self.context.auth_token = None
instance = objects.Instance(**self.test_instance)
network_info = _fake_network_info(self, num_networks=4)
network_info[0]["vnic_type"] = "normal"
network_info[1]["vnic_type"] = "direct"
network_info[2]["vnic_type"] = "normal"
network_info[3]["vnic_type"] = "direct-physical"
block_device_info = None
return_values = [hardware.InstanceInfo(state=power_state.SHUTDOWN),
hardware.InstanceInfo(state=power_state.RUNNING)]
mock_get_info.side_effect = return_values
mock_get_guest_xml.return_value = mock.sentinel.xml
drvr = libvirt_driver.LibvirtDriver(fake.FakeVirtAPI(), False)
drvr._hard_reboot(
self.context, instance, network_info, block_device_info)
mock_create_guest_with_network.assert_called_once_with(
self.context, mock.sentinel.xml, instance, network_info,
block_device_info,
vifs_already_plugged=False,
external_events=[
('network-vif-plugged', uuids.vif1),
('network-vif-plugged', uuids.vif3),
]
)
@mock.patch('oslo_utils.fileutils.ensure_tree')
@mock.patch('oslo_service.loopingcall.FixedIntervalLoopingCall')
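
Assuming the new test lives where the class name suggests
(nova/tests/unit/virt/libvirt/test_driver.py), it can be run on its own with
something like:

    tox -e py38 -- nova.tests.unit.virt.libvirt.test_driver.LibvirtConnTestCase.test_hard_reboot_wait_for_plug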


@@ -3817,11 +3817,32 @@ class LibvirtDriver(driver.ComputeDriver):
# on which vif type we're using and we are working with a stale network
# info cache here, so won't rely on waiting for neutron plug events.
# vifs_already_plugged=True means "do not wait for neutron plug events"
external_events = []
vifs_already_plugged = True
event_expected_for_vnic_types = (
CONF.workarounds.wait_for_vif_plugged_event_during_hard_reboot)
if event_expected_for_vnic_types:
# NOTE(gibi): We unplugged every vif during destroy above and we
# will replug them with _create_guest_with_network. As the
# workaround config has some vnic_types configured we expect
# vif-plugged events for every vif with those vnic_types.
# TODO(gibi): only wait for events if we know that the networking
# backend sends plug time events. For that we need to finish
# https://bugs.launchpad.net/neutron/+bug/1821058 first in Neutron
# then create a driver -> plug-time event mapping in nova.
external_events = [
('network-vif-plugged', vif['id'])
for vif in network_info
if vif['vnic_type'] in event_expected_for_vnic_types
]
vifs_already_plugged = False
# NOTE(efried): The instance should already have a vtpm_secret_uuid
# registered if appropriate.
self._create_guest_with_network(
context, xml, instance, network_info, block_device_info,
vifs_already_plugged=True)
vifs_already_plugged=vifs_already_plugged,
external_events=external_events)
def _wait_for_reboot():
"""Called at an interval until the VM is running again."""
@@ -7207,7 +7228,7 @@ class LibvirtDriver(driver.ComputeDriver):
power_on: bool = True,
vifs_already_plugged: bool = False,
post_xml_callback: ty.Callable = None,
external_events: ty.Optional[ty.List[str]] = None,
external_events: ty.Optional[ty.List[ty.Tuple[str, str]]] = None,
cleanup_instance_dir: bool = False,
cleanup_instance_disks: bool = False,
) -> libvirt_guest.Guest:
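
As a side note, here is a minimal standalone sketch (illustrative names, not
nova's actual helpers) of how the (event name, vif id) tuples above are
derived from the configured vnic_type list:

    from typing import Dict, List, Tuple

    def plug_events_to_wait_for(
        network_info: List[Dict], allowed_vnic_types: List[str],
    ) -> List[Tuple[str, str]]:
        # One ('network-vif-plugged', vif id) pair per vif whose vnic_type
        # is in the configured allow-list; all other vifs are skipped.
        return [
            ('network-vif-plugged', vif['id'])
            for vif in network_info
            if vif['vnic_type'] in allowed_vnic_types
        ]

    vifs = [
        {'id': 'vif-1', 'vnic_type': 'normal'},
        {'id': 'vif-2', 'vnic_type': 'direct-physical'},
    ]
    assert plug_events_to_wait_for(vifs, ['normal']) == [
        ('network-vif-plugged', 'vif-1')]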


@@ -0,0 +1,18 @@
---
issues:
- |
The libvirt virt driver in Nova implements power on and hard reboot by
destroying the domain first and unplugging the vifs, then recreating the
domain and replugging the vifs. However nova does not wait for the
network-vif-plugged event before unpausing the domain. This can cause
the domain to start running and requesting an IP via DHCP before the
networking backend has finished plugging the vifs. The config option
[workarounds]wait_for_vif_plugged_event_during_hard_reboot has been added,
defaulting to an empty list, that can be used to ensure that the libvirt
driver waits for the network-vif-plugged event for vifs with specific
``vnic_type`` before it unpauses the domain during hard reboot. This should
only be used if the deployment uses a networking backend that sends such
events for the given ``vnic_type`` at vif plug time. The ml2/ovs and the
networking-odl Neutron backends are known to send plug time events for ports
with ``normal`` ``vnic_type``. For more information see
https://bugs.launchpad.net/nova/+bug/1946729