Do not dispose local_vlan_hints

After first iteration, neutron ovs agent was releasing the vlan hints
back into available vlan.

This could be an issue if neutron agent failed to configure a device
during that first loop.

Example of failure: a RPC call like get_device_details.

When such failure occur, the vlan is not removed from local_vlan_hints
when configuring the device, but the hint is anyway restored in
available vlans at the end of the iteration.

In such situation, a new port can be plugged into br-int reusing a vlan
that is still used by a previous port.

Not disposing the local_vlan_hints after first iteration allow neutron
to keep looking into the hints on next iteration, until it succeed.

Anyway, having a local_vlan_hints not empty after an iteration is an
error and should be logged so operator can eventually fix ovs manually
on some edge cases.

Change-Id: Id2e8aca7c52359eee7c4fd8006d93c97c84f817a
Signed-off-by: Arnaud Morin <arnaud.morin@ovhcloud.com>
This commit is contained in:
Arnaud Morin 2023-04-13 16:54:14 +02:00 committed by Arnaud Morin
parent 8c6ad0f49e
commit 598c05ab6a
2 changed files with 103 additions and 5 deletions

View File

@ -491,10 +491,6 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
self.available_local_vlans.remove(local_vlan)
self._local_vlan_hints[key] = local_vlan
def _dispose_local_vlan_hints(self):
self.available_local_vlans.update(self._local_vlan_hints.values())
self._local_vlan_hints = {}
def _reset_tunnel_ofports(self):
self.tun_br_ofports = {n_const.TYPE_GENEVE: {},
n_const.TYPE_GRE: {},
@ -2888,7 +2884,19 @@ class OVSNeutronAgent(l2population_rpc.L2populationRpcCallBackTunnelMixin,
self.update_retries_map_and_remove_devs_not_to_retry(
failed_devices, failed_ancillary_devices,
failed_devices_retries_map))
self._dispose_local_vlan_hints()
if len(self._local_vlan_hints):
LOG.warning(
"Agent rpc_loop - iteration:%(iter_num)d - "
"the process was not able to clean local vlans "
"(%(nb_hints)s failed). Please ensure that you "
"do not have dead ports in the integration "
"bridge.",
{'iter_num': self.iter_num,
'nb_hints': len(self._local_vlan_hints)})
LOG.debug(
"local_vlan_hints: %s", self._local_vlan_hints
)
except Exception:
LOG.exception("Error while processing VIF ports")
# Put the ports back in self.updated_port

View File

@ -83,6 +83,7 @@ class FakeVif(object):
ofport = 99
port_name = 'name'
vif_mac = 'aa:bb:cc:11:22:33'
vif_id = 'dead-beaf'
class MockFixedIntervalLoopingCall(object):
@ -2486,6 +2487,95 @@ class TestOvsNeutronAgent(object):
cleanup.assert_not_called()
rpc_stop.assert_called_once()
def _test_rpc_loop_hints(self, devices, failed_devices, hints):
devices_details = []
vifs = {}
for device in devices:
details = {
'admin_state_up': True,
'port_id': mock.Mock(),
'device': device,
'network_id': device + 'net',
'physical_network': 'physnet42',
'segmentation_id': '4242',
'network_type': n_const.TYPE_LOCAL,
'fixed_ips': [{
'subnet_id': '8a66d0cc-2950-dead-beef-d090140d3607',
'ip_address': '1.1.1.1'}],
'device_owner': DEVICE_OWNER_COMPUTE,
}
devices_details.append(details)
vifs[device] = FakeVif()
port_info = {'current': set(),
'added': set(devices + failed_devices)}
with mock.patch.object(
self.agent,
'process_port_info') as process_port_info, \
mock.patch.object(
self.agent,
'_check_and_handle_signal', side_effect=[True, False]), \
mock.patch.object(
self.agent,
'check_ovs_status',
return_value=ovs_constants.OVS_RESTARTED), \
mock.patch.object(
self.agent,
'_handle_ovs_restart'), \
mock.patch.object(
self.agent,
'cleanup_stale_flows'), \
mock.patch.object(
self.agent.plugin_rpc,
'get_devices_details_list_and_failed_devices') as gddl, \
mock.patch.object(
self.agent.int_br,
'get_vifs_by_ids') as get_vifs_by_ids:
# Simulate a restart
self.agent.fullsync = True
self.agent._local_vlan_hints = hints
# Populate fake info
process_port_info.return_value = (port_info, set(), 0, set())
gddl.return_value = {
'devices': devices_details,
'failed_devices': failed_devices,
}
get_vifs_by_ids.return_value = vifs
# Run the rpc_loop
self.agent.rpc_loop(polling_manager=mock.Mock())
def test_rpc_loop_hints_all_used(self):
devices = ['tap1234', 'tap2345']
failed_devices = []
hints = {'tap1234net/4242': 42, 'tap2345net/4242': 43}
self._test_rpc_loop_hints(
devices=devices, failed_devices=failed_devices, hints=hints)
# Assert that we do not have any value in the hints
self.assertEqual(0, len(self.agent._local_vlan_hints))
def test_rpc_loop_hints_one_left(self):
devices = ['tap1234']
failed_devices = []
hints = {'tap1234net/4242': 42, 'tap2345net/4242': 43}
self._test_rpc_loop_hints(
devices=devices, failed_devices=failed_devices, hints=hints)
# Assert that we have one value left in the hints
self.assertEqual(1, len(self.agent._local_vlan_hints))
def test_rpc_loop_hints_with_failed_devices(self):
devices = ['tap1234']
failed_devices = ['tap2345']
hints = {'tap1234net/4242': 42, 'tap2345net/4242': 43}
self._test_rpc_loop_hints(
devices=devices, failed_devices=failed_devices, hints=hints)
# Assert that we have one value left in the hints
self.assertEqual(1, len(self.agent._local_vlan_hints))
def test_set_rpc_timeout(self):
with mock.patch.object(n_rpc.BackingOffClient,
'set_max_timeout') as smt: