fix netns deletion of broken namespaces

normal network namespaces are bind-mounted to files under
/var/run/netns. If a process deleting a network namespace gets killed
during that operation there is the chance that the bind mount to the
netns has been removed, but the file under /var/run/netns still exists.

When the neutron-ovn-metadata-agent tries to clean up such network
namespaces it first tires to validate that the network namespace is
empty. For the cases described above this fails, as this network
namespace no longer really exists, but is just a stray file laying
around.

To fix this we treat network namespaces where we get an `OSError` with
errno 22 (Invalid Argument) as empty. The calls to pyroute2 to delete
the namespace will then clean up the file.

Additionally we add a guard to teardown_datapath to continue even if
this fails. failing to remove a datapath is not critical and leaves in
the worst case a process and a network namespace running, however
previously it would have also prevented the creation of new datapaths
which is critical for VM startup.

Closes-Bug: #2037102
Change-Id: I7c43812fed5903f98a2e491076c24a8d926a59b4
This commit is contained in:
Felix Huettner 2023-09-22 16:25:10 +02:00
parent 0c251cce60
commit 566fea3fed
4 changed files with 60 additions and 2 deletions

View File

@ -259,7 +259,22 @@ class IPWrapper(SubProcessBase):
return ip
def namespace_is_empty(self):
return not self.get_devices()
try:
return not self.get_devices()
except OSError as e:
# This can happen if we previously got terminated in the middle of
# removing this namespace. In this case the bind mount of the
# namespace under /var/run/netns will be removed, but the namespace
# file is still there. As the bind mount is gone we can no longer
# access the namespace to validate that it is empty. But since it
# should have already been removed we are sure that the check has
# passed the last time and since the namespace is unuseable that
# can not have changed.
# Future calls to pyroute2 to remove that namespace will clean up
# the leftover file.
if e.errno == errno.EINVAL:
return True
raise e
def garbage_collect_namespace(self):
"""Conditionally destroy the namespace if it is empty."""

View File

@ -490,7 +490,10 @@ class MetadataAgent(object):
ns.startswith(NS_PREFIX) and
ns not in metadata_namespaces]
for ns in unused_namespaces:
self.teardown_datapath(self._get_datapath_name(ns))
try:
self.teardown_datapath(self._get_datapath_name(ns))
except Exception:
LOG.exception('Error unable to destroy namespace: %s', ns)
# resync all network namespaces based on the associated datapaths,
# even those that are already running. This is to make sure

View File

@ -357,6 +357,21 @@ class TestIpWrapper(base.BaseTestCase):
self.assertNotIn(mock.call().delete('ns'),
ip_ns_cmd_cls.mock_calls)
def test_garbage_collect_namespace_existing_broken(self):
with mock.patch.object(ip_lib, 'IpNetnsCommand') as ip_ns_cmd_cls:
ip_ns_cmd_cls.return_value.exists.return_value = True
ip = ip_lib.IPWrapper(namespace='ns')
with mock.patch.object(ip, 'get_devices',
side_effect=OSError(errno.EINVAL, None)
) as mock_get_devices:
self.assertTrue(ip.garbage_collect_namespace())
mock_get_devices.assert_called_once_with()
expected = [mock.call().delete('ns')]
ip_ns_cmd_cls.assert_has_calls(expected)
@mock.patch.object(priv_lib, 'create_interface')
def test_add_vlan(self, create):
retval = ip_lib.IPWrapper().add_vlan('eth0.1', 'eth0', '1')

View File

@ -138,6 +138,31 @@ class TestMetadataAgent(base.BaseTestCase):
lnn.assert_called_once_with()
tdp.assert_called_once_with('3')
def test_sync_teardown_namespace_does_not_crash_on_error(self):
"""Test that sync tears down unneeded metadata namespaces.
Even if that fails it continues to provision other datapaths
"""
with mock.patch.object(
self.agent, 'provision_datapath') as pdp,\
mock.patch.object(
ip_lib, 'list_network_namespaces',
return_value=['ovnmeta-1', 'ovnmeta-2', 'ovnmeta-3',
'ns1', 'ns2']) as lnn,\
mock.patch.object(
self.agent, 'teardown_datapath',
side_effect=Exception()) as tdp:
self.agent.sync()
pdp.assert_has_calls(
[
mock.call(p.datapath)
for p in self.ports
],
any_order=True
)
lnn.assert_called_once_with()
tdp.assert_called_once_with('3')
def test_get_networks_datapaths(self):
"""Test get_networks_datapaths returns only datapath objects for the
networks containing vif ports of type ''(blank) and 'external'.