Cleanup KuryrPort when Pod is missing

We can easily imagine a user frustrated by their Pod not getting deleted
and opting to remove the finalizer from the Pod. If the cause of the
deletion delay was the kuryr-controller being down, we end up with an
orphaned KuryrPort. At the moment this causes crashes, which it obviously
shouldn't. Moreover, we should clean up the Neutron port when that
happens. This commit does both, as explained below.

1. KuryrPort on_present() will trigger its deletion when it detects that
   the Pod no longer exists.
2. Turns out the security_groups parameter passed to release_vif() was
   never used. I removed it from the drivers and got rid of the
   get_security_groups() call from on_finalize(), as it's no longer
   necessary.
3. When we cannot get the Pod in KuryrPort on_finalize() we attempt to
   gather the info required to clean up the KuryrPort and "mock" a Pod
   object (a minimal sketch of that idea follows this list). As a
   precaution, any error from release_vif() is ignored in that case, so
   that a failed cleanup cannot take the whole controller down.
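
For illustration, a minimal standalone sketch of the "mock a Pod" idea
from point 3. This is not the literal handler code (see the kuryrport.py
hunk below); the host_ip argument stands in for the Neutron lookup that
the real _mock_cleanup_pod() does via utils.get_parent_port_id() and
utils.get_parent_port_ip():

    def mock_cleanup_pod(kuryrport_crd, host_ip):
        """Build a stand-in for a Pod that no longer exists."""
        pod = {
            'apiVersion': 'v1',
            'kind': 'Pod',
            # Reuse the KuryrPort metadata; name/namespace match the Pod.
            'metadata': kuryrport_crd['metadata'].copy(),
            # release_vif() only needs the host IP from the Pod's status.
            'status': {'hostIP': host_ip},
        }
        # The Pod is gone, so there's no finalizer to remove later.
        pod['metadata'].pop('finalizers', None)
        return pod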

Change-Id: Iaf48296ff28394823f68d58362bcc87d38a2cd42
Michał Dulko 2022-08-23 11:54:04 +02:00
parent 21f78897b6
commit 03b98adde2
11 changed files with 139 additions and 140 deletions

View File

@@ -358,7 +358,7 @@ class PodVIFDriver(DriverBase, metaclass=abc.ABCMeta):
         raise NotImplementedError()

     @abc.abstractmethod
-    def release_vif(self, pod, vif, project_id=None, security_groups=None):
+    def release_vif(self, pod, vif, project_id=None):
         """Unlinks Neutron port corresponding to VIF object from pod.

         Implementing drivers must ensure the port is either deleted or made
@@ -367,13 +367,10 @@ class PodVIFDriver(DriverBase, metaclass=abc.ABCMeta):
         :param pod: dict containing Kubernetes Pod object
         :param vif: VIF object as returned by `PodVIFDriver.request_vif`
         :param project_id: OpenStack project ID
-        :param security_groups: list containing security groups'
-                                IDs as returned by
-                                `PodSecurityGroupsDriver.get_security_groups`
         """
         raise NotImplementedError()

-    def release_vifs(self, pods, vifs, project_id=None, security_groups=None):
+    def release_vifs(self, pods, vifs, project_id=None):
         """Unlinks Neutron ports corresponding to VIF objects.

         It follows the same pattern as release_vif but releasing num_ports
@@ -383,9 +380,6 @@ class PodVIFDriver(DriverBase, metaclass=abc.ABCMeta):
         :param vifs: list of VIF objects as returned by
                      `PodVIFDriver.request_vif`
         :param project_id: (optional) OpenStack project ID
-        :param security_groups: (optional) list containing security groups'
-                                IDs as returned by
-                                `PodSecurityGroupsDriver.get_security_groups`
         """
         raise NotImplementedError()

View File

@@ -51,7 +51,7 @@ class NestedDpdkPodVIFDriver(nested_vif.NestedPodVIFDriver):
         # TODO(garyloug): provide an implementation
         raise NotImplementedError()

-    def release_vif(self, pod, vif, project_id=None, security_groups=None):
+    def release_vif(self, pod, vif, project_id=None):
         compute = clients.get_compute_client()
         vm_id = self._get_parent_port(pod).device_id

View File

@@ -65,7 +65,7 @@ class NestedMacvlanPodVIFDriver(nested_vif.NestedPodVIFDriver):
         # TODO(mchiappe): provide an implementation
         raise NotImplementedError()

-    def release_vif(self, pod, vif, project_id=None, security_groups=None):
+    def release_vif(self, pod, vif, project_id=None):
         os_net = clients.get_network_client()
         attempts = kuryr_config.CONF.pod_vif_nested.rev_update_attempts

View File

@@ -157,7 +157,7 @@ class NestedVlanPodVIFDriver(nested_vif.NestedPodVIFDriver):
                       "rechecked when event will be retried.", vif.id)
             raise

-    def release_vif(self, pod, vif, project_id=None, security_groups=None):
+    def release_vif(self, pod, vif, project_id=None):
         os_net = clients.get_network_client()
         parent_port = self._get_parent_port(pod)
         trunk_id = self._get_trunk_id(parent_port)

View File

@@ -93,7 +93,7 @@ class NeutronPodVIFDriver(base.PodVIFDriver):
             vifs.append(vif)
         return vifs

-    def release_vif(self, pod, vif, project_id=None, security_groups=None):
+    def release_vif(self, pod, vif, project_id=None):
         clients.get_network_client().delete_port(vif.id)

     def activate_vif(self, vif, **kwargs):

View File

@@ -315,8 +315,7 @@ class BaseVIFPool(base.VIFPoolDriver, metaclass=abc.ABCMeta):
                 return False
         return True

-    def release_vif(self, pod, vif, project_id, security_groups,
-                    host_addr=None):
+    def release_vif(self, pod, vif, project_id, host_addr=None):
         if not self._recovered_pools:
             LOG.debug("Kuryr-controller not yet ready to remove pods.")
             raise exceptions.ResourceNotReady(pod)
@@ -836,23 +835,7 @@ class NestedVIFPool(BaseVIFPool):
     def set_vif_driver(self, driver):
         self._drv_vif = driver

-    def _get_parent_port_id(self, vif):
-        os_net = clients.get_network_client()
-        tags = []
-        if config.CONF.neutron_defaults.resource_tags:
-            tags = config.CONF.neutron_defaults.resource_tags
-
-        trunks = os_net.trunks(tags=tags)
-        for trunk in trunks:
-            for sp in trunk.sub_ports:
-                if sp['port_id'] == vif.id:
-                    return trunk.port_id
-
-        return None
-
-    def release_vif(self, pod, vif, project_id, security_groups):
+    def release_vif(self, pod, vif, project_id):
         if not self._recovered_pools:
             LOG.debug("Kuryr-controller not yet ready to remove pods.")
             raise exceptions.ResourceNotReady(pod)
@@ -865,17 +848,17 @@ class NestedVIFPool(BaseVIFPool):
                         "determine the IP by calling Neutron.",
                         name)

-        parent_id = self._get_parent_port_id(vif)
+        parent_id = utils.get_parent_port_id(vif)
         if not parent_id:
             LOG.warning("Port %s not found, ignoring its release request.",
                         vif.id)
             return

-        host_addr = self._get_parent_port_ip(parent_id)
+        host_addr = utils.get_parent_port_ip(parent_id)
         LOG.debug("Determined hostIP for pod %s is %s", name, host_addr)

         super(NestedVIFPool, self).release_vif(
-            pod, vif, project_id, security_groups, host_addr=host_addr)
+            pod, vif, project_id, host_addr=host_addr)

     def _set_port_debug(self, port_id, pod):
         os_net = clients.get_network_client()
@@ -967,11 +950,6 @@ class NestedVIFPool(BaseVIFPool):
             self._known_trunk_ids[pool_key] = trunk_id
         return trunk_id

-    def _get_parent_port_ip(self, port_id):
-        os_net = clients.get_network_client()
-        parent_port = os_net.get_port(port_id)
-        return parent_port.fixed_ips[0]['ip_address']
-
     def sync_pools(self):
         super(NestedVIFPool, self).sync_pools()
         # NOTE(ltomasbo): Ensure previously created ports are recovered into

View File

@@ -142,24 +142,52 @@ class KuryrPortHandler(k8s_base.ResourceEventHandler):
                 self._update_services(services, crd_pod_selectors,
                                       project_id)

+    def _mock_cleanup_pod(self, kuryrport_crd):
+        """Mock Pod that doesn't exist anymore for cleanup purposes"""
+        pod = {
+            'apiVersion': 'v1',
+            'kind': 'Pod',
+            'metadata': kuryrport_crd['metadata'].copy(),
+        }
+        # No need to try to delete the finalizer from the Pod later, as
+        # the Pod is gone.
+        del pod['metadata']['finalizers']
+        main_vif = objects.base.VersionedObject.obj_from_primitive(
+            kuryrport_crd['status']['vifs'][constants.DEFAULT_IFNAME]
+            ['vif'])
+        port_id = utils.get_parent_port_id(main_vif)
+        host_ip = utils.get_parent_port_ip(port_id)
+        pod['status'] = {'hostIP': host_ip}
+        return pod

     def on_finalize(self, kuryrport_crd, *args, **kwargs):
         name = kuryrport_crd['metadata']['name']
         namespace = kuryrport_crd['metadata']['namespace']
+        cleanup = False  # If we're doing a cleanup, raise no error.
         try:
             pod = self.k8s.get(f"{constants.K8S_API_NAMESPACES}"
                                f"/{namespace}/pods/{name}")
+            if pod['metadata']['uid'] != kuryrport_crd['spec']['podUid']:
+                # Seems like this is a KuryrPort created for an old Pod
+                # instance with the same name. Cleaning it up instead of a
+                # regular delete.
+                raise k_exc.K8sResourceNotFound(
+                    'Pod %s' % pod['metadata']['uid'])
         except k_exc.K8sResourceNotFound:
-            LOG.error("Pod %s/%s doesn't exists, deleting orphaned KuryrPort",
-                      namespace, name)
-            # TODO(gryf): Free resources
-            try:
+            LOG.warning('Pod for KuryrPort %s was forcibly deleted. '
+                        'Attempting a cleanup before releasing KuryrPort.',
+                        utils.get_res_unique_name(kuryrport_crd))
+            self.k8s.add_event(kuryrport_crd, 'MissingPod',
+                               'Pod does not exist anymore, attempting to '
+                               'cleanup orphaned KuryrPort', 'Warning')
+            if kuryrport_crd['status']['vifs']:
+                pod = self._mock_cleanup_pod(kuryrport_crd)
+                cleanup = True  # Make sure we don't raise on release_vif()
+            else:
+                # Remove the finalizer, most likely ports never got created.
                 self.k8s.remove_finalizer(kuryrport_crd,
                                           constants.KURYRPORT_FINALIZER)
-            except k_exc.K8sClientException as ex:
-                LOG.exception("Failed to remove finalizer from KuryrPort %s",
-                              ex)
-                raise
-            return
+                return

         if ('deletionTimestamp' not in pod['metadata'] and
                 not utils.is_pod_completed(pod)):
@@ -193,21 +221,16 @@ class KuryrPortHandler(k8s_base.ResourceEventHandler):
             self.k8s.add_event(pod, 'SkipingSGDeletion', 'Skipping SG rules '
                                'deletion')
             crd_pod_selectors = []
-        try:
-            security_groups = self._drv_sg.get_security_groups(pod, project_id)
-        except k_exc.ResourceNotReady:
-            # NOTE(ltomasbo): If the namespace object gets deleted first the
-            # namespace security group driver will raise a ResourceNotReady
-            # exception as it cannot access anymore the kuryrnetwork CRD
-            # annotated on the namespace object. In such case we set security
-            # groups to empty list so that if pools are enabled they will be
-            # properly released.
-            security_groups = []

         for data in kuryrport_crd['status']['vifs'].values():
             vif = objects.base.VersionedObject.obj_from_primitive(data['vif'])
-            self._drv_vif_pool.release_vif(pod, vif, project_id,
-                                           security_groups)
+            try:
+                self._drv_vif_pool.release_vif(pod, vif, project_id)
+            except Exception:
+                if not cleanup:
+                    raise
+                LOG.warning('Error when cleaning up VIF %s, ignoring.',
+                            utils.get_res_unique_name(kuryrport_crd))

         if (driver_utils.is_network_policy_enabled() and crd_pod_selectors and
                 oslo_cfg.CONF.octavia_defaults.enforce_sg_rules):
             services = driver_utils.get_services()
@@ -225,12 +248,27 @@ class KuryrPortHandler(k8s_base.ResourceEventHandler):
                                f"/{kuryrport_crd['metadata']['namespace']}"
                                f"/pods"
                                f"/{kuryrport_crd['metadata']['name']}")
-        except k_exc.K8sResourceNotFound as ex:
-            LOG.exception("Failed to get pod: %s", ex)
-            # TODO(gryf): Release resources
-            self.k8s.remove_finalizer(kuryrport_crd,
-                                      constants.KURYRPORT_FINALIZER)
-            raise
+            if pod['metadata']['uid'] != kuryrport_crd['spec']['podUid']:
+                # Seems like this is a KuryrPort created for an old Pod,
+                # deleting it anyway.
+                raise k_exc.K8sResourceNotFound(
+                    'Pod %s' % pod['metadata']['uid'])
+        except k_exc.K8sResourceNotFound:
+            LOG.warning('Pod for KuryrPort %s was forcibly deleted. Deleting '
+                        'the KuryrPort too to attempt resource cleanup.',
+                        utils.get_res_unique_name(kuryrport_crd))
+            self.k8s.add_event(kuryrport_crd, 'MissingPod',
+                               'Pod does not exist anymore, attempting to '
+                               'delete orphaned KuryrPort', 'Warning')
+            try:
+                self.k8s.delete(utils.get_res_link(kuryrport_crd))
+            except k_exc.K8sResourceNotFound:
+                pass
+            except k_exc.K8sClientException:
+                LOG.exception('Error when trying to delete KuryrPort %s. Will '
+                              'retry later.',
+                              utils.get_res_unique_name(kuryrport_crd))
+            return False

         project_id = self._drv_project.get_project(pod)
         security_groups = self._drv_sg.get_security_groups(pod, project_id)

View File

@@ -1322,21 +1322,6 @@ class NestedVIFPool(test_base.TestCase):
                          port_id)
         os_net.delete_port.assert_not_called()

-    def test__get_parent_port_ip(self):
-        cls = vif_pool.NestedVIFPool
-        m_driver = mock.MagicMock(spec=cls)
-        os_net = self.useFixture(k_fix.MockNetworkClient()).client
-        port_id = str(uuid.uuid4())
-        ip_address = mock.sentinel.ip_address
-        port_obj = fake.get_port_obj(ip_address=ip_address)
-        os_net.get_port.return_value = port_obj
-
-        self.assertEqual(ip_address, cls._get_parent_port_ip(m_driver,
-                                                             port_id))
-
     @mock.patch('kuryr_kubernetes.utils.get_subnet')
     def test__get_trunk_info(self, m_get_subnet):
         cls = vif_pool.NestedVIFPool
View File

@@ -45,13 +45,17 @@ class TestKuryrPortHandler(test_base.TestCase):
         self._kp_namespace = mock.sentinel.namespace
         self._kp_uid = mock.sentinel.kp_uid
         self._kp_name = 'pod1'
+        self._pod_uid = 'deadbeef'

         self._pod = {'apiVersion': 'v1',
                      'kind': 'Pod',
-                     'metadata': {'resourceVersion': self._pod_version,
-                                  'name': self._kp_name,
-                                  'deletionTimestamp': mock.sentinel.date,
-                                  'namespace': self._kp_namespace},
+                     'metadata': {
+                         'resourceVersion': self._pod_version,
+                         'name': self._kp_name,
+                         'deletionTimestamp': mock.sentinel.date,
+                         'namespace': self._kp_namespace,
+                         'uid': self._pod_uid,
+                     },
                      'spec': {'nodeName': self._host}}

         self._kp = {
@@ -63,10 +67,11 @@ class TestKuryrPortHandler(test_base.TestCase):
                 'namespace': self._kp_namespace,
                 'labels': {
                     constants.KURYRPORT_LABEL: self._host
-                }
+                },
+                'finalizers': [],
             },
             'spec': {
-                'podUid': 'deadbeef',
+                'podUid': self._pod_uid,
                 'podNodeName': self._host
             },
             'status': {'vifs': {}}
@@ -91,6 +96,7 @@ class TestKuryrPortHandler(test_base.TestCase):
         self._pod_uri = (f"{constants.K8S_API_NAMESPACES}"
                          f"/{self._kp['metadata']['namespace']}/pods/"
                          f"{self._kp['metadata']['name']}")
+        self._kp_uri = utils.get_res_link(self._kp)
         self.useFixture(k_fix.MockNetworkClient())
         self._driver = multi_vif.NoopMultiVIFDriver()
@@ -312,12 +318,18 @@ class TestKuryrPortHandler(test_base.TestCase):
         update_crd.assert_called_once_with(self._kp, self._vifs)
         create_sgr.assert_called_once_with(self._pod)

+    @mock.patch('kuryr_kubernetes.controller.drivers.default_project.'
+                'DefaultPodProjectDriver.get_project')
+    @mock.patch('kuryr_kubernetes.utils.get_parent_port_id')
+    @mock.patch('kuryr_kubernetes.utils.get_parent_port_ip')
     @mock.patch('kuryr_kubernetes.clients.get_kubernetes_client')
     @mock.patch('kuryr_kubernetes.controller.drivers.base.MultiVIFDriver.'
                 'get_enabled_drivers')
-    def test_on_finalize_exception_on_pod(self, ged, k8s):
+    def test_on_finalize_exception_on_pod(self, ged, k8s, gppip, gppid,
+                                          project_driver):
         ged.return_value = [self._driver]
         kp = kuryrport.KuryrPortHandler()
+        self._kp['metadata']['deletionTimestamp'] = 'foobar'
         self._kp['status']['vifs'] = self._vifs_primitive

         with mock.patch.object(kp, 'k8s') as k8s:
@@ -325,46 +337,10 @@ class TestKuryrPortHandler(test_base.TestCase):
             self.assertIsNone(kp.on_finalize(self._kp))

             k8s.get.assert_called_once_with(self._pod_uri)
-            k8s.remove_finalizer.assert_called_once_with(
-                self._kp, constants.KURYRPORT_FINALIZER)
-
-    @mock.patch('kuryr_kubernetes.controller.drivers.vif_pool.MultiVIFPool.'
-                'release_vif')
-    @mock.patch('kuryr_kubernetes.controller.drivers.default_security_groups.'
-                'DefaultPodSecurityGroupsDriver.get_security_groups')
-    @mock.patch('kuryr_kubernetes.controller.drivers.default_security_groups.'
-                'DefaultPodSecurityGroupsDriver.delete_sg_rules')
-    @mock.patch('kuryr_kubernetes.controller.drivers.default_project.'
-                'DefaultPodProjectDriver.get_project')
-    @mock.patch('kuryr_kubernetes.clients.get_kubernetes_client')
-    @mock.patch('kuryr_kubernetes.controller.drivers.base.MultiVIFDriver.'
-                'get_enabled_drivers')
-    def test_on_finalize_crd_sg_exceptions(self, ged, k8s, get_project,
-                                           delete_sg_rules, get_sg,
-                                           release_vif):
-        ged.return_value = [self._driver]
-        kp = kuryrport.KuryrPortHandler()
-        self._kp['status']['vifs'] = self._vifs_primitive
-        get_project.return_value = self._project_id
-        delete_sg_rules.side_effect = k_exc.ResourceNotReady(self._pod)
-        get_sg.side_effect = k_exc.ResourceNotReady(self._pod)
-
-        with mock.patch.object(kp, 'k8s') as k8s:
-            k8s.get.return_value = self._pod
-            kp.on_finalize(self._kp)
-
-            k8s.get.assert_called_once_with(self._pod_uri)
-            k8s.remove_finalizer.assert_has_calls(
-                [mock.call(self._pod, constants.POD_FINALIZER),
-                 mock.call(self._kp, constants.KURYRPORT_FINALIZER)])
-            delete_sg_rules.assert_called_once_with(self._pod)
-            get_sg.assert_called_once_with(self._pod, self._project_id)
-            release_vif.assert_has_calls([mock.call(self._pod, self._vif1,
-                                                    self._project_id, []),
-                                          mock.call(self._pod, self._vif2,
-                                                    self._project_id, [])])
+            k8s.remove_finalizer.assert_has_calls(
+                (mock.call(mock.ANY, constants.POD_FINALIZER),
+                 mock.call(self._kp, constants.KURYRPORT_FINALIZER)))

     @mock.patch('kuryr_kubernetes.controller.handlers.kuryrport.'
                 'KuryrPortHandler._update_services')
@@ -377,8 +353,6 @@ class TestKuryrPortHandler(test_base.TestCase):
                 'is_network_policy_enabled')
     @mock.patch('kuryr_kubernetes.controller.drivers.vif_pool.MultiVIFPool.'
                 'release_vif')
-    @mock.patch('kuryr_kubernetes.controller.drivers.default_security_groups.'
-                'DefaultPodSecurityGroupsDriver.get_security_groups')
     @mock.patch('kuryr_kubernetes.controller.drivers.default_security_groups.'
                 'DefaultPodSecurityGroupsDriver.delete_sg_rules')
     @mock.patch('kuryr_kubernetes.controller.drivers.default_project.'
@@ -387,9 +361,8 @@ class TestKuryrPortHandler(test_base.TestCase):
     @mock.patch('kuryr_kubernetes.controller.drivers.base.MultiVIFDriver.'
                 'get_enabled_drivers')
-    def test_on_finalize_np(self, ged, k8s, get_project, delete_sg_rules,
-                            get_sg, release_vif, is_np_enabled,
-                            get_lb_instance, get_sg_instance, get_services,
-                            update_services):
+    def test_on_finalize_np(self, ged, k8s, get_project, delete_sg_rules,
+                            release_vif, is_np_enabled, get_lb_instance,
+                            get_sg_instance, get_services, update_services):
         ged.return_value = [self._driver]
         CONF.set_override('enforce_sg_rules', True, group='octavia_defaults')
         self.addCleanup(CONF.clear_override, 'enforce_sg_rules',
@@ -399,7 +372,6 @@ class TestKuryrPortHandler(test_base.TestCase):
         get_project.return_value = self._project_id
         selector = mock.sentinel.selector
         delete_sg_rules.return_value = selector
-        get_sg.return_value = self._security_groups
         get_services.return_value = mock.sentinel.services

         with mock.patch.object(kp, 'k8s') as k8s:
@@ -413,13 +385,10 @@ class TestKuryrPortHandler(test_base.TestCase):
                 mock.call(self._kp, constants.KURYRPORT_FINALIZER)])
             delete_sg_rules.assert_called_once_with(self._pod)
-            get_sg.assert_called_once_with(self._pod, self._project_id)
             release_vif.assert_has_calls([mock.call(self._pod, self._vif1,
-                                                    self._project_id,
-                                                    self._security_groups),
+                                                    self._project_id),
                                           mock.call(self._pod, self._vif2,
-                                                    self._project_id,
-                                                    self._security_groups)])
+                                                    self._project_id)])

             get_services.assert_called_once()
             update_services.assert_called_once_with(mock.sentinel.services,
@@ -486,11 +455,10 @@ class TestKuryrPortHandler(test_base.TestCase):
         kp = kuryrport.KuryrPortHandler()
         kp.k8s.get.side_effect = k_exc.K8sResourceNotFound(self._pod)

-        self.assertRaises(k_exc.K8sResourceNotFound, kp.get_vifs, self._kp)
+        self.assertFalse(kp.get_vifs(self._kp))

         kp.k8s.get.assert_called_once_with(self._pod_uri)
-        kp.k8s.remove_finalizer.assert_called_once_with(
-            self._kp, constants.KURYRPORT_FINALIZER)
+        kp.k8s.delete.assert_called_once_with(self._kp_uri)

     @mock.patch('kuryr_kubernetes.controller.drivers.default_subnet.'
                 'DefaultPodSubnetDriver.get_subnets')

View File

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from unittest import mock
+import uuid

 import munch
 from openstack import exceptions as os_exc
@@ -23,6 +24,7 @@ from kuryr_kubernetes import constants as k_const
 from kuryr_kubernetes import exceptions as k_exc
 from kuryr_kubernetes.objects import vif
 from kuryr_kubernetes.tests import base as test_base
+from kuryr_kubernetes.tests import fake
 from kuryr_kubernetes.tests.unit import kuryr_fixtures as k_fix
 from kuryr_kubernetes import utils
@@ -554,3 +556,14 @@ class TestUtils(test_base.TestCase):
         m_get_net.assert_called_once()
         m_net.delete_port.assert_not_called()

+    def test__get_parent_port_ip(self):
+        os_net = self.useFixture(k_fix.MockNetworkClient()).client
+        port_id = str(uuid.uuid4())
+        ip_address = mock.sentinel.ip_address
+        port_obj = fake.get_port_obj(ip_address=ip_address)
+        os_net.get_port.return_value = port_obj
+
+        self.assertEqual(ip_address, utils.get_parent_port_ip(port_id))

View File

@@ -821,3 +821,26 @@ def cleanup_dead_networks():
         except os_exc.SDKException as ex:
             LOG.warning('There was an issue with network "%s" '
                         'removal: %s', net, ex)
+
+
+def get_parent_port_id(vif_obj):
+    os_net = clients.get_network_client()
+    tags = []
+    if CONF.neutron_defaults.resource_tags:
+        tags = CONF.neutron_defaults.resource_tags
+
+    trunks = os_net.trunks(tags=tags)
+    for trunk in trunks:
+        for sp in trunk.sub_ports:
+            if sp['port_id'] == vif_obj.id:
+                return trunk.port_id
+
+    return None
+
+
+def get_parent_port_ip(port_id):
+    os_net = clients.get_network_client()
+    parent_port = os_net.get_port(port_id)
+    return parent_port.fixed_ips[0]['ip_address']
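
For context, the two helpers above are meant to be chained: first map the
VIF's port to its trunk parent, then fetch that port's IP. A hedged usage
sketch follows; the host_ip_for_vif() wrapper is hypothetical, but the same
chaining appears in NestedVIFPool.release_vif() and
KuryrPortHandler._mock_cleanup_pod():

    from kuryr_kubernetes import utils

    def host_ip_for_vif(vif):
        """Return the IP of the trunk parent port carrying this VIF."""
        parent_id = utils.get_parent_port_id(vif)
        if not parent_id:
            # The VIF's port is not a subport of any matching trunk.
            return None
        return utils.get_parent_port_ip(parent_id)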