From bee7718a48faf65a4b9462157130d3a6afbac5c6 Mon Sep 17 00:00:00 2001 From: Luis Tomas Bolivar Date: Tue, 22 Sep 2020 10:55:43 +0200 Subject: [PATCH] Cleanup ports belonging to deleted nodes Due to the use of pools, there may be ports that belong to pools associated with deleted nodes. Thus, those ports are wasted as they cannot be used. This patch regularly checks for ports associated with deleted nodes/trunks which would have lost their device_owner (for the nested case), and for ports associated with deleted nodes for the neutron vif case, where the device owner is set but binding information should not be there anymore. Change-Id: I26be958aa3c0b51eb1a296eb2b4ac7996bc3263c --- .../controller/drivers/vif_pool.py | 100 +++++++++++++++++- 1 file changed, 98 insertions(+), 2 deletions(-) diff --git a/kuryr_kubernetes/controller/drivers/vif_pool.py b/kuryr_kubernetes/controller/drivers/vif_pool.py index 5f7a081a1..64a9a0621 100644 --- a/kuryr_kubernetes/controller/drivers/vif_pool.py +++ b/kuryr_kubernetes/controller/drivers/vif_pool.py @@ -102,6 +102,8 @@ VIF_TYPE_TO_DRIVER_MAPPING = { 'VIFVHostUser': 'neutron-vif', } +NODE_PORTS_CLEAN_FREQUENCY = 600 # seconds + class NoopVIFPool(base.VIFPoolDriver): """No pool VIFs for Kubernetes Pods""" @@ -161,6 +163,7 @@ class BaseVIFPool(base.VIFPoolDriver, metaclass=abc.ABCMeta): # background thread self._recovered_pools = False eventlet.spawn(self._return_ports_to_pool) + eventlet.spawn(self._cleanup_removed_nodes) def set_vif_driver(self, driver): self._drv_vif = driver @@ -460,6 +463,87 @@ class BaseVIFPool(base.VIFPoolDriver, metaclass=abc.ABCMeta): if not port.binding_host_id: os_net.delete_port(port.id) + def _cleanup_removed_nodes(self): + """Remove ports associated to removed nodes.""" + previous_ports_to_remove = [] + while True: + # NOTE(ltomasbo): Nodes are not expected to be removed + # frequently, so there is no need to execute this frequently + # either + eventlet.sleep(NODE_PORTS_CLEAN_FREQUENCY) + try: + 
self._trigger_removed_nodes_ports_cleanup( + previous_ports_to_remove) + except Exception: + LOG.exception('Error while removing the ports associated to ' + 'deleted nodes. It will be retried in %s ' + 'seconds', NODE_PORTS_CLEAN_FREQUENCY) + + def _trigger_removed_nodes_ports_cleanup(self, previous_ports_to_remove): + """Remove ports associated to removed nodes. + + There are two types of ports pool, one for neutron and one for nested. + For the nested, the ports lost their device_owner after being detached, + i.e., after the node they belong to got removed. This means we cannot + find them unless they have been tagged. + + For the neutron ones, we rely on them having the kuryr device owner + and not having binding information, thus ensuring they are not + attached to any node. However, to avoid the case where those ports + are being created at the same time of the cleanup process, we don't + delete them unless we have seen them for 2 iterations. + """ + if not self._recovered_pools: + LOG.debug("Kuryr-controller not yet ready to perform nodes" + " cleanup.") + return + os_net = clients.get_network_client() + tags = config.CONF.neutron_defaults.resource_tags + if tags: + # NOTE(ltomasbo): Detached subports gets their device_owner unset + detached_subports = os_net.ports( + device_owner='', status='DOWN', tags=tags) + for subport in detached_subports: + try: + del self._existing_vifs[subport.id] + except KeyError: + LOG.debug('Port %s is not in the ports list.', subport.id) + try: + os_net.delete_port(subport.id) + except os_exc.SDKException: + LOG.debug("Problem deleting leftover port %s. 
" + "Skipping.", subport.id) + + # normal ports, or subports not yet attached + existing_ports = os_net.ports( + device_owner=kl_const.DEVICE_OWNER, + status='DOWN', + tags=tags) + else: + # normal ports, or subports not yet attached + existing_ports = os_net.ports( + device_owner=kl_const.DEVICE_OWNER, + status='DOWN') + + for port in existing_ports: + # NOTE(ltomasbo): It may be that the port got just created and it + # is still being attached and/or being tagged. + if port.id not in previous_ports_to_remove: + previous_ports_to_remove.append(port.id) + continue + + if not port.binding_host_id: + try: + del self._existing_vifs[port.id] + except KeyError: + LOG.debug('Port %s is not in the ports list.', port.id) + try: + os_net.delete_port(port.id) + except os_exc.SDKException: + LOG.debug("Problem deleting leftover port %s. " + "Skipping.", port.id) + previous_ports_to_remove.remove(port.id) + class NeutronVIFPool(BaseVIFPool): """Manages VIFs for Bare Metal Kubernetes Pods.""" @@ -524,7 +608,13 @@ class NeutronVIFPool(BaseVIFPool): """ while True: eventlet.sleep(oslo_cfg.CONF.vif_pool.ports_pool_update_frequency) - self._trigger_return_to_pool() + try: + self._trigger_return_to_pool() + except Exception: + LOG.exception( + 'Error while returning ports to pool. ' + 'It will be retried in %s seconds', + oslo_cfg.CONF.vif_pool.ports_pool_update_frequency) @lockutils.synchronized('return_to_pool_baremetal') def _trigger_return_to_pool(self): @@ -777,7 +867,13 @@ class NestedVIFPool(BaseVIFPool): """ while True: eventlet.sleep(oslo_cfg.CONF.vif_pool.ports_pool_update_frequency) - self._trigger_return_to_pool() + try: + self._trigger_return_to_pool() + except Exception: + LOG.exception( + 'Error while returning ports to pool. ' + 'It will be retried in %s seconds', + oslo_cfg.CONF.vif_pool.ports_pool_update_frequency) @lockutils.synchronized('return_to_pool_nested') def _trigger_return_to_pool(self):