CNI: Don't wait for missing pods on DEL

We've seen several issues caused by the fact that on CNI DEL we wait
for the pod annotation to appear in the CNI registry:

1. An overloaded kuryr-daemon HTTP server, because many stale DEL
   requests from the CRI exhaust the pool of free connections, causing
   the CNI to answer requests extremely slowly.
2. Some CRIs will not delete the network namespace before getting a
   successful CNI DEL. If we assign the same IP (or VLAN) to the next
   pod, we might end up with an IP or VLAN conflict and a NetlinkError.

This commit makes sure we only wait for VIFs for up to 5 seconds on DEL
requests, which is enough time for the watcher to populate the registry
after a restart. Also, some code got moved around to make implementing
the above simpler, and some log messages were clarified to make
debugging such issues easier later on.
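
To illustrate the resulting behaviour, here is a minimal, self-contained
sketch of such a bounded registry wait (illustrative only, not the actual
kuryr-kubernetes code; the plain-dict registry and the helper name are made
up for the example):

    import time

    def wait_for_registry_entry(registry, pod_name, timeout):
        """Poll the in-memory registry for at most `timeout` seconds."""
        deadline = time.time() + timeout
        while True:
            try:
                # The watcher fills this in once it sees the pod annotation.
                return registry[pod_name]['vifs']
            except KeyError:
                if time.time() >= deadline:
                    raise  # on DEL the pod is most likely gone for good
                time.sleep(1)

    # ADD keeps using the configurable vif_annotation_timeout, while DEL now
    # passes a hard 5 seconds, just enough for the watcher to repopulate the
    # registry after a kuryr-daemon restart.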

Change-Id: I9221b6bc9166597837a4b53382862aa6c6f3e94c
Closes-Bug: 1882083
Related-Bug: 1854928
Michał Dulko 2020-06-04 16:45:21 +02:00
parent 26b4a044d8
commit bd46075acd
3 changed files with 41 additions and 42 deletions


@@ -89,17 +89,12 @@ class NestedDriver(health.HealthHandler, b_base.BaseBindingDriver,
         # NOTE(dulek): This is related to bug 1854928. It's super-rare,
         #              so aim of this piece is to gather any info useful
         #              for determining when it happens.
-        LOG.exception('Creation of pod interface failed, most likely '
-                      'due to duplicated VLAN id. This will probably '
-                      'cause kuryr-daemon to crashloop. Trying to '
-                      'gather debugging information.')
-        with b_base.get_ipdb() as h_ipdb:
-            LOG.error('List of host interfaces: %s', h_ipdb.interfaces)
-        with b_base.get_ipdb(netns) as c_ipdb:
-            LOG.error('List of pod namespace interfaces: %s',
-                      c_ipdb.interfaces)
+        LOG.exception(f'Creation of pod interface failed due to VLAN '
+                      f'ID (vlan_info={args}) conflict. Probably the '
+                      f'CRI had not cleaned up the network namespace '
+                      f'of deleted pods. This should not be a '
+                      f'permanent issue but may cause restart of '
+                      f'kuryr-cni pod.')
     raise
 with b_base.get_ipdb(netns) as c_ipdb:


@@ -104,10 +104,10 @@ class DaemonServer(object):
     self.plugin.delete(params)
 except exceptions.ResourceNotReady:
     # NOTE(dulek): It's better to ignore this error - most of the time
-    #              it will happen when pod is long gone and kubelet
+    #              it will happen when pod is long gone and CRI
     #              overzealously tries to delete it from the network.
     #              We cannot really do anything without VIF annotation,
-    #              so let's just tell kubelet to move along.
+    #              so let's just tell CRI to move along.
     LOG.warning('Error when processing delNetwork request. '
                 'Ignoring this error, pod is most likely gone')
     return '', httplib.NO_CONTENT, self.headers


@@ -51,9 +51,28 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
         'name': params.args.K8S_POD_NAME}

 def add(self, params):
-    vifs = self._do_work(params, b_base.connect, confirm=True)
     pod_name = self._get_pod_name(params)
+    timeout = CONF.cni_daemon.vif_annotation_timeout
+
+    # Try to confirm if pod in the registry is not stale cache. If it is,
+    # remove it.
+    with lockutils.lock(pod_name, external=True):
+        if pod_name in self.registry:
+            cached_pod = self.registry[pod_name]['pod']
+            try:
+                pod = self.k8s.get(cached_pod['metadata']['selfLink'])
+            except Exception:
+                LOG.exception('Error when getting pod %s', pod_name)
+                raise exceptions.ResourceNotReady(pod_name)
+
+            if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
+                LOG.warning('Stale pod %s detected in cache. (API '
+                            'uid=%s, cached uid=%s). Removing it from '
+                            'cache.', pod_name, pod['metadata']['uid'],
+                            cached_pod['metadata']['uid'])
+                del self.registry[pod_name]
+
+    vifs = self._do_work(params, b_base.connect, timeout)

     # NOTE(dulek): Saving containerid to be able to distinguish old DEL
     #              requests that we should ignore. We need a lock to
@@ -66,9 +85,6 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
     LOG.debug('Saved containerid = %s for pod %s',
               params.CNI_CONTAINERID, pod_name)

     # Wait for VIFs to become active.
-    timeout = CONF.cni_daemon.vif_annotation_timeout

     # Wait for timeout sec, 1 sec between tries, retry when even one
     # vif is not active.
     @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
@@ -96,12 +112,20 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
         # NOTE(dulek): This is a DEL request for some older (probably
         #              failed) ADD call. We should ignore it or we'll
         #              unplug a running pod.
-        LOG.warning('Received DEL request for unknown ADD call. '
-                    'Ignoring.')
+        LOG.warning('Received DEL request for unknown ADD call for '
+                    'pod %s (CNI_CONTAINERID=%s). Ignoring.', pod_name,
+                    params.CNI_CONTAINERID)
         return
 except KeyError:
     pass

+# Passing arbitrary 5 seconds as timeout, as it does not make any sense
+# to wait on CNI DEL. If pod got deleted from API - VIF info is gone.
+# If pod got the annotation removed - it is now gone too. The number's
+# not 0, because we need to anticipate for restarts and delay before
+# registry is populated by watcher.
-self._do_work(params, b_base.disconnect)
+self._do_work(params, b_base.disconnect, 5)

 # NOTE(ndesh): We need to lock here to avoid race condition
 #              with the deletion code in the watcher to ensure that
 #              we delete the registry entry exactly once
@@ -126,29 +150,9 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
     LOG.debug("Reporting CNI driver not healthy.")
     self.healthy.value = driver_healthy

-def _do_work(self, params, fn, confirm=False):
+def _do_work(self, params, fn, timeout):
     pod_name = self._get_pod_name(params)
-    timeout = CONF.cni_daemon.vif_annotation_timeout
-    if confirm:
-        # Try to confirm if pod in the registry is not stale cache.
-        with lockutils.lock(pod_name, external=True):
-            if pod_name in self.registry:
-                cached_pod = self.registry[pod_name]['pod']
-                try:
-                    pod = self.k8s.get(cached_pod['metadata']['selfLink'])
-                except Exception:
-                    LOG.exception('Error when getting pod %s', pod_name)
-                    raise exceptions.ResourceNotReady(pod_name)
-                if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
-                    LOG.warning('Stale pod %s detected in cache. (API '
-                                'uid=%s, cached uid=%s). Removing it from '
-                                'cache.', pod_name, pod['metadata']['uid'],
-                                cached_pod['metadata']['uid'])
-                    del self.registry[pod_name]

     # In case of KeyError retry for `timeout` s, wait 1 s between tries.
     @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                     retry_on_exception=lambda e: isinstance(e, KeyError))
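
The @retrying.retry decorator above is cut off in this view. For reference, a
self-contained version of the same bounded lookup, under the assumption that
the registry is a plain dict keyed by pod name and that RETRY_DELAY is
expressed in milliseconds, could look roughly like this (a sketch, not the
exact _do_work body):

    import retrying

    RETRY_DELAY = 1000  # ms between registry lookups (assumed value)

    def wait_for_vifs(registry, pod_name, timeout):
        """Look up the pod's VIFs, retrying on KeyError for up to `timeout` s."""

        @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                        retry_on_exception=lambda e: isinstance(e, KeyError))
        def find():
            # Raises KeyError until the watcher has populated the registry.
            return registry[pod_name]['vifs']

        return find()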