Merge "CNI: Don't wait for missing pods on DEL"

This commit is contained in:
Zuul 2020-06-11 00:04:37 +00:00 committed by Gerrit Code Review
commit 230a2300e5
3 changed files with 41 additions and 42 deletions

View File

@@ -89,17 +89,12 @@ class NestedDriver(health.HealthHandler, b_base.BaseBindingDriver,
# NOTE(dulek): This is related to bug 1854928. It's super-rare,
# so aim of this piece is to gather any info useful
# for determining when it happens.
LOG.exception('Creation of pod interface failed, most likely '
'due to duplicated VLAN id. This will probably '
'cause kuryr-daemon to crashloop. Trying to '
'gather debugging information.')
with b_base.get_ipdb() as h_ipdb:
LOG.error('List of host interfaces: %s', h_ipdb.interfaces)
with b_base.get_ipdb(netns) as c_ipdb:
LOG.error('List of pod namespace interfaces: %s',
c_ipdb.interfaces)
LOG.exception(f'Creation of pod interface failed due to VLAN '
f'ID (vlan_info={args}) conflict. Probably the '
f'CRI had not cleaned up the network namespace '
f'of deleted pods. This should not be a '
f'permanent issue but may cause restart of '
f'kuryr-cni pod.')
raise
with b_base.get_ipdb(netns) as c_ipdb:

View File

@@ -104,10 +104,10 @@ class DaemonServer(object):
self.plugin.delete(params)
except exceptions.ResourceNotReady:
# NOTE(dulek): It's better to ignore this error - most of the time
# it will happen when pod is long gone and kubelet
# it will happen when pod is long gone and CRI
# overzealously tries to delete it from the network.
# We cannot really do anything without VIF annotation,
# so let's just tell kubelet to move along.
# so let's just tell CRI to move along.
LOG.warning('Error when processing delNetwork request. '
'Ignoring this error, pod is most likely gone')
return '', httplib.NO_CONTENT, self.headers

View File

@@ -51,9 +51,28 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
'name': params.args.K8S_POD_NAME}
def add(self, params):
vifs = self._do_work(params, b_base.connect, confirm=True)
pod_name = self._get_pod_name(params)
timeout = CONF.cni_daemon.vif_annotation_timeout
# Try to confirm if pod in the registry is not stale cache. If it is,
# remove it.
with lockutils.lock(pod_name, external=True):
if pod_name in self.registry:
cached_pod = self.registry[pod_name]['pod']
try:
pod = self.k8s.get(cached_pod['metadata']['selfLink'])
except Exception:
LOG.exception('Error when getting pod %s', pod_name)
raise exceptions.ResourceNotReady(pod_name)
if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
LOG.warning('Stale pod %s detected in cache. (API '
'uid=%s, cached uid=%s). Removing it from '
'cache.', pod_name, pod['metadata']['uid'],
cached_pod['metadata']['uid'])
del self.registry[pod_name]
vifs = self._do_work(params, b_base.connect, timeout)
# NOTE(dulek): Saving containerid to be able to distinguish old DEL
# requests that we should ignore. We need a lock to
@@ -66,9 +85,6 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
LOG.debug('Saved containerid = %s for pod %s',
params.CNI_CONTAINERID, pod_name)
# Wait for VIFs to become active.
timeout = CONF.cni_daemon.vif_annotation_timeout
# Wait for timeout sec, 1 sec between tries, retry when even one
# vif is not active.
@retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
@@ -96,12 +112,20 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
# NOTE(dulek): This is a DEL request for some older (probably
# failed) ADD call. We should ignore it or we'll
# unplug a running pod.
LOG.warning('Received DEL request for unknown ADD call. '
'Ignoring.')
LOG.warning('Received DEL request for unknown ADD call for '
'pod %s (CNI_CONTAINERID=%s). Ignoring.', pod_name,
params.CNI_CONTAINERID)
return
except KeyError:
pass
self._do_work(params, b_base.disconnect)
# Passing arbitrary 5 seconds as timeout, as it does not make any sense
# to wait on CNI DEL. If pod got deleted from API - VIF info is gone.
# If pod got the annotation removed - it is now gone too. The number's
# not 0, because we need to anticipate for restarts and delay before
# registry is populated by watcher.
self._do_work(params, b_base.disconnect, 5)
# NOTE(ndesh): We need to lock here to avoid race condition
# with the deletion code in the watcher to ensure that
# we delete the registry entry exactly once
@@ -126,29 +150,9 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
LOG.debug("Reporting CNI driver not healthy.")
self.healthy.value = driver_healthy
def _do_work(self, params, fn, confirm=False):
def _do_work(self, params, fn, timeout):
pod_name = self._get_pod_name(params)
timeout = CONF.cni_daemon.vif_annotation_timeout
if confirm:
# Try to confirm if pod in the registry is not stale cache.
with lockutils.lock(pod_name, external=True):
if pod_name in self.registry:
cached_pod = self.registry[pod_name]['pod']
try:
pod = self.k8s.get(cached_pod['metadata']['selfLink'])
except Exception:
LOG.exception('Error when getting pod %s', pod_name)
raise exceptions.ResourceNotReady(pod_name)
if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
LOG.warning('Stale pod %s detected in cache. (API '
'uid=%s, cached uid=%s). Removing it from '
'cache.', pod_name, pod['metadata']['uid'],
cached_pod['metadata']['uid'])
del self.registry[pod_name]
# In case of KeyError retry for `timeout` s, wait 1 s between tries.
@retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
retry_on_exception=lambda e: isinstance(e, KeyError))