Merge "CNI: Don't wait for missing pods on DEL"
commit 230a2300e5
@@ -89,17 +89,12 @@ class NestedDriver(health.HealthHandler, b_base.BaseBindingDriver,
-            # NOTE(dulek): This is related to bug 1854928. It's super-rare,
-            #              so aim of this piece is to gater any info useful
-            #              for determining when it happens.
-            LOG.exception('Creation of pod interface failed, most likely '
-                          'due to duplicated VLAN id. This will probably '
-                          'cause kuryr-daemon to crashloop. Trying to '
-                          'gather debugging information.')
-
-            with b_base.get_ipdb() as h_ipdb:
-                LOG.error('List of host interfaces: %s', h_ipdb.interfaces)
-
-            with b_base.get_ipdb(netns) as c_ipdb:
-                LOG.error('List of pod namespace interfaces: %s',
-                          c_ipdb.interfaces)
+            LOG.exception(f'Creation of pod interface failed due to VLAN '
+                          f'ID (vlan_info={args}) conflict. Probably the '
+                          f'CRI had not cleaned up the network namespace '
+                          f'of deleted pods. This should not be a '
+                          f'permanent issue but may cause restart of '
+                          f'kuryr-cni pod.')
             raise

         with b_base.get_ipdb(netns) as c_ipdb:
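Note: the new branch keeps the control flow simple: log the traceback with context, then re-raise so the failed ADD propagates back to the CRI for a retry. A minimal sketch of that pattern, where connect_vlan_interface() is a hypothetical stand-in for the driver's plug step:

    import logging

    LOG = logging.getLogger(__name__)

    def connect_vlan_interface(args, netns):
        # Hypothetical plug step; a leftover namespace still holding the
        # same VLAN id surfaces here as an exception.
        raise RuntimeError('VLAN id already in use')

    def plug(args, netns):
        try:
            connect_vlan_interface(args, netns)
        except Exception:
            # LOG.exception records the active traceback; re-raising turns
            # this into a failed CNI ADD that the CRI is expected to retry.
            LOG.exception('Creation of pod interface failed due to VLAN '
                          'ID conflict (vlan_info=%s).', args)
            raise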
@@ -104,10 +104,10 @@ class DaemonServer(object):
             self.plugin.delete(params)
         except exceptions.ResourceNotReady:
             # NOTE(dulek): It's better to ignore this error - most of the time
-            #              it will happen when pod is long gone and kubelet
+            #              it will happen when pod is long gone and CRI
             #              overzealously tries to delete it from the network.
             #              We cannot really do anything without VIF annotation,
-            #              so let's just tell kubelet to move along.
+            #              so let's just tell CRI to move along.
             LOG.warning('Error when processing delNetwork request. '
                         'Ignoring this error, pod is most likely gone')
             return '', httplib.NO_CONTENT, self.headers
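Note: on DEL, a missing pod means there is nothing left to unplug, so the daemon answers the CRI with 204 NO_CONTENT rather than failing the request. A minimal sketch of that handler shape, assuming a Flask-style (body, status, headers) return; ResourceNotReady here is a local stand-in for kuryr's exception class:

    import logging
    from http import HTTPStatus

    LOG = logging.getLogger(__name__)

    class ResourceNotReady(Exception):
        """Stand-in for kuryr's exceptions.ResourceNotReady."""

    def del_network(plugin, params, headers):
        # If the pod is already gone we cannot (and need not) do anything
        # without the VIF annotation, so report success either way.
        try:
            plugin.delete(params)
        except ResourceNotReady:
            LOG.warning('Error when processing delNetwork request. '
                        'Ignoring this error, pod is most likely gone')
        return '', HTTPStatus.NO_CONTENT, headers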
@@ -51,9 +51,28 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
                 'name': params.args.K8S_POD_NAME}

     def add(self, params):
-        vifs = self._do_work(params, b_base.connect, confirm=True)
+        pod_name = self._get_pod_name(params)
+        timeout = CONF.cni_daemon.vif_annotation_timeout
+
+        # Try to confirm if pod in the registry is not stale cache. If it is,
+        # remove it.
+        with lockutils.lock(pod_name, external=True):
+            if pod_name in self.registry:
+                cached_pod = self.registry[pod_name]['pod']
+                try:
+                    pod = self.k8s.get(cached_pod['metadata']['selfLink'])
+                except Exception:
+                    LOG.exception('Error when getting pod %s', pod_name)
+                    raise exceptions.ResourceNotReady(pod_name)
+
+                if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
+                    LOG.warning('Stale pod %s detected in cache. (API '
+                                'uid=%s, cached uid=%s). Removing it from '
+                                'cache.', pod_name, pod['metadata']['uid'],
+                                cached_pod['metadata']['uid'])
+                    del self.registry[pod_name]
+
+        vifs = self._do_work(params, b_base.connect, timeout)

         # NOTE(dulek): Saving containerid to be able to distinguish old DEL
         #              requests that we should ignore. We need a lock to
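Note on the lock: lockutils comes from oslo.concurrency, and external=True makes it a file-based interprocess lock rather than a plain thread lock, so the stale-cache check is also serialized against the watcher and other daemon processes. A standalone sketch of the eviction logic; evict_if_stale() and the module-level registry are hypothetical names:

    from oslo_concurrency import lockutils

    lockutils.set_defaults(lock_path='/tmp')  # external locks need a lock_path

    registry = {}  # shared pod registry, keyed by pod name

    def evict_if_stale(pod_name, api_pod):
        # external=True takes a file-based lock named after the pod, so
        # only one process at a time can inspect or drop this entry.
        with lockutils.lock(pod_name, external=True):
            cached = registry.get(pod_name)
            if (cached and
                    cached['pod']['metadata']['uid'] !=
                    api_pod['metadata']['uid']):
                # Same pod name but a different uid: the cached entry
                # belongs to an older incarnation of the pod, drop it.
                del registry[pod_name]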
@@ -66,9 +85,6 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
             LOG.debug('Saved containerid = %s for pod %s',
                       params.CNI_CONTAINERID, pod_name)

-        # Wait for VIFs to become active.
-        timeout = CONF.cni_daemon.vif_annotation_timeout
-
         # Wait for timeout sec, 1 sec between tries, retry when even one
         # vif is not active.
         @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
@@ -96,12 +112,20 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
                 # NOTE(dulek): This is a DEL request for some older (probably
                 #              failed) ADD call. We should ignore it or we'll
                 #              unplug a running pod.
-                LOG.warning('Received DEL request for unknown ADD call. '
-                            'Ignoring.')
+                LOG.warning('Received DEL request for unknown ADD call for '
+                            'pod %s (CNI_CONTAINERID=%s). Ignoring.', pod_name,
+                            params.CNI_CONTAINERID)
                 return
         except KeyError:
             pass
-        self._do_work(params, b_base.disconnect)
+
+        # Passing arbitrary 5 seconds as timeout, as it does not make any sense
+        # to wait on CNI DEL. If pod got deleted from API - VIF info is gone.
+        # If pod got the annotation removed - it is now gone too. The number's
+        # not 0, because we need to anticipate for restarts and delay before
+        # registry is populated by watcher.
+        self._do_work(params, b_base.disconnect, 5)

         # NOTE(ndesh): We need to lock here to avoid race condition
         #              with the deletion code in the watcher to ensure that
         #              we delete the registry entry exactly once
@@ -126,29 +150,9 @@ class K8sCNIRegistryPlugin(base_cni.CNIPlugin):
             LOG.debug("Reporting CNI driver not healthy.")
         self.healthy.value = driver_healthy

-    def _do_work(self, params, fn, confirm=False):
+    def _do_work(self, params, fn, timeout):
         pod_name = self._get_pod_name(params)

-        timeout = CONF.cni_daemon.vif_annotation_timeout
-
-        if confirm:
-            # Try to confirm if pod in the registry is not stale cache.
-            with lockutils.lock(pod_name, external=True):
-                if pod_name in self.registry:
-                    cached_pod = self.registry[pod_name]['pod']
-                    try:
-                        pod = self.k8s.get(cached_pod['metadata']['selfLink'])
-                    except Exception:
-                        LOG.exception('Error when getting pod %s', pod_name)
-                        raise exceptions.ResourceNotReady(pod_name)
-
-                    if pod['metadata']['uid'] != cached_pod['metadata']['uid']:
-                        LOG.warning('Stale pod %s detected in cache. (API '
-                                    'uid=%s, cached uid=%s). Removing it from '
-                                    'cache.', pod_name, pod['metadata']['uid'],
-                                    cached_pod['metadata']['uid'])
-                        del self.registry[pod_name]
-
+        # In case of KeyError retry for `timeout` s, wait 1 s between tries.
         @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                         retry_on_exception=lambda e: isinstance(e, KeyError))
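Note: the decorator is from the retrying library; stop_max_delay and wait_fixed are in milliseconds, and retry_on_exception decides whether a raised exception triggers another attempt. A self-contained sketch of how the DEL path's 5-second bound behaves; lookup() and the module-level registry are hypothetical:

    import retrying

    RETRY_DELAY = 1000  # ms between attempts, matching the module constant

    registry = {}  # populated asynchronously by the pod watcher

    def lookup(pod_name, timeout):
        # Same shape as _do_work: retry on KeyError until `timeout` seconds
        # have passed, sleeping RETRY_DELAY ms between attempts. After the
        # deadline the last KeyError propagates to the caller.
        @retrying.retry(stop_max_delay=timeout * 1000, wait_fixed=RETRY_DELAY,
                        retry_on_exception=lambda e: isinstance(e, KeyError))
        def find():
            return registry[pod_name]
        return find()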