Gather debug info when NetlinkError EEXIST happens

In nested setups we sometimes get the following error when binding the pod interface:

    pyroute2.netlink.exceptions.NetlinkError: (17, 'File exists')

This is most likely related to a VLAN ID conflict, which should get fixed
by the code deleting interfaces that could have been created by a
previous run of the binding, but for some reason that does not always
help. This commit makes sure that when the error occurs we gather all the
information about interfaces in both the host and pod namespaces. That
should help to find the guilty interface and understand the reason
behind the issue.

Change-Id: Ia2d81c0e456a6e66ad0a95ae1f1a601236054e2f
Related-Bug: 1854928
This commit is contained in:
Michał Dulko 2020-01-15 15:26:52 +01:00
parent 8f65c32a64
commit 14dbf5b9ce
2 changed files with 36 additions and 13 deletions

View File

@ -13,9 +13,11 @@
# under the License.
import abc
import errno
import six
from oslo_log import log as logging
import pyroute2
from kuryr_kubernetes.cni.binding import base as b_base
from kuryr_kubernetes import config
@ -54,20 +56,41 @@ class NestedDriver(health.HealthHandler, b_base.BaseBindingDriver):
with b_base.get_ipdb(netns) as c_ipdb:
self._remove_ifaces(c_ipdb, (temp_name, ifname), netns)
# We might also have leftover interface in the host netns, let's try to
# remove it too. This is outside of the main host's IPDB context
# manager to make sure removal is committed before starting next
# transaction.
with b_base.get_ipdb() as h_ipdb:
# TODO(vikasc): evaluate whether we should have stevedore
# driver for getting the link device.
vm_iface_name = config.CONF.binding.link_iface
# We might also have leftover interface in the host netns, let's
# try to remove it too.
self._remove_ifaces(h_ipdb, (temp_name,))
args = self._get_iface_create_args(vif)
with h_ipdb.create(ifname=temp_name,
link=h_ipdb.interfaces[vm_iface_name],
**args) as iface:
iface.net_ns_fd = utils.convert_netns(netns)
try:
with b_base.get_ipdb() as h_ipdb:
# TODO(vikasc): evaluate whether we should have stevedore
# driver for getting the link device.
vm_iface_name = config.CONF.binding.link_iface
args = self._get_iface_create_args(vif)
with h_ipdb.create(ifname=temp_name,
link=h_ipdb.interfaces[vm_iface_name],
**args) as iface:
iface.net_ns_fd = utils.convert_netns(netns)
except pyroute2.NetlinkError as e:
if e.code == errno.EEXIST:
# NOTE(dulek): This is related to bug 1854928. It's super-rare,
# so the aim of this piece is to gather any info useful
# for determining when it happens.
LOG.exception('Creation of pod interface failed, most likely '
'due to duplicated VLAN id. This will probably '
'cause kuryr-daemon to crashloop. Trying to '
'gather debugging information.')
with b_base.get_ipdb() as h_ipdb:
LOG.error('List of host interfaces: %s', h_ipdb.interfaces)
with b_base.get_ipdb(netns) as c_ipdb:
LOG.error('List of pod namespace interfaces: %s',
c_ipdb.interfaces)
raise
with b_base.get_ipdb(netns) as c_ipdb:
with c_ipdb.interfaces[temp_name] as iface:

View File

@ -179,7 +179,7 @@ class TestNestedVlanDriver(TestDriverMixin, test_base.TestCase):
def test_connect(self):
self._test_connect()
self.assertEqual(1, self.h_ipdb_exit.call_count)
self.assertEqual(2, self.h_ipdb_exit.call_count)
self.assertEqual(3, self.c_ipdb_exit.call_count)
self.assertEqual(self.ifname, self.m_h_iface.ifname)
@ -201,7 +201,7 @@ class TestNestedMacvlanDriver(TestDriverMixin, test_base.TestCase):
def test_connect(self):
self._test_connect()
self.assertEqual(1, self.h_ipdb_exit.call_count)
self.assertEqual(2, self.h_ipdb_exit.call_count)
self.assertEqual(3, self.c_ipdb_exit.call_count)
self.assertEqual(self.ifname, self.m_h_iface.ifname)