Wait for the interfaces to get IP addresses before inspection

In the DIB build the DHCP code (provided by the dhcp-all-interfaces element)
races with the service starting IPA. It does not matter for deployment itself,
as we're waiting for the route to the Ironic API to appear. However, for
inspection it may result in reporting back all NIC's without IP addresses.
Inspection fails in this case.

This change makes inspection wait for *all* NIC's to get their IP addresses up
to a small timeout. The timeout is 60 seconds by default and can be changed
via the new ipa-inspection-dhcp-wait-timeout kernel option (0 to not wait).

After the wait inspection proceedes in any case, so the worst downside
is making inspection 60 seconds longer.

To avoid waiting for NIC's that are not even connected, this change extends the
NetworkInterface class with 'has_carrier' field.

Closes-Bug: #1564954
Change-Id: I5bf14de4c1c622f4bf6e3eadbe20c44759da5d66
This commit is contained in:
Dmitry Tantsur 2016-04-01 16:36:01 +02:00
parent 3cf5369cb6
commit 3deb25a3ce
7 changed files with 163 additions and 10 deletions

View File

@ -114,6 +114,13 @@ cli_opts = [
help='Comma-separated list of plugins providing additional '
'hardware data for inspection, empty value gives '
'a minimum required set of plugins.'),
cfg.IntOpt('inspection_dhcp_wait_timeout',
default=APARAMS.get('ipa-inspection-dhcp-wait-timeout',
inspector.DEFAULT_DHCP_WAIT_TIMEOUT),
help='Maximum time (in seconds) to wait for all NIC\'s '
'to get their IP addresses via DHCP before inspection. '
'Set to 0 to disable waiting completely.'),
]
CONF.register_cli_opts(cli_opts)

View File

@ -175,12 +175,14 @@ class BlockDevice(encoding.SerializableComparable):
class NetworkInterface(encoding.SerializableComparable):
serializable_fields = ('name', 'mac_address', 'switch_port_descr',
'switch_chassis_descr', 'ipv4_address')
'switch_chassis_descr', 'ipv4_address',
'has_carrier')
def __init__(self, name, mac_addr, ipv4_address=None):
def __init__(self, name, mac_addr, ipv4_address=None, has_carrier=True):
self.name = name
self.mac_address = mac_addr
self.ipv4_address = ipv4_address
self.has_carrier = has_carrier
# TODO(russellhaering): Pull these from LLDP
self.switch_port_descr = None
self.switch_chassis_descr = None
@ -402,7 +404,8 @@ class GenericHardwareManager(HardwareManager):
return NetworkInterface(
interface_name, mac_addr,
ipv4_address=self.get_ipv4_addr(interface_name))
ipv4_address=self.get_ipv4_addr(interface_name),
has_carrier=self._interface_has_carrier(interface_name))
def get_ipv4_addr(self, interface_id):
try:
@ -412,6 +415,17 @@ class GenericHardwareManager(HardwareManager):
# No default IPv4 address found
return None
def _interface_has_carrier(self, interface_name):
path = '{0}/class/net/{1}/carrier'.format(self.sys_path,
interface_name)
try:
with open(path, 'rt') as fp:
return fp.read().strip() == '1'
except EnvironmentError:
LOG.debug('No carrier information for interface %s',
interface_name)
return False
def _is_device(self, interface_name):
device_path = '{0}/class/net/{1}/device'.format(self.sys_path,
interface_name)

View File

@ -17,6 +17,7 @@ import base64
import io
import json
import tarfile
import time
import netaddr
from oslo_concurrency import processutils
@ -36,6 +37,9 @@ from ironic_python_agent import utils
LOG = logging.getLogger(__name__)
CONF = cfg.CONF
DEFAULT_COLLECTOR = 'default'
DEFAULT_DHCP_WAIT_TIMEOUT = 60
_DHCP_RETRY_INTERVAL = 2
_COLLECTOR_NS = 'ironic_python_agent.inspector.collectors'
_NO_LOGGING_FIELDS = ('logs',)
@ -215,6 +219,38 @@ def discover_scheduling_properties(inventory, data, root_disk=None):
LOG.info('value for %s is %s', key, data[key])
def wait_for_dhcp():
"""Wait until all NIC's get their IP addresses via DHCP or timeout happens.
Ignores interfaces which do not even have a carrier.
Note: only supports IPv4 addresses for now.
:return: True if all NIC's got IP addresses, False if timeout happened.
Also returns True if waiting is disabled via configuration.
"""
if not CONF.inspection_dhcp_wait_timeout:
return True
threshold = time.time() + CONF.inspection_dhcp_wait_timeout
while time.time() <= threshold:
interfaces = hardware.dispatch_to_managers('list_network_interfaces')
missing = [iface.name for iface in interfaces
if iface.has_carrier and not iface.ipv4_address]
if not missing:
return True
LOG.debug('Still waiting for interfaces %s to get IP addresses',
missing)
time.sleep(_DHCP_RETRY_INTERVAL)
LOG.warning('Not all network interfaces received IP addresses in '
'%(timeout)d seconds: %(missing)s',
{'timeout': CONF.inspection_dhcp_wait_timeout,
'missing': missing})
return False
def collect_default(data, failures):
"""The default inspection collector.
@ -229,6 +265,7 @@ def collect_default(data, failures):
:param data: mutable data that we'll send to inspector
:param failures: AccumulatedFailures object
"""
wait_for_dhcp()
inventory = hardware.dispatch_to_managers('list_hardware_info')
# In the future we will only need the current version of inventory,

View File

@ -268,7 +268,7 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
mocked_open.return_value.__enter__ = lambda s: s
mocked_open.return_value.__exit__ = mock.Mock()
read_mock = mocked_open.return_value.read
read_mock.return_value = '00:0c:29:8c:11:b1\n'
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', '1']
mocked_ifaddresses.return_value = {
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
}
@ -277,6 +277,32 @@ class TestGenericHardwareManager(test_base.BaseTestCase):
self.assertEqual('eth0', interfaces[0].name)
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
self.assertTrue(interfaces[0].has_carrier)
@mock.patch('netifaces.ifaddresses')
@mock.patch('os.listdir')
@mock.patch('os.path.exists')
@mock.patch('six.moves.builtins.open')
def test_list_network_interfaces_no_carrier(self,
mocked_open,
mocked_exists,
mocked_listdir,
mocked_ifaddresses):
mocked_listdir.return_value = ['lo', 'eth0']
mocked_exists.side_effect = [False, True]
mocked_open.return_value.__enter__ = lambda s: s
mocked_open.return_value.__exit__ = mock.Mock()
read_mock = mocked_open.return_value.read
read_mock.side_effect = ['00:0c:29:8c:11:b1\n', OSError('boom')]
mocked_ifaddresses.return_value = {
netifaces.AF_INET: [{'addr': '192.168.1.2'}]
}
interfaces = self.hardware.list_network_interfaces()
self.assertEqual(1, len(interfaces))
self.assertEqual('eth0', interfaces[0].name)
self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address)
self.assertEqual('192.168.1.2', interfaces[0].ipv4_address)
self.assertFalse(interfaces[0].has_carrier)
@mock.patch.object(utils, 'execute')
def test_get_os_install_device(self, mocked_execute):

View File

@ -18,6 +18,7 @@ import collections
import copy
import io
import tarfile
import time
import unittest
import mock
@ -328,11 +329,13 @@ class TestDiscoverSchedulingProperties(BaseDiscoverTest):
@mock.patch.object(utils, 'get_agent_params',
lambda: {'BOOTIF': 'boot:if'})
@mock.patch.object(inspector, 'wait_for_dhcp', autospec=True)
@mock.patch.object(inspector, 'discover_scheduling_properties', autospec=True)
@mock.patch.object(inspector, 'discover_network_properties', autospec=True)
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
class TestCollectDefault(BaseDiscoverTest):
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched):
def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched,
mock_wait_for_dhcp):
mock_dispatch.return_value = self.inventory
inspector.collect_default(self.data, self.failures)
@ -351,9 +354,10 @@ class TestCollectDefault(BaseDiscoverTest):
mock_discover_sched.assert_called_once_with(
self.inventory, self.data,
root_disk=self.inventory['disks'][0])
mock_wait_for_dhcp.assert_called_once_with()
def test_no_root_disk(self, mock_dispatch, mock_discover_net,
mock_discover_sched):
mock_discover_sched, mock_wait_for_dhcp):
mock_dispatch.return_value = self.inventory
self.inventory['disks'] = []
@ -371,6 +375,7 @@ class TestCollectDefault(BaseDiscoverTest):
self.failures)
mock_discover_sched.assert_called_once_with(
self.inventory, self.data, root_disk=None)
mock_wait_for_dhcp.assert_called_once_with()
@mock.patch.object(utils, 'execute', autospec=True)
@ -453,3 +458,56 @@ class TestCollectExtraHardware(unittest.TestCase):
self.assertNotIn('data', self.data)
self.assertTrue(self.failures)
mock_execute.assert_called_once_with('hardware-detect')
@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True)
class TestWaitForDhcp(unittest.TestCase):
def setUp(self):
super(TestWaitForDhcp, self).setUp()
CONF.set_override('inspection_dhcp_wait_timeout',
inspector.DEFAULT_DHCP_WAIT_TIMEOUT)
@mock.patch.object(time, 'sleep', autospec=True)
def test_ok(self, mocked_sleep, mocked_dispatch):
mocked_dispatch.side_effect = [
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
ipv4_address=None),
hardware.NetworkInterface(name='em1', mac_addr='abcd',
ipv4_address='1.2.3.4')],
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
ipv4_address=None),
hardware.NetworkInterface(name='em1', mac_addr='abcd',
ipv4_address='1.2.3.4')],
[hardware.NetworkInterface(name='em0', mac_addr='abcd',
ipv4_address='1.1.1.1'),
hardware.NetworkInterface(name='em1', mac_addr='abcd',
ipv4_address='1.2.3.4')],
]
self.assertTrue(inspector.wait_for_dhcp())
mocked_dispatch.assert_called_with('list_network_interfaces')
self.assertEqual(2, mocked_sleep.call_count)
self.assertEqual(3, mocked_dispatch.call_count)
@mock.patch.object(inspector, '_DHCP_RETRY_INTERVAL', 0.01)
def test_timeout(self, mocked_dispatch):
CONF.set_override('inspection_dhcp_wait_timeout', 0.02)
mocked_dispatch.return_value = [
hardware.NetworkInterface(name='em0', mac_addr='abcd',
ipv4_address=None),
hardware.NetworkInterface(name='em1', mac_addr='abcd',
ipv4_address='1.2.3.4'),
]
self.assertFalse(inspector.wait_for_dhcp())
mocked_dispatch.assert_called_with('list_network_interfaces')
def test_disabled(self, mocked_dispatch):
CONF.set_override('inspection_dhcp_wait_timeout', 0)
self.assertTrue(inspector.wait_for_dhcp())
self.assertFalse(mocked_dispatch.called)

View File

@ -160,14 +160,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase):
u'name': u'eth0',
u'ipv4_address': None,
u'switch_chassis_descr': None,
u'switch_port_descr': None
u'switch_port_descr': None,
u'has_carrier': True,
},
{
u'mac_address': u'00:0c:29:8c:11:b2',
u'name': u'eth1',
u'ipv4_address': None,
u'switch_chassis_descr': None,
'switch_port_descr': None
u'switch_port_descr': None,
u'has_carrier': True,
}
],
u'cpu': {
@ -295,14 +297,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase):
u'name': u'eth0',
u'ipv4_address': None,
u'switch_chassis_descr': None,
u'switch_port_descr': None
u'switch_port_descr': None,
u'has_carrier': True,
},
{
u'mac_address': u'00:0c:29:8c:11:b2',
u'name': u'eth1',
u'ipv4_address': None,
u'switch_chassis_descr': None,
'switch_port_descr': None
u'switch_port_descr': None,
u'has_carrier': True,
}
],
u'cpu': {

View File

@ -0,0 +1,7 @@
---
features:
- The "has_carrier" flag was added to the network interface information.
fixes:
- Inspection code now waits up to 1 minute for all NICs to get their IP addresses.
Otherwise inspection fails for some users due to race between DIB DHCP code
and IPA startup. See https://bugs.launchpad.net/bugs/1564954 for details.