From 3deb25a3cec7955c5e38d83af74add58478f884c Mon Sep 17 00:00:00 2001 From: Dmitry Tantsur Date: Fri, 1 Apr 2016 16:36:01 +0200 Subject: [PATCH] Wait for the interfaces to get IP addresses before inspection In the DIB build the DHCP code (provided by the dhcp-all-interfaces element) races with the service starting IPA. It does not matter for deployment itself, as we're waiting for the route to the Ironic API to appear. However, for inspection it may result in reporting back all NIC's without IP addresses. Inspection fails in this case. This change makes inspection wait for *all* NIC's to get their IP addresses up to a small timeout. The timeout is 60 seconds by default and can be changed via the new ipa-inspection-dhcp-wait-timeout kernel option (0 to not wait). After the wait inspection proceedes in any case, so the worst downside is making inspection 60 seconds longer. To avoid waiting for NIC's that are not even connected, this change extends the NetworkInterface class with 'has_carrier' field. Closes-Bug: #1564954 Change-Id: I5bf14de4c1c622f4bf6e3eadbe20c44759da5d66 --- ironic_python_agent/cmd/agent.py | 7 +++ ironic_python_agent/hardware.py | 20 +++++- ironic_python_agent/inspector.py | 37 +++++++++++ .../tests/unit/test_hardware.py | 28 ++++++++- .../tests/unit/test_inspector.py | 62 ++++++++++++++++++- .../tests/unit/test_ironic_api_client.py | 12 ++-- ...pection-wait-for-ips-223e39b65fef31bd.yaml | 7 +++ 7 files changed, 163 insertions(+), 10 deletions(-) create mode 100644 releasenotes/notes/inspection-wait-for-ips-223e39b65fef31bd.yaml diff --git a/ironic_python_agent/cmd/agent.py b/ironic_python_agent/cmd/agent.py index 405c969d2..fc8b92679 100644 --- a/ironic_python_agent/cmd/agent.py +++ b/ironic_python_agent/cmd/agent.py @@ -114,6 +114,13 @@ cli_opts = [ help='Comma-separated list of plugins providing additional ' 'hardware data for inspection, empty value gives ' 'a minimum required set of plugins.'), + + cfg.IntOpt('inspection_dhcp_wait_timeout', + default=APARAMS.get('ipa-inspection-dhcp-wait-timeout', + inspector.DEFAULT_DHCP_WAIT_TIMEOUT), + help='Maximum time (in seconds) to wait for all NIC\'s ' + 'to get their IP addresses via DHCP before inspection. ' + 'Set to 0 to disable waiting completely.'), ] CONF.register_cli_opts(cli_opts) diff --git a/ironic_python_agent/hardware.py b/ironic_python_agent/hardware.py index e3af49cf9..b45d11914 100644 --- a/ironic_python_agent/hardware.py +++ b/ironic_python_agent/hardware.py @@ -175,12 +175,14 @@ class BlockDevice(encoding.SerializableComparable): class NetworkInterface(encoding.SerializableComparable): serializable_fields = ('name', 'mac_address', 'switch_port_descr', - 'switch_chassis_descr', 'ipv4_address') + 'switch_chassis_descr', 'ipv4_address', + 'has_carrier') - def __init__(self, name, mac_addr, ipv4_address=None): + def __init__(self, name, mac_addr, ipv4_address=None, has_carrier=True): self.name = name self.mac_address = mac_addr self.ipv4_address = ipv4_address + self.has_carrier = has_carrier # TODO(russellhaering): Pull these from LLDP self.switch_port_descr = None self.switch_chassis_descr = None @@ -402,7 +404,8 @@ class GenericHardwareManager(HardwareManager): return NetworkInterface( interface_name, mac_addr, - ipv4_address=self.get_ipv4_addr(interface_name)) + ipv4_address=self.get_ipv4_addr(interface_name), + has_carrier=self._interface_has_carrier(interface_name)) def get_ipv4_addr(self, interface_id): try: @@ -412,6 +415,17 @@ class GenericHardwareManager(HardwareManager): # No default IPv4 address found return None + def _interface_has_carrier(self, interface_name): + path = '{0}/class/net/{1}/carrier'.format(self.sys_path, + interface_name) + try: + with open(path, 'rt') as fp: + return fp.read().strip() == '1' + except EnvironmentError: + LOG.debug('No carrier information for interface %s', + interface_name) + return False + def _is_device(self, interface_name): device_path = '{0}/class/net/{1}/device'.format(self.sys_path, interface_name) diff --git a/ironic_python_agent/inspector.py b/ironic_python_agent/inspector.py index 1ea2e3d1f..ba42e7c88 100644 --- a/ironic_python_agent/inspector.py +++ b/ironic_python_agent/inspector.py @@ -17,6 +17,7 @@ import base64 import io import json import tarfile +import time import netaddr from oslo_concurrency import processutils @@ -36,6 +37,9 @@ from ironic_python_agent import utils LOG = logging.getLogger(__name__) CONF = cfg.CONF DEFAULT_COLLECTOR = 'default' +DEFAULT_DHCP_WAIT_TIMEOUT = 60 + +_DHCP_RETRY_INTERVAL = 2 _COLLECTOR_NS = 'ironic_python_agent.inspector.collectors' _NO_LOGGING_FIELDS = ('logs',) @@ -215,6 +219,38 @@ def discover_scheduling_properties(inventory, data, root_disk=None): LOG.info('value for %s is %s', key, data[key]) +def wait_for_dhcp(): + """Wait until all NIC's get their IP addresses via DHCP or timeout happens. + + Ignores interfaces which do not even have a carrier. + + Note: only supports IPv4 addresses for now. + + :return: True if all NIC's got IP addresses, False if timeout happened. + Also returns True if waiting is disabled via configuration. + """ + if not CONF.inspection_dhcp_wait_timeout: + return True + + threshold = time.time() + CONF.inspection_dhcp_wait_timeout + while time.time() <= threshold: + interfaces = hardware.dispatch_to_managers('list_network_interfaces') + missing = [iface.name for iface in interfaces + if iface.has_carrier and not iface.ipv4_address] + if not missing: + return True + + LOG.debug('Still waiting for interfaces %s to get IP addresses', + missing) + time.sleep(_DHCP_RETRY_INTERVAL) + + LOG.warning('Not all network interfaces received IP addresses in ' + '%(timeout)d seconds: %(missing)s', + {'timeout': CONF.inspection_dhcp_wait_timeout, + 'missing': missing}) + return False + + def collect_default(data, failures): """The default inspection collector. @@ -229,6 +265,7 @@ def collect_default(data, failures): :param data: mutable data that we'll send to inspector :param failures: AccumulatedFailures object """ + wait_for_dhcp() inventory = hardware.dispatch_to_managers('list_hardware_info') # In the future we will only need the current version of inventory, diff --git a/ironic_python_agent/tests/unit/test_hardware.py b/ironic_python_agent/tests/unit/test_hardware.py index 9bf0b7bf4..5cb4a4182 100644 --- a/ironic_python_agent/tests/unit/test_hardware.py +++ b/ironic_python_agent/tests/unit/test_hardware.py @@ -268,7 +268,7 @@ class TestGenericHardwareManager(test_base.BaseTestCase): mocked_open.return_value.__enter__ = lambda s: s mocked_open.return_value.__exit__ = mock.Mock() read_mock = mocked_open.return_value.read - read_mock.return_value = '00:0c:29:8c:11:b1\n' + read_mock.side_effect = ['00:0c:29:8c:11:b1\n', '1'] mocked_ifaddresses.return_value = { netifaces.AF_INET: [{'addr': '192.168.1.2'}] } @@ -277,6 +277,32 @@ class TestGenericHardwareManager(test_base.BaseTestCase): self.assertEqual('eth0', interfaces[0].name) self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address) self.assertEqual('192.168.1.2', interfaces[0].ipv4_address) + self.assertTrue(interfaces[0].has_carrier) + + @mock.patch('netifaces.ifaddresses') + @mock.patch('os.listdir') + @mock.patch('os.path.exists') + @mock.patch('six.moves.builtins.open') + def test_list_network_interfaces_no_carrier(self, + mocked_open, + mocked_exists, + mocked_listdir, + mocked_ifaddresses): + mocked_listdir.return_value = ['lo', 'eth0'] + mocked_exists.side_effect = [False, True] + mocked_open.return_value.__enter__ = lambda s: s + mocked_open.return_value.__exit__ = mock.Mock() + read_mock = mocked_open.return_value.read + read_mock.side_effect = ['00:0c:29:8c:11:b1\n', OSError('boom')] + mocked_ifaddresses.return_value = { + netifaces.AF_INET: [{'addr': '192.168.1.2'}] + } + interfaces = self.hardware.list_network_interfaces() + self.assertEqual(1, len(interfaces)) + self.assertEqual('eth0', interfaces[0].name) + self.assertEqual('00:0c:29:8c:11:b1', interfaces[0].mac_address) + self.assertEqual('192.168.1.2', interfaces[0].ipv4_address) + self.assertFalse(interfaces[0].has_carrier) @mock.patch.object(utils, 'execute') def test_get_os_install_device(self, mocked_execute): diff --git a/ironic_python_agent/tests/unit/test_inspector.py b/ironic_python_agent/tests/unit/test_inspector.py index bd70763ca..9247a6658 100644 --- a/ironic_python_agent/tests/unit/test_inspector.py +++ b/ironic_python_agent/tests/unit/test_inspector.py @@ -18,6 +18,7 @@ import collections import copy import io import tarfile +import time import unittest import mock @@ -328,11 +329,13 @@ class TestDiscoverSchedulingProperties(BaseDiscoverTest): @mock.patch.object(utils, 'get_agent_params', lambda: {'BOOTIF': 'boot:if'}) +@mock.patch.object(inspector, 'wait_for_dhcp', autospec=True) @mock.patch.object(inspector, 'discover_scheduling_properties', autospec=True) @mock.patch.object(inspector, 'discover_network_properties', autospec=True) @mock.patch.object(hardware, 'dispatch_to_managers', autospec=True) class TestCollectDefault(BaseDiscoverTest): - def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched): + def test_ok(self, mock_dispatch, mock_discover_net, mock_discover_sched, + mock_wait_for_dhcp): mock_dispatch.return_value = self.inventory inspector.collect_default(self.data, self.failures) @@ -351,9 +354,10 @@ class TestCollectDefault(BaseDiscoverTest): mock_discover_sched.assert_called_once_with( self.inventory, self.data, root_disk=self.inventory['disks'][0]) + mock_wait_for_dhcp.assert_called_once_with() def test_no_root_disk(self, mock_dispatch, mock_discover_net, - mock_discover_sched): + mock_discover_sched, mock_wait_for_dhcp): mock_dispatch.return_value = self.inventory self.inventory['disks'] = [] @@ -371,6 +375,7 @@ class TestCollectDefault(BaseDiscoverTest): self.failures) mock_discover_sched.assert_called_once_with( self.inventory, self.data, root_disk=None) + mock_wait_for_dhcp.assert_called_once_with() @mock.patch.object(utils, 'execute', autospec=True) @@ -453,3 +458,56 @@ class TestCollectExtraHardware(unittest.TestCase): self.assertNotIn('data', self.data) self.assertTrue(self.failures) mock_execute.assert_called_once_with('hardware-detect') + + +@mock.patch.object(hardware, 'dispatch_to_managers', autospec=True) +class TestWaitForDhcp(unittest.TestCase): + def setUp(self): + super(TestWaitForDhcp, self).setUp() + CONF.set_override('inspection_dhcp_wait_timeout', + inspector.DEFAULT_DHCP_WAIT_TIMEOUT) + + @mock.patch.object(time, 'sleep', autospec=True) + def test_ok(self, mocked_sleep, mocked_dispatch): + mocked_dispatch.side_effect = [ + [hardware.NetworkInterface(name='em0', mac_addr='abcd', + ipv4_address=None), + hardware.NetworkInterface(name='em1', mac_addr='abcd', + ipv4_address='1.2.3.4')], + [hardware.NetworkInterface(name='em0', mac_addr='abcd', + ipv4_address=None), + hardware.NetworkInterface(name='em1', mac_addr='abcd', + ipv4_address='1.2.3.4')], + [hardware.NetworkInterface(name='em0', mac_addr='abcd', + ipv4_address='1.1.1.1'), + hardware.NetworkInterface(name='em1', mac_addr='abcd', + ipv4_address='1.2.3.4')], + ] + + self.assertTrue(inspector.wait_for_dhcp()) + + mocked_dispatch.assert_called_with('list_network_interfaces') + self.assertEqual(2, mocked_sleep.call_count) + self.assertEqual(3, mocked_dispatch.call_count) + + @mock.patch.object(inspector, '_DHCP_RETRY_INTERVAL', 0.01) + def test_timeout(self, mocked_dispatch): + CONF.set_override('inspection_dhcp_wait_timeout', 0.02) + + mocked_dispatch.return_value = [ + hardware.NetworkInterface(name='em0', mac_addr='abcd', + ipv4_address=None), + hardware.NetworkInterface(name='em1', mac_addr='abcd', + ipv4_address='1.2.3.4'), + ] + + self.assertFalse(inspector.wait_for_dhcp()) + + mocked_dispatch.assert_called_with('list_network_interfaces') + + def test_disabled(self, mocked_dispatch): + CONF.set_override('inspection_dhcp_wait_timeout', 0) + + self.assertTrue(inspector.wait_for_dhcp()) + + self.assertFalse(mocked_dispatch.called) diff --git a/ironic_python_agent/tests/unit/test_ironic_api_client.py b/ironic_python_agent/tests/unit/test_ironic_api_client.py index 30ce8158f..2e3defe38 100644 --- a/ironic_python_agent/tests/unit/test_ironic_api_client.py +++ b/ironic_python_agent/tests/unit/test_ironic_api_client.py @@ -160,14 +160,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase): u'name': u'eth0', u'ipv4_address': None, u'switch_chassis_descr': None, - u'switch_port_descr': None + u'switch_port_descr': None, + u'has_carrier': True, }, { u'mac_address': u'00:0c:29:8c:11:b2', u'name': u'eth1', u'ipv4_address': None, u'switch_chassis_descr': None, - 'switch_port_descr': None + u'switch_port_descr': None, + u'has_carrier': True, } ], u'cpu': { @@ -295,14 +297,16 @@ class TestBaseIronicPythonAgent(test_base.BaseTestCase): u'name': u'eth0', u'ipv4_address': None, u'switch_chassis_descr': None, - u'switch_port_descr': None + u'switch_port_descr': None, + u'has_carrier': True, }, { u'mac_address': u'00:0c:29:8c:11:b2', u'name': u'eth1', u'ipv4_address': None, u'switch_chassis_descr': None, - 'switch_port_descr': None + u'switch_port_descr': None, + u'has_carrier': True, } ], u'cpu': { diff --git a/releasenotes/notes/inspection-wait-for-ips-223e39b65fef31bd.yaml b/releasenotes/notes/inspection-wait-for-ips-223e39b65fef31bd.yaml new file mode 100644 index 000000000..0dbde182d --- /dev/null +++ b/releasenotes/notes/inspection-wait-for-ips-223e39b65fef31bd.yaml @@ -0,0 +1,7 @@ +--- +features: + - The "has_carrier" flag was added to the network interface information. +fixes: + - Inspection code now waits up to 1 minute for all NICs to get their IP addresses. + Otherwise inspection fails for some users due to race between DIB DHCP code + and IPA startup. See https://bugs.launchpad.net/bugs/1564954 for details.