Fix waiting for target disk to appear

This patch is changing the _wait_for_disks() method behavior to wait to
a specific disk if any device hints is specified. There are cases where
the deployment might fail or succeed randomly depending on the order and
time that the disks shows up.

If no root device hints is specified, the method will just wait for any
suitable disk to show up, like before.

The _wait_for_disks call was made into a proper hardware manager method.
It is now also called each time the cached node is updated, not only
on start up. This is to ensure that we wait for the device, matching
root device hints (which are part of the node).

The loop was corrected to avoid redundant sleeps and warnings.

Finally, this patch adds more logging around detecting the root device.

Co-Authored-By: Dmitry Tantsur <dtantsur@redhat.com>
Change-Id: I10ca70d6a390ed802505c0d10d440dfb52beb56c
Closes-Bug: #1670916
This commit is contained in:
Lucas Alvares Gomes 2017-03-09 14:02:06 +00:00 committed by Dmitry Tantsur
parent e87d039111
commit 3189c16a5e
8 changed files with 158 additions and 102 deletions

View File

@ -180,7 +180,7 @@ cli_opts = [
default=APARAMS.get('ipa-disk-wait-delay', 3),
help='How much time (in seconds) to wait between attempts '
'to check if at least one suitable disk has appeared '
'in inventory. '
'in inventory. Set to zero to disable. '
'Can be supplied as "ipa-disk-wait-delay" '
'kernel parameter.'),
cfg.BoolOpt('insecure',

View File

@ -381,6 +381,33 @@ class HardwareManager(object):
erase_results[block_device.name] = result
return erase_results
def wait_for_disks(self):
"""Wait for the root disk to appear.
Wait for at least one suitable disk to show up or a specific disk
if any device hint is specified. Otherwise neither inspection
not deployment have any chances to succeed.
"""
if not CONF.disk_wait_attempts:
return
for attempt in range(CONF.disk_wait_attempts):
try:
self.get_os_install_device()
except errors.DeviceNotFound:
LOG.debug('Still waiting for the root device to appear, '
'attempt %d of %d', attempt + 1,
CONF.disk_wait_attempts)
if attempt < CONF.disk_wait_attempts - 1:
time.sleep(CONF.disk_wait_delay)
else:
break
else:
LOG.warning('The root device was not detected in %d seconds',
CONF.disk_wait_delay * CONF.disk_wait_attempts)
def list_hardware_info(self):
"""Return full hardware inventory as a serializable dict.
@ -489,32 +516,9 @@ class GenericHardwareManager(HardwareManager):
def evaluate_hardware_support(self):
# Do some initialization before we declare ourself ready
_check_for_iscsi()
self._wait_for_disks()
self.wait_for_disks()
return HardwareSupport.GENERIC
def _wait_for_disks(self):
"""Wait for disk to appear
Wait for at least one suitable disk to show up, otherwise neither
inspection not deployment have any chances to succeed.
"""
for attempt in range(CONF.disk_wait_attempts):
try:
block_devices = self.list_block_devices()
utils.guess_root_disk(block_devices)
except errors.DeviceNotFound:
LOG.debug('Still waiting for at least one disk to appear, '
'attempt %d of %d', attempt + 1,
CONF.disk_wait_attempts)
time.sleep(CONF.disk_wait_delay)
else:
break
else:
LOG.warning('No disks detected in %d seconds',
CONF.disk_wait_delay * CONF.disk_wait_attempts)
def collect_lldp_data(self, interface_names):
"""Collect and convert LLDP info from the node.
@ -705,10 +709,12 @@ class GenericHardwareManager(HardwareManager):
root_device_hints = None
if cached_node is not None:
root_device_hints = cached_node['properties'].get('root_device')
LOG.debug('Looking for a device matching root hints %s',
root_device_hints)
block_devices = self.list_block_devices()
if not root_device_hints:
return utils.guess_root_disk(block_devices).name
dev_name = utils.guess_root_disk(block_devices).name
else:
serialized_devs = [dev.serialize() for dev in block_devices]
try:
@ -729,7 +735,13 @@ class GenericHardwareManager(HardwareManager):
"No suitable device was found for "
"deployment using these hints %s" % root_device_hints)
return device['name']
dev_name = device['name']
LOG.info('Picked root device %(dev)s for node %(node)s based on '
'root device hints %(hints)s',
{'dev': dev_name, 'hints': root_device_hints,
'node': cached_node['uuid'] if cached_node else None})
return dev_name
def get_system_vendor_info(self):
product_name = None
@ -1160,11 +1172,22 @@ def cache_node(node):
Stores the node object in the hardware module to facilitate the
access of a node information in the hardware extensions.
If the new node does not match the previously cached one, wait for the
expected root device to appear.
:param node: Ironic node object
"""
global NODE
new_node = NODE is None or NODE['uuid'] != node['uuid']
NODE = node
if new_node:
LOG.info('Cached node %s, waiting for its root device to appear',
node['uuid'])
# Root device hints, stored in the new node, can change the expected
# root device. So let us wait for it to appear again.
dispatch_to_managers('wait_for_disks')
def get_cached_node():
"""Guard function around the module variable NODE."""

View File

@ -24,6 +24,8 @@ class TestCommands(base.FunctionalBase):
different test runs.
"""
node = {'uuid': '1', 'properties': {}}
def step_1_get_empty_commands(self):
response = self.request('get', 'commands')
self.assertEqual({'commands': []}, response)
@ -34,7 +36,7 @@ class TestCommands(base.FunctionalBase):
# this command succeeds even with an empty node and port. This test's
# success is required for steps 3 and 4 to succeed.
command = {'name': 'clean.get_clean_steps',
'params': {'node': {}, 'ports': {}}}
'params': {'node': self.node, 'ports': {}}}
response = self.request('post', 'commands', json=command,
headers={'Content-Type': 'application/json'})
self.assertIsNone(response['command_error'])

View File

@ -19,6 +19,7 @@ from ironic_python_agent import errors
from ironic_python_agent.extensions import clean
@mock.patch('ironic_python_agent.hardware.cache_node', autospec=True)
class TestCleanExtension(test_base.BaseTestCase):
def setUp(self):
super(TestCleanExtension, self).setUp()
@ -37,7 +38,8 @@ class TestCleanExtension(test_base.BaseTestCase):
'_get_current_clean_version', autospec=True)
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
autospec=True)
def test_get_clean_steps(self, mock_dispatch, mock_version):
def test_get_clean_steps(self, mock_dispatch, mock_version,
mock_cache_node):
mock_version.return_value = self.version
manager_steps = {
@ -135,12 +137,14 @@ class TestCleanExtension(test_base.BaseTestCase):
# 'priority' in Ironic
self.assertEqual(expected_return,
async_results.join().command_result)
mock_cache_node.assert_called_once_with(self.node)
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
autospec=True)
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
autospec=True)
def test_execute_clean_step(self, mock_version, mock_dispatch):
def test_execute_clean_step(self, mock_version, mock_dispatch,
mock_cache_node):
result = 'cleaned'
mock_dispatch.return_value = result
@ -159,13 +163,14 @@ class TestCleanExtension(test_base.BaseTestCase):
self.step['GenericHardwareManager'][0]['step'],
self.node, self.ports)
self.assertEqual(expected_result, async_result.command_result)
mock_cache_node.assert_called_once_with(self.node)
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
autospec=True)
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
autospec=True)
def test_execute_clean_step_tuple_result(self, mock_version,
mock_dispatch):
mock_dispatch, mock_cache_node):
result = ('stdout', 'stderr')
mock_dispatch.return_value = result
@ -184,10 +189,11 @@ class TestCleanExtension(test_base.BaseTestCase):
self.step['GenericHardwareManager'][0]['step'],
self.node, self.ports)
self.assertEqual(expected_result, async_result.command_result)
mock_cache_node.assert_called_once_with(self.node)
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
autospec=True)
def test_execute_clean_step_no_step(self, mock_version):
def test_execute_clean_step_no_step(self, mock_version, mock_cache_node):
async_result = self.agent_extension.execute_clean_step(
step={}, node=self.node, ports=self.ports,
clean_version=self.version)
@ -195,12 +201,14 @@ class TestCleanExtension(test_base.BaseTestCase):
self.assertEqual('FAILED', async_result.command_status)
mock_version.assert_called_once_with(self.version)
mock_cache_node.assert_called_once_with(self.node)
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
autospec=True)
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
autospec=True)
def test_execute_clean_step_fail(self, mock_version, mock_dispatch):
def test_execute_clean_step_fail(self, mock_version, mock_dispatch,
mock_cache_node):
mock_dispatch.side_effect = RuntimeError
async_result = self.agent_extension.execute_clean_step(
@ -214,13 +222,15 @@ class TestCleanExtension(test_base.BaseTestCase):
mock_dispatch.assert_called_once_with(
self.step['GenericHardwareManager'][0]['step'],
self.node, self.ports)
mock_cache_node.assert_called_once_with(self.node)
@mock.patch('ironic_python_agent.hardware.dispatch_to_managers',
autospec=True)
@mock.patch('ironic_python_agent.extensions.clean._check_clean_version',
autospec=True)
def test_execute_clean_step_version_mismatch(self, mock_version,
mock_dispatch):
mock_dispatch,
mock_cache_node):
mock_version.side_effect = errors.CleanVersionMismatch(
{'GenericHardwareManager': 1}, {'GenericHardwareManager': 2})
@ -232,17 +242,19 @@ class TestCleanExtension(test_base.BaseTestCase):
mock_version.assert_called_once_with(self.version)
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
autospec=True)
def _get_current_clean_version(self, mock_dispatch):
class TestCleanVersion(test_base.BaseTestCase):
version = {'generic': '1', 'specific': '1'}
def test__get_current_clean_version(self, mock_dispatch):
mock_dispatch.return_value = {'SpecificHardwareManager':
{'name': 'specific', 'version': '1'},
'GenericHardwareManager':
{'name': 'generic', 'version': '1'}}
self.assertEqual(self.version, clean._get_current_clean_version())
@mock.patch('ironic_python_agent.hardware.dispatch_to_all_managers',
autospec=True)
def test__check_clean_version_fail(self, mock_dispatch):
mock_dispatch.return_value = {'SpecificHardwareManager':
{'name': 'specific', 'version': '1'}}

View File

@ -127,7 +127,7 @@ class TestHeartbeater(ironic_agent_base.IronicAgentTest):
self.assertEqual(2.7, self.heartbeater.error_delay)
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
lambda self: None)
class TestBaseAgent(ironic_agent_base.IronicAgentTest):
@ -151,6 +151,7 @@ class TestBaseAgent(ironic_agent_base.IronicAgentTest):
FakeExtension())])
self.sample_nw_iface = hardware.NetworkInterface(
"eth9", "AA:BB:CC:DD:EE:FF", "1.2.3.4", True)
hardware.NODE = None
def assertEqualEncoded(self, a, b):
# Evidently JSONEncoder.default() can't handle None (??) so we have to
@ -204,7 +205,9 @@ class TestBaseAgent(ironic_agent_base.IronicAgentTest):
server_class=simple_server.WSGIServer)
wsgi_server.serve_forever.assert_called_once_with()
mock_wait.assert_called_once_with(mock.ANY)
mock_dispatch.assert_called_once_with("list_hardware_info")
self.assertEqual([mock.call('list_hardware_info'),
mock.call('wait_for_disks')],
mock_dispatch.call_args_list)
self.agent.heartbeater.start.assert_called_once_with()
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@ -252,7 +255,9 @@ class TestBaseAgent(ironic_agent_base.IronicAgentTest):
self.agent.api_client.lookup_node.call_args[1]['node_uuid'])
mock_wait.assert_called_once_with(mock.ANY)
mock_dispatch.assert_called_once_with("list_hardware_info")
self.assertEqual([mock.call('list_hardware_info'),
mock.call('wait_for_disks')],
mock_dispatch.call_args_list)
self.agent.heartbeater.start.assert_called_once_with()
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@ -419,7 +424,9 @@ class TestBaseAgent(ironic_agent_base.IronicAgentTest):
mock_sleep.assert_called_once_with(10)
self.assertTrue(mock_load_managers.called)
self.assertTrue(mock_wait.called)
mock_dispatch.assert_called_once_with('list_hardware_info')
self.assertEqual([mock.call('list_hardware_info'),
mock.call('wait_for_disks')],
mock_dispatch.call_args_list)
def test_async_command_success(self):
result = base.AsyncCommandResult('foo_command', {'fail': False},
@ -507,7 +514,7 @@ class TestBaseAgent(ironic_agent_base.IronicAgentTest):
mock_log.warning.assert_called_once()
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
lambda self: None)
class TestAgentStandalone(ironic_agent_base.IronicAgentTest):
@ -563,7 +570,7 @@ class TestAgentStandalone(ironic_agent_base.IronicAgentTest):
@mock.patch.object(hardware, '_check_for_iscsi', lambda: None)
@mock.patch.object(hardware.GenericHardwareManager, '_wait_for_disks',
@mock.patch.object(hardware.GenericHardwareManager, 'wait_for_disks',
lambda self: None)
@mock.patch.object(socket, 'gethostbyname', autospec=True)
@mock.patch.object(utils, 'execute', autospec=True)

View File

@ -680,7 +680,8 @@ class TestGenericHardwareManager(base.IronicAgentTest):
@mock.patch.object(hardware, 'get_cached_node', autospec=True)
def _get_os_install_device_root_device_hints(self, hints, expected_device,
mock_cached_node, mock_dev):
mock_cached_node.return_value = {'properties': {'root_device': hints}}
mock_cached_node.return_value = {'properties': {'root_device': hints},
'uuid': 'node1'}
model = 'fastable sd131 7'
mock_dev.return_value = [
hardware.BlockDevice(name='/dev/sda',
@ -1660,15 +1661,13 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.assertEqual('NEC',
self.hardware.get_system_vendor_info().manufacturer)
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(hardware, '_check_for_iscsi', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_waits_for_disks(self, mocked_root_dev, mocked_sleep,
mocked_check_for_iscsi,
mocked_block_dev):
mocked_root_dev.side_effect = [
def test_evaluate_hw_waits_for_disks(
self, mocked_sleep, mocked_check_for_iscsi, mocked_get_inst_dev):
mocked_get_inst_dev.side_effect = [
errors.DeviceNotFound('boom'),
None
]
@ -1677,19 +1676,32 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.assertTrue(mocked_check_for_iscsi.called)
self.assertEqual(hardware.HardwareSupport.GENERIC, result)
mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
self.assertEqual(2, mocked_root_dev.call_count)
mocked_get_inst_dev.assert_called_with(mock.ANY)
self.assertEqual(2, mocked_get_inst_dev.call_count)
mocked_sleep.assert_called_once_with(CONF.disk_wait_delay)
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(hardware, '_check_for_iscsi', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_waits_for_disks_nonconfigured(self, mocked_root_dev,
mocked_sleep,
mocked_block_dev):
mocked_root_dev.side_effect = [
def test_evaluate_hw_no_wait_for_disks(
self, mocked_sleep, mocked_check_for_iscsi, mocked_get_inst_dev):
CONF.set_override('disk_wait_attempts', '0')
result = self.hardware.evaluate_hardware_support()
self.assertTrue(mocked_check_for_iscsi.called)
self.assertEqual(hardware.HardwareSupport.GENERIC, result)
self.assertFalse(mocked_get_inst_dev.called)
self.assertFalse(mocked_sleep.called)
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
def test_evaluate_hw_waits_for_disks_nonconfigured(
self, mocked_sleep, mocked_get_inst_dev):
mocked_get_inst_dev.side_effect = [
errors.DeviceNotFound('boom'),
errors.DeviceNotFound('boom'),
errors.DeviceNotFound('boom'),
@ -1706,20 +1718,20 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.hardware.evaluate_hardware_support()
mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
self.assertEqual(10, mocked_root_dev.call_count)
mocked_get_inst_dev.assert_called_with(mock.ANY)
self.assertEqual(10, mocked_get_inst_dev.call_count)
expected_calls = [mock.call(CONF.disk_wait_delay)] * 9
mocked_sleep.assert_has_calls(expected_calls)
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_waits_for_disks_configured(self, mocked_root_dev,
mocked_sleep,
mocked_block_dev):
def test_evaluate_hw_waits_for_disks_configured(self, mocked_sleep,
mocked_get_inst_dev):
CONF.set_override('disk_wait_attempts', '2')
mocked_root_dev.side_effect = [
mocked_get_inst_dev.side_effect = [
errors.DeviceNotFound('boom'),
errors.DeviceNotFound('boom'),
errors.DeviceNotFound('boom'),
@ -1729,54 +1741,45 @@ class TestGenericHardwareManager(base.IronicAgentTest):
self.hardware.evaluate_hardware_support()
mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
self.assertEqual(2, mocked_root_dev.call_count)
mocked_get_inst_dev.assert_called_with(mock.ANY)
self.assertEqual(2, mocked_get_inst_dev.call_count)
expected_calls = [mock.call(CONF.disk_wait_delay)] * 1
mocked_sleep.assert_has_calls(expected_calls)
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_disks_timeout_unconfigured(self, mocked_root_dev,
mocked_sleep,
mocked_block_dev):
mocked_root_dev.side_effect = errors.DeviceNotFound('boom')
def test_evaluate_hw_disks_timeout_unconfigured(self, mocked_sleep,
mocked_get_inst_dev):
mocked_get_inst_dev.side_effect = errors.DeviceNotFound('boom')
self.hardware.evaluate_hardware_support()
mocked_sleep.assert_called_with(3)
@mock.patch.object(hardware, '_check_for_iscsi', mock.Mock())
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_disks_timeout_configured(self, mocked_root_dev,
mocked_sleep,
mocked_block_dev):
def test_evaluate_hw_disks_timeout_configured(self, mocked_sleep,
mocked_root_dev):
CONF.set_override('disk_wait_delay', '5')
mocked_root_dev.side_effect = errors.DeviceNotFound('boom')
self.hardware.evaluate_hardware_support()
mocked_sleep.assert_called_with(5)
@mock.patch.object(hardware.GenericHardwareManager, 'list_block_devices',
autospec=True)
@mock.patch.object(hardware.GenericHardwareManager,
'get_os_install_device', autospec=True)
@mock.patch.object(hardware, '_check_for_iscsi', autospec=True)
@mock.patch.object(time, 'sleep', autospec=True)
@mock.patch.object(utils, 'guess_root_disk', autospec=True)
def test_evaluate_hw_disks_timeout(self, mocked_root_dev, mocked_sleep,
mocked_check_for_iscsi,
mocked_block_dev):
mocked_root_dev.side_effect = errors.DeviceNotFound('boom')
def test_evaluate_hw_disks_timeout(
self, mocked_sleep, mocked_check_for_iscsi, mocked_get_inst_dev):
mocked_get_inst_dev.side_effect = errors.DeviceNotFound('boom')
result = self.hardware.evaluate_hardware_support()
self.assertEqual(hardware.HardwareSupport.GENERIC, result)
mocked_root_dev.assert_called_with(mocked_block_dev.return_value)
mocked_get_inst_dev.assert_called_with(mock.ANY)
self.assertEqual(CONF.disk_wait_attempts,
mocked_root_dev.call_count)
mocked_get_inst_dev.call_count)
mocked_sleep.assert_called_with(CONF.disk_wait_delay)
@mock.patch.object(utils, 'get_agent_params',

View File

@ -53,6 +53,8 @@ class ZFakeGenericHardwareManager(hardware.HardwareManager):
_build_clean_step('ZHigherPrio', 100)]
@mock.patch.object(hardware.HardwareManager, 'wait_for_disks',
lambda _self: None)
class TestMultipleHardwareManagerCleanSteps(base.IronicAgentTest):
def setUp(self):
super(TestMultipleHardwareManagerCleanSteps, self).setUp()
@ -79,7 +81,8 @@ class TestMultipleHardwareManagerCleanSteps(base.IronicAgentTest):
hardware._global_managers = None
def test_clean_step_ordering(self):
as_results = self.agent_extension.get_clean_steps(node={}, ports=[])
as_results = self.agent_extension.get_clean_steps(node={'uuid': '1'},
ports=[])
results = as_results.join().command_result
expected_steps = {
'clean_steps': {

View File

@ -0,0 +1,6 @@
---
fixes:
- |
If root device hints are provided on the node, wait for the root device
instead of the first disk. This fixes deployment when the target disk takes
time to load.