Fix for failure in cleaning
The cleaning operation may fail if an in-band clean step executes after
an out-of-band clean step that reboots the node. The failure is caused
by a race condition in which cleaning is resumed before the Ironic
Python Agent (IPA) is ready to execute clean steps.

Story: #2002731
Task: #22580
Change-Id: Idaacb9fbb1ea3ac82cdb6769df05d8206660c8cb
This commit is contained in:
parent 3347e155b3
commit 7c5a04c114
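Before the individual hunks, a rough orientation may help. The sketch below is illustrative only: it condenses the handshake this change introduces (an out-of-band step flags the reboot, the conductor waits instead of failing when the agent is unreachable) into a small standalone Python module. All names in it (FakeNode, mark_oob_reboot, run_clean_step) are invented for the sketch and are not Ironic code.

# Illustrative sketch of the fix; not Ironic code. See the real hunks below.


class AgentConnectionFailed(Exception):
    """Stand-in for the new ironic.common.exception.AgentConnectionFailed."""


class FakeNode:
    """Minimal stand-in for an Ironic node object."""

    def __init__(self):
        self.driver_internal_info = {}

    def save(self):
        pass  # the real node object persists driver_internal_info


def mark_oob_reboot(node):
    # Out-of-band driver side (cf. DracRAID._resume_cleaning below):
    # remember that the node was just rebooted by an out-of-band step.
    info = node.driver_internal_info
    info['cleaning_reboot'] = True
    node.driver_internal_info = info
    node.save()


def run_clean_step(node, step, execute_step):
    # Conductor side (cf. ConductorManager._do_next_clean_step below):
    # if the agent is unreachable right after an out-of-band reboot,
    # park in CLEANWAIT and keep the current step instead of failing.
    try:
        return execute_step(step)
    except AgentConnectionFailed:
        if node.driver_internal_info.get('cleaning_reboot'):
            node.driver_internal_info['skip_current_clean_step'] = False
            return 'CLEANWAIT'
        raise  # any other connection failure is still a hard failure


if __name__ == '__main__':
    node = FakeNode()
    mark_oob_reboot(node)

    def unreachable_agent(step):
        raise AgentConnectionFailed('agent not up yet')

    # Prints CLEANWAIT: the step is retried once the agent heartbeats.
    print(run_clean_step(node, 'apply_configuration', unreachable_agent))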
@@ -787,3 +787,7 @@ class BIOSSettingListNotFound(NotFound):

class DatabaseVersionTooOld(IronicException):
    _msg_fmt = _("Database version is too old")


class AgentConnectionFailed(IronicException):
    _msg_fmt = _("Connection to agent failed: %(reason)s")
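The new AgentConnectionFailed follows the usual IronicException convention of filling _msg_fmt from keyword arguments. Below is a loose, standalone model of that convention, assuming the standard %-style substitution; SketchIronicException is invented for illustration and is not the real base class.

class SketchIronicException(Exception):
    """Loose stand-in for ironic.common.exception.IronicException."""

    _msg_fmt = "An unknown exception occurred."

    def __init__(self, **kwargs):
        # The real base class does more (logging, validation); this only
        # models the message formatting used by AgentConnectionFailed.
        super().__init__(self._msg_fmt % kwargs)


class AgentConnectionFailed(SketchIronicException):
    _msg_fmt = "Connection to agent failed: %(reason)s"


# str(AgentConnectionFailed(reason='connection refused'))
# -> 'Connection to agent failed: connection refused'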
@@ -1236,6 +1236,19 @@ class ConductorManager(base_manager.BaseConductorManager):
            try:
                result = interface.execute_clean_step(task, step)
            except Exception as e:
                if isinstance(e, exception.AgentConnectionFailed):
                    if task.node.driver_internal_info.get('cleaning_reboot'):
                        LOG.info('Agent is not yet running on node %(node)s '
                                 'after cleaning reboot, waiting for agent to '
                                 'come up to run next clean step %(step)s.',
                                 {'node': node.uuid, 'step': step})
                        driver_internal_info['skip_current_clean_step'] = False
                        node.driver_internal_info = driver_internal_info
                        target_state = (states.MANAGEABLE if manual_clean
                                        else None)
                        task.process_event('wait', target_state=target_state)
                        return

                msg = (_('Node %(node)s failed step %(step)s: '
                         '%(exc)s') %
                       {'node': node.uuid, 'exc': e,
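Setting skip_current_clean_step to False here is what makes the parked step run again once cleaning resumes. The helper below is a hypothetical, simplified picture of that resume decision; the real logic lives elsewhere in the conductor and differs in detail.

def step_index_on_resume(driver_internal_info):
    # Hypothetical helper, not Ironic code: when cleaning resumes after
    # CLEANWAIT, a False 'skip_current_clean_step' means the step that
    # never reached the agent is executed again instead of being skipped.
    current = driver_internal_info.get('clean_step_index')
    if current is None:
        return 0
    if driver_internal_info.get('skip_current_clean_step', True):
        return current + 1
    return current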
@@ -1270,6 +1283,7 @@ class ConductorManager(base_manager.BaseConductorManager):
        driver_internal_info = node.driver_internal_info
        driver_internal_info['clean_steps'] = None
        driver_internal_info.pop('clean_step_index', None)
        driver_internal_info.pop('cleaning_reboot', None)
        node.driver_internal_info = driver_internal_info
        node.save()
        try:
@@ -466,6 +466,7 @@ class AgentDeployMixin(HeartbeatMixin):
            info = task.node.driver_internal_info
            info.pop('cleaning_reboot', None)
            task.node.driver_internal_info = info
            task.node.save()
            manager_utils.notify_conductor_resume_clean(task)
            return
        else:
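This hunk is the counterpart of the conductor change: when the agent finally comes up and heartbeats, the one-shot cleaning_reboot marker is cleared and the conductor is told to resume. The sketch below is a condensed, illustrative view of that branch; the function name and the notify_resume parameter are invented for the sketch.

def on_heartbeat_after_cleaning_reboot(task, notify_resume):
    # Illustrative only: clear the one-shot reboot marker set by the
    # out-of-band step and ask the conductor to continue cleaning.
    info = task.node.driver_internal_info
    if info.pop('cleaning_reboot', None):
        task.node.driver_internal_info = info
        task.node.save()
        notify_resume(task)
        return True
    return False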
@@ -81,6 +81,12 @@ class AgentClient(object):

        try:
            response = self.session.post(url, params=request_params, data=body)
        except requests.ConnectionError as e:
            msg = (_('Failed to invoke agent command %(method)s for node '
                     '%(node)s. Error: %(error)s') %
                   {'method': method, 'node': node.uuid, 'error': e})
            LOG.error(msg)
            raise exception.AgentConnectionFailed(reason=msg)
        except requests.RequestException as e:
            msg = (_('Error invoking agent command %(method)s for node '
                     '%(node)s. Error: %(error)s') %
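This is where a transport-level failure becomes the new, more specific exception: requests.ConnectionError (the agent is not listening yet) is wrapped into AgentConnectionFailed, while other requests errors keep their existing handling. Below is a minimal standalone reproduction of the same wrapping pattern, exercised with a mocked session; post_command and the URL are illustrative, not the Ironic API.

import unittest.mock as mock

import requests


class AgentConnectionFailed(Exception):
    """Stand-in for ironic.common.exception.AgentConnectionFailed."""


def post_command(session, url, body):
    # Same idea as AgentClient._command above: a connection-level error
    # is wrapped so callers can tell "agent unreachable" apart from an
    # HTTP-level failure.
    try:
        return session.post(url, data=body)
    except requests.ConnectionError as e:
        raise AgentConnectionFailed('Failed to reach agent: %s' % e)


session = mock.Mock()
session.post.side_effect = requests.ConnectionError('Boom')
try:
    post_command(session, 'http://192.0.2.10:9999/v1/commands', '{}')
except AgentConnectionFailed as e:
    print(e)  # Failed to reach agent: Boom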
@@ -895,4 +895,8 @@ class DracRAID(base.RAIDInterface):
    def _resume_cleaning(self, task):
        raid_common.update_raid_info(
            task.node, self.get_logical_disks(task))
        driver_internal_info = task.node.driver_internal_info
        driver_internal_info['cleaning_reboot'] = True
        task.node.driver_internal_info = driver_internal_info
        task.node.save()
        manager_utils.notify_conductor_resume_clean(task)
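Design note: within this change the DRAC RAID interface is the only in-tree driver that sets cleaning_reboot, because its RAID clean steps reboot the node out of band before calling notify_conductor_resume_clean. Other out-of-band steps that reboot the node would presumably need to set the same flag to benefit from the conductor-side wait.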
@@ -2831,6 +2831,114 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
    def test__do_next_clean_step_manual_execute_fail(self):
        self._do_next_clean_step_execute_fail(manual=True)

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
                autospec=True)
    def test_do_next_clean_step_oob_reboot(self, mock_execute):
        # When a clean step fails, go to CLEANWAIT
        tgt_prov_state = states.MANAGEABLE

        self._start_service()
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            provision_state=states.CLEANING,
            target_provision_state=tgt_prov_state,
            last_error=None,
            driver_internal_info={'clean_steps': self.clean_steps,
                                  'clean_step_index': None,
                                  'cleaning_reboot': True},
            clean_step={})
        mock_execute.side_effect = exception.AgentConnectionFailed(
            reason='failed')

        with task_manager.acquire(
                self.context, node.uuid, shared=False) as task:
            self.service._do_next_clean_step(task, 0)

        self._stop_service()
        node.refresh()

        # Make sure we go to CLEANWAIT
        self.assertEqual(states.CLEANWAIT, node.provision_state)
        self.assertEqual(tgt_prov_state, node.target_provision_state)
        self.assertEqual(self.clean_steps[0], node.clean_step)
        self.assertEqual(0, node.driver_internal_info['clean_step_index'])
        self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
        mock_execute.assert_called_once_with(
            mock.ANY, mock.ANY, self.clean_steps[0])

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
                autospec=True)
    def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):
        # Resume where last_step is the last cleaning step
        tgt_prov_state = states.MANAGEABLE
        info = {'clean_steps': self.clean_steps,
                'cleaning_reboot': True,
                'clean_step_index': len(self.clean_steps) - 1}

        self._start_service()
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            provision_state=states.CLEANING,
            target_provision_state=tgt_prov_state,
            last_error=None,
            driver_internal_info=info,
            clean_step=self.clean_steps[-1])

        with task_manager.acquire(
                self.context, node.uuid, shared=False) as task:
            self.service._do_next_clean_step(task, None)

        self._stop_service()
        node.refresh()

        # Cleaning should be complete without calling additional steps
        self.assertEqual(tgt_prov_state, node.provision_state)
        self.assertEqual(states.NOSTATE, node.target_provision_state)
        self.assertEqual({}, node.clean_step)
        self.assertNotIn('clean_step_index', node.driver_internal_info)
        self.assertNotIn('cleaning_reboot', node.driver_internal_info)
        self.assertIsNone(node.driver_internal_info['clean_steps'])
        self.assertFalse(mock_execute.called)

    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
                autospec=True)
    @mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
    def test_do_next_clean_step_oob_reboot_fail(self, tear_mock,
                                                mock_execute):
        # When a clean step fails with no reboot requested go to CLEANFAIL
        tgt_prov_state = states.MANAGEABLE

        self._start_service()
        node = obj_utils.create_test_node(
            self.context, driver='fake-hardware',
            provision_state=states.CLEANING,
            target_provision_state=tgt_prov_state,
            last_error=None,
            driver_internal_info={'clean_steps': self.clean_steps,
                                  'clean_step_index': None},
            clean_step={})
        mock_execute.side_effect = exception.AgentConnectionFailed(
            reason='failed')

        with task_manager.acquire(
                self.context, node.uuid, shared=False) as task:
            self.service._do_next_clean_step(task, 0)
            tear_mock.assert_called_once_with(task.driver.deploy, task)

        self._stop_service()
        node.refresh()

        # Make sure we go to CLEANFAIL, clear clean_steps
        self.assertEqual(states.CLEANFAIL, node.provision_state)
        self.assertEqual(tgt_prov_state, node.target_provision_state)
        self.assertEqual({}, node.clean_step)
        self.assertNotIn('clean_step_index', node.driver_internal_info)
        self.assertNotIn('skip_current_clean_step', node.driver_internal_info)
        self.assertIsNotNone(node.last_error)
        self.assertTrue(node.maintenance)
        mock_execute.assert_called_once_with(
            mock.ANY, mock.ANY, self.clean_steps[0])

    @mock.patch.object(manager, 'LOG', autospec=True)
    @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
                autospec=True)
@@ -133,6 +133,24 @@ class TestAgentClient(base.TestCase):
                          {'method': method, 'node': self.node.uuid,
                           'error': error}, str(e))

    def test__command_fail_connect(self):
        error = 'Boom'
        self.client.session.post.side_effect = requests.ConnectionError(error)
        method = 'foo.bar'
        params = {}

        self.client._get_command_url(self.node)
        self.client._get_command_body(method, params)

        e = self.assertRaises(exception.AgentConnectionFailed,
                              self.client._command,
                              self.node, method, params)
        self.assertEqual('Connection to agent failed: Failed to invoke '
                         'agent command %(method)s for node %(node)s. '
                         'Error: %(error)s' %
                         {'method': method, 'node': self.node.uuid,
                          'error': error}, str(e))

    def test__command_error_code(self):
        response_text = '{"faultstring": "you dun goofd"}'
        self.client.session.post.return_value = MockResponse(
@@ -0,0 +1,10 @@
---
fixes:
  - |
    The cleaning operation could fail if an in-band clean step ran
    after an out-of-band clean step that reboots the node. The
    failure was caused by a race condition in which cleaning resumed
    before the Ironic Python Agent (IPA) was ready to execute clean
    steps. This has been fixed. For more information, see
    `bug 2002731
    <https://storyboard.openstack.org/#!/story/2002731>`_.