Fix for failure in cleaning

The cleaning operation may fail, if an in-band clean step were to
execute after the completion of out-of-band clean step that
performs reboot of the node. The failure is caused because of race
condition where in cleaning is resumed before the Ironic Python
Agent(IPA) is ready to execute clean steps.

Story: #2002731
Task: #22580
Change-Id: Idaacb9fbb1ea3ac82cdb6769df05d8206660c8cb
(cherry picked from commit 7c5a04c114)
This commit is contained in:
Julia Kreger 2020-08-25 14:37:16 -04:00
parent 918478b4e1
commit 800a5c4d14
8 changed files with 165 additions and 0 deletions

View File

@ -774,3 +774,7 @@ class InstanceRescueFailure(IronicException):
class InstanceUnrescueFailure(IronicException): class InstanceUnrescueFailure(IronicException):
_msg_fmt = _('Failed to unrescue instance %(instance)s for node ' _msg_fmt = _('Failed to unrescue instance %(instance)s for node '
'%(node)s: %(reason)s') '%(node)s: %(reason)s')
class AgentConnectionFailed(IronicException):
_msg_fmt = _("Connection to agent failed: %(reason)s")

View File

@ -1253,6 +1253,19 @@ class ConductorManager(base_manager.BaseConductorManager):
try: try:
result = interface.execute_clean_step(task, step) result = interface.execute_clean_step(task, step)
except Exception as e: except Exception as e:
if isinstance(e, exception.AgentConnectionFailed):
if task.node.driver_internal_info.get('cleaning_reboot'):
LOG.info('Agent is not yet running on node %(node)s '
'after cleaning reboot, waiting for agent to '
'come up to run next clean step %(step)s.',
{'node': node.uuid, 'step': step})
driver_internal_info['skip_current_clean_step'] = False
node.driver_internal_info = driver_internal_info
target_state = (states.MANAGEABLE if manual_clean
else None)
task.process_event('wait', target_state=target_state)
return
msg = (_('Node %(node)s failed step %(step)s: ' msg = (_('Node %(node)s failed step %(step)s: '
'%(exc)s') % '%(exc)s') %
{'node': node.uuid, 'exc': e, {'node': node.uuid, 'exc': e,
@ -1286,6 +1299,7 @@ class ConductorManager(base_manager.BaseConductorManager):
node.clean_step = None node.clean_step = None
driver_internal_info['clean_steps'] = None driver_internal_info['clean_steps'] = None
driver_internal_info.pop('clean_step_index', None) driver_internal_info.pop('clean_step_index', None)
driver_internal_info.pop('cleaning_reboot', None)
node.driver_internal_info = driver_internal_info node.driver_internal_info = driver_internal_info
node.save() node.save()
try: try:

View File

@ -472,6 +472,7 @@ class AgentDeployMixin(HeartbeatMixin):
info = task.node.driver_internal_info info = task.node.driver_internal_info
info.pop('cleaning_reboot', None) info.pop('cleaning_reboot', None)
task.node.driver_internal_info = info task.node.driver_internal_info = info
task.node.save()
_notify_conductor_resume_clean(task) _notify_conductor_resume_clean(task)
return return
else: else:

View File

@ -64,6 +64,12 @@ class AgentClient(object):
try: try:
response = self.session.post(url, params=request_params, data=body) response = self.session.post(url, params=request_params, data=body)
except requests.ConnectionError as e:
msg = (_('Failed to invoke agent command %(method)s for node '
'%(node)s. Error: %(error)s') %
{'method': method, 'node': node.uuid, 'error': e})
LOG.error(msg)
raise exception.AgentConnectionFailed(reason=msg)
except requests.RequestException as e: except requests.RequestException as e:
msg = (_('Error invoking agent command %(method)s for node ' msg = (_('Error invoking agent command %(method)s for node '
'%(node)s. Error: %(error)s') % '%(node)s. Error: %(error)s') %

View File

@ -895,4 +895,8 @@ class DracRAID(base.RAIDInterface):
def _resume_cleaning(self, task): def _resume_cleaning(self, task):
raid_common.update_raid_info( raid_common.update_raid_info(
task.node, self.get_logical_disks(task)) task.node, self.get_logical_disks(task))
driver_internal_info = task.node.driver_internal_info
driver_internal_info['cleaning_reboot'] = True
task.node.driver_internal_info = driver_internal_info
task.node.save()
agent_base_vendor._notify_conductor_resume_clean(task) agent_base_vendor._notify_conductor_resume_clean(task)

View File

@ -2780,6 +2780,114 @@ class DoNodeCleanTestCase(mgr_utils.ServiceSetUpMixin, db_base.DbTestCase):
def test__do_next_clean_step_manual_execute_fail(self): def test__do_next_clean_step_manual_execute_fail(self):
self._do_next_clean_step_execute_fail(manual=True) self._do_next_clean_step_execute_fail(manual=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
def test_do_next_clean_step_oob_reboot(self, mock_execute):
# When a clean step fails, go to CLEANWAIT
tgt_prov_state = states.MANAGEABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'clean_steps': self.clean_steps,
'clean_step_index': None,
'cleaning_reboot': True},
clean_step={})
mock_execute.side_effect = exception.AgentConnectionFailed(
reason='failed')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
self._stop_service()
node.refresh()
# Make sure we go to CLEANWAIT
self.assertEqual(states.CLEANWAIT, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual(self.clean_steps[0], node.clean_step)
self.assertEqual(0, node.driver_internal_info['clean_step_index'])
self.assertFalse(node.driver_internal_info['skip_current_clean_step'])
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0])
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
def test_do_next_clean_step_oob_reboot_last_step(self, mock_execute):
# Resume where last_step is the last cleaning step
tgt_prov_state = states.MANAGEABLE
info = {'clean_steps': self.clean_steps,
'cleaning_reboot': True,
'clean_step_index': len(self.clean_steps) - 1}
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info=info,
clean_step=self.clean_steps[-1])
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, None)
self._stop_service()
node.refresh()
# Cleaning should be complete without calling additional steps
self.assertEqual(tgt_prov_state, node.provision_state)
self.assertEqual(states.NOSTATE, node.target_provision_state)
self.assertEqual({}, node.clean_step)
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertNotIn('cleaning_reboot', node.driver_internal_info)
self.assertIsNone(node.driver_internal_info['clean_steps'])
self.assertFalse(mock_execute.called)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True)
@mock.patch.object(fake.FakeDeploy, 'tear_down_cleaning', autospec=True)
def test_do_next_clean_step_oob_reboot_fail(self, tear_mock,
mock_execute):
# When a clean step fails with no reboot requested go to CLEANFAIL
tgt_prov_state = states.MANAGEABLE
self._start_service()
node = obj_utils.create_test_node(
self.context, driver='fake-hardware',
provision_state=states.CLEANING,
target_provision_state=tgt_prov_state,
last_error=None,
driver_internal_info={'clean_steps': self.clean_steps,
'clean_step_index': None},
clean_step={})
mock_execute.side_effect = exception.AgentConnectionFailed(
reason='failed')
with task_manager.acquire(
self.context, node.uuid, shared=False) as task:
self.service._do_next_clean_step(task, 0)
tear_mock.assert_called_once_with(task.driver.deploy, task)
self._stop_service()
node.refresh()
# Make sure we go to CLEANFAIL, clear clean_steps
self.assertEqual(states.CLEANFAIL, node.provision_state)
self.assertEqual(tgt_prov_state, node.target_provision_state)
self.assertEqual({}, node.clean_step)
self.assertNotIn('clean_step_index', node.driver_internal_info)
self.assertNotIn('skip_current_clean_step', node.driver_internal_info)
self.assertIsNotNone(node.last_error)
self.assertTrue(node.maintenance)
mock_execute.assert_called_once_with(
mock.ANY, mock.ANY, self.clean_steps[0])
@mock.patch.object(manager, 'LOG', autospec=True) @mock.patch.object(manager, 'LOG', autospec=True)
@mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step', @mock.patch('ironic.drivers.modules.fake.FakeDeploy.execute_clean_step',
autospec=True) autospec=True)

View File

@ -133,6 +133,24 @@ class TestAgentClient(base.TestCase):
{'method': method, 'node': self.node.uuid, {'method': method, 'node': self.node.uuid,
'error': error}, str(e)) 'error': error}, str(e))
def test__command_fail_connect(self):
error = 'Boom'
self.client.session.post.side_effect = requests.ConnectionError(error)
method = 'foo.bar'
params = {}
self.client._get_command_url(self.node)
self.client._get_command_body(method, params)
e = self.assertRaises(exception.AgentConnectionFailed,
self.client._command,
self.node, method, params)
self.assertEqual('Connection to agent failed: Failed to invoke '
'agent command %(method)s for node %(node)s. '
'Error: %(error)s' %
{'method': method, 'node': self.node.uuid,
'error': error}, str(e))
def test__command_error_code(self): def test__command_error_code(self):
response_text = '{"faultstring": "you dun goofd"}' response_text = '{"faultstring": "you dun goofd"}'
self.client.session.post.return_value = MockResponse( self.client.session.post.return_value = MockResponse(

View File

@ -0,0 +1,10 @@
---
fixes:
- |
The cleaning operation may fail, if an in-band clean step were to
execute after the completion of out-of-band clean step that
performs reboot of the node. The failure is caused because of race
condition where in cleaning is resumed before the Ironic Python
Agent(IPA) is ready to execute clean steps. This has been fixed.
For more information, see `bug 2002731
<https://storyboard.openstack.org/#!/story/2002731>`_.