Fix idrac-wsman RAID step async error handling

Instead of using process_event('fail') use error_handlers,
otherwise in case of failure node gets stuck and fails
because of timeout, instead of failing earlier due to
step failure.

Story: 2008307
Task: 41194
Change-Id: Ieec0173f57367587985d2baad77205bb83e8b69a
This commit is contained in:
Aija Jauntēva 2020-10-02 04:57:42 -04:00
parent 101fc29686
commit 23951f4b44
3 changed files with 44 additions and 13 deletions

View File

@ -1491,13 +1491,18 @@ class DracWSManRAID(base.RAIDInterface):
node.save()
def _set_failed(self, task, config_job):
LOG.error("RAID configuration job failed for node %(node)s. "
"Failed config job: %(config_job_id)s. "
"Message: '%(message)s'.",
{'node': task.node.uuid, 'config_job_id': config_job.id,
'message': config_job.message})
task.node.last_error = config_job.message
task.process_event('fail')
error_msg = (_("Failed config job: %(config_job_id)s. "
"Message: '%(message)s'.") %
{'config_job_id': config_job.id,
'message': config_job.message})
log_msg = (_("RAID configuration job failed for node %(node)s. "
"%(error)s") %
{'node': task.node.uuid, 'error': error_msg})
if task.node.clean_step:
LOG.error(log_msg)
manager_utils.cleaning_error_handler(task, error_msg)
else:
manager_utils.deploying_error_handler(task, log_msg, error_msg)
def _resume(self, task):
raid_common.update_raid_info(

View File

@ -203,9 +203,11 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase):
self._test__check_node_raid_jobs_with_completed_job(
mock_notify_conductor_resume)
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True)
def test__check_node_raid_jobs_with_failed_job(self, mock_get_drac_client):
def test__check_node_raid_jobs_with_failed_job(
self, mock_get_drac_client, mock_cleaning_error_handler):
# mock node.driver_internal_info and node.clean_step
driver_internal_info = {'raid_config_job_ids': ['42']}
self.node.driver_internal_info = driver_internal_info
@ -232,15 +234,18 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase):
self.assertEqual([],
self.node.driver_internal_info['raid_config_job_ids'])
self.assertEqual({}, self.node.raid_config)
task.process_event.assert_called_once_with('fail')
mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY)
@mock.patch.object(manager_utils, 'deploying_error_handler', autospec=True)
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True)
@mock.patch.object(drac_raid.DracRAID, 'get_logical_disks',
spec_set=True, autospec=True)
def _test__check_node_raid_jobs_with_completed_job_already_failed(
self, mock_notify_conductor_resume,
mock_get_logical_disks, mock_get_drac_client):
mock_get_logical_disks, mock_get_drac_client,
mock_cleaning_error_handler, mock_deploying_error_handler):
expected_logical_disk = {'size_gb': 558,
'raid_level': '1',
'name': 'disk 0'}
@ -271,7 +276,12 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase):
self.assertNotIn('raid_config_job_failure',
self.node.driver_internal_info)
self.assertNotIn('logical_disks', self.node.raid_config)
task.process_event.assert_called_once_with('fail')
if self.node.clean_step:
mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY)
else:
mock_deploying_error_handler.assert_called_once_with(task,
mock.ANY,
mock.ANY)
self.assertFalse(mock_notify_conductor_resume.called)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
@ -346,13 +356,16 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase):
self._test__check_node_raid_jobs_with_multiple_jobs_completed(
mock_notify_conductor_resume)
@mock.patch.object(manager_utils, 'deploying_error_handler', autospec=True)
@mock.patch.object(manager_utils, 'cleaning_error_handler', autospec=True)
@mock.patch.object(drac_common, 'get_drac_client', spec_set=True,
autospec=True)
@mock.patch.object(drac_raid.DracRAID, 'get_logical_disks',
spec_set=True, autospec=True)
def _test__check_node_raid_jobs_with_multiple_jobs_failed(
self, mock_notify_conductor_resume,
mock_get_logical_disks, mock_get_drac_client):
mock_get_logical_disks, mock_get_drac_client,
mock_cleaning_error_handler, mock_deploying_error_handler):
expected_logical_disk = {'size_gb': 558,
'raid_level': '1',
'name': 'disk 0'}
@ -387,7 +400,12 @@ class DracPeriodicTaskTestCase(db_base.DbTestCase):
self.assertNotIn('raid_config_job_failure',
self.node.driver_internal_info)
self.assertNotIn('logical_disks', self.node.raid_config)
task.process_event.assert_called_once_with('fail')
if self.node.clean_step:
mock_cleaning_error_handler.assert_called_once_with(task, mock.ANY)
else:
mock_deploying_error_handler.assert_called_once_with(task,
mock.ANY,
mock.ANY)
self.assertFalse(mock_notify_conductor_resume.called)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes ``idrac-wsman`` RAID ``create_configuration`` clean step,
``apply_configuration`` deploy step and ``delete_configuration`` clean and
deploy step to fail correctly in case of error when checking completed
jobs. Before the fix when RAID job failed, then node cleaning or deploying
failed with timeout instead of actual error in clean or deploy step.