Fix clear foreign config in idrac-redfish RAID

After volumes are deleted in Redfish RAID, also
clear the foreign configuration, if there is any.

Story: 2009160
Task: 43145

Depends-On: https://review.opendev.org/c/x/sushy-oem-idrac/+/806888
Change-Id: Ifde4656b4edd387ce2db2dbfc4c5ede261fafc70
This commit is contained in:
Aija Jauntēva 2021-08-16 09:04:38 -04:00
parent f0b3695b30
commit 50c87cf633
6 changed files with 512 additions and 2 deletions

View File

@ -20,4 +20,4 @@ ansible>=2.7
python-ibmcclient>=0.2.2,<0.3.0
# Dell EMC iDRAC sushy OEM extension
sushy-oem-idrac>=3.0.0,<4.0.0
sushy-oem-idrac>=3.0.1,<4.0.0

View File

@ -1322,6 +1322,25 @@ class DracRedfishRAID(redfish_raid.RedfishRAID):
else:
return logical_disks_to_create
def post_delete_configuration(self, task, raid_configs, return_state=None):
    """Clear foreign configuration after delete_configuration has run.

    Looks up the node's Redfish system and asks
    ``_clear_foreign_config`` to clear foreign configuration on it.

    :param task: a TaskManager instance containing the node to act on.
    :param raid_configs: a list of dictionaries containing the RAID
        configuration operation details. Not read by this method.
    :param return_state: state to return based on operation being invoked
    :returns: the async step return state for the node when clearing
        foreign configuration needs a reboot and asynchronous
        processing, otherwise ``return_state`` unchanged.
    """
    needs_async = DracRedfishRAID._clear_foreign_config(
        redfish_utils.get_system(task.node), task)
    if not needs_async:
        return return_state
    # Async processing with system rebooting in progress
    return deploy_utils.get_async_step_return_state(task.node)
@staticmethod
def _get_storage_controller(system, identity):
"""Finds storage and controller by identity
@ -1419,6 +1438,150 @@ class DracRedfishRAID(redfish_raid.RedfishRAID):
return logical_disks_to_create
@staticmethod
def _clear_foreign_config(system, task):
    """Clear foreign configuration through the Dell OEM extension.

    If any of the returned tasks is of the RAID configuration job type,
    the node is flagged for asynchronous processing, the ramdisk is
    prepared and the task monitor URIs are saved in
    ``driver_internal_info['raid_task_monitor_uris']``. Otherwise the
    tasks that are still processing are waited for synchronously.

    :param system: Redfish system
    :param task: a TaskManager instance containing the node to act on
    :returns: True if system needs rebooting and async processing for
        tasks necessary, otherwise False
    """
    dell_system = system.get_oem_extension('Dell')
    try:
        task_mons = dell_system.clear_foreign_config()
    except AttributeError as ae:
        # For backported version where libraries could be too old
        LOG.warning('Failed to find method to clear foreign config. '
                    'Possibly because `sushy-oem-idrac` is too old. '
                    'Without newer `sushy-oem-idrac` no foreign '
                    'configuration will be cleared if there is any. '
                    'To avoid that update `sushy-oem-idrac`. '
                    'Error: %(err)s', {'err': ae})
        return False

    def _requires_reboot(task_mon):
        # True when the monitored job is of the RAID configuration type.
        oem_task = task_mon.get_task().get_oem_extension('Dell')
        return oem_task.job_type == sushy_oem_idrac.JOB_TYPE_RAID_CONF

    # any() stops at the first matching task, like an early-exit loop.
    if any(_requires_reboot(tm) for tm in task_mons):
        # System rebooting, prepare ramdisk to boot back in IPA
        deploy_utils.set_async_step_flags(
            task.node,
            reboot=True,
            skip_current_step=True,
            polling=True)
        agent_opts = deploy_utils.build_agent_options(task.node)
        task.driver.boot.prepare_ramdisk(task, agent_opts)
        # Reboot already done by non real time task
        task.upgrade_lock()
        internal_info = task.node.driver_internal_info
        internal_info['raid_task_monitor_uris'] = [
            tm.task_monitor_uri for tm in task_mons]
        task.node.driver_internal_info = internal_info
        task.node.save()
        return True

    # No task requiring reboot found, proceed with waiting for sync tasks
    # NOTE(review): the outcome of the waited tasks is not inspected here.
    for tm in task_mons:
        if tm.check_is_processing:
            tm.wait(CONF.drac.raid_job_timeout)
    return False
@METRICS.timer('DracRedfishRAID._query_raid_tasks_status')
@periodics.periodic(
    spacing=CONF.drac.query_raid_config_job_status_interval)
def _query_raid_tasks_status(self, manager, context):
    """Periodic task to check the progress of running RAID tasks.

    Iterates over unreserved nodes that are not in maintenance and, for
    each one with ``raid_task_monitor_uris`` in its
    ``driver_internal_info``, checks the recorded tasks under a shared
    lock. Nodes whose RAID interface is not ``DracRedfishRAID`` are
    skipped, as are nodes that disappeared or are locked elsewhere.

    :param manager: object providing ``iter_nodes``
    :param context: request context used to acquire the node lock
    """
    candidates = manager.iter_nodes(
        fields=['driver_internal_info'],
        filters={'reserved': False, 'maintenance': False})
    for node_uuid, _driver, _conductor_group, internal_info in candidates:
        monitor_uris = internal_info.get('raid_task_monitor_uris')
        if not monitor_uris:
            continue
        try:
            with task_manager.acquire(
                    context, node_uuid,
                    purpose='checking async RAID tasks',
                    shared=True) as task:
                if isinstance(task.driver.raid, DracRedfishRAID):
                    self._check_raid_tasks_status(task, monitor_uris)
        except exception.NodeNotFound:
            LOG.info('During _query_raid_tasks_status, node '
                     '%(node)s was not found and presumed deleted by '
                     'another process.', {'node': node_uuid})
        except exception.NodeLocked:
            LOG.info('During _query_raid_tasks_status, node '
                     '%(node)s was already locked by another process. '
                     'Skip.', {'node': node_uuid})
def _check_raid_tasks_status(self, task, task_mon_uris):
    """Check RAID tasks for completion and update the node accordingly.

    If at least one finished task failed, the whole step is failed.
    If some tasks are still running, their URIs are kept in
    ``driver_internal_info`` so they are checked in the next period.
    When all tasks finished without failure, the step is resumed.

    :param task: a TaskManager instance containing the node to act on
    :param task_mon_uris: list of task monitor URIs to check
    """
    node = task.node
    finished_uris = []
    failures = []
    for uri in task_mon_uris:
        monitor = redfish_utils.get_task_monitor(node, uri)
        if monitor.is_processing:
            continue
        raid_task = monitor.get_task()
        finished_uris.append(uri)
        succeeded = (
            raid_task.task_state == sushy.TASK_STATE_COMPLETED
            and raid_task.task_status in
            [sushy.HEALTH_OK, sushy.HEALTH_WARNING])
        if succeeded:
            continue
        details = ', '.join(
            m.message for m in raid_task.messages
            if m.message is not None)
        failures.append(
            (_("Task %(task_mon_uri)s. "
               "Message: '%(message)s'.")
             % {'task_mon_uri': uri,
                'message': details}))

    task.upgrade_lock()
    info = node.driver_internal_info
    if failures:
        error_msg = (_("Failed RAID configuration tasks: %(messages)s")
                     % {'messages': ', '.join(failures)})
        log_msg = ("RAID configuration task failed for node "
                   "%(node)s. %(error)s" % {'node': node.uuid,
                                            'error': error_msg})
        info.pop('raid_task_monitor_uris', None)
        self._set_failed(task, log_msg, error_msg)
    else:
        pending_uris = [uri for uri in task_mon_uris
                        if uri not in finished_uris]
        if pending_uris:
            # will check remaining jobs in the next period
            info['raid_task_monitor_uris'] = pending_uris
            node.driver_internal_info = info
        else:
            # all tasks completed and none of them failed
            info.pop('raid_task_monitor_uris', None)
            self._set_success(task)
    node.driver_internal_info = info
    node.save()
def _set_failed(self, task, log_msg, error_msg):
    """Report a failure through the cleaning or deploying error handler.

    :param task: a TaskManager instance containing the node to act on
    :param log_msg: message passed to the handler for logging
    :param error_msg: message passed to the handler as the error
    """
    # A set clean_step selects the cleaning handler, otherwise deploying.
    handler = (manager_utils.cleaning_error_handler
               if task.node.clean_step
               else manager_utils.deploying_error_handler)
    handler(task, log_msg, error_msg)
def _set_success(self, task):
    """Notify the conductor to resume cleaning or deployment.

    :param task: a TaskManager instance containing the node to act on
    """
    # A set clean_step selects the clean notifier, otherwise deploy.
    notify = (manager_utils.notify_conductor_resume_clean
              if task.node.clean_step
              else manager_utils.notify_conductor_resume_deploy)
    notify(task)
class DracWSManRAID(base.RAIDInterface):

View File

@ -26,8 +26,10 @@ import tenacity
from ironic.common import exception
from ironic.common import states
from ironic.conductor import task_manager
from ironic.conductor import utils as manager_utils
from ironic.conf import CONF
from ironic.drivers import base
from ironic.drivers.modules import deploy_utils
from ironic.drivers.modules.drac import common as drac_common
from ironic.drivers.modules.drac import job as drac_job
from ironic.drivers.modules.drac import raid as drac_raid
@ -2450,3 +2452,338 @@ class DracRedfishRAIDTestCase(test_utils.BaseDracTest):
self.assertEqual(False, result)
mock_log.assert_called_once()
@mock.patch.object(deploy_utils, 'get_async_step_return_state',
                   autospec=True)
@mock.patch.object(deploy_utils, 'build_agent_options', autospec=True)
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_post_delete_configuration_foreign_async(
        self, mock_get_system, mock_build_agent_options,
        mock_get_async_step_return_state):
    """A task needing a reboot switches the step to async processing.

    Besides the returned state, this verifies that the ramdisk is
    prepared and the task monitor URIs are saved on the node.
    """
    fake_oem_system = mock.Mock()
    fake_system = mock.Mock()
    fake_system.get_oem_extension.return_value = fake_oem_system
    mock_get_system.return_value = fake_system
    task = mock.Mock(node=self.node, context=self.context)
    mock_return_state1 = mock.Mock()
    mock_return_state2 = mock.Mock()
    mock_get_async_step_return_state.return_value = mock_return_state2
    # First task is real-time (no reboot) and still processing.
    mock_oem_task1 = mock.Mock(
        job_type=sushy_oem_idrac.JOB_TYPE_RT_NO_REBOOT_CONF)
    mock_task1 = mock.Mock()
    mock_task1.get_oem_extension.return_value = mock_oem_task1
    mock_task_mon1 = mock.Mock(check_is_processing=True)
    mock_task_mon1.task_monitor_uri = '/TaskService/1'
    mock_task_mon1.get_task.return_value = mock_task1
    # Second task is of the RAID configuration job type.
    mock_oem_task2 = mock.Mock(job_type=sushy_oem_idrac.JOB_TYPE_RAID_CONF)
    mock_task2 = mock.Mock()
    mock_task2.get_oem_extension.return_value = mock_oem_task2
    mock_task_mon2 = mock.Mock(check_is_processing=False)
    mock_task_mon2.task_monitor_uri = '/TaskService/2'
    mock_task_mon2.get_task.return_value = mock_task2
    fake_oem_system.clear_foreign_config.return_value = [
        mock_task_mon1, mock_task_mon2]

    result = self.raid.post_delete_configuration(
        task, None, return_state=mock_return_state1)

    self.assertEqual(mock_return_state2, result)
    fake_oem_system.clear_foreign_config.assert_called_once()
    mock_build_agent_options.assert_called_once_with(task.node)
    task.driver.boot.prepare_ramdisk.assert_called_once_with(
        task, mock_build_agent_options.return_value)
    mock_get_async_step_return_state.assert_called_once_with(task.node)
    # All monitors must be recorded for later polling.
    self.assertEqual(
        ['/TaskService/1', '/TaskService/2'],
        self.node.driver_internal_info['raid_task_monitor_uris'])
    mock_task_mon1.wait.assert_not_called()
    mock_task_mon2.wait.assert_not_called()
@mock.patch.object(redfish_utils, 'get_system', autospec=True)
def test_post_delete_configuration_foreign_sync(self, mock_get_system):
    """Tasks that need no reboot are waited for and the state is kept."""
    oem_system = mock.Mock()
    system = mock.Mock()
    system.get_oem_extension.return_value = oem_system
    mock_get_system.return_value = system
    task = mock.Mock(node=self.node, context=self.context)
    requested_state = mock.Mock()

    # Two real-time tasks: one still processing, one already finished.
    monitors = []
    for still_processing in (True, False):
        oem_task = mock.Mock(
            job_type=sushy_oem_idrac.JOB_TYPE_RT_NO_REBOOT_CONF)
        redfish_task = mock.Mock()
        redfish_task.get_oem_extension.return_value = oem_task
        monitor = mock.Mock(check_is_processing=still_processing)
        monitor.get_task.return_value = redfish_task
        monitors.append(monitor)
    busy_monitor, idle_monitor = monitors
    oem_system.clear_foreign_config.return_value = [
        busy_monitor, idle_monitor]

    result = self.raid.post_delete_configuration(
        task, None, return_state=requested_state)

    self.assertEqual(result, requested_state)
    oem_system.clear_foreign_config.assert_called_once()
    busy_monitor.wait.assert_called_once_with(CONF.drac.raid_job_timeout)
    idle_monitor.wait.assert_not_called()
@mock.patch.object(drac_raid.LOG, 'warning', autospec=True)
def test__clear_foreign_config_attribute_error(self, mock_log):
    """A missing ``clear_foreign_config`` logs a warning, returns False."""
    # spec=[] leaves the mock without a ``clear_foreign_config`` attribute.
    oem_system_without_method = mock.Mock(spec=[])
    system = mock.Mock()
    system.get_oem_extension.return_value = oem_system_without_method

    outcome = drac_raid.DracRedfishRAID._clear_foreign_config(
        system, mock.Mock())

    self.assertEqual(False, outcome)
    mock_log.assert_called_once()
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_raid_tasks_status(self, mock_acquire):
    """A node with saved task monitor URIs gets its tasks checked."""
    pending_info = {'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = pending_info
    self.node.save()
    mock_manager = mock.Mock()
    mock_manager.iter_nodes.return_value = [
        (self.node.uuid, 'idrac', '', pending_info)]
    task = mock.Mock(node=self.node,
                     driver=mock.Mock(raid=self.raid))
    mock_acquire.return_value = mock.MagicMock(
        __enter__=mock.MagicMock(return_value=task))
    self.raid._check_raid_tasks_status = mock.Mock()

    self.raid._query_raid_tasks_status(mock_manager, self.context)

    self.raid._check_raid_tasks_status.assert_called_once_with(
        task, ['/TaskService/123'])
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_raid_tasks_status_not_drac(self, mock_acquire):
    """Nodes whose RAID interface is not DracRedfishRAID are skipped."""
    pending_info = {'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = pending_info
    self.node.save()
    mock_manager = mock.Mock()
    mock_manager.iter_nodes.return_value = [
        (self.node.uuid, 'not-idrac', '', pending_info)]
    # The RAID interface is a plain Mock, not a DracRedfishRAID.
    foreign_driver = mock.Mock(raid=mock.Mock())
    task = mock.Mock(node=self.node, driver=foreign_driver)
    mock_acquire.return_value = mock.MagicMock(
        __enter__=mock.MagicMock(return_value=task))
    self.raid._check_raid_tasks_status = mock.Mock()

    self.raid._query_raid_tasks_status(mock_manager, self.context)

    self.raid._check_raid_tasks_status.assert_not_called()
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_raid_tasks_status_no_task_monitor_url(self, mock_acquire):
    """Nodes without saved task monitor URIs are not checked."""
    unrelated_info = {'something': 'else'}
    self.node.driver_internal_info = unrelated_info
    self.node.save()
    mock_manager = mock.Mock()
    mock_manager.iter_nodes.return_value = [
        (self.node.uuid, 'idrac', '', unrelated_info)]
    task = mock.Mock(node=self.node,
                     driver=mock.Mock(raid=self.raid))
    mock_acquire.return_value = mock.MagicMock(
        __enter__=mock.MagicMock(return_value=task))
    self.raid._check_raid_tasks_status = mock.Mock()

    self.raid._query_raid_tasks_status(mock_manager, self.context)

    self.raid._check_raid_tasks_status.assert_not_called()
@mock.patch.object(drac_raid.LOG, 'info', autospec=True)
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_raid_tasks_status_node_notfound(
        self, mock_acquire, mock_log):
    """NodeNotFound while acquiring the lock is logged and skipped."""
    pending_info = {'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = pending_info
    self.node.save()
    mock_manager = mock.Mock()
    mock_manager.iter_nodes.return_value = [
        (self.node.uuid, 'idrac', '', pending_info)]
    mock_acquire.side_effect = exception.NodeNotFound
    self.raid._check_raid_tasks_status = mock.Mock()

    self.raid._query_raid_tasks_status(mock_manager, self.context)

    self.raid._check_raid_tasks_status.assert_not_called()
    self.assertTrue(mock_log.called)
@mock.patch.object(drac_raid.LOG, 'info', autospec=True)
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_raid_tasks_status_node_locked(
        self, mock_acquire, mock_log):
    """NodeLocked while acquiring the lock is logged and skipped."""
    pending_info = {'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = pending_info
    self.node.save()
    mock_manager = mock.Mock()
    mock_manager.iter_nodes.return_value = [
        (self.node.uuid, 'idrac', '', pending_info)]
    mock_acquire.side_effect = exception.NodeLocked
    self.raid._check_raid_tasks_status = mock.Mock()

    self.raid._query_raid_tasks_status(mock_manager, self.context)

    self.raid._check_raid_tasks_status.assert_not_called()
    self.assertTrue(mock_log.called)
@mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True)
def test__check_raid_tasks_status(self, mock_get_task_monitor):
    """A completed, healthy task resumes the step and clears the URIs."""
    # The driver saves the task monitor URIs as a list, so the fixture
    # uses a list rather than a bare string.
    driver_internal_info = {
        'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = driver_internal_info
    self.node.save()

    mock_message = mock.Mock()
    mock_message.message = 'Clear foreign config done'
    mock_config_task = mock.Mock()
    mock_config_task.task_state = sushy.TASK_STATE_COMPLETED
    mock_config_task.task_status = sushy.HEALTH_OK
    mock_config_task.messages = [mock_message]
    mock_task_monitor = mock.Mock()
    mock_task_monitor.is_processing = False
    mock_task_monitor.get_task.return_value = mock_config_task
    mock_get_task_monitor.return_value = mock_task_monitor

    self.raid._set_success = mock.Mock()
    self.raid._set_failed = mock.Mock()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.raid._check_raid_tasks_status(
            task, ['/TaskService/123'])

        self.raid._set_success.assert_called_once_with(task)
        self.assertIsNone(
            task.node.driver_internal_info.get('raid_task_monitor_uris'))
        self.raid._set_failed.assert_not_called()
@mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True)
def test__check_raid_tasks_status_task_still_processing(
        self, mock_get_task_monitor):
    """Only the still-processing task's URI stays saved on the node."""
    # The driver saves the task monitor URIs as a list, so the fixture
    # uses a list (with both checked URIs) rather than a bare string.
    driver_internal_info = {
        'raid_task_monitor_uris': ['/TaskService/123', '/TaskService/456']}
    self.node.driver_internal_info = driver_internal_info
    self.node.save()

    mock_message = mock.Mock()
    mock_message.message = 'Clear foreign config done'
    mock_config_task = mock.Mock()
    mock_config_task.task_state = sushy.TASK_STATE_COMPLETED
    mock_config_task.task_status = sushy.HEALTH_OK
    mock_config_task.messages = [mock_message]
    # First monitor has finished, second one is still processing.
    mock_task_monitor = mock.Mock()
    mock_task_monitor.is_processing = False
    mock_task_monitor.get_task.return_value = mock_config_task
    mock_task_monitor2 = mock.Mock()
    mock_task_monitor2.is_processing = True
    mock_get_task_monitor.side_effect = [
        mock_task_monitor, mock_task_monitor2]

    self.raid._set_success = mock.Mock()
    self.raid._set_failed = mock.Mock()
    self.raid._substep_change_physical_disk_state_nonraid = mock.Mock()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.raid._check_raid_tasks_status(
            task, ['/TaskService/123', '/TaskService/456'])

        (self.raid._substep_change_physical_disk_state_nonraid
            .assert_not_called())
        self.raid._set_success.assert_not_called()
        self.assertEqual(
            ['/TaskService/456'],
            task.node.driver_internal_info.get('raid_task_monitor_uris'))
        self.raid._set_failed.assert_not_called()
@mock.patch.object(redfish_utils, 'get_task_monitor', autospec=True)
def test__check_raid_tasks_status_task_failed(self, mock_get_task_monitor):
    """A finished task with a bad status fails the step, clears URIs."""
    # The driver saves the task monitor URIs as a list, so the fixture
    # uses a list rather than a bare string.
    driver_internal_info = {
        'raid_task_monitor_uris': ['/TaskService/123']}
    self.node.driver_internal_info = driver_internal_info
    self.node.save()

    mock_message = mock.Mock()
    mock_message.message = 'Clear foreign config failed'
    mock_config_task = mock.Mock()
    mock_config_task.task_state = sushy.TASK_STATE_COMPLETED
    mock_config_task.task_status = 'Failed'
    mock_config_task.messages = [mock_message]
    mock_task_monitor = mock.Mock()
    mock_task_monitor.is_processing = False
    mock_task_monitor.get_task.return_value = mock_config_task
    mock_get_task_monitor.return_value = mock_task_monitor

    self.raid._set_success = mock.Mock()
    self.raid._set_failed = mock.Mock()
    self.raid._substep_change_physical_disk_state_nonraid = mock.Mock()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.raid._check_raid_tasks_status(
            task, ['/TaskService/123'])

        (self.raid._substep_change_physical_disk_state_nonraid
            .assert_not_called())
        self.raid._set_success.assert_not_called()
        self.assertIsNone(
            task.node.driver_internal_info.get('raid_task_monitor_uris'))
        self.raid._set_failed.assert_called_once()
@mock.patch.object(manager_utils, 'notify_conductor_resume_deploy',
                   autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
                   autospec=True)
def test__set_success_clean(self, mock_notify_clean, mock_notify_deploy):
    """With a clean step set, only the clean notifier is called."""
    self.node.clean_step = {'test': 'value'}
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.raid._set_success(task)

        mock_notify_clean.assert_called_once_with(task)
        # The deploy path must stay untouched while cleaning.
        mock_notify_deploy.assert_not_called()
@mock.patch.object(manager_utils, 'notify_conductor_resume_deploy',
                   autospec=True)
@mock.patch.object(manager_utils, 'notify_conductor_resume_clean',
                   autospec=True)
def test__set_success_deploy(self, mock_notify_clean, mock_notify_deploy):
    """Without a clean step, only the deploy notifier is called."""
    self.node.clean_step = None
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        self.raid._set_success(task)

        mock_notify_deploy.assert_called_once_with(task)
        # The clean path must stay untouched while deploying.
        mock_notify_clean.assert_not_called()
@mock.patch.object(manager_utils, 'deploying_error_handler',
                   autospec=True)
@mock.patch.object(manager_utils, 'cleaning_error_handler',
                   autospec=True)
def test__set_failed_clean(self, mock_clean_handler, mock_deploy_handler):
    """With a clean step set, only the cleaning handler is called."""
    self.node.clean_step = {'test': 'value'}
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        # Argument order follows _set_failed(task, log_msg, error_msg).
        self.raid._set_failed(task, 'log message', 'error')

        mock_clean_handler.assert_called_once_with(
            task, 'log message', 'error')
        mock_deploy_handler.assert_not_called()
@mock.patch.object(manager_utils, 'deploying_error_handler',
                   autospec=True)
@mock.patch.object(manager_utils, 'cleaning_error_handler',
                   autospec=True)
def test__set_failed_deploy(self, mock_clean_handler, mock_deploy_handler):
    """Without a clean step, only the deploying handler is called."""
    self.node.clean_step = None
    self.node.save()

    with task_manager.acquire(self.context, self.node.uuid,
                              shared=False) as task:
        # Argument order follows _set_failed(task, log_msg, error_msg).
        self.raid._set_failed(task, 'log message', 'error')

        mock_deploy_handler.assert_called_once_with(
            task, 'log message', 'error')
        mock_clean_handler.assert_not_called()

View File

@ -51,6 +51,8 @@ DRACCLIENT_CONSTANTS_RAID_STATUS_MOD_SPEC = (
# Attribute names allowed on the mocked ``sushy_oem_idrac`` module; the
# third-party driver mocks pass this tuple as ``spec_set``.
SUSHY_OEM_IDRAC_MOD_SPEC = (
'PHYSICAL_DISK_STATE_MODE_RAID',
'PHYSICAL_DISK_STATE_MODE_NONRAID',
'JOB_TYPE_RT_NO_REBOOT_CONF',
'JOB_TYPE_RAID_CONF',
)
# proliantutils

View File

@ -131,7 +131,9 @@ if not sushy_oem_idrac:
sushy_oem_idrac = mock.MagicMock(
spec_set=mock_specs.SUSHY_OEM_IDRAC_MOD_SPEC,
PHYSICAL_DISK_STATE_MODE_RAID=raidmode,
PHYSICAL_DISK_STATE_MODE_NONRAID=nonraidmode
PHYSICAL_DISK_STATE_MODE_NONRAID=nonraidmode,
JOB_TYPE_RT_NO_REBOOT_CONF=mock.sentinel.JOB_TYPE_RT_NO_REBOOT_CONF,
JOB_TYPE_RAID_CONF=mock.sentinel.JOB_TYPE_RAID_CONF
)
sys.modules['sushy_oem_idrac'] = sushy_oem_idrac

View File

@ -0,0 +1,6 @@
---
fixes:
- |
Fixes the ``idrac-redfish`` RAID interface ``delete_configuration``
clean/deploy step for controllers that have foreign physical disks.
Foreign configuration is now cleared after virtual disks are deleted.