diff --git a/ironic/conf/drac.py b/ironic/conf/drac.py index ebf402ce17..83e60f3e6e 100644 --- a/ironic/conf/drac.py +++ b/ironic/conf/drac.py @@ -39,7 +39,13 @@ opts = [ min=0, default=60, help=_('Number of seconds to wait between checking for ' - 'completed import configuration task')) + 'completed import configuration task')), + cfg.IntOpt('bios_factory_reset_timeout', + default=600, + min=1, + help=_('Maximum time (in seconds) to wait for factory reset of ' + 'BIOS settings to complete.')) + ] diff --git a/ironic/drivers/modules/drac/bios.py b/ironic/drivers/modules/drac/bios.py index db4a497a10..b6f59db61b 100644 --- a/ironic/drivers/modules/drac/bios.py +++ b/ironic/drivers/modules/drac/bios.py @@ -19,6 +19,7 @@ from futurist import periodics from ironic_lib import metrics_utils from oslo_log import log as logging from oslo_utils import importutils +from oslo_utils import timeutils from ironic.common import exception from ironic.common.i18n import _ @@ -32,13 +33,48 @@ from ironic.drivers.modules.drac import job as drac_job from ironic.drivers.modules.redfish import bios as redfish_bios from ironic import objects +drac_client = importutils.try_import('dracclient.client') drac_exceptions = importutils.try_import('dracclient.exceptions') +drac_uris = importutils.try_import('dracclient.resources.uris') +drac_utils = importutils.try_import('dracclient.utils') LOG = logging.getLogger(__name__) METRICS = metrics_utils.get_metrics_logger(__name__) +def _get_last_system_inventory_time(client, task): + """Gets last system inventory time + + Uses last_system_inventory_time, if that is not available, then fall + backs to raw requests. Warns user about option to update + python-dracclient. + :param client: python-dracclient instance + :param task: a TaskManager instance with the node to act on + :returns: Last system inventory time + """ + try: + return client.get_system().last_system_inventory_time + except AttributeError as ae: + LOG.warning("Failed to get the last system inventory time for node " + "%(node_uuid)s. Update the python-dracclient to the " + "latest version. Reason: %(error)s", + {"node_uuid": task.node.uuid, "error": ae}) + driver_info = drac_common.parse_driver_info(task.node) + client = drac_client.WSManClient( + driver_info['drac_address'], driver_info['drac_username'], + driver_info['drac_password'], driver_info['drac_port'], + driver_info['drac_path'], driver_info['drac_protocol']) + + doc = client.enumerate( + drac_uris.DCIM_SystemView, filter_query=( + 'select LastSystemInventoryTime from DCIM_SystemView')) + + return drac_utils.find_xml( + doc, 'LastSystemInventoryTime', + drac_uris.DCIM_SystemView).text.split('.')[0] + + class DracRedfishBIOS(redfish_bios.RedfishBIOS): """iDRAC Redfish interface for BIOS settings-related actions. @@ -166,7 +202,6 @@ class DracWSManBIOS(base.BIOSInterface): driver_internal_info) in node_list: try: lock_purpose = 'checking async bios configuration jobs' - job_ids = driver_internal_info.get('bios_config_job_ids') # Performing read-only/non-destructive work with shared lock with task_manager.acquire(context, node_uuid, purpose=lock_purpose, @@ -175,11 +210,13 @@ class DracWSManBIOS(base.BIOSInterface): if not isinstance(task.driver.bios, DracWSManBIOS): continue - # skip if nothing to check on this node - if not job_ids: - continue + # check bios_config_job_id exist & checks job is completed + if driver_internal_info.get("bios_config_job_ids"): + self._check_node_bios_jobs(task) - self._check_node_bios_jobs(task) + if driver_internal_info.get( + "factory_reset_time_before_reboot"): + self._check_last_system_inventory_changed(task) except exception.NodeNotFound: LOG.info("During query_bios_config_job_status, node " @@ -190,6 +227,74 @@ class DracWSManBIOS(base.BIOSInterface): "%(node)s was already locked by another process. " "Skip.", {'node': node_uuid}) + def _check_last_system_inventory_changed(self, task): + """Check the progress of last system inventory time of a node. + + This handles jobs for BIOS factory reset. Handle means, + it checks for job status to not only signify completed jobs but + also handle failures by invoking the 'fail' event, allowing the + conductor to put the node into clean/deploy FAIL state. + + :param task: a TaskManager instance with the node to act on + """ + node = task.node + client = drac_common.get_drac_client(node) + # Get the last system inventory time from node before reboot + factory_reset_time_before_reboot = node.driver_internal_info.get( + 'factory_reset_time_before_reboot') + + # Get the factory reset start time + factory_reset_time = node.driver_internal_info.get( + 'factory_reset_time') + LOG.debug("Factory resetting node %(node_uuid)s factory reset time " + " %(factory_reset_time)s", {"node_uuid": task.node.uuid, + "factory_reset_time": + factory_reset_time}) + # local variable to track difference between current time and factory + # reset start time + time_difference = 0 + # Get the last system inventory time after reboot + factory_reset_time_endof_reboot = _get_last_system_inventory_time( + client, task) + + LOG.debug("Factory resetting node %(node_uuid)s " + "last inventory reboot time after factory reset " + "%(factory_reset_time_endof_reboot)s", + {"node_uuid": task.node.uuid, + "factory_reset_time_endof_reboot": + factory_reset_time_endof_reboot}) + + if factory_reset_time_before_reboot != factory_reset_time_endof_reboot: + # from the database cleanup with factory reset time + self._delete_cached_reboot_time(node) + # Cache the new BIOS settings, + self.cache_bios_settings(task) + self._resume_current_operation(task) + else: + # Calculate difference between current time and factory reset + # start time if it is more than configured timeout then set + # the node to fail state + time = timeutils.utcnow(with_timezone=True + ) - timeutils.parse_isotime(str( + factory_reset_time)) + time_difference = time.total_seconds() + LOG.debug("Factory resetting node %(node_uuid)s " + "time difference %(time_difference)s ", + {"node_uuid": task.node.uuid, "time_difference": + time_difference}) + + if time_difference > CONF.drac.bios_factory_reset_timeout: + task.upgrade_lock() + self._delete_cached_reboot_time(node) + error_message = ("BIOS factory reset was not completed within " + "{} seconds, unable to cache updated bios " + "setting").format( + CONF.drac.bios_factory_reset_timeout) + self._set_failed(task, error_message) + else: + LOG.debug("Factory reset for a node %(node)s is not done " + "will check again later", {'node': task.node.uuid}) + def _check_node_bios_jobs(self, task): """Check the progress of running BIOS config jobs of a node. @@ -234,7 +339,9 @@ class DracWSManBIOS(base.BIOSInterface): else: # invoke 'fail' event to allow conductor to put the node in # a clean/deploy fail state - self._set_failed(task, config_job) + error_message = ("Failed config job: {}. Message: '{}'.".format( + config_job.id, config_job.message)) + self._set_failed(task, error_message) def _delete_cached_config_job_ids(self, node, finished_job_ids=None): """Remove Job IDs from the driver_internal_info table in database. @@ -255,24 +362,32 @@ class DracWSManBIOS(base.BIOSInterface): node.driver_internal_info = driver_internal_info node.save() - def _set_failed(self, task, config_job): + def _delete_cached_reboot_time(self, node): + """Remove factory time from the driver_internal_info table in database. + + :param node: an ironic node object + """ + driver_internal_info = node.driver_internal_info + # Remove the last reboot time and factory reset time + driver_internal_info.pop( + 'factory_reset_time_before_reboot') + driver_internal_info.pop('factory_reset_time') + node.driver_internal_info = driver_internal_info + node.save() + + def _set_failed(self, task, error_message): """Set the node in failed state by invoking 'fail' event. :param task: a TaskManager instance with node to act on - :param config_job: a python-dracclient Job object (named tuple) + :param error_message: Error message """ - error_msg = (_("Failed config job: %(config_job_id)s. " - "Message: '%(message)s'.") % - {'config_job_id': config_job.id, - 'message': config_job.message}) - log_msg = ("BIOS configuration job failed for node %(node)s. " - "%(error)s " % + log_msg = ("BIOS configuration failed for node %(node)s. %(error)s " % {'node': task.node.uuid, - 'error': error_msg}) + 'error': error_message}) if task.node.clean_step: - manager_utils.cleaning_error_handler(task, log_msg, error_msg) + manager_utils.cleaning_error_handler(task, log_msg, error_message) else: - manager_utils.deploying_error_handler(task, log_msg, error_msg) + manager_utils.deploying_error_handler(task, log_msg, error_message) def _resume_current_operation(self, task): """Continue cleaning/deployment of the node. @@ -327,20 +442,39 @@ class DracWSManBIOS(base.BIOSInterface): # reboot_required=False/Optional, which is not desired reboot_needed = True try: + factory_reset_time_before_reboot =\ + _get_last_system_inventory_time(client, task) + + LOG.debug("Factory resetting node %(node_uuid)s " + "last inventory reboot time before factory reset " + "%(factory_reset_time_before_reboot)s", + {"node_uuid": task.node.uuid, + "factory_reset_time_before_reboot": + factory_reset_time_before_reboot}) + commit_job_id = client.commit_pending_lifecycle_changes( reboot=reboot_needed) + LOG.info("Commit job id of a node %(node_uuid)s." + "%(commit_job_id)s", {'node_uuid': node.uuid, + "commit_job_id": commit_job_id}) except drac_exceptions.BaseClientException as exc: LOG.error('Failed to commit BIOS reset on node ' '%(node_uuid)s. Reason: %(error)s.', { 'node_uuid': node.uuid, 'error': exc}) raise exception.DracOperationError(error=exc) - - # Store JobID for the async job handler _check_node_bios_jobs + # Store the last inventory time on reboot for async job handler + # _check_last_system_inventory_changed driver_internal_info = node.driver_internal_info - driver_internal_info.setdefault( - 'bios_config_job_ids', []).append(commit_job_id) + driver_internal_info['factory_reset_time_before_reboot'] = \ + factory_reset_time_before_reboot + # Store the current time to later check if factory reset times out + driver_internal_info['factory_reset_time'] = str( + timeutils.utcnow(with_timezone=True)) + node.driver_internal_info = driver_internal_info + # rebooting the server to apply factory reset value + client.set_power_state('REBOOT') # This method calls node.save(), bios_config_job_id will be # saved automatically diff --git a/ironic/tests/unit/drivers/modules/drac/test_bios.py b/ironic/tests/unit/drivers/modules/drac/test_bios.py index 3903d979c4..8aae25be25 100644 --- a/ironic/tests/unit/drivers/modules/drac/test_bios.py +++ b/ironic/tests/unit/drivers/modules/drac/test_bios.py @@ -21,7 +21,10 @@ Test class for DRAC BIOS configuration specific methods from unittest import mock +import dracclient.client from dracclient import exceptions as drac_exceptions +import dracclient.utils +from oslo_utils import timeutils from ironic.common import exception from ironic.common import states @@ -44,7 +47,7 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): self.node = obj_utils.create_test_node(self.context, driver='idrac', driver_info=INFO_DICT) - + self.bios = drac_bios.DracWSManBIOS() patch_get_drac_client = mock.patch.object( drac_common, 'get_drac_client', spec_set=True, autospec=True) mock_get_drac_client = patch_get_drac_client.start() @@ -121,6 +124,8 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): self.assertRaises(exception.DracOperationError, task.driver.bios.cache_bios_settings, task) + @mock.patch.object(dracclient.client, 'WSManClient', autospec=True) + @mock.patch.object(dracclient.utils, 'find_xml', autospec=True) @mock.patch.object(deploy_utils, 'get_async_step_return_state', autospec=True) @mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True) @@ -129,8 +134,8 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): @mock.patch.object(drac_job, 'validate_job_queue', spec_set=True, autospec=True) def _test_step(self, mock_validate_job_queue, mock_cache_bios_settings, - mock_set_async_step_flags, - mock_get_async_step_return_state): + mock_set_async_step_flags, mock_get_async_step_return_state, + mock_findxml, mock_wsmanclient, attribute_error=False): if self.node.clean_step: step_data = self.node.clean_step expected_state = states.CLEANWAIT @@ -148,6 +153,22 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): shared=False) as task: info = task.node.driver_internal_info if step == 'factory_reset': + mock_system = None + factory_reset_time_before_reboot = None + + if attribute_error: + mock_system = mock.Mock(spec=[]) + mock_xml = mock.Mock() + mock_xml.text = '20200910233024.000000+000' + mock_findxml.return_value = mock_xml + else: + mock_system = mock.Mock() + factory_reset_time_before_reboot = "20200910233024" + mock_system.last_system_inventory_time =\ + "20200910233024" + + self.mock_client.get_system.return_value = mock_system + ret_state = task.driver.bios.factory_reset(task) attrib = {"BIOS Reset To Defaults Requested": "True"} @@ -155,9 +176,13 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): assert_called_once_with(attrib) self.mock_client.commit_pending_lifecycle_changes.\ assert_called_once_with(reboot=True) - job_id = self.mock_client.commit_pending_lifecycle_changes() - self.assertIn(job_id, info['bios_config_job_ids']) - + self.mock_client.get_system.assert_called_once() + if attribute_error: + mock_findxml.assert_called_once() + else: + self.assertEqual(factory_reset_time_before_reboot, + info['factory_reset_time_before_reboot']) + mock_findxml.assert_not_called() if step == 'apply_configuration': ret_state = task.driver.bios.apply_configuration(task, data) @@ -181,12 +206,24 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): self.node.save() self._test_step() + def test_factory_reset_clean_attribute_error(self): + self.node.clean_step = {'priority': 100, 'interface': 'bios', + 'step': 'factory_reset', 'argsinfo': {}} + self.node.save() + self._test_step(attribute_error=True) + def test_factory_reset_deploy(self): self.node.deploy_step = {'priority': 100, 'interface': 'bios', 'step': 'factory_reset', 'argsinfo': {}} self.node.save() self._test_step() + def test_factory_reset_deploy_attribute_error(self): + self.node.deploy_step = {'priority': 100, 'interface': 'bios', + 'step': 'factory_reset', 'argsinfo': {}} + self.node.save() + self._test_step(attribute_error=True) + def test_apply_configuration_clean(self): settings = [{'name': 'ProcVirtualization', 'value': 'Enabled'}] self.node.clean_step = {'priority': 100, 'interface': 'bios', @@ -318,6 +355,161 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest): mock_cleaning_error_handler.assert_called_once_with( task, mock.ANY, "Failed config job: 123. Message: 'Invalid'.") + def test__check_last_system_inventory_changed_different_inventory_time( + self): + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + + driver_internal_info = task.node.driver_internal_info + driver_internal_info["factory_reset_time_before_reboot"] = \ + "20200910233024" + current_time = str(timeutils.utcnow(True)) + driver_internal_info["factory_reset_time"] = current_time + task.node.driver_internal_info = driver_internal_info + task.node.save() + mock_system = mock.Mock() + mock_system.last_system_inventory_time =\ + "20200910233523" + self.mock_client.get_system.return_value = mock_system + mock_resume = mock.Mock() + task.driver.bios._resume_current_operation = mock_resume + mock_cache = mock.Mock() + task.driver.bios.cache_bios_settings = mock_cache + + task.driver.bios._check_last_system_inventory_changed(task) + + self.assertIsNone(task.node.driver_internal_info.get( + 'factory_reset_time_before_reboot')) + self.assertIsNone( + task.node.driver_internal_info.get('factory_reset_time')) + mock_cache.assert_called_once_with(task) + mock_resume.assert_called_once_with(task) + + def test__check_last_system_inventory_changed_same_inventory_time(self): + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + + driver_internal_info = task.node.driver_internal_info + driver_internal_info['factory_reset_time_before_reboot'] = \ + "20200910233024" + current_time = str(timeutils.utcnow(True)) + driver_internal_info['factory_reset_time'] = current_time + task.node.driver_internal_info = driver_internal_info + task.node.save() + mock_system = mock.Mock() + mock_system.last_system_inventory_time =\ + "20200910233024" + self.mock_client.get_system.return_value = mock_system + + task.driver.bios._check_last_system_inventory_changed(task) + + self.assertIsNotNone( + task.node.driver_internal_info.get('factory_reset_time')) + self.assertEqual(current_time, + task.node.driver_internal_info.get( + 'factory_reset_time')) + self.assertEqual("20200910233024", + task.node.driver_internal_info.get( + 'factory_reset_time_before_reboot')) + + def test__check_last_system_inventory_changed_same_inventory_time_timeout( + self): + with task_manager.acquire(self.context, self.node.uuid, + shared=False) as task: + + driver_internal_info = task.node.driver_internal_info + driver_internal_info['factory_reset_time_before_reboot'] = \ + "20200910233024" + driver_internal_info['factory_reset_time'] = \ + '2020-09-25 15:02:57.903318+00:00' + task.node.driver_internal_info = driver_internal_info + task.node.save() + mock_system = mock.Mock() + mock_system.last_system_inventory_time =\ + "20200910233024" + self.mock_client.get_system.return_value = mock_system + mock_failed = mock.Mock() + task.driver.bios._set_failed = mock_failed + + task.driver.bios._check_last_system_inventory_changed(task) + + self.assertIsNone(task.node.driver_internal_info.get( + 'factory_reset_time_before_reboot')) + self.assertIsNone( + task.node.driver_internal_info.get('factory_reset_time')) + fail = ("BIOS factory reset was not completed within 600 " + "seconds, unable to cache updated bios setting") + mock_failed.assert_called_once_with(task, fail) + + @mock.patch.object(task_manager, 'acquire', autospec=True) + def test__query_bios_config_job_status(self, mock_acquire): + driver_internal_info = {'bios_config_job_ids': ['42'], + 'factory_reset_time_before_reboot': + "20200910233024"} + self.node.driver_internal_info = driver_internal_info + self.node.save() + mock_manager = mock.Mock() + node_list = [(self.node.uuid, 'idrac', '', + driver_internal_info)] + mock_manager.iter_nodes.return_value = node_list + # mock task_manager.acquire + task = mock.Mock(node=self.node, driver=mock.Mock(bios=self.bios)) + mock_acquire.return_value = mock.MagicMock( + __enter__=mock.MagicMock(return_value=task)) + self.bios._check_node_bios_jobs = mock.Mock() + self.bios._check_last_system_inventory_changed = mock.Mock() + + self.bios._query_bios_config_job_status(mock_manager, + self.context) + + self.bios._check_node_bios_jobs.assert_called_once_with(task) + self.bios._check_last_system_inventory_changed.assert_called_once_with( + task) + + @mock.patch.object(task_manager, 'acquire', autospec=True) + def test__query_bios_config_job_status_no_config_jobs(self, + mock_acquire): + # mock manager + mock_manager = mock.Mock() + node_list = [(self.node.uuid, 'idrac', '', {})] + mock_manager.iter_nodes.return_value = node_list + # mock task_manager.acquire + task = mock.Mock(node=self.node, driver=mock.Mock(bios=self.bios)) + mock_acquire.return_value = mock.MagicMock( + __enter__=mock.MagicMock(return_value=task)) + self.bios._check_node_bios_jobs = mock.Mock() + self.bios._check_last_system_inventory_changed = mock.Mock() + + self.bios._query_bios_config_job_status(mock_manager, + None) + + self.bios._check_node_bios_jobs.assert_not_called() + self.bios._check_last_system_inventory_changed.assert_not_called() + + @mock.patch.object(task_manager, 'acquire', autospec=True) + def test__query_bios_config_job_status_no_driver(self, + mock_acquire): + driver_internal_info = {'bios_config_job_ids': ['42'], + 'factory_reset_time_before_reboot': + "20200910233024"} + self.node.driver_internal_info = driver_internal_info + self.node.save() + mock_manager = mock.Mock() + node_list = [(self.node.uuid, '', '', driver_internal_info)] + mock_manager.iter_nodes.return_value = node_list + # mock task_manager.acquire + task = mock.Mock(node=self.node, driver=mock.Mock(bios="")) + mock_acquire.return_value = mock.MagicMock( + __enter__=mock.MagicMock(return_value=task)) + self.bios._check_node_bios_jobs = mock.Mock() + self.bios._check_last_system_inventory_changed = mock.Mock() + + self.bios._query_bios_config_job_status(mock_manager, + None) + + self.bios._check_node_bios_jobs.assert_not_called() + self.bios._check_last_system_inventory_changed.assert_not_called() + class DracBIOSConfigurationTestCase(test_utils.BaseDracTest): diff --git a/ironic/tests/unit/drivers/third_party_driver_mock_specs.py b/ironic/tests/unit/drivers/third_party_driver_mock_specs.py index 2a4f082258..a8d0b996e0 100644 --- a/ironic/tests/unit/drivers/third_party_driver_mock_specs.py +++ b/ironic/tests/unit/drivers/third_party_driver_mock_specs.py @@ -20,11 +20,14 @@ when needing to mock those third party modules""" DRACCLIENT_SPEC = ( 'client', 'constants', - 'exceptions' + 'exceptions', + 'resources', + 'utils' ) DRACCLIENT_CLIENT_MOD_SPEC = ( 'DRACClient', + 'WSManClient' ) DRACCLIENT_CONSTANTS_MOD_SPEC = ( @@ -46,6 +49,14 @@ DRACCLIENT_CONSTANTS_RAID_STATUS_MOD_SPEC = ( 'raid' ) +DRACCLIENT_RESOURCES_URIS_MOD_SPEC = ( + 'DCIM_SystemView', +) + +DRACCLIENT_UTILS_MOD_SPEC = ( + 'find_xml', +) + # proliantutils PROLIANTUTILS_SPEC = ( 'exception', diff --git a/ironic/tests/unit/drivers/third_party_driver_mocks.py b/ironic/tests/unit/drivers/third_party_driver_mocks.py index 8274dca060..694f40d8f9 100644 --- a/ironic/tests/unit/drivers/third_party_driver_mocks.py +++ b/ironic/tests/unit/drivers/third_party_driver_mocks.py @@ -99,11 +99,20 @@ if not dracclient: spec_set=mock_specs.DRACCLIENT_CONSTANTS_RAID_STATUS_MOD_SPEC, jbod=mock.sentinel.jbod, raid=mock.sentinel.raid) + dracclient.resources.uris = mock.MagicMock( + spec_set=mock_specs.DRACCLIENT_RESOURCES_URIS_MOD_SPEC, + DCIM_SystemView=mock.sentinel.DCIM_SystemView + ) + dracclient.utils = mock.MagicMock( + spec_set=mock_specs.DRACCLIENT_UTILS_MOD_SPEC + ) sys.modules['dracclient'] = dracclient sys.modules['dracclient.client'] = dracclient.client sys.modules['dracclient.constants'] = dracclient.constants sys.modules['dracclient.exceptions'] = dracclient.exceptions + sys.modules['dracclient.resources.uris'] = dracclient.resources.uris + sys.modules['dracclient.utils'] = dracclient.utils dracclient.exceptions.BaseClientException = type('BaseClientException', (Exception,), {}) diff --git a/releasenotes/notes/bug-2008058-fix-factory-reset-status.yaml-52a6119b46e33b37.yaml b/releasenotes/notes/bug-2008058-fix-factory-reset-status.yaml-52a6119b46e33b37.yaml new file mode 100644 index 0000000000..a31af15b8d --- /dev/null +++ b/releasenotes/notes/bug-2008058-fix-factory-reset-status.yaml-52a6119b46e33b37.yaml @@ -0,0 +1,8 @@ +--- +fixes: + - | + Fixes the ``idrac-wsman`` BIOS ``factory_reset`` clean and deploy step to + indicate success and update the cached BIOS settings to their defaults only + when the BIOS settings have actually been reset. See story + `2008058 `_ for more + details.