Fix idrac-wsman BIOS factory_reset result finding

This change fixes the idrac-wsman BIOS hardware interface
implementation's determination of the result of its factory_reset
cleaning/deploy step, and, on success, updating of the cached BIOS
settings to their defaults. It uses a deterministic means of identifying
when the reset has successfully completed and the default BIOS settings
are available to be read.

The time when the iDRAC last performed a system inventory is referred to
as Collect System Inventory on Restart (CSIOR). CSIOR updates the values
of BIOS settings that can be read from the iDRAC. The interface reads
the CSIOR and records it on the ironic bare metal node before requesting
the factory reset and rebooting the system to process the request.
Following the system reboot, the CSIOR is periodically read until its
value changes from its recorded value. If that occurs before the
configured amount of time has passed, the step succeeds and the cached
BIOS settings are updated. Otherwise, the step fails and the node is
placed in the failed state.

Story: 2008058
Task: 40739
Depends-On: https://review.opendev.org/c/openstack/python-dracclient/+/748571
Change-Id: I11b92612d6686b7133ddef67068664c9b81df30e
This commit is contained in:
Yogesh Ramachandra 2020-08-28 10:19:24 -05:00 committed by Richard Pioso
parent c9f9c2ca9b
commit 82627acadc
6 changed files with 389 additions and 29 deletions

View File

@ -39,7 +39,13 @@ opts = [
min=0,
default=60,
help=_('Number of seconds to wait between checking for '
'completed import configuration task'))
'completed import configuration task')),
cfg.IntOpt('bios_factory_reset_timeout',
default=600,
min=1,
help=_('Maximum time (in seconds) to wait for factory reset of '
'BIOS settings to complete.'))
]

View File

@ -19,6 +19,7 @@ from futurist import periodics
from ironic_lib import metrics_utils
from oslo_log import log as logging
from oslo_utils import importutils
from oslo_utils import timeutils
from ironic.common import exception
from ironic.common.i18n import _
@ -32,13 +33,48 @@ from ironic.drivers.modules.drac import job as drac_job
from ironic.drivers.modules.redfish import bios as redfish_bios
from ironic import objects
drac_client = importutils.try_import('dracclient.client')
drac_exceptions = importutils.try_import('dracclient.exceptions')
drac_uris = importutils.try_import('dracclient.resources.uris')
drac_utils = importutils.try_import('dracclient.utils')
LOG = logging.getLogger(__name__)
METRICS = metrics_utils.get_metrics_logger(__name__)
def _get_last_system_inventory_time(client, task):
"""Gets last system inventory time
Uses last_system_inventory_time, if that is not available, then fall
backs to raw requests. Warns user about option to update
python-dracclient.
:param client: python-dracclient instance
:param task: a TaskManager instance with the node to act on
:returns: Last system inventory time
"""
try:
return client.get_system().last_system_inventory_time
except AttributeError as ae:
LOG.warning("Failed to get the last system inventory time for node "
"%(node_uuid)s. Update the python-dracclient to the "
"latest version. Reason: %(error)s",
{"node_uuid": task.node.uuid, "error": ae})
driver_info = drac_common.parse_driver_info(task.node)
client = drac_client.WSManClient(
driver_info['drac_address'], driver_info['drac_username'],
driver_info['drac_password'], driver_info['drac_port'],
driver_info['drac_path'], driver_info['drac_protocol'])
doc = client.enumerate(
drac_uris.DCIM_SystemView, filter_query=(
'select LastSystemInventoryTime from DCIM_SystemView'))
return drac_utils.find_xml(
doc, 'LastSystemInventoryTime',
drac_uris.DCIM_SystemView).text.split('.')[0]
class DracRedfishBIOS(redfish_bios.RedfishBIOS):
"""iDRAC Redfish interface for BIOS settings-related actions.
@ -166,7 +202,6 @@ class DracWSManBIOS(base.BIOSInterface):
driver_internal_info) in node_list:
try:
lock_purpose = 'checking async bios configuration jobs'
job_ids = driver_internal_info.get('bios_config_job_ids')
# Performing read-only/non-destructive work with shared lock
with task_manager.acquire(context, node_uuid,
purpose=lock_purpose,
@ -175,11 +210,13 @@ class DracWSManBIOS(base.BIOSInterface):
if not isinstance(task.driver.bios, DracWSManBIOS):
continue
# skip if nothing to check on this node
if not job_ids:
continue
# check bios_config_job_id exist & checks job is completed
if driver_internal_info.get("bios_config_job_ids"):
self._check_node_bios_jobs(task)
self._check_node_bios_jobs(task)
if driver_internal_info.get(
"factory_reset_time_before_reboot"):
self._check_last_system_inventory_changed(task)
except exception.NodeNotFound:
LOG.info("During query_bios_config_job_status, node "
@ -190,6 +227,74 @@ class DracWSManBIOS(base.BIOSInterface):
"%(node)s was already locked by another process. "
"Skip.", {'node': node_uuid})
def _check_last_system_inventory_changed(self, task):
"""Check the progress of last system inventory time of a node.
This handles jobs for BIOS factory reset. Handle means,
it checks for job status to not only signify completed jobs but
also handle failures by invoking the 'fail' event, allowing the
conductor to put the node into clean/deploy FAIL state.
:param task: a TaskManager instance with the node to act on
"""
node = task.node
client = drac_common.get_drac_client(node)
# Get the last system inventory time from node before reboot
factory_reset_time_before_reboot = node.driver_internal_info.get(
'factory_reset_time_before_reboot')
# Get the factory reset start time
factory_reset_time = node.driver_internal_info.get(
'factory_reset_time')
LOG.debug("Factory resetting node %(node_uuid)s factory reset time "
" %(factory_reset_time)s", {"node_uuid": task.node.uuid,
"factory_reset_time":
factory_reset_time})
# local variable to track difference between current time and factory
# reset start time
time_difference = 0
# Get the last system inventory time after reboot
factory_reset_time_endof_reboot = _get_last_system_inventory_time(
client, task)
LOG.debug("Factory resetting node %(node_uuid)s "
"last inventory reboot time after factory reset "
"%(factory_reset_time_endof_reboot)s",
{"node_uuid": task.node.uuid,
"factory_reset_time_endof_reboot":
factory_reset_time_endof_reboot})
if factory_reset_time_before_reboot != factory_reset_time_endof_reboot:
# from the database cleanup with factory reset time
self._delete_cached_reboot_time(node)
# Cache the new BIOS settings,
self.cache_bios_settings(task)
self._resume_current_operation(task)
else:
# Calculate difference between current time and factory reset
# start time if it is more than configured timeout then set
# the node to fail state
time = timeutils.utcnow(with_timezone=True
) - timeutils.parse_isotime(str(
factory_reset_time))
time_difference = time.seconds
LOG.debug("Factory resetting node %(node_uuid)s "
"time difference %(time_diffrence)s ",
{"node_uuid": task.node.uuid, "time_diffrence":
time_difference})
if time_difference > CONF.drac.bios_factory_reset_timeout:
task.upgrade_lock()
self._delete_cached_reboot_time(node)
error_message = ("BIOS factory reset was not completed within "
"{} seconds, unable to cache updated bios "
"setting").format(
CONF.drac.bios_factory_reset_timeout)
self._set_failed(task, error_message)
else:
LOG.debug("Factory reset for a node %(node)s is not done "
"will check again later", {'node': task.node.uuid})
def _check_node_bios_jobs(self, task):
"""Check the progress of running BIOS config jobs of a node.
@ -234,7 +339,9 @@ class DracWSManBIOS(base.BIOSInterface):
else:
# invoke 'fail' event to allow conductor to put the node in
# a clean/deploy fail state
self._set_failed(task, config_job)
error_message = ("Failed config job: {}. Message: '{}'.".format(
config_job.id, config_job.message))
self._set_failed(task, error_message)
def _delete_cached_config_job_ids(self, node, finished_job_ids=None):
"""Remove Job IDs from the driver_internal_info table in database.
@ -255,24 +362,32 @@ class DracWSManBIOS(base.BIOSInterface):
node.driver_internal_info = driver_internal_info
node.save()
def _set_failed(self, task, config_job):
def _delete_cached_reboot_time(self, node):
"""Remove factory time from the driver_internal_info table in database.
:param node: an ironic node object
"""
driver_internal_info = node.driver_internal_info
# Remove the last reboot time and factory reset time
driver_internal_info.pop(
'factory_reset_time_before_reboot')
driver_internal_info.pop('factory_reset_time')
node.driver_internal_info = driver_internal_info
node.save()
def _set_failed(self, task, error_message):
"""Set the node in failed state by invoking 'fail' event.
:param task: a TaskManager instance with node to act on
:param config_job: a python-dracclient Job object (named tuple)
:param error_message: Error message
"""
error_msg = (_("Failed config job: %(config_job_id)s. "
"Message: '%(message)s'.") %
{'config_job_id': config_job.id,
'message': config_job.message})
log_msg = ("BIOS configuration job failed for node %(node)s. "
"%(error)s " %
log_msg = ("BIOS configuration failed for node %(node)s. %(error)s " %
{'node': task.node.uuid,
'error': error_msg})
'error': error_message})
if task.node.clean_step:
manager_utils.cleaning_error_handler(task, log_msg, error_msg)
manager_utils.cleaning_error_handler(task, log_msg, error_message)
else:
manager_utils.deploying_error_handler(task, log_msg, error_msg)
manager_utils.deploying_error_handler(task, log_msg, error_message)
def _resume_current_operation(self, task):
"""Continue cleaning/deployment of the node.
@ -327,20 +442,39 @@ class DracWSManBIOS(base.BIOSInterface):
# reboot_required=False/Optional, which is not desired
reboot_needed = True
try:
factory_reset_time_before_reboot =\
_get_last_system_inventory_time(client, task)
LOG.debug("Factory resetting node %(node_uuid)s "
"last inventory reboot time before factory reset "
"%(factory_reset_time_before_reboot)s",
{"node_uuid": task.node.uuid,
"factory_reset_time_before_reboot":
factory_reset_time_before_reboot})
commit_job_id = client.commit_pending_lifecycle_changes(
reboot=reboot_needed)
LOG.info("Commit job id of a node %(node_uuid)s."
"%(commit_job_id)s", {'node_uuid': node.uuid,
"commit_job_id": commit_job_id})
except drac_exceptions.BaseClientException as exc:
LOG.error('Failed to commit BIOS reset on node '
'%(node_uuid)s. Reason: %(error)s.', {
'node_uuid': node.uuid,
'error': exc})
raise exception.DracOperationError(error=exc)
# Store JobID for the async job handler _check_node_bios_jobs
# Store the last inventory time on reboot for async job handler
# _check_last_system_inventory_changed
driver_internal_info = node.driver_internal_info
driver_internal_info.setdefault(
'bios_config_job_ids', []).append(commit_job_id)
driver_internal_info['factory_reset_time_before_reboot'] = \
factory_reset_time_before_reboot
# Store the current time to later check if factory reset times out
driver_internal_info['factory_reset_time'] = str(
timeutils.utcnow(with_timezone=True))
node.driver_internal_info = driver_internal_info
# rebooting the server to apply factory reset value
client.set_power_state('REBOOT')
# This method calls node.save(), bios_config_job_id will be
# saved automatically

View File

@ -21,7 +21,10 @@ Test class for DRAC BIOS configuration specific methods
from unittest import mock
import dracclient.client
from dracclient import exceptions as drac_exceptions
import dracclient.utils
from oslo_utils import timeutils
from ironic.common import exception
from ironic.common import states
@ -44,7 +47,7 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
self.node = obj_utils.create_test_node(self.context,
driver='idrac',
driver_info=INFO_DICT)
self.bios = drac_bios.DracWSManBIOS()
patch_get_drac_client = mock.patch.object(
drac_common, 'get_drac_client', spec_set=True, autospec=True)
mock_get_drac_client = patch_get_drac_client.start()
@ -121,6 +124,8 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
self.assertRaises(exception.DracOperationError,
task.driver.bios.cache_bios_settings, task)
@mock.patch.object(dracclient.client, 'WSManClient', autospec=True)
@mock.patch.object(dracclient.utils, 'find_xml', autospec=True)
@mock.patch.object(deploy_utils, 'get_async_step_return_state',
autospec=True)
@mock.patch.object(deploy_utils, 'set_async_step_flags', autospec=True)
@ -129,8 +134,8 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
@mock.patch.object(drac_job, 'validate_job_queue', spec_set=True,
autospec=True)
def _test_step(self, mock_validate_job_queue, mock_cache_bios_settings,
mock_set_async_step_flags,
mock_get_async_step_return_state):
mock_set_async_step_flags, mock_get_async_step_return_state,
mock_findxml, mock_wsmanclient, attribute_error=False):
if self.node.clean_step:
step_data = self.node.clean_step
expected_state = states.CLEANWAIT
@ -148,6 +153,22 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
shared=False) as task:
info = task.node.driver_internal_info
if step == 'factory_reset':
mock_system = None
factory_reset_time_before_reboot = None
if attribute_error:
mock_system = mock.Mock(spec=[])
mock_xml = mock.Mock()
mock_xml.text = '20200910233024.000000+000'
mock_findxml.return_value = mock_xml
else:
mock_system = mock.Mock()
factory_reset_time_before_reboot = "20200910233024"
mock_system.last_system_inventory_time =\
"20200910233024"
self.mock_client.get_system.return_value = mock_system
ret_state = task.driver.bios.factory_reset(task)
attrib = {"BIOS Reset To Defaults Requested": "True"}
@ -155,9 +176,13 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
assert_called_once_with(attrib)
self.mock_client.commit_pending_lifecycle_changes.\
assert_called_once_with(reboot=True)
job_id = self.mock_client.commit_pending_lifecycle_changes()
self.assertIn(job_id, info['bios_config_job_ids'])
self.mock_client.get_system.assert_called_once()
if attribute_error:
mock_findxml.assert_called_once()
else:
self.assertEqual(factory_reset_time_before_reboot,
info['factory_reset_time_before_reboot'])
mock_findxml.assert_not_called()
if step == 'apply_configuration':
ret_state = task.driver.bios.apply_configuration(task, data)
@ -181,12 +206,24 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
self.node.save()
self._test_step()
def test_factory_reset_clean_attribute_error(self):
self.node.clean_step = {'priority': 100, 'interface': 'bios',
'step': 'factory_reset', 'argsinfo': {}}
self.node.save()
self._test_step(attribute_error=True)
def test_factory_reset_deploy(self):
self.node.deploy_step = {'priority': 100, 'interface': 'bios',
'step': 'factory_reset', 'argsinfo': {}}
self.node.save()
self._test_step()
def test_factory_reset_deploy_attribute_error(self):
self.node.deploy_step = {'priority': 100, 'interface': 'bios',
'step': 'factory_reset', 'argsinfo': {}}
self.node.save()
self._test_step(attribute_error=True)
def test_apply_configuration_clean(self):
settings = [{'name': 'ProcVirtualization', 'value': 'Enabled'}]
self.node.clean_step = {'priority': 100, 'interface': 'bios',
@ -318,6 +355,161 @@ class DracWSManBIOSConfigurationTestCase(test_utils.BaseDracTest):
mock_cleaning_error_handler.assert_called_once_with(
task, mock.ANY, "Failed config job: 123. Message: 'Invalid'.")
def test__check_last_system_inventory_changed_different_inventory_time(
self):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
driver_internal_info = task.node.driver_internal_info
driver_internal_info["factory_reset_time_before_reboot"] = \
"20200910233024"
current_time = str(timeutils.utcnow(True))
driver_internal_info["factory_reset_time"] = current_time
task.node.driver_internal_info = driver_internal_info
task.node.save()
mock_system = mock.Mock()
mock_system.last_system_inventory_time =\
"20200910233523"
self.mock_client.get_system.return_value = mock_system
mock_resume = mock.Mock()
task.driver.bios._resume_current_operation = mock_resume
mock_cache = mock.Mock()
task.driver.bios.cache_bios_settings = mock_cache
task.driver.bios._check_last_system_inventory_changed(task)
self.assertIsNone(task.node.driver_internal_info.get(
'factory_reset_time_before_reboot'))
self.assertIsNone(
task.node.driver_internal_info.get('factory_reset_time'))
mock_cache.assert_called_once_with(task)
mock_resume.assert_called_once_with(task)
def test__check_last_system_inventory_changed_same_inventory_time(self):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
driver_internal_info = task.node.driver_internal_info
driver_internal_info['factory_reset_time_before_reboot'] = \
"20200910233024"
current_time = str(timeutils.utcnow(True))
driver_internal_info['factory_reset_time'] = current_time
task.node.driver_internal_info = driver_internal_info
task.node.save()
mock_system = mock.Mock()
mock_system.last_system_inventory_time =\
"20200910233024"
self.mock_client.get_system.return_value = mock_system
task.driver.bios._check_last_system_inventory_changed(task)
self.assertIsNotNone(
task.node.driver_internal_info.get('factory_reset_time'))
self.assertEqual(current_time,
task.node.driver_internal_info.get(
'factory_reset_time'))
self.assertEqual("20200910233024",
task.node.driver_internal_info.get(
'factory_reset_time_before_reboot'))
def test__check_last_system_inventory_changed_same_inventory_time_timeout(
self):
with task_manager.acquire(self.context, self.node.uuid,
shared=False) as task:
driver_internal_info = task.node.driver_internal_info
driver_internal_info['factory_reset_time_before_reboot'] = \
"20200910233024"
driver_internal_info['factory_reset_time'] = \
'2020-09-25 15:02:57.903318+00:00'
task.node.driver_internal_info = driver_internal_info
task.node.save()
mock_system = mock.Mock()
mock_system.last_system_inventory_time =\
"20200910233024"
self.mock_client.get_system.return_value = mock_system
mock_failed = mock.Mock()
task.driver.bios._set_failed = mock_failed
task.driver.bios._check_last_system_inventory_changed(task)
self.assertIsNone(task.node.driver_internal_info.get(
'factory_reset_time_before_reboot'))
self.assertIsNone(
task.node.driver_internal_info.get('factory_reset_time'))
fail = ("BIOS factory reset was not completed within 600 "
"seconds, unable to cache updated bios setting")
mock_failed.assert_called_once_with(task, fail)
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_bios_config_job_status(self, mock_acquire):
driver_internal_info = {'bios_config_job_ids': ['42'],
'factory_reset_time_before_reboot':
"20200910233024"}
self.node.driver_internal_info = driver_internal_info
self.node.save()
mock_manager = mock.Mock()
node_list = [(self.node.uuid, 'idrac', '',
driver_internal_info)]
mock_manager.iter_nodes.return_value = node_list
# mock task_manager.acquire
task = mock.Mock(node=self.node, driver=mock.Mock(bios=self.bios))
mock_acquire.return_value = mock.MagicMock(
__enter__=mock.MagicMock(return_value=task))
self.bios._check_node_bios_jobs = mock.Mock()
self.bios._check_last_system_inventory_changed = mock.Mock()
self.bios._query_bios_config_job_status(mock_manager,
self.context)
self.bios._check_node_bios_jobs.assert_called_once_with(task)
self.bios._check_last_system_inventory_changed.assert_called_once_with(
task)
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_bios_config_job_status_no_config_jobs(self,
mock_acquire):
# mock manager
mock_manager = mock.Mock()
node_list = [(self.node.uuid, 'idrac', '', {})]
mock_manager.iter_nodes.return_value = node_list
# mock task_manager.acquire
task = mock.Mock(node=self.node, driver=mock.Mock(bios=self.bios))
mock_acquire.return_value = mock.MagicMock(
__enter__=mock.MagicMock(return_value=task))
self.bios._check_node_bios_jobs = mock.Mock()
self.bios._check_last_system_inventory_changed = mock.Mock()
self.bios._query_bios_config_job_status(mock_manager,
None)
self.bios._check_node_bios_jobs.assert_not_called()
self.bios._check_last_system_inventory_changed.assert_not_called()
@mock.patch.object(task_manager, 'acquire', autospec=True)
def test__query_bios_config_job_status_no_driver(self,
mock_acquire):
driver_internal_info = {'bios_config_job_ids': ['42'],
'factory_reset_time_before_reboot':
"20200910233024"}
self.node.driver_internal_info = driver_internal_info
self.node.save()
mock_manager = mock.Mock()
node_list = [(self.node.uuid, '', '', driver_internal_info)]
mock_manager.iter_nodes.return_value = node_list
# mock task_manager.acquire
task = mock.Mock(node=self.node, driver=mock.Mock(bios=""))
mock_acquire.return_value = mock.MagicMock(
__enter__=mock.MagicMock(return_value=task))
self.bios._check_node_bios_jobs = mock.Mock()
self.bios._check_last_system_inventory_changed = mock.Mock()
self.bios._query_bios_config_job_status(mock_manager,
None)
self.bios._check_node_bios_jobs.assert_not_called()
self.bios._check_last_system_inventory_changed.assert_not_called()
class DracBIOSConfigurationTestCase(test_utils.BaseDracTest):

View File

@ -20,11 +20,14 @@ when needing to mock those third party modules"""
DRACCLIENT_SPEC = (
'client',
'constants',
'exceptions'
'exceptions',
'resources',
'utils'
)
DRACCLIENT_CLIENT_MOD_SPEC = (
'DRACClient',
'WSManClient'
)
DRACCLIENT_CONSTANTS_MOD_SPEC = (
@ -46,6 +49,14 @@ DRACCLIENT_CONSTANTS_RAID_STATUS_MOD_SPEC = (
'raid'
)
DRACCLIENT_RESOURCES_URIS_MOD_SPEC = (
'DCIM_SystemView',
)
DRACCLIENT_UTILS_MOD_SPEC = (
'find_xml',
)
# proliantutils
PROLIANTUTILS_SPEC = (
'exception',

View File

@ -99,11 +99,20 @@ if not dracclient:
spec_set=mock_specs.DRACCLIENT_CONSTANTS_RAID_STATUS_MOD_SPEC,
jbod=mock.sentinel.jbod,
raid=mock.sentinel.raid)
dracclient.resources.uris = mock.MagicMock(
spec_set=mock_specs.DRACCLIENT_RESOURCES_URIS_MOD_SPEC,
DCIM_SystemView=mock.sentinel.DCIM_SystemView
)
dracclient.utils = mock.MagicMock(
spec_set=mock_specs.DRACCLIENT_UTILS_MOD_SPEC
)
sys.modules['dracclient'] = dracclient
sys.modules['dracclient.client'] = dracclient.client
sys.modules['dracclient.constants'] = dracclient.constants
sys.modules['dracclient.exceptions'] = dracclient.exceptions
sys.modules['dracclient.resources.uris'] = dracclient.resources.uris
sys.modules['dracclient.utils'] = dracclient.utils
dracclient.exceptions.BaseClientException = type('BaseClientException',
(Exception,), {})

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Fixes the ``idrac-wsman`` BIOS ``factory_reset`` clean and deploy step to
indicate success and update the cached BIOS settings to their defaults only
when the BIOS settings have actually been reset. See story
`2008058 <https://storyboard.openstack.org/#!/story/2008058>`_ for more
details.