Fix two severe errors in the firmware caching code

First, it tries to create components even if the current version is not
known and fails with a database constraint error (because the initial
version cannot be NULL). Can be reproduced with sushy-tools before
37f118237a

Second, unexpected exceptions are not handled in the caching code, so
any of them will cause the node to get stuck in cleaning forever.

On top of that, the caching code is missing a metrics decorator.

This change does not update any unit tests because none currently exist.

Change-Id: Iaa242ca6aa6138fcdaaf63b763708e2f1e559cb0
This commit is contained in:
Dmitry Tantsur 2023-12-08 18:04:02 +01:00
parent dcea5f5a1d
commit 23745d97fe
No known key found for this signature in database
GPG Key ID: 315B2AF9FD216C60
3 changed files with 25 additions and 9 deletions

View File

@ -1449,8 +1449,8 @@ def node_cache_bios_settings(task, node):
except exception.UnsupportedDriverExtension:
LOG.warning('BIOS settings are not supported for node %s, '
'skipping', node.uuid)
# TODO(zshi) remove this check when classic drivers are removed
except Exception:
# NOTE(dtantsur): the caller expects this function to never fail
msg = (_('Caching of bios settings failed on node %(node)s.')
% {'node': node.uuid})
LOG.exception(msg)
@ -1474,6 +1474,7 @@ def node_cache_vendor(task):
except exception.UnsupportedDriverExtension:
return
except Exception as exc:
# NOTE(dtantsur): the caller expects this function to never fail
LOG.warning('Unexpected exception when trying to detect vendor '
'for node %(node)s. %(class)s: %(exc)s',
{'node': task.node.uuid,
@ -1840,6 +1841,10 @@ def node_cache_firmware_components(task):
except exception.UnsupportedDriverExtension:
LOG.warning('Firmware Components are not supported for node %s, '
'skipping', task.node.uuid)
except Exception:
# NOTE(dtantsur): the caller expects this function to never fail
LOG.exception('Caching of firmware components failed on node %s',
task.node.uuid)
def run_node_action(task, call, error_msg, success_msg=None, **kwargs):

View File

@ -71,6 +71,7 @@ class RedfishFirmware(base.FirmwareInterface):
"""
redfish_utils.parse_driver_info(task.node)
@METRICS.timer('RedfishFirmware.cache_firmware_components')
def cache_firmware_components(self, task):
"""Store or update Firmware Components on the given node.
@ -90,25 +91,27 @@ class RedfishFirmware(base.FirmwareInterface):
system = redfish_utils.get_system(task.node)
bios_fw = {'component': 'bios',
'current_version': system.bios_version}
settings.append(bios_fw)
if system.bios_version:
bios_fw = {'component': 'bios',
'current_version': system.bios_version}
settings.append(bios_fw)
# NOTE(iurygregory): normally we only relay on the System to
# perform actions, but to retrieve the BMC Firmware we need to
# access the Manager.
try:
manager = redfish_utils.get_manager(task.node, system)
bmc_fw = {'component': 'bmc',
'current_version': manager.firmware_version}
settings.append(bmc_fw)
if manager.firmware_version:
bmc_fw = {'component': 'bmc',
'current_version': manager.firmware_version}
settings.append(bmc_fw)
except exception.RedfishError:
LOG.warning('No manager available to retrieve Firmware '
'from the bmc of node %s', task.node.uuid)
if not settings:
error_msg = (_('Cannot retrieve firmware for node %s.')
% task.node.uuid)
error_msg = (_('Cannot retrieve firmware for node %s: no '
'supported components') % task.node.uuid)
LOG.error(error_msg)
raise exception.UnsupportedDriverExtension(error_msg)

View File

@ -0,0 +1,8 @@
---
fixes:
- |
Nodes no longer get stuck in cleaning when the firmware components caching
code raises an unexpected exception.
- |
Prevents a database constraints error on caching firmware components
when a supported component does not have the current version.