From 72767e60adc06e61f65892370222cb4c5af14267 Mon Sep 17 00:00:00 2001 From: Teresa Ho Date: Tue, 26 May 2020 14:14:48 -0400 Subject: [PATCH] Device image update alarm and audit When the first device image is imported to the system, the alarm "device image update is in progress" is raised. When all the devices have been updated and rebooted, this alarm is cleared. A periodic audit is added to check whether any of the image updates has timed out and, if so, to declare it as failed. Story: 2006740 Task: 39498 Change-Id: I44a027ee384cfa5b96d5b426b06173b54187fda9 Signed-off-by: Teresa Ho --- .../sysinv/api/controllers/v1/device_image.py | 17 +--- .../sysinv/sysinv/api/controllers/v1/host.py | 10 +++ .../sysinv/sysinv/sysinv/conductor/manager.py | 80 +++++++++++++++++++ .../sysinv/sysinv/sysinv/conductor/rpcapi.py | 10 +++ .../sysinv/sysinv/sysinv/db/sqlalchemy/api.py | 9 ++- .../sysinv/objects/device_image_state.py | 13 +++ .../sysinv/tests/api/test_device_image.py | 1 + 7 files changed, 126 insertions(+), 14 deletions(-) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py index 1e46e856fa..034823230f 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py @@ -315,8 +315,8 @@ class DeviceImageController(rest.RestController): update_device_image_state(device_label.host_id, device_label.pcidevice_id, device_image.id, dconstants.DEVICE_IMAGE_UPDATE_PENDING) - # Update flags in pci_device and host - modify_flags(device_label.pcidevice_id, device_label.host_id) + pecan.request.rpcapi.apply_device_image( + pecan.request.context, device_label.host_uuid) elif action == dconstants.REMOVE_ACTION: try: img_lbl = pecan.request.dbapi.device_image_label_get_by_image_label( @@ -343,8 +343,8 @@ class DeviceImageController(rest.RestController): update_device_image_state(host.id, dev.pci_id, device_image.id, 
dconstants.DEVICE_IMAGE_UPDATE_PENDING) - # Update flags in pci_device and host - modify_flags(dev.pci_id, dev.host_id) + pecan.request.rpcapi.apply_device_image( + pecan.request.context, host.uuid) elif action == dconstants.REMOVE_ACTION: delete_device_image_state(dev.pci_id, device_image) @@ -472,12 +472,3 @@ def delete_device_image_state(pcidevice_id, device_image): pecan.request.dbapi.device_image_state_destroy(dev_img.uuid) except exception.DeviceImageStateNotFoundByKey: pass - - -def modify_flags(pcidevice_id, host_id): - # Set flag for host indicating device image update is pending if it is - # not already in progress - host = pecan.request.dbapi.ihost_get(host_id) - if host.device_image_update != dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS: - pecan.request.dbapi.ihost_update(host_id, - {'device_image_update': dconstants.DEVICE_IMAGE_UPDATE_PENDING}) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py index 666568743a..313216096a 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py @@ -6889,6 +6889,11 @@ class HostController(rest.RestController): LOG.info("device_image_update host_uuid=%s " % host_uuid) host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid) + if host_obj.device_image_update == device.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + raise wsme.exc.ClientSideError(_( + "The host %s is already in the process of updating the " + "device images." 
% host_obj.hostname)) + # The host must be unlocked/enabled to update device images if (host_obj.administrative != constants.ADMIN_UNLOCKED or host_obj.operational != constants.OPERATIONAL_ENABLED): @@ -6912,6 +6917,11 @@ class HostController(rest.RestController): LOG.info("device_image_update_abort host_uuid=%s " % host_uuid) host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid) + if host_obj.device_image_update != device.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + raise wsme.exc.ClientSideError(_( + "Abort rejected. The host %s is not in the process of " + "updating the device images." % host_obj.hostname)) + # Call rpcapi to tell conductor to abort device image update pecan.request.rpcapi.host_device_image_update_abort( pecan.request.context, host_uuid) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 6ab231ff15..d7fad63617 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -138,6 +138,12 @@ conductor_opts = [ cfg.IntOpt('kube_upgrade_downgrade_retry_interval', default=3600, help='Interval in seconds between retries to upgrade/downgrade kubernetes components'), + cfg.IntOpt('fw_update_large_timeout', + default=3600, + help='Timeout interval in seconds for a large device image'), + cfg.IntOpt('fw_update_small_timeout', + default=300, + help='Timeout interval in seconds for a small device image'), ] CONF = cfg.CONF @@ -4542,6 +4548,17 @@ class ConductorManager(service.PeriodicService): if cutils.is_app_applied(self.dbapi, app_name): self.evaluate_app_reapply(context, app_name) + # Clear any "reboot needed" DB entry for the host if it is set. + # If there are no more pending device image update entries in the DB + # for any host, and if no host has the "reboot needed" DB entry set, + # then the "device image update in progress" alarm is cleared. 
+ if availability == constants.AVAILABILITY_AVAILABLE: + if imsg_dict.get(constants.SYSINV_AGENT_FIRST_REPORT): + if ihost.reboot_needed: + ihost.reboot_needed = False + ihost.save(context) + self._clear_device_image_alarm(context) + def iconfig_update_by_ihost(self, context, ihost_uuid, imsg_dict): """Update applied iconfig for an ihost with the supplied data. @@ -11552,6 +11569,32 @@ class ConductorManager(service.PeriodicService): except OSError: LOG.exception("Failed to delete bitstream file %s" % image_file_path) + def apply_device_image(self, context, host_uuid): + """Apply device image""" + host = objects.host.get_by_uuid(context, host_uuid) + if host.device_image_update != dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + host.device_image_update = dconstants.DEVICE_IMAGE_UPDATE_PENDING + host.save() + + # Raise device image update alarm if not already exists + alarm_id = fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS + system_uuid = self.dbapi.isystem_get_one().uuid + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM, system_uuid) + if not self.fm_api.get_fault(alarm_id, entity_instance_id): + fault = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM, + entity_instance_id=entity_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MINOR, + reason_text="Device image update operation in progress ", + alarm_type=fm_constants.FM_ALARM_TYPE_5, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, + proposed_repair_action="Complete reboots of affected hosts", + suppression=False, + service_affecting=False) + self.fm_api.set_fault(fault) + def host_device_image_update(self, context, host_uuid): """Update the device image on this host""" @@ -11563,3 +11606,40 @@ class ConductorManager(service.PeriodicService): host_obj = objects.host.get_by_uuid(context, host_uuid) LOG.info("Aborting device image update on %s" % host_obj.hostname) + + 
@periodic_task.periodic_task(spacing=CONF.conductor.audit_interval) + def _audit_device_image_update(self, context): + """Check if device image update is stuck in 'in-progress'""" + dev_img_list = self.dbapi.device_image_state_get_all( + status=dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS) + for img in dev_img_list: + if img['bitstream_type'] == dconstants.BITSTREAM_TYPE_FUNCTIONAL: + timeout = CONF.conductor.fw_update_large_timeout + else: + timeout = CONF.conductor.fw_update_small_timeout + tz = img.update_start_time.tzinfo + if ((datetime.now(tz) - img.update_start_time).total_seconds() >= + timeout): + # Mark the status as failed + img.status = dconstants.DEVICE_IMAGE_UPDATE_FAILED + img.save(context) + host = objects.host.get_by_uuid(context, img.host_uuid) + pci = objects.pci_device.get_by_uuid(context, img.pcidevice_uuid) + LOG.error("Device image update timed out host={} " + "device={} image={}".format(host.hostname, + pci.pciaddr, + img.image_uuid)) + + def _clear_device_image_alarm(self, context): + # If there are no more pending device image update in the DB + # for any host, and if no host has the "reboot needed" DB entry set, + # then the "Device image update in progress" alarm is cleared. 
+ dev_img_list = self.dbapi.device_image_state_get_all( + status=dconstants.DEVICE_IMAGE_UPDATE_PENDING) + if not dev_img_list: + if self.dbapi.count_hosts_matching_criteria(reboot_needed=True) > 0: + return + system_uuid = self.dbapi.isystem_get_one().uuid + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM, system_uuid) + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS, + entity_instance_id) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py index 7962dec6e9..c01d854711 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py @@ -1914,6 +1914,16 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy): return self.cast(context, self.make_msg('delete_bitstream_file', filename=filename)) + def apply_device_image(self, context, host_uuid): + """Asynchronously, have the conductor apply the device image + on this host. + + :param context: request context + :param host_uuid: uuid or id of the host + """ + return self.cast(context, self.make_msg('apply_device_image', + host_uuid=host_uuid)) + def host_device_image_update(self, context, host_uuid): """Asynchronously, have the conductor update the device image on this host. 
diff --git a/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py b/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py index 825619d124..25b798571c 100644 --- a/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py +++ b/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py @@ -1331,7 +1331,8 @@ class Connection(api.Connection): def count_hosts_matching_criteria( self, personality=None, administrative=None, - operational=None, availability=None, vim_progress_status=None): + operational=None, availability=None, vim_progress_status=None, + reboot_needed=None): query = model_query(models.ihost) query = add_host_options(query) query = query.filter_by(recordtype="standard") @@ -1361,6 +1362,12 @@ class Connection(api.Connection): models.ihost.vim_progress_status.in_(vim_progress_status)) else: query = query.filter_by(vim_progress_status=vim_progress_status) + if reboot_needed: + if isinstance(reboot_needed, list): + query = query.filter( + models.ihost.reboot_needed.in_(reboot_needed)) + else: + query = query.filter_by(reboot_needed=reboot_needed) return query.count() @objects.objectify(objects.host) diff --git a/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py b/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py index 260a618811..3b7b59cef6 100644 --- a/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py +++ b/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py @@ -4,10 +4,21 @@ # SPDX-License-Identifier: Apache-2.0 # +from oslo_log import log from sysinv.db import api as db_api from sysinv.objects import base from sysinv.objects import utils +LOG = log.getLogger(__name__) + + +def get_bitstream_type(field, db_object): + """Retrieves the bitstream type from the device image object""" + device_image = getattr(db_object, 'image', None) + if device_image: + return device_image.bitstream_type + return None + class DeviceImageState(base.SysinvObject): VERSION = '1.0' @@ -22,6 +33,7 @@ class DeviceImageState(base.SysinvObject): 'pcidevice_uuid': 
utils.uuid_or_none, 'image_id': utils.int_or_none, 'image_uuid': utils.uuid_or_none, + 'bitstream_type': utils.str_or_none, 'status': utils.str_or_none, 'update_start_time': utils.datetime_or_str_or_none, 'capabilities': utils.dict_or_none, @@ -31,6 +43,7 @@ class DeviceImageState(base.SysinvObject): 'host_uuid': 'host:uuid', 'pcidevice_uuid': 'pcidevice:uuid', 'image_uuid': 'image:uuid', + 'bitstream_type': get_bitstream_type, } @base.remotable_classmethod diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py index 553c653cf6..21c6c497b3 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py @@ -27,6 +27,7 @@ class FakeConductorAPI(object): def __init__(self): self.store_bitstream_file = mock.MagicMock() self.delete_bitstream_file = mock.MagicMock() + self.apply_device_image = mock.MagicMock() class TestDeviceImage(base.FunctionalTest, dbbase.BaseHostTestCase):