diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py index 1e46e856fa..034823230f 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/device_image.py @@ -315,8 +315,8 @@ class DeviceImageController(rest.RestController): update_device_image_state(device_label.host_id, device_label.pcidevice_id, device_image.id, dconstants.DEVICE_IMAGE_UPDATE_PENDING) - # Update flags in pci_device and host - modify_flags(device_label.pcidevice_id, device_label.host_id) + pecan.request.rpcapi.apply_device_image( + pecan.request.context, device_label.host_uuid) elif action == dconstants.REMOVE_ACTION: try: img_lbl = pecan.request.dbapi.device_image_label_get_by_image_label( @@ -343,8 +343,8 @@ class DeviceImageController(rest.RestController): update_device_image_state(host.id, dev.pci_id, device_image.id, dconstants.DEVICE_IMAGE_UPDATE_PENDING) - # Update flags in pci_device and host - modify_flags(dev.pci_id, dev.host_id) + pecan.request.rpcapi.apply_device_image( + pecan.request.context, host.uuid) elif action == dconstants.REMOVE_ACTION: delete_device_image_state(dev.pci_id, device_image) @@ -472,12 +472,3 @@ def delete_device_image_state(pcidevice_id, device_image): pecan.request.dbapi.device_image_state_destroy(dev_img.uuid) except exception.DeviceImageStateNotFoundByKey: pass - - -def modify_flags(pcidevice_id, host_id): - # Set flag for host indicating device image update is pending if it is - # not already in progress - host = pecan.request.dbapi.ihost_get(host_id) - if host.device_image_update != dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS: - pecan.request.dbapi.ihost_update(host_id, - {'device_image_update': dconstants.DEVICE_IMAGE_UPDATE_PENDING}) diff --git a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py index 666568743a..313216096a 100644 --- a/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py +++ b/sysinv/sysinv/sysinv/sysinv/api/controllers/v1/host.py @@ -6889,6 +6889,11 @@ class HostController(rest.RestController): LOG.info("device_image_update host_uuid=%s " % host_uuid) host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid) + if host_obj.device_image_update == device.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + raise wsme.exc.ClientSideError(_( + "The host %s is already in the process of updating the " + "device images." % host_obj.hostname)) + # The host must be unlocked/enabled to update device images if (host_obj.administrative != constants.ADMIN_UNLOCKED or host_obj.operational != constants.OPERATIONAL_ENABLED): @@ -6912,6 +6917,11 @@ class HostController(rest.RestController): LOG.info("device_image_update_abort host_uuid=%s " % host_uuid) host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid) + if host_obj.device_image_update != device.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + raise wsme.exc.ClientSideError(_( + "Abort rejected. The host %s is not in the process of " + "updating the device images." % host_obj.hostname)) + # Call rpcapi to tell conductor to abort device image update pecan.request.rpcapi.host_device_image_update_abort( pecan.request.context, host_uuid) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 6ab231ff15..d7fad63617 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -138,6 +138,12 @@ conductor_opts = [ cfg.IntOpt('kube_upgrade_downgrade_retry_interval', default=3600, help='Interval in seconds between retries to upgrade/downgrade kubernetes components'), + cfg.IntOpt('fw_update_large_timeout', + default=3600, + help='Timeout interval in seconds for a large device image'), + cfg.IntOpt('fw_update_small_timeout', + default=300, + help='Timeout interval in seconds for a small device image'), ] CONF = cfg.CONF @@ -4542,6 +4548,17 @@ class ConductorManager(service.PeriodicService): if cutils.is_app_applied(self.dbapi, app_name): self.evaluate_app_reapply(context, app_name) + # Clear any "reboot needed" DB entry for the host if it is set. + # If there are no more pending device image update entries in the DB + # for any host, and if no host has the "reboot needed" DB entry set, + # then the "device image update in progress" alarm is cleared. + if availability == constants.AVAILABILITY_AVAILABLE: + if imsg_dict.get(constants.SYSINV_AGENT_FIRST_REPORT): + if ihost.reboot_needed: + ihost.reboot_needed = False + ihost.save(context) + self._clear_device_image_alarm(context) + def iconfig_update_by_ihost(self, context, ihost_uuid, imsg_dict): """Update applied iconfig for an ihost with the supplied data. @@ -11552,6 +11569,32 @@ class ConductorManager(service.PeriodicService): except OSError: LOG.exception("Failed to delete bitstream file %s" % image_file_path) + def apply_device_image(self, context, host_uuid): + """Apply device image""" + host = objects.host.get_by_uuid(context, host_uuid) + if host.device_image_update != dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS: + host.device_image_update = dconstants.DEVICE_IMAGE_UPDATE_PENDING + host.save() + + # Raise device image update alarm if not already exists + alarm_id = fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS + system_uuid = self.dbapi.isystem_get_one().uuid + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM, system_uuid) + if not self.fm_api.get_fault(alarm_id, entity_instance_id): + fault = fm_api.Fault( + alarm_id=alarm_id, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM, + entity_instance_id=entity_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MINOR, + reason_text="Device image update operation in progress ", + alarm_type=fm_constants.FM_ALARM_TYPE_5, + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65, + proposed_repair_action="Complete reboots of affected hosts", + suppression=False, + service_affecting=False) + self.fm_api.set_fault(fault) + def host_device_image_update(self, context, host_uuid): """Update the device image on this host""" @@ -11563,3 +11606,40 @@ class ConductorManager(service.PeriodicService): host_obj = objects.host.get_by_uuid(context, host_uuid) LOG.info("Aborting device image update on %s" % host_obj.hostname) + + @periodic_task.periodic_task(spacing=CONF.conductor.audit_interval) + def _audit_device_image_update(self, context): + """Check if device image update is stuck in 'in-progress'""" + dev_img_list = self.dbapi.device_image_state_get_all( + status=dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS) + for img in dev_img_list: + if img['bitstream_type'] == dconstants.BITSTREAM_TYPE_FUNCTIONAL: + timeout = CONF.conductor.fw_update_large_timeout + else: + timeout = CONF.conductor.fw_update_small_timeout + tz = img.update_start_time.tzinfo + if ((datetime.now(tz) - img.update_start_time).total_seconds() >= + timeout): + # Mark the status as failed + img.status = dconstants.DEVICE_IMAGE_UPDATE_FAILED + img.save(context) + host = objects.host.get_by_uuid(context, img.host_uuid) + pci = objects.pci_device.get_by_uuid(context, img.pcidevice_uuid) + LOG.error("Device image update timed out host={} " + "device={} image={}".format(host.hostname, + pci.pciaddr, + img.image_uuid)) + + def _clear_device_image_alarm(self, context): + # If there are no more pending device image update in the DB + # for any host, and if no host has the "reboot needed" DB entry set, + # then the "Device image update in progress" alarm is cleared. + dev_img_list = self.dbapi.device_image_state_get_all( + status=dconstants.DEVICE_IMAGE_UPDATE_PENDING) + if not dev_img_list: + if self.dbapi.count_hosts_matching_criteria(reboot_needed=True) > 0: + return + system_uuid = self.dbapi.isystem_get_one().uuid + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM, system_uuid) + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS, + entity_instance_id) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py index 7962dec6e9..c01d854711 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/rpcapi.py @@ -1914,6 +1914,16 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy): return self.cast(context, self.make_msg('delete_bitstream_file', filename=filename)) + def apply_device_image(self, context, host_uuid): + """Asynchronously, have the conductor apply the device image + on this host. + + :param context: request context + :param host_uuid: uuid or id of the host + """ + return self.cast(context, self.make_msg('apply_device_image', + host_uuid=host_uuid)) + def host_device_image_update(self, context, host_uuid): """Asynchronously, have the conductor update the device image on this host. diff --git a/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py b/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py index 825619d124..25b798571c 100644 --- a/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py +++ b/sysinv/sysinv/sysinv/sysinv/db/sqlalchemy/api.py @@ -1331,7 +1331,8 @@ class Connection(api.Connection): def count_hosts_matching_criteria( self, personality=None, administrative=None, - operational=None, availability=None, vim_progress_status=None): + operational=None, availability=None, vim_progress_status=None, + reboot_needed=None): query = model_query(models.ihost) query = add_host_options(query) query = query.filter_by(recordtype="standard") @@ -1361,6 +1362,12 @@ class Connection(api.Connection): models.ihost.vim_progress_status.in_(vim_progress_status)) else: query = query.filter_by(vim_progress_status=vim_progress_status) + if reboot_needed: + if isinstance(reboot_needed, list): + query = query.filter( + models.ihost.reboot_needed.in_(reboot_needed)) + else: + query = query.filter_by(reboot_needed=reboot_needed) return query.count() @objects.objectify(objects.host) diff --git a/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py b/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py index 260a618811..3b7b59cef6 100644 --- a/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py +++ b/sysinv/sysinv/sysinv/sysinv/objects/device_image_state.py @@ -4,10 +4,21 @@ # SPDX-License-Identifier: Apache-2.0 # +from oslo_log import log from sysinv.db import api as db_api from sysinv.objects import base from sysinv.objects import utils +LOG = log.getLogger(__name__) + + +def get_bitstream_type(field, db_object): + """Retrieves the bitstream type from the device image object""" + device_image = getattr(db_object, 'image', None) + if device_image: + return device_image.bitstream_type + return None + class DeviceImageState(base.SysinvObject): VERSION = '1.0' @@ -22,6 +33,7 @@ class DeviceImageState(base.SysinvObject): 'pcidevice_uuid': utils.uuid_or_none, 'image_id': utils.int_or_none, 'image_uuid': utils.uuid_or_none, + 'bitstream_type': utils.str_or_none, 'status': utils.str_or_none, 'update_start_time': utils.datetime_or_str_or_none, 'capabilities': utils.dict_or_none, @@ -31,6 +43,7 @@ class DeviceImageState(base.SysinvObject): 'host_uuid': 'host:uuid', 'pcidevice_uuid': 'pcidevice:uuid', 'image_uuid': 'image:uuid', + 'bitstream_type': get_bitstream_type, } @base.remotable_classmethod diff --git a/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py b/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py index 553c653cf6..21c6c497b3 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/api/test_device_image.py @@ -27,6 +27,7 @@ class FakeConductorAPI(object): def __init__(self): self.store_bitstream_file = mock.MagicMock() self.delete_bitstream_file = mock.MagicMock() + self.apply_device_image = mock.MagicMock() class TestDeviceImage(base.FunctionalTest, dbbase.BaseHostTestCase):