Device image update alarm and audit
When the first device image is imported to the system, the alarm "device image update is in progress" is raised. When all the devices have been updated and rebooted, this alarm is cleared. A periodic audit is added to check whether any image update has timed out and, if so, to declare it failed. Story: 2006740 Task: 39498 Change-Id: I44a027ee384cfa5b96d5b426b06173b54187fda9 Signed-off-by: Teresa Ho <teresa.ho@windriver.com>
This commit is contained in:
parent
2b40b2a711
commit
72767e60ad
|
@ -315,8 +315,8 @@ class DeviceImageController(rest.RestController):
|
|||
update_device_image_state(device_label.host_id,
|
||||
device_label.pcidevice_id,
|
||||
device_image.id, dconstants.DEVICE_IMAGE_UPDATE_PENDING)
|
||||
# Update flags in pci_device and host
|
||||
modify_flags(device_label.pcidevice_id, device_label.host_id)
|
||||
pecan.request.rpcapi.apply_device_image(
|
||||
pecan.request.context, device_label.host_uuid)
|
||||
elif action == dconstants.REMOVE_ACTION:
|
||||
try:
|
||||
img_lbl = pecan.request.dbapi.device_image_label_get_by_image_label(
|
||||
|
@ -343,8 +343,8 @@ class DeviceImageController(rest.RestController):
|
|||
update_device_image_state(host.id,
|
||||
dev.pci_id, device_image.id,
|
||||
dconstants.DEVICE_IMAGE_UPDATE_PENDING)
|
||||
# Update flags in pci_device and host
|
||||
modify_flags(dev.pci_id, dev.host_id)
|
||||
pecan.request.rpcapi.apply_device_image(
|
||||
pecan.request.context, host.uuid)
|
||||
elif action == dconstants.REMOVE_ACTION:
|
||||
delete_device_image_state(dev.pci_id, device_image)
|
||||
|
||||
|
@ -472,12 +472,3 @@ def delete_device_image_state(pcidevice_id, device_image):
|
|||
pecan.request.dbapi.device_image_state_destroy(dev_img.uuid)
|
||||
except exception.DeviceImageStateNotFoundByKey:
|
||||
pass
|
||||
|
||||
|
||||
def modify_flags(pcidevice_id, host_id):
    """Flag a host as having a device image update pending.

    The host's ``device_image_update`` field is set to
    ``DEVICE_IMAGE_UPDATE_PENDING`` unless it currently reads
    ``DEVICE_IMAGE_UPDATE_IN_PROGRESS``, in which case it is left untouched.

    :param pcidevice_id: id of the PCI device being updated; not used by
        this function.
    :param host_id: id of the host whose flag is examined and updated.
    """
    dbapi = pecan.request.dbapi
    current_state = dbapi.ihost_get(host_id).device_image_update
    if current_state == dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
        # An update is already running on this host; keep that state.
        return
    dbapi.ihost_update(
        host_id,
        {'device_image_update': dconstants.DEVICE_IMAGE_UPDATE_PENDING})
|
||||
|
|
|
@ -6889,6 +6889,11 @@ class HostController(rest.RestController):
|
|||
LOG.info("device_image_update host_uuid=%s " % host_uuid)
|
||||
host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid)
|
||||
|
||||
if host_obj.device_image_update == device.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
|
||||
raise wsme.exc.ClientSideError(_(
|
||||
"The host %s is already in the process of updating the "
|
||||
"device images." % host_obj.hostname))
|
||||
|
||||
# The host must be unlocked/enabled to update device images
|
||||
if (host_obj.administrative != constants.ADMIN_UNLOCKED or
|
||||
host_obj.operational != constants.OPERATIONAL_ENABLED):
|
||||
|
@ -6912,6 +6917,11 @@ class HostController(rest.RestController):
|
|||
LOG.info("device_image_update_abort host_uuid=%s " % host_uuid)
|
||||
host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid)
|
||||
|
||||
if host_obj.device_image_update != device.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
|
||||
raise wsme.exc.ClientSideError(_(
|
||||
"Abort rejected. The host %s is not in the process of "
|
||||
"updating the device images." % host_obj.hostname))
|
||||
|
||||
# Call rpcapi to tell conductor to abort device image update
|
||||
pecan.request.rpcapi.host_device_image_update_abort(
|
||||
pecan.request.context, host_uuid)
|
||||
|
|
|
@ -138,6 +138,12 @@ conductor_opts = [
|
|||
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
|
||||
default=3600,
|
||||
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
|
||||
cfg.IntOpt('fw_update_large_timeout',
|
||||
default=3600,
|
||||
help='Timeout interval in seconds for a large device image'),
|
||||
cfg.IntOpt('fw_update_small_timeout',
|
||||
default=300,
|
||||
help='Timeout interval in seconds for a small device image'),
|
||||
]
|
||||
|
||||
CONF = cfg.CONF
|
||||
|
@ -4542,6 +4548,17 @@ class ConductorManager(service.PeriodicService):
|
|||
if cutils.is_app_applied(self.dbapi, app_name):
|
||||
self.evaluate_app_reapply(context, app_name)
|
||||
|
||||
# Clear any "reboot needed" DB entry for the host if it is set.
|
||||
# If there are no more pending device image update entries in the DB
|
||||
# for any host, and if no host has the "reboot needed" DB entry set,
|
||||
# then the "device image update in progress" alarm is cleared.
|
||||
if availability == constants.AVAILABILITY_AVAILABLE:
|
||||
if imsg_dict.get(constants.SYSINV_AGENT_FIRST_REPORT):
|
||||
if ihost.reboot_needed:
|
||||
ihost.reboot_needed = False
|
||||
ihost.save(context)
|
||||
self._clear_device_image_alarm(context)
|
||||
|
||||
def iconfig_update_by_ihost(self, context,
|
||||
ihost_uuid, imsg_dict):
|
||||
"""Update applied iconfig for an ihost with the supplied data.
|
||||
|
@ -11552,6 +11569,32 @@ class ConductorManager(service.PeriodicService):
|
|||
except OSError:
|
||||
LOG.exception("Failed to delete bitstream file %s" % image_file_path)
|
||||
|
||||
def apply_device_image(self, context, host_uuid):
    """Mark a host as pending a device image update and raise the alarm.

    The host's ``device_image_update`` field is set to pending (and saved)
    unless it is already in progress.  The system-wide "device image update
    in progress" alarm is then raised if no such fault exists yet.

    :param context: request context used to load the host object
    :param host_uuid: uuid of the host the image is being applied to
    """
    target_host = objects.host.get_by_uuid(context, host_uuid)
    if (target_host.device_image_update !=
            dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS):
        target_host.device_image_update = \
            dconstants.DEVICE_IMAGE_UPDATE_PENDING
        target_host.save()

    # The alarm is keyed on the system entity, not on an individual host.
    alarm_id = fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS
    sys_uuid = self.dbapi.isystem_get_one().uuid
    entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM,
                                    sys_uuid)

    if self.fm_api.get_fault(alarm_id, entity_instance_id):
        # Alarm is already present; nothing more to do.
        return

    self.fm_api.set_fault(fm_api.Fault(
        alarm_id=alarm_id,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM,
        entity_instance_id=entity_instance_id,
        severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
        reason_text="Device image update operation in progress ",
        alarm_type=fm_constants.FM_ALARM_TYPE_5,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
        proposed_repair_action="Complete reboots of affected hosts",
        suppression=False,
        service_affecting=False))
|
||||
|
||||
def host_device_image_update(self, context, host_uuid):
|
||||
"""Update the device image on this host"""
|
||||
|
||||
|
@ -11563,3 +11606,40 @@ class ConductorManager(service.PeriodicService):
|
|||
|
||||
host_obj = objects.host.get_by_uuid(context, host_uuid)
|
||||
LOG.info("Aborting device image update on %s" % host_obj.hostname)
|
||||
|
||||
@periodic_task.periodic_task(spacing=CONF.conductor.audit_interval)
def _audit_device_image_update(self, context):
    """Fail device image updates that are stuck in 'in-progress'.

    Every device image state whose status is in-progress is checked against
    a timeout: ``fw_update_large_timeout`` for functional bitstreams and
    ``fw_update_small_timeout`` for every other bitstream type.  When the
    time elapsed since ``update_start_time`` reaches the timeout, the state
    is marked failed, saved, and an error is logged.

    :param context: request context used to save the state and to load the
        host and PCI device objects for the log message
    """
    dev_img_list = self.dbapi.device_image_state_get_all(
        status=dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS)
    for img in dev_img_list:
        start_time = img.update_start_time
        if start_time is None:
            # update_start_time is nullable.  Without it the elapsed time
            # cannot be computed, so skip this entry rather than raising
            # and leaving the remaining in-progress updates unaudited.
            LOG.warning("Device image update has no start time, "
                        "skipping timeout check image={}".format(
                            img.image_uuid))
            continue

        if img['bitstream_type'] == dconstants.BITSTREAM_TYPE_FUNCTIONAL:
            timeout = CONF.conductor.fw_update_large_timeout
        else:
            timeout = CONF.conductor.fw_update_small_timeout

        # "now" is taken with the start time's own tzinfo so both operands
        # are either naive or aware and the subtraction is valid.
        elapsed = (datetime.now(start_time.tzinfo) -
                   start_time).total_seconds()
        if elapsed < timeout:
            continue

        # Mark the status as failed
        img.status = dconstants.DEVICE_IMAGE_UPDATE_FAILED
        img.save(context)
        host = objects.host.get_by_uuid(context, img.host_uuid)
        pci = objects.pci_device.get_by_uuid(context, img.pcidevice_uuid)
        LOG.error("Device image update timed out host={} "
                  "device={} image={}".format(host.hostname,
                                              pci.pciaddr,
                                              img.image_uuid))
|
||||
|
||||
def _clear_device_image_alarm(self, context):
    """Clear the "Device image update in progress" alarm when possible.

    The alarm is cleared only when the DB holds no device image state in
    the pending status and no host has its "reboot needed" entry set.
    Otherwise the method returns without touching the alarm.

    :param context: request context (not used by this method)
    """
    pending_states = self.dbapi.device_image_state_get_all(
        status=dconstants.DEVICE_IMAGE_UPDATE_PENDING)
    if pending_states:
        # At least one update is still pending; keep the alarm.
        return

    hosts_needing_reboot = self.dbapi.count_hosts_matching_criteria(
        reboot_needed=True)
    if hosts_needing_reboot > 0:
        # Some host has yet to reboot; keep the alarm.
        return

    system_uuid = self.dbapi.isystem_get_one().uuid
    entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM,
                                    system_uuid)
    self.fm_api.clear_fault(
        fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS,
        entity_instance_id)
|
||||
|
|
|
@ -1914,6 +1914,16 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
|
|||
return self.cast(context, self.make_msg('delete_bitstream_file',
|
||||
filename=filename))
|
||||
|
||||
def apply_device_image(self, context, host_uuid):
    """Ask the conductor, asynchronously, to apply the device image
    on this host.

    :param context: request context
    :param host_uuid: uuid or id of the host
    :returns: the result of casting the 'apply_device_image' message
    """
    message = self.make_msg('apply_device_image', host_uuid=host_uuid)
    return self.cast(context, message)
|
||||
|
||||
def host_device_image_update(self, context, host_uuid):
|
||||
"""Asynchronously, have the conductor update the device image
|
||||
on this host.
|
||||
|
|
|
@ -1331,7 +1331,8 @@ class Connection(api.Connection):
|
|||
|
||||
def count_hosts_matching_criteria(
|
||||
self, personality=None, administrative=None,
|
||||
operational=None, availability=None, vim_progress_status=None):
|
||||
operational=None, availability=None, vim_progress_status=None,
|
||||
reboot_needed=None):
|
||||
query = model_query(models.ihost)
|
||||
query = add_host_options(query)
|
||||
query = query.filter_by(recordtype="standard")
|
||||
|
@ -1361,6 +1362,12 @@ class Connection(api.Connection):
|
|||
models.ihost.vim_progress_status.in_(vim_progress_status))
|
||||
else:
|
||||
query = query.filter_by(vim_progress_status=vim_progress_status)
|
||||
if reboot_needed:
|
||||
if isinstance(reboot_needed, list):
|
||||
query = query.filter(
|
||||
models.ihost.reboot_needed.in_(reboot_needed))
|
||||
else:
|
||||
query = query.filter_by(reboot_needed=reboot_needed)
|
||||
return query.count()
|
||||
|
||||
@objects.objectify(objects.host)
|
||||
|
|
|
@ -4,10 +4,21 @@
|
|||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
|
||||
from oslo_log import log
|
||||
from sysinv.db import api as db_api
|
||||
from sysinv.objects import base
|
||||
from sysinv.objects import utils
|
||||
|
||||
LOG = log.getLogger(__name__)
|
||||
|
||||
|
||||
def get_bitstream_type(field, db_object):
    """Return the bitstream type of the device image linked to db_object.

    :param field: name of the field being resolved; not used here
    :param db_object: object that may carry an ``image`` attribute
    :returns: ``db_object.image.bitstream_type``, or None when the object
        has no ``image`` attribute or that attribute is falsy
    """
    image = getattr(db_object, 'image', None)
    return image.bitstream_type if image else None
|
||||
|
||||
|
||||
class DeviceImageState(base.SysinvObject):
|
||||
VERSION = '1.0'
|
||||
|
@ -22,6 +33,7 @@ class DeviceImageState(base.SysinvObject):
|
|||
'pcidevice_uuid': utils.uuid_or_none,
|
||||
'image_id': utils.int_or_none,
|
||||
'image_uuid': utils.uuid_or_none,
|
||||
'bitstream_type': utils.str_or_none,
|
||||
'status': utils.str_or_none,
|
||||
'update_start_time': utils.datetime_or_str_or_none,
|
||||
'capabilities': utils.dict_or_none,
|
||||
|
@ -31,6 +43,7 @@ class DeviceImageState(base.SysinvObject):
|
|||
'host_uuid': 'host:uuid',
|
||||
'pcidevice_uuid': 'pcidevice:uuid',
|
||||
'image_uuid': 'image:uuid',
|
||||
'bitstream_type': get_bitstream_type,
|
||||
}
|
||||
|
||||
@base.remotable_classmethod
|
||||
|
|
|
@ -27,6 +27,7 @@ class FakeConductorAPI(object):
|
|||
def __init__(self):
|
||||
self.store_bitstream_file = mock.MagicMock()
|
||||
self.delete_bitstream_file = mock.MagicMock()
|
||||
self.apply_device_image = mock.MagicMock()
|
||||
|
||||
|
||||
class TestDeviceImage(base.FunctionalTest, dbbase.BaseHostTestCase):
|
||||
|
|
Loading…
Reference in New Issue