Device image update alarm and audit

When the first device image is applied to a host, the alarm
"device image update is in progress" is raised. When all the
devices have been updated and the affected hosts rebooted, this
alarm is cleared.

A periodic audit is added to check whether any image update
has timed out and, if so, to declare it as failed.

Story: 2006740
Task: 39498

Change-Id: I44a027ee384cfa5b96d5b426b06173b54187fda9
Signed-off-by: Teresa Ho <teresa.ho@windriver.com>
This commit is contained in:
Teresa Ho 2020-05-26 14:14:48 -04:00
parent 2b40b2a711
commit 72767e60ad
7 changed files with 126 additions and 14 deletions

View File

@ -315,8 +315,8 @@ class DeviceImageController(rest.RestController):
update_device_image_state(device_label.host_id,
device_label.pcidevice_id,
device_image.id, dconstants.DEVICE_IMAGE_UPDATE_PENDING)
# Update flags in pci_device and host
modify_flags(device_label.pcidevice_id, device_label.host_id)
pecan.request.rpcapi.apply_device_image(
pecan.request.context, device_label.host_uuid)
elif action == dconstants.REMOVE_ACTION:
try:
img_lbl = pecan.request.dbapi.device_image_label_get_by_image_label(
@ -343,8 +343,8 @@ class DeviceImageController(rest.RestController):
update_device_image_state(host.id,
dev.pci_id, device_image.id,
dconstants.DEVICE_IMAGE_UPDATE_PENDING)
# Update flags in pci_device and host
modify_flags(dev.pci_id, dev.host_id)
pecan.request.rpcapi.apply_device_image(
pecan.request.context, host.uuid)
elif action == dconstants.REMOVE_ACTION:
delete_device_image_state(dev.pci_id, device_image)
@ -472,12 +472,3 @@ def delete_device_image_state(pcidevice_id, device_image):
pecan.request.dbapi.device_image_state_destroy(dev_img.uuid)
except exception.DeviceImageStateNotFoundByKey:
pass
def modify_flags(pcidevice_id, host_id):
    """Flag a host as having a device image update pending.

    The host record is only rewritten when no update is currently in
    progress on it, so an in-progress state is never overwritten.

    :param pcidevice_id: id of the PCI device; accepted but not used here
    :param host_id: id of the host whose flag is updated
    """
    dbapi = pecan.request.dbapi
    current_state = dbapi.ihost_get(host_id).device_image_update
    if current_state == dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
        return
    pending = {'device_image_update': dconstants.DEVICE_IMAGE_UPDATE_PENDING}
    dbapi.ihost_update(host_id, pending)

View File

@ -6889,6 +6889,11 @@ class HostController(rest.RestController):
LOG.info("device_image_update host_uuid=%s " % host_uuid)
host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid)
if host_obj.device_image_update == device.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
raise wsme.exc.ClientSideError(_(
"The host %s is already in the process of updating the "
"device images." % host_obj.hostname))
# The host must be unlocked/enabled to update device images
if (host_obj.administrative != constants.ADMIN_UNLOCKED or
host_obj.operational != constants.OPERATIONAL_ENABLED):
@ -6912,6 +6917,11 @@ class HostController(rest.RestController):
LOG.info("device_image_update_abort host_uuid=%s " % host_uuid)
host_obj = objects.host.get_by_uuid(pecan.request.context, host_uuid)
if host_obj.device_image_update != device.DEVICE_IMAGE_UPDATE_IN_PROGRESS:
raise wsme.exc.ClientSideError(_(
"Abort rejected. The host %s is not in the process of "
"updating the device images." % host_obj.hostname))
# Call rpcapi to tell conductor to abort device image update
pecan.request.rpcapi.host_device_image_update_abort(
pecan.request.context, host_uuid)

View File

@ -138,6 +138,12 @@ conductor_opts = [
cfg.IntOpt('kube_upgrade_downgrade_retry_interval',
default=3600,
help='Interval in seconds between retries to upgrade/downgrade kubernetes components'),
cfg.IntOpt('fw_update_large_timeout',
default=3600,
help='Timeout interval in seconds for a large device image'),
cfg.IntOpt('fw_update_small_timeout',
default=300,
help='Timeout interval in seconds for a small device image'),
]
CONF = cfg.CONF
@ -4542,6 +4548,17 @@ class ConductorManager(service.PeriodicService):
if cutils.is_app_applied(self.dbapi, app_name):
self.evaluate_app_reapply(context, app_name)
# Clear any "reboot needed" DB entry for the host if it is set.
# If there are no more pending device image update entries in the DB
# for any host, and if no host has the "reboot needed" DB entry set,
# then the "device image update in progress" alarm is cleared.
if availability == constants.AVAILABILITY_AVAILABLE:
if imsg_dict.get(constants.SYSINV_AGENT_FIRST_REPORT):
if ihost.reboot_needed:
ihost.reboot_needed = False
ihost.save(context)
self._clear_device_image_alarm(context)
def iconfig_update_by_ihost(self, context,
ihost_uuid, imsg_dict):
"""Update applied iconfig for an ihost with the supplied data.
@ -11552,6 +11569,32 @@ class ConductorManager(service.PeriodicService):
except OSError:
LOG.exception("Failed to delete bitstream file %s" % image_file_path)
def apply_device_image(self, context, host_uuid):
    """Mark a host for device image update and raise the update alarm.

    The host's ``device_image_update`` field is set to pending unless an
    update is already in progress on that host. The system-scoped
    "device image update in progress" alarm is then set if no such
    fault is currently reported.

    :param context: request context
    :param host_uuid: uuid of the host the image is being applied to
    """
    host = objects.host.get_by_uuid(context, host_uuid)
    needs_pending_flag = (host.device_image_update !=
                          dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS)
    if needs_pending_flag:
        host.device_image_update = dconstants.DEVICE_IMAGE_UPDATE_PENDING
        host.save()

    # NOTE(review): the alarm handling is assumed to run regardless of the
    # host state check above (nesting reconstructed) -- confirm.
    # The alarm is keyed on the system, so a single instance is shared by
    # every host; only raise it when it is not already present.
    alarm_id = fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS
    system_uuid = self.dbapi.isystem_get_one().uuid
    entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM,
                                    system_uuid)
    if self.fm_api.get_fault(alarm_id, entity_instance_id):
        return

    self.fm_api.set_fault(fm_api.Fault(
        alarm_id=alarm_id,
        alarm_state=fm_constants.FM_ALARM_STATE_SET,
        entity_type_id=fm_constants.FM_ENTITY_TYPE_SYSTEM,
        entity_instance_id=entity_instance_id,
        severity=fm_constants.FM_ALARM_SEVERITY_MINOR,
        reason_text="Device image update operation in progress ",
        alarm_type=fm_constants.FM_ALARM_TYPE_5,
        probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_65,
        proposed_repair_action="Complete reboots of affected hosts",
        suppression=False,
        service_affecting=False))
def host_device_image_update(self, context, host_uuid):
"""Update the device image on this host"""
@ -11563,3 +11606,40 @@ class ConductorManager(service.PeriodicService):
host_obj = objects.host.get_by_uuid(context, host_uuid)
LOG.info("Aborting device image update on %s" % host_obj.hostname)
@periodic_task.periodic_task(spacing=CONF.conductor.audit_interval)
def _audit_device_image_update(self, context):
    """Fail device image updates that are stuck in 'in-progress'.

    Every device image state currently in progress is compared against a
    timeout chosen by bitstream type: ``fw_update_large_timeout`` for
    functional bitstreams, ``fw_update_small_timeout`` for all others.
    Entries whose elapsed time has reached the timeout are marked as
    failed and an error is logged.

    :param context: request context
    """
    in_progress = self.dbapi.device_image_state_get_all(
        status=dconstants.DEVICE_IMAGE_UPDATE_IN_PROGRESS)
    for img in in_progress:
        started = img.update_start_time
        if started is None:
            # The start time is a nullable field. Without it the age of
            # the update cannot be computed; skip this entry so that one
            # incomplete record does not abort the audit for the others.
            LOG.warning("Device image update has no start time, "
                        "skipping timeout check image={}".format(
                            img.image_uuid))
            continue

        if img['bitstream_type'] == dconstants.BITSTREAM_TYPE_FUNCTIONAL:
            timeout = CONF.conductor.fw_update_large_timeout
        else:
            timeout = CONF.conductor.fw_update_small_timeout

        # Take "now" in the stored timestamp's own tzinfo so that naive
        # and aware datetimes are never mixed in the subtraction.
        elapsed = (datetime.now(started.tzinfo) - started).total_seconds()
        if elapsed < timeout:
            continue

        # Mark the status as failed
        img.status = dconstants.DEVICE_IMAGE_UPDATE_FAILED
        img.save(context)
        host = objects.host.get_by_uuid(context, img.host_uuid)
        pci = objects.pci_device.get_by_uuid(context, img.pcidevice_uuid)
        LOG.error("Device image update timed out host={} "
                  "device={} image={}".format(host.hostname,
                                              pci.pciaddr,
                                              img.image_uuid))
def _clear_device_image_alarm(self, context):
    """Clear the "Device image update in progress" alarm when possible.

    The alarm is left in place while any device image state is still
    pending, or while any host still has its "reboot needed" entry set.
    Otherwise the system-scoped fault is cleared.

    :param context: request context
    """
    pending_updates = self.dbapi.device_image_state_get_all(
        status=dconstants.DEVICE_IMAGE_UPDATE_PENDING)
    if pending_updates:
        return

    hosts_awaiting_reboot = self.dbapi.count_hosts_matching_criteria(
        reboot_needed=True)
    if hosts_awaiting_reboot > 0:
        return

    system_uuid = self.dbapi.isystem_get_one().uuid
    entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_SYSTEM,
                                    system_uuid)
    self.fm_api.clear_fault(
        fm_constants.FM_ALARM_ID_DEVICE_IMAGE_UPDATE_IN_PROGRESS,
        entity_instance_id)

View File

@ -1914,6 +1914,16 @@ class ConductorAPI(sysinv.openstack.common.rpc.proxy.RpcProxy):
return self.cast(context, self.make_msg('delete_bitstream_file',
filename=filename))
def apply_device_image(self, context, host_uuid):
    """Asynchronously ask the conductor to apply the device image on a host.

    Builds an ``apply_device_image`` message and sends it with ``cast``;
    whatever ``cast`` returns is handed back to the caller.

    :param context: request context
    :param host_uuid: uuid or id of the host
    """
    request = self.make_msg('apply_device_image', host_uuid=host_uuid)
    return self.cast(context, request)
def host_device_image_update(self, context, host_uuid):
"""Asynchronously, have the conductor update the device image
on this host.

View File

@ -1331,7 +1331,8 @@ class Connection(api.Connection):
def count_hosts_matching_criteria(
self, personality=None, administrative=None,
operational=None, availability=None, vim_progress_status=None):
operational=None, availability=None, vim_progress_status=None,
reboot_needed=None):
query = model_query(models.ihost)
query = add_host_options(query)
query = query.filter_by(recordtype="standard")
@ -1361,6 +1362,12 @@ class Connection(api.Connection):
models.ihost.vim_progress_status.in_(vim_progress_status))
else:
query = query.filter_by(vim_progress_status=vim_progress_status)
if reboot_needed:
if isinstance(reboot_needed, list):
query = query.filter(
models.ihost.reboot_needed.in_(reboot_needed))
else:
query = query.filter_by(reboot_needed=reboot_needed)
return query.count()
@objects.objectify(objects.host)

View File

@ -4,10 +4,21 @@
# SPDX-License-Identifier: Apache-2.0
#
from oslo_log import log
from sysinv.db import api as db_api
from sysinv.objects import base
from sysinv.objects import utils
LOG = log.getLogger(__name__)
def get_bitstream_type(field, db_object):
    """Return the bitstream type of the image attached to ``db_object``.

    :param field: name of the field being resolved; not used here
    :param db_object: object that may carry an ``image`` attribute
    :returns: ``image.bitstream_type``, or None when the object has no
              ``image`` attribute or the attribute is falsy
    """
    image = getattr(db_object, 'image', None)
    return image.bitstream_type if image else None
class DeviceImageState(base.SysinvObject):
VERSION = '1.0'
@ -22,6 +33,7 @@ class DeviceImageState(base.SysinvObject):
'pcidevice_uuid': utils.uuid_or_none,
'image_id': utils.int_or_none,
'image_uuid': utils.uuid_or_none,
'bitstream_type': utils.str_or_none,
'status': utils.str_or_none,
'update_start_time': utils.datetime_or_str_or_none,
'capabilities': utils.dict_or_none,
@ -31,6 +43,7 @@ class DeviceImageState(base.SysinvObject):
'host_uuid': 'host:uuid',
'pcidevice_uuid': 'pcidevice:uuid',
'image_uuid': 'image:uuid',
'bitstream_type': get_bitstream_type,
}
@base.remotable_classmethod

View File

@ -27,6 +27,7 @@ class FakeConductorAPI(object):
def __init__(self):
self.store_bitstream_file = mock.MagicMock()
self.delete_bitstream_file = mock.MagicMock()
self.apply_device_image = mock.MagicMock()
class TestDeviceImage(base.FunctionalTest, dbbase.BaseHostTestCase):