From 544851e76fa842abd78c6869feed72273825a637 Mon Sep 17 00:00:00 2001 From: Joshua Kraitberg Date: Tue, 21 Mar 2023 09:29:14 -0400 Subject: [PATCH] Add alarm for Restore in progress Currently, there is no alarm for Restore in progress. Because of this, the system is shown as healthy before restore has been completed. This new alarm will prevent the system from being healthy until restore has properly been completed. TEST PLAN PASS: On any available system, the following commands can be triggered at any time: * Run "system restore-start" to trigger alarm * Run "system restore-complete" to clear alarm PASS: Do legacy restore on AIO-SX Alarm will be seen after unlocking and before "system restore-complete" is sent. PASS: Do optimized restore on AIO-SX Alarm will be seen after unlocking and before "system restore-complete" is sent. PASS: Run AIO-SX subcloud upgrade using dcmanager. Story: 2010709 Task: 47865 Depends-On: https://review.opendev.org/c/starlingx/fault/+/878076 Signed-off-by: Joshua Kraitberg Change-Id: I1791e81a10c523b626775000abf37957cb1a48ee --- .../sysinv/sysinv/sysinv/conductor/manager.py | 25 +++++++++++++++++++ .../sysinv/tests/conductor/test_restore.py | 2 ++ 2 files changed, 27 insertions(+) diff --git a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py index 01f21debe9..7216dc4d22 100644 --- a/sysinv/sysinv/sysinv/sysinv/conductor/manager.py +++ b/sysinv/sysinv/sysinv/sysinv/conductor/manager.py @@ -15365,6 +15365,26 @@ class ConductorManager(service.PeriodicService): else: return constants.RESTORE_PROGRESS_ALREADY_IN_PROGRESS + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, + constants.CONTROLLER_HOSTNAME) + + fault = fm_api.Fault( + alarm_id=fm_constants.FM_ALARM_ID_RESTORE_IN_PROGRESS, + alarm_state=fm_constants.FM_ALARM_STATE_SET, + entity_type_id=fm_constants.FM_ENTITY_TYPE_HOST, + entity_instance_id=entity_instance_id, + severity=fm_constants.FM_ALARM_SEVERITY_MINOR, +
reason_text=("System Restore in progress."), + # operational + alarm_type=fm_constants.FM_ALARM_TYPE_7, + # congestion + probable_cause=fm_constants.ALARM_PROBABLE_CAUSE_8, + proposed_repair_action=("Run 'system restore-complete' to complete restore " + "if running restore manually."), + service_affecting=False) + + self.fm_api.set_fault(fault) + # TODO (agrosu): no use case at this point for sending a BACKUP_ACTION_PRE_RESTORE notification. return constants.RESTORE_PROGRESS_STARTED @@ -15414,6 +15434,11 @@ class ConductorManager(service.PeriodicService): self.dbapi.restore_update(restore.uuid, values={'state': state}) + entity_instance_id = "%s=%s" % (fm_constants.FM_ENTITY_TYPE_HOST, + constants.CONTROLLER_HOSTNAME) + + self.fm_api.clear_fault(fm_constants.FM_ALARM_ID_RESTORE_IN_PROGRESS, entity_instance_id) + LOG.info("Complete the restore procedure.") return constants.RESTORE_PROGRESS_COMPLETED diff --git a/sysinv/sysinv/sysinv/sysinv/tests/conductor/test_restore.py b/sysinv/sysinv/sysinv/sysinv/tests/conductor/test_restore.py index dca499ec5d..60a9e017aa 100644 --- a/sysinv/sysinv/sysinv/sysinv/tests/conductor/test_restore.py +++ b/sysinv/sysinv/sysinv/sysinv/tests/conductor/test_restore.py @@ -10,6 +10,7 @@ Tests for the restore logic from oslo_context import context +from fm_api import fm_api from sysinv.common import constants from sysinv.conductor import manager from sysinv.db import api as dbapi @@ -24,6 +25,7 @@ class RestoreTestCase(base.BaseHostTestCase): # Set up objects for testing self.service = manager.ConductorManager('test-host', 'test-topic') self.service.dbapi = dbapi.get_instance() + self.service.fm_api = fm_api.FaultAPIs() self.context = context.get_admin_context() self.valid_restore_states = [ constants.RESTORE_PROGRESS_ALREADY_COMPLETED,